1 //===---- CGBuiltin.cpp - Emit LLVM Code for builtins ---------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This contains code to emit Builtin calls as LLVM code.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "CodeGenFunction.h"
15 #include "CGObjCRuntime.h"
16 #include "CodeGenModule.h"
17 #include "TargetInfo.h"
18 #include "clang/AST/ASTContext.h"
19 #include "clang/AST/Decl.h"
20 #include "clang/Basic/TargetBuiltins.h"
21 #include "clang/Basic/TargetInfo.h"
22 #include "clang/CodeGen/CGFunctionInfo.h"
23 #include "llvm/IR/DataLayout.h"
24 #include "llvm/IR/Intrinsics.h"
25 
26 using namespace clang;
27 using namespace CodeGen;
28 using namespace llvm;
29 
30 /// getBuiltinLibFunction - Given a builtin id for a function like
31 /// "__builtin_fabsf", return a Function* for "fabsf".
32 llvm::Value *CodeGenModule::getBuiltinLibFunction(const FunctionDecl *FD,
33                                                   unsigned BuiltinID) {
34   assert(Context.BuiltinInfo.isLibFunction(BuiltinID));
35 
36   // Get the name, skip over the __builtin_ prefix (if necessary).
37   StringRef Name;
38   GlobalDecl D(FD);
39 
40   // If the builtin has been declared explicitly with an assembler label,
41   // use the mangled name. This differs from the plain label on platforms
42   // that prefix labels.
43   if (FD->hasAttr<AsmLabelAttr>())
44     Name = getMangledName(D);
45   else
46     Name = Context.BuiltinInfo.GetName(BuiltinID) + 10;
47 
48   llvm::FunctionType *Ty =
49     cast<llvm::FunctionType>(getTypes().ConvertType(FD->getType()));
50 
51   return GetOrCreateLLVMFunction(Name, Ty, D, /*ForVTable=*/false);
52 }
53 
54 /// Emit the conversions required to turn the given value into an
55 /// integer of the given size.
56 static Value *EmitToInt(CodeGenFunction &CGF, llvm::Value *V,
57                         QualType T, llvm::IntegerType *IntType) {
58   V = CGF.EmitToMemory(V, T);
59 
60   if (V->getType()->isPointerTy())
61     return CGF.Builder.CreatePtrToInt(V, IntType);
62 
63   assert(V->getType() == IntType);
64   return V;
65 }
66 
67 static Value *EmitFromInt(CodeGenFunction &CGF, llvm::Value *V,
68                           QualType T, llvm::Type *ResultType) {
69   V = CGF.EmitFromMemory(V, T);
70 
71   if (ResultType->isPointerTy())
72     return CGF.Builder.CreateIntToPtr(V, ResultType);
73 
74   assert(V->getType() == ResultType);
75   return V;
76 }
77 
78 /// Utility to insert an atomic instruction based on Instrinsic::ID
79 /// and the expression node.
80 static RValue EmitBinaryAtomic(CodeGenFunction &CGF,
81                                llvm::AtomicRMWInst::BinOp Kind,
82                                const CallExpr *E) {
83   QualType T = E->getType();
84   assert(E->getArg(0)->getType()->isPointerType());
85   assert(CGF.getContext().hasSameUnqualifiedType(T,
86                                   E->getArg(0)->getType()->getPointeeType()));
87   assert(CGF.getContext().hasSameUnqualifiedType(T, E->getArg(1)->getType()));
88 
89   llvm::Value *DestPtr = CGF.EmitScalarExpr(E->getArg(0));
90   unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();
91 
92   llvm::IntegerType *IntType =
93     llvm::IntegerType::get(CGF.getLLVMContext(),
94                            CGF.getContext().getTypeSize(T));
95   llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);
96 
97   llvm::Value *Args[2];
98   Args[0] = CGF.Builder.CreateBitCast(DestPtr, IntPtrType);
99   Args[1] = CGF.EmitScalarExpr(E->getArg(1));
100   llvm::Type *ValueType = Args[1]->getType();
101   Args[1] = EmitToInt(CGF, Args[1], T, IntType);
102 
103   llvm::Value *Result =
104       CGF.Builder.CreateAtomicRMW(Kind, Args[0], Args[1],
105                                   llvm::SequentiallyConsistent);
106   Result = EmitFromInt(CGF, Result, T, ValueType);
107   return RValue::get(Result);
108 }
109 
110 /// Utility to insert an atomic instruction based Instrinsic::ID and
111 /// the expression node, where the return value is the result of the
112 /// operation.
113 static RValue EmitBinaryAtomicPost(CodeGenFunction &CGF,
114                                    llvm::AtomicRMWInst::BinOp Kind,
115                                    const CallExpr *E,
116                                    Instruction::BinaryOps Op) {
117   QualType T = E->getType();
118   assert(E->getArg(0)->getType()->isPointerType());
119   assert(CGF.getContext().hasSameUnqualifiedType(T,
120                                   E->getArg(0)->getType()->getPointeeType()));
121   assert(CGF.getContext().hasSameUnqualifiedType(T, E->getArg(1)->getType()));
122 
123   llvm::Value *DestPtr = CGF.EmitScalarExpr(E->getArg(0));
124   unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();
125 
126   llvm::IntegerType *IntType =
127     llvm::IntegerType::get(CGF.getLLVMContext(),
128                            CGF.getContext().getTypeSize(T));
129   llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);
130 
131   llvm::Value *Args[2];
132   Args[1] = CGF.EmitScalarExpr(E->getArg(1));
133   llvm::Type *ValueType = Args[1]->getType();
134   Args[1] = EmitToInt(CGF, Args[1], T, IntType);
135   Args[0] = CGF.Builder.CreateBitCast(DestPtr, IntPtrType);
136 
137   llvm::Value *Result =
138       CGF.Builder.CreateAtomicRMW(Kind, Args[0], Args[1],
139                                   llvm::SequentiallyConsistent);
140   Result = CGF.Builder.CreateBinOp(Op, Result, Args[1]);
141   Result = EmitFromInt(CGF, Result, T, ValueType);
142   return RValue::get(Result);
143 }
144 
145 /// EmitFAbs - Emit a call to fabs/fabsf/fabsl, depending on the type of ValTy,
146 /// which must be a scalar floating point type.
147 static Value *EmitFAbs(CodeGenFunction &CGF, Value *V, QualType ValTy) {
148   const BuiltinType *ValTyP = ValTy->getAs<BuiltinType>();
149   assert(ValTyP && "isn't scalar fp type!");
150 
151   StringRef FnName;
152   switch (ValTyP->getKind()) {
153   default: llvm_unreachable("Isn't a scalar fp type!");
154   case BuiltinType::Float:      FnName = "fabsf"; break;
155   case BuiltinType::Double:     FnName = "fabs"; break;
156   case BuiltinType::LongDouble: FnName = "fabsl"; break;
157   }
158 
159   // The prototype is something that takes and returns whatever V's type is.
160   llvm::FunctionType *FT = llvm::FunctionType::get(V->getType(), V->getType(),
161                                                    false);
162   llvm::Value *Fn = CGF.CGM.CreateRuntimeFunction(FT, FnName);
163 
164   return CGF.EmitNounwindRuntimeCall(Fn, V, "abs");
165 }
166 
167 static RValue emitLibraryCall(CodeGenFunction &CGF, const FunctionDecl *Fn,
168                               const CallExpr *E, llvm::Value *calleeValue) {
169   return CGF.EmitCall(E->getCallee()->getType(), calleeValue, E->getLocStart(),
170                       ReturnValueSlot(), E->arg_begin(), E->arg_end(), Fn);
171 }
172 
173 /// \brief Emit a call to llvm.{sadd,uadd,ssub,usub,smul,umul}.with.overflow.*
174 /// depending on IntrinsicID.
175 ///
176 /// \arg CGF The current codegen function.
177 /// \arg IntrinsicID The ID for the Intrinsic we wish to generate.
178 /// \arg X The first argument to the llvm.*.with.overflow.*.
179 /// \arg Y The second argument to the llvm.*.with.overflow.*.
180 /// \arg Carry The carry returned by the llvm.*.with.overflow.*.
181 /// \returns The result (i.e. sum/product) returned by the intrinsic.
182 static llvm::Value *EmitOverflowIntrinsic(CodeGenFunction &CGF,
183                                           const llvm::Intrinsic::ID IntrinsicID,
184                                           llvm::Value *X, llvm::Value *Y,
185                                           llvm::Value *&Carry) {
186   // Make sure we have integers of the same width.
187   assert(X->getType() == Y->getType() &&
188          "Arguments must be the same type. (Did you forget to make sure both "
189          "arguments have the same integer width?)");
190 
191   llvm::Value *Callee = CGF.CGM.getIntrinsic(IntrinsicID, X->getType());
192   llvm::Value *Tmp = CGF.Builder.CreateCall2(Callee, X, Y);
193   Carry = CGF.Builder.CreateExtractValue(Tmp, 1);
194   return CGF.Builder.CreateExtractValue(Tmp, 0);
195 }
196 
197 RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
198                                         unsigned BuiltinID, const CallExpr *E) {
199   // See if we can constant fold this builtin.  If so, don't emit it at all.
200   Expr::EvalResult Result;
201   if (E->EvaluateAsRValue(Result, CGM.getContext()) &&
202       !Result.hasSideEffects()) {
203     if (Result.Val.isInt())
204       return RValue::get(llvm::ConstantInt::get(getLLVMContext(),
205                                                 Result.Val.getInt()));
206     if (Result.Val.isFloat())
207       return RValue::get(llvm::ConstantFP::get(getLLVMContext(),
208                                                Result.Val.getFloat()));
209   }
210 
211   switch (BuiltinID) {
212   default: break;  // Handle intrinsics and libm functions below.
213   case Builtin::BI__builtin___CFStringMakeConstantString:
214   case Builtin::BI__builtin___NSStringMakeConstantString:
215     return RValue::get(CGM.EmitConstantExpr(E, E->getType(), 0));
216   case Builtin::BI__builtin_stdarg_start:
217   case Builtin::BI__builtin_va_start:
218   case Builtin::BI__va_start:
219   case Builtin::BI__builtin_va_end: {
220     Value *ArgValue = (BuiltinID == Builtin::BI__va_start)
221                           ? EmitScalarExpr(E->getArg(0))
222                           : EmitVAListRef(E->getArg(0));
223     llvm::Type *DestType = Int8PtrTy;
224     if (ArgValue->getType() != DestType)
225       ArgValue = Builder.CreateBitCast(ArgValue, DestType,
226                                        ArgValue->getName().data());
227 
228     Intrinsic::ID inst = (BuiltinID == Builtin::BI__builtin_va_end) ?
229       Intrinsic::vaend : Intrinsic::vastart;
230     return RValue::get(Builder.CreateCall(CGM.getIntrinsic(inst), ArgValue));
231   }
232   case Builtin::BI__builtin_va_copy: {
233     Value *DstPtr = EmitVAListRef(E->getArg(0));
234     Value *SrcPtr = EmitVAListRef(E->getArg(1));
235 
236     llvm::Type *Type = Int8PtrTy;
237 
238     DstPtr = Builder.CreateBitCast(DstPtr, Type);
239     SrcPtr = Builder.CreateBitCast(SrcPtr, Type);
240     return RValue::get(Builder.CreateCall2(CGM.getIntrinsic(Intrinsic::vacopy),
241                                            DstPtr, SrcPtr));
242   }
243   case Builtin::BI__builtin_abs:
244   case Builtin::BI__builtin_labs:
245   case Builtin::BI__builtin_llabs: {
246     Value *ArgValue = EmitScalarExpr(E->getArg(0));
247 
248     Value *NegOp = Builder.CreateNeg(ArgValue, "neg");
249     Value *CmpResult =
250     Builder.CreateICmpSGE(ArgValue,
251                           llvm::Constant::getNullValue(ArgValue->getType()),
252                                                             "abscond");
253     Value *Result =
254       Builder.CreateSelect(CmpResult, ArgValue, NegOp, "abs");
255 
256     return RValue::get(Result);
257   }
258 
259   case Builtin::BI__builtin_conj:
260   case Builtin::BI__builtin_conjf:
261   case Builtin::BI__builtin_conjl: {
262     ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
263     Value *Real = ComplexVal.first;
264     Value *Imag = ComplexVal.second;
265     Value *Zero =
266       Imag->getType()->isFPOrFPVectorTy()
267         ? llvm::ConstantFP::getZeroValueForNegation(Imag->getType())
268         : llvm::Constant::getNullValue(Imag->getType());
269 
270     Imag = Builder.CreateFSub(Zero, Imag, "sub");
271     return RValue::getComplex(std::make_pair(Real, Imag));
272   }
273   case Builtin::BI__builtin_creal:
274   case Builtin::BI__builtin_crealf:
275   case Builtin::BI__builtin_creall:
276   case Builtin::BIcreal:
277   case Builtin::BIcrealf:
278   case Builtin::BIcreall: {
279     ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
280     return RValue::get(ComplexVal.first);
281   }
282 
283   case Builtin::BI__builtin_cimag:
284   case Builtin::BI__builtin_cimagf:
285   case Builtin::BI__builtin_cimagl:
286   case Builtin::BIcimag:
287   case Builtin::BIcimagf:
288   case Builtin::BIcimagl: {
289     ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
290     return RValue::get(ComplexVal.second);
291   }
292 
293   case Builtin::BI__builtin_ctzs:
294   case Builtin::BI__builtin_ctz:
295   case Builtin::BI__builtin_ctzl:
296   case Builtin::BI__builtin_ctzll: {
297     Value *ArgValue = EmitScalarExpr(E->getArg(0));
298 
299     llvm::Type *ArgType = ArgValue->getType();
300     Value *F = CGM.getIntrinsic(Intrinsic::cttz, ArgType);
301 
302     llvm::Type *ResultType = ConvertType(E->getType());
303     Value *ZeroUndef = Builder.getInt1(getTarget().isCLZForZeroUndef());
304     Value *Result = Builder.CreateCall2(F, ArgValue, ZeroUndef);
305     if (Result->getType() != ResultType)
306       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
307                                      "cast");
308     return RValue::get(Result);
309   }
310   case Builtin::BI__builtin_clzs:
311   case Builtin::BI__builtin_clz:
312   case Builtin::BI__builtin_clzl:
313   case Builtin::BI__builtin_clzll: {
314     Value *ArgValue = EmitScalarExpr(E->getArg(0));
315 
316     llvm::Type *ArgType = ArgValue->getType();
317     Value *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
318 
319     llvm::Type *ResultType = ConvertType(E->getType());
320     Value *ZeroUndef = Builder.getInt1(getTarget().isCLZForZeroUndef());
321     Value *Result = Builder.CreateCall2(F, ArgValue, ZeroUndef);
322     if (Result->getType() != ResultType)
323       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
324                                      "cast");
325     return RValue::get(Result);
326   }
327   case Builtin::BI__builtin_ffs:
328   case Builtin::BI__builtin_ffsl:
329   case Builtin::BI__builtin_ffsll: {
330     // ffs(x) -> x ? cttz(x) + 1 : 0
331     Value *ArgValue = EmitScalarExpr(E->getArg(0));
332 
333     llvm::Type *ArgType = ArgValue->getType();
334     Value *F = CGM.getIntrinsic(Intrinsic::cttz, ArgType);
335 
336     llvm::Type *ResultType = ConvertType(E->getType());
337     Value *Tmp = Builder.CreateAdd(Builder.CreateCall2(F, ArgValue,
338                                                        Builder.getTrue()),
339                                    llvm::ConstantInt::get(ArgType, 1));
340     Value *Zero = llvm::Constant::getNullValue(ArgType);
341     Value *IsZero = Builder.CreateICmpEQ(ArgValue, Zero, "iszero");
342     Value *Result = Builder.CreateSelect(IsZero, Zero, Tmp, "ffs");
343     if (Result->getType() != ResultType)
344       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
345                                      "cast");
346     return RValue::get(Result);
347   }
348   case Builtin::BI__builtin_parity:
349   case Builtin::BI__builtin_parityl:
350   case Builtin::BI__builtin_parityll: {
351     // parity(x) -> ctpop(x) & 1
352     Value *ArgValue = EmitScalarExpr(E->getArg(0));
353 
354     llvm::Type *ArgType = ArgValue->getType();
355     Value *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType);
356 
357     llvm::Type *ResultType = ConvertType(E->getType());
358     Value *Tmp = Builder.CreateCall(F, ArgValue);
359     Value *Result = Builder.CreateAnd(Tmp, llvm::ConstantInt::get(ArgType, 1));
360     if (Result->getType() != ResultType)
361       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
362                                      "cast");
363     return RValue::get(Result);
364   }
365   case Builtin::BI__builtin_popcount:
366   case Builtin::BI__builtin_popcountl:
367   case Builtin::BI__builtin_popcountll: {
368     Value *ArgValue = EmitScalarExpr(E->getArg(0));
369 
370     llvm::Type *ArgType = ArgValue->getType();
371     Value *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType);
372 
373     llvm::Type *ResultType = ConvertType(E->getType());
374     Value *Result = Builder.CreateCall(F, ArgValue);
375     if (Result->getType() != ResultType)
376       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
377                                      "cast");
378     return RValue::get(Result);
379   }
380   case Builtin::BI__builtin_expect: {
381     Value *ArgValue = EmitScalarExpr(E->getArg(0));
382     llvm::Type *ArgType = ArgValue->getType();
383 
384     Value *FnExpect = CGM.getIntrinsic(Intrinsic::expect, ArgType);
385     Value *ExpectedValue = EmitScalarExpr(E->getArg(1));
386 
387     Value *Result = Builder.CreateCall2(FnExpect, ArgValue, ExpectedValue,
388                                         "expval");
389     return RValue::get(Result);
390   }
391   case Builtin::BI__builtin_bswap16:
392   case Builtin::BI__builtin_bswap32:
393   case Builtin::BI__builtin_bswap64: {
394     Value *ArgValue = EmitScalarExpr(E->getArg(0));
395     llvm::Type *ArgType = ArgValue->getType();
396     Value *F = CGM.getIntrinsic(Intrinsic::bswap, ArgType);
397     return RValue::get(Builder.CreateCall(F, ArgValue));
398   }
399   case Builtin::BI__builtin_object_size: {
400     // We rely on constant folding to deal with expressions with side effects.
401     assert(!E->getArg(0)->HasSideEffects(getContext()) &&
402            "should have been constant folded");
403 
404     // We pass this builtin onto the optimizer so that it can
405     // figure out the object size in more complex cases.
406     llvm::Type *ResType = ConvertType(E->getType());
407 
408     // LLVM only supports 0 and 2, make sure that we pass along that
409     // as a boolean.
410     Value *Ty = EmitScalarExpr(E->getArg(1));
411     ConstantInt *CI = dyn_cast<ConstantInt>(Ty);
412     assert(CI);
413     uint64_t val = CI->getZExtValue();
414     CI = ConstantInt::get(Builder.getInt1Ty(), (val & 0x2) >> 1);
415     // FIXME: Get right address space.
416     llvm::Type *Tys[] = { ResType, Builder.getInt8PtrTy(0) };
417     Value *F = CGM.getIntrinsic(Intrinsic::objectsize, Tys);
418     return RValue::get(Builder.CreateCall2(F, EmitScalarExpr(E->getArg(0)),CI));
419   }
420   case Builtin::BI__builtin_prefetch: {
421     Value *Locality, *RW, *Address = EmitScalarExpr(E->getArg(0));
422     // FIXME: Technically these constants should of type 'int', yes?
423     RW = (E->getNumArgs() > 1) ? EmitScalarExpr(E->getArg(1)) :
424       llvm::ConstantInt::get(Int32Ty, 0);
425     Locality = (E->getNumArgs() > 2) ? EmitScalarExpr(E->getArg(2)) :
426       llvm::ConstantInt::get(Int32Ty, 3);
427     Value *Data = llvm::ConstantInt::get(Int32Ty, 1);
428     Value *F = CGM.getIntrinsic(Intrinsic::prefetch);
429     return RValue::get(Builder.CreateCall4(F, Address, RW, Locality, Data));
430   }
431   case Builtin::BI__builtin_readcyclecounter: {
432     Value *F = CGM.getIntrinsic(Intrinsic::readcyclecounter);
433     return RValue::get(Builder.CreateCall(F));
434   }
435   case Builtin::BI__builtin___clear_cache: {
436     Value *Begin = EmitScalarExpr(E->getArg(0));
437     Value *End = EmitScalarExpr(E->getArg(1));
438     Value *F = CGM.getIntrinsic(Intrinsic::clear_cache);
439     return RValue::get(Builder.CreateCall2(F, Begin, End));
440   }
441   case Builtin::BI__builtin_trap: {
442     Value *F = CGM.getIntrinsic(Intrinsic::trap);
443     return RValue::get(Builder.CreateCall(F));
444   }
445   case Builtin::BI__debugbreak: {
446     Value *F = CGM.getIntrinsic(Intrinsic::debugtrap);
447     return RValue::get(Builder.CreateCall(F));
448   }
449   case Builtin::BI__builtin_unreachable: {
450     if (SanOpts->Unreachable)
451       EmitCheck(Builder.getFalse(), "builtin_unreachable",
452                 EmitCheckSourceLocation(E->getExprLoc()),
453                 ArrayRef<llvm::Value *>(), CRK_Unrecoverable);
454     else
455       Builder.CreateUnreachable();
456 
457     // We do need to preserve an insertion point.
458     EmitBlock(createBasicBlock("unreachable.cont"));
459 
460     return RValue::get(0);
461   }
462 
463   case Builtin::BI__builtin_powi:
464   case Builtin::BI__builtin_powif:
465   case Builtin::BI__builtin_powil: {
466     Value *Base = EmitScalarExpr(E->getArg(0));
467     Value *Exponent = EmitScalarExpr(E->getArg(1));
468     llvm::Type *ArgType = Base->getType();
469     Value *F = CGM.getIntrinsic(Intrinsic::powi, ArgType);
470     return RValue::get(Builder.CreateCall2(F, Base, Exponent));
471   }
472 
473   case Builtin::BI__builtin_isgreater:
474   case Builtin::BI__builtin_isgreaterequal:
475   case Builtin::BI__builtin_isless:
476   case Builtin::BI__builtin_islessequal:
477   case Builtin::BI__builtin_islessgreater:
478   case Builtin::BI__builtin_isunordered: {
479     // Ordered comparisons: we know the arguments to these are matching scalar
480     // floating point values.
481     Value *LHS = EmitScalarExpr(E->getArg(0));
482     Value *RHS = EmitScalarExpr(E->getArg(1));
483 
484     switch (BuiltinID) {
485     default: llvm_unreachable("Unknown ordered comparison");
486     case Builtin::BI__builtin_isgreater:
487       LHS = Builder.CreateFCmpOGT(LHS, RHS, "cmp");
488       break;
489     case Builtin::BI__builtin_isgreaterequal:
490       LHS = Builder.CreateFCmpOGE(LHS, RHS, "cmp");
491       break;
492     case Builtin::BI__builtin_isless:
493       LHS = Builder.CreateFCmpOLT(LHS, RHS, "cmp");
494       break;
495     case Builtin::BI__builtin_islessequal:
496       LHS = Builder.CreateFCmpOLE(LHS, RHS, "cmp");
497       break;
498     case Builtin::BI__builtin_islessgreater:
499       LHS = Builder.CreateFCmpONE(LHS, RHS, "cmp");
500       break;
501     case Builtin::BI__builtin_isunordered:
502       LHS = Builder.CreateFCmpUNO(LHS, RHS, "cmp");
503       break;
504     }
505     // ZExt bool to int type.
506     return RValue::get(Builder.CreateZExt(LHS, ConvertType(E->getType())));
507   }
508   case Builtin::BI__builtin_isnan: {
509     Value *V = EmitScalarExpr(E->getArg(0));
510     V = Builder.CreateFCmpUNO(V, V, "cmp");
511     return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType())));
512   }
513 
514   case Builtin::BI__builtin_isinf: {
515     // isinf(x) --> fabs(x) == infinity
516     Value *V = EmitScalarExpr(E->getArg(0));
517     V = EmitFAbs(*this, V, E->getArg(0)->getType());
518 
519     V = Builder.CreateFCmpOEQ(V, ConstantFP::getInfinity(V->getType()),"isinf");
520     return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType())));
521   }
522 
523   // TODO: BI__builtin_isinf_sign
524   //   isinf_sign(x) -> isinf(x) ? (signbit(x) ? -1 : 1) : 0
525 
526   case Builtin::BI__builtin_isnormal: {
527     // isnormal(x) --> x == x && fabsf(x) < infinity && fabsf(x) >= float_min
528     Value *V = EmitScalarExpr(E->getArg(0));
529     Value *Eq = Builder.CreateFCmpOEQ(V, V, "iseq");
530 
531     Value *Abs = EmitFAbs(*this, V, E->getArg(0)->getType());
532     Value *IsLessThanInf =
533       Builder.CreateFCmpULT(Abs, ConstantFP::getInfinity(V->getType()),"isinf");
534     APFloat Smallest = APFloat::getSmallestNormalized(
535                    getContext().getFloatTypeSemantics(E->getArg(0)->getType()));
536     Value *IsNormal =
537       Builder.CreateFCmpUGE(Abs, ConstantFP::get(V->getContext(), Smallest),
538                             "isnormal");
539     V = Builder.CreateAnd(Eq, IsLessThanInf, "and");
540     V = Builder.CreateAnd(V, IsNormal, "and");
541     return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType())));
542   }
543 
544   case Builtin::BI__builtin_isfinite: {
545     // isfinite(x) --> x == x && fabs(x) != infinity;
546     Value *V = EmitScalarExpr(E->getArg(0));
547     Value *Eq = Builder.CreateFCmpOEQ(V, V, "iseq");
548 
549     Value *Abs = EmitFAbs(*this, V, E->getArg(0)->getType());
550     Value *IsNotInf =
551       Builder.CreateFCmpUNE(Abs, ConstantFP::getInfinity(V->getType()),"isinf");
552 
553     V = Builder.CreateAnd(Eq, IsNotInf, "and");
554     return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType())));
555   }
556 
557   case Builtin::BI__builtin_fpclassify: {
558     Value *V = EmitScalarExpr(E->getArg(5));
559     llvm::Type *Ty = ConvertType(E->getArg(5)->getType());
560 
561     // Create Result
562     BasicBlock *Begin = Builder.GetInsertBlock();
563     BasicBlock *End = createBasicBlock("fpclassify_end", this->CurFn);
564     Builder.SetInsertPoint(End);
565     PHINode *Result =
566       Builder.CreatePHI(ConvertType(E->getArg(0)->getType()), 4,
567                         "fpclassify_result");
568 
569     // if (V==0) return FP_ZERO
570     Builder.SetInsertPoint(Begin);
571     Value *IsZero = Builder.CreateFCmpOEQ(V, Constant::getNullValue(Ty),
572                                           "iszero");
573     Value *ZeroLiteral = EmitScalarExpr(E->getArg(4));
574     BasicBlock *NotZero = createBasicBlock("fpclassify_not_zero", this->CurFn);
575     Builder.CreateCondBr(IsZero, End, NotZero);
576     Result->addIncoming(ZeroLiteral, Begin);
577 
578     // if (V != V) return FP_NAN
579     Builder.SetInsertPoint(NotZero);
580     Value *IsNan = Builder.CreateFCmpUNO(V, V, "cmp");
581     Value *NanLiteral = EmitScalarExpr(E->getArg(0));
582     BasicBlock *NotNan = createBasicBlock("fpclassify_not_nan", this->CurFn);
583     Builder.CreateCondBr(IsNan, End, NotNan);
584     Result->addIncoming(NanLiteral, NotZero);
585 
586     // if (fabs(V) == infinity) return FP_INFINITY
587     Builder.SetInsertPoint(NotNan);
588     Value *VAbs = EmitFAbs(*this, V, E->getArg(5)->getType());
589     Value *IsInf =
590       Builder.CreateFCmpOEQ(VAbs, ConstantFP::getInfinity(V->getType()),
591                             "isinf");
592     Value *InfLiteral = EmitScalarExpr(E->getArg(1));
593     BasicBlock *NotInf = createBasicBlock("fpclassify_not_inf", this->CurFn);
594     Builder.CreateCondBr(IsInf, End, NotInf);
595     Result->addIncoming(InfLiteral, NotNan);
596 
597     // if (fabs(V) >= MIN_NORMAL) return FP_NORMAL else FP_SUBNORMAL
598     Builder.SetInsertPoint(NotInf);
599     APFloat Smallest = APFloat::getSmallestNormalized(
600         getContext().getFloatTypeSemantics(E->getArg(5)->getType()));
601     Value *IsNormal =
602       Builder.CreateFCmpUGE(VAbs, ConstantFP::get(V->getContext(), Smallest),
603                             "isnormal");
604     Value *NormalResult =
605       Builder.CreateSelect(IsNormal, EmitScalarExpr(E->getArg(2)),
606                            EmitScalarExpr(E->getArg(3)));
607     Builder.CreateBr(End);
608     Result->addIncoming(NormalResult, NotInf);
609 
610     // return Result
611     Builder.SetInsertPoint(End);
612     return RValue::get(Result);
613   }
614 
615   case Builtin::BIalloca:
616   case Builtin::BI_alloca:
617   case Builtin::BI__builtin_alloca: {
618     Value *Size = EmitScalarExpr(E->getArg(0));
619     return RValue::get(Builder.CreateAlloca(Builder.getInt8Ty(), Size));
620   }
621   case Builtin::BIbzero:
622   case Builtin::BI__builtin_bzero: {
623     std::pair<llvm::Value*, unsigned> Dest =
624         EmitPointerWithAlignment(E->getArg(0));
625     Value *SizeVal = EmitScalarExpr(E->getArg(1));
626     Builder.CreateMemSet(Dest.first, Builder.getInt8(0), SizeVal,
627                          Dest.second, false);
628     return RValue::get(Dest.first);
629   }
630   case Builtin::BImemcpy:
631   case Builtin::BI__builtin_memcpy: {
632     std::pair<llvm::Value*, unsigned> Dest =
633         EmitPointerWithAlignment(E->getArg(0));
634     std::pair<llvm::Value*, unsigned> Src =
635         EmitPointerWithAlignment(E->getArg(1));
636     Value *SizeVal = EmitScalarExpr(E->getArg(2));
637     unsigned Align = std::min(Dest.second, Src.second);
638     Builder.CreateMemCpy(Dest.first, Src.first, SizeVal, Align, false);
639     return RValue::get(Dest.first);
640   }
641 
642   case Builtin::BI__builtin___memcpy_chk: {
643     // fold __builtin_memcpy_chk(x, y, cst1, cst2) to memcpy iff cst1<=cst2.
644     llvm::APSInt Size, DstSize;
645     if (!E->getArg(2)->EvaluateAsInt(Size, CGM.getContext()) ||
646         !E->getArg(3)->EvaluateAsInt(DstSize, CGM.getContext()))
647       break;
648     if (Size.ugt(DstSize))
649       break;
650     std::pair<llvm::Value*, unsigned> Dest =
651         EmitPointerWithAlignment(E->getArg(0));
652     std::pair<llvm::Value*, unsigned> Src =
653         EmitPointerWithAlignment(E->getArg(1));
654     Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size);
655     unsigned Align = std::min(Dest.second, Src.second);
656     Builder.CreateMemCpy(Dest.first, Src.first, SizeVal, Align, false);
657     return RValue::get(Dest.first);
658   }
659 
660   case Builtin::BI__builtin_objc_memmove_collectable: {
661     Value *Address = EmitScalarExpr(E->getArg(0));
662     Value *SrcAddr = EmitScalarExpr(E->getArg(1));
663     Value *SizeVal = EmitScalarExpr(E->getArg(2));
664     CGM.getObjCRuntime().EmitGCMemmoveCollectable(*this,
665                                                   Address, SrcAddr, SizeVal);
666     return RValue::get(Address);
667   }
668 
669   case Builtin::BI__builtin___memmove_chk: {
670     // fold __builtin_memmove_chk(x, y, cst1, cst2) to memmove iff cst1<=cst2.
671     llvm::APSInt Size, DstSize;
672     if (!E->getArg(2)->EvaluateAsInt(Size, CGM.getContext()) ||
673         !E->getArg(3)->EvaluateAsInt(DstSize, CGM.getContext()))
674       break;
675     if (Size.ugt(DstSize))
676       break;
677     std::pair<llvm::Value*, unsigned> Dest =
678         EmitPointerWithAlignment(E->getArg(0));
679     std::pair<llvm::Value*, unsigned> Src =
680         EmitPointerWithAlignment(E->getArg(1));
681     Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size);
682     unsigned Align = std::min(Dest.second, Src.second);
683     Builder.CreateMemMove(Dest.first, Src.first, SizeVal, Align, false);
684     return RValue::get(Dest.first);
685   }
686 
687   case Builtin::BImemmove:
688   case Builtin::BI__builtin_memmove: {
689     std::pair<llvm::Value*, unsigned> Dest =
690         EmitPointerWithAlignment(E->getArg(0));
691     std::pair<llvm::Value*, unsigned> Src =
692         EmitPointerWithAlignment(E->getArg(1));
693     Value *SizeVal = EmitScalarExpr(E->getArg(2));
694     unsigned Align = std::min(Dest.second, Src.second);
695     Builder.CreateMemMove(Dest.first, Src.first, SizeVal, Align, false);
696     return RValue::get(Dest.first);
697   }
698   case Builtin::BImemset:
699   case Builtin::BI__builtin_memset: {
700     std::pair<llvm::Value*, unsigned> Dest =
701         EmitPointerWithAlignment(E->getArg(0));
702     Value *ByteVal = Builder.CreateTrunc(EmitScalarExpr(E->getArg(1)),
703                                          Builder.getInt8Ty());
704     Value *SizeVal = EmitScalarExpr(E->getArg(2));
705     Builder.CreateMemSet(Dest.first, ByteVal, SizeVal, Dest.second, false);
706     return RValue::get(Dest.first);
707   }
708   case Builtin::BI__builtin___memset_chk: {
709     // fold __builtin_memset_chk(x, y, cst1, cst2) to memset iff cst1<=cst2.
710     llvm::APSInt Size, DstSize;
711     if (!E->getArg(2)->EvaluateAsInt(Size, CGM.getContext()) ||
712         !E->getArg(3)->EvaluateAsInt(DstSize, CGM.getContext()))
713       break;
714     if (Size.ugt(DstSize))
715       break;
716     std::pair<llvm::Value*, unsigned> Dest =
717         EmitPointerWithAlignment(E->getArg(0));
718     Value *ByteVal = Builder.CreateTrunc(EmitScalarExpr(E->getArg(1)),
719                                          Builder.getInt8Ty());
720     Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size);
721     Builder.CreateMemSet(Dest.first, ByteVal, SizeVal, Dest.second, false);
722     return RValue::get(Dest.first);
723   }
724   case Builtin::BI__builtin_dwarf_cfa: {
725     // The offset in bytes from the first argument to the CFA.
726     //
727     // Why on earth is this in the frontend?  Is there any reason at
728     // all that the backend can't reasonably determine this while
729     // lowering llvm.eh.dwarf.cfa()?
730     //
731     // TODO: If there's a satisfactory reason, add a target hook for
732     // this instead of hard-coding 0, which is correct for most targets.
733     int32_t Offset = 0;
734 
735     Value *F = CGM.getIntrinsic(Intrinsic::eh_dwarf_cfa);
736     return RValue::get(Builder.CreateCall(F,
737                                       llvm::ConstantInt::get(Int32Ty, Offset)));
738   }
739   case Builtin::BI__builtin_return_address: {
740     Value *Depth = EmitScalarExpr(E->getArg(0));
741     Depth = Builder.CreateIntCast(Depth, Int32Ty, false);
742     Value *F = CGM.getIntrinsic(Intrinsic::returnaddress);
743     return RValue::get(Builder.CreateCall(F, Depth));
744   }
745   case Builtin::BI__builtin_frame_address: {
746     Value *Depth = EmitScalarExpr(E->getArg(0));
747     Depth = Builder.CreateIntCast(Depth, Int32Ty, false);
748     Value *F = CGM.getIntrinsic(Intrinsic::frameaddress);
749     return RValue::get(Builder.CreateCall(F, Depth));
750   }
751   case Builtin::BI__builtin_extract_return_addr: {
752     Value *Address = EmitScalarExpr(E->getArg(0));
753     Value *Result = getTargetHooks().decodeReturnAddress(*this, Address);
754     return RValue::get(Result);
755   }
756   case Builtin::BI__builtin_frob_return_addr: {
757     Value *Address = EmitScalarExpr(E->getArg(0));
758     Value *Result = getTargetHooks().encodeReturnAddress(*this, Address);
759     return RValue::get(Result);
760   }
761   case Builtin::BI__builtin_dwarf_sp_column: {
762     llvm::IntegerType *Ty
763       = cast<llvm::IntegerType>(ConvertType(E->getType()));
764     int Column = getTargetHooks().getDwarfEHStackPointer(CGM);
765     if (Column == -1) {
766       CGM.ErrorUnsupported(E, "__builtin_dwarf_sp_column");
767       return RValue::get(llvm::UndefValue::get(Ty));
768     }
769     return RValue::get(llvm::ConstantInt::get(Ty, Column, true));
770   }
771   case Builtin::BI__builtin_init_dwarf_reg_size_table: {
772     Value *Address = EmitScalarExpr(E->getArg(0));
773     if (getTargetHooks().initDwarfEHRegSizeTable(*this, Address))
774       CGM.ErrorUnsupported(E, "__builtin_init_dwarf_reg_size_table");
775     return RValue::get(llvm::UndefValue::get(ConvertType(E->getType())));
776   }
777   case Builtin::BI__builtin_eh_return: {
778     Value *Int = EmitScalarExpr(E->getArg(0));
779     Value *Ptr = EmitScalarExpr(E->getArg(1));
780 
781     llvm::IntegerType *IntTy = cast<llvm::IntegerType>(Int->getType());
782     assert((IntTy->getBitWidth() == 32 || IntTy->getBitWidth() == 64) &&
783            "LLVM's __builtin_eh_return only supports 32- and 64-bit variants");
784     Value *F = CGM.getIntrinsic(IntTy->getBitWidth() == 32
785                                   ? Intrinsic::eh_return_i32
786                                   : Intrinsic::eh_return_i64);
787     Builder.CreateCall2(F, Int, Ptr);
788     Builder.CreateUnreachable();
789 
790     // We do need to preserve an insertion point.
791     EmitBlock(createBasicBlock("builtin_eh_return.cont"));
792 
793     return RValue::get(0);
794   }
795   case Builtin::BI__builtin_unwind_init: {
796     Value *F = CGM.getIntrinsic(Intrinsic::eh_unwind_init);
797     return RValue::get(Builder.CreateCall(F));
798   }
799   case Builtin::BI__builtin_extend_pointer: {
800     // Extends a pointer to the size of an _Unwind_Word, which is
801     // uint64_t on all platforms.  Generally this gets poked into a
802     // register and eventually used as an address, so if the
803     // addressing registers are wider than pointers and the platform
804     // doesn't implicitly ignore high-order bits when doing
805     // addressing, we need to make sure we zext / sext based on
806     // the platform's expectations.
807     //
808     // See: http://gcc.gnu.org/ml/gcc-bugs/2002-02/msg00237.html
809 
810     // Cast the pointer to intptr_t.
811     Value *Ptr = EmitScalarExpr(E->getArg(0));
812     Value *Result = Builder.CreatePtrToInt(Ptr, IntPtrTy, "extend.cast");
813 
814     // If that's 64 bits, we're done.
815     if (IntPtrTy->getBitWidth() == 64)
816       return RValue::get(Result);
817 
818     // Otherwise, ask the codegen data what to do.
819     if (getTargetHooks().extendPointerWithSExt())
820       return RValue::get(Builder.CreateSExt(Result, Int64Ty, "extend.sext"));
821     else
822       return RValue::get(Builder.CreateZExt(Result, Int64Ty, "extend.zext"));
823   }
824   case Builtin::BI__builtin_setjmp: {
825     // Buffer is a void**.
826     Value *Buf = EmitScalarExpr(E->getArg(0));
827 
828     // Store the frame pointer to the setjmp buffer.
829     Value *FrameAddr =
830       Builder.CreateCall(CGM.getIntrinsic(Intrinsic::frameaddress),
831                          ConstantInt::get(Int32Ty, 0));
832     Builder.CreateStore(FrameAddr, Buf);
833 
834     // Store the stack pointer to the setjmp buffer.
835     Value *StackAddr =
836       Builder.CreateCall(CGM.getIntrinsic(Intrinsic::stacksave));
837     Value *StackSaveSlot =
838       Builder.CreateGEP(Buf, ConstantInt::get(Int32Ty, 2));
839     Builder.CreateStore(StackAddr, StackSaveSlot);
840 
841     // Call LLVM's EH setjmp, which is lightweight.
842     Value *F = CGM.getIntrinsic(Intrinsic::eh_sjlj_setjmp);
843     Buf = Builder.CreateBitCast(Buf, Int8PtrTy);
844     return RValue::get(Builder.CreateCall(F, Buf));
845   }
846   case Builtin::BI__builtin_longjmp: {
847     Value *Buf = EmitScalarExpr(E->getArg(0));
848     Buf = Builder.CreateBitCast(Buf, Int8PtrTy);
849 
850     // Call LLVM's EH longjmp, which is lightweight.
851     Builder.CreateCall(CGM.getIntrinsic(Intrinsic::eh_sjlj_longjmp), Buf);
852 
853     // longjmp doesn't return; mark this as unreachable.
854     Builder.CreateUnreachable();
855 
856     // We do need to preserve an insertion point.
857     EmitBlock(createBasicBlock("longjmp.cont"));
858 
859     return RValue::get(0);
860   }
861   case Builtin::BI__sync_fetch_and_add:
862   case Builtin::BI__sync_fetch_and_sub:
863   case Builtin::BI__sync_fetch_and_or:
864   case Builtin::BI__sync_fetch_and_and:
865   case Builtin::BI__sync_fetch_and_xor:
866   case Builtin::BI__sync_add_and_fetch:
867   case Builtin::BI__sync_sub_and_fetch:
868   case Builtin::BI__sync_and_and_fetch:
869   case Builtin::BI__sync_or_and_fetch:
870   case Builtin::BI__sync_xor_and_fetch:
871   case Builtin::BI__sync_val_compare_and_swap:
872   case Builtin::BI__sync_bool_compare_and_swap:
873   case Builtin::BI__sync_lock_test_and_set:
874   case Builtin::BI__sync_lock_release:
875   case Builtin::BI__sync_swap:
876     llvm_unreachable("Shouldn't make it through sema");
877   case Builtin::BI__sync_fetch_and_add_1:
878   case Builtin::BI__sync_fetch_and_add_2:
879   case Builtin::BI__sync_fetch_and_add_4:
880   case Builtin::BI__sync_fetch_and_add_8:
881   case Builtin::BI__sync_fetch_and_add_16:
882     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Add, E);
883   case Builtin::BI__sync_fetch_and_sub_1:
884   case Builtin::BI__sync_fetch_and_sub_2:
885   case Builtin::BI__sync_fetch_and_sub_4:
886   case Builtin::BI__sync_fetch_and_sub_8:
887   case Builtin::BI__sync_fetch_and_sub_16:
888     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Sub, E);
889   case Builtin::BI__sync_fetch_and_or_1:
890   case Builtin::BI__sync_fetch_and_or_2:
891   case Builtin::BI__sync_fetch_and_or_4:
892   case Builtin::BI__sync_fetch_and_or_8:
893   case Builtin::BI__sync_fetch_and_or_16:
894     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Or, E);
895   case Builtin::BI__sync_fetch_and_and_1:
896   case Builtin::BI__sync_fetch_and_and_2:
897   case Builtin::BI__sync_fetch_and_and_4:
898   case Builtin::BI__sync_fetch_and_and_8:
899   case Builtin::BI__sync_fetch_and_and_16:
900     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::And, E);
901   case Builtin::BI__sync_fetch_and_xor_1:
902   case Builtin::BI__sync_fetch_and_xor_2:
903   case Builtin::BI__sync_fetch_and_xor_4:
904   case Builtin::BI__sync_fetch_and_xor_8:
905   case Builtin::BI__sync_fetch_and_xor_16:
906     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xor, E);
907 
908   // Clang extensions: not overloaded yet.
909   case Builtin::BI__sync_fetch_and_min:
910     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Min, E);
911   case Builtin::BI__sync_fetch_and_max:
912     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Max, E);
913   case Builtin::BI__sync_fetch_and_umin:
914     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::UMin, E);
915   case Builtin::BI__sync_fetch_and_umax:
916     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::UMax, E);
917 
918   case Builtin::BI__sync_add_and_fetch_1:
919   case Builtin::BI__sync_add_and_fetch_2:
920   case Builtin::BI__sync_add_and_fetch_4:
921   case Builtin::BI__sync_add_and_fetch_8:
922   case Builtin::BI__sync_add_and_fetch_16:
923     return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Add, E,
924                                 llvm::Instruction::Add);
925   case Builtin::BI__sync_sub_and_fetch_1:
926   case Builtin::BI__sync_sub_and_fetch_2:
927   case Builtin::BI__sync_sub_and_fetch_4:
928   case Builtin::BI__sync_sub_and_fetch_8:
929   case Builtin::BI__sync_sub_and_fetch_16:
930     return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Sub, E,
931                                 llvm::Instruction::Sub);
932   case Builtin::BI__sync_and_and_fetch_1:
933   case Builtin::BI__sync_and_and_fetch_2:
934   case Builtin::BI__sync_and_and_fetch_4:
935   case Builtin::BI__sync_and_and_fetch_8:
936   case Builtin::BI__sync_and_and_fetch_16:
937     return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::And, E,
938                                 llvm::Instruction::And);
939   case Builtin::BI__sync_or_and_fetch_1:
940   case Builtin::BI__sync_or_and_fetch_2:
941   case Builtin::BI__sync_or_and_fetch_4:
942   case Builtin::BI__sync_or_and_fetch_8:
943   case Builtin::BI__sync_or_and_fetch_16:
944     return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Or, E,
945                                 llvm::Instruction::Or);
946   case Builtin::BI__sync_xor_and_fetch_1:
947   case Builtin::BI__sync_xor_and_fetch_2:
948   case Builtin::BI__sync_xor_and_fetch_4:
949   case Builtin::BI__sync_xor_and_fetch_8:
950   case Builtin::BI__sync_xor_and_fetch_16:
951     return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Xor, E,
952                                 llvm::Instruction::Xor);
953 
954   case Builtin::BI__sync_val_compare_and_swap_1:
955   case Builtin::BI__sync_val_compare_and_swap_2:
956   case Builtin::BI__sync_val_compare_and_swap_4:
957   case Builtin::BI__sync_val_compare_and_swap_8:
958   case Builtin::BI__sync_val_compare_and_swap_16: {
959     QualType T = E->getType();
960     llvm::Value *DestPtr = EmitScalarExpr(E->getArg(0));
961     unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();
962 
963     llvm::IntegerType *IntType =
964       llvm::IntegerType::get(getLLVMContext(),
965                              getContext().getTypeSize(T));
966     llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);
967 
968     Value *Args[3];
969     Args[0] = Builder.CreateBitCast(DestPtr, IntPtrType);
970     Args[1] = EmitScalarExpr(E->getArg(1));
971     llvm::Type *ValueType = Args[1]->getType();
972     Args[1] = EmitToInt(*this, Args[1], T, IntType);
973     Args[2] = EmitToInt(*this, EmitScalarExpr(E->getArg(2)), T, IntType);
974 
975     Value *Result = Builder.CreateAtomicCmpXchg(Args[0], Args[1], Args[2],
976                                                 llvm::SequentiallyConsistent,
977                                                 llvm::SequentiallyConsistent);
978     Result = EmitFromInt(*this, Result, T, ValueType);
979     return RValue::get(Result);
980   }
981 
982   case Builtin::BI__sync_bool_compare_and_swap_1:
983   case Builtin::BI__sync_bool_compare_and_swap_2:
984   case Builtin::BI__sync_bool_compare_and_swap_4:
985   case Builtin::BI__sync_bool_compare_and_swap_8:
986   case Builtin::BI__sync_bool_compare_and_swap_16: {
987     QualType T = E->getArg(1)->getType();
988     llvm::Value *DestPtr = EmitScalarExpr(E->getArg(0));
989     unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();
990 
991     llvm::IntegerType *IntType =
992       llvm::IntegerType::get(getLLVMContext(),
993                              getContext().getTypeSize(T));
994     llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);
995 
996     Value *Args[3];
997     Args[0] = Builder.CreateBitCast(DestPtr, IntPtrType);
998     Args[1] = EmitToInt(*this, EmitScalarExpr(E->getArg(1)), T, IntType);
999     Args[2] = EmitToInt(*this, EmitScalarExpr(E->getArg(2)), T, IntType);
1000 
1001     Value *OldVal = Args[1];
1002     Value *PrevVal = Builder.CreateAtomicCmpXchg(Args[0], Args[1], Args[2],
1003                                                  llvm::SequentiallyConsistent,
1004                                                  llvm::SequentiallyConsistent);
1005     Value *Result = Builder.CreateICmpEQ(PrevVal, OldVal);
1006     // zext bool to int.
1007     Result = Builder.CreateZExt(Result, ConvertType(E->getType()));
1008     return RValue::get(Result);
1009   }
1010 
1011   case Builtin::BI__sync_swap_1:
1012   case Builtin::BI__sync_swap_2:
1013   case Builtin::BI__sync_swap_4:
1014   case Builtin::BI__sync_swap_8:
1015   case Builtin::BI__sync_swap_16:
1016     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xchg, E);
1017 
1018   case Builtin::BI__sync_lock_test_and_set_1:
1019   case Builtin::BI__sync_lock_test_and_set_2:
1020   case Builtin::BI__sync_lock_test_and_set_4:
1021   case Builtin::BI__sync_lock_test_and_set_8:
1022   case Builtin::BI__sync_lock_test_and_set_16:
1023     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xchg, E);
1024 
1025   case Builtin::BI__sync_lock_release_1:
1026   case Builtin::BI__sync_lock_release_2:
1027   case Builtin::BI__sync_lock_release_4:
1028   case Builtin::BI__sync_lock_release_8:
1029   case Builtin::BI__sync_lock_release_16: {
1030     Value *Ptr = EmitScalarExpr(E->getArg(0));
1031     QualType ElTy = E->getArg(0)->getType()->getPointeeType();
1032     CharUnits StoreSize = getContext().getTypeSizeInChars(ElTy);
1033     llvm::Type *ITy = llvm::IntegerType::get(getLLVMContext(),
1034                                              StoreSize.getQuantity() * 8);
1035     Ptr = Builder.CreateBitCast(Ptr, ITy->getPointerTo());
1036     llvm::StoreInst *Store =
1037       Builder.CreateStore(llvm::Constant::getNullValue(ITy), Ptr);
1038     Store->setAlignment(StoreSize.getQuantity());
1039     Store->setAtomic(llvm::Release);
1040     return RValue::get(0);
1041   }
1042 
1043   case Builtin::BI__sync_synchronize: {
1044     // We assume this is supposed to correspond to a C++0x-style
1045     // sequentially-consistent fence (i.e. this is only usable for
1046     // synchonization, not device I/O or anything like that). This intrinsic
1047     // is really badly designed in the sense that in theory, there isn't
1048     // any way to safely use it... but in practice, it mostly works
1049     // to use it with non-atomic loads and stores to get acquire/release
1050     // semantics.
1051     Builder.CreateFence(llvm::SequentiallyConsistent);
1052     return RValue::get(0);
1053   }
1054 
1055   case Builtin::BI__c11_atomic_is_lock_free:
1056   case Builtin::BI__atomic_is_lock_free: {
1057     // Call "bool __atomic_is_lock_free(size_t size, void *ptr)". For the
1058     // __c11 builtin, ptr is 0 (indicating a properly-aligned object), since
1059     // _Atomic(T) is always properly-aligned.
1060     const char *LibCallName = "__atomic_is_lock_free";
1061     CallArgList Args;
1062     Args.add(RValue::get(EmitScalarExpr(E->getArg(0))),
1063              getContext().getSizeType());
1064     if (BuiltinID == Builtin::BI__atomic_is_lock_free)
1065       Args.add(RValue::get(EmitScalarExpr(E->getArg(1))),
1066                getContext().VoidPtrTy);
1067     else
1068       Args.add(RValue::get(llvm::Constant::getNullValue(VoidPtrTy)),
1069                getContext().VoidPtrTy);
1070     const CGFunctionInfo &FuncInfo =
1071         CGM.getTypes().arrangeFreeFunctionCall(E->getType(), Args,
1072                                                FunctionType::ExtInfo(),
1073                                                RequiredArgs::All);
1074     llvm::FunctionType *FTy = CGM.getTypes().GetFunctionType(FuncInfo);
1075     llvm::Constant *Func = CGM.CreateRuntimeFunction(FTy, LibCallName);
1076     return EmitCall(FuncInfo, Func, ReturnValueSlot(), Args);
1077   }
1078 
1079   case Builtin::BI__atomic_test_and_set: {
1080     // Look at the argument type to determine whether this is a volatile
1081     // operation. The parameter type is always volatile.
1082     QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
1083     bool Volatile =
1084         PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
1085 
1086     Value *Ptr = EmitScalarExpr(E->getArg(0));
1087     unsigned AddrSpace = Ptr->getType()->getPointerAddressSpace();
1088     Ptr = Builder.CreateBitCast(Ptr, Int8Ty->getPointerTo(AddrSpace));
1089     Value *NewVal = Builder.getInt8(1);
1090     Value *Order = EmitScalarExpr(E->getArg(1));
1091     if (isa<llvm::ConstantInt>(Order)) {
1092       int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
1093       AtomicRMWInst *Result = 0;
1094       switch (ord) {
1095       case 0:  // memory_order_relaxed
1096       default: // invalid order
1097         Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg,
1098                                          Ptr, NewVal,
1099                                          llvm::Monotonic);
1100         break;
1101       case 1:  // memory_order_consume
1102       case 2:  // memory_order_acquire
1103         Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg,
1104                                          Ptr, NewVal,
1105                                          llvm::Acquire);
1106         break;
1107       case 3:  // memory_order_release
1108         Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg,
1109                                          Ptr, NewVal,
1110                                          llvm::Release);
1111         break;
1112       case 4:  // memory_order_acq_rel
1113         Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg,
1114                                          Ptr, NewVal,
1115                                          llvm::AcquireRelease);
1116         break;
1117       case 5:  // memory_order_seq_cst
1118         Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg,
1119                                          Ptr, NewVal,
1120                                          llvm::SequentiallyConsistent);
1121         break;
1122       }
1123       Result->setVolatile(Volatile);
1124       return RValue::get(Builder.CreateIsNotNull(Result, "tobool"));
1125     }
1126 
1127     llvm::BasicBlock *ContBB = createBasicBlock("atomic.continue", CurFn);
1128 
1129     llvm::BasicBlock *BBs[5] = {
1130       createBasicBlock("monotonic", CurFn),
1131       createBasicBlock("acquire", CurFn),
1132       createBasicBlock("release", CurFn),
1133       createBasicBlock("acqrel", CurFn),
1134       createBasicBlock("seqcst", CurFn)
1135     };
1136     llvm::AtomicOrdering Orders[5] = {
1137       llvm::Monotonic, llvm::Acquire, llvm::Release,
1138       llvm::AcquireRelease, llvm::SequentiallyConsistent
1139     };
1140 
1141     Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
1142     llvm::SwitchInst *SI = Builder.CreateSwitch(Order, BBs[0]);
1143 
1144     Builder.SetInsertPoint(ContBB);
1145     PHINode *Result = Builder.CreatePHI(Int8Ty, 5, "was_set");
1146 
1147     for (unsigned i = 0; i < 5; ++i) {
1148       Builder.SetInsertPoint(BBs[i]);
1149       AtomicRMWInst *RMW = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg,
1150                                                    Ptr, NewVal, Orders[i]);
1151       RMW->setVolatile(Volatile);
1152       Result->addIncoming(RMW, BBs[i]);
1153       Builder.CreateBr(ContBB);
1154     }
1155 
1156     SI->addCase(Builder.getInt32(0), BBs[0]);
1157     SI->addCase(Builder.getInt32(1), BBs[1]);
1158     SI->addCase(Builder.getInt32(2), BBs[1]);
1159     SI->addCase(Builder.getInt32(3), BBs[2]);
1160     SI->addCase(Builder.getInt32(4), BBs[3]);
1161     SI->addCase(Builder.getInt32(5), BBs[4]);
1162 
1163     Builder.SetInsertPoint(ContBB);
1164     return RValue::get(Builder.CreateIsNotNull(Result, "tobool"));
1165   }
1166 
1167   case Builtin::BI__atomic_clear: {
1168     QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
1169     bool Volatile =
1170         PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
1171 
1172     Value *Ptr = EmitScalarExpr(E->getArg(0));
1173     unsigned AddrSpace = Ptr->getType()->getPointerAddressSpace();
1174     Ptr = Builder.CreateBitCast(Ptr, Int8Ty->getPointerTo(AddrSpace));
1175     Value *NewVal = Builder.getInt8(0);
1176     Value *Order = EmitScalarExpr(E->getArg(1));
1177     if (isa<llvm::ConstantInt>(Order)) {
1178       int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
1179       StoreInst *Store = Builder.CreateStore(NewVal, Ptr, Volatile);
1180       Store->setAlignment(1);
1181       switch (ord) {
1182       case 0:  // memory_order_relaxed
1183       default: // invalid order
1184         Store->setOrdering(llvm::Monotonic);
1185         break;
1186       case 3:  // memory_order_release
1187         Store->setOrdering(llvm::Release);
1188         break;
1189       case 5:  // memory_order_seq_cst
1190         Store->setOrdering(llvm::SequentiallyConsistent);
1191         break;
1192       }
1193       return RValue::get(0);
1194     }
1195 
1196     llvm::BasicBlock *ContBB = createBasicBlock("atomic.continue", CurFn);
1197 
1198     llvm::BasicBlock *BBs[3] = {
1199       createBasicBlock("monotonic", CurFn),
1200       createBasicBlock("release", CurFn),
1201       createBasicBlock("seqcst", CurFn)
1202     };
1203     llvm::AtomicOrdering Orders[3] = {
1204       llvm::Monotonic, llvm::Release, llvm::SequentiallyConsistent
1205     };
1206 
1207     Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
1208     llvm::SwitchInst *SI = Builder.CreateSwitch(Order, BBs[0]);
1209 
1210     for (unsigned i = 0; i < 3; ++i) {
1211       Builder.SetInsertPoint(BBs[i]);
1212       StoreInst *Store = Builder.CreateStore(NewVal, Ptr, Volatile);
1213       Store->setAlignment(1);
1214       Store->setOrdering(Orders[i]);
1215       Builder.CreateBr(ContBB);
1216     }
1217 
1218     SI->addCase(Builder.getInt32(0), BBs[0]);
1219     SI->addCase(Builder.getInt32(3), BBs[1]);
1220     SI->addCase(Builder.getInt32(5), BBs[2]);
1221 
1222     Builder.SetInsertPoint(ContBB);
1223     return RValue::get(0);
1224   }
1225 
1226   case Builtin::BI__atomic_thread_fence:
1227   case Builtin::BI__atomic_signal_fence:
1228   case Builtin::BI__c11_atomic_thread_fence:
1229   case Builtin::BI__c11_atomic_signal_fence: {
1230     llvm::SynchronizationScope Scope;
1231     if (BuiltinID == Builtin::BI__atomic_signal_fence ||
1232         BuiltinID == Builtin::BI__c11_atomic_signal_fence)
1233       Scope = llvm::SingleThread;
1234     else
1235       Scope = llvm::CrossThread;
1236     Value *Order = EmitScalarExpr(E->getArg(0));
1237     if (isa<llvm::ConstantInt>(Order)) {
1238       int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
1239       switch (ord) {
1240       case 0:  // memory_order_relaxed
1241       default: // invalid order
1242         break;
1243       case 1:  // memory_order_consume
1244       case 2:  // memory_order_acquire
1245         Builder.CreateFence(llvm::Acquire, Scope);
1246         break;
1247       case 3:  // memory_order_release
1248         Builder.CreateFence(llvm::Release, Scope);
1249         break;
1250       case 4:  // memory_order_acq_rel
1251         Builder.CreateFence(llvm::AcquireRelease, Scope);
1252         break;
1253       case 5:  // memory_order_seq_cst
1254         Builder.CreateFence(llvm::SequentiallyConsistent, Scope);
1255         break;
1256       }
1257       return RValue::get(0);
1258     }
1259 
1260     llvm::BasicBlock *AcquireBB, *ReleaseBB, *AcqRelBB, *SeqCstBB;
1261     AcquireBB = createBasicBlock("acquire", CurFn);
1262     ReleaseBB = createBasicBlock("release", CurFn);
1263     AcqRelBB = createBasicBlock("acqrel", CurFn);
1264     SeqCstBB = createBasicBlock("seqcst", CurFn);
1265     llvm::BasicBlock *ContBB = createBasicBlock("atomic.continue", CurFn);
1266 
1267     Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
1268     llvm::SwitchInst *SI = Builder.CreateSwitch(Order, ContBB);
1269 
1270     Builder.SetInsertPoint(AcquireBB);
1271     Builder.CreateFence(llvm::Acquire, Scope);
1272     Builder.CreateBr(ContBB);
1273     SI->addCase(Builder.getInt32(1), AcquireBB);
1274     SI->addCase(Builder.getInt32(2), AcquireBB);
1275 
1276     Builder.SetInsertPoint(ReleaseBB);
1277     Builder.CreateFence(llvm::Release, Scope);
1278     Builder.CreateBr(ContBB);
1279     SI->addCase(Builder.getInt32(3), ReleaseBB);
1280 
1281     Builder.SetInsertPoint(AcqRelBB);
1282     Builder.CreateFence(llvm::AcquireRelease, Scope);
1283     Builder.CreateBr(ContBB);
1284     SI->addCase(Builder.getInt32(4), AcqRelBB);
1285 
1286     Builder.SetInsertPoint(SeqCstBB);
1287     Builder.CreateFence(llvm::SequentiallyConsistent, Scope);
1288     Builder.CreateBr(ContBB);
1289     SI->addCase(Builder.getInt32(5), SeqCstBB);
1290 
1291     Builder.SetInsertPoint(ContBB);
1292     return RValue::get(0);
1293   }
1294 
1295     // Library functions with special handling.
1296   case Builtin::BIsqrt:
1297   case Builtin::BIsqrtf:
1298   case Builtin::BIsqrtl: {
1299     // Transform a call to sqrt* into a @llvm.sqrt.* intrinsic call, but only
1300     // in finite- or unsafe-math mode (the intrinsic has different semantics
1301     // for handling negative numbers compared to the library function, so
1302     // -fmath-errno=0 is not enough).
1303     if (!FD->hasAttr<ConstAttr>())
1304       break;
1305     if (!(CGM.getCodeGenOpts().UnsafeFPMath ||
1306           CGM.getCodeGenOpts().NoNaNsFPMath))
1307       break;
1308     Value *Arg0 = EmitScalarExpr(E->getArg(0));
1309     llvm::Type *ArgType = Arg0->getType();
1310     Value *F = CGM.getIntrinsic(Intrinsic::sqrt, ArgType);
1311     return RValue::get(Builder.CreateCall(F, Arg0));
1312   }
1313 
1314   case Builtin::BIpow:
1315   case Builtin::BIpowf:
1316   case Builtin::BIpowl: {
1317     // Transform a call to pow* into a @llvm.pow.* intrinsic call.
1318     if (!FD->hasAttr<ConstAttr>())
1319       break;
1320     Value *Base = EmitScalarExpr(E->getArg(0));
1321     Value *Exponent = EmitScalarExpr(E->getArg(1));
1322     llvm::Type *ArgType = Base->getType();
1323     Value *F = CGM.getIntrinsic(Intrinsic::pow, ArgType);
1324     return RValue::get(Builder.CreateCall2(F, Base, Exponent));
1325   }
1326 
1327   case Builtin::BIfma:
1328   case Builtin::BIfmaf:
1329   case Builtin::BIfmal:
1330   case Builtin::BI__builtin_fma:
1331   case Builtin::BI__builtin_fmaf:
1332   case Builtin::BI__builtin_fmal: {
1333     // Rewrite fma to intrinsic.
1334     Value *FirstArg = EmitScalarExpr(E->getArg(0));
1335     llvm::Type *ArgType = FirstArg->getType();
1336     Value *F = CGM.getIntrinsic(Intrinsic::fma, ArgType);
1337     return RValue::get(Builder.CreateCall3(F, FirstArg,
1338                                               EmitScalarExpr(E->getArg(1)),
1339                                               EmitScalarExpr(E->getArg(2))));
1340   }
1341 
1342   case Builtin::BI__builtin_signbit:
1343   case Builtin::BI__builtin_signbitf:
1344   case Builtin::BI__builtin_signbitl: {
1345     LLVMContext &C = CGM.getLLVMContext();
1346 
1347     Value *Arg = EmitScalarExpr(E->getArg(0));
1348     llvm::Type *ArgTy = Arg->getType();
1349     if (ArgTy->isPPC_FP128Ty())
1350       break; // FIXME: I'm not sure what the right implementation is here.
1351     int ArgWidth = ArgTy->getPrimitiveSizeInBits();
1352     llvm::Type *ArgIntTy = llvm::IntegerType::get(C, ArgWidth);
1353     Value *BCArg = Builder.CreateBitCast(Arg, ArgIntTy);
1354     Value *ZeroCmp = llvm::Constant::getNullValue(ArgIntTy);
1355     Value *Result = Builder.CreateICmpSLT(BCArg, ZeroCmp);
1356     return RValue::get(Builder.CreateZExt(Result, ConvertType(E->getType())));
1357   }
1358   case Builtin::BI__builtin_annotation: {
1359     llvm::Value *AnnVal = EmitScalarExpr(E->getArg(0));
1360     llvm::Value *F = CGM.getIntrinsic(llvm::Intrinsic::annotation,
1361                                       AnnVal->getType());
1362 
1363     // Get the annotation string, go through casts. Sema requires this to be a
1364     // non-wide string literal, potentially casted, so the cast<> is safe.
1365     const Expr *AnnotationStrExpr = E->getArg(1)->IgnoreParenCasts();
1366     StringRef Str = cast<StringLiteral>(AnnotationStrExpr)->getString();
1367     return RValue::get(EmitAnnotationCall(F, AnnVal, Str, E->getExprLoc()));
1368   }
1369   case Builtin::BI__builtin_addcb:
1370   case Builtin::BI__builtin_addcs:
1371   case Builtin::BI__builtin_addc:
1372   case Builtin::BI__builtin_addcl:
1373   case Builtin::BI__builtin_addcll:
1374   case Builtin::BI__builtin_subcb:
1375   case Builtin::BI__builtin_subcs:
1376   case Builtin::BI__builtin_subc:
1377   case Builtin::BI__builtin_subcl:
1378   case Builtin::BI__builtin_subcll: {
1379 
1380     // We translate all of these builtins from expressions of the form:
1381     //   int x = ..., y = ..., carryin = ..., carryout, result;
1382     //   result = __builtin_addc(x, y, carryin, &carryout);
1383     //
1384     // to LLVM IR of the form:
1385     //
1386     //   %tmp1 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)
1387     //   %tmpsum1 = extractvalue {i32, i1} %tmp1, 0
1388     //   %carry1 = extractvalue {i32, i1} %tmp1, 1
1389     //   %tmp2 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %tmpsum1,
1390     //                                                       i32 %carryin)
1391     //   %result = extractvalue {i32, i1} %tmp2, 0
1392     //   %carry2 = extractvalue {i32, i1} %tmp2, 1
1393     //   %tmp3 = or i1 %carry1, %carry2
1394     //   %tmp4 = zext i1 %tmp3 to i32
1395     //   store i32 %tmp4, i32* %carryout
1396 
1397     // Scalarize our inputs.
1398     llvm::Value *X = EmitScalarExpr(E->getArg(0));
1399     llvm::Value *Y = EmitScalarExpr(E->getArg(1));
1400     llvm::Value *Carryin = EmitScalarExpr(E->getArg(2));
1401     std::pair<llvm::Value*, unsigned> CarryOutPtr =
1402       EmitPointerWithAlignment(E->getArg(3));
1403 
1404     // Decide if we are lowering to a uadd.with.overflow or usub.with.overflow.
1405     llvm::Intrinsic::ID IntrinsicId;
1406     switch (BuiltinID) {
1407     default: llvm_unreachable("Unknown multiprecision builtin id.");
1408     case Builtin::BI__builtin_addcb:
1409     case Builtin::BI__builtin_addcs:
1410     case Builtin::BI__builtin_addc:
1411     case Builtin::BI__builtin_addcl:
1412     case Builtin::BI__builtin_addcll:
1413       IntrinsicId = llvm::Intrinsic::uadd_with_overflow;
1414       break;
1415     case Builtin::BI__builtin_subcb:
1416     case Builtin::BI__builtin_subcs:
1417     case Builtin::BI__builtin_subc:
1418     case Builtin::BI__builtin_subcl:
1419     case Builtin::BI__builtin_subcll:
1420       IntrinsicId = llvm::Intrinsic::usub_with_overflow;
1421       break;
1422     }
1423 
1424     // Construct our resulting LLVM IR expression.
1425     llvm::Value *Carry1;
1426     llvm::Value *Sum1 = EmitOverflowIntrinsic(*this, IntrinsicId,
1427                                               X, Y, Carry1);
1428     llvm::Value *Carry2;
1429     llvm::Value *Sum2 = EmitOverflowIntrinsic(*this, IntrinsicId,
1430                                               Sum1, Carryin, Carry2);
1431     llvm::Value *CarryOut = Builder.CreateZExt(Builder.CreateOr(Carry1, Carry2),
1432                                                X->getType());
1433     llvm::StoreInst *CarryOutStore = Builder.CreateStore(CarryOut,
1434                                                          CarryOutPtr.first);
1435     CarryOutStore->setAlignment(CarryOutPtr.second);
1436     return RValue::get(Sum2);
1437   }
1438   case Builtin::BI__builtin_uadd_overflow:
1439   case Builtin::BI__builtin_uaddl_overflow:
1440   case Builtin::BI__builtin_uaddll_overflow:
1441   case Builtin::BI__builtin_usub_overflow:
1442   case Builtin::BI__builtin_usubl_overflow:
1443   case Builtin::BI__builtin_usubll_overflow:
1444   case Builtin::BI__builtin_umul_overflow:
1445   case Builtin::BI__builtin_umull_overflow:
1446   case Builtin::BI__builtin_umulll_overflow:
1447   case Builtin::BI__builtin_sadd_overflow:
1448   case Builtin::BI__builtin_saddl_overflow:
1449   case Builtin::BI__builtin_saddll_overflow:
1450   case Builtin::BI__builtin_ssub_overflow:
1451   case Builtin::BI__builtin_ssubl_overflow:
1452   case Builtin::BI__builtin_ssubll_overflow:
1453   case Builtin::BI__builtin_smul_overflow:
1454   case Builtin::BI__builtin_smull_overflow:
1455   case Builtin::BI__builtin_smulll_overflow: {
1456 
1457     // We translate all of these builtins directly to the relevant llvm IR node.
1458 
1459     // Scalarize our inputs.
1460     llvm::Value *X = EmitScalarExpr(E->getArg(0));
1461     llvm::Value *Y = EmitScalarExpr(E->getArg(1));
1462     std::pair<llvm::Value *, unsigned> SumOutPtr =
1463       EmitPointerWithAlignment(E->getArg(2));
1464 
1465     // Decide which of the overflow intrinsics we are lowering to:
1466     llvm::Intrinsic::ID IntrinsicId;
1467     switch (BuiltinID) {
1468     default: llvm_unreachable("Unknown security overflow builtin id.");
1469     case Builtin::BI__builtin_uadd_overflow:
1470     case Builtin::BI__builtin_uaddl_overflow:
1471     case Builtin::BI__builtin_uaddll_overflow:
1472       IntrinsicId = llvm::Intrinsic::uadd_with_overflow;
1473       break;
1474     case Builtin::BI__builtin_usub_overflow:
1475     case Builtin::BI__builtin_usubl_overflow:
1476     case Builtin::BI__builtin_usubll_overflow:
1477       IntrinsicId = llvm::Intrinsic::usub_with_overflow;
1478       break;
1479     case Builtin::BI__builtin_umul_overflow:
1480     case Builtin::BI__builtin_umull_overflow:
1481     case Builtin::BI__builtin_umulll_overflow:
1482       IntrinsicId = llvm::Intrinsic::umul_with_overflow;
1483       break;
1484     case Builtin::BI__builtin_sadd_overflow:
1485     case Builtin::BI__builtin_saddl_overflow:
1486     case Builtin::BI__builtin_saddll_overflow:
1487       IntrinsicId = llvm::Intrinsic::sadd_with_overflow;
1488       break;
1489     case Builtin::BI__builtin_ssub_overflow:
1490     case Builtin::BI__builtin_ssubl_overflow:
1491     case Builtin::BI__builtin_ssubll_overflow:
1492       IntrinsicId = llvm::Intrinsic::ssub_with_overflow;
1493       break;
1494     case Builtin::BI__builtin_smul_overflow:
1495     case Builtin::BI__builtin_smull_overflow:
1496     case Builtin::BI__builtin_smulll_overflow:
1497       IntrinsicId = llvm::Intrinsic::smul_with_overflow;
1498       break;
1499     }
1500 
1501 
1502     llvm::Value *Carry;
1503     llvm::Value *Sum = EmitOverflowIntrinsic(*this, IntrinsicId, X, Y, Carry);
1504     llvm::StoreInst *SumOutStore = Builder.CreateStore(Sum, SumOutPtr.first);
1505     SumOutStore->setAlignment(SumOutPtr.second);
1506 
1507     return RValue::get(Carry);
1508   }
1509   case Builtin::BI__builtin_addressof:
1510     return RValue::get(EmitLValue(E->getArg(0)).getAddress());
1511   case Builtin::BI__noop:
1512     return RValue::get(0);
1513   case Builtin::BI_InterlockedCompareExchange: {
1514     AtomicCmpXchgInst *CXI = Builder.CreateAtomicCmpXchg(
1515         EmitScalarExpr(E->getArg(0)),
1516         EmitScalarExpr(E->getArg(2)),
1517         EmitScalarExpr(E->getArg(1)),
1518         SequentiallyConsistent,
1519         SequentiallyConsistent);
1520       CXI->setVolatile(true);
1521       return RValue::get(CXI);
1522   }
1523   case Builtin::BI_InterlockedIncrement: {
1524     AtomicRMWInst *RMWI = Builder.CreateAtomicRMW(
1525       AtomicRMWInst::Add,
1526       EmitScalarExpr(E->getArg(0)),
1527       ConstantInt::get(Int32Ty, 1),
1528       llvm::SequentiallyConsistent);
1529     RMWI->setVolatile(true);
1530     return RValue::get(Builder.CreateAdd(RMWI, ConstantInt::get(Int32Ty, 1)));
1531   }
1532   case Builtin::BI_InterlockedDecrement: {
1533     AtomicRMWInst *RMWI = Builder.CreateAtomicRMW(
1534       AtomicRMWInst::Sub,
1535       EmitScalarExpr(E->getArg(0)),
1536       ConstantInt::get(Int32Ty, 1),
1537       llvm::SequentiallyConsistent);
1538     RMWI->setVolatile(true);
1539     return RValue::get(Builder.CreateSub(RMWI, ConstantInt::get(Int32Ty, 1)));
1540   }
1541   case Builtin::BI_InterlockedExchangeAdd: {
1542     AtomicRMWInst *RMWI = Builder.CreateAtomicRMW(
1543       AtomicRMWInst::Add,
1544       EmitScalarExpr(E->getArg(0)),
1545       EmitScalarExpr(E->getArg(1)),
1546       llvm::SequentiallyConsistent);
1547     RMWI->setVolatile(true);
1548     return RValue::get(RMWI);
1549   }
1550   }
1551 
1552   // If this is an alias for a lib function (e.g. __builtin_sin), emit
1553   // the call using the normal call path, but using the unmangled
1554   // version of the function name.
1555   if (getContext().BuiltinInfo.isLibFunction(BuiltinID))
1556     return emitLibraryCall(*this, FD, E,
1557                            CGM.getBuiltinLibFunction(FD, BuiltinID));
1558 
1559   // If this is a predefined lib function (e.g. malloc), emit the call
1560   // using exactly the normal call path.
1561   if (getContext().BuiltinInfo.isPredefinedLibFunction(BuiltinID))
1562     return emitLibraryCall(*this, FD, E, EmitScalarExpr(E->getCallee()));
1563 
1564   // See if we have a target specific intrinsic.
1565   const char *Name = getContext().BuiltinInfo.GetName(BuiltinID);
1566   Intrinsic::ID IntrinsicID = Intrinsic::not_intrinsic;
1567   if (const char *Prefix =
1568       llvm::Triple::getArchTypePrefix(getTarget().getTriple().getArch()))
1569     IntrinsicID = Intrinsic::getIntrinsicForGCCBuiltin(Prefix, Name);
1570 
1571   if (IntrinsicID != Intrinsic::not_intrinsic) {
1572     SmallVector<Value*, 16> Args;
1573 
1574     // Find out if any arguments are required to be integer constant
1575     // expressions.
1576     unsigned ICEArguments = 0;
1577     ASTContext::GetBuiltinTypeError Error;
1578     getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
1579     assert(Error == ASTContext::GE_None && "Should not codegen an error");
1580 
1581     Function *F = CGM.getIntrinsic(IntrinsicID);
1582     llvm::FunctionType *FTy = F->getFunctionType();
1583 
1584     for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) {
1585       Value *ArgValue;
1586       // If this is a normal argument, just emit it as a scalar.
1587       if ((ICEArguments & (1 << i)) == 0) {
1588         ArgValue = EmitScalarExpr(E->getArg(i));
1589       } else {
1590         // If this is required to be a constant, constant fold it so that we
1591         // know that the generated intrinsic gets a ConstantInt.
1592         llvm::APSInt Result;
1593         bool IsConst = E->getArg(i)->isIntegerConstantExpr(Result,getContext());
1594         assert(IsConst && "Constant arg isn't actually constant?");
1595         (void)IsConst;
1596         ArgValue = llvm::ConstantInt::get(getLLVMContext(), Result);
1597       }
1598 
1599       // If the intrinsic arg type is different from the builtin arg type
1600       // we need to do a bit cast.
1601       llvm::Type *PTy = FTy->getParamType(i);
1602       if (PTy != ArgValue->getType()) {
1603         assert(PTy->canLosslesslyBitCastTo(FTy->getParamType(i)) &&
1604                "Must be able to losslessly bit cast to param");
1605         ArgValue = Builder.CreateBitCast(ArgValue, PTy);
1606       }
1607 
1608       Args.push_back(ArgValue);
1609     }
1610 
1611     Value *V = Builder.CreateCall(F, Args);
1612     QualType BuiltinRetType = E->getType();
1613 
1614     llvm::Type *RetTy = VoidTy;
1615     if (!BuiltinRetType->isVoidType())
1616       RetTy = ConvertType(BuiltinRetType);
1617 
1618     if (RetTy != V->getType()) {
1619       assert(V->getType()->canLosslesslyBitCastTo(RetTy) &&
1620              "Must be able to losslessly bit cast result type");
1621       V = Builder.CreateBitCast(V, RetTy);
1622     }
1623 
1624     return RValue::get(V);
1625   }
1626 
1627   // See if we have a target specific builtin that needs to be lowered.
1628   if (Value *V = EmitTargetBuiltinExpr(BuiltinID, E))
1629     return RValue::get(V);
1630 
1631   ErrorUnsupported(E, "builtin function");
1632 
1633   // Unknown builtin, for now just dump it out and return undef.
1634   return GetUndefRValue(E->getType());
1635 }
1636 
1637 Value *CodeGenFunction::EmitTargetBuiltinExpr(unsigned BuiltinID,
1638                                               const CallExpr *E) {
1639   switch (getTarget().getTriple().getArch()) {
1640   case llvm::Triple::aarch64:
1641   case llvm::Triple::aarch64_be:
1642     return EmitAArch64BuiltinExpr(BuiltinID, E);
1643   case llvm::Triple::arm:
1644   case llvm::Triple::armeb:
1645   case llvm::Triple::thumb:
1646   case llvm::Triple::thumbeb:
1647     return EmitARMBuiltinExpr(BuiltinID, E);
1648   case llvm::Triple::arm64:
1649     return EmitARM64BuiltinExpr(BuiltinID, E);
1650   case llvm::Triple::x86:
1651   case llvm::Triple::x86_64:
1652     return EmitX86BuiltinExpr(BuiltinID, E);
1653   case llvm::Triple::ppc:
1654   case llvm::Triple::ppc64:
1655   case llvm::Triple::ppc64le:
1656     return EmitPPCBuiltinExpr(BuiltinID, E);
1657   default:
1658     return 0;
1659   }
1660 }
1661 
1662 static llvm::VectorType *GetNeonType(CodeGenFunction *CGF,
1663                                      NeonTypeFlags TypeFlags,
1664                                      bool V1Ty=false) {
1665   int IsQuad = TypeFlags.isQuad();
1666   switch (TypeFlags.getEltType()) {
1667   case NeonTypeFlags::Int8:
1668   case NeonTypeFlags::Poly8:
1669     return llvm::VectorType::get(CGF->Int8Ty, V1Ty ? 1 : (8 << IsQuad));
1670   case NeonTypeFlags::Int16:
1671   case NeonTypeFlags::Poly16:
1672   case NeonTypeFlags::Float16:
1673     return llvm::VectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
1674   case NeonTypeFlags::Int32:
1675     return llvm::VectorType::get(CGF->Int32Ty, V1Ty ? 1 : (2 << IsQuad));
1676   case NeonTypeFlags::Int64:
1677   case NeonTypeFlags::Poly64:
1678     return llvm::VectorType::get(CGF->Int64Ty, V1Ty ? 1 : (1 << IsQuad));
1679   case NeonTypeFlags::Poly128:
1680     // FIXME: i128 and f128 doesn't get fully support in Clang and llvm.
1681     // There is a lot of i128 and f128 API missing.
1682     // so we use v16i8 to represent poly128 and get pattern matched.
1683     return llvm::VectorType::get(CGF->Int8Ty, 16);
1684   case NeonTypeFlags::Float32:
1685     return llvm::VectorType::get(CGF->FloatTy, V1Ty ? 1 : (2 << IsQuad));
1686   case NeonTypeFlags::Float64:
1687     return llvm::VectorType::get(CGF->DoubleTy, V1Ty ? 1 : (1 << IsQuad));
1688   }
1689   llvm_unreachable("Unknown vector element type!");
1690 }
1691 
1692 Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C) {
1693   unsigned nElts = cast<llvm::VectorType>(V->getType())->getNumElements();
1694   Value* SV = llvm::ConstantVector::getSplat(nElts, C);
1695   return Builder.CreateShuffleVector(V, V, SV, "lane");
1696 }
1697 
1698 Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value*> &Ops,
1699                                      const char *name,
1700                                      unsigned shift, bool rightshift) {
1701   unsigned j = 0;
1702   for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
1703        ai != ae; ++ai, ++j)
1704     if (shift > 0 && shift == j)
1705       Ops[j] = EmitNeonShiftVector(Ops[j], ai->getType(), rightshift);
1706     else
1707       Ops[j] = Builder.CreateBitCast(Ops[j], ai->getType(), name);
1708 
1709   return Builder.CreateCall(F, Ops, name);
1710 }
1711 
1712 Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
1713                                             bool neg) {
1714   int SV = cast<ConstantInt>(V)->getSExtValue();
1715 
1716   llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);
1717   llvm::Constant *C = ConstantInt::get(VTy->getElementType(), neg ? -SV : SV);
1718   return llvm::ConstantVector::getSplat(VTy->getNumElements(), C);
1719 }
1720 
1721 // \brief Right-shift a vector by a constant.
1722 Value *CodeGenFunction::EmitNeonRShiftImm(Value *Vec, Value *Shift,
1723                                           llvm::Type *Ty, bool usgn,
1724                                           const char *name) {
1725   llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);
1726 
1727   int ShiftAmt = cast<ConstantInt>(Shift)->getSExtValue();
1728   int EltSize = VTy->getScalarSizeInBits();
1729 
1730   Vec = Builder.CreateBitCast(Vec, Ty);
1731 
1732   // lshr/ashr are undefined when the shift amount is equal to the vector
1733   // element size.
1734   if (ShiftAmt == EltSize) {
1735     if (usgn) {
1736       // Right-shifting an unsigned value by its size yields 0.
1737       llvm::Constant *Zero = ConstantInt::get(VTy->getElementType(), 0);
1738       return llvm::ConstantVector::getSplat(VTy->getNumElements(), Zero);
1739     } else {
1740       // Right-shifting a signed value by its size is equivalent
1741       // to a shift of size-1.
1742       --ShiftAmt;
1743       Shift = ConstantInt::get(VTy->getElementType(), ShiftAmt);
1744     }
1745   }
1746 
1747   Shift = EmitNeonShiftVector(Shift, Ty, false);
1748   if (usgn)
1749     return Builder.CreateLShr(Vec, Shift, name);
1750   else
1751     return Builder.CreateAShr(Vec, Shift, name);
1752 }
1753 
1754 Value *CodeGenFunction::EmitConcatVectors(Value *Lo, Value *Hi,
1755                                           llvm::Type *ArgTy) {
1756   unsigned NumElts = ArgTy->getVectorNumElements();
1757   SmallVector<Constant *, 16> Indices;
1758   for (unsigned i = 0; i < 2 * NumElts; ++i)
1759     Indices.push_back(ConstantInt::get(Int32Ty, i));
1760 
1761   Constant *Mask = ConstantVector::get(Indices);
1762   Value *LoCast = Builder.CreateBitCast(Lo, ArgTy);
1763   Value *HiCast = Builder.CreateBitCast(Hi, ArgTy);
1764   return Builder.CreateShuffleVector(LoCast, HiCast, Mask, "concat");
1765 }
1766 
1767 Value *CodeGenFunction::EmitExtractHigh(Value *Vec, llvm::Type *ResTy) {
1768   unsigned NumElts = ResTy->getVectorNumElements();
1769   SmallVector<Constant *, 8> Indices;
1770 
1771   llvm::Type *InTy = llvm::VectorType::get(ResTy->getVectorElementType(),
1772                                            NumElts * 2);
1773   Value *VecCast = Builder.CreateBitCast(Vec, InTy);
1774 
1775   // extract_high is a shuffle on the second half of the input indices: E.g. 4,
1776   // 5, 6, 7 if we're extracting <4 x i16> from <8 x i16>.
1777   for (unsigned i = 0; i < NumElts; ++i)
1778     Indices.push_back(ConstantInt::get(Int32Ty, NumElts + i));
1779 
1780   Constant *Mask = ConstantVector::get(Indices);
1781   return Builder.CreateShuffleVector(VecCast, VecCast, Mask, "concat");
1782 }
1783 
1784 /// GetPointeeAlignment - Given an expression with a pointer type, find the
1785 /// alignment of the type referenced by the pointer.  Skip over implicit
1786 /// casts.
1787 std::pair<llvm::Value*, unsigned>
1788 CodeGenFunction::EmitPointerWithAlignment(const Expr *Addr) {
1789   assert(Addr->getType()->isPointerType());
1790   Addr = Addr->IgnoreParens();
1791   if (const ImplicitCastExpr *ICE = dyn_cast<ImplicitCastExpr>(Addr)) {
1792     if ((ICE->getCastKind() == CK_BitCast || ICE->getCastKind() == CK_NoOp) &&
1793         ICE->getSubExpr()->getType()->isPointerType()) {
1794       std::pair<llvm::Value*, unsigned> Ptr =
1795           EmitPointerWithAlignment(ICE->getSubExpr());
1796       Ptr.first = Builder.CreateBitCast(Ptr.first,
1797                                         ConvertType(Addr->getType()));
1798       return Ptr;
1799     } else if (ICE->getCastKind() == CK_ArrayToPointerDecay) {
1800       LValue LV = EmitLValue(ICE->getSubExpr());
1801       unsigned Align = LV.getAlignment().getQuantity();
1802       if (!Align) {
1803         // FIXME: Once LValues are fixed to always set alignment,
1804         // zap this code.
1805         QualType PtTy = ICE->getSubExpr()->getType();
1806         if (!PtTy->isIncompleteType())
1807           Align = getContext().getTypeAlignInChars(PtTy).getQuantity();
1808         else
1809           Align = 1;
1810       }
1811       return std::make_pair(LV.getAddress(), Align);
1812     }
1813   }
1814   if (const UnaryOperator *UO = dyn_cast<UnaryOperator>(Addr)) {
1815     if (UO->getOpcode() == UO_AddrOf) {
1816       LValue LV = EmitLValue(UO->getSubExpr());
1817       unsigned Align = LV.getAlignment().getQuantity();
1818       if (!Align) {
1819         // FIXME: Once LValues are fixed to always set alignment,
1820         // zap this code.
1821         QualType PtTy = UO->getSubExpr()->getType();
1822         if (!PtTy->isIncompleteType())
1823           Align = getContext().getTypeAlignInChars(PtTy).getQuantity();
1824         else
1825           Align = 1;
1826       }
1827       return std::make_pair(LV.getAddress(), Align);
1828     }
1829   }
1830 
1831   unsigned Align = 1;
1832   QualType PtTy = Addr->getType()->getPointeeType();
1833   if (!PtTy->isIncompleteType())
1834     Align = getContext().getTypeAlignInChars(PtTy).getQuantity();
1835 
1836   return std::make_pair(EmitScalarExpr(Addr), Align);
1837 }
1838 
// Bit flags used as the TypeModifier field of NeonIntrinsicInfo table
// entries. The first group holds the individual bits; the second group holds
// named combinations of them.
enum {
  // Individual bits.
  AddRetType        = 0x001,
  Add1ArgType       = 0x002,
  Add2ArgTypes      = 0x004,

  VectorizeRetType  = 0x008,
  VectorizeArgTypes = 0x010,

  InventFloatType   = 0x020,
  UnsignedAlts      = 0x040,

  Use64BitVectors   = 0x080,
  Use128BitVectors  = 0x100,

  // Named combinations.
  Vectorize1ArgType  = VectorizeArgTypes | Add1ArgType,
  VectorRet          = VectorizeRetType | AddRetType,
  VectorRetGetArgs01 = VectorRet | Add2ArgTypes | VectorizeArgTypes,
  FpCmpzModifiers    = VectorRet | Add1ArgType | InventFloatType
};
1860 
1861  struct NeonIntrinsicInfo {
1862   unsigned BuiltinID;
1863   unsigned LLVMIntrinsic;
1864   unsigned AltLLVMIntrinsic;
1865   const char *NameHint;
1866   unsigned TypeModifier;
1867 
1868   bool operator<(unsigned RHSBuiltinID) const {
1869     return BuiltinID < RHSBuiltinID;
1870   }
1871 };
1872 
// Builders for NeonIntrinsicInfo initializers. In every variant the builtin
// ID is NEON::BI__builtin_neon_<Builtin> and the name hint is <Builtin>
// spelled as a string.

// Row with no intrinsic, no alternate intrinsic and no type modifier.
#define NEONMAP0(Builtin) \
  { NEON::BI__builtin_neon_##Builtin, 0, 0, #Builtin, 0 }

// Row with one intrinsic (Intrinsic::<Intr>) and a type modifier.
#define NEONMAP1(Builtin, Intr, Mods) \
  { NEON::BI__builtin_neon_##Builtin, Intrinsic::Intr, 0, #Builtin, Mods }

// Row with a primary and an alternate intrinsic and a type modifier.
#define NEONMAP2(Builtin, Intr, AltIntr, Mods) \
  { NEON::BI__builtin_neon_##Builtin, Intrinsic::Intr, Intrinsic::AltIntr, \
    #Builtin, Mods }
1884 
1885 static const NeonIntrinsicInfo AArch64SISDIntrinsicInfo[] = {
1886   NEONMAP1(vabdd_f64, aarch64_neon_vabd, AddRetType),
1887   NEONMAP1(vabds_f32, aarch64_neon_vabd, AddRetType),
1888   NEONMAP1(vabsd_s64, aarch64_neon_vabs, 0),
1889   NEONMAP1(vaddd_s64, aarch64_neon_vaddds, 0),
1890   NEONMAP1(vaddd_u64, aarch64_neon_vadddu, 0),
1891   NEONMAP1(vaddlv_s16, aarch64_neon_saddlv, VectorRet | Add1ArgType),
1892   NEONMAP1(vaddlv_s32, aarch64_neon_saddlv, VectorRet | Add1ArgType),
1893   NEONMAP1(vaddlv_s8, aarch64_neon_saddlv, VectorRet | Add1ArgType),
1894   NEONMAP1(vaddlv_u16, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
1895   NEONMAP1(vaddlv_u32, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
1896   NEONMAP1(vaddlv_u8, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
1897   NEONMAP1(vaddlvq_s16, aarch64_neon_saddlv, VectorRet | Add1ArgType),
1898   NEONMAP1(vaddlvq_s32, aarch64_neon_saddlv, VectorRet | Add1ArgType),
1899   NEONMAP1(vaddlvq_s8, aarch64_neon_saddlv, VectorRet | Add1ArgType),
1900   NEONMAP1(vaddlvq_u16, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
1901   NEONMAP1(vaddlvq_u32, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
1902   NEONMAP1(vaddlvq_u8, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
1903   NEONMAP1(vaddv_f32, aarch64_neon_vpfadd, AddRetType | Add1ArgType),
1904   NEONMAP1(vaddv_s16, aarch64_neon_vaddv, VectorRet | Add1ArgType),
1905   NEONMAP1(vaddv_s32, aarch64_neon_vaddv, VectorRet | Add1ArgType),
1906   NEONMAP1(vaddv_s8, aarch64_neon_vaddv, VectorRet | Add1ArgType),
1907   NEONMAP1(vaddv_u16, aarch64_neon_vaddv, VectorRet | Add1ArgType),
1908   NEONMAP1(vaddv_u32, aarch64_neon_vaddv, VectorRet | Add1ArgType),
1909   NEONMAP1(vaddv_u8, aarch64_neon_vaddv, VectorRet | Add1ArgType),
1910   NEONMAP1(vaddvq_f32, aarch64_neon_vpfadd, AddRetType | Add1ArgType),
1911   NEONMAP1(vaddvq_f64, aarch64_neon_vpfadd, AddRetType | Add1ArgType),
1912   NEONMAP1(vaddvq_s16, aarch64_neon_vaddv, VectorRet | Add1ArgType),
1913   NEONMAP1(vaddvq_s32, aarch64_neon_vaddv, VectorRet | Add1ArgType),
1914   NEONMAP1(vaddvq_s64, aarch64_neon_vaddv, VectorRet | Add1ArgType),
1915   NEONMAP1(vaddvq_s8, aarch64_neon_vaddv, VectorRet | Add1ArgType),
1916   NEONMAP1(vaddvq_u16, aarch64_neon_vaddv, VectorRet | Add1ArgType),
1917   NEONMAP1(vaddvq_u32, aarch64_neon_vaddv, VectorRet | Add1ArgType),
1918   NEONMAP1(vaddvq_u64, aarch64_neon_vaddv, VectorRet | Add1ArgType),
1919   NEONMAP1(vaddvq_u8, aarch64_neon_vaddv, VectorRet | Add1ArgType),
1920   NEONMAP1(vcaged_f64, aarch64_neon_fcage, VectorRet | Add2ArgTypes),
1921   NEONMAP1(vcages_f32, aarch64_neon_fcage, VectorRet | Add2ArgTypes),
1922   NEONMAP1(vcagtd_f64, aarch64_neon_fcagt, VectorRet | Add2ArgTypes),
1923   NEONMAP1(vcagts_f32, aarch64_neon_fcagt, VectorRet | Add2ArgTypes),
1924   NEONMAP1(vcaled_f64, aarch64_neon_fcage, VectorRet | Add2ArgTypes),
1925   NEONMAP1(vcales_f32, aarch64_neon_fcage, VectorRet | Add2ArgTypes),
1926   NEONMAP1(vcaltd_f64, aarch64_neon_fcagt, VectorRet | Add2ArgTypes),
1927   NEONMAP1(vcalts_f32, aarch64_neon_fcagt, VectorRet | Add2ArgTypes),
1928   NEONMAP1(vceqd_f64, aarch64_neon_fceq, VectorRet | Add2ArgTypes),
1929   NEONMAP1(vceqd_s64, aarch64_neon_vceq, VectorRetGetArgs01),
1930   NEONMAP1(vceqd_u64, aarch64_neon_vceq, VectorRetGetArgs01),
1931   NEONMAP1(vceqs_f32, aarch64_neon_fceq, VectorRet | Add2ArgTypes),
1932   NEONMAP1(vceqzd_f64, aarch64_neon_fceq, FpCmpzModifiers),
1933   NEONMAP1(vceqzd_s64, aarch64_neon_vceq, VectorRetGetArgs01),
1934   NEONMAP1(vceqzd_u64, aarch64_neon_vceq, VectorRetGetArgs01),
1935   NEONMAP1(vceqzs_f32, aarch64_neon_fceq, FpCmpzModifiers),
1936   NEONMAP1(vcged_f64, aarch64_neon_fcge, VectorRet | Add2ArgTypes),
1937   NEONMAP1(vcged_s64, aarch64_neon_vcge, VectorRetGetArgs01),
1938   NEONMAP1(vcged_u64, aarch64_neon_vchs, VectorRetGetArgs01),
1939   NEONMAP1(vcges_f32, aarch64_neon_fcge, VectorRet | Add2ArgTypes),
1940   NEONMAP1(vcgezd_f64, aarch64_neon_fcge, FpCmpzModifiers),
1941   NEONMAP1(vcgezd_s64, aarch64_neon_vcge, VectorRetGetArgs01),
1942   NEONMAP1(vcgezs_f32, aarch64_neon_fcge, FpCmpzModifiers),
1943   NEONMAP1(vcgtd_f64, aarch64_neon_fcgt, VectorRet | Add2ArgTypes),
1944   NEONMAP1(vcgtd_s64, aarch64_neon_vcgt, VectorRetGetArgs01),
1945   NEONMAP1(vcgtd_u64, aarch64_neon_vchi, VectorRetGetArgs01),
1946   NEONMAP1(vcgts_f32, aarch64_neon_fcgt, VectorRet | Add2ArgTypes),
1947   NEONMAP1(vcgtzd_f64, aarch64_neon_fcgt, FpCmpzModifiers),
1948   NEONMAP1(vcgtzd_s64, aarch64_neon_vcgt, VectorRetGetArgs01),
1949   NEONMAP1(vcgtzs_f32, aarch64_neon_fcgt, FpCmpzModifiers),
1950   NEONMAP1(vcled_f64, aarch64_neon_fcge, VectorRet | Add2ArgTypes),
1951   NEONMAP1(vcled_s64, aarch64_neon_vcge, VectorRetGetArgs01),
1952   NEONMAP1(vcled_u64, aarch64_neon_vchs, VectorRetGetArgs01),
1953   NEONMAP1(vcles_f32, aarch64_neon_fcge, VectorRet | Add2ArgTypes),
1954   NEONMAP1(vclezd_f64, aarch64_neon_fclez, FpCmpzModifiers),
1955   NEONMAP1(vclezd_s64, aarch64_neon_vclez, VectorRetGetArgs01),
1956   NEONMAP1(vclezs_f32, aarch64_neon_fclez, FpCmpzModifiers),
1957   NEONMAP1(vcltd_f64, aarch64_neon_fcgt, VectorRet | Add2ArgTypes),
1958   NEONMAP1(vcltd_s64, aarch64_neon_vcgt, VectorRetGetArgs01),
1959   NEONMAP1(vcltd_u64, aarch64_neon_vchi, VectorRetGetArgs01),
1960   NEONMAP1(vclts_f32, aarch64_neon_fcgt, VectorRet | Add2ArgTypes),
1961   NEONMAP1(vcltzd_f64, aarch64_neon_fcltz, FpCmpzModifiers),
1962   NEONMAP1(vcltzd_s64, aarch64_neon_vcltz, VectorRetGetArgs01),
1963   NEONMAP1(vcltzs_f32, aarch64_neon_fcltz, FpCmpzModifiers),
1964   NEONMAP1(vcvtad_s64_f64, aarch64_neon_fcvtas, VectorRet | Add1ArgType),
1965   NEONMAP1(vcvtad_u64_f64, aarch64_neon_fcvtau, VectorRet | Add1ArgType),
1966   NEONMAP1(vcvtas_s32_f32, aarch64_neon_fcvtas, VectorRet | Add1ArgType),
1967   NEONMAP1(vcvtas_u32_f32, aarch64_neon_fcvtau, VectorRet | Add1ArgType),
1968   NEONMAP1(vcvtd_f64_s64, aarch64_neon_vcvtint2fps, AddRetType | Vectorize1ArgType),
1969   NEONMAP1(vcvtd_f64_u64, aarch64_neon_vcvtint2fpu, AddRetType | Vectorize1ArgType),
1970   NEONMAP1(vcvtd_n_f64_s64, aarch64_neon_vcvtfxs2fp_n, AddRetType | Vectorize1ArgType),
1971   NEONMAP1(vcvtd_n_f64_u64, aarch64_neon_vcvtfxu2fp_n, AddRetType | Vectorize1ArgType),
1972   NEONMAP1(vcvtd_n_s64_f64, aarch64_neon_vcvtfp2fxs_n, VectorRet | Add1ArgType),
1973   NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu_n, VectorRet | Add1ArgType),
1974   NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, VectorRet | Add1ArgType),
1975   NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, VectorRet | Add1ArgType),
1976   NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, VectorRet | Add1ArgType),
1977   NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, VectorRet | Add1ArgType),
1978   NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, VectorRet | Add1ArgType),
1979   NEONMAP1(vcvtms_u32_f32, aarch64_neon_fcvtmu, VectorRet | Add1ArgType),
1980   NEONMAP1(vcvtnd_s64_f64, aarch64_neon_fcvtns, VectorRet | Add1ArgType),
1981   NEONMAP1(vcvtnd_u64_f64, aarch64_neon_fcvtnu, VectorRet | Add1ArgType),
1982   NEONMAP1(vcvtns_s32_f32, aarch64_neon_fcvtns, VectorRet | Add1ArgType),
1983   NEONMAP1(vcvtns_u32_f32, aarch64_neon_fcvtnu, VectorRet | Add1ArgType),
1984   NEONMAP1(vcvtpd_s64_f64, aarch64_neon_fcvtps, VectorRet | Add1ArgType),
1985   NEONMAP1(vcvtpd_u64_f64, aarch64_neon_fcvtpu, VectorRet | Add1ArgType),
1986   NEONMAP1(vcvtps_s32_f32, aarch64_neon_fcvtps, VectorRet | Add1ArgType),
1987   NEONMAP1(vcvtps_u32_f32, aarch64_neon_fcvtpu, VectorRet | Add1ArgType),
1988   NEONMAP1(vcvts_f32_s32, aarch64_neon_vcvtint2fps, AddRetType | Vectorize1ArgType),
1989   NEONMAP1(vcvts_f32_u32, aarch64_neon_vcvtint2fpu, AddRetType | Vectorize1ArgType),
1990   NEONMAP1(vcvts_n_f32_s32, aarch64_neon_vcvtfxs2fp_n, AddRetType | Vectorize1ArgType),
1991   NEONMAP1(vcvts_n_f32_u32, aarch64_neon_vcvtfxu2fp_n, AddRetType | Vectorize1ArgType),
1992   NEONMAP1(vcvts_n_s32_f32, aarch64_neon_vcvtfp2fxs_n, VectorRet | Add1ArgType),
1993   NEONMAP1(vcvts_n_u32_f32, aarch64_neon_vcvtfp2fxu_n, VectorRet | Add1ArgType),
1994   NEONMAP1(vcvts_s32_f32, aarch64_neon_fcvtzs, VectorRet | Add1ArgType),
1995   NEONMAP1(vcvts_u32_f32, aarch64_neon_fcvtzu, VectorRet | Add1ArgType),
1996   NEONMAP1(vcvtxd_f32_f64, aarch64_neon_fcvtxn, 0),
1997   NEONMAP0(vdupb_lane_i8),
1998   NEONMAP0(vdupb_laneq_i8),
1999   NEONMAP0(vdupd_lane_f64),
2000   NEONMAP0(vdupd_lane_i64),
2001   NEONMAP0(vdupd_laneq_f64),
2002   NEONMAP0(vdupd_laneq_i64),
2003   NEONMAP0(vduph_lane_i16),
2004   NEONMAP0(vduph_laneq_i16),
2005   NEONMAP0(vdups_lane_f32),
2006   NEONMAP0(vdups_lane_i32),
2007   NEONMAP0(vdups_laneq_f32),
2008   NEONMAP0(vdups_laneq_i32),
2009   NEONMAP0(vfmad_lane_f64),
2010   NEONMAP0(vfmad_laneq_f64),
2011   NEONMAP0(vfmas_lane_f32),
2012   NEONMAP0(vfmas_laneq_f32),
2013   NEONMAP0(vget_lane_f32),
2014   NEONMAP0(vget_lane_f64),
2015   NEONMAP0(vget_lane_i16),
2016   NEONMAP0(vget_lane_i32),
2017   NEONMAP0(vget_lane_i64),
2018   NEONMAP0(vget_lane_i8),
2019   NEONMAP0(vgetq_lane_f32),
2020   NEONMAP0(vgetq_lane_f64),
2021   NEONMAP0(vgetq_lane_i16),
2022   NEONMAP0(vgetq_lane_i32),
2023   NEONMAP0(vgetq_lane_i64),
2024   NEONMAP0(vgetq_lane_i8),
2025   NEONMAP1(vmaxnmv_f32, aarch64_neon_vpfmaxnm, AddRetType | Add1ArgType),
2026   NEONMAP1(vmaxnmvq_f32, aarch64_neon_vmaxnmv, 0),
2027   NEONMAP1(vmaxnmvq_f64, aarch64_neon_vpfmaxnm, AddRetType | Add1ArgType),
2028   NEONMAP1(vmaxv_f32, aarch64_neon_vpmax, AddRetType | Add1ArgType),
2029   NEONMAP1(vmaxv_s16, aarch64_neon_smaxv, VectorRet | Add1ArgType),
2030   NEONMAP1(vmaxv_s32, aarch64_neon_smaxv, VectorRet | Add1ArgType),
2031   NEONMAP1(vmaxv_s8, aarch64_neon_smaxv, VectorRet | Add1ArgType),
2032   NEONMAP1(vmaxv_u16, aarch64_neon_umaxv, VectorRet | Add1ArgType),
2033   NEONMAP1(vmaxv_u32, aarch64_neon_umaxv, VectorRet | Add1ArgType),
2034   NEONMAP1(vmaxv_u8, aarch64_neon_umaxv, VectorRet | Add1ArgType),
2035   NEONMAP1(vmaxvq_f32, aarch64_neon_vmaxv, 0),
2036   NEONMAP1(vmaxvq_f64, aarch64_neon_vpmax, AddRetType | Add1ArgType),
2037   NEONMAP1(vmaxvq_s16, aarch64_neon_smaxv, VectorRet | Add1ArgType),
2038   NEONMAP1(vmaxvq_s32, aarch64_neon_smaxv, VectorRet | Add1ArgType),
2039   NEONMAP1(vmaxvq_s8, aarch64_neon_smaxv, VectorRet | Add1ArgType),
2040   NEONMAP1(vmaxvq_u16, aarch64_neon_umaxv, VectorRet | Add1ArgType),
2041   NEONMAP1(vmaxvq_u32, aarch64_neon_umaxv, VectorRet | Add1ArgType),
2042   NEONMAP1(vmaxvq_u8, aarch64_neon_umaxv, VectorRet | Add1ArgType),
2043   NEONMAP1(vminnmv_f32, aarch64_neon_vpfminnm, AddRetType | Add1ArgType),
2044   NEONMAP1(vminnmvq_f32, aarch64_neon_vminnmv, 0),
2045   NEONMAP1(vminnmvq_f64, aarch64_neon_vpfminnm, AddRetType | Add1ArgType),
2046   NEONMAP1(vminv_f32, aarch64_neon_vpmin, AddRetType | Add1ArgType),
2047   NEONMAP1(vminv_s16, aarch64_neon_sminv, VectorRet | Add1ArgType),
2048   NEONMAP1(vminv_s32, aarch64_neon_sminv, VectorRet | Add1ArgType),
2049   NEONMAP1(vminv_s8, aarch64_neon_sminv, VectorRet | Add1ArgType),
2050   NEONMAP1(vminv_u16, aarch64_neon_uminv, VectorRet | Add1ArgType),
2051   NEONMAP1(vminv_u32, aarch64_neon_uminv, VectorRet | Add1ArgType),
2052   NEONMAP1(vminv_u8, aarch64_neon_uminv, VectorRet | Add1ArgType),
2053   NEONMAP1(vminvq_f32, aarch64_neon_vminv, 0),
2054   NEONMAP1(vminvq_f64, aarch64_neon_vpmin, AddRetType | Add1ArgType),
2055   NEONMAP1(vminvq_s16, aarch64_neon_sminv, VectorRet | Add1ArgType),
2056   NEONMAP1(vminvq_s32, aarch64_neon_sminv, VectorRet | Add1ArgType),
2057   NEONMAP1(vminvq_s8, aarch64_neon_sminv, VectorRet | Add1ArgType),
2058   NEONMAP1(vminvq_u16, aarch64_neon_uminv, VectorRet | Add1ArgType),
2059   NEONMAP1(vminvq_u32, aarch64_neon_uminv, VectorRet | Add1ArgType),
2060   NEONMAP1(vminvq_u8, aarch64_neon_uminv, VectorRet | Add1ArgType),
2061   NEONMAP0(vmul_n_f64),
2062   NEONMAP1(vmull_p64, aarch64_neon_vmull_p64, 0),
2063   NEONMAP0(vmulxd_f64),
2064   NEONMAP0(vmulxs_f32),
2065   NEONMAP1(vnegd_s64, aarch64_neon_vneg, 0),
2066   NEONMAP1(vpaddd_f64, aarch64_neon_vpfadd, AddRetType | Add1ArgType),
2067   NEONMAP1(vpaddd_s64, aarch64_neon_vpadd, 0),
2068   NEONMAP1(vpaddd_u64, aarch64_neon_vpadd, 0),
2069   NEONMAP1(vpadds_f32, aarch64_neon_vpfadd, AddRetType | Add1ArgType),
2070   NEONMAP1(vpmaxnmqd_f64, aarch64_neon_vpfmaxnm, AddRetType | Add1ArgType),
2071   NEONMAP1(vpmaxnms_f32, aarch64_neon_vpfmaxnm, AddRetType | Add1ArgType),
2072   NEONMAP1(vpmaxqd_f64, aarch64_neon_vpmax, AddRetType | Add1ArgType),
2073   NEONMAP1(vpmaxs_f32, aarch64_neon_vpmax, AddRetType | Add1ArgType),
2074   NEONMAP1(vpminnmqd_f64, aarch64_neon_vpfminnm, AddRetType | Add1ArgType),
2075   NEONMAP1(vpminnms_f32, aarch64_neon_vpfminnm, AddRetType | Add1ArgType),
2076   NEONMAP1(vpminqd_f64, aarch64_neon_vpmin, AddRetType | Add1ArgType),
2077   NEONMAP1(vpmins_f32, aarch64_neon_vpmin, AddRetType | Add1ArgType),
2078   NEONMAP1(vqabsb_s8, arm_neon_vqabs, VectorRet),
2079   NEONMAP1(vqabsd_s64, arm_neon_vqabs, VectorRet),
2080   NEONMAP1(vqabsh_s16, arm_neon_vqabs, VectorRet),
2081   NEONMAP1(vqabss_s32, arm_neon_vqabs, VectorRet),
2082   NEONMAP1(vqaddb_s8, arm_neon_vqadds, VectorRet),
2083   NEONMAP1(vqaddb_u8, arm_neon_vqaddu, VectorRet),
2084   NEONMAP1(vqaddd_s64, arm_neon_vqadds, VectorRet),
2085   NEONMAP1(vqaddd_u64, arm_neon_vqaddu, VectorRet),
2086   NEONMAP1(vqaddh_s16, arm_neon_vqadds, VectorRet),
2087   NEONMAP1(vqaddh_u16, arm_neon_vqaddu, VectorRet),
2088   NEONMAP1(vqadds_s32, arm_neon_vqadds, VectorRet),
2089   NEONMAP1(vqadds_u32, arm_neon_vqaddu, VectorRet),
2090   NEONMAP0(vqdmlalh_lane_s16),
2091   NEONMAP0(vqdmlalh_laneq_s16),
2092   NEONMAP1(vqdmlalh_s16, aarch64_neon_vqdmlal, VectorRet),
2093   NEONMAP0(vqdmlals_lane_s32),
2094   NEONMAP0(vqdmlals_laneq_s32),
2095   NEONMAP1(vqdmlals_s32, aarch64_neon_vqdmlal, VectorRet),
2096   NEONMAP0(vqdmlslh_lane_s16),
2097   NEONMAP0(vqdmlslh_laneq_s16),
2098   NEONMAP1(vqdmlslh_s16, aarch64_neon_vqdmlsl, VectorRet),
2099   NEONMAP0(vqdmlsls_lane_s32),
2100   NEONMAP0(vqdmlsls_laneq_s32),
2101   NEONMAP1(vqdmlsls_s32, aarch64_neon_vqdmlsl, VectorRet),
2102   NEONMAP1(vqdmulhh_s16, arm_neon_vqdmulh, VectorRet),
2103   NEONMAP1(vqdmulhs_s32, arm_neon_vqdmulh, VectorRet),
2104   NEONMAP1(vqdmullh_s16, arm_neon_vqdmull, VectorRet),
2105   NEONMAP1(vqdmulls_s32, arm_neon_vqdmull, VectorRet),
2106   NEONMAP1(vqmovnd_s64, arm_neon_vqmovns, VectorRet),
2107   NEONMAP1(vqmovnd_u64, arm_neon_vqmovnu, VectorRet),
2108   NEONMAP1(vqmovnh_s16, arm_neon_vqmovns, VectorRet),
2109   NEONMAP1(vqmovnh_u16, arm_neon_vqmovnu, VectorRet),
2110   NEONMAP1(vqmovns_s32, arm_neon_vqmovns, VectorRet),
2111   NEONMAP1(vqmovns_u32, arm_neon_vqmovnu, VectorRet),
2112   NEONMAP1(vqmovund_s64, arm_neon_vqmovnsu, VectorRet),
2113   NEONMAP1(vqmovunh_s16, arm_neon_vqmovnsu, VectorRet),
2114   NEONMAP1(vqmovuns_s32, arm_neon_vqmovnsu, VectorRet),
2115   NEONMAP1(vqnegb_s8, arm_neon_vqneg, VectorRet),
2116   NEONMAP1(vqnegd_s64, arm_neon_vqneg, VectorRet),
2117   NEONMAP1(vqnegh_s16, arm_neon_vqneg, VectorRet),
2118   NEONMAP1(vqnegs_s32, arm_neon_vqneg, VectorRet),
2119   NEONMAP1(vqrdmulhh_s16, arm_neon_vqrdmulh, VectorRet),
2120   NEONMAP1(vqrdmulhs_s32, arm_neon_vqrdmulh, VectorRet),
2121   NEONMAP1(vqrshlb_s8, aarch64_neon_vqrshls, VectorRet),
2122   NEONMAP1(vqrshlb_u8, aarch64_neon_vqrshlu, VectorRet),
2123   NEONMAP1(vqrshld_s64, aarch64_neon_vqrshls, VectorRet),
2124   NEONMAP1(vqrshld_u64, aarch64_neon_vqrshlu, VectorRet),
2125   NEONMAP1(vqrshlh_s16, aarch64_neon_vqrshls, VectorRet),
2126   NEONMAP1(vqrshlh_u16, aarch64_neon_vqrshlu, VectorRet),
2127   NEONMAP1(vqrshls_s32, aarch64_neon_vqrshls, VectorRet),
2128   NEONMAP1(vqrshls_u32, aarch64_neon_vqrshlu, VectorRet),
2129   NEONMAP1(vqrshrnd_n_s64, aarch64_neon_vsqrshrn, VectorRet),
2130   NEONMAP1(vqrshrnd_n_u64, aarch64_neon_vuqrshrn, VectorRet),
2131   NEONMAP1(vqrshrnh_n_s16, aarch64_neon_vsqrshrn, VectorRet),
2132   NEONMAP1(vqrshrnh_n_u16, aarch64_neon_vuqrshrn, VectorRet),
2133   NEONMAP1(vqrshrns_n_s32, aarch64_neon_vsqrshrn, VectorRet),
2134   NEONMAP1(vqrshrns_n_u32, aarch64_neon_vuqrshrn, VectorRet),
2135   NEONMAP1(vqrshrund_n_s64, aarch64_neon_vsqrshrun, VectorRet),
2136   NEONMAP1(vqrshrunh_n_s16, aarch64_neon_vsqrshrun, VectorRet),
2137   NEONMAP1(vqrshruns_n_s32, aarch64_neon_vsqrshrun, VectorRet),
2138   NEONMAP1(vqshlb_n_s8, aarch64_neon_vqshls_n, VectorRet),
2139   NEONMAP1(vqshlb_n_u8, aarch64_neon_vqshlu_n, VectorRet),
2140   NEONMAP1(vqshlb_s8, aarch64_neon_vqshls, VectorRet),
2141   NEONMAP1(vqshlb_u8, aarch64_neon_vqshlu, VectorRet),
2142   NEONMAP1(vqshld_n_s64, aarch64_neon_vqshls_n, VectorRet),
2143   NEONMAP1(vqshld_n_u64, aarch64_neon_vqshlu_n, VectorRet),
2144   NEONMAP1(vqshld_s64, aarch64_neon_vqshls, VectorRet),
2145   NEONMAP1(vqshld_u64, aarch64_neon_vqshlu, VectorRet),
2146   NEONMAP1(vqshlh_n_s16, aarch64_neon_vqshls_n, VectorRet),
2147   NEONMAP1(vqshlh_n_u16, aarch64_neon_vqshlu_n, VectorRet),
2148   NEONMAP1(vqshlh_s16, aarch64_neon_vqshls, VectorRet),
2149   NEONMAP1(vqshlh_u16, aarch64_neon_vqshlu, VectorRet),
2150   NEONMAP1(vqshls_n_s32, aarch64_neon_vqshls_n, VectorRet),
2151   NEONMAP1(vqshls_n_u32, aarch64_neon_vqshlu_n, VectorRet),
2152   NEONMAP1(vqshls_s32, aarch64_neon_vqshls, VectorRet),
2153   NEONMAP1(vqshls_u32, aarch64_neon_vqshlu, VectorRet),
2154   NEONMAP1(vqshlub_n_s8, aarch64_neon_vsqshlu, VectorRet),
2155   NEONMAP1(vqshlud_n_s64, aarch64_neon_vsqshlu, VectorRet),
2156   NEONMAP1(vqshluh_n_s16, aarch64_neon_vsqshlu, VectorRet),
2157   NEONMAP1(vqshlus_n_s32, aarch64_neon_vsqshlu, VectorRet),
2158   NEONMAP1(vqshrnd_n_s64, aarch64_neon_vsqshrn, VectorRet),
2159   NEONMAP1(vqshrnd_n_u64, aarch64_neon_vuqshrn, VectorRet),
2160   NEONMAP1(vqshrnh_n_s16, aarch64_neon_vsqshrn, VectorRet),
2161   NEONMAP1(vqshrnh_n_u16, aarch64_neon_vuqshrn, VectorRet),
2162   NEONMAP1(vqshrns_n_s32, aarch64_neon_vsqshrn, VectorRet),
2163   NEONMAP1(vqshrns_n_u32, aarch64_neon_vuqshrn, VectorRet),
2164   NEONMAP1(vqshrund_n_s64, aarch64_neon_vsqshrun, VectorRet),
2165   NEONMAP1(vqshrunh_n_s16, aarch64_neon_vsqshrun, VectorRet),
2166   NEONMAP1(vqshruns_n_s32, aarch64_neon_vsqshrun, VectorRet),
2167   NEONMAP1(vqsubb_s8, arm_neon_vqsubs, VectorRet),
2168   NEONMAP1(vqsubb_u8, arm_neon_vqsubu, VectorRet),
2169   NEONMAP1(vqsubd_s64, arm_neon_vqsubs, VectorRet),
2170   NEONMAP1(vqsubd_u64, arm_neon_vqsubu, VectorRet),
2171   NEONMAP1(vqsubh_s16, arm_neon_vqsubs, VectorRet),
2172   NEONMAP1(vqsubh_u16, arm_neon_vqsubu, VectorRet),
2173   NEONMAP1(vqsubs_s32, arm_neon_vqsubs, VectorRet),
2174   NEONMAP1(vqsubs_u32, arm_neon_vqsubu, VectorRet),
2175   NEONMAP1(vrecped_f64, aarch64_neon_vrecpe, AddRetType),
2176   NEONMAP1(vrecpes_f32, aarch64_neon_vrecpe, AddRetType),
2177   NEONMAP1(vrecpsd_f64, aarch64_neon_vrecps, AddRetType),
2178   NEONMAP1(vrecpss_f32, aarch64_neon_vrecps, AddRetType),
2179   NEONMAP1(vrecpxd_f64, aarch64_neon_vrecpx, AddRetType),
2180   NEONMAP1(vrecpxs_f32, aarch64_neon_vrecpx, AddRetType),
2181   NEONMAP1(vrshld_s64, aarch64_neon_vrshlds, 0),
2182   NEONMAP1(vrshld_u64, aarch64_neon_vrshldu, 0),
2183   NEONMAP1(vrshrd_n_s64, aarch64_neon_vsrshr, VectorRet),
2184   NEONMAP1(vrshrd_n_u64, aarch64_neon_vurshr, VectorRet),
2185   NEONMAP1(vrsqrted_f64, aarch64_neon_vrsqrte, AddRetType),
2186   NEONMAP1(vrsqrtes_f32, aarch64_neon_vrsqrte, AddRetType),
2187   NEONMAP1(vrsqrtsd_f64, aarch64_neon_vrsqrts, AddRetType),
2188   NEONMAP1(vrsqrtss_f32, aarch64_neon_vrsqrts, AddRetType),
2189   NEONMAP1(vrsrad_n_s64, aarch64_neon_vrsrads_n, 0),
2190   NEONMAP1(vrsrad_n_u64, aarch64_neon_vrsradu_n, 0),
2191   NEONMAP0(vset_lane_f32),
2192   NEONMAP0(vset_lane_f64),
2193   NEONMAP0(vset_lane_i16),
2194   NEONMAP0(vset_lane_i32),
2195   NEONMAP0(vset_lane_i64),
2196   NEONMAP0(vset_lane_i8),
2197   NEONMAP0(vsetq_lane_f32),
2198   NEONMAP0(vsetq_lane_f64),
2199   NEONMAP0(vsetq_lane_i16),
2200   NEONMAP0(vsetq_lane_i32),
2201   NEONMAP0(vsetq_lane_i64),
2202   NEONMAP0(vsetq_lane_i8),
2203   NEONMAP1(vsha1cq_u32, arm_neon_sha1c, 0),
2204   NEONMAP1(vsha1h_u32, arm_neon_sha1h, 0),
2205   NEONMAP1(vsha1mq_u32, arm_neon_sha1m, 0),
2206   NEONMAP1(vsha1pq_u32, arm_neon_sha1p, 0),
2207   NEONMAP1(vshld_n_s64, aarch64_neon_vshld_n, 0),
2208   NEONMAP1(vshld_n_u64, aarch64_neon_vshld_n, 0),
2209   NEONMAP1(vshld_s64, aarch64_neon_vshlds, 0),
2210   NEONMAP1(vshld_u64, aarch64_neon_vshldu, 0),
2211   NEONMAP1(vshrd_n_s64, aarch64_neon_vshrds_n, 0),
2212   NEONMAP1(vshrd_n_u64, aarch64_neon_vshrdu_n, 0),
2213   NEONMAP1(vslid_n_s64, aarch64_neon_vsli, VectorRet),
2214   NEONMAP1(vslid_n_u64, aarch64_neon_vsli, VectorRet),
2215   NEONMAP1(vsqaddb_u8, aarch64_neon_vsqadd, VectorRet),
2216   NEONMAP1(vsqaddd_u64, aarch64_neon_vsqadd, VectorRet),
2217   NEONMAP1(vsqaddh_u16, aarch64_neon_vsqadd, VectorRet),
2218   NEONMAP1(vsqadds_u32, aarch64_neon_vsqadd, VectorRet),
2219   NEONMAP1(vsrad_n_s64, aarch64_neon_vsrads_n, 0),
2220   NEONMAP1(vsrad_n_u64, aarch64_neon_vsradu_n, 0),
2221   NEONMAP1(vsrid_n_s64, aarch64_neon_vsri, VectorRet),
2222   NEONMAP1(vsrid_n_u64, aarch64_neon_vsri, VectorRet),
2223   NEONMAP1(vsubd_s64, aarch64_neon_vsubds, 0),
2224   NEONMAP1(vsubd_u64, aarch64_neon_vsubdu, 0),
2225   NEONMAP1(vtstd_s64, aarch64_neon_vtstd, VectorRetGetArgs01),
2226   NEONMAP1(vtstd_u64, aarch64_neon_vtstd, VectorRetGetArgs01),
2227   NEONMAP1(vuqaddb_s8, aarch64_neon_vuqadd, VectorRet),
2228   NEONMAP1(vuqaddd_s64, aarch64_neon_vuqadd, VectorRet),
2229   NEONMAP1(vuqaddh_s16, aarch64_neon_vuqadd, VectorRet),
2230   NEONMAP1(vuqadds_s32, aarch64_neon_vuqadd, VectorRet)
2231 };
2232 
/// ARMSIMDIntrinsicMap - Table pairing NEON builtins (first macro argument)
/// with the intrinsic names listed beside them: mostly arm_neon_*, plus the
/// unprefixed ctlz and ctpop.
///
/// Each row is written with one of three macros, which differ in this table
/// only by how many intrinsic names they carry:
///   NEONMAP0(builtin)                          - no intrinsic listed.
///   NEONMAP1(builtin, intrinsic, flags)        - one intrinsic.
///   NEONMAP2(builtin, intrinsic, alt, flags)   - two intrinsics.
/// The flag expressions used here are 0, Add1ArgType, AddRetType and
/// UnsignedAlts (optionally OR'ed together). The macros, the flags and the
/// NeonIntrinsicInfo type are all defined outside this table.
///
/// In every row flagged UnsignedAlts the first intrinsic is the "...u" name
/// and the second the "...s" name.
/// NOTE(review): presumably the flag selects between them by the signedness
/// of the element type - confirm against the flag's definition.
///
/// Rows are in ascending order of builtin name.
/// NOTE(review): presumably the lookup code depends on that ordering (e.g. a
/// binary search), so new rows should be inserted in sorted position - confirm.
2233 static NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = {
2234   NEONMAP2(vabd_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
2235   NEONMAP2(vabdq_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
2236   NEONMAP1(vabs_v, arm_neon_vabs, 0),
2237   NEONMAP1(vabsq_v, arm_neon_vabs, 0),
2238   NEONMAP0(vaddhn_v),
2239   NEONMAP1(vaesdq_v, arm_neon_aesd, 0),
2240   NEONMAP1(vaeseq_v, arm_neon_aese, 0),
2241   NEONMAP1(vaesimcq_v, arm_neon_aesimc, 0),
2242   NEONMAP1(vaesmcq_v, arm_neon_aesmc, 0),
2243   NEONMAP1(vbsl_v, arm_neon_vbsl, AddRetType),
2244   NEONMAP1(vbslq_v, arm_neon_vbsl, AddRetType),
2245   NEONMAP1(vcage_v, arm_neon_vacge, 0),
2246   NEONMAP1(vcageq_v, arm_neon_vacge, 0),
2247   NEONMAP1(vcagt_v, arm_neon_vacgt, 0),
2248   NEONMAP1(vcagtq_v, arm_neon_vacgt, 0),
  // vcale/vcalt reuse the vacge/vacgt intrinsics of vcage/vcagt above.
  // NOTE(review): presumably the operands are swapped where these rows are
  // consumed - confirm.
2249   NEONMAP1(vcale_v, arm_neon_vacge, 0),
2250   NEONMAP1(vcaleq_v, arm_neon_vacge, 0),
2251   NEONMAP1(vcalt_v, arm_neon_vacgt, 0),
2252   NEONMAP1(vcaltq_v, arm_neon_vacgt, 0),
2253   NEONMAP1(vcls_v, arm_neon_vcls, Add1ArgType),
2254   NEONMAP1(vclsq_v, arm_neon_vcls, Add1ArgType),
  // ctlz and ctpop are the only intrinsic names in this table without an
  // arm_neon_ prefix.
2255   NEONMAP1(vclz_v, ctlz, Add1ArgType),
2256   NEONMAP1(vclzq_v, ctlz, Add1ArgType),
2257   NEONMAP1(vcnt_v, ctpop, Add1ArgType),
2258   NEONMAP1(vcntq_v, ctpop, Add1ArgType),
2259   NEONMAP1(vcvt_f16_v, arm_neon_vcvtfp2hf, 0),
2260   NEONMAP1(vcvt_f32_f16, arm_neon_vcvthf2fp, 0),
2261   NEONMAP0(vcvt_f32_v),
  // Two intrinsics (fxu/fxs) but no UnsignedAlts flag on this row.
2262   NEONMAP2(vcvt_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
2263   NEONMAP1(vcvt_n_s32_v, arm_neon_vcvtfp2fxs, 0),
2264   NEONMAP1(vcvt_n_s64_v, arm_neon_vcvtfp2fxs, 0),
2265   NEONMAP1(vcvt_n_u32_v, arm_neon_vcvtfp2fxu, 0),
2266   NEONMAP1(vcvt_n_u64_v, arm_neon_vcvtfp2fxu, 0),
2267   NEONMAP0(vcvt_s32_v),
2268   NEONMAP0(vcvt_s64_v),
2269   NEONMAP0(vcvt_u32_v),
2270   NEONMAP0(vcvt_u64_v),
2271   NEONMAP1(vcvta_s32_v, arm_neon_vcvtas, 0),
2272   NEONMAP1(vcvta_s64_v, arm_neon_vcvtas, 0),
2273   NEONMAP1(vcvta_u32_v, arm_neon_vcvtau, 0),
2274   NEONMAP1(vcvta_u64_v, arm_neon_vcvtau, 0),
2275   NEONMAP1(vcvtaq_s32_v, arm_neon_vcvtas, 0),
2276   NEONMAP1(vcvtaq_s64_v, arm_neon_vcvtas, 0),
2277   NEONMAP1(vcvtaq_u32_v, arm_neon_vcvtau, 0),
2278   NEONMAP1(vcvtaq_u64_v, arm_neon_vcvtau, 0),
2279   NEONMAP1(vcvtm_s32_v, arm_neon_vcvtms, 0),
2280   NEONMAP1(vcvtm_s64_v, arm_neon_vcvtms, 0),
2281   NEONMAP1(vcvtm_u32_v, arm_neon_vcvtmu, 0),
2282   NEONMAP1(vcvtm_u64_v, arm_neon_vcvtmu, 0),
2283   NEONMAP1(vcvtmq_s32_v, arm_neon_vcvtms, 0),
2284   NEONMAP1(vcvtmq_s64_v, arm_neon_vcvtms, 0),
2285   NEONMAP1(vcvtmq_u32_v, arm_neon_vcvtmu, 0),
2286   NEONMAP1(vcvtmq_u64_v, arm_neon_vcvtmu, 0),
2287   NEONMAP1(vcvtn_s32_v, arm_neon_vcvtns, 0),
2288   NEONMAP1(vcvtn_s64_v, arm_neon_vcvtns, 0),
2289   NEONMAP1(vcvtn_u32_v, arm_neon_vcvtnu, 0),
2290   NEONMAP1(vcvtn_u64_v, arm_neon_vcvtnu, 0),
2291   NEONMAP1(vcvtnq_s32_v, arm_neon_vcvtns, 0),
2292   NEONMAP1(vcvtnq_s64_v, arm_neon_vcvtns, 0),
2293   NEONMAP1(vcvtnq_u32_v, arm_neon_vcvtnu, 0),
2294   NEONMAP1(vcvtnq_u64_v, arm_neon_vcvtnu, 0),
2295   NEONMAP1(vcvtp_s32_v, arm_neon_vcvtps, 0),
2296   NEONMAP1(vcvtp_s64_v, arm_neon_vcvtps, 0),
2297   NEONMAP1(vcvtp_u32_v, arm_neon_vcvtpu, 0),
2298   NEONMAP1(vcvtp_u64_v, arm_neon_vcvtpu, 0),
2299   NEONMAP1(vcvtpq_s32_v, arm_neon_vcvtps, 0),
2300   NEONMAP1(vcvtpq_s64_v, arm_neon_vcvtps, 0),
2301   NEONMAP1(vcvtpq_u32_v, arm_neon_vcvtpu, 0),
2302   NEONMAP1(vcvtpq_u64_v, arm_neon_vcvtpu, 0),
2303   NEONMAP0(vcvtq_f32_v),
2304   NEONMAP2(vcvtq_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
2305   NEONMAP1(vcvtq_n_s32_v, arm_neon_vcvtfp2fxs, 0),
2306   NEONMAP1(vcvtq_n_s64_v, arm_neon_vcvtfp2fxs, 0),
2307   NEONMAP1(vcvtq_n_u32_v, arm_neon_vcvtfp2fxu, 0),
2308   NEONMAP1(vcvtq_n_u64_v, arm_neon_vcvtfp2fxu, 0),
2309   NEONMAP0(vcvtq_s32_v),
2310   NEONMAP0(vcvtq_s64_v),
2311   NEONMAP0(vcvtq_u32_v),
2312   NEONMAP0(vcvtq_u64_v),
2313   NEONMAP0(vext_v),
2314   NEONMAP0(vextq_v),
2315   NEONMAP0(vfma_v),
2316   NEONMAP0(vfmaq_v),
2317   NEONMAP2(vhadd_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
2318   NEONMAP2(vhaddq_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
2319   NEONMAP2(vhsub_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
2320   NEONMAP2(vhsubq_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
2321   NEONMAP0(vld1_dup_v),
2322   NEONMAP1(vld1_v, arm_neon_vld1, 0),
2323   NEONMAP0(vld1q_dup_v),
2324   NEONMAP1(vld1q_v, arm_neon_vld1, 0),
2325   NEONMAP1(vld2_lane_v, arm_neon_vld2lane, 0),
2326   NEONMAP1(vld2_v, arm_neon_vld2, 0),
2327   NEONMAP1(vld2q_lane_v, arm_neon_vld2lane, 0),
2328   NEONMAP1(vld2q_v, arm_neon_vld2, 0),
2329   NEONMAP1(vld3_lane_v, arm_neon_vld3lane, 0),
2330   NEONMAP1(vld3_v, arm_neon_vld3, 0),
2331   NEONMAP1(vld3q_lane_v, arm_neon_vld3lane, 0),
2332   NEONMAP1(vld3q_v, arm_neon_vld3, 0),
2333   NEONMAP1(vld4_lane_v, arm_neon_vld4lane, 0),
2334   NEONMAP1(vld4_v, arm_neon_vld4, 0),
2335   NEONMAP1(vld4q_lane_v, arm_neon_vld4lane, 0),
2336   NEONMAP1(vld4q_v, arm_neon_vld4, 0),
2337   NEONMAP2(vmax_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
2338   NEONMAP2(vmaxq_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
2339   NEONMAP2(vmin_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
2340   NEONMAP2(vminq_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
2341   NEONMAP0(vmovl_v),
2342   NEONMAP0(vmovn_v),
2343   NEONMAP1(vmul_v, arm_neon_vmulp, Add1ArgType),
2344   NEONMAP0(vmull_v),
2345   NEONMAP1(vmulq_v, arm_neon_vmulp, Add1ArgType),
2346   NEONMAP2(vpadal_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
2347   NEONMAP2(vpadalq_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
2348   NEONMAP1(vpadd_v, arm_neon_vpadd, Add1ArgType),
2349   NEONMAP2(vpaddl_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
2350   NEONMAP2(vpaddlq_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
2351   NEONMAP1(vpaddq_v, arm_neon_vpadd, Add1ArgType),
2352   NEONMAP2(vpmax_v, arm_neon_vpmaxu, arm_neon_vpmaxs, Add1ArgType | UnsignedAlts),
2353   NEONMAP2(vpmin_v, arm_neon_vpminu, arm_neon_vpmins, Add1ArgType | UnsignedAlts),
2354   NEONMAP1(vqabs_v, arm_neon_vqabs, Add1ArgType),
2355   NEONMAP1(vqabsq_v, arm_neon_vqabs, Add1ArgType),
2356   NEONMAP2(vqadd_v, arm_neon_vqaddu, arm_neon_vqadds, Add1ArgType | UnsignedAlts),
2357   NEONMAP2(vqaddq_v, arm_neon_vqaddu, arm_neon_vqadds, Add1ArgType | UnsignedAlts),
  // Unlike the UnsignedAlts rows, the two names here are different operations
  // (vqdmull paired with vqadds / vqsubs), not a signed/unsigned pair.
  // NOTE(review): presumably both are emitted in sequence by the consumer -
  // confirm.
2358   NEONMAP2(vqdmlal_v, arm_neon_vqdmull, arm_neon_vqadds, 0),
2359   NEONMAP2(vqdmlsl_v, arm_neon_vqdmull, arm_neon_vqsubs, 0),
2360   NEONMAP1(vqdmulh_v, arm_neon_vqdmulh, Add1ArgType),
2361   NEONMAP1(vqdmulhq_v, arm_neon_vqdmulh, Add1ArgType),
2362   NEONMAP1(vqdmull_v, arm_neon_vqdmull, Add1ArgType),
2363   NEONMAP2(vqmovn_v, arm_neon_vqmovnu, arm_neon_vqmovns, Add1ArgType | UnsignedAlts),
2364   NEONMAP1(vqmovun_v, arm_neon_vqmovnsu, Add1ArgType),
2365   NEONMAP1(vqneg_v, arm_neon_vqneg, Add1ArgType),
2366   NEONMAP1(vqnegq_v, arm_neon_vqneg, Add1ArgType),
2367   NEONMAP1(vqrdmulh_v, arm_neon_vqrdmulh, Add1ArgType),
2368   NEONMAP1(vqrdmulhq_v, arm_neon_vqrdmulh, Add1ArgType),
2369   NEONMAP2(vqrshl_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
2370   NEONMAP2(vqrshlq_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
2371   NEONMAP2(vqshl_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
2372   NEONMAP2(vqshl_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
2373   NEONMAP2(vqshlq_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
2374   NEONMAP2(vqshlq_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
2375   NEONMAP2(vqsub_v, arm_neon_vqsubu, arm_neon_vqsubs, Add1ArgType | UnsignedAlts),
2376   NEONMAP2(vqsubq_v, arm_neon_vqsubu, arm_neon_vqsubs, Add1ArgType | UnsignedAlts),
2377   NEONMAP1(vraddhn_v, arm_neon_vraddhn, Add1ArgType),
  // Both intrinsic slots name the same intrinsic on the vrecpe rows (and on
  // the vrsqrte rows further down).
2378   NEONMAP2(vrecpe_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
2379   NEONMAP2(vrecpeq_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
2380   NEONMAP1(vrecps_v, arm_neon_vrecps, Add1ArgType),
2381   NEONMAP1(vrecpsq_v, arm_neon_vrecps, Add1ArgType),
2382   NEONMAP2(vrhadd_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
2383   NEONMAP2(vrhaddq_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
2384   NEONMAP2(vrshl_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
2385   NEONMAP2(vrshlq_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
2386   NEONMAP2(vrsqrte_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
2387   NEONMAP2(vrsqrteq_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
2388   NEONMAP1(vrsqrts_v, arm_neon_vrsqrts, Add1ArgType),
2389   NEONMAP1(vrsqrtsq_v, arm_neon_vrsqrts, Add1ArgType),
2390   NEONMAP1(vrsubhn_v, arm_neon_vrsubhn, Add1ArgType),
2391   NEONMAP1(vsha1su0q_v, arm_neon_sha1su0, 0),
2392   NEONMAP1(vsha1su1q_v, arm_neon_sha1su1, 0),
2393   NEONMAP1(vsha256h2q_v, arm_neon_sha256h2, 0),
2394   NEONMAP1(vsha256hq_v, arm_neon_sha256h, 0),
2395   NEONMAP1(vsha256su0q_v, arm_neon_sha256su0, 0),
2396   NEONMAP1(vsha256su1q_v, arm_neon_sha256su1, 0),
2397   NEONMAP0(vshl_n_v),
2398   NEONMAP2(vshl_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
2399   NEONMAP0(vshll_n_v),
2400   NEONMAP0(vshlq_n_v),
2401   NEONMAP2(vshlq_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
2402   NEONMAP0(vshr_n_v),
2403   NEONMAP0(vshrn_n_v),
2404   NEONMAP0(vshrq_n_v),
2405   NEONMAP1(vst1_v, arm_neon_vst1, 0),
2406   NEONMAP1(vst1q_v, arm_neon_vst1, 0),
2407   NEONMAP1(vst2_lane_v, arm_neon_vst2lane, 0),
2408   NEONMAP1(vst2_v, arm_neon_vst2, 0),
2409   NEONMAP1(vst2q_lane_v, arm_neon_vst2lane, 0),
2410   NEONMAP1(vst2q_v, arm_neon_vst2, 0),
2411   NEONMAP1(vst3_lane_v, arm_neon_vst3lane, 0),
2412   NEONMAP1(vst3_v, arm_neon_vst3, 0),
2413   NEONMAP1(vst3q_lane_v, arm_neon_vst3lane, 0),
2414   NEONMAP1(vst3q_v, arm_neon_vst3, 0),
2415   NEONMAP1(vst4_lane_v, arm_neon_vst4lane, 0),
2416   NEONMAP1(vst4_v, arm_neon_vst4, 0),
2417   NEONMAP1(vst4q_lane_v, arm_neon_vst4lane, 0),
2418   NEONMAP1(vst4q_v, arm_neon_vst4, 0),
2419   NEONMAP0(vsubhn_v),
2420   NEONMAP0(vtrn_v),
2421   NEONMAP0(vtrnq_v),
2422   NEONMAP0(vtst_v),
2423   NEONMAP0(vtstq_v),
2424   NEONMAP0(vuzp_v),
2425   NEONMAP0(vuzpq_v),
2426   NEONMAP0(vzip_v),
2427   NEONMAP0(vzipq_v)
2428 };
2429 
/// ARM64SIMDIntrinsicMap - Table pairing NEON builtins (first macro argument)
/// with the intrinsic names listed beside them: arm64_neon_* and
/// arm64_crypto_*, plus the unprefixed ctlz and ctpop.
///
/// Each row is written with one of three macros, which differ in this table
/// only by how many intrinsic names they carry:
///   NEONMAP0(builtin)                          - no intrinsic listed.
///   NEONMAP1(builtin, intrinsic, flags)        - one intrinsic.
///   NEONMAP2(builtin, intrinsic, alt, flags)   - two intrinsics.
/// The flag expressions used here are 0, Add1ArgType, AddRetType and
/// UnsignedAlts (optionally OR'ed together). The macros, the flags and the
/// NeonIntrinsicInfo type are all defined outside this table.
///
/// In every row flagged UnsignedAlts the first intrinsic is the "u..." name
/// and the second the "s..." name.
/// NOTE(review): presumably the flag selects between them by the signedness
/// of the element type - confirm against the flag's definition.
///
/// Rows are in ascending order of builtin name.
/// NOTE(review): presumably the lookup code depends on that ordering (e.g. a
/// binary search), so new rows should be inserted in sorted position - confirm.
2430 static NeonIntrinsicInfo ARM64SIMDIntrinsicMap[] = {
2431   NEONMAP1(vabs_v, arm64_neon_abs, 0),
2432   NEONMAP1(vabsq_v, arm64_neon_abs, 0),
2433   NEONMAP0(vaddhn_v),
2434   NEONMAP1(vaesdq_v, arm64_crypto_aesd, 0),
2435   NEONMAP1(vaeseq_v, arm64_crypto_aese, 0),
2436   NEONMAP1(vaesimcq_v, arm64_crypto_aesimc, 0),
2437   NEONMAP1(vaesmcq_v, arm64_crypto_aesmc, 0),
2438   NEONMAP1(vcage_v, arm64_neon_facge, 0),
2439   NEONMAP1(vcageq_v, arm64_neon_facge, 0),
2440   NEONMAP1(vcagt_v, arm64_neon_facgt, 0),
2441   NEONMAP1(vcagtq_v, arm64_neon_facgt, 0),
  // vcale/vcalt reuse the facge/facgt intrinsics of vcage/vcagt above.
  // NOTE(review): presumably the operands are swapped where these rows are
  // consumed - confirm.
2442   NEONMAP1(vcale_v, arm64_neon_facge, 0),
2443   NEONMAP1(vcaleq_v, arm64_neon_facge, 0),
2444   NEONMAP1(vcalt_v, arm64_neon_facgt, 0),
2445   NEONMAP1(vcaltq_v, arm64_neon_facgt, 0),
2446   NEONMAP1(vcls_v, arm64_neon_cls, Add1ArgType),
2447   NEONMAP1(vclsq_v, arm64_neon_cls, Add1ArgType),
  // ctlz and ctpop are the only intrinsic names in this table without an
  // arm64_ prefix.
2448   NEONMAP1(vclz_v, ctlz, Add1ArgType),
2449   NEONMAP1(vclzq_v, ctlz, Add1ArgType),
2450   NEONMAP1(vcnt_v, ctpop, Add1ArgType),
2451   NEONMAP1(vcntq_v, ctpop, Add1ArgType),
2452   NEONMAP1(vcvt_f16_v, arm64_neon_vcvtfp2hf, 0),
2453   NEONMAP1(vcvt_f32_f16, arm64_neon_vcvthf2fp, 0),
2454   NEONMAP0(vcvt_f32_v),
  // Two intrinsics (fxu/fxs) but no UnsignedAlts flag on the vcvt*_n_f* rows.
2455   NEONMAP2(vcvt_n_f32_v, arm64_neon_vcvtfxu2fp, arm64_neon_vcvtfxs2fp, 0),
2456   NEONMAP2(vcvt_n_f64_v, arm64_neon_vcvtfxu2fp, arm64_neon_vcvtfxs2fp, 0),
2457   NEONMAP1(vcvt_n_s32_v, arm64_neon_vcvtfp2fxs, 0),
2458   NEONMAP1(vcvt_n_s64_v, arm64_neon_vcvtfp2fxs, 0),
2459   NEONMAP1(vcvt_n_u32_v, arm64_neon_vcvtfp2fxu, 0),
2460   NEONMAP1(vcvt_n_u64_v, arm64_neon_vcvtfp2fxu, 0),
2461   NEONMAP0(vcvtq_f32_v),
2462   NEONMAP2(vcvtq_n_f32_v, arm64_neon_vcvtfxu2fp, arm64_neon_vcvtfxs2fp, 0),
2463   NEONMAP2(vcvtq_n_f64_v, arm64_neon_vcvtfxu2fp, arm64_neon_vcvtfxs2fp, 0),
2464   NEONMAP1(vcvtq_n_s32_v, arm64_neon_vcvtfp2fxs, 0),
2465   NEONMAP1(vcvtq_n_s64_v, arm64_neon_vcvtfp2fxs, 0),
2466   NEONMAP1(vcvtq_n_u32_v, arm64_neon_vcvtfp2fxu, 0),
2467   NEONMAP1(vcvtq_n_u64_v, arm64_neon_vcvtfp2fxu, 0),
2468   NEONMAP1(vcvtx_f32_v, arm64_neon_fcvtxn, AddRetType | Add1ArgType),
2469   NEONMAP0(vext_v),
2470   NEONMAP0(vextq_v),
2471   NEONMAP0(vfma_v),
2472   NEONMAP0(vfmaq_v),
2473   NEONMAP2(vhadd_v, arm64_neon_uhadd, arm64_neon_shadd, Add1ArgType | UnsignedAlts),
2474   NEONMAP2(vhaddq_v, arm64_neon_uhadd, arm64_neon_shadd, Add1ArgType | UnsignedAlts),
2475   NEONMAP2(vhsub_v, arm64_neon_uhsub, arm64_neon_shsub, Add1ArgType | UnsignedAlts),
2476   NEONMAP2(vhsubq_v, arm64_neon_uhsub, arm64_neon_shsub, Add1ArgType | UnsignedAlts),
2477   NEONMAP0(vmovl_v),
2478   NEONMAP0(vmovn_v),
2479   NEONMAP1(vmul_v, arm64_neon_pmul, Add1ArgType),
2480   NEONMAP1(vmulq_v, arm64_neon_pmul, Add1ArgType),
2481   NEONMAP1(vpadd_v, arm64_neon_addp, Add1ArgType),
2482   NEONMAP2(vpaddl_v, arm64_neon_uaddlp, arm64_neon_saddlp, UnsignedAlts),
2483   NEONMAP2(vpaddlq_v, arm64_neon_uaddlp, arm64_neon_saddlp, UnsignedAlts),
2484   NEONMAP1(vpaddq_v, arm64_neon_addp, Add1ArgType),
2485   NEONMAP1(vqabs_v, arm64_neon_sqabs, Add1ArgType),
2486   NEONMAP1(vqabsq_v, arm64_neon_sqabs, Add1ArgType),
2487   NEONMAP2(vqadd_v, arm64_neon_uqadd, arm64_neon_sqadd, Add1ArgType | UnsignedAlts),
2488   NEONMAP2(vqaddq_v, arm64_neon_uqadd, arm64_neon_sqadd, Add1ArgType | UnsignedAlts),
  // Unlike the UnsignedAlts rows, the two names here are different operations
  // (sqdmull paired with sqadd / sqsub), not a signed/unsigned pair.
  // NOTE(review): presumably both are emitted in sequence by the consumer -
  // confirm.
2489   NEONMAP2(vqdmlal_v, arm64_neon_sqdmull, arm64_neon_sqadd, 0),
2490   NEONMAP2(vqdmlsl_v, arm64_neon_sqdmull, arm64_neon_sqsub, 0),
2491   NEONMAP1(vqdmulh_v, arm64_neon_sqdmulh, Add1ArgType),
2492   NEONMAP1(vqdmulhq_v, arm64_neon_sqdmulh, Add1ArgType),
2493   NEONMAP1(vqdmull_v, arm64_neon_sqdmull, Add1ArgType),
2494   NEONMAP2(vqmovn_v, arm64_neon_uqxtn, arm64_neon_sqxtn, Add1ArgType | UnsignedAlts),
2495   NEONMAP1(vqmovun_v, arm64_neon_sqxtun, Add1ArgType),
2496   NEONMAP1(vqneg_v, arm64_neon_sqneg, Add1ArgType),
2497   NEONMAP1(vqnegq_v, arm64_neon_sqneg, Add1ArgType),
2498   NEONMAP1(vqrdmulh_v, arm64_neon_sqrdmulh, Add1ArgType),
2499   NEONMAP1(vqrdmulhq_v, arm64_neon_sqrdmulh, Add1ArgType),
2500   NEONMAP2(vqrshl_v, arm64_neon_uqrshl, arm64_neon_sqrshl, Add1ArgType | UnsignedAlts),
2501   NEONMAP2(vqrshlq_v, arm64_neon_uqrshl, arm64_neon_sqrshl, Add1ArgType | UnsignedAlts),
2502   NEONMAP2(vqshl_n_v, arm64_neon_uqshl, arm64_neon_sqshl, UnsignedAlts),
2503   NEONMAP2(vqshl_v, arm64_neon_uqshl, arm64_neon_sqshl, Add1ArgType | UnsignedAlts),
2504   NEONMAP2(vqshlq_n_v, arm64_neon_uqshl, arm64_neon_sqshl,UnsignedAlts),
2505   NEONMAP2(vqshlq_v, arm64_neon_uqshl, arm64_neon_sqshl, Add1ArgType | UnsignedAlts),
2506   NEONMAP2(vqsub_v, arm64_neon_uqsub, arm64_neon_sqsub, Add1ArgType | UnsignedAlts),
2507   NEONMAP2(vqsubq_v, arm64_neon_uqsub, arm64_neon_sqsub, Add1ArgType | UnsignedAlts),
2508   NEONMAP1(vraddhn_v, arm64_neon_raddhn, Add1ArgType),
  // The vrecpe rows (and the vrsqrte rows further down) pair an "f..." name
  // with a "u..." name and carry no UnsignedAlts flag.
  // NOTE(review): judging by the names these are the floating-point and
  // unsigned-integer forms; how the consumer picks one is not visible here.
2509   NEONMAP2(vrecpe_v, arm64_neon_frecpe, arm64_neon_urecpe, 0),
2510   NEONMAP2(vrecpeq_v, arm64_neon_frecpe, arm64_neon_urecpe, 0),
2511   NEONMAP1(vrecps_v, arm64_neon_frecps, Add1ArgType),
2512   NEONMAP1(vrecpsq_v, arm64_neon_frecps, Add1ArgType),
2513   NEONMAP2(vrhadd_v, arm64_neon_urhadd, arm64_neon_srhadd, Add1ArgType | UnsignedAlts),
2514   NEONMAP2(vrhaddq_v, arm64_neon_urhadd, arm64_neon_srhadd, Add1ArgType | UnsignedAlts),
2515   NEONMAP2(vrshl_v, arm64_neon_urshl, arm64_neon_srshl, Add1ArgType | UnsignedAlts),
2516   NEONMAP2(vrshlq_v, arm64_neon_urshl, arm64_neon_srshl, Add1ArgType | UnsignedAlts),
2517   NEONMAP2(vrsqrte_v, arm64_neon_frsqrte, arm64_neon_ursqrte, 0),
2518   NEONMAP2(vrsqrteq_v, arm64_neon_frsqrte, arm64_neon_ursqrte, 0),
2519   NEONMAP1(vrsqrts_v, arm64_neon_frsqrts, Add1ArgType),
2520   NEONMAP1(vrsqrtsq_v, arm64_neon_frsqrts, Add1ArgType),
2521   NEONMAP1(vrsubhn_v, arm64_neon_rsubhn, Add1ArgType),
2522   NEONMAP1(vsha1su0q_v, arm64_crypto_sha1su0, 0),
2523   NEONMAP1(vsha1su1q_v, arm64_crypto_sha1su1, 0),
2524   NEONMAP1(vsha256h2q_v, arm64_crypto_sha256h2, 0),
2525   NEONMAP1(vsha256hq_v, arm64_crypto_sha256h, 0),
2526   NEONMAP1(vsha256su0q_v, arm64_crypto_sha256su0, 0),
2527   NEONMAP1(vsha256su1q_v, arm64_crypto_sha256su1, 0),
2528   NEONMAP0(vshl_n_v),
2529   NEONMAP2(vshl_v, arm64_neon_ushl, arm64_neon_sshl, Add1ArgType | UnsignedAlts),
2530   NEONMAP0(vshll_n_v),
2531   NEONMAP0(vshlq_n_v),
2532   NEONMAP2(vshlq_v, arm64_neon_ushl, arm64_neon_sshl, Add1ArgType | UnsignedAlts),
2533   NEONMAP0(vshr_n_v),
2534   NEONMAP0(vshrn_n_v),
2535   NEONMAP0(vshrq_n_v),
2536   NEONMAP0(vsubhn_v),
2537   NEONMAP0(vtst_v),
  // The final row keeps a trailing comma, which an aggregate initializer
  // permits.
2538   NEONMAP0(vtstq_v),
2539 };
2540 
2541 static NeonIntrinsicInfo ARM64SISDIntrinsicMap[] = {
2542   NEONMAP1(vabdd_f64, arm64_sisd_fabd, Add1ArgType),
2543   NEONMAP1(vabds_f32, arm64_sisd_fabd, Add1ArgType),
2544   NEONMAP1(vabsd_s64, arm64_neon_abs, Add1ArgType),
2545   NEONMAP1(vaddlv_s32, arm64_neon_saddlv, AddRetType | Add1ArgType),
2546   NEONMAP1(vaddlv_u32, arm64_neon_uaddlv, AddRetType | Add1ArgType),
2547   NEONMAP1(vaddlvq_s32, arm64_neon_saddlv, AddRetType | Add1ArgType),
2548   NEONMAP1(vaddlvq_u32, arm64_neon_uaddlv, AddRetType | Add1ArgType),
2549   NEONMAP1(vaddv_f32, arm64_neon_faddv, AddRetType | Add1ArgType),
2550   NEONMAP1(vaddv_s32, arm64_neon_saddv, AddRetType | Add1ArgType),
2551   NEONMAP1(vaddv_u32, arm64_neon_uaddv, AddRetType | Add1ArgType),
2552   NEONMAP1(vaddvq_f32, arm64_neon_faddv, AddRetType | Add1ArgType),
2553   NEONMAP1(vaddvq_f64, arm64_neon_faddv, AddRetType | Add1ArgType),
2554   NEONMAP1(vaddvq_s32, arm64_neon_saddv, AddRetType | Add1ArgType),
2555   NEONMAP1(vaddvq_s64, arm64_neon_saddv, AddRetType | Add1ArgType),
2556   NEONMAP1(vaddvq_u32, arm64_neon_uaddv, AddRetType | Add1ArgType),
2557   NEONMAP1(vaddvq_u64, arm64_neon_uaddv, AddRetType | Add1ArgType),
2558   NEONMAP1(vcaged_f64, arm64_neon_facge, AddRetType | Add1ArgType),
2559   NEONMAP1(vcages_f32, arm64_neon_facge, AddRetType | Add1ArgType),
2560   NEONMAP1(vcagtd_f64, arm64_neon_facgt, AddRetType | Add1ArgType),
2561   NEONMAP1(vcagts_f32, arm64_neon_facgt, AddRetType | Add1ArgType),
2562   NEONMAP1(vcaled_f64, arm64_neon_facge, AddRetType | Add1ArgType),
2563   NEONMAP1(vcales_f32, arm64_neon_facge, AddRetType | Add1ArgType),
2564   NEONMAP1(vcaltd_f64, arm64_neon_facgt, AddRetType | Add1ArgType),
2565   NEONMAP1(vcalts_f32, arm64_neon_facgt, AddRetType | Add1ArgType),
2566   NEONMAP1(vcvtad_s64_f64, arm64_neon_fcvtas, AddRetType | Add1ArgType),
2567   NEONMAP1(vcvtad_u64_f64, arm64_neon_fcvtau, AddRetType | Add1ArgType),
2568   NEONMAP1(vcvtas_s32_f32, arm64_neon_fcvtas, AddRetType | Add1ArgType),
2569   NEONMAP1(vcvtas_u32_f32, arm64_neon_fcvtau, AddRetType | Add1ArgType),
2570   NEONMAP1(vcvtd_n_f64_s64, arm64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
2571   NEONMAP1(vcvtd_n_f64_u64, arm64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
2572   NEONMAP1(vcvtd_n_s64_f64, arm64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
2573   NEONMAP1(vcvtd_n_u64_f64, arm64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
2574   NEONMAP1(vcvtmd_s64_f64, arm64_neon_fcvtms, AddRetType | Add1ArgType),
2575   NEONMAP1(vcvtmd_u64_f64, arm64_neon_fcvtmu, AddRetType | Add1ArgType),
2576   NEONMAP1(vcvtms_s32_f32, arm64_neon_fcvtms, AddRetType | Add1ArgType),
2577   NEONMAP1(vcvtms_u32_f32, arm64_neon_fcvtmu, AddRetType | Add1ArgType),
2578   NEONMAP1(vcvtnd_s64_f64, arm64_neon_fcvtns, AddRetType | Add1ArgType),
2579   NEONMAP1(vcvtnd_u64_f64, arm64_neon_fcvtnu, AddRetType | Add1ArgType),
2580   NEONMAP1(vcvtns_s32_f32, arm64_neon_fcvtns, AddRetType | Add1ArgType),
2581   NEONMAP1(vcvtns_u32_f32, arm64_neon_fcvtnu, AddRetType | Add1ArgType),
2582   NEONMAP1(vcvtpd_s64_f64, arm64_neon_fcvtps, AddRetType | Add1ArgType),
2583   NEONMAP1(vcvtpd_u64_f64, arm64_neon_fcvtpu, AddRetType | Add1ArgType),
2584   NEONMAP1(vcvtps_s32_f32, arm64_neon_fcvtps, AddRetType | Add1ArgType),
2585   NEONMAP1(vcvtps_u32_f32, arm64_neon_fcvtpu, AddRetType | Add1ArgType),
2586   NEONMAP1(vcvts_n_f32_s32, arm64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
2587   NEONMAP1(vcvts_n_f32_u32, arm64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
2588   NEONMAP1(vcvts_n_s32_f32, arm64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
2589   NEONMAP1(vcvts_n_u32_f32, arm64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
2590   NEONMAP1(vcvtxd_f32_f64, arm64_sisd_fcvtxn, 0),
2591   NEONMAP1(vmaxnmv_f32, arm64_neon_fmaxnmv, AddRetType | Add1ArgType),
2592   NEONMAP1(vmaxnmvq_f32, arm64_neon_fmaxnmv, AddRetType | Add1ArgType),
2593   NEONMAP1(vmaxnmvq_f64, arm64_neon_fmaxnmv, AddRetType | Add1ArgType),
2594   NEONMAP1(vmaxv_f32, arm64_neon_fmaxv, AddRetType | Add1ArgType),
2595   NEONMAP1(vmaxv_s32, arm64_neon_smaxv, AddRetType | Add1ArgType),
2596   NEONMAP1(vmaxv_u32, arm64_neon_umaxv, AddRetType | Add1ArgType),
2597   NEONMAP1(vmaxvq_f32, arm64_neon_fmaxv, AddRetType | Add1ArgType),
2598   NEONMAP1(vmaxvq_f64, arm64_neon_fmaxv, AddRetType | Add1ArgType),
2599   NEONMAP1(vmaxvq_s32, arm64_neon_smaxv, AddRetType | Add1ArgType),
2600   NEONMAP1(vmaxvq_u32, arm64_neon_umaxv, AddRetType | Add1ArgType),
2601   NEONMAP1(vminnmv_f32, arm64_neon_fminnmv, AddRetType | Add1ArgType),
2602   NEONMAP1(vminnmvq_f32, arm64_neon_fminnmv, AddRetType | Add1ArgType),
2603   NEONMAP1(vminnmvq_f64, arm64_neon_fminnmv, AddRetType | Add1ArgType),
2604   NEONMAP1(vminv_f32, arm64_neon_fminv, AddRetType | Add1ArgType),
2605   NEONMAP1(vminv_s32, arm64_neon_sminv, AddRetType | Add1ArgType),
2606   NEONMAP1(vminv_u32, arm64_neon_uminv, AddRetType | Add1ArgType),
2607   NEONMAP1(vminvq_f32, arm64_neon_fminv, AddRetType | Add1ArgType),
2608   NEONMAP1(vminvq_f64, arm64_neon_fminv, AddRetType | Add1ArgType),
2609   NEONMAP1(vminvq_s32, arm64_neon_sminv, AddRetType | Add1ArgType),
2610   NEONMAP1(vminvq_u32, arm64_neon_uminv, AddRetType | Add1ArgType),
2611   NEONMAP1(vmull_p64, arm64_neon_pmull64, 0),
2612   NEONMAP1(vmulxd_f64, arm64_neon_fmulx, Add1ArgType),
2613   NEONMAP1(vmulxs_f32, arm64_neon_fmulx, Add1ArgType),
2614   NEONMAP1(vpaddd_s64, arm64_neon_uaddv, AddRetType | Add1ArgType),
2615   NEONMAP1(vpaddd_u64, arm64_neon_uaddv, AddRetType | Add1ArgType),
2616   NEONMAP1(vpmaxnmqd_f64, arm64_neon_fmaxnmv, AddRetType | Add1ArgType),
2617   NEONMAP1(vpmaxnms_f32, arm64_neon_fmaxnmv, AddRetType | Add1ArgType),
2618   NEONMAP1(vpmaxqd_f64, arm64_neon_fmaxv, AddRetType | Add1ArgType),
2619   NEONMAP1(vpmaxs_f32, arm64_neon_fmaxv, AddRetType | Add1ArgType),
2620   NEONMAP1(vpminnmqd_f64, arm64_neon_fminnmv, AddRetType | Add1ArgType),
2621   NEONMAP1(vpminnms_f32, arm64_neon_fminnmv, AddRetType | Add1ArgType),
2622   NEONMAP1(vpminqd_f64, arm64_neon_fminv, AddRetType | Add1ArgType),
2623   NEONMAP1(vpmins_f32, arm64_neon_fminv, AddRetType | Add1ArgType),
2624   NEONMAP1(vqabsb_s8, arm64_neon_sqabs, Vectorize1ArgType | Use64BitVectors),
2625   NEONMAP1(vqabsd_s64, arm64_neon_sqabs, Add1ArgType),
2626   NEONMAP1(vqabsh_s16, arm64_neon_sqabs, Vectorize1ArgType | Use64BitVectors),
2627   NEONMAP1(vqabss_s32, arm64_neon_sqabs, Add1ArgType),
2628   NEONMAP1(vqaddb_s8, arm64_neon_sqadd, Vectorize1ArgType | Use64BitVectors),
2629   NEONMAP1(vqaddb_u8, arm64_neon_uqadd, Vectorize1ArgType | Use64BitVectors),
2630   NEONMAP1(vqaddd_s64, arm64_neon_sqadd, Add1ArgType),
2631   NEONMAP1(vqaddd_u64, arm64_neon_uqadd, Add1ArgType),
2632   NEONMAP1(vqaddh_s16, arm64_neon_sqadd, Vectorize1ArgType | Use64BitVectors),
2633   NEONMAP1(vqaddh_u16, arm64_neon_uqadd, Vectorize1ArgType | Use64BitVectors),
2634   NEONMAP1(vqadds_s32, arm64_neon_sqadd, Add1ArgType),
2635   NEONMAP1(vqadds_u32, arm64_neon_uqadd, Add1ArgType),
2636   NEONMAP1(vqdmulhh_s16, arm64_neon_sqdmulh, Vectorize1ArgType | Use64BitVectors),
2637   NEONMAP1(vqdmulhs_s32, arm64_neon_sqdmulh, Add1ArgType),
2638   NEONMAP1(vqdmullh_s16, arm64_neon_sqdmull, VectorRet | Use128BitVectors),
2639   NEONMAP1(vqdmulls_s32, arm64_neon_sqdmulls_scalar, 0),
2640   NEONMAP1(vqmovnd_s64, arm64_neon_scalar_sqxtn, AddRetType | Add1ArgType),
2641   NEONMAP1(vqmovnd_u64, arm64_neon_scalar_uqxtn, AddRetType | Add1ArgType),
2642   NEONMAP1(vqmovnh_s16, arm64_neon_sqxtn, VectorRet | Use64BitVectors),
2643   NEONMAP1(vqmovnh_u16, arm64_neon_uqxtn, VectorRet | Use64BitVectors),
2644   NEONMAP1(vqmovns_s32, arm64_neon_sqxtn, VectorRet | Use64BitVectors),
2645   NEONMAP1(vqmovns_u32, arm64_neon_uqxtn, VectorRet | Use64BitVectors),
2646   NEONMAP1(vqmovund_s64, arm64_neon_scalar_sqxtun, AddRetType | Add1ArgType),
2647   NEONMAP1(vqmovunh_s16, arm64_neon_sqxtun, VectorRet | Use64BitVectors),
2648   NEONMAP1(vqmovuns_s32, arm64_neon_sqxtun, VectorRet | Use64BitVectors),
2649   NEONMAP1(vqnegb_s8, arm64_neon_sqneg, Vectorize1ArgType | Use64BitVectors),
2650   NEONMAP1(vqnegd_s64, arm64_neon_sqneg, Add1ArgType),
2651   NEONMAP1(vqnegh_s16, arm64_neon_sqneg, Vectorize1ArgType | Use64BitVectors),
2652   NEONMAP1(vqnegs_s32, arm64_neon_sqneg, Add1ArgType),
2653   NEONMAP1(vqrdmulhh_s16, arm64_neon_sqrdmulh, Vectorize1ArgType | Use64BitVectors),
2654   NEONMAP1(vqrdmulhs_s32, arm64_neon_sqrdmulh, Add1ArgType),
2655   NEONMAP1(vqrshlb_s8, arm64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors),
2656   NEONMAP1(vqrshlb_u8, arm64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors),
2657   NEONMAP1(vqrshld_s64, arm64_neon_sqrshl, Add1ArgType),
2658   NEONMAP1(vqrshld_u64, arm64_neon_uqrshl, Add1ArgType),
2659   NEONMAP1(vqrshlh_s16, arm64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors),
2660   NEONMAP1(vqrshlh_u16, arm64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors),
2661   NEONMAP1(vqrshls_s32, arm64_neon_sqrshl, Add1ArgType),
2662   NEONMAP1(vqrshls_u32, arm64_neon_uqrshl, Add1ArgType),
2663   NEONMAP1(vqrshrnd_n_s64, arm64_neon_sqrshrn, AddRetType),
2664   NEONMAP1(vqrshrnd_n_u64, arm64_neon_uqrshrn, AddRetType),
2665   NEONMAP1(vqrshrnh_n_s16, arm64_neon_sqrshrn, VectorRet | Use64BitVectors),
2666   NEONMAP1(vqrshrnh_n_u16, arm64_neon_uqrshrn, VectorRet | Use64BitVectors),
2667   NEONMAP1(vqrshrns_n_s32, arm64_neon_sqrshrn, VectorRet | Use64BitVectors),
2668   NEONMAP1(vqrshrns_n_u32, arm64_neon_uqrshrn, VectorRet | Use64BitVectors),
2669   NEONMAP1(vqrshrund_n_s64, arm64_neon_sqrshrun, AddRetType),
2670   NEONMAP1(vqrshrunh_n_s16, arm64_neon_sqrshrun, VectorRet | Use64BitVectors),
2671   NEONMAP1(vqrshruns_n_s32, arm64_neon_sqrshrun, VectorRet | Use64BitVectors),
2672   NEONMAP1(vqshlb_n_s8, arm64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
2673   NEONMAP1(vqshlb_n_u8, arm64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
2674   NEONMAP1(vqshlb_s8, arm64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
2675   NEONMAP1(vqshlb_u8, arm64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
2676   NEONMAP1(vqshld_s64, arm64_neon_sqshl, Add1ArgType),
2677   NEONMAP1(vqshld_u64, arm64_neon_uqshl, Add1ArgType),
2678   NEONMAP1(vqshlh_n_s16, arm64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
2679   NEONMAP1(vqshlh_n_u16, arm64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
2680   NEONMAP1(vqshlh_s16, arm64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
2681   NEONMAP1(vqshlh_u16, arm64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
2682   NEONMAP1(vqshls_n_s32, arm64_neon_sqshl, Add1ArgType),
2683   NEONMAP1(vqshls_n_u32, arm64_neon_uqshl, Add1ArgType),
2684   NEONMAP1(vqshls_s32, arm64_neon_sqshl, Add1ArgType),
2685   NEONMAP1(vqshls_u32, arm64_neon_uqshl, Add1ArgType),
2686   NEONMAP1(vqshlub_n_s8, arm64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors),
2687   NEONMAP1(vqshluh_n_s16, arm64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors),
2688   NEONMAP1(vqshlus_n_s32, arm64_neon_sqshlu, Add1ArgType),
2689   NEONMAP1(vqshrnd_n_s64, arm64_neon_sqshrn, AddRetType),
2690   NEONMAP1(vqshrnd_n_u64, arm64_neon_uqshrn, AddRetType),
2691   NEONMAP1(vqshrnh_n_s16, arm64_neon_sqshrn, VectorRet | Use64BitVectors),
2692   NEONMAP1(vqshrnh_n_u16, arm64_neon_uqshrn, VectorRet | Use64BitVectors),
2693   NEONMAP1(vqshrns_n_s32, arm64_neon_sqshrn, VectorRet | Use64BitVectors),
2694   NEONMAP1(vqshrns_n_u32, arm64_neon_uqshrn, VectorRet | Use64BitVectors),
2695   NEONMAP1(vqshrund_n_s64, arm64_neon_sqshrun, AddRetType),
2696   NEONMAP1(vqshrunh_n_s16, arm64_neon_sqshrun, VectorRet | Use64BitVectors),
2697   NEONMAP1(vqshruns_n_s32, arm64_neon_sqshrun, VectorRet | Use64BitVectors),
2698   NEONMAP1(vqsubb_s8, arm64_neon_sqsub, Vectorize1ArgType | Use64BitVectors),
2699   NEONMAP1(vqsubb_u8, arm64_neon_uqsub, Vectorize1ArgType | Use64BitVectors),
2700   NEONMAP1(vqsubd_s64, arm64_neon_sqsub, Add1ArgType),
2701   NEONMAP1(vqsubd_u64, arm64_neon_uqsub, Add1ArgType),
2702   NEONMAP1(vqsubh_s16, arm64_neon_sqsub, Vectorize1ArgType | Use64BitVectors),
2703   NEONMAP1(vqsubh_u16, arm64_neon_uqsub, Vectorize1ArgType | Use64BitVectors),
2704   NEONMAP1(vqsubs_s32, arm64_neon_sqsub, Add1ArgType),
2705   NEONMAP1(vqsubs_u32, arm64_neon_uqsub, Add1ArgType),
2706   NEONMAP1(vrecped_f64, arm64_neon_frecpe, Add1ArgType),
2707   NEONMAP1(vrecpes_f32, arm64_neon_frecpe, Add1ArgType),
2708   NEONMAP1(vrecpxd_f64, arm64_neon_frecpx, Add1ArgType),
2709   NEONMAP1(vrecpxs_f32, arm64_neon_frecpx, Add1ArgType),
2710   NEONMAP1(vrshld_s64, arm64_neon_srshl, Add1ArgType),
2711   NEONMAP1(vrshld_u64, arm64_neon_urshl, Add1ArgType),
2712   NEONMAP1(vrsqrted_f64, arm64_neon_frsqrte, Add1ArgType),
2713   NEONMAP1(vrsqrtes_f32, arm64_neon_frsqrte, Add1ArgType),
2714   NEONMAP1(vrsqrtsd_f64, arm64_neon_frsqrts, Add1ArgType),
2715   NEONMAP1(vrsqrtss_f32, arm64_neon_frsqrts, Add1ArgType),
2716   NEONMAP1(vsha1cq_u32, arm64_crypto_sha1c, 0),
2717   NEONMAP1(vsha1h_u32, arm64_crypto_sha1h, 0),
2718   NEONMAP1(vsha1mq_u32, arm64_crypto_sha1m, 0),
2719   NEONMAP1(vsha1pq_u32, arm64_crypto_sha1p, 0),
2720   NEONMAP1(vshld_s64, arm64_neon_sshl, Add1ArgType),
2721   NEONMAP1(vshld_u64, arm64_neon_ushl, Add1ArgType),
2722   NEONMAP1(vslid_n_s64, arm64_neon_vsli, Vectorize1ArgType),
2723   NEONMAP1(vslid_n_u64, arm64_neon_vsli, Vectorize1ArgType),
2724   NEONMAP1(vsqaddb_u8, arm64_neon_usqadd, Vectorize1ArgType | Use64BitVectors),
2725   NEONMAP1(vsqaddd_u64, arm64_neon_usqadd, Add1ArgType),
2726   NEONMAP1(vsqaddh_u16, arm64_neon_usqadd, Vectorize1ArgType | Use64BitVectors),
2727   NEONMAP1(vsqadds_u32, arm64_neon_usqadd, Add1ArgType),
2728   NEONMAP1(vsrid_n_s64, arm64_neon_vsri, Vectorize1ArgType),
2729   NEONMAP1(vsrid_n_u64, arm64_neon_vsri, Vectorize1ArgType),
2730   NEONMAP1(vuqaddb_s8, arm64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
2731   NEONMAP1(vuqaddd_s64, arm64_neon_suqadd, Add1ArgType),
2732   NEONMAP1(vuqaddh_s16, arm64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
2733   NEONMAP1(vuqadds_s32, arm64_neon_suqadd, Add1ArgType),
2734 };
2735 
2736 #undef NEONMAP0
2737 #undef NEONMAP1
2738 #undef NEONMAP2
2739 
// One "already verified as sorted" flag per intrinsic lookup table. Each starts
// out false.
// NOTE(review): presumably each flag is handed to findNeonIntrinsicInMap as
// its MapProvenSorted argument for the matching table -- confirm at the call
// sites.

// ARM / AArch64 tables.
static bool NEONSIMDIntrinsicsProvenSorted = false;
static bool AArch64SISDIntrinsicInfoProvenSorted = false;

// ARM64 tables.
static bool ARM64SIMDIntrinsicsProvenSorted = false;
static bool ARM64SISDIntrinsicsProvenSorted = false;
2745 
2746 
2747 static const NeonIntrinsicInfo *
2748 findNeonIntrinsicInMap(llvm::ArrayRef<NeonIntrinsicInfo> IntrinsicMap,
2749                        unsigned BuiltinID, bool &MapProvenSorted) {
2750 
2751 #ifndef NDEBUG
2752   if (!MapProvenSorted) {
2753     // FIXME: use std::is_sorted once C++11 is allowed
2754     for (unsigned i = 0; i < IntrinsicMap.size() - 1; ++i)
2755       assert(IntrinsicMap[i].BuiltinID <= IntrinsicMap[i + 1].BuiltinID);
2756     MapProvenSorted = true;
2757   }
2758 #endif
2759 
2760   const NeonIntrinsicInfo *Builtin =
2761       std::lower_bound(IntrinsicMap.begin(), IntrinsicMap.end(), BuiltinID);
2762 
2763   if (Builtin != IntrinsicMap.end() && Builtin->BuiltinID == BuiltinID)
2764     return Builtin;
2765 
2766   return 0;
2767 }
2768 
2769 Function *CodeGenFunction::LookupNeonLLVMIntrinsic(unsigned IntrinsicID,
2770                                                    unsigned Modifier,
2771                                                    llvm::Type *ArgType,
2772                                                    const CallExpr *E) {
2773   int VectorSize = 0;
2774   if (Modifier & Use64BitVectors)
2775     VectorSize = 64;
2776   else if (Modifier & Use128BitVectors)
2777     VectorSize = 128;
2778 
2779   // Return type.
2780   SmallVector<llvm::Type *, 3> Tys;
2781   if (Modifier & AddRetType) {
2782     llvm::Type *Ty = ConvertType(E->getCallReturnType());
2783     if (Modifier & VectorizeRetType)
2784       Ty = llvm::VectorType::get(
2785           Ty, VectorSize ? VectorSize / Ty->getPrimitiveSizeInBits() : 1);
2786 
2787     Tys.push_back(Ty);
2788   }
2789 
2790   // Arguments.
2791   if (Modifier & VectorizeArgTypes) {
2792     int Elts = VectorSize ? VectorSize / ArgType->getPrimitiveSizeInBits() : 1;
2793     ArgType = llvm::VectorType::get(ArgType, Elts);
2794   }
2795 
2796   if (Modifier & (Add1ArgType | Add2ArgTypes))
2797     Tys.push_back(ArgType);
2798 
2799   if (Modifier & Add2ArgTypes)
2800     Tys.push_back(ArgType);
2801 
2802   if (Modifier & InventFloatType)
2803     Tys.push_back(FloatTy);
2804 
2805   return CGM.getIntrinsic(IntrinsicID, Tys);
2806 }
2807 
2808 static Value *EmitCommonNeonSISDBuiltinExpr(CodeGenFunction &CGF,
2809                                             const NeonIntrinsicInfo &SISDInfo,
2810                                             SmallVectorImpl<Value *> &Ops,
2811                                             const CallExpr *E) {
2812   unsigned BuiltinID = SISDInfo.BuiltinID;
2813   unsigned int Int = SISDInfo.LLVMIntrinsic;
2814   unsigned Modifier = SISDInfo.TypeModifier;
2815   const char *s = SISDInfo.NameHint;
2816 
2817   switch (BuiltinID) {
2818   case NEON::BI__builtin_neon_vcled_s64:
2819   case NEON::BI__builtin_neon_vcled_u64:
2820   case NEON::BI__builtin_neon_vcles_f32:
2821   case NEON::BI__builtin_neon_vcled_f64:
2822   case NEON::BI__builtin_neon_vcltd_s64:
2823   case NEON::BI__builtin_neon_vcltd_u64:
2824   case NEON::BI__builtin_neon_vclts_f32:
2825   case NEON::BI__builtin_neon_vcltd_f64:
2826   case NEON::BI__builtin_neon_vcales_f32:
2827   case NEON::BI__builtin_neon_vcaled_f64:
2828   case NEON::BI__builtin_neon_vcalts_f32:
2829   case NEON::BI__builtin_neon_vcaltd_f64:
2830     // Only one direction of comparisons actually exist, cmle is actually a cmge
2831     // with swapped operands. The table gives us the right intrinsic but we
2832     // still need to do the swap.
2833     std::swap(Ops[0], Ops[1]);
2834     break;
2835   }
2836 
2837   assert(Int && "Generic code assumes a valid intrinsic");
2838 
2839   // Determine the type(s) of this overloaded AArch64 intrinsic.
2840   const Expr *Arg = E->getArg(0);
2841   llvm::Type *ArgTy = CGF.ConvertType(Arg->getType());
2842   Function *F = CGF.LookupNeonLLVMIntrinsic(Int, Modifier, ArgTy, E);
2843 
2844   int j = 0;
2845   ConstantInt *C0 = ConstantInt::get(CGF.Int32Ty, 0);
2846   for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
2847        ai != ae; ++ai, ++j) {
2848     llvm::Type *ArgTy = ai->getType();
2849     if (Ops[j]->getType()->getPrimitiveSizeInBits() ==
2850              ArgTy->getPrimitiveSizeInBits())
2851       continue;
2852 
2853     assert(ArgTy->isVectorTy() && !Ops[j]->getType()->isVectorTy());
2854     // The constant argument to an _n_ intrinsic always has Int32Ty, so truncate
2855     // it before inserting.
2856     Ops[j] =
2857         CGF.Builder.CreateTruncOrBitCast(Ops[j], ArgTy->getVectorElementType());
2858     Ops[j] =
2859         CGF.Builder.CreateInsertElement(UndefValue::get(ArgTy), Ops[j], C0);
2860   }
2861 
2862   Value *Result = CGF.EmitNeonCall(F, Ops, s);
2863   llvm::Type *ResultType = CGF.ConvertType(E->getType());
2864   if (ResultType->getPrimitiveSizeInBits() <
2865       Result->getType()->getPrimitiveSizeInBits())
2866     return CGF.Builder.CreateExtractElement(Result, C0);
2867 
2868   return CGF.Builder.CreateBitCast(Result, ResultType, s);
2869 }
2870 
2871 static Value *EmitAArch64ScalarBuiltinExpr(CodeGenFunction &CGF,
2872                                            const NeonIntrinsicInfo &SISDInfo,
2873                                            const CallExpr *E) {
2874   unsigned BuiltinID = SISDInfo.BuiltinID;
2875   unsigned int Int = SISDInfo.LLVMIntrinsic;
2876   const char *s = SISDInfo.NameHint;
2877 
2878   SmallVector<Value *, 4> Ops;
2879   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
2880     Ops.push_back(CGF.EmitScalarExpr(E->getArg(i)));
2881   }
2882 
2883   // AArch64 scalar builtins are not overloaded, they do not have an extra
2884   // argument that specifies the vector type, need to handle each case.
2885   switch (BuiltinID) {
2886   default: break;
2887   case NEON::BI__builtin_neon_vdups_lane_f32:
2888   case NEON::BI__builtin_neon_vdupd_lane_f64:
2889   case NEON::BI__builtin_neon_vdups_laneq_f32:
2890   case NEON::BI__builtin_neon_vdupd_laneq_f64: {
2891     return CGF.Builder.CreateExtractElement(Ops[0], Ops[1], "vdup_lane");
2892   }
2893   case NEON::BI__builtin_neon_vdupb_lane_i8:
2894   case NEON::BI__builtin_neon_vduph_lane_i16:
2895   case NEON::BI__builtin_neon_vdups_lane_i32:
2896   case NEON::BI__builtin_neon_vdupd_lane_i64:
2897   case NEON::BI__builtin_neon_vdupb_laneq_i8:
2898   case NEON::BI__builtin_neon_vduph_laneq_i16:
2899   case NEON::BI__builtin_neon_vdups_laneq_i32:
2900   case NEON::BI__builtin_neon_vdupd_laneq_i64: {
2901     // The backend treats Neon scalar types as v1ix types
2902     // So we want to dup lane from any vector to v1ix vector
2903     // with shufflevector
2904     s = "vdup_lane";
2905     Value* SV = llvm::ConstantVector::getSplat(1, cast<ConstantInt>(Ops[1]));
2906     Value *Result = CGF.Builder.CreateShuffleVector(Ops[0], Ops[0], SV, s);
2907     llvm::Type *Ty = CGF.ConvertType(E->getCallReturnType());
2908     // AArch64 intrinsic one-element vector type cast to
2909     // scalar type expected by the builtin
2910     return CGF.Builder.CreateBitCast(Result, Ty, s);
2911   }
2912   case NEON::BI__builtin_neon_vqdmlalh_lane_s16 :
2913   case NEON::BI__builtin_neon_vqdmlalh_laneq_s16 :
2914   case NEON::BI__builtin_neon_vqdmlals_lane_s32 :
2915   case NEON::BI__builtin_neon_vqdmlals_laneq_s32 :
2916   case NEON::BI__builtin_neon_vqdmlslh_lane_s16 :
2917   case NEON::BI__builtin_neon_vqdmlslh_laneq_s16 :
2918   case NEON::BI__builtin_neon_vqdmlsls_lane_s32 :
2919   case NEON::BI__builtin_neon_vqdmlsls_laneq_s32 : {
2920     Int = Intrinsic::arm_neon_vqadds;
2921     if (BuiltinID == NEON::BI__builtin_neon_vqdmlslh_lane_s16 ||
2922         BuiltinID == NEON::BI__builtin_neon_vqdmlslh_laneq_s16 ||
2923         BuiltinID == NEON::BI__builtin_neon_vqdmlsls_lane_s32 ||
2924         BuiltinID == NEON::BI__builtin_neon_vqdmlsls_laneq_s32) {
2925       Int = Intrinsic::arm_neon_vqsubs;
2926     }
2927     // create vqdmull call with b * c[i]
2928     llvm::Type *Ty = CGF.ConvertType(E->getArg(1)->getType());
2929     llvm::VectorType *OpVTy = llvm::VectorType::get(Ty, 1);
2930     Ty = CGF.ConvertType(E->getArg(0)->getType());
2931     llvm::VectorType *ResVTy = llvm::VectorType::get(Ty, 1);
2932     Value *F = CGF.CGM.getIntrinsic(Intrinsic::arm_neon_vqdmull, ResVTy);
2933     Value *V = UndefValue::get(OpVTy);
2934     llvm::Constant *CI = ConstantInt::get(CGF.Int32Ty, 0);
2935     SmallVector<Value *, 2> MulOps;
2936     MulOps.push_back(Ops[1]);
2937     MulOps.push_back(Ops[2]);
2938     MulOps[0] = CGF.Builder.CreateInsertElement(V, MulOps[0], CI);
2939     MulOps[1] = CGF.Builder.CreateExtractElement(MulOps[1], Ops[3], "extract");
2940     MulOps[1] = CGF.Builder.CreateInsertElement(V, MulOps[1], CI);
2941     Value *MulRes = CGF.Builder.CreateCall2(F, MulOps[0], MulOps[1]);
2942     // create vqadds call with a +/- vqdmull result
2943     F = CGF.CGM.getIntrinsic(Int, ResVTy);
2944     SmallVector<Value *, 2> AddOps;
2945     AddOps.push_back(Ops[0]);
2946     AddOps.push_back(MulRes);
2947     V = UndefValue::get(ResVTy);
2948     AddOps[0] = CGF.Builder.CreateInsertElement(V, AddOps[0], CI);
2949     Value *AddRes = CGF.Builder.CreateCall2(F, AddOps[0], AddOps[1]);
2950     return CGF.Builder.CreateBitCast(AddRes, Ty);
2951   }
2952   case NEON::BI__builtin_neon_vfmas_lane_f32:
2953   case NEON::BI__builtin_neon_vfmas_laneq_f32:
2954   case NEON::BI__builtin_neon_vfmad_lane_f64:
2955   case NEON::BI__builtin_neon_vfmad_laneq_f64: {
2956     llvm::Type *Ty = CGF.ConvertType(E->getCallReturnType());
2957     Value *F = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty);
2958     Ops[2] = CGF.Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
2959     return CGF.Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]);
2960   }
2961   // Scalar Floating-point Multiply Extended
2962   case NEON::BI__builtin_neon_vmulxs_f32:
2963   case NEON::BI__builtin_neon_vmulxd_f64: {
2964     Int = Intrinsic::aarch64_neon_vmulx;
2965     llvm::Type *Ty = CGF.ConvertType(E->getCallReturnType());
2966     return CGF.EmitNeonCall(CGF.CGM.getIntrinsic(Int, Ty), Ops, "vmulx");
2967   }
2968   case NEON::BI__builtin_neon_vmul_n_f64: {
2969     // v1f64 vmul_n_f64  should be mapped to Neon scalar mul lane
2970     llvm::Type *VTy = GetNeonType(&CGF,
2971       NeonTypeFlags(NeonTypeFlags::Float64, false, false));
2972     Ops[0] = CGF.Builder.CreateBitCast(Ops[0], VTy);
2973     llvm::Value *Idx = llvm::ConstantInt::get(CGF.Int32Ty, 0);
2974     Ops[0] = CGF.Builder.CreateExtractElement(Ops[0], Idx, "extract");
2975     Value *Result = CGF.Builder.CreateFMul(Ops[0], Ops[1]);
2976     return CGF.Builder.CreateBitCast(Result, VTy);
2977   }
2978   case NEON::BI__builtin_neon_vget_lane_i8:
2979   case NEON::BI__builtin_neon_vget_lane_i16:
2980   case NEON::BI__builtin_neon_vget_lane_i32:
2981   case NEON::BI__builtin_neon_vget_lane_i64:
2982   case NEON::BI__builtin_neon_vget_lane_f32:
2983   case NEON::BI__builtin_neon_vget_lane_f64:
2984   case NEON::BI__builtin_neon_vgetq_lane_i8:
2985   case NEON::BI__builtin_neon_vgetq_lane_i16:
2986   case NEON::BI__builtin_neon_vgetq_lane_i32:
2987   case NEON::BI__builtin_neon_vgetq_lane_i64:
2988   case NEON::BI__builtin_neon_vgetq_lane_f32:
2989   case NEON::BI__builtin_neon_vgetq_lane_f64:
2990     return CGF.EmitARMBuiltinExpr(NEON::BI__builtin_neon_vget_lane_i8, E);
2991   case NEON::BI__builtin_neon_vset_lane_i8:
2992   case NEON::BI__builtin_neon_vset_lane_i16:
2993   case NEON::BI__builtin_neon_vset_lane_i32:
2994   case NEON::BI__builtin_neon_vset_lane_i64:
2995   case NEON::BI__builtin_neon_vset_lane_f32:
2996   case NEON::BI__builtin_neon_vset_lane_f64:
2997   case NEON::BI__builtin_neon_vsetq_lane_i8:
2998   case NEON::BI__builtin_neon_vsetq_lane_i16:
2999   case NEON::BI__builtin_neon_vsetq_lane_i32:
3000   case NEON::BI__builtin_neon_vsetq_lane_i64:
3001   case NEON::BI__builtin_neon_vsetq_lane_f32:
3002   case NEON::BI__builtin_neon_vsetq_lane_f64:
3003     return CGF.EmitARMBuiltinExpr(NEON::BI__builtin_neon_vset_lane_i8, E);
3004 
3005   case NEON::BI__builtin_neon_vceqzd_s64:
3006   case NEON::BI__builtin_neon_vceqzd_u64:
3007   case NEON::BI__builtin_neon_vcgezd_s64:
3008   case NEON::BI__builtin_neon_vcgtzd_s64:
3009   case NEON::BI__builtin_neon_vclezd_s64:
3010   case NEON::BI__builtin_neon_vcltzd_s64:
3011     // Add implicit zero operand.
3012     Ops.push_back(llvm::Constant::getNullValue(Ops[0]->getType()));
3013     break;
3014   case NEON::BI__builtin_neon_vceqzs_f32:
3015   case NEON::BI__builtin_neon_vceqzd_f64:
3016   case NEON::BI__builtin_neon_vcgezs_f32:
3017   case NEON::BI__builtin_neon_vcgezd_f64:
3018   case NEON::BI__builtin_neon_vcgtzs_f32:
3019   case NEON::BI__builtin_neon_vcgtzd_f64:
3020   case NEON::BI__builtin_neon_vclezs_f32:
3021   case NEON::BI__builtin_neon_vclezd_f64:
3022   case NEON::BI__builtin_neon_vcltzs_f32:
3023   case NEON::BI__builtin_neon_vcltzd_f64:
3024     // Add implicit zero operand.
3025     Ops.push_back(llvm::Constant::getNullValue(CGF.FloatTy));
3026     break;
3027   }
3028 
3029   // It didn't need any handling specific to the AArch64 backend, so defer to
3030   // common code.
3031   return EmitCommonNeonSISDBuiltinExpr(CGF, SISDInfo, Ops, E);
3032 }
3033 
3034 Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
3035     unsigned BuiltinID, unsigned LLVMIntrinsic, unsigned AltLLVMIntrinsic,
3036     const char *NameHint, unsigned Modifier, const CallExpr *E,
3037     SmallVectorImpl<llvm::Value *> &Ops, llvm::Value *Align) {
3038   // Get the last argument, which specifies the vector type.
3039   llvm::APSInt NeonTypeConst;
3040   const Expr *Arg = E->getArg(E->getNumArgs() - 1);
3041   if (!Arg->isIntegerConstantExpr(NeonTypeConst, getContext()))
3042     return 0;
3043 
3044   // Determine the type of this overloaded NEON intrinsic.
3045   NeonTypeFlags Type(NeonTypeConst.getZExtValue());
3046   bool Usgn = Type.isUnsigned();
3047   bool Quad = Type.isQuad();
3048 
3049   llvm::VectorType *VTy = GetNeonType(this, Type);
3050   llvm::Type *Ty = VTy;
3051   if (!Ty)
3052     return 0;
3053 
3054   unsigned Int = LLVMIntrinsic;
3055   if ((Modifier & UnsignedAlts) && !Usgn)
3056     Int = AltLLVMIntrinsic;
3057 
3058   switch (BuiltinID) {
3059   default: break;
3060   case NEON::BI__builtin_neon_vabs_v:
3061   case NEON::BI__builtin_neon_vabsq_v:
3062     if (VTy->getElementType()->isFloatingPointTy())
3063       return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, Ty), Ops, "vabs");
3064     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), Ops, "vabs");
3065   case NEON::BI__builtin_neon_vaddhn_v: {
3066     llvm::VectorType *SrcTy =
3067         llvm::VectorType::getExtendedElementVectorType(VTy);
3068 
3069     // %sum = add <4 x i32> %lhs, %rhs
3070     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
3071     Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy);
3072     Ops[0] = Builder.CreateAdd(Ops[0], Ops[1], "vaddhn");
3073 
3074     // %high = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
3075     Constant *ShiftAmt = ConstantInt::get(SrcTy->getElementType(),
3076                                        SrcTy->getScalarSizeInBits() / 2);
3077     ShiftAmt = ConstantVector::getSplat(VTy->getNumElements(), ShiftAmt);
3078     Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vaddhn");
3079 
3080     // %res = trunc <4 x i32> %high to <4 x i16>
3081     return Builder.CreateTrunc(Ops[0], VTy, "vaddhn");
3082   }
3083   case NEON::BI__builtin_neon_vcale_v:
3084   case NEON::BI__builtin_neon_vcaleq_v:
3085   case NEON::BI__builtin_neon_vcalt_v:
3086   case NEON::BI__builtin_neon_vcaltq_v:
3087     std::swap(Ops[0], Ops[1]);
3088   case NEON::BI__builtin_neon_vcage_v:
3089   case NEON::BI__builtin_neon_vcageq_v:
3090   case NEON::BI__builtin_neon_vcagt_v:
3091   case NEON::BI__builtin_neon_vcagtq_v: {
3092     llvm::Type *VecFlt = llvm::VectorType::get(
3093         VTy->getScalarSizeInBits() == 32 ? FloatTy : DoubleTy,
3094         VTy->getNumElements());
3095     llvm::Type *Tys[] = { VTy, VecFlt };
3096     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
3097     return EmitNeonCall(F, Ops, NameHint);
3098   }
3099   case NEON::BI__builtin_neon_vclz_v:
3100   case NEON::BI__builtin_neon_vclzq_v:
3101     // We generate target-independent intrinsic, which needs a second argument
3102     // for whether or not clz of zero is undefined; on ARM it isn't.
3103     Ops.push_back(Builder.getInt1(getTarget().isCLZForZeroUndef()));
3104     break;
3105   case NEON::BI__builtin_neon_vcvt_f32_v:
3106   case NEON::BI__builtin_neon_vcvtq_f32_v:
3107     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
3108     Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, Quad));
3109     return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
3110                 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
3111   case NEON::BI__builtin_neon_vcvt_n_f32_v:
3112   case NEON::BI__builtin_neon_vcvt_n_f64_v:
3113   case NEON::BI__builtin_neon_vcvtq_n_f32_v:
3114   case NEON::BI__builtin_neon_vcvtq_n_f64_v: {
3115     bool Double =
3116       (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64);
3117     llvm::Type *FloatTy =
3118         GetNeonType(this, NeonTypeFlags(Double ? NeonTypeFlags::Float64
3119                                                : NeonTypeFlags::Float32,
3120                                         false, Quad));
3121     llvm::Type *Tys[2] = { FloatTy, Ty };
3122     Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
3123     Function *F = CGM.getIntrinsic(Int, Tys);
3124     return EmitNeonCall(F, Ops, "vcvt_n");
3125   }
3126   case NEON::BI__builtin_neon_vcvt_n_s32_v:
3127   case NEON::BI__builtin_neon_vcvt_n_u32_v:
3128   case NEON::BI__builtin_neon_vcvt_n_s64_v:
3129   case NEON::BI__builtin_neon_vcvt_n_u64_v:
3130   case NEON::BI__builtin_neon_vcvtq_n_s32_v:
3131   case NEON::BI__builtin_neon_vcvtq_n_u32_v:
3132   case NEON::BI__builtin_neon_vcvtq_n_s64_v:
3133   case NEON::BI__builtin_neon_vcvtq_n_u64_v: {
3134     bool Double =
3135       (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64);
3136     llvm::Type *FloatTy =
3137         GetNeonType(this, NeonTypeFlags(Double ? NeonTypeFlags::Float64
3138                                                : NeonTypeFlags::Float32,
3139                                         false, Quad));
3140     llvm::Type *Tys[2] = { Ty, FloatTy };
3141     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
3142     return EmitNeonCall(F, Ops, "vcvt_n");
3143   }
3144   case NEON::BI__builtin_neon_vcvt_s32_v:
3145   case NEON::BI__builtin_neon_vcvt_u32_v:
3146   case NEON::BI__builtin_neon_vcvt_s64_v:
3147   case NEON::BI__builtin_neon_vcvt_u64_v:
3148   case NEON::BI__builtin_neon_vcvtq_s32_v:
3149   case NEON::BI__builtin_neon_vcvtq_u32_v:
3150   case NEON::BI__builtin_neon_vcvtq_s64_v:
3151   case NEON::BI__builtin_neon_vcvtq_u64_v: {
3152     bool Double =
3153       (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64);
3154     llvm::Type *FloatTy =
3155         GetNeonType(this, NeonTypeFlags(Double ? NeonTypeFlags::Float64
3156                                                : NeonTypeFlags::Float32,
3157                                         false, Quad));
3158     Ops[0] = Builder.CreateBitCast(Ops[0], FloatTy);
3159     return Usgn ? Builder.CreateFPToUI(Ops[0], Ty, "vcvt")
3160                 : Builder.CreateFPToSI(Ops[0], Ty, "vcvt");
3161   }
3162   case NEON::BI__builtin_neon_vcvta_s32_v:
3163   case NEON::BI__builtin_neon_vcvta_s64_v:
3164   case NEON::BI__builtin_neon_vcvta_u32_v:
3165   case NEON::BI__builtin_neon_vcvta_u64_v:
3166   case NEON::BI__builtin_neon_vcvtaq_s32_v:
3167   case NEON::BI__builtin_neon_vcvtaq_s64_v:
3168   case NEON::BI__builtin_neon_vcvtaq_u32_v:
3169   case NEON::BI__builtin_neon_vcvtaq_u64_v:
3170   case NEON::BI__builtin_neon_vcvtn_s32_v:
3171   case NEON::BI__builtin_neon_vcvtn_s64_v:
3172   case NEON::BI__builtin_neon_vcvtn_u32_v:
3173   case NEON::BI__builtin_neon_vcvtn_u64_v:
3174   case NEON::BI__builtin_neon_vcvtnq_s32_v:
3175   case NEON::BI__builtin_neon_vcvtnq_s64_v:
3176   case NEON::BI__builtin_neon_vcvtnq_u32_v:
3177   case NEON::BI__builtin_neon_vcvtnq_u64_v:
3178   case NEON::BI__builtin_neon_vcvtp_s32_v:
3179   case NEON::BI__builtin_neon_vcvtp_s64_v:
3180   case NEON::BI__builtin_neon_vcvtp_u32_v:
3181   case NEON::BI__builtin_neon_vcvtp_u64_v:
3182   case NEON::BI__builtin_neon_vcvtpq_s32_v:
3183   case NEON::BI__builtin_neon_vcvtpq_s64_v:
3184   case NEON::BI__builtin_neon_vcvtpq_u32_v:
3185   case NEON::BI__builtin_neon_vcvtpq_u64_v:
3186   case NEON::BI__builtin_neon_vcvtm_s32_v:
3187   case NEON::BI__builtin_neon_vcvtm_s64_v:
3188   case NEON::BI__builtin_neon_vcvtm_u32_v:
3189   case NEON::BI__builtin_neon_vcvtm_u64_v:
3190   case NEON::BI__builtin_neon_vcvtmq_s32_v:
3191   case NEON::BI__builtin_neon_vcvtmq_s64_v:
3192   case NEON::BI__builtin_neon_vcvtmq_u32_v:
3193   case NEON::BI__builtin_neon_vcvtmq_u64_v: {
3194     bool Double =
3195       (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64);
3196     llvm::Type *InTy =
3197       GetNeonType(this,
3198                   NeonTypeFlags(Double ? NeonTypeFlags::Float64
3199                                 : NeonTypeFlags::Float32, false, Quad));
3200     llvm::Type *Tys[2] = { Ty, InTy };
3201     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, NameHint);
3202   }
3203   case NEON::BI__builtin_neon_vext_v:
3204   case NEON::BI__builtin_neon_vextq_v: {
3205     int CV = cast<ConstantInt>(Ops[2])->getSExtValue();
3206     SmallVector<Constant*, 16> Indices;
3207     for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
3208       Indices.push_back(ConstantInt::get(Int32Ty, i+CV));
3209 
3210     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
3211     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
3212     Value *SV = llvm::ConstantVector::get(Indices);
3213     return Builder.CreateShuffleVector(Ops[0], Ops[1], SV, "vext");
3214   }
3215   case NEON::BI__builtin_neon_vfma_v:
3216   case NEON::BI__builtin_neon_vfmaq_v: {
3217     Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
3218     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
3219     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
3220     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
3221 
3222     // NEON intrinsic puts accumulator first, unlike the LLVM fma.
3223     return Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]);
3224   }
3225   case NEON::BI__builtin_neon_vld1_v:
3226   case NEON::BI__builtin_neon_vld1q_v:
3227     Ops.push_back(Align);
3228     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), Ops, "vld1");
3229   case NEON::BI__builtin_neon_vld2_v:
3230   case NEON::BI__builtin_neon_vld2q_v:
3231   case NEON::BI__builtin_neon_vld3_v:
3232   case NEON::BI__builtin_neon_vld3q_v:
3233   case NEON::BI__builtin_neon_vld4_v:
3234   case NEON::BI__builtin_neon_vld4q_v: {
3235     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Ty);
3236     Ops[1] = Builder.CreateCall2(F, Ops[1], Align, NameHint);
3237     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
3238     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
3239     return Builder.CreateStore(Ops[1], Ops[0]);
3240   }
3241   case NEON::BI__builtin_neon_vld1_dup_v:
3242   case NEON::BI__builtin_neon_vld1q_dup_v: {
3243     Value *V = UndefValue::get(Ty);
3244     Ty = llvm::PointerType::getUnqual(VTy->getElementType());
3245     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
3246     LoadInst *Ld = Builder.CreateLoad(Ops[0]);
3247     Ld->setAlignment(cast<ConstantInt>(Align)->getZExtValue());
3248     llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
3249     Ops[0] = Builder.CreateInsertElement(V, Ld, CI);
3250     return EmitNeonSplat(Ops[0], CI);
3251   }
3252   case NEON::BI__builtin_neon_vld2_lane_v:
3253   case NEON::BI__builtin_neon_vld2q_lane_v:
3254   case NEON::BI__builtin_neon_vld3_lane_v:
3255   case NEON::BI__builtin_neon_vld3q_lane_v:
3256   case NEON::BI__builtin_neon_vld4_lane_v:
3257   case NEON::BI__builtin_neon_vld4q_lane_v: {
3258     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Ty);
3259     for (unsigned I = 2; I < Ops.size() - 1; ++I)
3260       Ops[I] = Builder.CreateBitCast(Ops[I], Ty);
3261     Ops.push_back(Align);
3262     Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), NameHint);
3263     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
3264     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
3265     return Builder.CreateStore(Ops[1], Ops[0]);
3266   }
3267   case NEON::BI__builtin_neon_vmovl_v: {
3268     llvm::Type *DTy =llvm::VectorType::getTruncatedElementVectorType(VTy);
3269     Ops[0] = Builder.CreateBitCast(Ops[0], DTy);
3270     if (Usgn)
3271       return Builder.CreateZExt(Ops[0], Ty, "vmovl");
3272     return Builder.CreateSExt(Ops[0], Ty, "vmovl");
3273   }
3274   case NEON::BI__builtin_neon_vmovn_v: {
3275     llvm::Type *QTy = llvm::VectorType::getExtendedElementVectorType(VTy);
3276     Ops[0] = Builder.CreateBitCast(Ops[0], QTy);
3277     return Builder.CreateTrunc(Ops[0], Ty, "vmovn");
3278   }
3279   case NEON::BI__builtin_neon_vmull_v:
3280     // FIXME: the integer vmull operations could be emitted in terms of pure
3281     // LLVM IR (2 exts followed by a mul). Unfortunately LLVM has a habit of
3282     // hoisting the exts outside loops. Until global ISel comes along that can
3283     // see through such movement this leads to bad CodeGen. So we need an
3284     // intrinsic for now.
3285     Int = Usgn ? Intrinsic::arm_neon_vmullu : Intrinsic::arm_neon_vmulls;
3286     Int = Type.isPoly() ? (unsigned)Intrinsic::arm_neon_vmullp : Int;
3287     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull");
3288   case NEON::BI__builtin_neon_vpadal_v:
3289   case NEON::BI__builtin_neon_vpadalq_v: {
3290     // The source operand type has twice as many elements of half the size.
3291     unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
3292     llvm::Type *EltTy =
3293       llvm::IntegerType::get(getLLVMContext(), EltBits / 2);
3294     llvm::Type *NarrowTy =
3295       llvm::VectorType::get(EltTy, VTy->getNumElements() * 2);
3296     llvm::Type *Tys[2] = { Ty, NarrowTy };
3297     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
3298   }
3299   case NEON::BI__builtin_neon_vpaddl_v:
3300   case NEON::BI__builtin_neon_vpaddlq_v: {
3301     // The source operand type has twice as many elements of half the size.
3302     unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
3303     llvm::Type *EltTy = llvm::IntegerType::get(getLLVMContext(), EltBits / 2);
3304     llvm::Type *NarrowTy =
3305       llvm::VectorType::get(EltTy, VTy->getNumElements() * 2);
3306     llvm::Type *Tys[2] = { Ty, NarrowTy };
3307     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vpaddl");
3308   }
3309   case NEON::BI__builtin_neon_vqdmlal_v:
3310   case NEON::BI__builtin_neon_vqdmlsl_v: {
3311     SmallVector<Value *, 2> MulOps(Ops.begin() + 1, Ops.end());
3312     Value *Mul = EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty),
3313                               MulOps, "vqdmlal");
3314 
3315     SmallVector<Value *, 2> AccumOps;
3316     AccumOps.push_back(Ops[0]);
3317     AccumOps.push_back(Mul);
3318     return EmitNeonCall(CGM.getIntrinsic(AltLLVMIntrinsic, Ty),
3319                         AccumOps, NameHint);
3320   }
3321   case NEON::BI__builtin_neon_vqshl_n_v:
3322   case NEON::BI__builtin_neon_vqshlq_n_v:
3323     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshl_n",
3324                         1, false);
3325   case NEON::BI__builtin_neon_vrecpe_v:
3326   case NEON::BI__builtin_neon_vrecpeq_v:
3327   case NEON::BI__builtin_neon_vrsqrte_v:
3328   case NEON::BI__builtin_neon_vrsqrteq_v:
3329     Int = Ty->isFPOrFPVectorTy() ? LLVMIntrinsic : AltLLVMIntrinsic;
3330     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint);
3331 
3332   case NEON::BI__builtin_neon_vshl_n_v:
3333   case NEON::BI__builtin_neon_vshlq_n_v:
3334     Ops[1] = EmitNeonShiftVector(Ops[1], Ty, false);
3335     return Builder.CreateShl(Builder.CreateBitCast(Ops[0],Ty), Ops[1],
3336                              "vshl_n");
3337   case NEON::BI__builtin_neon_vshll_n_v: {
3338     llvm::Type *SrcTy = llvm::VectorType::getTruncatedElementVectorType(VTy);
3339     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
3340     if (Usgn)
3341       Ops[0] = Builder.CreateZExt(Ops[0], VTy);
3342     else
3343       Ops[0] = Builder.CreateSExt(Ops[0], VTy);
3344     Ops[1] = EmitNeonShiftVector(Ops[1], VTy, false);
3345     return Builder.CreateShl(Ops[0], Ops[1], "vshll_n");
3346   }
3347   case NEON::BI__builtin_neon_vshrn_n_v: {
3348     llvm::Type *SrcTy = llvm::VectorType::getExtendedElementVectorType(VTy);
3349     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
3350     Ops[1] = EmitNeonShiftVector(Ops[1], SrcTy, false);
3351     if (Usgn)
3352       Ops[0] = Builder.CreateLShr(Ops[0], Ops[1]);
3353     else
3354       Ops[0] = Builder.CreateAShr(Ops[0], Ops[1]);
3355     return Builder.CreateTrunc(Ops[0], Ty, "vshrn_n");
3356   }
3357   case NEON::BI__builtin_neon_vshr_n_v:
3358   case NEON::BI__builtin_neon_vshrq_n_v:
3359     return EmitNeonRShiftImm(Ops[0], Ops[1], Ty, Usgn, "vshr_n");
3360   case NEON::BI__builtin_neon_vst1_v:
3361   case NEON::BI__builtin_neon_vst1q_v:
3362   case NEON::BI__builtin_neon_vst2_v:
3363   case NEON::BI__builtin_neon_vst2q_v:
3364   case NEON::BI__builtin_neon_vst3_v:
3365   case NEON::BI__builtin_neon_vst3q_v:
3366   case NEON::BI__builtin_neon_vst4_v:
3367   case NEON::BI__builtin_neon_vst4q_v:
3368   case NEON::BI__builtin_neon_vst2_lane_v:
3369   case NEON::BI__builtin_neon_vst2q_lane_v:
3370   case NEON::BI__builtin_neon_vst3_lane_v:
3371   case NEON::BI__builtin_neon_vst3q_lane_v:
3372   case NEON::BI__builtin_neon_vst4_lane_v:
3373   case NEON::BI__builtin_neon_vst4q_lane_v:
3374     Ops.push_back(Align);
3375     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "");
3376   case NEON::BI__builtin_neon_vsubhn_v: {
3377     llvm::VectorType *SrcTy =
3378         llvm::VectorType::getExtendedElementVectorType(VTy);
3379 
3380     // %sum = add <4 x i32> %lhs, %rhs
3381     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
3382     Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy);
3383     Ops[0] = Builder.CreateSub(Ops[0], Ops[1], "vsubhn");
3384 
3385     // %high = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
3386     Constant *ShiftAmt = ConstantInt::get(SrcTy->getElementType(),
3387                                        SrcTy->getScalarSizeInBits() / 2);
3388     ShiftAmt = ConstantVector::getSplat(VTy->getNumElements(), ShiftAmt);
3389     Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vsubhn");
3390 
3391     // %res = trunc <4 x i32> %high to <4 x i16>
3392     return Builder.CreateTrunc(Ops[0], VTy, "vsubhn");
3393   }
3394   case NEON::BI__builtin_neon_vtrn_v:
3395   case NEON::BI__builtin_neon_vtrnq_v: {
3396     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
3397     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
3398     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
3399     Value *SV = 0;
3400 
3401     for (unsigned vi = 0; vi != 2; ++vi) {
3402       SmallVector<Constant*, 16> Indices;
3403       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
3404         Indices.push_back(Builder.getInt32(i+vi));
3405         Indices.push_back(Builder.getInt32(i+e+vi));
3406       }
3407       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ops[0], vi);
3408       SV = llvm::ConstantVector::get(Indices);
3409       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vtrn");
3410       SV = Builder.CreateStore(SV, Addr);
3411     }
3412     return SV;
3413   }
3414   case NEON::BI__builtin_neon_vtst_v:
3415   case NEON::BI__builtin_neon_vtstq_v: {
3416     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
3417     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
3418     Ops[0] = Builder.CreateAnd(Ops[0], Ops[1]);
3419     Ops[0] = Builder.CreateICmp(ICmpInst::ICMP_NE, Ops[0],
3420                                 ConstantAggregateZero::get(Ty));
3421     return Builder.CreateSExt(Ops[0], Ty, "vtst");
3422   }
3423   case NEON::BI__builtin_neon_vuzp_v:
3424   case NEON::BI__builtin_neon_vuzpq_v: {
3425     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
3426     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
3427     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
3428     Value *SV = 0;
3429 
3430     for (unsigned vi = 0; vi != 2; ++vi) {
3431       SmallVector<Constant*, 16> Indices;
3432       for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
3433         Indices.push_back(ConstantInt::get(Int32Ty, 2*i+vi));
3434 
3435       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ops[0], vi);
3436       SV = llvm::ConstantVector::get(Indices);
3437       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vuzp");
3438       SV = Builder.CreateStore(SV, Addr);
3439     }
3440     return SV;
3441   }
3442   case NEON::BI__builtin_neon_vzip_v:
3443   case NEON::BI__builtin_neon_vzipq_v: {
3444     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
3445     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
3446     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
3447     Value *SV = 0;
3448 
3449     for (unsigned vi = 0; vi != 2; ++vi) {
3450       SmallVector<Constant*, 16> Indices;
3451       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
3452         Indices.push_back(ConstantInt::get(Int32Ty, (i + vi*e) >> 1));
3453         Indices.push_back(ConstantInt::get(Int32Ty, ((i + vi*e) >> 1)+e));
3454       }
3455       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ops[0], vi);
3456       SV = llvm::ConstantVector::get(Indices);
3457       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vzip");
3458       SV = Builder.CreateStore(SV, Addr);
3459     }
3460     return SV;
3461   }
3462   }
3463 
3464   assert(Int && "Expected valid intrinsic number");
3465 
3466   // Determine the type(s) of this overloaded AArch64 intrinsic.
3467   Function *F = LookupNeonLLVMIntrinsic(Int, Modifier, Ty, E);
3468 
3469   Value *Result = EmitNeonCall(F, Ops, NameHint);
3470   llvm::Type *ResultType = ConvertType(E->getType());
3471   // AArch64 intrinsic one-element vector type cast to
3472   // scalar type expected by the builtin
3473   return Builder.CreateBitCast(Result, ResultType, NameHint);
3474 }
3475 
3476 Value *CodeGenFunction::EmitAArch64CompareBuiltinExpr(
3477     Value *Op, llvm::Type *Ty, const CmpInst::Predicate Fp,
3478     const CmpInst::Predicate Ip, const Twine &Name) {
3479   llvm::Type *OTy = Op->getType();
3480 
3481   // FIXME: this is utterly horrific. We should not be looking at previous
3482   // codegen context to find out what needs doing. Unfortunately TableGen
3483   // currently gives us exactly the same calls for vceqz_f32 and vceqz_s32
3484   // (etc).
3485   if (BitCastInst *BI = dyn_cast<BitCastInst>(Op))
3486     OTy = BI->getOperand(0)->getType();
3487 
3488   Op = Builder.CreateBitCast(Op, OTy);
3489   if (OTy->getScalarType()->isFloatingPointTy()) {
3490     Op = Builder.CreateFCmp(Fp, Op, Constant::getNullValue(OTy));
3491   } else {
3492     Op = Builder.CreateICmp(Ip, Op, Constant::getNullValue(OTy));
3493   }
3494   return Builder.CreateSExt(Op, Ty, Name);
3495 }
3496 
3497 static Value *packTBLDVectorList(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
3498                                  Value *ExtOp, Value *IndexOp,
3499                                  llvm::Type *ResTy, unsigned IntID,
3500                                  const char *Name) {
3501   SmallVector<Value *, 2> TblOps;
3502   if (ExtOp)
3503     TblOps.push_back(ExtOp);
3504 
3505   // Build a vector containing sequential number like (0, 1, 2, ..., 15)
3506   SmallVector<Constant*, 16> Indices;
3507   llvm::VectorType *TblTy = cast<llvm::VectorType>(Ops[0]->getType());
3508   for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) {
3509     Indices.push_back(ConstantInt::get(CGF.Int32Ty, 2*i));
3510     Indices.push_back(ConstantInt::get(CGF.Int32Ty, 2*i+1));
3511   }
3512   Value *SV = llvm::ConstantVector::get(Indices);
3513 
3514   int PairPos = 0, End = Ops.size() - 1;
3515   while (PairPos < End) {
3516     TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos],
3517                                                      Ops[PairPos+1], SV, Name));
3518     PairPos += 2;
3519   }
3520 
3521   // If there's an odd number of 64-bit lookup table, fill the high 64-bit
3522   // of the 128-bit lookup table with zero.
3523   if (PairPos == End) {
3524     Value *ZeroTbl = ConstantAggregateZero::get(TblTy);
3525     TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos],
3526                                                      ZeroTbl, SV, Name));
3527   }
3528 
3529   Function *TblF;
3530   TblOps.push_back(IndexOp);
3531   TblF = CGF.CGM.getIntrinsic(IntID, ResTy);
3532 
3533   return CGF.EmitNeonCall(TblF, TblOps, Name);
3534 }
3535 
3536 static Value *EmitAArch64TblBuiltinExpr(CodeGenFunction &CGF,
3537                                         unsigned BuiltinID,
3538                                         const CallExpr *E) {
3539   unsigned int Int = 0;
3540   const char *s = NULL;
3541 
3542   switch (BuiltinID) {
3543   default:
3544     return 0;
3545   case NEON::BI__builtin_neon_vtbl1_v:
3546   case NEON::BI__builtin_neon_vqtbl1_v:
3547   case NEON::BI__builtin_neon_vqtbl1q_v:
3548   case NEON::BI__builtin_neon_vtbl2_v:
3549   case NEON::BI__builtin_neon_vqtbl2_v:
3550   case NEON::BI__builtin_neon_vqtbl2q_v:
3551   case NEON::BI__builtin_neon_vtbl3_v:
3552   case NEON::BI__builtin_neon_vqtbl3_v:
3553   case NEON::BI__builtin_neon_vqtbl3q_v:
3554   case NEON::BI__builtin_neon_vtbl4_v:
3555   case NEON::BI__builtin_neon_vqtbl4_v:
3556   case NEON::BI__builtin_neon_vqtbl4q_v:
3557   case NEON::BI__builtin_neon_vtbx1_v:
3558   case NEON::BI__builtin_neon_vqtbx1_v:
3559   case NEON::BI__builtin_neon_vqtbx1q_v:
3560   case NEON::BI__builtin_neon_vtbx2_v:
3561   case NEON::BI__builtin_neon_vqtbx2_v:
3562   case NEON::BI__builtin_neon_vqtbx2q_v:
3563   case NEON::BI__builtin_neon_vtbx3_v:
3564   case NEON::BI__builtin_neon_vqtbx3_v:
3565   case NEON::BI__builtin_neon_vqtbx3q_v:
3566   case NEON::BI__builtin_neon_vtbx4_v:
3567   case NEON::BI__builtin_neon_vqtbx4_v:
3568   case NEON::BI__builtin_neon_vqtbx4q_v:
3569     break;
3570   }
3571 
3572   assert(E->getNumArgs() >= 3);
3573 
3574   // Get the last argument, which specifies the vector type.
3575   llvm::APSInt Result;
3576   const Expr *Arg = E->getArg(E->getNumArgs() - 1);
3577   if (!Arg->isIntegerConstantExpr(Result, CGF.getContext()))
3578     return 0;
3579 
3580   // Determine the type of this overloaded NEON intrinsic.
3581   NeonTypeFlags Type(Result.getZExtValue());
3582   llvm::VectorType *VTy = GetNeonType(&CGF, Type);
3583   llvm::Type *Ty = VTy;
3584   if (!Ty)
3585     return 0;
3586 
3587   SmallVector<Value *, 4> Ops;
3588   for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++) {
3589     Ops.push_back(CGF.EmitScalarExpr(E->getArg(i)));
3590   }
3591 
3592   unsigned nElts = VTy->getNumElements();
3593 
3594   // AArch64 scalar builtins are not overloaded, they do not have an extra
3595   // argument that specifies the vector type, need to handle each case.
3596   SmallVector<Value *, 2> TblOps;
3597   switch (BuiltinID) {
3598   case NEON::BI__builtin_neon_vtbl1_v: {
3599     TblOps.push_back(Ops[0]);
3600     return packTBLDVectorList(CGF, TblOps, 0, Ops[1], Ty,
3601                               Intrinsic::aarch64_neon_vtbl1, "vtbl1");
3602   }
3603   case NEON::BI__builtin_neon_vtbl2_v: {
3604     TblOps.push_back(Ops[0]);
3605     TblOps.push_back(Ops[1]);
3606     return packTBLDVectorList(CGF, TblOps, 0, Ops[2], Ty,
3607                               Intrinsic::aarch64_neon_vtbl1, "vtbl1");
3608   }
3609   case NEON::BI__builtin_neon_vtbl3_v: {
3610     TblOps.push_back(Ops[0]);
3611     TblOps.push_back(Ops[1]);
3612     TblOps.push_back(Ops[2]);
3613     return packTBLDVectorList(CGF, TblOps, 0, Ops[3], Ty,
3614                               Intrinsic::aarch64_neon_vtbl2, "vtbl2");
3615   }
3616   case NEON::BI__builtin_neon_vtbl4_v: {
3617     TblOps.push_back(Ops[0]);
3618     TblOps.push_back(Ops[1]);
3619     TblOps.push_back(Ops[2]);
3620     TblOps.push_back(Ops[3]);
3621     return packTBLDVectorList(CGF, TblOps, 0, Ops[4], Ty,
3622                               Intrinsic::aarch64_neon_vtbl2, "vtbl2");
3623   }
3624   case NEON::BI__builtin_neon_vtbx1_v: {
3625     TblOps.push_back(Ops[1]);
3626     Value *TblRes = packTBLDVectorList(CGF, TblOps, 0, Ops[2], Ty,
3627                                     Intrinsic::aarch64_neon_vtbl1, "vtbl1");
3628 
3629     llvm::Constant *Eight = ConstantInt::get(VTy->getElementType(), 8);
3630     Value* EightV = llvm::ConstantVector::getSplat(nElts, Eight);
3631     Value *CmpRes = CGF.Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[2], EightV);
3632     CmpRes = CGF.Builder.CreateSExt(CmpRes, Ty);
3633 
3634     SmallVector<Value *, 4> BslOps;
3635     BslOps.push_back(CmpRes);
3636     BslOps.push_back(Ops[0]);
3637     BslOps.push_back(TblRes);
3638     Function *BslF = CGF.CGM.getIntrinsic(Intrinsic::arm_neon_vbsl, Ty);
3639     return CGF.EmitNeonCall(BslF, BslOps, "vbsl");
3640   }
3641   case NEON::BI__builtin_neon_vtbx2_v: {
3642     TblOps.push_back(Ops[1]);
3643     TblOps.push_back(Ops[2]);
3644     return packTBLDVectorList(CGF, TblOps, Ops[0], Ops[3], Ty,
3645                               Intrinsic::aarch64_neon_vtbx1, "vtbx1");
3646   }
3647   case NEON::BI__builtin_neon_vtbx3_v: {
3648     TblOps.push_back(Ops[1]);
3649     TblOps.push_back(Ops[2]);
3650     TblOps.push_back(Ops[3]);
3651     Value *TblRes = packTBLDVectorList(CGF, TblOps, 0, Ops[4], Ty,
3652                                        Intrinsic::aarch64_neon_vtbl2, "vtbl2");
3653 
3654     llvm::Constant *TwentyFour = ConstantInt::get(VTy->getElementType(), 24);
3655     Value* TwentyFourV = llvm::ConstantVector::getSplat(nElts, TwentyFour);
3656     Value *CmpRes = CGF.Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[4],
3657                                            TwentyFourV);
3658     CmpRes = CGF.Builder.CreateSExt(CmpRes, Ty);
3659 
3660     SmallVector<Value *, 4> BslOps;
3661     BslOps.push_back(CmpRes);
3662     BslOps.push_back(Ops[0]);
3663     BslOps.push_back(TblRes);
3664     Function *BslF = CGF.CGM.getIntrinsic(Intrinsic::arm_neon_vbsl, Ty);
3665     return CGF.EmitNeonCall(BslF, BslOps, "vbsl");
3666   }
3667   case NEON::BI__builtin_neon_vtbx4_v: {
3668     TblOps.push_back(Ops[1]);
3669     TblOps.push_back(Ops[2]);
3670     TblOps.push_back(Ops[3]);
3671     TblOps.push_back(Ops[4]);
3672     return packTBLDVectorList(CGF, TblOps, Ops[0], Ops[5], Ty,
3673                               Intrinsic::aarch64_neon_vtbx2, "vtbx2");
3674   }
3675   case NEON::BI__builtin_neon_vqtbl1_v:
3676   case NEON::BI__builtin_neon_vqtbl1q_v:
3677     Int = Intrinsic::aarch64_neon_vtbl1; s = "vtbl1"; break;
3678   case NEON::BI__builtin_neon_vqtbl2_v:
3679   case NEON::BI__builtin_neon_vqtbl2q_v: {
3680     Int = Intrinsic::aarch64_neon_vtbl2; s = "vtbl2"; break;
3681   case NEON::BI__builtin_neon_vqtbl3_v:
3682   case NEON::BI__builtin_neon_vqtbl3q_v:
3683     Int = Intrinsic::aarch64_neon_vtbl3; s = "vtbl3"; break;
3684   case NEON::BI__builtin_neon_vqtbl4_v:
3685   case NEON::BI__builtin_neon_vqtbl4q_v:
3686     Int = Intrinsic::aarch64_neon_vtbl4; s = "vtbl4"; break;
3687   case NEON::BI__builtin_neon_vqtbx1_v:
3688   case NEON::BI__builtin_neon_vqtbx1q_v:
3689     Int = Intrinsic::aarch64_neon_vtbx1; s = "vtbx1"; break;
3690   case NEON::BI__builtin_neon_vqtbx2_v:
3691   case NEON::BI__builtin_neon_vqtbx2q_v:
3692     Int = Intrinsic::aarch64_neon_vtbx2; s = "vtbx2"; break;
3693   case NEON::BI__builtin_neon_vqtbx3_v:
3694   case NEON::BI__builtin_neon_vqtbx3q_v:
3695     Int = Intrinsic::aarch64_neon_vtbx3; s = "vtbx3"; break;
3696   case NEON::BI__builtin_neon_vqtbx4_v:
3697   case NEON::BI__builtin_neon_vqtbx4q_v:
3698     Int = Intrinsic::aarch64_neon_vtbx4; s = "vtbx4"; break;
3699   }
3700   }
3701 
3702   if (!Int)
3703     return 0;
3704 
3705   Function *F = CGF.CGM.getIntrinsic(Int, Ty);
3706   return CGF.EmitNeonCall(F, Ops, s);
3707 }
3708 
3709 Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
3710                                                const CallExpr *E) {
3711 
3712   // Process AArch64 scalar builtins
3713   llvm::ArrayRef<NeonIntrinsicInfo> SISDInfo(AArch64SISDIntrinsicInfo);
3714   const NeonIntrinsicInfo *Builtin = findNeonIntrinsicInMap(
3715       SISDInfo, BuiltinID, AArch64SISDIntrinsicInfoProvenSorted);
3716 
3717   if (Builtin) {
3718     Value *Result = EmitAArch64ScalarBuiltinExpr(*this, *Builtin, E);
3719     assert(Result && "SISD intrinsic should have been handled");
3720     return Result;
3721   }
3722 
3723   // Process AArch64 table lookup builtins
3724   if (Value *Result = EmitAArch64TblBuiltinExpr(*this, BuiltinID, E))
3725     return Result;
3726 
3727   if (BuiltinID == AArch64::BI__clear_cache) {
3728     assert(E->getNumArgs() == 2 &&
3729            "Variadic __clear_cache slipped through on AArch64");
3730 
3731     const FunctionDecl *FD = E->getDirectCallee();
3732     SmallVector<Value *, 2> Ops;
3733     for (unsigned i = 0; i < E->getNumArgs(); i++)
3734       Ops.push_back(EmitScalarExpr(E->getArg(i)));
3735     llvm::Type *Ty = CGM.getTypes().ConvertType(FD->getType());
3736     llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
3737     StringRef Name = FD->getName();
3738     return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops);
3739   }
3740 
3741   SmallVector<Value *, 4> Ops;
3742   llvm::Value *Align = 0; // Alignment for load/store
3743 
3744   if (BuiltinID == NEON::BI__builtin_neon_vldrq_p128) {
3745    Value *Op = EmitScalarExpr(E->getArg(0));
3746    unsigned addressSpace =
3747      cast<llvm::PointerType>(Op->getType())->getAddressSpace();
3748    llvm::Type *Ty = llvm::Type::getFP128PtrTy(getLLVMContext(), addressSpace);
3749    Op = Builder.CreateBitCast(Op, Ty);
3750    Op = Builder.CreateLoad(Op);
3751    Ty = llvm::Type::getIntNTy(getLLVMContext(), 128);
3752    return Builder.CreateBitCast(Op, Ty);
3753   }
3754   if (BuiltinID == NEON::BI__builtin_neon_vstrq_p128) {
3755     Value *Op0 = EmitScalarExpr(E->getArg(0));
3756     unsigned addressSpace =
3757       cast<llvm::PointerType>(Op0->getType())->getAddressSpace();
3758     llvm::Type *PTy = llvm::Type::getFP128PtrTy(getLLVMContext(), addressSpace);
3759     Op0 = Builder.CreateBitCast(Op0, PTy);
3760     Value *Op1 = EmitScalarExpr(E->getArg(1));
3761     llvm::Type *Ty = llvm::Type::getFP128Ty(getLLVMContext());
3762     Op1 = Builder.CreateBitCast(Op1, Ty);
3763     return Builder.CreateStore(Op1, Op0);
3764   }
3765   for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++) {
3766     if (i == 0) {
3767       switch (BuiltinID) {
3768       case NEON::BI__builtin_neon_vld1_v:
3769       case NEON::BI__builtin_neon_vld1q_v:
3770       case NEON::BI__builtin_neon_vst1_v:
3771       case NEON::BI__builtin_neon_vst1q_v:
3772       case NEON::BI__builtin_neon_vst2_v:
3773       case NEON::BI__builtin_neon_vst2q_v:
3774       case NEON::BI__builtin_neon_vst3_v:
3775       case NEON::BI__builtin_neon_vst3q_v:
3776       case NEON::BI__builtin_neon_vst4_v:
3777       case NEON::BI__builtin_neon_vst4q_v:
3778       case NEON::BI__builtin_neon_vst1_x2_v:
3779       case NEON::BI__builtin_neon_vst1q_x2_v:
3780       case NEON::BI__builtin_neon_vst1_x3_v:
3781       case NEON::BI__builtin_neon_vst1q_x3_v:
3782       case NEON::BI__builtin_neon_vst1_x4_v:
3783       case NEON::BI__builtin_neon_vst1q_x4_v:
3784       // Handle ld1/st1 lane in this function a little different from ARM.
3785       case NEON::BI__builtin_neon_vld1_lane_v:
3786       case NEON::BI__builtin_neon_vld1q_lane_v:
3787       case NEON::BI__builtin_neon_vst1_lane_v:
3788       case NEON::BI__builtin_neon_vst1q_lane_v:
3789       case NEON::BI__builtin_neon_vst2_lane_v:
3790       case NEON::BI__builtin_neon_vst2q_lane_v:
3791       case NEON::BI__builtin_neon_vst3_lane_v:
3792       case NEON::BI__builtin_neon_vst3q_lane_v:
3793       case NEON::BI__builtin_neon_vst4_lane_v:
3794       case NEON::BI__builtin_neon_vst4q_lane_v:
3795       case NEON::BI__builtin_neon_vld1_dup_v:
3796       case NEON::BI__builtin_neon_vld1q_dup_v:
3797         // Get the alignment for the argument in addition to the value;
3798         // we'll use it later.
3799         std::pair<llvm::Value *, unsigned> Src =
3800             EmitPointerWithAlignment(E->getArg(0));
3801         Ops.push_back(Src.first);
3802         Align = Builder.getInt32(Src.second);
3803         continue;
3804       }
3805     }
3806     if (i == 1) {
3807       switch (BuiltinID) {
3808       case NEON::BI__builtin_neon_vld2_v:
3809       case NEON::BI__builtin_neon_vld2q_v:
3810       case NEON::BI__builtin_neon_vld3_v:
3811       case NEON::BI__builtin_neon_vld3q_v:
3812       case NEON::BI__builtin_neon_vld4_v:
3813       case NEON::BI__builtin_neon_vld4q_v:
3814       case NEON::BI__builtin_neon_vld1_x2_v:
3815       case NEON::BI__builtin_neon_vld1q_x2_v:
3816       case NEON::BI__builtin_neon_vld1_x3_v:
3817       case NEON::BI__builtin_neon_vld1q_x3_v:
3818       case NEON::BI__builtin_neon_vld1_x4_v:
3819       case NEON::BI__builtin_neon_vld1q_x4_v:
3820       // Handle ld1/st1 dup lane in this function a little different from ARM.
3821       case NEON::BI__builtin_neon_vld2_dup_v:
3822       case NEON::BI__builtin_neon_vld2q_dup_v:
3823       case NEON::BI__builtin_neon_vld3_dup_v:
3824       case NEON::BI__builtin_neon_vld3q_dup_v:
3825       case NEON::BI__builtin_neon_vld4_dup_v:
3826       case NEON::BI__builtin_neon_vld4q_dup_v:
3827       case NEON::BI__builtin_neon_vld2_lane_v:
3828       case NEON::BI__builtin_neon_vld2q_lane_v:
3829       case NEON::BI__builtin_neon_vld3_lane_v:
3830       case NEON::BI__builtin_neon_vld3q_lane_v:
3831       case NEON::BI__builtin_neon_vld4_lane_v:
3832       case NEON::BI__builtin_neon_vld4q_lane_v:
3833         // Get the alignment for the argument in addition to the value;
3834         // we'll use it later.
3835         std::pair<llvm::Value *, unsigned> Src =
3836             EmitPointerWithAlignment(E->getArg(1));
3837         Ops.push_back(Src.first);
3838         Align = Builder.getInt32(Src.second);
3839         continue;
3840       }
3841     }
3842     Ops.push_back(EmitScalarExpr(E->getArg(i)));
3843   }
3844 
3845   // Get the last argument, which specifies the vector type.
3846   llvm::APSInt Result;
3847   const Expr *Arg = E->getArg(E->getNumArgs() - 1);
3848   if (!Arg->isIntegerConstantExpr(Result, getContext()))
3849     return 0;
3850 
3851   // Determine the type of this overloaded NEON intrinsic.
3852   NeonTypeFlags Type(Result.getZExtValue());
3853   bool usgn = Type.isUnsigned();
3854   bool quad = Type.isQuad();
3855 
3856   llvm::VectorType *VTy = GetNeonType(this, Type);
3857   llvm::Type *Ty = VTy;
3858   if (!Ty)
3859     return 0;
3860 
3861 
3862   // Many NEON builtins have identical semantics and uses in ARM and
3863   // AArch64. Emit these in a single function.
3864   llvm::ArrayRef<NeonIntrinsicInfo> IntrinsicMap(ARMSIMDIntrinsicMap);
3865   Builtin = findNeonIntrinsicInMap(IntrinsicMap, BuiltinID,
3866                                    NEONSIMDIntrinsicsProvenSorted);
3867   if (Builtin)
3868     return EmitCommonNeonBuiltinExpr(
3869         Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
3870         Builtin->NameHint, Builtin->TypeModifier, E, Ops, Align);
3871 
3872   unsigned Int;
3873   switch (BuiltinID) {
3874   default:
3875     return 0;
3876 
3877   // AArch64 builtins mapping to legacy ARM v7 builtins.
3878   // FIXME: the mapped builtins listed correspond to what has been tested
3879   // in aarch64-neon-intrinsics.c so far.
3880 
3881   // Shift by immediate
3882   case NEON::BI__builtin_neon_vrshr_n_v:
3883   case NEON::BI__builtin_neon_vrshrq_n_v:
3884     Int = usgn ? Intrinsic::aarch64_neon_vurshr
3885                : Intrinsic::aarch64_neon_vsrshr;
3886     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshr_n");
3887   case NEON::BI__builtin_neon_vsra_n_v:
3888     if (VTy->getElementType()->isIntegerTy(64)) {
3889       Int = usgn ? Intrinsic::aarch64_neon_vsradu_n
3890                  : Intrinsic::aarch64_neon_vsrads_n;
3891       return EmitNeonCall(CGM.getIntrinsic(Int), Ops, "vsra_n");
3892     }
3893     return EmitARMBuiltinExpr(NEON::BI__builtin_neon_vsra_n_v, E);
3894   case NEON::BI__builtin_neon_vsraq_n_v:
3895     return EmitARMBuiltinExpr(NEON::BI__builtin_neon_vsraq_n_v, E);
3896   case NEON::BI__builtin_neon_vrsra_n_v:
3897     if (VTy->getElementType()->isIntegerTy(64)) {
3898       Int = usgn ? Intrinsic::aarch64_neon_vrsradu_n
3899                  : Intrinsic::aarch64_neon_vrsrads_n;
3900       return EmitNeonCall(CGM.getIntrinsic(Int), Ops, "vrsra_n");
3901     }
3902     // fall through
3903   case NEON::BI__builtin_neon_vrsraq_n_v: {
3904     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
3905     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
3906     Int = usgn ? Intrinsic::aarch64_neon_vurshr
3907                : Intrinsic::aarch64_neon_vsrshr;
3908     Ops[1] = Builder.CreateCall2(CGM.getIntrinsic(Int, Ty), Ops[1], Ops[2]);
3909     return Builder.CreateAdd(Ops[0], Ops[1], "vrsra_n");
3910   }
3911   case NEON::BI__builtin_neon_vqshlu_n_v:
3912   case NEON::BI__builtin_neon_vqshluq_n_v:
3913     Int = Intrinsic::aarch64_neon_vsqshlu;
3914     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshlu_n");
3915   case NEON::BI__builtin_neon_vsri_n_v:
3916   case NEON::BI__builtin_neon_vsriq_n_v:
3917     Int = Intrinsic::aarch64_neon_vsri;
3918     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsri_n");
3919   case NEON::BI__builtin_neon_vsli_n_v:
3920   case NEON::BI__builtin_neon_vsliq_n_v:
3921     Int = Intrinsic::aarch64_neon_vsli;
3922     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsli_n");
3923   case NEON::BI__builtin_neon_vqshrun_n_v:
3924     Int = Intrinsic::aarch64_neon_vsqshrun;
3925     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrun_n");
3926   case NEON::BI__builtin_neon_vrshrn_n_v:
3927     Int = Intrinsic::aarch64_neon_vrshrn;
3928     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshrn_n");
3929   case NEON::BI__builtin_neon_vqrshrun_n_v:
3930     Int = Intrinsic::aarch64_neon_vsqrshrun;
3931     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrun_n");
3932   case NEON::BI__builtin_neon_vqshrn_n_v:
3933     Int = usgn ? Intrinsic::aarch64_neon_vuqshrn
3934                : Intrinsic::aarch64_neon_vsqshrn;
3935     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n");
3936   case NEON::BI__builtin_neon_vqrshrn_n_v:
3937     Int = usgn ? Intrinsic::aarch64_neon_vuqrshrn
3938                : Intrinsic::aarch64_neon_vsqrshrn;
3939     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n");
3940 
3941   // Convert
3942   case NEON::BI__builtin_neon_vcvt_n_f64_v:
3943   case NEON::BI__builtin_neon_vcvtq_n_f64_v: {
3944     llvm::Type *FloatTy =
3945         GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad));
3946     llvm::Type *Tys[2] = { FloatTy, Ty };
3947     Int = usgn ? Intrinsic::arm_neon_vcvtfxu2fp
3948                : Intrinsic::arm_neon_vcvtfxs2fp;
3949     Function *F = CGM.getIntrinsic(Int, Tys);
3950     return EmitNeonCall(F, Ops, "vcvt_n");
3951   }
3952 
3953   // Load/Store
3954   case NEON::BI__builtin_neon_vld1_x2_v:
3955   case NEON::BI__builtin_neon_vld1q_x2_v:
3956   case NEON::BI__builtin_neon_vld1_x3_v:
3957   case NEON::BI__builtin_neon_vld1q_x3_v:
3958   case NEON::BI__builtin_neon_vld1_x4_v:
3959   case NEON::BI__builtin_neon_vld1q_x4_v: {
3960     unsigned Int;
3961     switch (BuiltinID) {
3962     case NEON::BI__builtin_neon_vld1_x2_v:
3963     case NEON::BI__builtin_neon_vld1q_x2_v:
3964       Int = Intrinsic::aarch64_neon_vld1x2;
3965       break;
3966     case NEON::BI__builtin_neon_vld1_x3_v:
3967     case NEON::BI__builtin_neon_vld1q_x3_v:
3968       Int = Intrinsic::aarch64_neon_vld1x3;
3969       break;
3970     case NEON::BI__builtin_neon_vld1_x4_v:
3971     case NEON::BI__builtin_neon_vld1q_x4_v:
3972       Int = Intrinsic::aarch64_neon_vld1x4;
3973       break;
3974     }
3975     Function *F = CGM.getIntrinsic(Int, Ty);
3976     Ops[1] = Builder.CreateCall2(F, Ops[1], Align, "vld1xN");
3977     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
3978     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
3979     return Builder.CreateStore(Ops[1], Ops[0]);
3980   }
3981   case NEON::BI__builtin_neon_vst1_x2_v:
3982   case NEON::BI__builtin_neon_vst1q_x2_v:
3983   case NEON::BI__builtin_neon_vst1_x3_v:
3984   case NEON::BI__builtin_neon_vst1q_x3_v:
3985   case NEON::BI__builtin_neon_vst1_x4_v:
3986   case NEON::BI__builtin_neon_vst1q_x4_v: {
3987     Ops.push_back(Align);
3988     unsigned Int;
3989     switch (BuiltinID) {
3990     case NEON::BI__builtin_neon_vst1_x2_v:
3991     case NEON::BI__builtin_neon_vst1q_x2_v:
3992       Int = Intrinsic::aarch64_neon_vst1x2;
3993       break;
3994     case NEON::BI__builtin_neon_vst1_x3_v:
3995     case NEON::BI__builtin_neon_vst1q_x3_v:
3996       Int = Intrinsic::aarch64_neon_vst1x3;
3997       break;
3998     case NEON::BI__builtin_neon_vst1_x4_v:
3999     case NEON::BI__builtin_neon_vst1q_x4_v:
4000       Int = Intrinsic::aarch64_neon_vst1x4;
4001       break;
4002     }
4003     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "");
4004   }
4005   case NEON::BI__builtin_neon_vld1_lane_v:
4006   case NEON::BI__builtin_neon_vld1q_lane_v: {
4007     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
4008     Ty = llvm::PointerType::getUnqual(VTy->getElementType());
4009     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
4010     LoadInst *Ld = Builder.CreateLoad(Ops[0]);
4011     Ld->setAlignment(cast<ConstantInt>(Align)->getZExtValue());
4012     return Builder.CreateInsertElement(Ops[1], Ld, Ops[2], "vld1_lane");
4013   }
4014   case NEON::BI__builtin_neon_vst1_lane_v:
4015   case NEON::BI__builtin_neon_vst1q_lane_v: {
4016     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
4017     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
4018     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
4019     StoreInst *St =
4020         Builder.CreateStore(Ops[1], Builder.CreateBitCast(Ops[0], Ty));
4021     St->setAlignment(cast<ConstantInt>(Align)->getZExtValue());
4022     return St;
4023   }
4024   case NEON::BI__builtin_neon_vld2_dup_v:
4025   case NEON::BI__builtin_neon_vld2q_dup_v:
4026   case NEON::BI__builtin_neon_vld3_dup_v:
4027   case NEON::BI__builtin_neon_vld3q_dup_v:
4028   case NEON::BI__builtin_neon_vld4_dup_v:
4029   case NEON::BI__builtin_neon_vld4q_dup_v: {
4030     // Handle 64-bit x 1 elements as a special-case.  There is no "dup" needed.
4031     if (VTy->getElementType()->getPrimitiveSizeInBits() == 64 &&
4032         VTy->getNumElements() == 1) {
4033       switch (BuiltinID) {
4034       case NEON::BI__builtin_neon_vld2_dup_v:
4035         Int = Intrinsic::arm_neon_vld2;
4036         break;
4037       case NEON::BI__builtin_neon_vld3_dup_v:
4038         Int = Intrinsic::arm_neon_vld3;
4039         break;
4040       case NEON::BI__builtin_neon_vld4_dup_v:
4041         Int = Intrinsic::arm_neon_vld4;
4042         break;
4043       default:
4044         llvm_unreachable("unknown vld_dup intrinsic?");
4045       }
4046       Function *F = CGM.getIntrinsic(Int, Ty);
4047       Ops[1] = Builder.CreateCall2(F, Ops[1], Align, "vld_dup");
4048       Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
4049       Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
4050       return Builder.CreateStore(Ops[1], Ops[0]);
4051     }
4052     switch (BuiltinID) {
4053     case NEON::BI__builtin_neon_vld2_dup_v:
4054     case NEON::BI__builtin_neon_vld2q_dup_v:
4055       Int = Intrinsic::arm_neon_vld2lane;
4056       break;
4057     case NEON::BI__builtin_neon_vld3_dup_v:
4058     case NEON::BI__builtin_neon_vld3q_dup_v:
4059       Int = Intrinsic::arm_neon_vld3lane;
4060       break;
4061     case NEON::BI__builtin_neon_vld4_dup_v:
4062     case NEON::BI__builtin_neon_vld4q_dup_v:
4063       Int = Intrinsic::arm_neon_vld4lane;
4064       break;
4065     }
4066     Function *F = CGM.getIntrinsic(Int, Ty);
4067     llvm::StructType *STy = cast<llvm::StructType>(F->getReturnType());
4068 
4069     SmallVector<Value *, 6> Args;
4070     Args.push_back(Ops[1]);
4071     Args.append(STy->getNumElements(), UndefValue::get(Ty));
4072 
4073     llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
4074     Args.push_back(CI);
4075     Args.push_back(Align);
4076 
4077     Ops[1] = Builder.CreateCall(F, Args, "vld_dup");
4078     // splat lane 0 to all elts in each vector of the result.
4079     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
4080       Value *Val = Builder.CreateExtractValue(Ops[1], i);
4081       Value *Elt = Builder.CreateBitCast(Val, Ty);
4082       Elt = EmitNeonSplat(Elt, CI);
4083       Elt = Builder.CreateBitCast(Elt, Val->getType());
4084       Ops[1] = Builder.CreateInsertValue(Ops[1], Elt, i);
4085     }
4086     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
4087     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
4088     return Builder.CreateStore(Ops[1], Ops[0]);
4089   }
4090 
4091   case NEON::BI__builtin_neon_vmul_lane_v:
4092   case NEON::BI__builtin_neon_vmul_laneq_v: {
4093     // v1f64 vmul_lane should be mapped to Neon scalar mul lane
4094     bool Quad = false;
4095     if (BuiltinID == NEON::BI__builtin_neon_vmul_laneq_v)
4096       Quad = true;
4097     Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
4098     llvm::Type *VTy = GetNeonType(this,
4099       NeonTypeFlags(NeonTypeFlags::Float64, false, Quad));
4100     Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
4101     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
4102     Value *Result = Builder.CreateFMul(Ops[0], Ops[1]);
4103     return Builder.CreateBitCast(Result, Ty);
4104   }
4105 
4106   // AArch64-only builtins
4107   case NEON::BI__builtin_neon_vfmaq_laneq_v: {
4108     Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
4109     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
4110     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
4111 
4112     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
4113     Ops[2] = EmitNeonSplat(Ops[2], cast<ConstantInt>(Ops[3]));
4114     return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]);
4115   }
4116   case NEON::BI__builtin_neon_vfmaq_lane_v: {
4117     Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
4118     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
4119     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
4120 
4121     llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);
4122     llvm::Type *STy = llvm::VectorType::get(VTy->getElementType(),
4123                                             VTy->getNumElements() / 2);
4124     Ops[2] = Builder.CreateBitCast(Ops[2], STy);
4125     Value* SV = llvm::ConstantVector::getSplat(VTy->getNumElements(),
4126                                                cast<ConstantInt>(Ops[3]));
4127     Ops[2] = Builder.CreateShuffleVector(Ops[2], Ops[2], SV, "lane");
4128 
4129     return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]);
4130   }
4131   case NEON::BI__builtin_neon_vfma_lane_v: {
4132     llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);
4133     // v1f64 fma should be mapped to Neon scalar f64 fma
4134     if (VTy && VTy->getElementType() == DoubleTy) {
4135       Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
4136       Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy);
4137       llvm::Type *VTy = GetNeonType(this,
4138         NeonTypeFlags(NeonTypeFlags::Float64, false, false));
4139       Ops[2] = Builder.CreateBitCast(Ops[2], VTy);
4140       Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
4141       Value *F = CGM.getIntrinsic(Intrinsic::fma, DoubleTy);
4142       Value *Result = Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]);
4143       return Builder.CreateBitCast(Result, Ty);
4144     }
4145     Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
4146     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
4147     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
4148 
4149     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
4150     Ops[2] = EmitNeonSplat(Ops[2], cast<ConstantInt>(Ops[3]));
4151     return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]);
4152   }
4153   case NEON::BI__builtin_neon_vfma_laneq_v: {
4154     llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);
4155     // v1f64 fma should be mapped to Neon scalar f64 fma
4156     if (VTy && VTy->getElementType() == DoubleTy) {
4157       Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
4158       Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy);
4159       llvm::Type *VTy = GetNeonType(this,
4160         NeonTypeFlags(NeonTypeFlags::Float64, false, true));
4161       Ops[2] = Builder.CreateBitCast(Ops[2], VTy);
4162       Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
4163       Value *F = CGM.getIntrinsic(Intrinsic::fma, DoubleTy);
4164       Value *Result = Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]);
4165       return Builder.CreateBitCast(Result, Ty);
4166     }
4167     Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
4168     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
4169     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
4170 
4171     llvm::Type *STy = llvm::VectorType::get(VTy->getElementType(),
4172                                             VTy->getNumElements() * 2);
4173     Ops[2] = Builder.CreateBitCast(Ops[2], STy);
4174     Value* SV = llvm::ConstantVector::getSplat(VTy->getNumElements(),
4175                                                cast<ConstantInt>(Ops[3]));
4176     Ops[2] = Builder.CreateShuffleVector(Ops[2], Ops[2], SV, "lane");
4177 
4178     return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]);
4179   }
4180   case NEON::BI__builtin_neon_vfms_v:
4181   case NEON::BI__builtin_neon_vfmsq_v: {
4182     Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
4183     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
4184     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
4185     Ops[1] = Builder.CreateFNeg(Ops[1]);
4186     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
4187 
4188     // LLVM's fma intrinsic puts the accumulator in the last position, but the
4189     // AArch64 intrinsic has it first.
4190     return Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]);
4191   }
4192   case NEON::BI__builtin_neon_vmaxnm_v:
4193   case NEON::BI__builtin_neon_vmaxnmq_v: {
4194     Int = Intrinsic::aarch64_neon_vmaxnm;
4195     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmaxnm");
4196   }
4197   case NEON::BI__builtin_neon_vminnm_v:
4198   case NEON::BI__builtin_neon_vminnmq_v: {
4199     Int = Intrinsic::aarch64_neon_vminnm;
4200     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vminnm");
4201   }
4202   case NEON::BI__builtin_neon_vpmaxnm_v:
4203   case NEON::BI__builtin_neon_vpmaxnmq_v: {
4204     Int = Intrinsic::aarch64_neon_vpmaxnm;
4205     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmaxnm");
4206   }
4207   case NEON::BI__builtin_neon_vpminnm_v:
4208   case NEON::BI__builtin_neon_vpminnmq_v: {
4209     Int = Intrinsic::aarch64_neon_vpminnm;
4210     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpminnm");
4211   }
4212   case NEON::BI__builtin_neon_vpmaxq_v: {
4213     Int = usgn ? Intrinsic::arm_neon_vpmaxu : Intrinsic::arm_neon_vpmaxs;
4214     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmax");
4215   }
4216   case NEON::BI__builtin_neon_vpminq_v: {
4217     Int = usgn ? Intrinsic::arm_neon_vpminu : Intrinsic::arm_neon_vpmins;
4218     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmin");
4219   }
4220   case NEON::BI__builtin_neon_vmulx_v:
4221   case NEON::BI__builtin_neon_vmulxq_v: {
4222     Int = Intrinsic::aarch64_neon_vmulx;
4223     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx");
4224   }
4225   case NEON::BI__builtin_neon_vsqadd_v:
4226   case NEON::BI__builtin_neon_vsqaddq_v: {
4227     Int = Intrinsic::aarch64_neon_usqadd;
4228     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqadd");
4229   }
4230   case NEON::BI__builtin_neon_vuqadd_v:
4231   case NEON::BI__builtin_neon_vuqaddq_v: {
4232     Int = Intrinsic::aarch64_neon_suqadd;
4233     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd");
4234   }
4235   case NEON::BI__builtin_neon_vrbit_v:
4236   case NEON::BI__builtin_neon_vrbitq_v:
4237     Int = Intrinsic::aarch64_neon_rbit;
4238     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrbit");
4239   case NEON::BI__builtin_neon_vcvt_f32_f64: {
4240     NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float64, false, true);
4241     Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));
4242     return Builder.CreateFPTrunc(Ops[0], Ty, "vcvt");
4243   }
4244   case NEON::BI__builtin_neon_vcvtx_f32_v: {
4245     llvm::Type *EltTy = FloatTy;
4246     llvm::Type *ResTy = llvm::VectorType::get(EltTy, 2);
4247     llvm::Type *Tys[2] = { ResTy, Ty };
4248     Int = Intrinsic::aarch64_neon_vcvtxn;
4249     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtx_f32_f64");
4250   }
4251   case NEON::BI__builtin_neon_vcvt_f64_f32: {
4252     llvm::Type *OpTy =
4253         GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, false));
4254     Ops[0] = Builder.CreateBitCast(Ops[0], OpTy);
4255     return Builder.CreateFPExt(Ops[0], Ty, "vcvt");
4256   }
4257   case NEON::BI__builtin_neon_vcvt_f64_v:
4258   case NEON::BI__builtin_neon_vcvtq_f64_v: {
4259     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
4260     Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad));
4261     return usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
4262                 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
4263   }
4264   case NEON::BI__builtin_neon_vrndn_v:
4265   case NEON::BI__builtin_neon_vrndnq_v: {
4266     Int = Intrinsic::aarch64_neon_frintn;
4267     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndn");
4268   }
4269   case NEON::BI__builtin_neon_vrnda_v:
4270   case NEON::BI__builtin_neon_vrndaq_v: {
4271     Int = Intrinsic::round;
4272     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnda");
4273   }
4274   case NEON::BI__builtin_neon_vrndp_v:
4275   case NEON::BI__builtin_neon_vrndpq_v: {
4276     Int = Intrinsic::ceil;
4277     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndp");
4278   }
4279   case NEON::BI__builtin_neon_vrndm_v:
4280   case NEON::BI__builtin_neon_vrndmq_v: {
4281     Int = Intrinsic::floor;
4282     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndm");
4283   }
4284   case NEON::BI__builtin_neon_vrndx_v:
4285   case NEON::BI__builtin_neon_vrndxq_v: {
4286     Int = Intrinsic::rint;
4287     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndx");
4288   }
4289   case NEON::BI__builtin_neon_vrnd_v:
4290   case NEON::BI__builtin_neon_vrndq_v: {
4291     Int = Intrinsic::trunc;
4292     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd");
4293   }
4294   case NEON::BI__builtin_neon_vrndi_v:
4295   case NEON::BI__builtin_neon_vrndiq_v: {
4296     Int = Intrinsic::nearbyint;
4297     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndi");
4298   }
4299   case NEON::BI__builtin_neon_vsqrt_v:
4300   case NEON::BI__builtin_neon_vsqrtq_v: {
4301     Int = Intrinsic::sqrt;
4302     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqrt");
4303   }
4304   case NEON::BI__builtin_neon_vceqz_v:
4305   case NEON::BI__builtin_neon_vceqzq_v:
4306     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OEQ,
4307                                          ICmpInst::ICMP_EQ, "vceqz");
4308   case NEON::BI__builtin_neon_vcgez_v:
4309   case NEON::BI__builtin_neon_vcgezq_v:
4310     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGE,
4311                                          ICmpInst::ICMP_SGE, "vcgez");
4312   case NEON::BI__builtin_neon_vclez_v:
4313   case NEON::BI__builtin_neon_vclezq_v:
4314     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLE,
4315                                          ICmpInst::ICMP_SLE, "vclez");
4316   case NEON::BI__builtin_neon_vcgtz_v:
4317   case NEON::BI__builtin_neon_vcgtzq_v:
4318     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGT,
4319                                          ICmpInst::ICMP_SGT, "vcgtz");
4320   case NEON::BI__builtin_neon_vcltz_v:
4321   case NEON::BI__builtin_neon_vcltzq_v:
4322     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLT,
4323                                          ICmpInst::ICMP_SLT, "vcltz");
4324   }
4325 }
4326 
4327 Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
4328                                            const CallExpr *E) {
4329   if (BuiltinID == ARM::BI__clear_cache) {
4330     assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
4331     const FunctionDecl *FD = E->getDirectCallee();
4332     SmallVector<Value*, 2> Ops;
4333     for (unsigned i = 0; i < 2; i++)
4334       Ops.push_back(EmitScalarExpr(E->getArg(i)));
4335     llvm::Type *Ty = CGM.getTypes().ConvertType(FD->getType());
4336     llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
4337     StringRef Name = FD->getName();
4338     return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops);
4339   }
4340 
4341   if (BuiltinID == ARM::BI__builtin_arm_ldrexd ||
4342       (BuiltinID == ARM::BI__builtin_arm_ldrex &&
4343        getContext().getTypeSize(E->getType()) == 64)) {
4344     Function *F = CGM.getIntrinsic(Intrinsic::arm_ldrexd);
4345 
4346     Value *LdPtr = EmitScalarExpr(E->getArg(0));
4347     Value *Val = Builder.CreateCall(F, Builder.CreateBitCast(LdPtr, Int8PtrTy),
4348                                     "ldrexd");
4349 
4350     Value *Val0 = Builder.CreateExtractValue(Val, 1);
4351     Value *Val1 = Builder.CreateExtractValue(Val, 0);
4352     Val0 = Builder.CreateZExt(Val0, Int64Ty);
4353     Val1 = Builder.CreateZExt(Val1, Int64Ty);
4354 
4355     Value *ShiftCst = llvm::ConstantInt::get(Int64Ty, 32);
4356     Val = Builder.CreateShl(Val0, ShiftCst, "shl", true /* nuw */);
4357     Val = Builder.CreateOr(Val, Val1);
4358     return Builder.CreateBitCast(Val, ConvertType(E->getType()));
4359   }
4360 
4361   if (BuiltinID == ARM::BI__builtin_arm_ldrex) {
4362     Value *LoadAddr = EmitScalarExpr(E->getArg(0));
4363 
4364     QualType Ty = E->getType();
4365     llvm::Type *RealResTy = ConvertType(Ty);
4366     llvm::Type *IntResTy = llvm::IntegerType::get(getLLVMContext(),
4367                                                   getContext().getTypeSize(Ty));
4368     LoadAddr = Builder.CreateBitCast(LoadAddr, IntResTy->getPointerTo());
4369 
4370     Function *F = CGM.getIntrinsic(Intrinsic::arm_ldrex, LoadAddr->getType());
4371     Value *Val = Builder.CreateCall(F, LoadAddr, "ldrex");
4372 
4373     if (RealResTy->isPointerTy())
4374       return Builder.CreateIntToPtr(Val, RealResTy);
4375     else {
4376       Val = Builder.CreateTruncOrBitCast(Val, IntResTy);
4377       return Builder.CreateBitCast(Val, RealResTy);
4378     }
4379   }
4380 
4381   if (BuiltinID == ARM::BI__builtin_arm_strexd ||
4382       (BuiltinID == ARM::BI__builtin_arm_strex &&
4383        getContext().getTypeSize(E->getArg(0)->getType()) == 64)) {
4384     Function *F = CGM.getIntrinsic(Intrinsic::arm_strexd);
4385     llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty, NULL);
4386 
4387     Value *Tmp = CreateMemTemp(E->getArg(0)->getType());
4388     Value *Val = EmitScalarExpr(E->getArg(0));
4389     Builder.CreateStore(Val, Tmp);
4390 
4391     Value *LdPtr = Builder.CreateBitCast(Tmp,llvm::PointerType::getUnqual(STy));
4392     Val = Builder.CreateLoad(LdPtr);
4393 
4394     Value *Arg0 = Builder.CreateExtractValue(Val, 0);
4395     Value *Arg1 = Builder.CreateExtractValue(Val, 1);
4396     Value *StPtr = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)), Int8PtrTy);
4397     return Builder.CreateCall3(F, Arg0, Arg1, StPtr, "strexd");
4398   }
4399 
4400   if (BuiltinID == ARM::BI__builtin_arm_strex) {
4401     Value *StoreVal = EmitScalarExpr(E->getArg(0));
4402     Value *StoreAddr = EmitScalarExpr(E->getArg(1));
4403 
4404     QualType Ty = E->getArg(0)->getType();
4405     llvm::Type *StoreTy = llvm::IntegerType::get(getLLVMContext(),
4406                                                  getContext().getTypeSize(Ty));
4407     StoreAddr = Builder.CreateBitCast(StoreAddr, StoreTy->getPointerTo());
4408 
4409     if (StoreVal->getType()->isPointerTy())
4410       StoreVal = Builder.CreatePtrToInt(StoreVal, Int32Ty);
4411     else {
4412       StoreVal = Builder.CreateBitCast(StoreVal, StoreTy);
4413       StoreVal = Builder.CreateZExtOrBitCast(StoreVal, Int32Ty);
4414     }
4415 
4416     Function *F = CGM.getIntrinsic(Intrinsic::arm_strex, StoreAddr->getType());
4417     return Builder.CreateCall2(F, StoreVal, StoreAddr, "strex");
4418   }
4419 
4420   if (BuiltinID == ARM::BI__builtin_arm_clrex) {
4421     Function *F = CGM.getIntrinsic(Intrinsic::arm_clrex);
4422     return Builder.CreateCall(F);
4423   }
4424 
4425   if (BuiltinID == ARM::BI__builtin_arm_sevl) {
4426     Function *F = CGM.getIntrinsic(Intrinsic::arm_sevl);
4427     return Builder.CreateCall(F);
4428   }
4429 
4430   // CRC32
4431   Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic;
4432   switch (BuiltinID) {
4433   case ARM::BI__builtin_arm_crc32b:
4434     CRCIntrinsicID = Intrinsic::arm_crc32b; break;
4435   case ARM::BI__builtin_arm_crc32cb:
4436     CRCIntrinsicID = Intrinsic::arm_crc32cb; break;
4437   case ARM::BI__builtin_arm_crc32h:
4438     CRCIntrinsicID = Intrinsic::arm_crc32h; break;
4439   case ARM::BI__builtin_arm_crc32ch:
4440     CRCIntrinsicID = Intrinsic::arm_crc32ch; break;
4441   case ARM::BI__builtin_arm_crc32w:
4442   case ARM::BI__builtin_arm_crc32d:
4443     CRCIntrinsicID = Intrinsic::arm_crc32w; break;
4444   case ARM::BI__builtin_arm_crc32cw:
4445   case ARM::BI__builtin_arm_crc32cd:
4446     CRCIntrinsicID = Intrinsic::arm_crc32cw; break;
4447   }
4448 
4449   if (CRCIntrinsicID != Intrinsic::not_intrinsic) {
4450     Value *Arg0 = EmitScalarExpr(E->getArg(0));
4451     Value *Arg1 = EmitScalarExpr(E->getArg(1));
4452 
    // crc32{c,}d intrinsics are implemented as two calls to crc32{c,}w
    // intrinsics, hence we need different codegen for these cases.
4455     if (BuiltinID == ARM::BI__builtin_arm_crc32d ||
4456         BuiltinID == ARM::BI__builtin_arm_crc32cd) {
4457       Value *C1 = llvm::ConstantInt::get(Int64Ty, 32);
4458       Value *Arg1a = Builder.CreateTruncOrBitCast(Arg1, Int32Ty);
4459       Value *Arg1b = Builder.CreateLShr(Arg1, C1);
4460       Arg1b = Builder.CreateTruncOrBitCast(Arg1b, Int32Ty);
4461 
4462       Function *F = CGM.getIntrinsic(CRCIntrinsicID);
4463       Value *Res = Builder.CreateCall2(F, Arg0, Arg1a);
4464       return Builder.CreateCall2(F, Res, Arg1b);
4465     } else {
4466       Arg1 = Builder.CreateZExtOrBitCast(Arg1, Int32Ty);
4467 
4468       Function *F = CGM.getIntrinsic(CRCIntrinsicID);
4469       return Builder.CreateCall2(F, Arg0, Arg1);
4470     }
4471   }
4472 
4473   SmallVector<Value*, 4> Ops;
4474   llvm::Value *Align = 0;
4475   for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++) {
4476     if (i == 0) {
4477       switch (BuiltinID) {
4478       case NEON::BI__builtin_neon_vld1_v:
4479       case NEON::BI__builtin_neon_vld1q_v:
4480       case NEON::BI__builtin_neon_vld1q_lane_v:
4481       case NEON::BI__builtin_neon_vld1_lane_v:
4482       case NEON::BI__builtin_neon_vld1_dup_v:
4483       case NEON::BI__builtin_neon_vld1q_dup_v:
4484       case NEON::BI__builtin_neon_vst1_v:
4485       case NEON::BI__builtin_neon_vst1q_v:
4486       case NEON::BI__builtin_neon_vst1q_lane_v:
4487       case NEON::BI__builtin_neon_vst1_lane_v:
4488       case NEON::BI__builtin_neon_vst2_v:
4489       case NEON::BI__builtin_neon_vst2q_v:
4490       case NEON::BI__builtin_neon_vst2_lane_v:
4491       case NEON::BI__builtin_neon_vst2q_lane_v:
4492       case NEON::BI__builtin_neon_vst3_v:
4493       case NEON::BI__builtin_neon_vst3q_v:
4494       case NEON::BI__builtin_neon_vst3_lane_v:
4495       case NEON::BI__builtin_neon_vst3q_lane_v:
4496       case NEON::BI__builtin_neon_vst4_v:
4497       case NEON::BI__builtin_neon_vst4q_v:
4498       case NEON::BI__builtin_neon_vst4_lane_v:
4499       case NEON::BI__builtin_neon_vst4q_lane_v:
4500         // Get the alignment for the argument in addition to the value;
4501         // we'll use it later.
4502         std::pair<llvm::Value*, unsigned> Src =
4503             EmitPointerWithAlignment(E->getArg(0));
4504         Ops.push_back(Src.first);
4505         Align = Builder.getInt32(Src.second);
4506         continue;
4507       }
4508     }
4509     if (i == 1) {
4510       switch (BuiltinID) {
4511       case NEON::BI__builtin_neon_vld2_v:
4512       case NEON::BI__builtin_neon_vld2q_v:
4513       case NEON::BI__builtin_neon_vld3_v:
4514       case NEON::BI__builtin_neon_vld3q_v:
4515       case NEON::BI__builtin_neon_vld4_v:
4516       case NEON::BI__builtin_neon_vld4q_v:
4517       case NEON::BI__builtin_neon_vld2_lane_v:
4518       case NEON::BI__builtin_neon_vld2q_lane_v:
4519       case NEON::BI__builtin_neon_vld3_lane_v:
4520       case NEON::BI__builtin_neon_vld3q_lane_v:
4521       case NEON::BI__builtin_neon_vld4_lane_v:
4522       case NEON::BI__builtin_neon_vld4q_lane_v:
4523       case NEON::BI__builtin_neon_vld2_dup_v:
4524       case NEON::BI__builtin_neon_vld3_dup_v:
4525       case NEON::BI__builtin_neon_vld4_dup_v:
4526         // Get the alignment for the argument in addition to the value;
4527         // we'll use it later.
4528         std::pair<llvm::Value*, unsigned> Src =
4529             EmitPointerWithAlignment(E->getArg(1));
4530         Ops.push_back(Src.first);
4531         Align = Builder.getInt32(Src.second);
4532         continue;
4533       }
4534     }
4535     Ops.push_back(EmitScalarExpr(E->getArg(i)));
4536   }
4537 
4538   switch (BuiltinID) {
4539   default: break;
4540   // vget_lane and vset_lane are not overloaded and do not have an extra
4541   // argument that specifies the vector type.
4542   case NEON::BI__builtin_neon_vget_lane_i8:
4543   case NEON::BI__builtin_neon_vget_lane_i16:
4544   case NEON::BI__builtin_neon_vget_lane_i32:
4545   case NEON::BI__builtin_neon_vget_lane_i64:
4546   case NEON::BI__builtin_neon_vget_lane_f32:
4547   case NEON::BI__builtin_neon_vgetq_lane_i8:
4548   case NEON::BI__builtin_neon_vgetq_lane_i16:
4549   case NEON::BI__builtin_neon_vgetq_lane_i32:
4550   case NEON::BI__builtin_neon_vgetq_lane_i64:
4551   case NEON::BI__builtin_neon_vgetq_lane_f32:
4552     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
4553                                         "vget_lane");
4554   case NEON::BI__builtin_neon_vset_lane_i8:
4555   case NEON::BI__builtin_neon_vset_lane_i16:
4556   case NEON::BI__builtin_neon_vset_lane_i32:
4557   case NEON::BI__builtin_neon_vset_lane_i64:
4558   case NEON::BI__builtin_neon_vset_lane_f32:
4559   case NEON::BI__builtin_neon_vsetq_lane_i8:
4560   case NEON::BI__builtin_neon_vsetq_lane_i16:
4561   case NEON::BI__builtin_neon_vsetq_lane_i32:
4562   case NEON::BI__builtin_neon_vsetq_lane_i64:
4563   case NEON::BI__builtin_neon_vsetq_lane_f32:
4564     Ops.push_back(EmitScalarExpr(E->getArg(2)));
4565     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
4566 
  // Non-polymorphic crypto instructions are also not overloaded.
4568   case NEON::BI__builtin_neon_vsha1h_u32:
4569     Ops.push_back(EmitScalarExpr(E->getArg(0)));
4570     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1h), Ops,
4571                         "vsha1h");
4572   case NEON::BI__builtin_neon_vsha1cq_u32:
4573     Ops.push_back(EmitScalarExpr(E->getArg(2)));
4574     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1c), Ops,
4575                         "vsha1h");
4576   case NEON::BI__builtin_neon_vsha1pq_u32:
4577     Ops.push_back(EmitScalarExpr(E->getArg(2)));
4578     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1p), Ops,
4579                         "vsha1h");
4580   case NEON::BI__builtin_neon_vsha1mq_u32:
4581     Ops.push_back(EmitScalarExpr(E->getArg(2)));
4582     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1m), Ops,
4583                         "vsha1h");
4584   }
4585 
4586   // Get the last argument, which specifies the vector type.
4587   llvm::APSInt Result;
4588   const Expr *Arg = E->getArg(E->getNumArgs()-1);
4589   if (!Arg->isIntegerConstantExpr(Result, getContext()))
4590     return 0;
4591 
4592   if (BuiltinID == ARM::BI__builtin_arm_vcvtr_f ||
4593       BuiltinID == ARM::BI__builtin_arm_vcvtr_d) {
4594     // Determine the overloaded type of this builtin.
4595     llvm::Type *Ty;
4596     if (BuiltinID == ARM::BI__builtin_arm_vcvtr_f)
4597       Ty = FloatTy;
4598     else
4599       Ty = DoubleTy;
4600 
4601     // Determine whether this is an unsigned conversion or not.
4602     bool usgn = Result.getZExtValue() == 1;
4603     unsigned Int = usgn ? Intrinsic::arm_vcvtru : Intrinsic::arm_vcvtr;
4604 
4605     // Call the appropriate intrinsic.
4606     Function *F = CGM.getIntrinsic(Int, Ty);
4607     return Builder.CreateCall(F, Ops, "vcvtr");
4608   }
4609 
4610   // Determine the type of this overloaded NEON intrinsic.
4611   NeonTypeFlags Type(Result.getZExtValue());
4612   bool usgn = Type.isUnsigned();
4613   bool rightShift = false;
4614 
4615   llvm::VectorType *VTy = GetNeonType(this, Type);
4616   llvm::Type *Ty = VTy;
4617   if (!Ty)
4618     return 0;
4619 
4620   // Many NEON builtins have identical semantics and uses in ARM and
4621   // AArch64. Emit these in a single function.
4622   llvm::ArrayRef<NeonIntrinsicInfo> IntrinsicMap(ARMSIMDIntrinsicMap);
4623   const NeonIntrinsicInfo *Builtin = findNeonIntrinsicInMap(
4624       IntrinsicMap, BuiltinID, NEONSIMDIntrinsicsProvenSorted);
4625   if (Builtin)
4626     return EmitCommonNeonBuiltinExpr(
4627         Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
4628         Builtin->NameHint, Builtin->TypeModifier, E, Ops, Align);
4629 
4630   unsigned Int;
4631   switch (BuiltinID) {
4632   default: return 0;
4633   case NEON::BI__builtin_neon_vld1q_lane_v:
4634     // Handle 64-bit integer elements as a special case.  Use shuffles of
4635     // one-element vectors to avoid poor code for i64 in the backend.
4636     if (VTy->getElementType()->isIntegerTy(64)) {
4637       // Extract the other lane.
4638       Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
4639       int Lane = cast<ConstantInt>(Ops[2])->getZExtValue();
4640       Value *SV = llvm::ConstantVector::get(ConstantInt::get(Int32Ty, 1-Lane));
4641       Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV);
4642       // Load the value as a one-element vector.
4643       Ty = llvm::VectorType::get(VTy->getElementType(), 1);
4644       Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld1, Ty);
4645       Value *Ld = Builder.CreateCall2(F, Ops[0], Align);
4646       // Combine them.
4647       SmallVector<Constant*, 2> Indices;
4648       Indices.push_back(ConstantInt::get(Int32Ty, 1-Lane));
4649       Indices.push_back(ConstantInt::get(Int32Ty, Lane));
4650       SV = llvm::ConstantVector::get(Indices);
4651       return Builder.CreateShuffleVector(Ops[1], Ld, SV, "vld1q_lane");
4652     }
4653     // fall through
4654   case NEON::BI__builtin_neon_vld1_lane_v: {
4655     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
4656     Ty = llvm::PointerType::getUnqual(VTy->getElementType());
4657     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
4658     LoadInst *Ld = Builder.CreateLoad(Ops[0]);
4659     Ld->setAlignment(cast<ConstantInt>(Align)->getZExtValue());
4660     return Builder.CreateInsertElement(Ops[1], Ld, Ops[2], "vld1_lane");
4661   }
4662   case NEON::BI__builtin_neon_vld2_dup_v:
4663   case NEON::BI__builtin_neon_vld3_dup_v:
4664   case NEON::BI__builtin_neon_vld4_dup_v: {
4665     // Handle 64-bit elements as a special-case.  There is no "dup" needed.
4666     if (VTy->getElementType()->getPrimitiveSizeInBits() == 64) {
4667       switch (BuiltinID) {
4668       case NEON::BI__builtin_neon_vld2_dup_v:
4669         Int = Intrinsic::arm_neon_vld2;
4670         break;
4671       case NEON::BI__builtin_neon_vld3_dup_v:
4672         Int = Intrinsic::arm_neon_vld3;
4673         break;
4674       case NEON::BI__builtin_neon_vld4_dup_v:
4675         Int = Intrinsic::arm_neon_vld4;
4676         break;
4677       default: llvm_unreachable("unknown vld_dup intrinsic?");
4678       }
4679       Function *F = CGM.getIntrinsic(Int, Ty);
4680       Ops[1] = Builder.CreateCall2(F, Ops[1], Align, "vld_dup");
4681       Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
4682       Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
4683       return Builder.CreateStore(Ops[1], Ops[0]);
4684     }
4685     switch (BuiltinID) {
4686     case NEON::BI__builtin_neon_vld2_dup_v:
4687       Int = Intrinsic::arm_neon_vld2lane;
4688       break;
4689     case NEON::BI__builtin_neon_vld3_dup_v:
4690       Int = Intrinsic::arm_neon_vld3lane;
4691       break;
4692     case NEON::BI__builtin_neon_vld4_dup_v:
4693       Int = Intrinsic::arm_neon_vld4lane;
4694       break;
4695     default: llvm_unreachable("unknown vld_dup intrinsic?");
4696     }
4697     Function *F = CGM.getIntrinsic(Int, Ty);
4698     llvm::StructType *STy = cast<llvm::StructType>(F->getReturnType());
4699 
4700     SmallVector<Value*, 6> Args;
4701     Args.push_back(Ops[1]);
4702     Args.append(STy->getNumElements(), UndefValue::get(Ty));
4703 
4704     llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
4705     Args.push_back(CI);
4706     Args.push_back(Align);
4707 
4708     Ops[1] = Builder.CreateCall(F, Args, "vld_dup");
4709     // splat lane 0 to all elts in each vector of the result.
4710     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
4711       Value *Val = Builder.CreateExtractValue(Ops[1], i);
4712       Value *Elt = Builder.CreateBitCast(Val, Ty);
4713       Elt = EmitNeonSplat(Elt, CI);
4714       Elt = Builder.CreateBitCast(Elt, Val->getType());
4715       Ops[1] = Builder.CreateInsertValue(Ops[1], Elt, i);
4716     }
4717     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
4718     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
4719     return Builder.CreateStore(Ops[1], Ops[0]);
4720   }
4721   case NEON::BI__builtin_neon_vqrshrn_n_v:
4722     Int =
4723       usgn ? Intrinsic::arm_neon_vqrshiftnu : Intrinsic::arm_neon_vqrshiftns;
4724     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n",
4725                         1, true);
4726   case NEON::BI__builtin_neon_vqrshrun_n_v:
4727     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqrshiftnsu, Ty),
4728                         Ops, "vqrshrun_n", 1, true);
4729   case NEON::BI__builtin_neon_vqshlu_n_v:
4730   case NEON::BI__builtin_neon_vqshluq_n_v:
4731     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqshiftsu, Ty),
4732                         Ops, "vqshlu", 1, false);
4733   case NEON::BI__builtin_neon_vqshrn_n_v:
4734     Int = usgn ? Intrinsic::arm_neon_vqshiftnu : Intrinsic::arm_neon_vqshiftns;
4735     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n",
4736                         1, true);
4737   case NEON::BI__builtin_neon_vqshrun_n_v:
4738     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqshiftnsu, Ty),
4739                         Ops, "vqshrun_n", 1, true);
4740   case NEON::BI__builtin_neon_vrecpe_v:
4741   case NEON::BI__builtin_neon_vrecpeq_v:
4742     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrecpe, Ty),
4743                         Ops, "vrecpe");
4744   case NEON::BI__builtin_neon_vrshrn_n_v:
4745     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrshiftn, Ty),
4746                         Ops, "vrshrn_n", 1, true);
4747   case NEON::BI__builtin_neon_vrshr_n_v:
4748   case NEON::BI__builtin_neon_vrshrq_n_v:
4749     Int = usgn ? Intrinsic::arm_neon_vrshiftu : Intrinsic::arm_neon_vrshifts;
4750     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshr_n", 1, true);
4751   case NEON::BI__builtin_neon_vrsra_n_v:
4752   case NEON::BI__builtin_neon_vrsraq_n_v:
4753     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
4754     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
4755     Ops[2] = EmitNeonShiftVector(Ops[2], Ty, true);
4756     Int = usgn ? Intrinsic::arm_neon_vrshiftu : Intrinsic::arm_neon_vrshifts;
4757     Ops[1] = Builder.CreateCall2(CGM.getIntrinsic(Int, Ty), Ops[1], Ops[2]);
4758     return Builder.CreateAdd(Ops[0], Ops[1], "vrsra_n");
4759   case NEON::BI__builtin_neon_vsri_n_v:
4760   case NEON::BI__builtin_neon_vsriq_n_v:
4761     rightShift = true;
4762   case NEON::BI__builtin_neon_vsli_n_v:
4763   case NEON::BI__builtin_neon_vsliq_n_v:
4764     Ops[2] = EmitNeonShiftVector(Ops[2], Ty, rightShift);
4765     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vshiftins, Ty),
4766                         Ops, "vsli_n");
4767   case NEON::BI__builtin_neon_vsra_n_v:
4768   case NEON::BI__builtin_neon_vsraq_n_v:
4769     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
4770     Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n");
4771     return Builder.CreateAdd(Ops[0], Ops[1]);
4772   case NEON::BI__builtin_neon_vst1q_lane_v:
4773     // Handle 64-bit integer elements as a special case.  Use a shuffle to get
4774     // a one-element vector and avoid poor code for i64 in the backend.
4775     if (VTy->getElementType()->isIntegerTy(64)) {
4776       Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
4777       Value *SV = llvm::ConstantVector::get(cast<llvm::Constant>(Ops[2]));
4778       Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV);
4779       Ops[2] = Align;
4780       return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst1,
4781                                                  Ops[1]->getType()), Ops);
4782     }
4783     // fall through
4784   case NEON::BI__builtin_neon_vst1_lane_v: {
4785     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
4786     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
4787     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
4788     StoreInst *St = Builder.CreateStore(Ops[1],
4789                                         Builder.CreateBitCast(Ops[0], Ty));
4790     St->setAlignment(cast<ConstantInt>(Align)->getZExtValue());
4791     return St;
4792   }
4793   case NEON::BI__builtin_neon_vtbl1_v:
4794     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl1),
4795                         Ops, "vtbl1");
4796   case NEON::BI__builtin_neon_vtbl2_v:
4797     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl2),
4798                         Ops, "vtbl2");
4799   case NEON::BI__builtin_neon_vtbl3_v:
4800     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl3),
4801                         Ops, "vtbl3");
4802   case NEON::BI__builtin_neon_vtbl4_v:
4803     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl4),
4804                         Ops, "vtbl4");
4805   case NEON::BI__builtin_neon_vtbx1_v:
4806     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx1),
4807                         Ops, "vtbx1");
4808   case NEON::BI__builtin_neon_vtbx2_v:
4809     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx2),
4810                         Ops, "vtbx2");
4811   case NEON::BI__builtin_neon_vtbx3_v:
4812     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx3),
4813                         Ops, "vtbx3");
4814   case NEON::BI__builtin_neon_vtbx4_v:
4815     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx4),
4816                         Ops, "vtbx4");
4817   }
4818 }
4819 
4820 static Value *EmitARM64TblBuiltinExpr(CodeGenFunction &CGF, unsigned BuiltinID,
4821                                       const CallExpr *E,
4822                                       SmallVectorImpl<Value *> &Ops) {
4823   unsigned int Int = 0;
4824   const char *s = NULL;
4825 
4826   unsigned TblPos;
4827   switch (BuiltinID) {
4828   default:
4829     return 0;
4830   case NEON::BI__builtin_neon_vtbl1_v:
4831   case NEON::BI__builtin_neon_vqtbl1_v:
4832   case NEON::BI__builtin_neon_vqtbl1q_v:
4833   case NEON::BI__builtin_neon_vtbl2_v:
4834   case NEON::BI__builtin_neon_vqtbl2_v:
4835   case NEON::BI__builtin_neon_vqtbl2q_v:
4836   case NEON::BI__builtin_neon_vtbl3_v:
4837   case NEON::BI__builtin_neon_vqtbl3_v:
4838   case NEON::BI__builtin_neon_vqtbl3q_v:
4839   case NEON::BI__builtin_neon_vtbl4_v:
4840   case NEON::BI__builtin_neon_vqtbl4_v:
4841   case NEON::BI__builtin_neon_vqtbl4q_v:
4842     TblPos = 0;
4843     break;
4844   case NEON::BI__builtin_neon_vtbx1_v:
4845   case NEON::BI__builtin_neon_vqtbx1_v:
4846   case NEON::BI__builtin_neon_vqtbx1q_v:
4847   case NEON::BI__builtin_neon_vtbx2_v:
4848   case NEON::BI__builtin_neon_vqtbx2_v:
4849   case NEON::BI__builtin_neon_vqtbx2q_v:
4850   case NEON::BI__builtin_neon_vtbx3_v:
4851   case NEON::BI__builtin_neon_vqtbx3_v:
4852   case NEON::BI__builtin_neon_vqtbx3q_v:
4853   case NEON::BI__builtin_neon_vtbx4_v:
4854   case NEON::BI__builtin_neon_vqtbx4_v:
4855   case NEON::BI__builtin_neon_vqtbx4q_v:
4856     TblPos = 1;
4857     break;
4858   }
4859 
4860   assert(E->getNumArgs() >= 3);
4861 
4862   // Get the last argument, which specifies the vector type.
4863   llvm::APSInt Result;
4864   const Expr *Arg = E->getArg(E->getNumArgs() - 1);
4865   if (!Arg->isIntegerConstantExpr(Result, CGF.getContext()))
4866     return 0;
4867 
4868   // Determine the type of this overloaded NEON intrinsic.
4869   NeonTypeFlags Type(Result.getZExtValue());
4870   llvm::VectorType *VTy = GetNeonType(&CGF, Type);
4871   llvm::Type *Ty = VTy;
4872   if (!Ty)
4873     return 0;
4874 
4875   Arg = E->getArg(TblPos);
4876   unsigned nElts = VTy->getNumElements();
4877 
4878   CodeGen::CGBuilderTy &Builder = CGF.Builder;
4879 
4880   // AArch64 scalar builtins are not overloaded, they do not have an extra
4881   // argument that specifies the vector type, need to handle each case.
4882   SmallVector<Value *, 2> TblOps;
4883   switch (BuiltinID) {
4884   case NEON::BI__builtin_neon_vtbl1_v: {
4885     TblOps.push_back(Ops[0]);
4886     return packTBLDVectorList(CGF, TblOps, 0, Ops[1], Ty,
4887                               Intrinsic::arm64_neon_tbl1, "vtbl1");
4888   }
4889   case NEON::BI__builtin_neon_vtbl2_v: {
4890     TblOps.push_back(Ops[0]);
4891     TblOps.push_back(Ops[1]);
4892     return packTBLDVectorList(CGF, TblOps, 0, Ops[2], Ty,
4893                               Intrinsic::arm64_neon_tbl1, "vtbl1");
4894   }
4895   case NEON::BI__builtin_neon_vtbl3_v: {
4896     TblOps.push_back(Ops[0]);
4897     TblOps.push_back(Ops[1]);
4898     TblOps.push_back(Ops[2]);
4899     return packTBLDVectorList(CGF, TblOps, 0, Ops[3], Ty,
4900                               Intrinsic::arm64_neon_tbl2, "vtbl2");
4901   }
4902   case NEON::BI__builtin_neon_vtbl4_v: {
4903     TblOps.push_back(Ops[0]);
4904     TblOps.push_back(Ops[1]);
4905     TblOps.push_back(Ops[2]);
4906     TblOps.push_back(Ops[3]);
4907     return packTBLDVectorList(CGF, TblOps, 0, Ops[4], Ty,
4908                               Intrinsic::arm64_neon_tbl2, "vtbl2");
4909   }
4910   case NEON::BI__builtin_neon_vtbx1_v: {
4911     TblOps.push_back(Ops[1]);
4912     Value *TblRes = packTBLDVectorList(CGF, TblOps, 0, Ops[2], Ty,
4913                                     Intrinsic::arm64_neon_tbl1, "vtbl1");
4914 
4915     llvm::Constant *Eight = ConstantInt::get(VTy->getElementType(), 8);
4916     Value* EightV = llvm::ConstantVector::getSplat(nElts, Eight);
4917     Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[2], EightV);
4918     CmpRes = Builder.CreateSExt(CmpRes, Ty);
4919 
4920     Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]);
4921     Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes);
4922     return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx");
4923   }
4924   case NEON::BI__builtin_neon_vtbx2_v: {
4925     TblOps.push_back(Ops[1]);
4926     TblOps.push_back(Ops[2]);
4927     return packTBLDVectorList(CGF, TblOps, Ops[0], Ops[3], Ty,
4928                               Intrinsic::arm64_neon_tbx1, "vtbx1");
4929   }
4930   case NEON::BI__builtin_neon_vtbx3_v: {
4931     TblOps.push_back(Ops[1]);
4932     TblOps.push_back(Ops[2]);
4933     TblOps.push_back(Ops[3]);
4934     Value *TblRes = packTBLDVectorList(CGF, TblOps, 0, Ops[4], Ty,
4935                                        Intrinsic::arm64_neon_tbl2, "vtbl2");
4936 
4937     llvm::Constant *TwentyFour = ConstantInt::get(VTy->getElementType(), 24);
4938     Value* TwentyFourV = llvm::ConstantVector::getSplat(nElts, TwentyFour);
4939     Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[4],
4940                                            TwentyFourV);
4941     CmpRes = Builder.CreateSExt(CmpRes, Ty);
4942 
4943     Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]);
4944     Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes);
4945     return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx");
4946   }
4947   case NEON::BI__builtin_neon_vtbx4_v: {
4948     TblOps.push_back(Ops[1]);
4949     TblOps.push_back(Ops[2]);
4950     TblOps.push_back(Ops[3]);
4951     TblOps.push_back(Ops[4]);
4952     return packTBLDVectorList(CGF, TblOps, Ops[0], Ops[5], Ty,
4953                               Intrinsic::arm64_neon_tbx2, "vtbx2");
4954   }
4955   case NEON::BI__builtin_neon_vqtbl1_v:
4956   case NEON::BI__builtin_neon_vqtbl1q_v:
4957     Int = Intrinsic::arm64_neon_tbl1; s = "vtbl1"; break;
4958   case NEON::BI__builtin_neon_vqtbl2_v:
4959   case NEON::BI__builtin_neon_vqtbl2q_v: {
4960     Int = Intrinsic::arm64_neon_tbl2; s = "vtbl2"; break;
4961   case NEON::BI__builtin_neon_vqtbl3_v:
4962   case NEON::BI__builtin_neon_vqtbl3q_v:
4963     Int = Intrinsic::arm64_neon_tbl3; s = "vtbl3"; break;
4964   case NEON::BI__builtin_neon_vqtbl4_v:
4965   case NEON::BI__builtin_neon_vqtbl4q_v:
4966     Int = Intrinsic::arm64_neon_tbl4; s = "vtbl4"; break;
4967   case NEON::BI__builtin_neon_vqtbx1_v:
4968   case NEON::BI__builtin_neon_vqtbx1q_v:
4969     Int = Intrinsic::arm64_neon_tbx1; s = "vtbx1"; break;
4970   case NEON::BI__builtin_neon_vqtbx2_v:
4971   case NEON::BI__builtin_neon_vqtbx2q_v:
4972     Int = Intrinsic::arm64_neon_tbx2; s = "vtbx2"; break;
4973   case NEON::BI__builtin_neon_vqtbx3_v:
4974   case NEON::BI__builtin_neon_vqtbx3q_v:
4975     Int = Intrinsic::arm64_neon_tbx3; s = "vtbx3"; break;
4976   case NEON::BI__builtin_neon_vqtbx4_v:
4977   case NEON::BI__builtin_neon_vqtbx4q_v:
4978     Int = Intrinsic::arm64_neon_tbx4; s = "vtbx4"; break;
4979   }
4980   }
4981 
4982   if (!Int)
4983     return 0;
4984 
4985   Function *F = CGF.CGM.getIntrinsic(Int, Ty);
4986   return CGF.EmitNeonCall(F, Ops, s);
4987 }
4988 
4989 Value *CodeGenFunction::vectorWrapScalar16(Value *Op) {
4990   llvm::Type *VTy = llvm::VectorType::get(Int16Ty, 4);
4991   Op = Builder.CreateBitCast(Op, Int16Ty);
4992   Value *V = UndefValue::get(VTy);
4993   llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
4994   Op = Builder.CreateInsertElement(V, Op, CI);
4995   return Op;
4996 }
4997 
4998 Value *CodeGenFunction::vectorWrapScalar8(Value *Op) {
4999   llvm::Type *VTy = llvm::VectorType::get(Int8Ty, 8);
5000   Op = Builder.CreateBitCast(Op, Int8Ty);
5001   Value *V = UndefValue::get(VTy);
5002   llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
5003   Op = Builder.CreateInsertElement(V, Op, CI);
5004   return Op;
5005 }
5006 
5007 Value *CodeGenFunction::
5008 emitVectorWrappedScalar8Intrinsic(unsigned Int, SmallVectorImpl<Value*> &Ops,
5009                                   const char *Name) {
5010   // i8 is not a legal types for ARM64, so we can't just use
5011   // a normal overloaed intrinsic call for these scalar types. Instead
5012   // we'll build 64-bit vectors w/ lane zero being our input values and
5013   // perform the operation on that. The back end can pattern match directly
5014   // to the scalar instruction.
5015   Ops[0] = vectorWrapScalar8(Ops[0]);
5016   Ops[1] = vectorWrapScalar8(Ops[1]);
5017   llvm::Type *VTy = llvm::VectorType::get(Int8Ty, 8);
5018   Value *V = EmitNeonCall(CGM.getIntrinsic(Int, VTy), Ops, Name);
5019   Constant *CI = ConstantInt::get(Int32Ty, 0);
5020   return Builder.CreateExtractElement(V, CI, "lane0");
5021 }
5022 
5023 Value *CodeGenFunction::
5024 emitVectorWrappedScalar16Intrinsic(unsigned Int, SmallVectorImpl<Value*> &Ops,
5025                                    const char *Name) {
5026   // i16 is not a legal types for ARM64, so we can't just use
5027   // a normal overloaed intrinsic call for these scalar types. Instead
5028   // we'll build 64-bit vectors w/ lane zero being our input values and
5029   // perform the operation on that. The back end can pattern match directly
5030   // to the scalar instruction.
5031   Ops[0] = vectorWrapScalar16(Ops[0]);
5032   Ops[1] = vectorWrapScalar16(Ops[1]);
5033   llvm::Type *VTy = llvm::VectorType::get(Int16Ty, 4);
5034   Value *V = EmitNeonCall(CGM.getIntrinsic(Int, VTy), Ops, Name);
5035   Constant *CI = ConstantInt::get(Int32Ty, 0);
5036   return Builder.CreateExtractElement(V, CI, "lane0");
5037 }
5038 
5039 Value *CodeGenFunction::EmitARM64BuiltinExpr(unsigned BuiltinID,
5040                                              const CallExpr *E) {
5041   if (BuiltinID == ARM64::BI__clear_cache) {
5042     assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
5043     const FunctionDecl *FD = E->getDirectCallee();
5044     SmallVector<Value*, 2> Ops;
5045     for (unsigned i = 0; i < 2; i++)
5046       Ops.push_back(EmitScalarExpr(E->getArg(i)));
5047     llvm::Type *Ty = CGM.getTypes().ConvertType(FD->getType());
5048     llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
5049     StringRef Name = FD->getName();
5050     return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops);
5051   }
5052 
5053   if (BuiltinID == ARM64::BI__builtin_arm_ldrex &&
5054       getContext().getTypeSize(E->getType()) == 128) {
5055     Function *F = CGM.getIntrinsic(Intrinsic::arm64_ldxp);
5056 
5057     Value *LdPtr = EmitScalarExpr(E->getArg(0));
5058     Value *Val = Builder.CreateCall(F, Builder.CreateBitCast(LdPtr, Int8PtrTy),
5059                                     "ldxp");
5060 
5061     Value *Val0 = Builder.CreateExtractValue(Val, 1);
5062     Value *Val1 = Builder.CreateExtractValue(Val, 0);
5063     llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
5064     Val0 = Builder.CreateZExt(Val0, Int128Ty);
5065     Val1 = Builder.CreateZExt(Val1, Int128Ty);
5066 
5067     Value *ShiftCst = llvm::ConstantInt::get(Int128Ty, 64);
5068     Val = Builder.CreateShl(Val0, ShiftCst, "shl", true /* nuw */);
5069     Val = Builder.CreateOr(Val, Val1);
5070     return Builder.CreateBitCast(Val, ConvertType(E->getType()));
5071   } else if (BuiltinID == ARM64::BI__builtin_arm_ldrex) {
5072     Value *LoadAddr = EmitScalarExpr(E->getArg(0));
5073 
5074     QualType Ty = E->getType();
5075     llvm::Type *RealResTy = ConvertType(Ty);
5076     llvm::Type *IntResTy = llvm::IntegerType::get(getLLVMContext(),
5077                                                   getContext().getTypeSize(Ty));
5078     LoadAddr = Builder.CreateBitCast(LoadAddr, IntResTy->getPointerTo());
5079 
5080     Function *F = CGM.getIntrinsic(Intrinsic::arm64_ldxr, LoadAddr->getType());
5081     Value *Val = Builder.CreateCall(F, LoadAddr, "ldxr");
5082 
5083     if (RealResTy->isPointerTy())
5084       return Builder.CreateIntToPtr(Val, RealResTy);
5085 
5086     Val = Builder.CreateTruncOrBitCast(Val, IntResTy);
5087     return Builder.CreateBitCast(Val, RealResTy);
5088   }
5089 
5090   if (BuiltinID == ARM64::BI__builtin_arm_strex &&
5091       getContext().getTypeSize(E->getArg(0)->getType()) == 128) {
5092     Function *F = CGM.getIntrinsic(Intrinsic::arm64_stxp);
5093     llvm::Type *STy = llvm::StructType::get(Int64Ty, Int64Ty, NULL);
5094 
5095     Value *One = llvm::ConstantInt::get(Int32Ty, 1);
5096     Value *Tmp = Builder.CreateAlloca(ConvertType(E->getArg(0)->getType()),
5097                                       One);
5098     Value *Val = EmitScalarExpr(E->getArg(0));
5099     Builder.CreateStore(Val, Tmp);
5100 
5101     Value *LdPtr = Builder.CreateBitCast(Tmp,llvm::PointerType::getUnqual(STy));
5102     Val = Builder.CreateLoad(LdPtr);
5103 
5104     Value *Arg0 = Builder.CreateExtractValue(Val, 0);
5105     Value *Arg1 = Builder.CreateExtractValue(Val, 1);
5106     Value *StPtr = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)),
5107                                          Int8PtrTy);
5108     return Builder.CreateCall3(F, Arg0, Arg1, StPtr, "stxp");
5109   } else if (BuiltinID == ARM64::BI__builtin_arm_strex) {
5110     Value *StoreVal = EmitScalarExpr(E->getArg(0));
5111     Value *StoreAddr = EmitScalarExpr(E->getArg(1));
5112 
5113     QualType Ty = E->getArg(0)->getType();
5114     llvm::Type *StoreTy = llvm::IntegerType::get(getLLVMContext(),
5115                                                  getContext().getTypeSize(Ty));
5116     StoreAddr = Builder.CreateBitCast(StoreAddr, StoreTy->getPointerTo());
5117 
5118     if (StoreVal->getType()->isPointerTy())
5119       StoreVal = Builder.CreatePtrToInt(StoreVal, Int64Ty);
5120     else {
5121       StoreVal = Builder.CreateBitCast(StoreVal, StoreTy);
5122       StoreVal = Builder.CreateZExtOrBitCast(StoreVal, Int64Ty);
5123     }
5124 
5125     Function *F = CGM.getIntrinsic(Intrinsic::arm64_stxr, StoreAddr->getType());
5126     return Builder.CreateCall2(F, StoreVal, StoreAddr, "stxr");
5127   }
5128 
5129   if (BuiltinID == ARM64::BI__builtin_arm_clrex) {
5130     Function *F = CGM.getIntrinsic(Intrinsic::arm64_clrex);
5131     return Builder.CreateCall(F);
5132   }
5133 
5134   // CRC32
5135   Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic;
5136   switch (BuiltinID) {
5137   case ARM64::BI__builtin_arm_crc32b:
5138     CRCIntrinsicID = Intrinsic::arm64_crc32b; break;
5139   case ARM64::BI__builtin_arm_crc32cb:
5140     CRCIntrinsicID = Intrinsic::arm64_crc32cb; break;
5141   case ARM64::BI__builtin_arm_crc32h:
5142     CRCIntrinsicID = Intrinsic::arm64_crc32h; break;
5143   case ARM64::BI__builtin_arm_crc32ch:
5144     CRCIntrinsicID = Intrinsic::arm64_crc32ch; break;
5145   case ARM64::BI__builtin_arm_crc32w:
5146     CRCIntrinsicID = Intrinsic::arm64_crc32w; break;
5147   case ARM64::BI__builtin_arm_crc32cw:
5148     CRCIntrinsicID = Intrinsic::arm64_crc32cw; break;
5149   case ARM64::BI__builtin_arm_crc32d:
5150     CRCIntrinsicID = Intrinsic::arm64_crc32x; break;
5151   case ARM64::BI__builtin_arm_crc32cd:
5152     CRCIntrinsicID = Intrinsic::arm64_crc32cx; break;
5153   }
5154 
5155   if (CRCIntrinsicID != Intrinsic::not_intrinsic) {
5156     Value *Arg0 = EmitScalarExpr(E->getArg(0));
5157     Value *Arg1 = EmitScalarExpr(E->getArg(1));
5158     Function *F = CGM.getIntrinsic(CRCIntrinsicID);
5159 
5160     llvm::Type *DataTy = F->getFunctionType()->getParamType(1);
5161     Arg1 = Builder.CreateZExtOrBitCast(Arg1, DataTy);
5162 
5163     return Builder.CreateCall2(F, Arg0, Arg1);
5164   }
5165 
5166   llvm::SmallVector<Value*, 4> Ops;
5167   for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++)
5168     Ops.push_back(EmitScalarExpr(E->getArg(i)));
5169 
5170   llvm::ArrayRef<NeonIntrinsicInfo> SISDMap(ARM64SISDIntrinsicMap);
5171   const NeonIntrinsicInfo *Builtin = findNeonIntrinsicInMap(
5172       SISDMap, BuiltinID, ARM64SISDIntrinsicsProvenSorted);
5173 
5174   if (Builtin) {
5175     Ops.push_back(EmitScalarExpr(E->getArg(E->getNumArgs() - 1)));
5176     Value *Result = EmitCommonNeonSISDBuiltinExpr(*this, *Builtin, Ops, E);
5177     assert(Result && "SISD intrinsic should have been handled");
5178     return Result;
5179   }
5180 
5181   llvm::APSInt Result;
5182   const Expr *Arg = E->getArg(E->getNumArgs()-1);
5183   NeonTypeFlags Type(0);
5184   if (Arg->isIntegerConstantExpr(Result, getContext()))
5185     // Determine the type of this overloaded NEON intrinsic.
5186     Type = NeonTypeFlags(Result.getZExtValue());
5187 
5188   bool usgn = Type.isUnsigned();
5189   bool quad = Type.isQuad();
5190 
5191   // Handle non-overloaded intrinsics first.
5192   switch (BuiltinID) {
5193   default: break;
5194   case NEON::BI__builtin_neon_vldrq_p128: {
5195     llvm::Type *Int128PTy = llvm::Type::getIntNPtrTy(getLLVMContext(), 128);
5196     Value *Ptr = Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), Int128PTy);
5197     return Builder.CreateLoad(Ptr);
5198   }
5199   case NEON::BI__builtin_neon_vstrq_p128: {
5200     llvm::Type *Int128PTy = llvm::Type::getIntNPtrTy(getLLVMContext(), 128);
5201     Value *Ptr = Builder.CreateBitCast(Ops[0], Int128PTy);
5202     return Builder.CreateStore(EmitScalarExpr(E->getArg(1)), Ptr);
5203   }
5204   case NEON::BI__builtin_neon_vcvts_u32_f32:
5205   case NEON::BI__builtin_neon_vcvtd_u64_f64:
5206     usgn = true;
5207     // FALL THROUGH
5208   case NEON::BI__builtin_neon_vcvts_s32_f32:
5209   case NEON::BI__builtin_neon_vcvtd_s64_f64: {
5210     Ops.push_back(EmitScalarExpr(E->getArg(0)));
5211     bool Is64 = Ops[0]->getType()->getPrimitiveSizeInBits() == 64;
5212     llvm::Type *InTy = Is64 ? Int64Ty : Int32Ty;
5213     llvm::Type *FTy = Is64 ? DoubleTy : FloatTy;
5214     Ops[0] = Builder.CreateBitCast(Ops[0], FTy);
5215     if (usgn)
5216       return Builder.CreateFPToUI(Ops[0], InTy);
5217     return Builder.CreateFPToSI(Ops[0], InTy);
5218   }
5219   case NEON::BI__builtin_neon_vcvts_f32_u32:
5220   case NEON::BI__builtin_neon_vcvtd_f64_u64:
5221     usgn = true;
5222     // FALL THROUGH
5223   case NEON::BI__builtin_neon_vcvts_f32_s32:
5224   case NEON::BI__builtin_neon_vcvtd_f64_s64: {
5225     Ops.push_back(EmitScalarExpr(E->getArg(0)));
5226     bool Is64 = Ops[0]->getType()->getPrimitiveSizeInBits() == 64;
5227     llvm::Type *InTy = Is64 ? Int64Ty : Int32Ty;
5228     llvm::Type *FTy = Is64 ? DoubleTy : FloatTy;
5229     Ops[0] = Builder.CreateBitCast(Ops[0], InTy);
5230     if (usgn)
5231       return Builder.CreateUIToFP(Ops[0], FTy);
5232     return Builder.CreateSIToFP(Ops[0], FTy);
5233   }
5234   case NEON::BI__builtin_neon_vpaddd_s64: {
5235     llvm::Type *Ty =
5236       llvm::VectorType::get(llvm::Type::getInt64Ty(getLLVMContext()), 2);
5237     Value *Vec = EmitScalarExpr(E->getArg(0));
    // The vector is v2i64, so make sure it's bitcast to that.
5239     Vec = Builder.CreateBitCast(Vec, Ty, "v2i64");
5240     llvm::Value *Idx0 = llvm::ConstantInt::get(Int32Ty, 0);
5241     llvm::Value *Idx1 = llvm::ConstantInt::get(Int32Ty, 1);
5242     Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
5243     Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
    // Pairwise addition of a v2i64 into a scalar i64.
5245     return Builder.CreateAdd(Op0, Op1, "vpaddd");
5246   }
5247   case NEON::BI__builtin_neon_vpaddd_f64: {
5248     llvm::Type *Ty =
5249       llvm::VectorType::get(llvm::Type::getDoubleTy(getLLVMContext()), 2);
5250     Value *Vec = EmitScalarExpr(E->getArg(0));
5251     // The vector is v2f64, so make sure it's bitcast to that.
5252     Vec = Builder.CreateBitCast(Vec, Ty, "v2f64");
5253     llvm::Value *Idx0 = llvm::ConstantInt::get(Int32Ty, 0);
5254     llvm::Value *Idx1 = llvm::ConstantInt::get(Int32Ty, 1);
5255     Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
5256     Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
5257     // Pairwise addition of a v2f64 into a scalar f64.
5258     return Builder.CreateFAdd(Op0, Op1, "vpaddd");
5259   }
5260   case NEON::BI__builtin_neon_vpadds_f32: {
5261     llvm::Type *Ty =
5262       llvm::VectorType::get(llvm::Type::getFloatTy(getLLVMContext()), 2);
5263     Value *Vec = EmitScalarExpr(E->getArg(0));
5264     // The vector is v2f32, so make sure it's bitcast to that.
5265     Vec = Builder.CreateBitCast(Vec, Ty, "v2f32");
5266     llvm::Value *Idx0 = llvm::ConstantInt::get(Int32Ty, 0);
5267     llvm::Value *Idx1 = llvm::ConstantInt::get(Int32Ty, 1);
5268     Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
5269     Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
5270     // Pairwise addition of a v2f32 into a scalar f32.
5271     return Builder.CreateFAdd(Op0, Op1, "vpaddd");
5272   }
5273   case NEON::BI__builtin_neon_vceqzd_s64:
5274   case NEON::BI__builtin_neon_vceqzd_f64:
5275   case NEON::BI__builtin_neon_vceqzs_f32:
5276     Ops.push_back(EmitScalarExpr(E->getArg(0)));
5277     return EmitAArch64CompareBuiltinExpr(
5278         Ops[0], ConvertType(E->getCallReturnType()), ICmpInst::FCMP_OEQ,
5279         ICmpInst::ICMP_EQ, "vceqz");
5280   case NEON::BI__builtin_neon_vcgezd_s64:
5281   case NEON::BI__builtin_neon_vcgezd_f64:
5282   case NEON::BI__builtin_neon_vcgezs_f32:
5283     Ops.push_back(EmitScalarExpr(E->getArg(0)));
5284     return EmitAArch64CompareBuiltinExpr(
5285         Ops[0], ConvertType(E->getCallReturnType()), ICmpInst::FCMP_OGE,
5286         ICmpInst::ICMP_SGE, "vcgez");
5287   case NEON::BI__builtin_neon_vclezd_s64:
5288   case NEON::BI__builtin_neon_vclezd_f64:
5289   case NEON::BI__builtin_neon_vclezs_f32:
5290     Ops.push_back(EmitScalarExpr(E->getArg(0)));
5291     return EmitAArch64CompareBuiltinExpr(
5292         Ops[0], ConvertType(E->getCallReturnType()), ICmpInst::FCMP_OLE,
5293         ICmpInst::ICMP_SLE, "vclez");
5294   case NEON::BI__builtin_neon_vcgtzd_s64:
5295   case NEON::BI__builtin_neon_vcgtzd_f64:
5296   case NEON::BI__builtin_neon_vcgtzs_f32:
5297     Ops.push_back(EmitScalarExpr(E->getArg(0)));
5298     return EmitAArch64CompareBuiltinExpr(
5299         Ops[0], ConvertType(E->getCallReturnType()), ICmpInst::FCMP_OGT,
5300         ICmpInst::ICMP_SGT, "vcgtz");
5301   case NEON::BI__builtin_neon_vcltzd_s64:
5302   case NEON::BI__builtin_neon_vcltzd_f64:
5303   case NEON::BI__builtin_neon_vcltzs_f32:
5304     Ops.push_back(EmitScalarExpr(E->getArg(0)));
5305     return EmitAArch64CompareBuiltinExpr(
5306         Ops[0], ConvertType(E->getCallReturnType()), ICmpInst::FCMP_OLT,
5307         ICmpInst::ICMP_SLT, "vcltz");
5308 
5309   case NEON::BI__builtin_neon_vceqzd_u64: {
5310     llvm::Type *Ty = llvm::Type::getInt64Ty(getLLVMContext());
5311     Ops.push_back(EmitScalarExpr(E->getArg(0)));
5312     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
5313     Ops[0] = Builder.CreateICmp(llvm::ICmpInst::ICMP_EQ, Ops[0],
5314                                 llvm::Constant::getNullValue(Ty));
5315     return Builder.CreateSExt(Ops[0], Ty, "vceqzd");
5316   }
5317   case NEON::BI__builtin_neon_vceqd_f64:
5318   case NEON::BI__builtin_neon_vcled_f64:
5319   case NEON::BI__builtin_neon_vcltd_f64:
5320   case NEON::BI__builtin_neon_vcged_f64:
5321   case NEON::BI__builtin_neon_vcgtd_f64: {
5322     llvm::CmpInst::Predicate P;
5323     switch (BuiltinID) {
5324     default: llvm_unreachable("missing builtin ID in switch!");
5325     case NEON::BI__builtin_neon_vceqd_f64: P = llvm::FCmpInst::FCMP_OEQ; break;
5326     case NEON::BI__builtin_neon_vcled_f64: P = llvm::FCmpInst::FCMP_OLE; break;
5327     case NEON::BI__builtin_neon_vcltd_f64: P = llvm::FCmpInst::FCMP_OLT; break;
5328     case NEON::BI__builtin_neon_vcged_f64: P = llvm::FCmpInst::FCMP_OGE; break;
5329     case NEON::BI__builtin_neon_vcgtd_f64: P = llvm::FCmpInst::FCMP_OGT; break;
5330     }
5331     Ops.push_back(EmitScalarExpr(E->getArg(1)));
5332     Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
5333     Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy);
5334     Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
5335     return Builder.CreateSExt(Ops[0], Int64Ty, "vcmpd");
5336   }
5337   case NEON::BI__builtin_neon_vceqs_f32:
5338   case NEON::BI__builtin_neon_vcles_f32:
5339   case NEON::BI__builtin_neon_vclts_f32:
5340   case NEON::BI__builtin_neon_vcges_f32:
5341   case NEON::BI__builtin_neon_vcgts_f32: {
5342     llvm::CmpInst::Predicate P;
5343     switch (BuiltinID) {
5344     default: llvm_unreachable("missing builtin ID in switch!");
5345     case NEON::BI__builtin_neon_vceqs_f32: P = llvm::FCmpInst::FCMP_OEQ; break;
5346     case NEON::BI__builtin_neon_vcles_f32: P = llvm::FCmpInst::FCMP_OLE; break;
5347     case NEON::BI__builtin_neon_vclts_f32: P = llvm::FCmpInst::FCMP_OLT; break;
5348     case NEON::BI__builtin_neon_vcges_f32: P = llvm::FCmpInst::FCMP_OGE; break;
5349     case NEON::BI__builtin_neon_vcgts_f32: P = llvm::FCmpInst::FCMP_OGT; break;
5350     }
5351     Ops.push_back(EmitScalarExpr(E->getArg(1)));
5352     Ops[0] = Builder.CreateBitCast(Ops[0], FloatTy);
5353     Ops[1] = Builder.CreateBitCast(Ops[1], FloatTy);
5354     Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
5355     return Builder.CreateSExt(Ops[0], Int32Ty, "vcmpd");
5356   }
5357   case NEON::BI__builtin_neon_vceqd_s64:
5358   case NEON::BI__builtin_neon_vceqd_u64:
5359   case NEON::BI__builtin_neon_vcgtd_s64:
5360   case NEON::BI__builtin_neon_vcgtd_u64:
5361   case NEON::BI__builtin_neon_vcltd_s64:
5362   case NEON::BI__builtin_neon_vcltd_u64:
5363   case NEON::BI__builtin_neon_vcged_u64:
5364   case NEON::BI__builtin_neon_vcged_s64:
5365   case NEON::BI__builtin_neon_vcled_u64:
5366   case NEON::BI__builtin_neon_vcled_s64: {
5367     llvm::CmpInst::Predicate P;
5368     switch (BuiltinID) {
5369     default: llvm_unreachable("missing builtin ID in switch!");
5370     case NEON::BI__builtin_neon_vceqd_s64:
5371     case NEON::BI__builtin_neon_vceqd_u64:P = llvm::ICmpInst::ICMP_EQ;break;
5372     case NEON::BI__builtin_neon_vcgtd_s64:P = llvm::ICmpInst::ICMP_SGT;break;
5373     case NEON::BI__builtin_neon_vcgtd_u64:P = llvm::ICmpInst::ICMP_UGT;break;
5374     case NEON::BI__builtin_neon_vcltd_s64:P = llvm::ICmpInst::ICMP_SLT;break;
5375     case NEON::BI__builtin_neon_vcltd_u64:P = llvm::ICmpInst::ICMP_ULT;break;
5376     case NEON::BI__builtin_neon_vcged_u64:P = llvm::ICmpInst::ICMP_UGE;break;
5377     case NEON::BI__builtin_neon_vcged_s64:P = llvm::ICmpInst::ICMP_SGE;break;
5378     case NEON::BI__builtin_neon_vcled_u64:P = llvm::ICmpInst::ICMP_ULE;break;
5379     case NEON::BI__builtin_neon_vcled_s64:P = llvm::ICmpInst::ICMP_SLE;break;
5380     }
5381     Ops.push_back(EmitScalarExpr(E->getArg(1)));
5382     Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
5383     Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
5384     Ops[0] = Builder.CreateICmp(P, Ops[0], Ops[1]);
5385     return Builder.CreateSExt(Ops[0], Int64Ty, "vceqd");
5386   }
5387   case NEON::BI__builtin_neon_vtstd_s64:
5388   case NEON::BI__builtin_neon_vtstd_u64: {
5389     llvm::Type *Ty = llvm::Type::getInt64Ty(getLLVMContext());
5390     Ops.push_back(EmitScalarExpr(E->getArg(1)));
5391     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
5392     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
5393     Ops[0] = Builder.CreateAnd(Ops[0], Ops[1]);
5394     Ops[0] = Builder.CreateICmp(ICmpInst::ICMP_NE, Ops[0],
5395                                 llvm::Constant::getNullValue(Ty));
5396     return Builder.CreateSExt(Ops[0], Ty, "vtstd");
5397   }
5398   case NEON::BI__builtin_neon_vset_lane_i8:
5399   case NEON::BI__builtin_neon_vset_lane_i16:
5400   case NEON::BI__builtin_neon_vset_lane_i32:
5401   case NEON::BI__builtin_neon_vset_lane_i64:
5402   case NEON::BI__builtin_neon_vset_lane_f32:
5403   case NEON::BI__builtin_neon_vsetq_lane_i8:
5404   case NEON::BI__builtin_neon_vsetq_lane_i16:
5405   case NEON::BI__builtin_neon_vsetq_lane_i32:
5406   case NEON::BI__builtin_neon_vsetq_lane_i64:
5407   case NEON::BI__builtin_neon_vsetq_lane_f32:
5408     Ops.push_back(EmitScalarExpr(E->getArg(2)));
5409     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
5410   case NEON::BI__builtin_neon_vset_lane_f64:
5411     // The vector type needs a cast for the v1f64 variant.
5412     Ops[1] = Builder.CreateBitCast(Ops[1],
5413                                    llvm::VectorType::get(DoubleTy, 1));
5414     Ops.push_back(EmitScalarExpr(E->getArg(2)));
5415     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
5416   case NEON::BI__builtin_neon_vsetq_lane_f64:
5417     // The vector type needs a cast for the v2f64 variant.
5418     Ops[1] = Builder.CreateBitCast(Ops[1],
5419         llvm::VectorType::get(llvm::Type::getDoubleTy(getLLVMContext()), 2));
5420     Ops.push_back(EmitScalarExpr(E->getArg(2)));
5421     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
5422 
5423   case NEON::BI__builtin_neon_vget_lane_i8:
5424   case NEON::BI__builtin_neon_vdupb_lane_i8:
5425     Ops[0] = Builder.CreateBitCast(Ops[0],
5426         llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8));
5427     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
5428                                         "vget_lane");
5429   case NEON::BI__builtin_neon_vgetq_lane_i8:
5430   case NEON::BI__builtin_neon_vdupb_laneq_i8:
5431     Ops[0] = Builder.CreateBitCast(Ops[0],
5432         llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16));
5433     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
5434                                         "vgetq_lane");
5435   case NEON::BI__builtin_neon_vget_lane_i16:
5436   case NEON::BI__builtin_neon_vduph_lane_i16:
5437     Ops[0] = Builder.CreateBitCast(Ops[0],
5438         llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4));
5439     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
5440                                         "vget_lane");
5441   case NEON::BI__builtin_neon_vgetq_lane_i16:
5442   case NEON::BI__builtin_neon_vduph_laneq_i16:
5443     Ops[0] = Builder.CreateBitCast(Ops[0],
5444         llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8));
5445     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
5446                                         "vgetq_lane");
5447   case NEON::BI__builtin_neon_vget_lane_i32:
5448   case NEON::BI__builtin_neon_vdups_lane_i32:
5449     Ops[0] = Builder.CreateBitCast(
5450         Ops[0],
5451         llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 32), 2));
5452     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
5453                                         "vget_lane");
5454   case NEON::BI__builtin_neon_vdups_lane_f32:
5455     Ops[0] = Builder.CreateBitCast(Ops[0],
5456         llvm::VectorType::get(llvm::Type::getFloatTy(getLLVMContext()), 2));
5457     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
5458                                         "vdups_lane");
5459   case NEON::BI__builtin_neon_vgetq_lane_i32:
5460   case NEON::BI__builtin_neon_vdups_laneq_i32:
5461     Ops[0] = Builder.CreateBitCast(Ops[0],
5462         llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 32), 4));
5463     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
5464                                         "vgetq_lane");
5465   case NEON::BI__builtin_neon_vget_lane_i64:
5466   case NEON::BI__builtin_neon_vdupd_lane_i64:
5467     Ops[0] = Builder.CreateBitCast(Ops[0],
5468         llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 64), 1));
5469     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
5470                                         "vget_lane");
5471   case NEON::BI__builtin_neon_vdupd_lane_f64:
5472     Ops[0] = Builder.CreateBitCast(Ops[0],
5473         llvm::VectorType::get(llvm::Type::getDoubleTy(getLLVMContext()), 1));
5474     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
5475                                         "vdupd_lane");
5476   case NEON::BI__builtin_neon_vgetq_lane_i64:
5477   case NEON::BI__builtin_neon_vdupd_laneq_i64:
5478     Ops[0] = Builder.CreateBitCast(Ops[0],
5479         llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 64), 2));
5480     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
5481                                         "vgetq_lane");
5482   case NEON::BI__builtin_neon_vget_lane_f32:
5483     Ops[0] = Builder.CreateBitCast(Ops[0],
5484         llvm::VectorType::get(llvm::Type::getFloatTy(getLLVMContext()), 2));
5485     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
5486                                         "vget_lane");
5487   case NEON::BI__builtin_neon_vget_lane_f64:
5488     Ops[0] = Builder.CreateBitCast(Ops[0],
5489         llvm::VectorType::get(llvm::Type::getDoubleTy(getLLVMContext()), 1));
5490     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
5491                                         "vget_lane");
5492   case NEON::BI__builtin_neon_vgetq_lane_f32:
5493   case NEON::BI__builtin_neon_vdups_laneq_f32:
5494     Ops[0] = Builder.CreateBitCast(Ops[0],
5495         llvm::VectorType::get(llvm::Type::getFloatTy(getLLVMContext()), 4));
5496     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
5497                                         "vgetq_lane");
5498   case NEON::BI__builtin_neon_vgetq_lane_f64:
5499   case NEON::BI__builtin_neon_vdupd_laneq_f64:
5500     Ops[0] = Builder.CreateBitCast(Ops[0],
5501         llvm::VectorType::get(llvm::Type::getDoubleTy(getLLVMContext()), 2));
5502     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
5503                                         "vgetq_lane");
5504   case NEON::BI__builtin_neon_vaddd_s64:
5505   case NEON::BI__builtin_neon_vaddd_u64:
5506     return Builder.CreateAdd(Ops[0], EmitScalarExpr(E->getArg(1)), "vaddd");
5507   case NEON::BI__builtin_neon_vsubd_s64:
5508   case NEON::BI__builtin_neon_vsubd_u64:
5509     return Builder.CreateSub(Ops[0], EmitScalarExpr(E->getArg(1)), "vsubd");
5510   case NEON::BI__builtin_neon_vqdmlalh_s16:
5511   case NEON::BI__builtin_neon_vqdmlslh_s16: {
5512     SmallVector<Value *, 2> ProductOps;
5513     ProductOps.push_back(vectorWrapScalar16(Ops[1]));
5514     ProductOps.push_back(vectorWrapScalar16(EmitScalarExpr(E->getArg(2))));
5515     llvm::Type *VTy = llvm::VectorType::get(Int32Ty, 4);
5516     Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_sqdmull, VTy),
5517                           ProductOps, "vqdmlXl");
5518     Constant *CI = ConstantInt::get(Int32Ty, 0);
5519     Ops[1] = Builder.CreateExtractElement(Ops[1], CI, "lane0");
5520 
5521     unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlalh_s16
5522                                         ? Intrinsic::arm64_neon_sqadd
5523                                         : Intrinsic::arm64_neon_sqsub;
5524     return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int32Ty), Ops, "vqdmlXl");
5525   }
5526   case NEON::BI__builtin_neon_vqshlud_n_s64: {
5527     Ops.push_back(EmitScalarExpr(E->getArg(1)));
5528     Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty);
5529     llvm::Type *VTy = llvm::VectorType::get(Int64Ty, 1);
5530     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_sqshlu, VTy),
5531                           Ops, "vqshlu_n");
5532     return Builder.CreateBitCast(Ops[0], Int64Ty);
5533   }
5534   case NEON::BI__builtin_neon_vqshld_n_u64:
5535   case NEON::BI__builtin_neon_vqshld_n_s64: {
5536     unsigned Int = BuiltinID == NEON::BI__builtin_neon_vqshld_n_u64
5537                                    ? Intrinsic::arm64_neon_uqshl
5538                                    : Intrinsic::arm64_neon_sqshl;
5539     Ops.push_back(EmitScalarExpr(E->getArg(1)));
5540     Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty);
5541     llvm::Type *VTy = llvm::VectorType::get(Int64Ty, 1);
5542     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, VTy), Ops, "vqshl_n");
5543     return Builder.CreateBitCast(Ops[0], Int64Ty);
5544   }
5545   case NEON::BI__builtin_neon_vrshrd_n_u64:
5546   case NEON::BI__builtin_neon_vrshrd_n_s64: {
5547     unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrshrd_n_u64
5548                                    ? Intrinsic::arm64_neon_urshl
5549                                    : Intrinsic::arm64_neon_srshl;
5550     Ops.push_back(EmitScalarExpr(E->getArg(1)));
5551     llvm::Type *VTy = llvm::VectorType::get(Int64Ty, 1);
5552     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, VTy), Ops, "vrshr_n", 1, true);
5553     return Builder.CreateBitCast(Ops[0], Int64Ty);
5554   }
5555   case NEON::BI__builtin_neon_vrsrad_n_u64:
5556   case NEON::BI__builtin_neon_vrsrad_n_s64: {
5557     unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrsrad_n_u64
5558                                    ? Intrinsic::arm64_neon_urshl
5559                                    : Intrinsic::arm64_neon_srshl;
5560     Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
5561     Ops.push_back(Builder.CreateNeg(EmitScalarExpr(E->getArg(2))));
5562     Ops[1] = Builder.CreateCall2(CGM.getIntrinsic(Int, Int64Ty), Ops[1],
5563                                  Builder.CreateSExt(Ops[2], Int64Ty));
5564     return Builder.CreateAdd(Ops[0], Builder.CreateBitCast(Ops[1], Int64Ty));
5565   }
5566   case NEON::BI__builtin_neon_vshld_n_s64:
5567   case NEON::BI__builtin_neon_vshld_n_u64: {
5568     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
5569     return Builder.CreateShl(
5570         Ops[0], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
5571                                                    Amt->getZExtValue())),
5572         "vshr_n");
5573   }
5574   case NEON::BI__builtin_neon_vshrd_n_s64: {
5575     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
5576     return Builder.CreateAShr(
5577         Ops[0], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
5578                                                    Amt->getZExtValue())),
5579         "vshr_n");
5580   }
5581   case NEON::BI__builtin_neon_vshrd_n_u64: {
5582     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
5583     return Builder.CreateLShr(
5584         Ops[0], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
5585                                                    Amt->getZExtValue())),
5586         "vshr_n");
5587   }
5588   case NEON::BI__builtin_neon_vsrad_n_s64: {
5589     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(2)));
5590     Ops[1] = Builder.CreateAShr(
5591         Ops[1], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
5592                                                    Amt->getZExtValue())),
5593         "vshr_n");
5594     return Builder.CreateAdd(Ops[0], Ops[1]);
5595   }
5596   case NEON::BI__builtin_neon_vsrad_n_u64: {
5597     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(2)));
5598     Ops[1] = Builder.CreateLShr(
5599         Ops[1], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
5600                                                    Amt->getZExtValue())),
5601         "vshr_n");
5602     return Builder.CreateAdd(Ops[0], Ops[1]);
5603   }
5604   case NEON::BI__builtin_neon_vqdmlalh_lane_s16:
5605   case NEON::BI__builtin_neon_vqdmlalh_laneq_s16:
5606   case NEON::BI__builtin_neon_vqdmlslh_lane_s16:
5607   case NEON::BI__builtin_neon_vqdmlslh_laneq_s16: {
5608     Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)),
5609                                           "lane");
5610     SmallVector<Value *, 2> ProductOps;
5611     ProductOps.push_back(vectorWrapScalar16(Ops[1]));
5612     ProductOps.push_back(vectorWrapScalar16(Ops[2]));
5613     llvm::Type *VTy = llvm::VectorType::get(Int32Ty, 4);
5614     Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_sqdmull, VTy),
5615                           ProductOps, "vqdmlXl");
5616     Constant *CI = ConstantInt::get(Int32Ty, 0);
5617     Ops[1] = Builder.CreateExtractElement(Ops[1], CI, "lane0");
5618     Ops.pop_back();
5619 
5620     unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlalh_lane_s16 ||
5621                        BuiltinID == NEON::BI__builtin_neon_vqdmlalh_laneq_s16)
5622                           ? Intrinsic::arm64_neon_sqadd
5623                           : Intrinsic::arm64_neon_sqsub;
5624     return EmitNeonCall(CGM.getIntrinsic(AccInt, Int32Ty), Ops, "vqdmlXl");
5625   }
5626   case NEON::BI__builtin_neon_vqdmlals_s32:
5627   case NEON::BI__builtin_neon_vqdmlsls_s32: {
5628     SmallVector<Value *, 2> ProductOps;
5629     ProductOps.push_back(Ops[1]);
5630     ProductOps.push_back(EmitScalarExpr(E->getArg(2)));
5631     Ops[1] =
5632         EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_sqdmulls_scalar),
5633                      ProductOps, "vqdmlXl");
5634 
5635     unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlals_s32
5636                                         ? Intrinsic::arm64_neon_sqadd
5637                                         : Intrinsic::arm64_neon_sqsub;
5638     return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int64Ty), Ops, "vqdmlXl");
5639   }
5640   case NEON::BI__builtin_neon_vqdmlals_lane_s32:
5641   case NEON::BI__builtin_neon_vqdmlals_laneq_s32:
5642   case NEON::BI__builtin_neon_vqdmlsls_lane_s32:
5643   case NEON::BI__builtin_neon_vqdmlsls_laneq_s32: {
5644     Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)),
5645                                           "lane");
5646     SmallVector<Value *, 2> ProductOps;
5647     ProductOps.push_back(Ops[1]);
5648     ProductOps.push_back(Ops[2]);
5649     Ops[1] =
5650         EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_sqdmulls_scalar),
5651                      ProductOps, "vqdmlXl");
5652     Ops.pop_back();
5653 
5654     unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlals_lane_s32 ||
5655                        BuiltinID == NEON::BI__builtin_neon_vqdmlals_laneq_s32)
5656                           ? Intrinsic::arm64_neon_sqadd
5657                           : Intrinsic::arm64_neon_sqsub;
5658     return EmitNeonCall(CGM.getIntrinsic(AccInt, Int64Ty), Ops, "vqdmlXl");
5659   }
5660   }
5661 
5662   llvm::VectorType *VTy = GetNeonType(this, Type);
5663   llvm::Type *Ty = VTy;
5664   if (!Ty)
5665     return 0;
5666 
5667   // Not all intrinsics handled by the common case work for ARM64 yet, so only
5668   // defer to common code if it's been added to our special map.
5669   Builtin = findNeonIntrinsicInMap(ARM64SIMDIntrinsicMap, BuiltinID,
5670                                    ARM64SIMDIntrinsicsProvenSorted);
5671 
5672   if (Builtin)
5673     return EmitCommonNeonBuiltinExpr(
5674         Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
5675         Builtin->NameHint, Builtin->TypeModifier, E, Ops, 0);
5676 
5677   if (Value *V = EmitARM64TblBuiltinExpr(*this, BuiltinID, E, Ops))
5678     return V;
5679 
5680   unsigned Int;
5681   switch (BuiltinID) {
5682   default: return 0;
5683   case NEON::BI__builtin_neon_vbsl_v:
5684   case NEON::BI__builtin_neon_vbslq_v: {
5685     llvm::Type *BitTy = llvm::VectorType::getInteger(VTy);
5686     Ops[0] = Builder.CreateBitCast(Ops[0], BitTy, "vbsl");
5687     Ops[1] = Builder.CreateBitCast(Ops[1], BitTy, "vbsl");
5688     Ops[2] = Builder.CreateBitCast(Ops[2], BitTy, "vbsl");
5689 
5690     Ops[1] = Builder.CreateAnd(Ops[0], Ops[1], "vbsl");
5691     Ops[2] = Builder.CreateAnd(Builder.CreateNot(Ops[0]), Ops[2], "vbsl");
5692     Ops[0] = Builder.CreateOr(Ops[1], Ops[2], "vbsl");
5693     return Builder.CreateBitCast(Ops[0], Ty);
5694   }
5695   case NEON::BI__builtin_neon_vfma_lane_v:
5696   case NEON::BI__builtin_neon_vfmaq_lane_v: { // Only used for FP types
5697     // The ARM builtins (and instructions) have the addend as the first
5698     // operand, but the 'fma' intrinsics have it last. Swap it around here.
5699     Value *Addend = Ops[0];
5700     Value *Multiplicand = Ops[1];
5701     Value *LaneSource = Ops[2];
5702     Ops[0] = Multiplicand;
5703     Ops[1] = LaneSource;
5704     Ops[2] = Addend;
5705 
5706     // Now adjust things to handle the lane access.
5707     llvm::Type *SourceTy = BuiltinID == NEON::BI__builtin_neon_vfmaq_lane_v ?
5708       llvm::VectorType::get(VTy->getElementType(), VTy->getNumElements() / 2) :
5709       VTy;
5710     llvm::Constant *cst = cast<Constant>(Ops[3]);
5711     Value *SV = llvm::ConstantVector::getSplat(VTy->getNumElements(), cst);
5712     Ops[1] = Builder.CreateBitCast(Ops[1], SourceTy);
5713     Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV, "lane");
5714 
5715     Ops.pop_back();
5716     Int = Intrinsic::fma;
5717     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fmla");
5718   }
5719   case NEON::BI__builtin_neon_vfma_laneq_v: {
5720     llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);
5721     // v1f64 fma should be mapped to Neon scalar f64 fma
5722     if (VTy && VTy->getElementType() == DoubleTy) {
5723       Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
5724       Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy);
5725       llvm::Type *VTy = GetNeonType(this,
5726         NeonTypeFlags(NeonTypeFlags::Float64, false, true));
5727       Ops[2] = Builder.CreateBitCast(Ops[2], VTy);
5728       Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
5729       Value *F = CGM.getIntrinsic(Intrinsic::fma, DoubleTy);
5730       Value *Result = Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]);
5731       return Builder.CreateBitCast(Result, Ty);
5732     }
5733     Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
5734     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
5735     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
5736 
5737     llvm::Type *STy = llvm::VectorType::get(VTy->getElementType(),
5738                                             VTy->getNumElements() * 2);
5739     Ops[2] = Builder.CreateBitCast(Ops[2], STy);
5740     Value* SV = llvm::ConstantVector::getSplat(VTy->getNumElements(),
5741                                                cast<ConstantInt>(Ops[3]));
5742     Ops[2] = Builder.CreateShuffleVector(Ops[2], Ops[2], SV, "lane");
5743 
5744     return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]);
5745   }
5746   case NEON::BI__builtin_neon_vfmaq_laneq_v: {
5747     Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
5748     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
5749     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
5750 
5751     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
5752     Ops[2] = EmitNeonSplat(Ops[2], cast<ConstantInt>(Ops[3]));
5753     return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]);
5754   }
5755   case NEON::BI__builtin_neon_vfmas_lane_f32:
5756   case NEON::BI__builtin_neon_vfmas_laneq_f32:
5757   case NEON::BI__builtin_neon_vfmad_lane_f64:
5758   case NEON::BI__builtin_neon_vfmad_laneq_f64: {
5759     Ops.push_back(EmitScalarExpr(E->getArg(3)));
5760     llvm::Type *Ty = ConvertType(E->getCallReturnType());
5761     Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
5762     Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
5763     return Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]);
5764   }
5765   case NEON::BI__builtin_neon_vfms_v:
5766   case NEON::BI__builtin_neon_vfmsq_v: {  // Only used for FP types
5767     // FIXME: probably remove when we no longer support aarch64_simd.h
5768     // (arm_neon.h delegates to vfma).
5769 
5770     // The ARM builtins (and instructions) have the addend as the first
5771     // operand, but the 'fma' intrinsics have it last. Swap it around here.
5772     Value *Subtrahend = Ops[0];
5773     Value *Multiplicand = Ops[2];
5774     Ops[0] = Multiplicand;
5775     Ops[2] = Subtrahend;
5776     Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
5777     Ops[1] = Builder.CreateFNeg(Ops[1]);
5778     Int = Intrinsic::fma;
5779     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fmls");
5780   }
5781   case NEON::BI__builtin_neon_vmull_v:
5782     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
5783     Int = usgn ? Intrinsic::arm64_neon_umull : Intrinsic::arm64_neon_smull;
5784     if (Type.isPoly()) Int = Intrinsic::arm64_neon_pmull;
5785     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull");
5786   case NEON::BI__builtin_neon_vmax_v:
5787   case NEON::BI__builtin_neon_vmaxq_v:
5788     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
5789     Int = usgn ? Intrinsic::arm64_neon_umax : Intrinsic::arm64_neon_smax;
5790     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::arm64_neon_fmax;
5791     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmax");
5792   case NEON::BI__builtin_neon_vmin_v:
5793   case NEON::BI__builtin_neon_vminq_v:
5794     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
5795     Int = usgn ? Intrinsic::arm64_neon_umin : Intrinsic::arm64_neon_smin;
5796     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::arm64_neon_fmin;
5797     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmin");
5798   case NEON::BI__builtin_neon_vabd_v:
5799   case NEON::BI__builtin_neon_vabdq_v:
5800     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
5801     Int = usgn ? Intrinsic::arm64_neon_uabd : Intrinsic::arm64_neon_sabd;
5802     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::arm64_neon_fabd;
5803     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vabd");
5804   case NEON::BI__builtin_neon_vpadal_v:
5805   case NEON::BI__builtin_neon_vpadalq_v: {
5806     unsigned ArgElts = VTy->getNumElements();
5807     llvm::IntegerType *EltTy = cast<IntegerType>(VTy->getElementType());
5808     unsigned BitWidth = EltTy->getBitWidth();
5809     llvm::Type *ArgTy = llvm::VectorType::get(
5810         llvm::IntegerType::get(getLLVMContext(), BitWidth/2), 2*ArgElts);
5811     llvm::Type* Tys[2] = { VTy, ArgTy };
5812     Int = usgn ? Intrinsic::arm64_neon_uaddlp : Intrinsic::arm64_neon_saddlp;
5813     SmallVector<llvm::Value*, 1> TmpOps;
5814     TmpOps.push_back(Ops[1]);
5815     Function *F = CGM.getIntrinsic(Int, Tys);
5816     llvm::Value *tmp = EmitNeonCall(F, TmpOps, "vpadal");
5817     llvm::Value *addend = Builder.CreateBitCast(Ops[0], tmp->getType());
5818     return Builder.CreateAdd(tmp, addend);
5819   }
5820   case NEON::BI__builtin_neon_vpmin_v:
5821   case NEON::BI__builtin_neon_vpminq_v:
5822     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
5823     Int = usgn ? Intrinsic::arm64_neon_uminp : Intrinsic::arm64_neon_sminp;
5824     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::arm64_neon_fminp;
5825     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmin");
5826   case NEON::BI__builtin_neon_vpmax_v:
5827   case NEON::BI__builtin_neon_vpmaxq_v:
5828     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
5829     Int = usgn ? Intrinsic::arm64_neon_umaxp : Intrinsic::arm64_neon_smaxp;
5830     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::arm64_neon_fmaxp;
5831     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmax");
5832   case NEON::BI__builtin_neon_vminnm_v:
5833   case NEON::BI__builtin_neon_vminnmq_v:
5834     Int = Intrinsic::arm64_neon_fminnm;
5835     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vminnm");
5836   case NEON::BI__builtin_neon_vmaxnm_v:
5837   case NEON::BI__builtin_neon_vmaxnmq_v:
5838     Int = Intrinsic::arm64_neon_fmaxnm;
5839     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmaxnm");
5840   case NEON::BI__builtin_neon_vrecpss_f32: {
5841     llvm::Type *f32Type = llvm::Type::getFloatTy(getLLVMContext());
5842     Ops.push_back(EmitScalarExpr(E->getArg(1)));
5843     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_frecps, f32Type),
5844                         Ops, "vrecps");
5845   }
5846   case NEON::BI__builtin_neon_vrecpsd_f64: {
5847     llvm::Type *f64Type = llvm::Type::getDoubleTy(getLLVMContext());
5848     Ops.push_back(EmitScalarExpr(E->getArg(1)));
5849     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_frecps, f64Type),
5850                         Ops, "vrecps");
5851   }
5852   case NEON::BI__builtin_neon_vrshr_n_v:
5853   case NEON::BI__builtin_neon_vrshrq_n_v:
5854     // FIXME: this can be shared with 32-bit ARM, but not AArch64 at the
5855     // moment. After the final merge it should be added to
5856     // EmitCommonNeonBuiltinExpr.
5857     Int = usgn ? Intrinsic::arm64_neon_urshl : Intrinsic::arm64_neon_srshl;
5858     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshr_n", 1, true);
5859   case NEON::BI__builtin_neon_vqshlu_n_v:
5860   case NEON::BI__builtin_neon_vqshluq_n_v:
5861     // FIXME: AArch64 and ARM use different intrinsics for this, but are
5862     // essentially compatible. It should be in EmitCommonNeonBuiltinExpr after
5863     // the final merge.
5864     Int = Intrinsic::arm64_neon_sqshlu;
5865     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshlu_n", 1, false);
5866   case NEON::BI__builtin_neon_vqshrun_n_v:
5867     // FIXME: as above
5868     Int = Intrinsic::arm64_neon_sqshrun;
5869     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrun_n");
5870   case NEON::BI__builtin_neon_vqrshrun_n_v:
5871     // FIXME: and again.
5872     Int = Intrinsic::arm64_neon_sqrshrun;
5873     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrun_n");
5874   case NEON::BI__builtin_neon_vqshrn_n_v:
5875     // FIXME: guess
5876     Int = usgn ? Intrinsic::arm64_neon_uqshrn : Intrinsic::arm64_neon_sqshrn;
5877     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n");
5878   case NEON::BI__builtin_neon_vrshrn_n_v:
5879     // FIXME: there might be a pattern here.
5880     Int = Intrinsic::arm64_neon_rshrn;
5881     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshrn_n");
5882   case NEON::BI__builtin_neon_vqrshrn_n_v:
5883     // FIXME: another one
5884     Int = usgn ? Intrinsic::arm64_neon_uqrshrn : Intrinsic::arm64_neon_sqrshrn;
5885     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n");
5886   case NEON::BI__builtin_neon_vrnda_v:
5887   case NEON::BI__builtin_neon_vrndaq_v: {
5888     Int = Intrinsic::round;
5889     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnda");
5890   }
5891   case NEON::BI__builtin_neon_vrndi_v:
5892   case NEON::BI__builtin_neon_vrndiq_v: {
5893     Int = Intrinsic::nearbyint;
5894     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndi");
5895   }
5896   case NEON::BI__builtin_neon_vrndm_v:
5897   case NEON::BI__builtin_neon_vrndmq_v: {
5898     Int = Intrinsic::floor;
5899     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndm");
5900   }
5901   case NEON::BI__builtin_neon_vrndn_v:
5902   case NEON::BI__builtin_neon_vrndnq_v: {
5903     Int = Intrinsic::arm64_neon_frintn;
5904     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndn");
5905   }
5906   case NEON::BI__builtin_neon_vrndp_v:
5907   case NEON::BI__builtin_neon_vrndpq_v: {
5908     Int = Intrinsic::ceil;
5909     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndp");
5910   }
5911   case NEON::BI__builtin_neon_vrndx_v:
5912   case NEON::BI__builtin_neon_vrndxq_v: {
5913     Int = Intrinsic::rint;
5914     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndx");
5915   }
5916   case NEON::BI__builtin_neon_vrnd_v:
5917   case NEON::BI__builtin_neon_vrndq_v: {
5918     Int = Intrinsic::trunc;
5919     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndz");
5920   }
5921   case NEON::BI__builtin_neon_vceqz_v:
5922   case NEON::BI__builtin_neon_vceqzq_v:
5923     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OEQ,
5924                                          ICmpInst::ICMP_EQ, "vceqz");
5925   case NEON::BI__builtin_neon_vcgez_v:
5926   case NEON::BI__builtin_neon_vcgezq_v:
5927     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGE,
5928                                          ICmpInst::ICMP_SGE, "vcgez");
5929   case NEON::BI__builtin_neon_vclez_v:
5930   case NEON::BI__builtin_neon_vclezq_v:
5931     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLE,
5932                                          ICmpInst::ICMP_SLE, "vclez");
5933   case NEON::BI__builtin_neon_vcgtz_v:
5934   case NEON::BI__builtin_neon_vcgtzq_v:
5935     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGT,
5936                                          ICmpInst::ICMP_SGT, "vcgtz");
5937   case NEON::BI__builtin_neon_vcltz_v:
5938   case NEON::BI__builtin_neon_vcltzq_v:
5939     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLT,
5940                                          ICmpInst::ICMP_SLT, "vcltz");
5941   case NEON::BI__builtin_neon_vcvt_f64_v:
5942   case NEON::BI__builtin_neon_vcvtq_f64_v:
5943     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
5944     Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad));
5945     return usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
5946                 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
5947   case NEON::BI__builtin_neon_vcvt_f64_f32: {
5948     assert(Type.getEltType() == NeonTypeFlags::Float64 && quad &&
5949            "unexpected vcvt_f64_f32 builtin");
5950     NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float32, false, false);
5951     Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));
5952 
5953     return Builder.CreateFPExt(Ops[0], Ty, "vcvt");
5954   }
5955   case NEON::BI__builtin_neon_vcvt_f32_f64: {
5956     assert(Type.getEltType() == NeonTypeFlags::Float32 &&
5957            "unexpected vcvt_f32_f64 builtin");
5958     NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float64, false, true);
5959     Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));
5960 
5961     return Builder.CreateFPTrunc(Ops[0], Ty, "vcvt");
5962   }
5963   case NEON::BI__builtin_neon_vcvt_s32_v:
5964   case NEON::BI__builtin_neon_vcvt_u32_v:
5965   case NEON::BI__builtin_neon_vcvt_s64_v:
5966   case NEON::BI__builtin_neon_vcvt_u64_v:
5967   case NEON::BI__builtin_neon_vcvtq_s32_v:
5968   case NEON::BI__builtin_neon_vcvtq_u32_v:
5969   case NEON::BI__builtin_neon_vcvtq_s64_v:
5970   case NEON::BI__builtin_neon_vcvtq_u64_v: {
5971     bool Double =
5972       (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64);
5973     llvm::Type *InTy =
5974       GetNeonType(this,
5975                   NeonTypeFlags(Double ? NeonTypeFlags::Float64
5976                                 : NeonTypeFlags::Float32, false, quad));
5977     Ops[0] = Builder.CreateBitCast(Ops[0], InTy);
5978     if (usgn)
5979       return Builder.CreateFPToUI(Ops[0], Ty);
5980     return Builder.CreateFPToSI(Ops[0], Ty);
5981   }
5982   case NEON::BI__builtin_neon_vcvta_s32_v:
5983   case NEON::BI__builtin_neon_vcvtaq_s32_v:
5984   case NEON::BI__builtin_neon_vcvta_u32_v:
5985   case NEON::BI__builtin_neon_vcvtaq_u32_v:
5986   case NEON::BI__builtin_neon_vcvta_s64_v:
5987   case NEON::BI__builtin_neon_vcvtaq_s64_v:
5988   case NEON::BI__builtin_neon_vcvta_u64_v:
5989   case NEON::BI__builtin_neon_vcvtaq_u64_v: {
5990     Int = usgn ? Intrinsic::arm64_neon_fcvtau : Intrinsic::arm64_neon_fcvtas;
5991     bool Double =
5992       (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64);
5993     llvm::Type *InTy =
5994       GetNeonType(this,
5995                   NeonTypeFlags(Double ? NeonTypeFlags::Float64
5996                                 : NeonTypeFlags::Float32, false, quad));
5997     llvm::Type *Tys[2] = { Ty, InTy };
5998     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvta");
5999   }
6000   case NEON::BI__builtin_neon_vcvtm_s32_v:
6001   case NEON::BI__builtin_neon_vcvtmq_s32_v:
6002   case NEON::BI__builtin_neon_vcvtm_u32_v:
6003   case NEON::BI__builtin_neon_vcvtmq_u32_v:
6004   case NEON::BI__builtin_neon_vcvtm_s64_v:
6005   case NEON::BI__builtin_neon_vcvtmq_s64_v:
6006   case NEON::BI__builtin_neon_vcvtm_u64_v:
6007   case NEON::BI__builtin_neon_vcvtmq_u64_v: {
6008     Int = usgn ? Intrinsic::arm64_neon_fcvtmu : Intrinsic::arm64_neon_fcvtms;
6009     bool Double =
6010       (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64);
6011     llvm::Type *InTy =
6012       GetNeonType(this,
6013                   NeonTypeFlags(Double ? NeonTypeFlags::Float64
6014                                 : NeonTypeFlags::Float32, false, quad));
6015     llvm::Type *Tys[2] = { Ty, InTy };
6016     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtm");
6017   }
6018   case NEON::BI__builtin_neon_vcvtn_s32_v:
6019   case NEON::BI__builtin_neon_vcvtnq_s32_v:
6020   case NEON::BI__builtin_neon_vcvtn_u32_v:
6021   case NEON::BI__builtin_neon_vcvtnq_u32_v:
6022   case NEON::BI__builtin_neon_vcvtn_s64_v:
6023   case NEON::BI__builtin_neon_vcvtnq_s64_v:
6024   case NEON::BI__builtin_neon_vcvtn_u64_v:
6025   case NEON::BI__builtin_neon_vcvtnq_u64_v: {
6026     Int = usgn ? Intrinsic::arm64_neon_fcvtnu : Intrinsic::arm64_neon_fcvtns;
6027     bool Double =
6028       (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64);
6029     llvm::Type *InTy =
6030       GetNeonType(this,
6031                   NeonTypeFlags(Double ? NeonTypeFlags::Float64
6032                                 : NeonTypeFlags::Float32, false, quad));
6033     llvm::Type *Tys[2] = { Ty, InTy };
6034     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtn");
6035   }
6036   case NEON::BI__builtin_neon_vcvtp_s32_v:
6037   case NEON::BI__builtin_neon_vcvtpq_s32_v:
6038   case NEON::BI__builtin_neon_vcvtp_u32_v:
6039   case NEON::BI__builtin_neon_vcvtpq_u32_v:
6040   case NEON::BI__builtin_neon_vcvtp_s64_v:
6041   case NEON::BI__builtin_neon_vcvtpq_s64_v:
6042   case NEON::BI__builtin_neon_vcvtp_u64_v:
6043   case NEON::BI__builtin_neon_vcvtpq_u64_v: {
6044     Int = usgn ? Intrinsic::arm64_neon_fcvtpu : Intrinsic::arm64_neon_fcvtps;
6045     bool Double =
6046       (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64);
6047     llvm::Type *InTy =
6048       GetNeonType(this,
6049                   NeonTypeFlags(Double ? NeonTypeFlags::Float64
6050                                 : NeonTypeFlags::Float32, false, quad));
6051     llvm::Type *Tys[2] = { Ty, InTy };
6052     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtp");
6053   }
6054   case NEON::BI__builtin_neon_vmulx_v:
6055   case NEON::BI__builtin_neon_vmulxq_v: {
6056     Int = Intrinsic::arm64_neon_fmulx;
6057     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx");
6058   }
6059   case NEON::BI__builtin_neon_vmul_lane_v:
6060   case NEON::BI__builtin_neon_vmul_laneq_v: {
6061     // v1f64 vmul_lane should be mapped to Neon scalar mul lane
6062     bool Quad = false;
6063     if (BuiltinID == NEON::BI__builtin_neon_vmul_laneq_v)
6064       Quad = true;
6065     Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
6066     llvm::Type *VTy = GetNeonType(this,
6067       NeonTypeFlags(NeonTypeFlags::Float64, false, Quad));
6068     Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
6069     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
6070     Value *Result = Builder.CreateFMul(Ops[0], Ops[1]);
6071     return Builder.CreateBitCast(Result, Ty);
6072   }
6073   case NEON::BI__builtin_neon_vnegd_s64:
6074     return Builder.CreateNeg(EmitScalarExpr(E->getArg(0)), "vnegd");
6075   case NEON::BI__builtin_neon_vpmaxnm_v:
6076   case NEON::BI__builtin_neon_vpmaxnmq_v: {
6077     Int = Intrinsic::arm64_neon_fmaxnmp;
6078     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmaxnm");
6079   }
6080   case NEON::BI__builtin_neon_vpminnm_v:
6081   case NEON::BI__builtin_neon_vpminnmq_v: {
6082     Int = Intrinsic::arm64_neon_fminnmp;
6083     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpminnm");
6084   }
6085   case NEON::BI__builtin_neon_vsqrt_v:
6086   case NEON::BI__builtin_neon_vsqrtq_v: {
6087     Int = Intrinsic::sqrt;
6088     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
6089     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqrt");
6090   }
6091   case NEON::BI__builtin_neon_vrbit_v:
6092   case NEON::BI__builtin_neon_vrbitq_v: {
6093     Int = Intrinsic::arm64_neon_rbit;
6094     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrbit");
6095   }
6096   case NEON::BI__builtin_neon_vaddv_u8:
6097     // FIXME: These are handled by the AArch64 scalar code.
6098     usgn = true;
6099     // FALLTHROUGH
6100   case NEON::BI__builtin_neon_vaddv_s8: {
6101     Int = usgn ? Intrinsic::arm64_neon_uaddv : Intrinsic::arm64_neon_saddv;
6102     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6103     VTy =
6104       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8);
6105     llvm::Type *Tys[2] = { Ty, VTy };
6106     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6107     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
6108     return Builder.CreateTrunc(Ops[0],
6109              llvm::IntegerType::get(getLLVMContext(), 8));
6110   }
6111   case NEON::BI__builtin_neon_vaddv_u16:
6112     usgn = true;
6113     // FALLTHROUGH
6114   case NEON::BI__builtin_neon_vaddv_s16: {
6115     Int = usgn ? Intrinsic::arm64_neon_uaddv : Intrinsic::arm64_neon_saddv;
6116     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6117     VTy =
6118       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4);
6119     llvm::Type *Tys[2] = { Ty, VTy };
6120     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6121     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
6122     return Builder.CreateTrunc(Ops[0],
6123              llvm::IntegerType::get(getLLVMContext(), 16));
6124   }
6125   case NEON::BI__builtin_neon_vaddvq_u8:
6126     usgn = true;
6127     // FALLTHROUGH
6128   case NEON::BI__builtin_neon_vaddvq_s8: {
6129     Int = usgn ? Intrinsic::arm64_neon_uaddv : Intrinsic::arm64_neon_saddv;
6130     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6131     VTy =
6132       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16);
6133     llvm::Type *Tys[2] = { Ty, VTy };
6134     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6135     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
6136     return Builder.CreateTrunc(Ops[0],
6137              llvm::IntegerType::get(getLLVMContext(), 8));
6138   }
6139   case NEON::BI__builtin_neon_vaddvq_u16:
6140     usgn = true;
6141     // FALLTHROUGH
6142   case NEON::BI__builtin_neon_vaddvq_s16: {
6143     Int = usgn ? Intrinsic::arm64_neon_uaddv : Intrinsic::arm64_neon_saddv;
6144     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6145     VTy =
6146       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8);
6147     llvm::Type *Tys[2] = { Ty, VTy };
6148     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6149     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
6150     return Builder.CreateTrunc(Ops[0],
6151              llvm::IntegerType::get(getLLVMContext(), 16));
6152   }
6153   case NEON::BI__builtin_neon_vmaxv_u8: {
6154     Int = Intrinsic::arm64_neon_umaxv;
6155     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6156     VTy =
6157       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8);
6158     llvm::Type *Tys[2] = { Ty, VTy };
6159     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6160     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
6161     return Builder.CreateTrunc(Ops[0],
6162              llvm::IntegerType::get(getLLVMContext(), 8));
6163   }
6164   case NEON::BI__builtin_neon_vmaxv_u16: {
6165     Int = Intrinsic::arm64_neon_umaxv;
6166     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6167     VTy =
6168       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4);
6169     llvm::Type *Tys[2] = { Ty, VTy };
6170     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6171     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
6172     return Builder.CreateTrunc(Ops[0],
6173              llvm::IntegerType::get(getLLVMContext(), 16));
6174   }
6175   case NEON::BI__builtin_neon_vmaxvq_u8: {
6176     Int = Intrinsic::arm64_neon_umaxv;
6177     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6178     VTy =
6179       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16);
6180     llvm::Type *Tys[2] = { Ty, VTy };
6181     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6182     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
6183     return Builder.CreateTrunc(Ops[0],
6184              llvm::IntegerType::get(getLLVMContext(), 8));
6185   }
6186   case NEON::BI__builtin_neon_vmaxvq_u16: {
6187     Int = Intrinsic::arm64_neon_umaxv;
6188     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6189     VTy =
6190       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8);
6191     llvm::Type *Tys[2] = { Ty, VTy };
6192     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6193     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
6194     return Builder.CreateTrunc(Ops[0],
6195              llvm::IntegerType::get(getLLVMContext(), 16));
6196   }
6197   case NEON::BI__builtin_neon_vmaxv_s8: {
6198     Int = Intrinsic::arm64_neon_smaxv;
6199     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6200     VTy =
6201       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8);
6202     llvm::Type *Tys[2] = { Ty, VTy };
6203     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6204     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
6205     return Builder.CreateTrunc(Ops[0],
6206              llvm::IntegerType::get(getLLVMContext(), 8));
6207   }
6208   case NEON::BI__builtin_neon_vmaxv_s16: {
6209     Int = Intrinsic::arm64_neon_smaxv;
6210     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6211     VTy =
6212       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4);
6213     llvm::Type *Tys[2] = { Ty, VTy };
6214     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6215     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
6216     return Builder.CreateTrunc(Ops[0],
6217              llvm::IntegerType::get(getLLVMContext(), 16));
6218   }
6219   case NEON::BI__builtin_neon_vmaxvq_s8: {
6220     Int = Intrinsic::arm64_neon_smaxv;
6221     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6222     VTy =
6223       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16);
6224     llvm::Type *Tys[2] = { Ty, VTy };
6225     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6226     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
6227     return Builder.CreateTrunc(Ops[0],
6228              llvm::IntegerType::get(getLLVMContext(), 8));
6229   }
6230   case NEON::BI__builtin_neon_vmaxvq_s16: {
6231     Int = Intrinsic::arm64_neon_smaxv;
6232     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6233     VTy =
6234       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8);
6235     llvm::Type *Tys[2] = { Ty, VTy };
6236     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6237     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
6238     return Builder.CreateTrunc(Ops[0],
6239              llvm::IntegerType::get(getLLVMContext(), 16));
6240   }
6241   case NEON::BI__builtin_neon_vminv_u8: {
6242     Int = Intrinsic::arm64_neon_uminv;
6243     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6244     VTy =
6245       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8);
6246     llvm::Type *Tys[2] = { Ty, VTy };
6247     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6248     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
6249     return Builder.CreateTrunc(Ops[0],
6250              llvm::IntegerType::get(getLLVMContext(), 8));
6251   }
6252   case NEON::BI__builtin_neon_vminv_u16: {
6253     Int = Intrinsic::arm64_neon_uminv;
6254     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6255     VTy =
6256       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4);
6257     llvm::Type *Tys[2] = { Ty, VTy };
6258     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6259     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
6260     return Builder.CreateTrunc(Ops[0],
6261              llvm::IntegerType::get(getLLVMContext(), 16));
6262   }
6263   case NEON::BI__builtin_neon_vminvq_u8: {
6264     Int = Intrinsic::arm64_neon_uminv;
6265     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6266     VTy =
6267       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16);
6268     llvm::Type *Tys[2] = { Ty, VTy };
6269     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6270     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
6271     return Builder.CreateTrunc(Ops[0],
6272              llvm::IntegerType::get(getLLVMContext(), 8));
6273   }
6274   case NEON::BI__builtin_neon_vminvq_u16: {
6275     Int = Intrinsic::arm64_neon_uminv;
6276     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6277     VTy =
6278       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8);
6279     llvm::Type *Tys[2] = { Ty, VTy };
6280     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6281     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
6282     return Builder.CreateTrunc(Ops[0],
6283              llvm::IntegerType::get(getLLVMContext(), 16));
6284   }
6285   case NEON::BI__builtin_neon_vminv_s8: {
6286     Int = Intrinsic::arm64_neon_sminv;
6287     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6288     VTy =
6289       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8);
6290     llvm::Type *Tys[2] = { Ty, VTy };
6291     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6292     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
6293     return Builder.CreateTrunc(Ops[0],
6294              llvm::IntegerType::get(getLLVMContext(), 8));
6295   }
6296   case NEON::BI__builtin_neon_vminv_s16: {
6297     Int = Intrinsic::arm64_neon_sminv;
6298     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6299     VTy =
6300       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4);
6301     llvm::Type *Tys[2] = { Ty, VTy };
6302     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6303     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
6304     return Builder.CreateTrunc(Ops[0],
6305              llvm::IntegerType::get(getLLVMContext(), 16));
6306   }
6307   case NEON::BI__builtin_neon_vminvq_s8: {
6308     Int = Intrinsic::arm64_neon_sminv;
6309     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6310     VTy =
6311       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16);
6312     llvm::Type *Tys[2] = { Ty, VTy };
6313     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6314     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
6315     return Builder.CreateTrunc(Ops[0],
6316              llvm::IntegerType::get(getLLVMContext(), 8));
6317   }
6318   case NEON::BI__builtin_neon_vminvq_s16: {
6319     Int = Intrinsic::arm64_neon_sminv;
6320     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6321     VTy =
6322       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8);
6323     llvm::Type *Tys[2] = { Ty, VTy };
6324     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6325     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
6326     return Builder.CreateTrunc(Ops[0],
6327              llvm::IntegerType::get(getLLVMContext(), 16));
6328   }
6329   case NEON::BI__builtin_neon_vmul_n_f64: {
6330     Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
6331     Value *RHS = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)), DoubleTy);
6332     return Builder.CreateFMul(Ops[0], RHS);
6333   }
6334   case NEON::BI__builtin_neon_vaddlv_u8: {
6335     Int = Intrinsic::arm64_neon_uaddlv;
6336     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6337     VTy =
6338       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8);
6339     llvm::Type *Tys[2] = { Ty, VTy };
6340     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6341     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
6342     return Builder.CreateTrunc(Ops[0],
6343              llvm::IntegerType::get(getLLVMContext(), 16));
6344   }
6345   case NEON::BI__builtin_neon_vaddlv_u16: {
6346     Int = Intrinsic::arm64_neon_uaddlv;
6347     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6348     VTy =
6349       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4);
6350     llvm::Type *Tys[2] = { Ty, VTy };
6351     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6352     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
6353   }
6354   case NEON::BI__builtin_neon_vaddlvq_u8: {
6355     Int = Intrinsic::arm64_neon_uaddlv;
6356     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6357     VTy =
6358       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16);
6359     llvm::Type *Tys[2] = { Ty, VTy };
6360     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6361     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
6362     return Builder.CreateTrunc(Ops[0],
6363              llvm::IntegerType::get(getLLVMContext(), 16));
6364   }
6365   case NEON::BI__builtin_neon_vaddlvq_u16: {
6366     Int = Intrinsic::arm64_neon_uaddlv;
6367     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6368     VTy =
6369       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8);
6370     llvm::Type *Tys[2] = { Ty, VTy };
6371     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6372     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
6373   }
6374   case NEON::BI__builtin_neon_vaddlv_s8: {
6375     Int = Intrinsic::arm64_neon_saddlv;
6376     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6377     VTy =
6378       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8);
6379     llvm::Type *Tys[2] = { Ty, VTy };
6380     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6381     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
6382     return Builder.CreateTrunc(Ops[0],
6383              llvm::IntegerType::get(getLLVMContext(), 16));
6384   }
6385   case NEON::BI__builtin_neon_vaddlv_s16: {
6386     Int = Intrinsic::arm64_neon_saddlv;
6387     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6388     VTy =
6389       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4);
6390     llvm::Type *Tys[2] = { Ty, VTy };
6391     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6392     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
6393   }
6394   case NEON::BI__builtin_neon_vaddlvq_s8: {
6395     Int = Intrinsic::arm64_neon_saddlv;
6396     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6397     VTy =
6398       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16);
6399     llvm::Type *Tys[2] = { Ty, VTy };
6400     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6401     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
6402     return Builder.CreateTrunc(Ops[0],
6403              llvm::IntegerType::get(getLLVMContext(), 16));
6404   }
6405   case NEON::BI__builtin_neon_vaddlvq_s16: {
6406     Int = Intrinsic::arm64_neon_saddlv;
6407     Ty = llvm::IntegerType::get(getLLVMContext(), 32);
6408     VTy =
6409       llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8);
6410     llvm::Type *Tys[2] = { Ty, VTy };
6411     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6412     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
6413   }
6414   case NEON::BI__builtin_neon_vsri_n_v:
6415   case NEON::BI__builtin_neon_vsriq_n_v: {
6416     Int = Intrinsic::arm64_neon_vsri;
6417     llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty);
6418     return EmitNeonCall(Intrin, Ops, "vsri_n");
6419   }
6420   case NEON::BI__builtin_neon_vsli_n_v:
6421   case NEON::BI__builtin_neon_vsliq_n_v: {
6422     Int = Intrinsic::arm64_neon_vsli;
6423     llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty);
6424     return EmitNeonCall(Intrin, Ops, "vsli_n");
6425   }
6426   case NEON::BI__builtin_neon_vsra_n_v:
6427   case NEON::BI__builtin_neon_vsraq_n_v:
6428     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
6429     Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n");
6430     return Builder.CreateAdd(Ops[0], Ops[1]);
6431   case NEON::BI__builtin_neon_vrsra_n_v:
6432   case NEON::BI__builtin_neon_vrsraq_n_v: {
6433     Int = usgn ? Intrinsic::arm64_neon_urshl : Intrinsic::arm64_neon_srshl;
6434     SmallVector<llvm::Value*,2> TmpOps;
6435     TmpOps.push_back(Ops[1]);
6436     TmpOps.push_back(Ops[2]);
6437     Function* F = CGM.getIntrinsic(Int, Ty);
6438     llvm::Value *tmp = EmitNeonCall(F, TmpOps, "vrshr_n", 1, true);
6439     Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
6440     return Builder.CreateAdd(Ops[0], tmp);
6441   }
6442     // FIXME: Sharing loads & stores with 32-bit is complicated by the absence
6443     // of an Align parameter here.
6444   case NEON::BI__builtin_neon_vld1_x2_v:
6445   case NEON::BI__builtin_neon_vld1q_x2_v:
6446   case NEON::BI__builtin_neon_vld1_x3_v:
6447   case NEON::BI__builtin_neon_vld1q_x3_v:
6448   case NEON::BI__builtin_neon_vld1_x4_v:
6449   case NEON::BI__builtin_neon_vld1q_x4_v: {
6450     llvm::Type *PTy = llvm::PointerType::getUnqual(VTy->getVectorElementType());
6451     Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
6452     llvm::Type *Tys[2] = { VTy, PTy };
6453     unsigned Int;
6454     switch (BuiltinID) {
6455     case NEON::BI__builtin_neon_vld1_x2_v:
6456     case NEON::BI__builtin_neon_vld1q_x2_v:
6457       Int = Intrinsic::arm64_neon_ld1x2;
6458       break;
6459     case NEON::BI__builtin_neon_vld1_x3_v:
6460     case NEON::BI__builtin_neon_vld1q_x3_v:
6461       Int = Intrinsic::arm64_neon_ld1x3;
6462       break;
6463     case NEON::BI__builtin_neon_vld1_x4_v:
6464     case NEON::BI__builtin_neon_vld1q_x4_v:
6465       Int = Intrinsic::arm64_neon_ld1x4;
6466       break;
6467     }
6468     Function *F = CGM.getIntrinsic(Int, Tys);
6469     Ops[1] = Builder.CreateCall(F, Ops[1], "vld1xN");
6470     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
6471     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
6472     return Builder.CreateStore(Ops[1], Ops[0]);
6473   }
6474   case NEON::BI__builtin_neon_vst1_x2_v:
6475   case NEON::BI__builtin_neon_vst1q_x2_v:
6476   case NEON::BI__builtin_neon_vst1_x3_v:
6477   case NEON::BI__builtin_neon_vst1q_x3_v:
6478   case NEON::BI__builtin_neon_vst1_x4_v:
6479   case NEON::BI__builtin_neon_vst1q_x4_v: {
6480     llvm::Type *PTy = llvm::PointerType::getUnqual(VTy->getVectorElementType());
6481     llvm::Type *Tys[2] = { VTy, PTy };
6482     unsigned Int;
6483     switch (BuiltinID) {
6484     case NEON::BI__builtin_neon_vst1_x2_v:
6485     case NEON::BI__builtin_neon_vst1q_x2_v:
6486       Int = Intrinsic::arm64_neon_st1x2;
6487       break;
6488     case NEON::BI__builtin_neon_vst1_x3_v:
6489     case NEON::BI__builtin_neon_vst1q_x3_v:
6490       Int = Intrinsic::arm64_neon_st1x3;
6491       break;
6492     case NEON::BI__builtin_neon_vst1_x4_v:
6493     case NEON::BI__builtin_neon_vst1q_x4_v:
6494       Int = Intrinsic::arm64_neon_st1x4;
6495       break;
6496     }
6497     SmallVector<Value *, 4> IntOps(Ops.begin()+1, Ops.end());
6498     IntOps.push_back(Ops[0]);
6499     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), IntOps, "");
6500   }
6501   case NEON::BI__builtin_neon_vld1_v:
6502   case NEON::BI__builtin_neon_vld1q_v:
6503     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(VTy));
6504     return Builder.CreateLoad(Ops[0]);
6505   case NEON::BI__builtin_neon_vst1_v:
6506   case NEON::BI__builtin_neon_vst1q_v:
6507     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(VTy));
6508     Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
6509     return Builder.CreateStore(Ops[1], Ops[0]);
6510   case NEON::BI__builtin_neon_vld1_lane_v:
6511   case NEON::BI__builtin_neon_vld1q_lane_v:
6512     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
6513     Ty = llvm::PointerType::getUnqual(VTy->getElementType());
6514     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
6515     Ops[0] = Builder.CreateLoad(Ops[0]);
6516     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vld1_lane");
6517   case NEON::BI__builtin_neon_vld1_dup_v:
6518   case NEON::BI__builtin_neon_vld1q_dup_v: {
6519     Value *V = UndefValue::get(Ty);
6520     Ty = llvm::PointerType::getUnqual(VTy->getElementType());
6521     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
6522     Ops[0] = Builder.CreateLoad(Ops[0]);
6523     llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
6524     Ops[0] = Builder.CreateInsertElement(V, Ops[0], CI);
6525     return EmitNeonSplat(Ops[0], CI);
6526   }
6527   case NEON::BI__builtin_neon_vst1_lane_v:
6528   case NEON::BI__builtin_neon_vst1q_lane_v:
6529     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
6530     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
6531     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
6532     return Builder.CreateStore(Ops[1], Builder.CreateBitCast(Ops[0], Ty));
6533   case NEON::BI__builtin_neon_vld2_v:
6534   case NEON::BI__builtin_neon_vld2q_v: {
6535     llvm::Type *PTy = llvm::PointerType::getUnqual(VTy);
6536     Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
6537     llvm::Type *Tys[2] = { VTy, PTy };
6538     Function *F = CGM.getIntrinsic(Intrinsic::arm64_neon_ld2, Tys);
6539     Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
6540     Ops[0] = Builder.CreateBitCast(Ops[0],
6541                 llvm::PointerType::getUnqual(Ops[1]->getType()));
6542     return Builder.CreateStore(Ops[1], Ops[0]);
6543   }
6544   case NEON::BI__builtin_neon_vld3_v:
6545   case NEON::BI__builtin_neon_vld3q_v: {
6546     llvm::Type *PTy = llvm::PointerType::getUnqual(VTy);
6547     Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
6548     llvm::Type *Tys[2] = { VTy, PTy };
6549     Function *F = CGM.getIntrinsic(Intrinsic::arm64_neon_ld3, Tys);
6550     Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
6551     Ops[0] = Builder.CreateBitCast(Ops[0],
6552                 llvm::PointerType::getUnqual(Ops[1]->getType()));
6553     return Builder.CreateStore(Ops[1], Ops[0]);
6554   }
6555   case NEON::BI__builtin_neon_vld4_v:
6556   case NEON::BI__builtin_neon_vld4q_v: {
6557     llvm::Type *PTy = llvm::PointerType::getUnqual(VTy);
6558     Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
6559     llvm::Type *Tys[2] = { VTy, PTy };
6560     Function *F = CGM.getIntrinsic(Intrinsic::arm64_neon_ld4, Tys);
6561     Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
6562     Ops[0] = Builder.CreateBitCast(Ops[0],
6563                 llvm::PointerType::getUnqual(Ops[1]->getType()));
6564     return Builder.CreateStore(Ops[1], Ops[0]);
6565   }
6566   case NEON::BI__builtin_neon_vld2_dup_v:
6567   case NEON::BI__builtin_neon_vld2q_dup_v: {
6568     llvm::Type *PTy =
6569       llvm::PointerType::getUnqual(VTy->getElementType());
6570     Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
6571     llvm::Type *Tys[2] = { VTy, PTy };
6572     Function *F = CGM.getIntrinsic(Intrinsic::arm64_neon_ld2r, Tys);
6573     Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
6574     Ops[0] = Builder.CreateBitCast(Ops[0],
6575                 llvm::PointerType::getUnqual(Ops[1]->getType()));
6576     return Builder.CreateStore(Ops[1], Ops[0]);
6577   }
6578   case NEON::BI__builtin_neon_vld3_dup_v:
6579   case NEON::BI__builtin_neon_vld3q_dup_v: {
6580     llvm::Type *PTy =
6581       llvm::PointerType::getUnqual(VTy->getElementType());
6582     Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
6583     llvm::Type *Tys[2] = { VTy, PTy };
6584     Function *F = CGM.getIntrinsic(Intrinsic::arm64_neon_ld3r, Tys);
6585     Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
6586     Ops[0] = Builder.CreateBitCast(Ops[0],
6587                 llvm::PointerType::getUnqual(Ops[1]->getType()));
6588     return Builder.CreateStore(Ops[1], Ops[0]);
6589   }
6590   case NEON::BI__builtin_neon_vld4_dup_v:
6591   case NEON::BI__builtin_neon_vld4q_dup_v: {
6592     llvm::Type *PTy =
6593       llvm::PointerType::getUnqual(VTy->getElementType());
6594     Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
6595     llvm::Type *Tys[2] = { VTy, PTy };
6596     Function *F = CGM.getIntrinsic(Intrinsic::arm64_neon_ld4r, Tys);
6597     Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
6598     Ops[0] = Builder.CreateBitCast(Ops[0],
6599                 llvm::PointerType::getUnqual(Ops[1]->getType()));
6600     return Builder.CreateStore(Ops[1], Ops[0]);
6601   }
6602   case NEON::BI__builtin_neon_vld2_lane_v:
6603   case NEON::BI__builtin_neon_vld2q_lane_v: {
6604     llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
6605     Function *F = CGM.getIntrinsic(Intrinsic::arm64_neon_ld2lane, Tys);
6606     Ops.push_back(Ops[1]);
6607     Ops.erase(Ops.begin()+1);
6608     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
6609     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
6610     Ops[3] = Builder.CreateZExt(Ops[3],
6611                 llvm::IntegerType::get(getLLVMContext(), 64));
6612     Ops[1] = Builder.CreateCall(F,
6613                 ArrayRef<Value*>(Ops).slice(1), "vld2_lane");
6614     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
6615     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
6616     return Builder.CreateStore(Ops[1], Ops[0]);
6617   }
6618   case NEON::BI__builtin_neon_vld3_lane_v:
6619   case NEON::BI__builtin_neon_vld3q_lane_v: {
6620     llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
6621     Function *F = CGM.getIntrinsic(Intrinsic::arm64_neon_ld3lane, Tys);
6622     Ops.push_back(Ops[1]);
6623     Ops.erase(Ops.begin()+1);
6624     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
6625     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
6626     Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
6627     Ops[4] = Builder.CreateZExt(Ops[4],
6628                 llvm::IntegerType::get(getLLVMContext(), 64));
6629     Ops[1] = Builder.CreateCall(F,
6630                 ArrayRef<Value*>(Ops).slice(1), "vld3_lane");
6631     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
6632     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
6633     return Builder.CreateStore(Ops[1], Ops[0]);
6634   }
6635   case NEON::BI__builtin_neon_vld4_lane_v:
6636   case NEON::BI__builtin_neon_vld4q_lane_v: {
6637     llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
6638     Function *F = CGM.getIntrinsic(Intrinsic::arm64_neon_ld4lane, Tys);
6639     Ops.push_back(Ops[1]);
6640     Ops.erase(Ops.begin()+1);
6641     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
6642     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
6643     Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
6644     Ops[4] = Builder.CreateBitCast(Ops[4], Ty);
6645     Ops[5] = Builder.CreateZExt(Ops[5],
6646                 llvm::IntegerType::get(getLLVMContext(), 64));
6647     Ops[1] = Builder.CreateCall(F,
6648                 ArrayRef<Value*>(Ops).slice(1), "vld4_lane");
6649     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
6650     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
6651     return Builder.CreateStore(Ops[1], Ops[0]);
6652   }
6653   case NEON::BI__builtin_neon_vst2_v:
6654   case NEON::BI__builtin_neon_vst2q_v: {
6655     Ops.push_back(Ops[0]);
6656     Ops.erase(Ops.begin());
6657     llvm::Type *Tys[2] = { VTy, Ops[2]->getType() };
6658     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_st2, Tys),
6659                         Ops, "");
6660   }
6661   case NEON::BI__builtin_neon_vst2_lane_v:
6662   case NEON::BI__builtin_neon_vst2q_lane_v: {
6663     Ops.push_back(Ops[0]);
6664     Ops.erase(Ops.begin());
6665     Ops[2] = Builder.CreateZExt(Ops[2],
6666                 llvm::IntegerType::get(getLLVMContext(), 64));
6667     llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
6668     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_st2lane, Tys),
6669                         Ops, "");
6670   }
6671   case NEON::BI__builtin_neon_vst3_v:
6672   case NEON::BI__builtin_neon_vst3q_v: {
6673     Ops.push_back(Ops[0]);
6674     Ops.erase(Ops.begin());
6675     llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
6676     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_st3, Tys),
6677                         Ops, "");
6678   }
6679   case NEON::BI__builtin_neon_vst3_lane_v:
6680   case NEON::BI__builtin_neon_vst3q_lane_v: {
6681     Ops.push_back(Ops[0]);
6682     Ops.erase(Ops.begin());
6683     Ops[3] = Builder.CreateZExt(Ops[3],
6684                 llvm::IntegerType::get(getLLVMContext(), 64));
6685     llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
6686     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_st3lane, Tys),
6687                         Ops, "");
6688   }
6689   case NEON::BI__builtin_neon_vst4_v:
6690   case NEON::BI__builtin_neon_vst4q_v: {
6691     Ops.push_back(Ops[0]);
6692     Ops.erase(Ops.begin());
6693     llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
6694     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_st4, Tys),
6695                         Ops, "");
6696   }
6697   case NEON::BI__builtin_neon_vst4_lane_v:
6698   case NEON::BI__builtin_neon_vst4q_lane_v: {
6699     Ops.push_back(Ops[0]);
6700     Ops.erase(Ops.begin());
6701     Ops[4] = Builder.CreateZExt(Ops[4],
6702                 llvm::IntegerType::get(getLLVMContext(), 64));
6703     llvm::Type *Tys[2] = { VTy, Ops[5]->getType() };
6704     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_st4lane, Tys),
6705                         Ops, "");
6706   }
6707   case NEON::BI__builtin_neon_vtrn_v:
6708   case NEON::BI__builtin_neon_vtrnq_v: {
6709     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
6710     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
6711     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
6712     Value *SV = 0;
6713 
6714     for (unsigned vi = 0; vi != 2; ++vi) {
6715       SmallVector<Constant*, 16> Indices;
6716       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
6717         Indices.push_back(ConstantInt::get(Int32Ty, i+vi));
6718         Indices.push_back(ConstantInt::get(Int32Ty, i+e+vi));
6719       }
6720       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ops[0], vi);
6721       SV = llvm::ConstantVector::get(Indices);
6722       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vtrn");
6723       SV = Builder.CreateStore(SV, Addr);
6724     }
6725     return SV;
6726   }
6727   case NEON::BI__builtin_neon_vuzp_v:
6728   case NEON::BI__builtin_neon_vuzpq_v: {
6729     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
6730     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
6731     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
6732     Value *SV = 0;
6733 
6734     for (unsigned vi = 0; vi != 2; ++vi) {
6735       SmallVector<Constant*, 16> Indices;
6736       for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
6737         Indices.push_back(ConstantInt::get(Int32Ty, 2*i+vi));
6738 
6739       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ops[0], vi);
6740       SV = llvm::ConstantVector::get(Indices);
6741       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vuzp");
6742       SV = Builder.CreateStore(SV, Addr);
6743     }
6744     return SV;
6745   }
6746   case NEON::BI__builtin_neon_vzip_v:
6747   case NEON::BI__builtin_neon_vzipq_v: {
6748     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
6749     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
6750     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
6751     Value *SV = 0;
6752 
6753     for (unsigned vi = 0; vi != 2; ++vi) {
6754       SmallVector<Constant*, 16> Indices;
6755       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
6756         Indices.push_back(ConstantInt::get(Int32Ty, (i + vi*e) >> 1));
6757         Indices.push_back(ConstantInt::get(Int32Ty, ((i + vi*e) >> 1)+e));
6758       }
6759       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ops[0], vi);
6760       SV = llvm::ConstantVector::get(Indices);
6761       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vzip");
6762       SV = Builder.CreateStore(SV, Addr);
6763     }
6764     return SV;
6765   }
6766   case NEON::BI__builtin_neon_vqtbl1q_v: {
6767     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_tbl1, Ty),
6768                         Ops, "vtbl1");
6769   }
6770   case NEON::BI__builtin_neon_vqtbl2q_v: {
6771     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_tbl2, Ty),
6772                         Ops, "vtbl2");
6773   }
6774   case NEON::BI__builtin_neon_vqtbl3q_v: {
6775     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_tbl3, Ty),
6776                         Ops, "vtbl3");
6777   }
6778   case NEON::BI__builtin_neon_vqtbl4q_v: {
6779     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_tbl4, Ty),
6780                         Ops, "vtbl4");
6781   }
6782   case NEON::BI__builtin_neon_vqtbx1q_v: {
6783     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_tbx1, Ty),
6784                         Ops, "vtbx1");
6785   }
6786   case NEON::BI__builtin_neon_vqtbx2q_v: {
6787     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_tbx2, Ty),
6788                         Ops, "vtbx2");
6789   }
6790   case NEON::BI__builtin_neon_vqtbx3q_v: {
6791     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_tbx3, Ty),
6792                         Ops, "vtbx3");
6793   }
6794   case NEON::BI__builtin_neon_vqtbx4q_v: {
6795     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_tbx4, Ty),
6796                         Ops, "vtbx4");
6797   }
6798   case NEON::BI__builtin_neon_vsqadd_v:
6799   case NEON::BI__builtin_neon_vsqaddq_v: {
6800     Int = Intrinsic::arm64_neon_usqadd;
6801     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqadd");
6802   }
6803   case NEON::BI__builtin_neon_vuqadd_v:
6804   case NEON::BI__builtin_neon_vuqaddq_v: {
6805     Int = Intrinsic::arm64_neon_suqadd;
6806     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd");
6807   }
6808   }
6809 }
6810 
6811 llvm::Value *CodeGenFunction::
6812 BuildVector(ArrayRef<llvm::Value*> Ops) {
6813   assert((Ops.size() & (Ops.size() - 1)) == 0 &&
6814          "Not a power-of-two sized vector!");
6815   bool AllConstants = true;
6816   for (unsigned i = 0, e = Ops.size(); i != e && AllConstants; ++i)
6817     AllConstants &= isa<Constant>(Ops[i]);
6818 
6819   // If this is a constant vector, create a ConstantVector.
6820   if (AllConstants) {
6821     SmallVector<llvm::Constant*, 16> CstOps;
6822     for (unsigned i = 0, e = Ops.size(); i != e; ++i)
6823       CstOps.push_back(cast<Constant>(Ops[i]));
6824     return llvm::ConstantVector::get(CstOps);
6825   }
6826 
6827   // Otherwise, insertelement the values to build the vector.
6828   Value *Result =
6829     llvm::UndefValue::get(llvm::VectorType::get(Ops[0]->getType(), Ops.size()));
6830 
6831   for (unsigned i = 0, e = Ops.size(); i != e; ++i)
6832     Result = Builder.CreateInsertElement(Result, Ops[i], Builder.getInt32(i));
6833 
6834   return Result;
6835 }
6836 
6837 Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
6838                                            const CallExpr *E) {
6839   SmallVector<Value*, 4> Ops;
6840 
6841   // Find out if any arguments are required to be integer constant expressions.
6842   unsigned ICEArguments = 0;
6843   ASTContext::GetBuiltinTypeError Error;
6844   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
6845   assert(Error == ASTContext::GE_None && "Should not codegen an error");
6846 
6847   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
6848     // If this is a normal argument, just emit it as a scalar.
6849     if ((ICEArguments & (1 << i)) == 0) {
6850       Ops.push_back(EmitScalarExpr(E->getArg(i)));
6851       continue;
6852     }
6853 
6854     // If this is required to be a constant, constant fold it so that we know
6855     // that the generated intrinsic gets a ConstantInt.
6856     llvm::APSInt Result;
6857     bool IsConst = E->getArg(i)->isIntegerConstantExpr(Result, getContext());
6858     assert(IsConst && "Constant arg isn't actually constant?"); (void)IsConst;
6859     Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), Result));
6860   }
6861 
6862   switch (BuiltinID) {
6863   default: return 0;
6864   case X86::BI_mm_prefetch: {
6865     Value *Address = EmitScalarExpr(E->getArg(0));
6866     Value *RW = ConstantInt::get(Int32Ty, 0);
6867     Value *Locality = EmitScalarExpr(E->getArg(1));
6868     Value *Data = ConstantInt::get(Int32Ty, 1);
6869     Value *F = CGM.getIntrinsic(Intrinsic::prefetch);
6870     return Builder.CreateCall4(F, Address, RW, Locality, Data);
6871   }
6872   case X86::BI__builtin_ia32_vec_init_v8qi:
6873   case X86::BI__builtin_ia32_vec_init_v4hi:
6874   case X86::BI__builtin_ia32_vec_init_v2si:
6875     return Builder.CreateBitCast(BuildVector(Ops),
6876                                  llvm::Type::getX86_MMXTy(getLLVMContext()));
6877   case X86::BI__builtin_ia32_vec_ext_v2si:
6878     return Builder.CreateExtractElement(Ops[0],
6879                                   llvm::ConstantInt::get(Ops[1]->getType(), 0));
6880   case X86::BI__builtin_ia32_ldmxcsr: {
6881     Value *Tmp = CreateMemTemp(E->getArg(0)->getType());
6882     Builder.CreateStore(Ops[0], Tmp);
6883     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_ldmxcsr),
6884                               Builder.CreateBitCast(Tmp, Int8PtrTy));
6885   }
6886   case X86::BI__builtin_ia32_stmxcsr: {
6887     Value *Tmp = CreateMemTemp(E->getType());
6888     Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_stmxcsr),
6889                        Builder.CreateBitCast(Tmp, Int8PtrTy));
6890     return Builder.CreateLoad(Tmp, "stmxcsr");
6891   }
6892   case X86::BI__builtin_ia32_storehps:
6893   case X86::BI__builtin_ia32_storelps: {
6894     llvm::Type *PtrTy = llvm::PointerType::getUnqual(Int64Ty);
6895     llvm::Type *VecTy = llvm::VectorType::get(Int64Ty, 2);
6896 
6897     // cast val v2i64
6898     Ops[1] = Builder.CreateBitCast(Ops[1], VecTy, "cast");
6899 
6900     // extract (0, 1)
6901     unsigned Index = BuiltinID == X86::BI__builtin_ia32_storelps ? 0 : 1;
6902     llvm::Value *Idx = llvm::ConstantInt::get(Int32Ty, Index);
6903     Ops[1] = Builder.CreateExtractElement(Ops[1], Idx, "extract");
6904 
6905     // cast pointer to i64 & store
6906     Ops[0] = Builder.CreateBitCast(Ops[0], PtrTy);
6907     return Builder.CreateStore(Ops[1], Ops[0]);
6908   }
6909   case X86::BI__builtin_ia32_palignr: {
6910     unsigned shiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
6911 
6912     // If palignr is shifting the pair of input vectors less than 9 bytes,
6913     // emit a shuffle instruction.
6914     if (shiftVal <= 8) {
6915       SmallVector<llvm::Constant*, 8> Indices;
6916       for (unsigned i = 0; i != 8; ++i)
6917         Indices.push_back(llvm::ConstantInt::get(Int32Ty, shiftVal + i));
6918 
6919       Value* SV = llvm::ConstantVector::get(Indices);
6920       return Builder.CreateShuffleVector(Ops[1], Ops[0], SV, "palignr");
6921     }
6922 
6923     // If palignr is shifting the pair of input vectors more than 8 but less
6924     // than 16 bytes, emit a logical right shift of the destination.
6925     if (shiftVal < 16) {
6926       // MMX has these as 1 x i64 vectors for some odd optimization reasons.
6927       llvm::Type *VecTy = llvm::VectorType::get(Int64Ty, 1);
6928 
6929       Ops[0] = Builder.CreateBitCast(Ops[0], VecTy, "cast");
6930       Ops[1] = llvm::ConstantInt::get(VecTy, (shiftVal-8) * 8);
6931 
6932       // create i32 constant
6933       llvm::Function *F = CGM.getIntrinsic(Intrinsic::x86_mmx_psrl_q);
6934       return Builder.CreateCall(F, makeArrayRef(&Ops[0], 2), "palignr");
6935     }
6936 
6937     // If palignr is shifting the pair of vectors more than 16 bytes, emit zero.
6938     return llvm::Constant::getNullValue(ConvertType(E->getType()));
6939   }
6940   case X86::BI__builtin_ia32_palignr128: {
6941     unsigned shiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
6942 
6943     // If palignr is shifting the pair of input vectors less than 17 bytes,
6944     // emit a shuffle instruction.
6945     if (shiftVal <= 16) {
6946       SmallVector<llvm::Constant*, 16> Indices;
6947       for (unsigned i = 0; i != 16; ++i)
6948         Indices.push_back(llvm::ConstantInt::get(Int32Ty, shiftVal + i));
6949 
6950       Value* SV = llvm::ConstantVector::get(Indices);
6951       return Builder.CreateShuffleVector(Ops[1], Ops[0], SV, "palignr");
6952     }
6953 
6954     // If palignr is shifting the pair of input vectors more than 16 but less
6955     // than 32 bytes, emit a logical right shift of the destination.
6956     if (shiftVal < 32) {
6957       llvm::Type *VecTy = llvm::VectorType::get(Int64Ty, 2);
6958 
6959       Ops[0] = Builder.CreateBitCast(Ops[0], VecTy, "cast");
6960       Ops[1] = llvm::ConstantInt::get(Int32Ty, (shiftVal-16) * 8);
6961 
6962       // create i32 constant
6963       llvm::Function *F = CGM.getIntrinsic(Intrinsic::x86_sse2_psrl_dq);
6964       return Builder.CreateCall(F, makeArrayRef(&Ops[0], 2), "palignr");
6965     }
6966 
6967     // If palignr is shifting the pair of vectors more than 32 bytes, emit zero.
6968     return llvm::Constant::getNullValue(ConvertType(E->getType()));
6969   }
6970   case X86::BI__builtin_ia32_palignr256: {
6971     unsigned shiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
6972 
6973     // If palignr is shifting the pair of input vectors less than 17 bytes,
6974     // emit a shuffle instruction.
6975     if (shiftVal <= 16) {
6976       SmallVector<llvm::Constant*, 32> Indices;
6977       // 256-bit palignr operates on 128-bit lanes so we need to handle that
6978       for (unsigned l = 0; l != 2; ++l) {
6979         unsigned LaneStart = l * 16;
6980         unsigned LaneEnd = (l+1) * 16;
6981         for (unsigned i = 0; i != 16; ++i) {
6982           unsigned Idx = shiftVal + i + LaneStart;
6983           if (Idx >= LaneEnd) Idx += 16; // end of lane, switch operand
6984           Indices.push_back(llvm::ConstantInt::get(Int32Ty, Idx));
6985         }
6986       }
6987 
6988       Value* SV = llvm::ConstantVector::get(Indices);
6989       return Builder.CreateShuffleVector(Ops[1], Ops[0], SV, "palignr");
6990     }
6991 
6992     // If palignr is shifting the pair of input vectors more than 16 but less
6993     // than 32 bytes, emit a logical right shift of the destination.
6994     if (shiftVal < 32) {
6995       llvm::Type *VecTy = llvm::VectorType::get(Int64Ty, 4);
6996 
6997       Ops[0] = Builder.CreateBitCast(Ops[0], VecTy, "cast");
6998       Ops[1] = llvm::ConstantInt::get(Int32Ty, (shiftVal-16) * 8);
6999 
7000       // create i32 constant
7001       llvm::Function *F = CGM.getIntrinsic(Intrinsic::x86_avx2_psrl_dq);
7002       return Builder.CreateCall(F, makeArrayRef(&Ops[0], 2), "palignr");
7003     }
7004 
7005     // If palignr is shifting the pair of vectors more than 32 bytes, emit zero.
7006     return llvm::Constant::getNullValue(ConvertType(E->getType()));
7007   }
7008   case X86::BI__builtin_ia32_movntps:
7009   case X86::BI__builtin_ia32_movntps256:
7010   case X86::BI__builtin_ia32_movntpd:
7011   case X86::BI__builtin_ia32_movntpd256:
7012   case X86::BI__builtin_ia32_movntdq:
7013   case X86::BI__builtin_ia32_movntdq256:
7014   case X86::BI__builtin_ia32_movnti:
7015   case X86::BI__builtin_ia32_movnti64: {
7016     llvm::MDNode *Node = llvm::MDNode::get(getLLVMContext(),
7017                                            Builder.getInt32(1));
7018 
7019     // Convert the type of the pointer to a pointer to the stored type.
7020     Value *BC = Builder.CreateBitCast(Ops[0],
7021                                 llvm::PointerType::getUnqual(Ops[1]->getType()),
7022                                       "cast");
7023     StoreInst *SI = Builder.CreateStore(Ops[1], BC);
7024     SI->setMetadata(CGM.getModule().getMDKindID("nontemporal"), Node);
7025 
7026     // If the operand is an integer, we can't assume alignment. Otherwise,
7027     // assume natural alignment.
7028     QualType ArgTy = E->getArg(1)->getType();
7029     unsigned Align;
7030     if (ArgTy->isIntegerType())
7031       Align = 1;
7032     else
7033       Align = getContext().getTypeSizeInChars(ArgTy).getQuantity();
7034     SI->setAlignment(Align);
7035     return SI;
7036   }
7037   // 3DNow!
7038   case X86::BI__builtin_ia32_pswapdsf:
7039   case X86::BI__builtin_ia32_pswapdsi: {
7040     const char *name = 0;
7041     Intrinsic::ID ID = Intrinsic::not_intrinsic;
7042     switch(BuiltinID) {
7043     default: llvm_unreachable("Unsupported intrinsic!");
7044     case X86::BI__builtin_ia32_pswapdsf:
7045     case X86::BI__builtin_ia32_pswapdsi:
7046       name = "pswapd";
7047       ID = Intrinsic::x86_3dnowa_pswapd;
7048       break;
7049     }
7050     llvm::Type *MMXTy = llvm::Type::getX86_MMXTy(getLLVMContext());
7051     Ops[0] = Builder.CreateBitCast(Ops[0], MMXTy, "cast");
7052     llvm::Function *F = CGM.getIntrinsic(ID);
7053     return Builder.CreateCall(F, Ops, name);
7054   }
7055   case X86::BI__builtin_ia32_rdrand16_step:
7056   case X86::BI__builtin_ia32_rdrand32_step:
7057   case X86::BI__builtin_ia32_rdrand64_step:
7058   case X86::BI__builtin_ia32_rdseed16_step:
7059   case X86::BI__builtin_ia32_rdseed32_step:
7060   case X86::BI__builtin_ia32_rdseed64_step: {
7061     Intrinsic::ID ID;
7062     switch (BuiltinID) {
7063     default: llvm_unreachable("Unsupported intrinsic!");
7064     case X86::BI__builtin_ia32_rdrand16_step:
7065       ID = Intrinsic::x86_rdrand_16;
7066       break;
7067     case X86::BI__builtin_ia32_rdrand32_step:
7068       ID = Intrinsic::x86_rdrand_32;
7069       break;
7070     case X86::BI__builtin_ia32_rdrand64_step:
7071       ID = Intrinsic::x86_rdrand_64;
7072       break;
7073     case X86::BI__builtin_ia32_rdseed16_step:
7074       ID = Intrinsic::x86_rdseed_16;
7075       break;
7076     case X86::BI__builtin_ia32_rdseed32_step:
7077       ID = Intrinsic::x86_rdseed_32;
7078       break;
7079     case X86::BI__builtin_ia32_rdseed64_step:
7080       ID = Intrinsic::x86_rdseed_64;
7081       break;
7082     }
7083 
7084     Value *Call = Builder.CreateCall(CGM.getIntrinsic(ID));
7085     Builder.CreateStore(Builder.CreateExtractValue(Call, 0), Ops[0]);
7086     return Builder.CreateExtractValue(Call, 1);
7087   }
7088   // AVX2 broadcast
7089   case X86::BI__builtin_ia32_vbroadcastsi256: {
7090     Value *VecTmp = CreateMemTemp(E->getArg(0)->getType());
7091     Builder.CreateStore(Ops[0], VecTmp);
7092     Value *F = CGM.getIntrinsic(Intrinsic::x86_avx2_vbroadcasti128);
7093     return Builder.CreateCall(F, Builder.CreateBitCast(VecTmp, Int8PtrTy));
7094   }
7095   }
7096 }
7097 
7098 
7099 Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
7100                                            const CallExpr *E) {
7101   SmallVector<Value*, 4> Ops;
7102 
7103   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++)
7104     Ops.push_back(EmitScalarExpr(E->getArg(i)));
7105 
7106   Intrinsic::ID ID = Intrinsic::not_intrinsic;
7107 
7108   switch (BuiltinID) {
7109   default: return 0;
7110 
7111   // vec_ld, vec_lvsl, vec_lvsr
7112   case PPC::BI__builtin_altivec_lvx:
7113   case PPC::BI__builtin_altivec_lvxl:
7114   case PPC::BI__builtin_altivec_lvebx:
7115   case PPC::BI__builtin_altivec_lvehx:
7116   case PPC::BI__builtin_altivec_lvewx:
7117   case PPC::BI__builtin_altivec_lvsl:
7118   case PPC::BI__builtin_altivec_lvsr:
7119   {
7120     Ops[1] = Builder.CreateBitCast(Ops[1], Int8PtrTy);
7121 
7122     Ops[0] = Builder.CreateGEP(Ops[1], Ops[0]);
7123     Ops.pop_back();
7124 
7125     switch (BuiltinID) {
7126     default: llvm_unreachable("Unsupported ld/lvsl/lvsr intrinsic!");
7127     case PPC::BI__builtin_altivec_lvx:
7128       ID = Intrinsic::ppc_altivec_lvx;
7129       break;
7130     case PPC::BI__builtin_altivec_lvxl:
7131       ID = Intrinsic::ppc_altivec_lvxl;
7132       break;
7133     case PPC::BI__builtin_altivec_lvebx:
7134       ID = Intrinsic::ppc_altivec_lvebx;
7135       break;
7136     case PPC::BI__builtin_altivec_lvehx:
7137       ID = Intrinsic::ppc_altivec_lvehx;
7138       break;
7139     case PPC::BI__builtin_altivec_lvewx:
7140       ID = Intrinsic::ppc_altivec_lvewx;
7141       break;
7142     case PPC::BI__builtin_altivec_lvsl:
7143       ID = Intrinsic::ppc_altivec_lvsl;
7144       break;
7145     case PPC::BI__builtin_altivec_lvsr:
7146       ID = Intrinsic::ppc_altivec_lvsr;
7147       break;
7148     }
7149     llvm::Function *F = CGM.getIntrinsic(ID);
7150     return Builder.CreateCall(F, Ops, "");
7151   }
7152 
7153   // vec_st
7154   case PPC::BI__builtin_altivec_stvx:
7155   case PPC::BI__builtin_altivec_stvxl:
7156   case PPC::BI__builtin_altivec_stvebx:
7157   case PPC::BI__builtin_altivec_stvehx:
7158   case PPC::BI__builtin_altivec_stvewx:
7159   {
7160     Ops[2] = Builder.CreateBitCast(Ops[2], Int8PtrTy);
7161     Ops[1] = Builder.CreateGEP(Ops[2], Ops[1]);
7162     Ops.pop_back();
7163 
7164     switch (BuiltinID) {
7165     default: llvm_unreachable("Unsupported st intrinsic!");
7166     case PPC::BI__builtin_altivec_stvx:
7167       ID = Intrinsic::ppc_altivec_stvx;
7168       break;
7169     case PPC::BI__builtin_altivec_stvxl:
7170       ID = Intrinsic::ppc_altivec_stvxl;
7171       break;
7172     case PPC::BI__builtin_altivec_stvebx:
7173       ID = Intrinsic::ppc_altivec_stvebx;
7174       break;
7175     case PPC::BI__builtin_altivec_stvehx:
7176       ID = Intrinsic::ppc_altivec_stvehx;
7177       break;
7178     case PPC::BI__builtin_altivec_stvewx:
7179       ID = Intrinsic::ppc_altivec_stvewx;
7180       break;
7181     }
7182     llvm::Function *F = CGM.getIntrinsic(ID);
7183     return Builder.CreateCall(F, Ops, "");
7184   }
7185   }
7186 }
7187