1 //===---- CGBuiltin.cpp - Emit LLVM Code for builtins ---------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This contains code to emit Builtin calls as LLVM code.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "CGCXXABI.h"
15 #include "CGObjCRuntime.h"
16 #include "CGOpenCLRuntime.h"
17 #include "CGRecordLayout.h"
18 #include "CodeGenFunction.h"
19 #include "CodeGenModule.h"
20 #include "ConstantEmitter.h"
21 #include "TargetInfo.h"
22 #include "clang/AST/ASTContext.h"
23 #include "clang/AST/Decl.h"
24 #include "clang/AST/OSLog.h"
25 #include "clang/Basic/TargetBuiltins.h"
26 #include "clang/Basic/TargetInfo.h"
27 #include "clang/CodeGen/CGFunctionInfo.h"
28 #include "llvm/ADT/StringExtras.h"
29 #include "llvm/IR/CallSite.h"
30 #include "llvm/IR/DataLayout.h"
31 #include "llvm/IR/InlineAsm.h"
32 #include "llvm/IR/Intrinsics.h"
33 #include "llvm/IR/MDBuilder.h"
34 #include "llvm/Support/ConvertUTF.h"
35 #include "llvm/Support/ScopedPrinter.h"
36 #include "llvm/Support/TargetParser.h"
37 #include <sstream>
38 
39 using namespace clang;
40 using namespace CodeGen;
41 using namespace llvm;
42 
/// Clamp \p Value to the inclusive range [\p Low, \p High].
static int64_t clamp(int64_t Value, int64_t Low, int64_t High) {
  int64_t AtLeastLow = std::max(Low, Value);
  return std::min(High, AtLeastLow);
}
47 
48 /// getBuiltinLibFunction - Given a builtin id for a function like
49 /// "__builtin_fabsf", return a Function* for "fabsf".
50 llvm::Constant *CodeGenModule::getBuiltinLibFunction(const FunctionDecl *FD,
51                                                      unsigned BuiltinID) {
52   assert(Context.BuiltinInfo.isLibFunction(BuiltinID));
53 
54   // Get the name, skip over the __builtin_ prefix (if necessary).
55   StringRef Name;
56   GlobalDecl D(FD);
57 
58   // If the builtin has been declared explicitly with an assembler label,
59   // use the mangled name. This differs from the plain label on platforms
60   // that prefix labels.
61   if (FD->hasAttr<AsmLabelAttr>())
62     Name = getMangledName(D);
63   else
64     Name = Context.BuiltinInfo.getName(BuiltinID) + 10;
65 
66   llvm::FunctionType *Ty =
67     cast<llvm::FunctionType>(getTypes().ConvertType(FD->getType()));
68 
69   return GetOrCreateLLVMFunction(Name, Ty, D, /*ForVTable=*/false);
70 }
71 
72 /// Emit the conversions required to turn the given value into an
73 /// integer of the given size.
74 static Value *EmitToInt(CodeGenFunction &CGF, llvm::Value *V,
75                         QualType T, llvm::IntegerType *IntType) {
76   V = CGF.EmitToMemory(V, T);
77 
78   if (V->getType()->isPointerTy())
79     return CGF.Builder.CreatePtrToInt(V, IntType);
80 
81   assert(V->getType() == IntType);
82   return V;
83 }
84 
85 static Value *EmitFromInt(CodeGenFunction &CGF, llvm::Value *V,
86                           QualType T, llvm::Type *ResultType) {
87   V = CGF.EmitFromMemory(V, T);
88 
89   if (ResultType->isPointerTy())
90     return CGF.Builder.CreateIntToPtr(V, ResultType);
91 
92   assert(V->getType() == ResultType);
93   return V;
94 }
95 
/// Utility to insert an atomic instruction based on Intrinsic::ID
/// and the expression node.
///
/// Emits an `atomicrmw <Kind>` on arg0 (the address) with arg1 (the value),
/// going through an integer type of the expression's width, and converts the
/// returned old value back to the expression's type.
static Value *MakeBinaryAtomicValue(
    CodeGenFunction &CGF, llvm::AtomicRMWInst::BinOp Kind, const CallExpr *E,
    AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent) {
  QualType T = E->getType();
  assert(E->getArg(0)->getType()->isPointerType());
  assert(CGF.getContext().hasSameUnqualifiedType(T,
                                  E->getArg(0)->getType()->getPointeeType()));
  assert(CGF.getContext().hasSameUnqualifiedType(T, E->getArg(1)->getType()));

  llvm::Value *DestPtr = CGF.EmitScalarExpr(E->getArg(0));
  unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();

  // The atomicrmw operates on an integer of the same width as T, in the
  // destination pointer's address space.
  llvm::IntegerType *IntType =
    llvm::IntegerType::get(CGF.getLLVMContext(),
                           CGF.getContext().getTypeSize(T));
  llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);

  llvm::Value *Args[2];
  Args[0] = CGF.Builder.CreateBitCast(DestPtr, IntPtrType);
  Args[1] = CGF.EmitScalarExpr(E->getArg(1));
  llvm::Type *ValueType = Args[1]->getType();
  Args[1] = EmitToInt(CGF, Args[1], T, IntType);

  llvm::Value *Result = CGF.Builder.CreateAtomicRMW(
      Kind, Args[0], Args[1], Ordering);
  // atomicrmw yields the *previous* value; convert it back to the source type.
  return EmitFromInt(CGF, Result, T, ValueType);
}
125 
/// Emit a nontemporal store of arg0 through the pointer in arg1
/// (__builtin_nontemporal_store). Returns nullptr since the builtin produces
/// no value.
static Value *EmitNontemporalStore(CodeGenFunction &CGF, const CallExpr *E) {
  Value *Val = CGF.EmitScalarExpr(E->getArg(0));
  Value *Address = CGF.EmitScalarExpr(E->getArg(1));

  // Convert the type of the pointer to a pointer to the stored type.
  Val = CGF.EmitToMemory(Val, E->getArg(0)->getType());
  Value *BC = CGF.Builder.CreateBitCast(
      Address, llvm::PointerType::getUnqual(Val->getType()), "cast");
  LValue LV = CGF.MakeNaturalAlignAddrLValue(BC, E->getArg(0)->getType());
  // Mark the store !nontemporal so it can bypass the cache hierarchy.
  LV.setNontemporal(true);
  CGF.EmitStoreOfScalar(Val, LV, false);
  return nullptr;
}
139 
140 static Value *EmitNontemporalLoad(CodeGenFunction &CGF, const CallExpr *E) {
141   Value *Address = CGF.EmitScalarExpr(E->getArg(0));
142 
143   LValue LV = CGF.MakeNaturalAlignAddrLValue(Address, E->getType());
144   LV.setNontemporal(true);
145   return CGF.EmitLoadOfScalar(LV, E->getExprLoc());
146 }
147 
148 static RValue EmitBinaryAtomic(CodeGenFunction &CGF,
149                                llvm::AtomicRMWInst::BinOp Kind,
150                                const CallExpr *E) {
151   return RValue::get(MakeBinaryAtomicValue(CGF, Kind, E));
152 }
153 
/// Utility to insert an atomic instruction based on Intrinsic::ID and
/// the expression node, where the return value is the result of the
/// operation (i.e. the *new* value, recomputed by applying \p Op to the old
/// value the atomicrmw returned; \p Invert additionally complements the
/// result, e.g. for the nand variants).
static RValue EmitBinaryAtomicPost(CodeGenFunction &CGF,
                                   llvm::AtomicRMWInst::BinOp Kind,
                                   const CallExpr *E,
                                   Instruction::BinaryOps Op,
                                   bool Invert = false) {
  QualType T = E->getType();
  assert(E->getArg(0)->getType()->isPointerType());
  assert(CGF.getContext().hasSameUnqualifiedType(T,
                                  E->getArg(0)->getType()->getPointeeType()));
  assert(CGF.getContext().hasSameUnqualifiedType(T, E->getArg(1)->getType()));

  llvm::Value *DestPtr = CGF.EmitScalarExpr(E->getArg(0));
  unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();

  // Operate on an integer of the same width as T, in the pointer's address
  // space.
  llvm::IntegerType *IntType =
    llvm::IntegerType::get(CGF.getLLVMContext(),
                           CGF.getContext().getTypeSize(T));
  llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);

  llvm::Value *Args[2];
  Args[1] = CGF.EmitScalarExpr(E->getArg(1));
  llvm::Type *ValueType = Args[1]->getType();
  Args[1] = EmitToInt(CGF, Args[1], T, IntType);
  Args[0] = CGF.Builder.CreateBitCast(DestPtr, IntPtrType);

  llvm::Value *Result = CGF.Builder.CreateAtomicRMW(
      Kind, Args[0], Args[1], llvm::AtomicOrdering::SequentiallyConsistent);
  // atomicrmw returned the old value; re-apply the operation to produce the
  // post-operation value the builtin is specified to return.
  Result = CGF.Builder.CreateBinOp(Op, Result, Args[1]);
  if (Invert)
    Result = CGF.Builder.CreateBinOp(llvm::Instruction::Xor, Result,
                                     llvm::ConstantInt::get(IntType, -1));
  Result = EmitFromInt(CGF, Result, T, ValueType);
  return RValue::get(Result);
}
191 
/// Utility to insert an atomic cmpxchg instruction.
///
/// @param CGF The current codegen function.
/// @param E   Builtin call expression to convert to cmpxchg.
///            arg0 - address to operate on
///            arg1 - value to compare with
///            arg2 - new value
/// @param ReturnBool Specifies whether to return success flag of
///                   cmpxchg result or the old value.
///
/// @returns result of cmpxchg, according to ReturnBool
///
/// Note: In order to lower Microsoft's _InterlockedCompareExchange* intrinsics
/// invoke the function EmitAtomicCmpXchgForMSIntrin.
static Value *MakeAtomicCmpXchgValue(CodeGenFunction &CGF, const CallExpr *E,
                                     bool ReturnBool) {
  // When returning the success flag, the call's type is the flag type (e.g.
  // bool), so the element type comes from the comparand argument instead.
  QualType T = ReturnBool ? E->getArg(1)->getType() : E->getType();
  llvm::Value *DestPtr = CGF.EmitScalarExpr(E->getArg(0));
  unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();

  // cmpxchg requires integer operands; go through an integer of T's width.
  llvm::IntegerType *IntType = llvm::IntegerType::get(
      CGF.getLLVMContext(), CGF.getContext().getTypeSize(T));
  llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);

  Value *Args[3];
  Args[0] = CGF.Builder.CreateBitCast(DestPtr, IntPtrType);
  Args[1] = CGF.EmitScalarExpr(E->getArg(1));
  llvm::Type *ValueType = Args[1]->getType();
  Args[1] = EmitToInt(CGF, Args[1], T, IntType);
  Args[2] = EmitToInt(CGF, CGF.EmitScalarExpr(E->getArg(2)), T, IntType);

  Value *Pair = CGF.Builder.CreateAtomicCmpXchg(
      Args[0], Args[1], Args[2], llvm::AtomicOrdering::SequentiallyConsistent,
      llvm::AtomicOrdering::SequentiallyConsistent);
  if (ReturnBool)
    // Extract boolean success flag and zext it to int.
    return CGF.Builder.CreateZExt(CGF.Builder.CreateExtractValue(Pair, 1),
                                  CGF.ConvertType(E->getType()));
  else
    // Extract old value and emit it using the same type as compare value.
    return EmitFromInt(CGF, CGF.Builder.CreateExtractValue(Pair, 0), T,
                       ValueType);
}
235 
/// This function should be invoked to emit atomic cmpxchg for Microsoft's
/// _InterlockedCompareExchange* intrinsics which have the following signature:
/// T _InterlockedCompareExchange(T volatile *Destination,
///                               T Exchange,
///                               T Comparand);
///
/// Whereas the llvm 'cmpxchg' instruction has the following syntax:
/// cmpxchg *Destination, Comparand, Exchange.
/// So we need to swap Comparand and Exchange when invoking
/// CreateAtomicCmpXchg. That is the reason we could not use the above utility
/// function MakeAtomicCmpXchgValue since it expects the arguments to be
/// already swapped.

static
Value *EmitAtomicCmpXchgForMSIntrin(CodeGenFunction &CGF, const CallExpr *E,
    AtomicOrdering SuccessOrdering = AtomicOrdering::SequentiallyConsistent) {
  assert(E->getArg(0)->getType()->isPointerType());
  assert(CGF.getContext().hasSameUnqualifiedType(
      E->getType(), E->getArg(0)->getType()->getPointeeType()));
  assert(CGF.getContext().hasSameUnqualifiedType(E->getType(),
                                                 E->getArg(1)->getType()));
  assert(CGF.getContext().hasSameUnqualifiedType(E->getType(),
                                                 E->getArg(2)->getType()));

  // Note the operand order: arg2 (Comparand) becomes cmpxchg's compare
  // operand, arg1 (Exchange) its new value -- see the comment above.
  auto *Destination = CGF.EmitScalarExpr(E->getArg(0));
  auto *Comparand = CGF.EmitScalarExpr(E->getArg(2));
  auto *Exchange = CGF.EmitScalarExpr(E->getArg(1));

  // For Release ordering, the failure ordering should be Monotonic.
  auto FailureOrdering = SuccessOrdering == AtomicOrdering::Release ?
                         AtomicOrdering::Monotonic :
                         SuccessOrdering;

  auto *Result = CGF.Builder.CreateAtomicCmpXchg(
                   Destination, Comparand, Exchange,
                   SuccessOrdering, FailureOrdering);
  // The destination parameter is declared volatile in the MS signature.
  Result->setVolatile(true);
  // The intrinsic returns the initial (old) value of *Destination.
  return CGF.Builder.CreateExtractValue(Result, 0);
}
275 
276 static Value *EmitAtomicIncrementValue(CodeGenFunction &CGF, const CallExpr *E,
277     AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent) {
278   assert(E->getArg(0)->getType()->isPointerType());
279 
280   auto *IntTy = CGF.ConvertType(E->getType());
281   auto *Result = CGF.Builder.CreateAtomicRMW(
282                    AtomicRMWInst::Add,
283                    CGF.EmitScalarExpr(E->getArg(0)),
284                    ConstantInt::get(IntTy, 1),
285                    Ordering);
286   return CGF.Builder.CreateAdd(Result, ConstantInt::get(IntTy, 1));
287 }
288 
289 static Value *EmitAtomicDecrementValue(CodeGenFunction &CGF, const CallExpr *E,
290     AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent) {
291   assert(E->getArg(0)->getType()->isPointerType());
292 
293   auto *IntTy = CGF.ConvertType(E->getType());
294   auto *Result = CGF.Builder.CreateAtomicRMW(
295                    AtomicRMWInst::Sub,
296                    CGF.EmitScalarExpr(E->getArg(0)),
297                    ConstantInt::get(IntTy, 1),
298                    Ordering);
299   return CGF.Builder.CreateSub(Result, ConstantInt::get(IntTy, 1));
300 }
301 
302 // Emit a simple mangled intrinsic that has 1 argument and a return type
303 // matching the argument type.
304 static Value *emitUnaryBuiltin(CodeGenFunction &CGF,
305                                const CallExpr *E,
306                                unsigned IntrinsicID) {
307   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
308 
309   Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
310   return CGF.Builder.CreateCall(F, Src0);
311 }
312 
313 // Emit an intrinsic that has 2 operands of the same type as its result.
314 static Value *emitBinaryBuiltin(CodeGenFunction &CGF,
315                                 const CallExpr *E,
316                                 unsigned IntrinsicID) {
317   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
318   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
319 
320   Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
321   return CGF.Builder.CreateCall(F, { Src0, Src1 });
322 }
323 
324 // Emit an intrinsic that has 3 operands of the same type as its result.
325 static Value *emitTernaryBuiltin(CodeGenFunction &CGF,
326                                  const CallExpr *E,
327                                  unsigned IntrinsicID) {
328   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
329   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
330   llvm::Value *Src2 = CGF.EmitScalarExpr(E->getArg(2));
331 
332   Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
333   return CGF.Builder.CreateCall(F, { Src0, Src1, Src2 });
334 }
335 
336 // Emit an intrinsic that has 1 float or double operand, and 1 integer.
337 static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
338                                const CallExpr *E,
339                                unsigned IntrinsicID) {
340   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
341   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
342 
343   Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
344   return CGF.Builder.CreateCall(F, {Src0, Src1});
345 }
346 
347 /// EmitFAbs - Emit a call to @llvm.fabs().
348 static Value *EmitFAbs(CodeGenFunction &CGF, Value *V) {
349   Value *F = CGF.CGM.getIntrinsic(Intrinsic::fabs, V->getType());
350   llvm::CallInst *Call = CGF.Builder.CreateCall(F, V);
351   Call->setDoesNotAccessMemory();
352   return Call;
353 }
354 
/// Emit the computation of the sign bit for a floating point value. Returns
/// the i1 sign bit value.
///
/// The value is bitcast to an integer of the same width and compared
/// signed-less-than zero, i.e. the result is true iff the top bit is set.
static Value *EmitSignBit(CodeGenFunction &CGF, Value *V) {
  LLVMContext &C = CGF.CGM.getLLVMContext();

  llvm::Type *Ty = V->getType();
  int Width = Ty->getPrimitiveSizeInBits();
  llvm::Type *IntTy = llvm::IntegerType::get(C, Width);
  V = CGF.Builder.CreateBitCast(V, IntTy);
  if (Ty->isPPC_FP128Ty()) {
    // We want the sign bit of the higher-order double. The bitcast we just
    // did works as if the double-double was stored to memory and then
    // read as an i128. The "store" will put the higher-order double in the
    // lower address in both little- and big-Endian modes, but the "load"
    // will treat those bits as a different part of the i128: the low bits in
    // little-Endian, the high bits in big-Endian. Therefore, on big-Endian
    // we need to shift the high bits down to the low before truncating.
    Width >>= 1;
    if (CGF.getTarget().isBigEndian()) {
      Value *ShiftCst = llvm::ConstantInt::get(IntTy, Width);
      V = CGF.Builder.CreateLShr(V, ShiftCst);
    }
    // We are truncating value in order to extract the higher-order
    // double, which we will be using to extract the sign from.
    IntTy = llvm::IntegerType::get(C, Width);
    V = CGF.Builder.CreateTrunc(V, IntTy);
  }
  // A signed comparison against zero tests exactly the top (sign) bit.
  Value *Zero = llvm::Constant::getNullValue(IntTy);
  return CGF.Builder.CreateICmpSLT(V, Zero);
}
385 
/// Emit a direct call to the library function \p calleeValue that implements
/// builtin \p FD, forwarding the original call expression's arguments.
static RValue emitLibraryCall(CodeGenFunction &CGF, const FunctionDecl *FD,
                              const CallExpr *E, llvm::Constant *calleeValue) {
  CGCallee callee = CGCallee::forDirect(calleeValue, GlobalDecl(FD));
  return CGF.EmitCall(E->getCallee()->getType(), callee, E, ReturnValueSlot());
}
391 
392 /// Emit a call to llvm.{sadd,uadd,ssub,usub,smul,umul}.with.overflow.*
393 /// depending on IntrinsicID.
394 ///
395 /// \arg CGF The current codegen function.
396 /// \arg IntrinsicID The ID for the Intrinsic we wish to generate.
397 /// \arg X The first argument to the llvm.*.with.overflow.*.
398 /// \arg Y The second argument to the llvm.*.with.overflow.*.
399 /// \arg Carry The carry returned by the llvm.*.with.overflow.*.
400 /// \returns The result (i.e. sum/product) returned by the intrinsic.
401 static llvm::Value *EmitOverflowIntrinsic(CodeGenFunction &CGF,
402                                           const llvm::Intrinsic::ID IntrinsicID,
403                                           llvm::Value *X, llvm::Value *Y,
404                                           llvm::Value *&Carry) {
405   // Make sure we have integers of the same width.
406   assert(X->getType() == Y->getType() &&
407          "Arguments must be the same type. (Did you forget to make sure both "
408          "arguments have the same integer width?)");
409 
410   llvm::Value *Callee = CGF.CGM.getIntrinsic(IntrinsicID, X->getType());
411   llvm::Value *Tmp = CGF.Builder.CreateCall(Callee, {X, Y});
412   Carry = CGF.Builder.CreateExtractValue(Tmp, 1);
413   return CGF.Builder.CreateExtractValue(Tmp, 0);
414 }
415 
416 static Value *emitRangedBuiltin(CodeGenFunction &CGF,
417                                 unsigned IntrinsicID,
418                                 int low, int high) {
419     llvm::MDBuilder MDHelper(CGF.getLLVMContext());
420     llvm::MDNode *RNode = MDHelper.createRange(APInt(32, low), APInt(32, high));
421     Value *F = CGF.CGM.getIntrinsic(IntrinsicID, {});
422     llvm::Instruction *Call = CGF.Builder.CreateCall(F);
423     Call->setMetadata(llvm::LLVMContext::MD_range, RNode);
424     return Call;
425 }
426 
namespace {
  /// Bit width and signedness of an integer type, as computed by
  /// getIntegerWidthAndSignedness / EncompassingIntegerType below.
  struct WidthAndSignedness {
    unsigned Width;  // width in bits (1 for bool)
    bool Signed;     // true for signed integer types
  };
}
433 
434 static WidthAndSignedness
435 getIntegerWidthAndSignedness(const clang::ASTContext &context,
436                              const clang::QualType Type) {
437   assert(Type->isIntegerType() && "Given type is not an integer.");
438   unsigned Width = Type->isBooleanType() ? 1 : context.getTypeInfo(Type).Width;
439   bool Signed = Type->isSignedIntegerType();
440   return {Width, Signed};
441 }
442 
443 // Given one or more integer types, this function produces an integer type that
444 // encompasses them: any value in one of the given types could be expressed in
445 // the encompassing type.
446 static struct WidthAndSignedness
447 EncompassingIntegerType(ArrayRef<struct WidthAndSignedness> Types) {
448   assert(Types.size() > 0 && "Empty list of types.");
449 
450   // If any of the given types is signed, we must return a signed type.
451   bool Signed = false;
452   for (const auto &Type : Types) {
453     Signed |= Type.Signed;
454   }
455 
456   // The encompassing type must have a width greater than or equal to the width
457   // of the specified types.  Additionally, if the encompassing type is signed,
458   // its width must be strictly greater than the width of any unsigned types
459   // given.
460   unsigned Width = 0;
461   for (const auto &Type : Types) {
462     unsigned MinWidth = Type.Width + (Signed && !Type.Signed);
463     if (Width < MinWidth) {
464       Width = MinWidth;
465     }
466   }
467 
468   return {Width, Signed};
469 }
470 
/// Emit a call to llvm.va_start (IsStart == true) or llvm.va_end on the given
/// va_list pointer.
Value *CodeGenFunction::EmitVAStartEnd(Value *ArgValue, bool IsStart) {
  // Both intrinsics take an i8*; bitcast the va_list pointer if needed.
  llvm::Type *DestType = Int8PtrTy;
  if (ArgValue->getType() != DestType)
    ArgValue =
        Builder.CreateBitCast(ArgValue, DestType, ArgValue->getName().data());

  Intrinsic::ID inst = IsStart ? Intrinsic::vastart : Intrinsic::vaend;
  return Builder.CreateCall(CGM.getIntrinsic(inst), ArgValue);
}
480 
/// Checks if using the result of __builtin_object_size(p, @p From) in place of
/// __builtin_object_size(p, @p To) is correct
static bool areBOSTypesCompatible(int From, int To) {
  // Note: Our __builtin_object_size implementation currently treats Type=0 and
  // Type=2 identically. Encoding this implementation detail here may make
  // improving __builtin_object_size difficult in the future, so it's omitted.
  if (From == To)
    return true;
  // A maximum estimate (0) may stand in for its stricter variant (1), and a
  // minimum estimate (3) for (2).
  return (From == 0 && To == 1) || (From == 3 && To == 2);
}
489 
490 static llvm::Value *
491 getDefaultBuiltinObjectSizeResult(unsigned Type, llvm::IntegerType *ResType) {
492   return ConstantInt::get(ResType, (Type & 2) ? 0 : -1, /*isSigned=*/true);
493 }
494 
495 llvm::Value *
496 CodeGenFunction::evaluateOrEmitBuiltinObjectSize(const Expr *E, unsigned Type,
497                                                  llvm::IntegerType *ResType,
498                                                  llvm::Value *EmittedE) {
499   uint64_t ObjectSize;
500   if (!E->tryEvaluateObjectSize(ObjectSize, getContext(), Type))
501     return emitBuiltinObjectSize(E, Type, ResType, EmittedE);
502   return ConstantInt::get(ResType, ObjectSize, /*isSigned=*/true);
503 }
504 
505 /// Returns a Value corresponding to the size of the given expression.
506 /// This Value may be either of the following:
507 ///   - A llvm::Argument (if E is a param with the pass_object_size attribute on
508 ///     it)
509 ///   - A call to the @llvm.objectsize intrinsic
510 ///
511 /// EmittedE is the result of emitting `E` as a scalar expr. If it's non-null
512 /// and we wouldn't otherwise try to reference a pass_object_size parameter,
513 /// we'll call @llvm.objectsize on EmittedE, rather than emitting E.
514 llvm::Value *
515 CodeGenFunction::emitBuiltinObjectSize(const Expr *E, unsigned Type,
516                                        llvm::IntegerType *ResType,
517                                        llvm::Value *EmittedE) {
518   // We need to reference an argument if the pointer is a parameter with the
519   // pass_object_size attribute.
520   if (auto *D = dyn_cast<DeclRefExpr>(E->IgnoreParenImpCasts())) {
521     auto *Param = dyn_cast<ParmVarDecl>(D->getDecl());
522     auto *PS = D->getDecl()->getAttr<PassObjectSizeAttr>();
523     if (Param != nullptr && PS != nullptr &&
524         areBOSTypesCompatible(PS->getType(), Type)) {
525       auto Iter = SizeArguments.find(Param);
526       assert(Iter != SizeArguments.end());
527 
528       const ImplicitParamDecl *D = Iter->second;
529       auto DIter = LocalDeclMap.find(D);
530       assert(DIter != LocalDeclMap.end());
531 
532       return EmitLoadOfScalar(DIter->second, /*volatile=*/false,
533                               getContext().getSizeType(), E->getBeginLoc());
534     }
535   }
536 
537   // LLVM can't handle Type=3 appropriately, and __builtin_object_size shouldn't
538   // evaluate E for side-effects. In either case, we shouldn't lower to
539   // @llvm.objectsize.
540   if (Type == 3 || (!EmittedE && E->HasSideEffects(getContext())))
541     return getDefaultBuiltinObjectSizeResult(Type, ResType);
542 
543   Value *Ptr = EmittedE ? EmittedE : EmitScalarExpr(E);
544   assert(Ptr->getType()->isPointerTy() &&
545          "Non-pointer passed to __builtin_object_size?");
546 
547   Value *F = CGM.getIntrinsic(Intrinsic::objectsize, {ResType, Ptr->getType()});
548 
549   // LLVM only supports 0 and 2, make sure that we pass along that as a boolean.
550   Value *Min = Builder.getInt1((Type & 2) != 0);
551   // For GCC compatibility, __builtin_object_size treat NULL as unknown size.
552   Value *NullIsUnknown = Builder.getTrue();
553   return Builder.CreateCall(F, {Ptr, Min, NullIsUnknown});
554 }
555 
namespace {
/// A struct to generically describe a bit test intrinsic.
struct BitTest {
  /// What the intrinsic does to the bit after reading it.
  enum ActionKind : uint8_t { TestOnly, Complement, Reset, Set };
  /// The atomic ordering variant requested by the intrinsic name.
  enum InterlockingKind : uint8_t {
    Unlocked,
    Sequential,
    Acquire,
    Release,
    NoFence
  };

  ActionKind Action;
  InterlockingKind Interlocking;
  bool Is64Bit;  // true for the *64 variants operating on 64-bit words

  static BitTest decodeBitTestBuiltin(unsigned BuiltinID);
};
} // namespace
575 
/// Decode a _bittest* builtin ID into its action, interlocking variant, and
/// operand width. Must only be called with one of the bittest builtins below.
BitTest BitTest::decodeBitTestBuiltin(unsigned BuiltinID) {
  switch (BuiltinID) {
    // Main portable variants.
  case Builtin::BI_bittest:
    return {TestOnly, Unlocked, false};
  case Builtin::BI_bittestandcomplement:
    return {Complement, Unlocked, false};
  case Builtin::BI_bittestandreset:
    return {Reset, Unlocked, false};
  case Builtin::BI_bittestandset:
    return {Set, Unlocked, false};
  case Builtin::BI_interlockedbittestandreset:
    return {Reset, Sequential, false};
  case Builtin::BI_interlockedbittestandset:
    return {Set, Sequential, false};

    // X86-specific 64-bit variants.
  case Builtin::BI_bittest64:
    return {TestOnly, Unlocked, true};
  case Builtin::BI_bittestandcomplement64:
    return {Complement, Unlocked, true};
  case Builtin::BI_bittestandreset64:
    return {Reset, Unlocked, true};
  case Builtin::BI_bittestandset64:
    return {Set, Unlocked, true};
  case Builtin::BI_interlockedbittestandreset64:
    return {Reset, Sequential, true};
  case Builtin::BI_interlockedbittestandset64:
    return {Set, Sequential, true};

    // ARM/AArch64-specific ordering variants.
  case Builtin::BI_interlockedbittestandset_acq:
    return {Set, Acquire, false};
  case Builtin::BI_interlockedbittestandset_rel:
    return {Set, Release, false};
  case Builtin::BI_interlockedbittestandset_nf:
    return {Set, NoFence, false};
  case Builtin::BI_interlockedbittestandreset_acq:
    return {Reset, Acquire, false};
  case Builtin::BI_interlockedbittestandreset_rel:
    return {Reset, Release, false};
  case Builtin::BI_interlockedbittestandreset_nf:
    return {Reset, NoFence, false};
  }
  llvm_unreachable("expected only bittest intrinsics");
}
622 
623 static char bitActionToX86BTCode(BitTest::ActionKind A) {
624   switch (A) {
625   case BitTest::TestOnly:   return '\0';
626   case BitTest::Complement: return 'c';
627   case BitTest::Reset:      return 'r';
628   case BitTest::Set:        return 's';
629   }
630   llvm_unreachable("invalid action");
631 }
632 
/// Emit a _bittest* intrinsic as x86 inline assembly using the bt/btc/btr/bts
/// instructions, which handle the arbitrary bit-array indexing internally.
/// Returns the tested bit (via setc) as an i8.
static llvm::Value *EmitX86BitTestIntrinsic(CodeGenFunction &CGF,
                                            BitTest BT,
                                            const CallExpr *E, Value *BitBase,
                                            Value *BitPos) {
  char Action = bitActionToX86BTCode(BT.Action);
  char SizeSuffix = BT.Is64Bit ? 'q' : 'l';

  // Build the assembly.
  SmallString<64> Asm;
  raw_svector_ostream AsmOS(Asm);
  if (BT.Interlocking != BitTest::Unlocked)
    AsmOS << "lock ";
  AsmOS << "bt";
  if (Action)
    AsmOS << Action;
  AsmOS << SizeSuffix << " $2, ($1)\n\tsetc ${0:b}";

  // Build the constraints. FIXME: We should support immediates when possible.
  std::string Constraints = "=r,r,r,~{cc},~{flags},~{fpsr}";
  // The asm operates on a word whose width matches the bit-position argument.
  llvm::IntegerType *IntType = llvm::IntegerType::get(
      CGF.getLLVMContext(),
      CGF.getContext().getTypeSize(E->getArg(1)->getType()));
  llvm::Type *IntPtrType = IntType->getPointerTo();
  llvm::FunctionType *FTy =
      llvm::FunctionType::get(CGF.Int8Ty, {IntPtrType, IntType}, false);

  llvm::InlineAsm *IA =
      llvm::InlineAsm::get(FTy, Asm, Constraints, /*SideEffects=*/true);
  return CGF.Builder.CreateCall(IA, {BitBase, BitPos});
}
663 
/// Translate a bittest interlocking variant into the LLVM atomic ordering
/// used for the atomicrmw in the generic (non-x86) lowering.
static llvm::AtomicOrdering
getBitTestAtomicOrdering(BitTest::InterlockingKind I) {
  switch (I) {
  case BitTest::Unlocked:   return llvm::AtomicOrdering::NotAtomic;
  case BitTest::Sequential: return llvm::AtomicOrdering::SequentiallyConsistent;
  case BitTest::Acquire:    return llvm::AtomicOrdering::Acquire;
  case BitTest::Release:    return llvm::AtomicOrdering::Release;
  case BitTest::NoFence:    return llvm::AtomicOrdering::Monotonic;
  }
  llvm_unreachable("invalid interlocking");
}
675 
/// Emit a _bittest* intrinsic. These intrinsics take a pointer to an array of
/// bits and a bit position and read and optionally modify the bit at that
/// position. The position index can be arbitrarily large, i.e. it can be larger
/// than 31 or 63, so we need an indexed load in the general case.
static llvm::Value *EmitBitTestIntrinsic(CodeGenFunction &CGF,
                                         unsigned BuiltinID,
                                         const CallExpr *E) {
  Value *BitBase = CGF.EmitScalarExpr(E->getArg(0));
  Value *BitPos = CGF.EmitScalarExpr(E->getArg(1));

  BitTest BT = BitTest::decodeBitTestBuiltin(BuiltinID);

  // X86 has special BT, BTC, BTR, and BTS instructions that handle the array
  // indexing operation internally. Use them if possible.
  llvm::Triple::ArchType Arch = CGF.getTarget().getTriple().getArch();
  if (Arch == llvm::Triple::x86 || Arch == llvm::Triple::x86_64)
    return EmitX86BitTestIntrinsic(CGF, BT, E, BitBase, BitPos);

  // Otherwise, use generic code to load one byte and test the bit. Use all but
  // the bottom three bits as the array index, and the bottom three bits to form
  // a mask.
  // Bit = BitBaseI8[BitPos >> 3] & (1 << (BitPos & 0x7)) != 0;
  Value *ByteIndex = CGF.Builder.CreateAShr(
      BitPos, llvm::ConstantInt::get(BitPos->getType(), 3), "bittest.byteidx");
  Value *BitBaseI8 = CGF.Builder.CreatePointerCast(BitBase, CGF.Int8PtrTy);
  Address ByteAddr(CGF.Builder.CreateInBoundsGEP(CGF.Int8Ty, BitBaseI8,
                                                 ByteIndex, "bittest.byteaddr"),
                   CharUnits::One());
  // PosLow is the bit's position within the addressed byte (0..7).
  Value *PosLow =
      CGF.Builder.CreateAnd(CGF.Builder.CreateTrunc(BitPos, CGF.Int8Ty),
                            llvm::ConstantInt::get(CGF.Int8Ty, 0x7));

  // The updating instructions will need a mask.
  Value *Mask = nullptr;
  if (BT.Action != BitTest::TestOnly) {
    Mask = CGF.Builder.CreateShl(llvm::ConstantInt::get(CGF.Int8Ty, 1), PosLow,
                                 "bittest.mask");
  }

  // Check the action and ordering of the interlocked intrinsics.
  llvm::AtomicOrdering Ordering = getBitTestAtomicOrdering(BT.Interlocking);

  Value *OldByte = nullptr;
  if (Ordering != llvm::AtomicOrdering::NotAtomic) {
    // Emit a combined atomicrmw load/store operation for the interlocked
    // intrinsics. Set uses `or` with the mask; Reset uses `and` with the
    // inverted mask.
    llvm::AtomicRMWInst::BinOp RMWOp = llvm::AtomicRMWInst::Or;
    if (BT.Action == BitTest::Reset) {
      Mask = CGF.Builder.CreateNot(Mask);
      RMWOp = llvm::AtomicRMWInst::And;
    }
    OldByte = CGF.Builder.CreateAtomicRMW(RMWOp, ByteAddr.getPointer(), Mask,
                                          Ordering);
  } else {
    // Emit a plain load for the non-interlocked intrinsics.
    OldByte = CGF.Builder.CreateLoad(ByteAddr, "bittest.byte");
    Value *NewByte = nullptr;
    switch (BT.Action) {
    case BitTest::TestOnly:
      // Don't store anything.
      break;
    case BitTest::Complement:
      NewByte = CGF.Builder.CreateXor(OldByte, Mask);
      break;
    case BitTest::Reset:
      NewByte = CGF.Builder.CreateAnd(OldByte, CGF.Builder.CreateNot(Mask));
      break;
    case BitTest::Set:
      NewByte = CGF.Builder.CreateOr(OldByte, Mask);
      break;
    }
    if (NewByte)
      CGF.Builder.CreateStore(NewByte, ByteAddr);
  }

  // However we loaded the old byte, either by plain load or atomicrmw, shift
  // the bit into the low position and mask it to 0 or 1.
  Value *ShiftedByte = CGF.Builder.CreateLShr(OldByte, PosLow, "bittest.shr");
  return CGF.Builder.CreateAnd(
      ShiftedByte, llvm::ConstantInt::get(CGF.Int8Ty, 1), "bittest.res");
}
757 
namespace {
/// Identifies which flavor of the MSVC setjmp family is being lowered; see
/// EmitMSVCRTSetJmp for the per-flavor argument conventions.
enum class MSVCSetJmpKind { _setjmpex, _setjmp3, _setjmp };
} // namespace
765 
766 /// MSVC handles setjmp a bit differently on different platforms. On every
767 /// architecture except 32-bit x86, the frame address is passed. On x86, extra
768 /// parameters can be passed as variadic arguments, but we always pass none.
769 static RValue EmitMSVCRTSetJmp(CodeGenFunction &CGF, MSVCSetJmpKind SJKind,
770                                const CallExpr *E) {
771   llvm::Value *Arg1 = nullptr;
772   llvm::Type *Arg1Ty = nullptr;
773   StringRef Name;
774   bool IsVarArg = false;
775   if (SJKind == MSVCSetJmpKind::_setjmp3) {
776     Name = "_setjmp3";
777     Arg1Ty = CGF.Int32Ty;
778     Arg1 = llvm::ConstantInt::get(CGF.IntTy, 0);
779     IsVarArg = true;
780   } else {
781     Name = SJKind == MSVCSetJmpKind::_setjmp ? "_setjmp" : "_setjmpex";
782     Arg1Ty = CGF.Int8PtrTy;
783     if (CGF.getTarget().getTriple().getArch() == llvm::Triple::aarch64) {
784       Arg1 = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(Intrinsic::sponentry));
785     } else
786       Arg1 = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(Intrinsic::frameaddress),
787                                     llvm::ConstantInt::get(CGF.Int32Ty, 0));
788   }
789 
790   // Mark the call site and declaration with ReturnsTwice.
791   llvm::Type *ArgTypes[2] = {CGF.Int8PtrTy, Arg1Ty};
792   llvm::AttributeList ReturnsTwiceAttr = llvm::AttributeList::get(
793       CGF.getLLVMContext(), llvm::AttributeList::FunctionIndex,
794       llvm::Attribute::ReturnsTwice);
795   llvm::Constant *SetJmpFn = CGF.CGM.CreateRuntimeFunction(
796       llvm::FunctionType::get(CGF.IntTy, ArgTypes, IsVarArg), Name,
797       ReturnsTwiceAttr, /*Local=*/true);
798 
799   llvm::Value *Buf = CGF.Builder.CreateBitOrPointerCast(
800       CGF.EmitScalarExpr(E->getArg(0)), CGF.Int8PtrTy);
801   llvm::Value *Args[] = {Buf, Arg1};
802   llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(SetJmpFn, Args);
803   CS.setAttributes(ReturnsTwiceAttr);
804   return RValue::get(CS.getInstruction());
805 }
806 
// Many of MSVC builtins are on x64, ARM and AArch64; to avoid repeating code,
// we handle them here.
// Identifies which MSVC intrinsic EmitMSVCBuiltinExpr should lower.  The
// _acq/_rel/_nf suffixes select acquire/release/monotonic ("no fence") memory
// orderings for the interlocked operations (see the switch in
// EmitMSVCBuiltinExpr); the unsuffixed interlocked forms are fully fenced.
enum class CodeGenFunction::MSVCIntrin {
  _BitScanForward,
  _BitScanReverse,
  _InterlockedAnd,
  _InterlockedDecrement,
  _InterlockedExchange,
  _InterlockedExchangeAdd,
  _InterlockedExchangeSub,
  _InterlockedIncrement,
  _InterlockedOr,
  _InterlockedXor,
  _InterlockedExchangeAdd_acq,
  _InterlockedExchangeAdd_rel,
  _InterlockedExchangeAdd_nf,
  _InterlockedExchange_acq,
  _InterlockedExchange_rel,
  _InterlockedExchange_nf,
  _InterlockedCompareExchange_acq,
  _InterlockedCompareExchange_rel,
  _InterlockedCompareExchange_nf,
  _InterlockedOr_acq,
  _InterlockedOr_rel,
  _InterlockedOr_nf,
  _InterlockedXor_acq,
  _InterlockedXor_rel,
  _InterlockedXor_nf,
  _InterlockedAnd_acq,
  _InterlockedAnd_rel,
  _InterlockedAnd_nf,
  _InterlockedIncrement_acq,
  _InterlockedIncrement_rel,
  _InterlockedIncrement_nf,
  _InterlockedDecrement_acq,
  _InterlockedDecrement_rel,
  _InterlockedDecrement_nf,
  __fastfail,
};
846 
/// Lower one of the MSVC intrinsics enumerated in MSVCIntrin.  Most cases map
/// directly to an atomicrmw or cmpxchg with the memory ordering implied by
/// the enumerator's _acq/_rel/_nf suffix; _BitScan* builds a small diamond
/// CFG, and __fastfail emits target-specific inline asm.
Value *CodeGenFunction::EmitMSVCBuiltinExpr(MSVCIntrin BuiltinID,
                                            const CallExpr *E) {
  switch (BuiltinID) {
  case MSVCIntrin::_BitScanForward:
  case MSVCIntrin::_BitScanReverse: {
    // _BitScan{Forward,Reverse}(Index, Mask): when Mask is zero, return 0
    // without storing to *Index; otherwise store the found bit position to
    // *Index and return 1.  Emitted as: Begin --(Mask==0)--> End, with a
    // NotZero block that computes and stores the index.
    Value *ArgValue = EmitScalarExpr(E->getArg(1));

    llvm::Type *ArgType = ArgValue->getType();
    llvm::Type *IndexType =
      EmitScalarExpr(E->getArg(0))->getType()->getPointerElementType();
    llvm::Type *ResultType = ConvertType(E->getType());

    Value *ArgZero = llvm::Constant::getNullValue(ArgType);
    Value *ResZero = llvm::Constant::getNullValue(ResultType);
    Value *ResOne = llvm::ConstantInt::get(ResultType, 1);

    // Create the merge block and its PHI up front so both predecessors can
    // feed it.
    BasicBlock *Begin = Builder.GetInsertBlock();
    BasicBlock *End = createBasicBlock("bitscan_end", this->CurFn);
    Builder.SetInsertPoint(End);
    PHINode *Result = Builder.CreatePHI(ResultType, 2, "bitscan_result");

    Builder.SetInsertPoint(Begin);
    Value *IsZero = Builder.CreateICmpEQ(ArgValue, ArgZero);
    BasicBlock *NotZero = createBasicBlock("bitscan_not_zero", this->CurFn);
    Builder.CreateCondBr(IsZero, End, NotZero);
    Result->addIncoming(ResZero, Begin);

    Builder.SetInsertPoint(NotZero);
    Address IndexAddress = EmitPointerWithAlignment(E->getArg(0));

    if (BuiltinID == MSVCIntrin::_BitScanForward) {
      // Lowest set bit: cttz gives its index directly.  The i1 'true' flag
      // marks zero input as undefined, which is fine since the zero case was
      // already branched around.
      Value *F = CGM.getIntrinsic(Intrinsic::cttz, ArgType);
      Value *ZeroCount = Builder.CreateCall(F, {ArgValue, Builder.getTrue()});
      ZeroCount = Builder.CreateIntCast(ZeroCount, IndexType, false);
      Builder.CreateStore(ZeroCount, IndexAddress, false);
    } else {
      // Highest set bit: ctlz counts from the MSB, so the bit index is
      // (width - 1) - leading-zero-count.
      unsigned ArgWidth = cast<llvm::IntegerType>(ArgType)->getBitWidth();
      Value *ArgTypeLastIndex = llvm::ConstantInt::get(IndexType, ArgWidth - 1);

      Value *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
      Value *ZeroCount = Builder.CreateCall(F, {ArgValue, Builder.getTrue()});
      ZeroCount = Builder.CreateIntCast(ZeroCount, IndexType, false);
      Value *Index = Builder.CreateNSWSub(ArgTypeLastIndex, ZeroCount);
      Builder.CreateStore(Index, IndexAddress, false);
    }
    Builder.CreateBr(End);
    Result->addIncoming(ResOne, NotZero);

    Builder.SetInsertPoint(End);
    return Result;
  }
  // Fully-fenced interlocked read-modify-write operations.
  case MSVCIntrin::_InterlockedAnd:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E);
  case MSVCIntrin::_InterlockedExchange:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E);
  case MSVCIntrin::_InterlockedExchangeAdd:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E);
  case MSVCIntrin::_InterlockedExchangeSub:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Sub, E);
  case MSVCIntrin::_InterlockedOr:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E);
  case MSVCIntrin::_InterlockedXor:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xor, E);
  // Suffixed variants: _acq -> Acquire, _rel -> Release, _nf -> Monotonic.
  case MSVCIntrin::_InterlockedExchangeAdd_acq:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E,
                                 AtomicOrdering::Acquire);
  case MSVCIntrin::_InterlockedExchangeAdd_rel:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E,
                                 AtomicOrdering::Release);
  case MSVCIntrin::_InterlockedExchangeAdd_nf:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E,
                                 AtomicOrdering::Monotonic);
  case MSVCIntrin::_InterlockedExchange_acq:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E,
                                 AtomicOrdering::Acquire);
  case MSVCIntrin::_InterlockedExchange_rel:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E,
                                 AtomicOrdering::Release);
  case MSVCIntrin::_InterlockedExchange_nf:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E,
                                 AtomicOrdering::Monotonic);
  case MSVCIntrin::_InterlockedCompareExchange_acq:
    return EmitAtomicCmpXchgForMSIntrin(*this, E, AtomicOrdering::Acquire);
  case MSVCIntrin::_InterlockedCompareExchange_rel:
    return EmitAtomicCmpXchgForMSIntrin(*this, E, AtomicOrdering::Release);
  case MSVCIntrin::_InterlockedCompareExchange_nf:
    return EmitAtomicCmpXchgForMSIntrin(*this, E, AtomicOrdering::Monotonic);
  case MSVCIntrin::_InterlockedOr_acq:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E,
                                 AtomicOrdering::Acquire);
  case MSVCIntrin::_InterlockedOr_rel:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E,
                                 AtomicOrdering::Release);
  case MSVCIntrin::_InterlockedOr_nf:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E,
                                 AtomicOrdering::Monotonic);
  case MSVCIntrin::_InterlockedXor_acq:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xor, E,
                                 AtomicOrdering::Acquire);
  case MSVCIntrin::_InterlockedXor_rel:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xor, E,
                                 AtomicOrdering::Release);
  case MSVCIntrin::_InterlockedXor_nf:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xor, E,
                                 AtomicOrdering::Monotonic);
  case MSVCIntrin::_InterlockedAnd_acq:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E,
                                 AtomicOrdering::Acquire);
  case MSVCIntrin::_InterlockedAnd_rel:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E,
                                 AtomicOrdering::Release);
  case MSVCIntrin::_InterlockedAnd_nf:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E,
                                 AtomicOrdering::Monotonic);
  case MSVCIntrin::_InterlockedIncrement_acq:
    return EmitAtomicIncrementValue(*this, E, AtomicOrdering::Acquire);
  case MSVCIntrin::_InterlockedIncrement_rel:
    return EmitAtomicIncrementValue(*this, E, AtomicOrdering::Release);
  case MSVCIntrin::_InterlockedIncrement_nf:
    return EmitAtomicIncrementValue(*this, E, AtomicOrdering::Monotonic);
  case MSVCIntrin::_InterlockedDecrement_acq:
    return EmitAtomicDecrementValue(*this, E, AtomicOrdering::Acquire);
  case MSVCIntrin::_InterlockedDecrement_rel:
    return EmitAtomicDecrementValue(*this, E, AtomicOrdering::Release);
  case MSVCIntrin::_InterlockedDecrement_nf:
    return EmitAtomicDecrementValue(*this, E, AtomicOrdering::Monotonic);

  case MSVCIntrin::_InterlockedDecrement:
    return EmitAtomicDecrementValue(*this, E);
  case MSVCIntrin::_InterlockedIncrement:
    return EmitAtomicIncrementValue(*this, E);

  case MSVCIntrin::__fastfail: {
    // Request immediate process termination from the kernel. The instruction
    // sequences to do this are documented on MSDN:
    // https://msdn.microsoft.com/en-us/library/dn774154.aspx
    llvm::Triple::ArchType ISA = getTarget().getTriple().getArch();
    StringRef Asm, Constraints;
    switch (ISA) {
    default:
      ErrorUnsupported(E, "__fastfail call for this architecture");
      break;
    case llvm::Triple::x86:
    case llvm::Triple::x86_64:
      Asm = "int $$0x29";
      Constraints = "{cx}";
      break;
    case llvm::Triple::thumb:
      Asm = "udf #251";
      Constraints = "{r0}";
      break;
    }
    // The failure code is passed in the single register named by the
    // constraint string; the asm has side effects and never returns.
    llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, {Int32Ty}, false);
    llvm::InlineAsm *IA =
        llvm::InlineAsm::get(FTy, Asm, Constraints, /*SideEffects=*/true);
    llvm::AttributeList NoReturnAttr = llvm::AttributeList::get(
        getLLVMContext(), llvm::AttributeList::FunctionIndex,
        llvm::Attribute::NoReturn);
    CallSite CS = Builder.CreateCall(IA, EmitScalarExpr(E->getArg(0)));
    CS.setAttributes(NoReturnAttr);
    return CS.getInstruction();
  }
  }
  llvm_unreachable("Incorrect MSVC intrinsic!");
}
1012 
1013 namespace {
1014 // ARC cleanup for __builtin_os_log_format
1015 struct CallObjCArcUse final : EHScopeStack::Cleanup {
1016   CallObjCArcUse(llvm::Value *object) : object(object) {}
1017   llvm::Value *object;
1018 
1019   void Emit(CodeGenFunction &CGF, Flags flags) override {
1020     CGF.EmitARCIntrinsicUse(object);
1021   }
1022 };
1023 }
1024 
1025 Value *CodeGenFunction::EmitCheckedArgForBuiltin(const Expr *E,
1026                                                  BuiltinCheckKind Kind) {
1027   assert((Kind == BCK_CLZPassedZero || Kind == BCK_CTZPassedZero)
1028           && "Unsupported builtin check kind");
1029 
1030   Value *ArgValue = EmitScalarExpr(E);
1031   if (!SanOpts.has(SanitizerKind::Builtin) || !getTarget().isCLZForZeroUndef())
1032     return ArgValue;
1033 
1034   SanitizerScope SanScope(this);
1035   Value *Cond = Builder.CreateICmpNE(
1036       ArgValue, llvm::Constant::getNullValue(ArgValue->getType()));
1037   EmitCheck(std::make_pair(Cond, SanitizerKind::Builtin),
1038             SanitizerHandler::InvalidBuiltin,
1039             {EmitCheckSourceLocation(E->getExprLoc()),
1040              llvm::ConstantInt::get(Builder.getInt8Ty(), Kind)},
1041             None);
1042   return ArgValue;
1043 }
1044 
1045 /// Get the argument type for arguments to os_log_helper.
1046 static CanQualType getOSLogArgType(ASTContext &C, int Size) {
1047   QualType UnsignedTy = C.getIntTypeForBitwidth(Size * 8, /*Signed=*/false);
1048   return C.getCanonicalType(UnsignedTy);
1049 }
1050 
/// Build (or reuse) the helper function that serializes __builtin_os_log
/// arguments into the caller-provided buffer.  The helper's name encodes the
/// buffer alignment, the layout's summary and argument-count bytes, and every
/// item's size/descriptor bytes, so calls with identical layouts share one
/// linkonce_odr definition that the linker can merge across TUs.
llvm::Function *CodeGenFunction::generateBuiltinOSLogHelperFunction(
    const analyze_os_log::OSLogBufferLayout &Layout,
    CharUnits BufferAlignment) {
  ASTContext &Ctx = getContext();

  // Mangle the layout into the helper's name.
  llvm::SmallString<64> Name;
  {
    raw_svector_ostream OS(Name);
    OS << "__os_log_helper";
    OS << "_" << BufferAlignment.getQuantity();
    OS << "_" << int(Layout.getSummaryByte());
    OS << "_" << int(Layout.getNumArgsByte());
    for (const auto &Item : Layout.Items)
      OS << "_" << int(Item.getSizeByte()) << "_"
         << int(Item.getDescriptorByte());
  }

  // Reuse an existing helper for this exact layout if we already emitted one.
  if (llvm::Function *F = CGM.getModule().getFunction(Name))
    return F;

  // First parameter is the destination buffer; one additional parameter per
  // non-empty layout item.
  llvm::SmallVector<QualType, 4> ArgTys;
  llvm::SmallVector<ImplicitParamDecl, 4> Params;
  Params.emplace_back(Ctx, nullptr, SourceLocation(), &Ctx.Idents.get("buffer"),
                      Ctx.VoidPtrTy, ImplicitParamDecl::Other);
  ArgTys.emplace_back(Ctx.VoidPtrTy);

  for (unsigned int I = 0, E = Layout.Items.size(); I < E; ++I) {
    char Size = Layout.Items[I].getSizeByte();
    // Zero-sized items contribute no parameter (only descriptor bytes).
    if (!Size)
      continue;

    QualType ArgTy = getOSLogArgType(Ctx, Size);
    Params.emplace_back(
        Ctx, nullptr, SourceLocation(),
        &Ctx.Idents.get(std::string("arg") + llvm::to_string(I)), ArgTy,
        ImplicitParamDecl::Other);
    ArgTys.emplace_back(ArgTy);
  }

  FunctionArgList Args;
  for (auto &P : Params)
    Args.push_back(&P);

  QualType ReturnTy = Ctx.VoidTy;
  QualType FuncionTy = Ctx.getFunctionType(ReturnTy, ArgTys, {});

  // The helper function has linkonce_odr linkage to enable the linker to merge
  // identical functions. To ensure the merging always happens, 'noinline' is
  // attached to the function when compiling with -Oz.
  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(ReturnTy, Args);
  llvm::FunctionType *FuncTy = CGM.getTypes().GetFunctionType(FI);
  llvm::Function *Fn = llvm::Function::Create(
      FuncTy, llvm::GlobalValue::LinkOnceODRLinkage, Name, &CGM.getModule());
  Fn->setVisibility(llvm::GlobalValue::HiddenVisibility);
  CGM.SetLLVMFunctionAttributes(GlobalDecl(), FI, Fn);
  CGM.SetLLVMFunctionAttributesForDefinition(nullptr, Fn);

  // Attach 'noinline' at -Oz.
  if (CGM.getCodeGenOpts().OptimizeSize == 2)
    Fn->addFnAttr(llvm::Attribute::NoInline);

  auto NL = ApplyDebugLocation::CreateEmpty(*this);
  IdentifierInfo *II = &Ctx.Idents.get(Name);
  FunctionDecl *FD = FunctionDecl::Create(
      Ctx, Ctx.getTranslationUnitDecl(), SourceLocation(), SourceLocation(), II,
      FuncionTy, nullptr, SC_PrivateExtern, false, false);

  StartFunction(FD, ReturnTy, Fn, FI, Args);

  // Create a scope with an artificial location for the body of this function.
  auto AL = ApplyDebugLocation::CreateArtificial(*this);

  // Emit the body: write the two header bytes, then for each item its
  // descriptor/size bytes followed by the argument payload (if any).
  CharUnits Offset;
  Address BufAddr(Builder.CreateLoad(GetAddrOfLocalVar(&Params[0]), "buf"),
                  BufferAlignment);
  Builder.CreateStore(Builder.getInt8(Layout.getSummaryByte()),
                      Builder.CreateConstByteGEP(BufAddr, Offset++, "summary"));
  Builder.CreateStore(Builder.getInt8(Layout.getNumArgsByte()),
                      Builder.CreateConstByteGEP(BufAddr, Offset++, "numArgs"));

  // I starts at 1 because Params[0] is the buffer pointer.
  unsigned I = 1;
  for (const auto &Item : Layout.Items) {
    Builder.CreateStore(
        Builder.getInt8(Item.getDescriptorByte()),
        Builder.CreateConstByteGEP(BufAddr, Offset++, "argDescriptor"));
    Builder.CreateStore(
        Builder.getInt8(Item.getSizeByte()),
        Builder.CreateConstByteGEP(BufAddr, Offset++, "argSize"));

    CharUnits Size = Item.size();
    // Zero-sized items have no payload and consumed no parameter above.
    if (!Size.getQuantity())
      continue;

    Address Arg = GetAddrOfLocalVar(&Params[I]);
    Address Addr = Builder.CreateConstByteGEP(BufAddr, Offset, "argData");
    Addr = Builder.CreateBitCast(Addr, Arg.getPointer()->getType(),
                                 "argDataCast");
    Builder.CreateStore(Builder.CreateLoad(Arg), Addr);
    Offset += Size;
    ++I;
  }

  FinishFunction();

  return Fn;
}
1158 
/// Lower a call to __builtin_os_log_format: compute the buffer layout for the
/// format/arguments, emit each argument coerced to the unsigned integer type
/// the helper expects, and call the (possibly newly generated) helper that
/// fills the buffer.  Returns the buffer pointer, as the builtin does.
RValue CodeGenFunction::emitBuiltinOSLogFormat(const CallExpr &E) {
  assert(E.getNumArgs() >= 2 &&
         "__builtin_os_log_format takes at least 2 arguments");
  ASTContext &Ctx = getContext();
  analyze_os_log::OSLogBufferLayout Layout;
  analyze_os_log::computeOSLogBufferLayout(Ctx, &E, Layout);
  Address BufAddr = EmitPointerWithAlignment(E.getArg(0));
  llvm::SmallVector<llvm::Value *, 4> RetainableOperands;

  // Ignore argument 1, the format string. It is not currently used.
  CallArgList Args;
  Args.add(RValue::get(BufAddr.getPointer()), Ctx.VoidPtrTy);

  for (const auto &Item : Layout.Items) {
    int Size = Item.getSizeByte();
    // Zero-sized items carry no payload, so the helper takes no parameter
    // for them.
    if (!Size)
      continue;

    llvm::Value *ArgVal;

    if (Item.getKind() == analyze_os_log::OSLogBufferItem::MaskKind) {
      // Pack the mask string's bytes (little-endian) into an i64 constant.
      uint64_t Val = 0;
      for (unsigned I = 0, E = Item.getMaskType().size(); I < E; ++I)
        Val |= ((uint64_t)Item.getMaskType()[I]) << I * 8;
      ArgVal = llvm::Constant::getIntegerValue(Int64Ty, llvm::APInt(64, Val));
    } else if (const Expr *TheExpr = Item.getExpr()) {
      ArgVal = EmitScalarExpr(TheExpr, /*Ignore*/ false);

      // Check if this is a retainable type.
      if (TheExpr->getType()->isObjCRetainableType()) {
        assert(getEvaluationKind(TheExpr->getType()) == TEK_Scalar &&
               "Only scalar can be a ObjC retainable type");
        // Check if the object is constant, if not, save it in
        // RetainableOperands.
        if (!isa<Constant>(ArgVal))
          RetainableOperands.push_back(ArgVal);
      }
    } else {
      ArgVal = Builder.getInt32(Item.getConstValue().getQuantity());
    }

    // Coerce the value to the unsigned integer type of matching size that
    // the helper's signature (see getOSLogArgType) uses.
    unsigned ArgValSize =
        CGM.getDataLayout().getTypeSizeInBits(ArgVal->getType());
    llvm::IntegerType *IntTy = llvm::Type::getIntNTy(getLLVMContext(),
                                                     ArgValSize);
    ArgVal = Builder.CreateBitOrPointerCast(ArgVal, IntTy);
    CanQualType ArgTy = getOSLogArgType(Ctx, Size);
    // If ArgVal has type x86_fp80, zero-extend ArgVal.
    ArgVal = Builder.CreateZExtOrBitCast(ArgVal, ConvertType(ArgTy));
    Args.add(RValue::get(ArgVal), ArgTy);
  }

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeBuiltinFunctionCall(Ctx.VoidTy, Args);
  llvm::Function *F = CodeGenFunction(CGM).generateBuiltinOSLogHelperFunction(
      Layout, BufAddr.getAlignment());
  EmitCall(FI, CGCallee::forDirect(F), ReturnValueSlot(), Args);

  // Push a clang.arc.use cleanup for each object in RetainableOperands. The
  // cleanup will cause the use to appear after the final log call, keeping
  // the object valid while it’s held in the log buffer.  Note that if there’s
  // a release cleanup on the object, it will already be active; since
  // cleanups are emitted in reverse order, the use will occur before the
  // object is released.
  if (!RetainableOperands.empty() && getLangOpts().ObjCAutoRefCount &&
      CGM.getCodeGenOpts().OptimizationLevel != 0)
    for (llvm::Value *Object : RetainableOperands)
      pushFullExprCleanup<CallObjCArcUse>(getARCCleanupKind(), Object);

  return RValue::get(BufAddr.getPointer());
}
1230 
1231 /// Determine if a binop is a checked mixed-sign multiply we can specialize.
1232 static bool isSpecialMixedSignMultiply(unsigned BuiltinID,
1233                                        WidthAndSignedness Op1Info,
1234                                        WidthAndSignedness Op2Info,
1235                                        WidthAndSignedness ResultInfo) {
1236   return BuiltinID == Builtin::BI__builtin_mul_overflow &&
1237          Op1Info.Width == Op2Info.Width && Op1Info.Width >= ResultInfo.Width &&
1238          Op1Info.Signed != Op2Info.Signed;
1239 }
1240 
/// Emit a checked mixed-sign multiply. This is a cheaper specialization of
/// the generic checked-binop irgen.  The signed operand is reduced to its
/// absolute value, an unsigned umul.with.overflow is performed, and the sign
/// and range checks are then reconstructed around the unsigned result before
/// storing it (truncated) through ResultArg and returning the overflow bit.
static RValue
EmitCheckedMixedSignMultiply(CodeGenFunction &CGF, const clang::Expr *Op1,
                             WidthAndSignedness Op1Info, const clang::Expr *Op2,
                             WidthAndSignedness Op2Info,
                             const clang::Expr *ResultArg, QualType ResultQTy,
                             WidthAndSignedness ResultInfo) {
  assert(isSpecialMixedSignMultiply(Builtin::BI__builtin_mul_overflow, Op1Info,
                                    Op2Info, ResultInfo) &&
         "Not a mixed-sign multipliction we can specialize");

  // Emit the signed and unsigned operands.
  const clang::Expr *SignedOp = Op1Info.Signed ? Op1 : Op2;
  const clang::Expr *UnsignedOp = Op1Info.Signed ? Op2 : Op1;
  llvm::Value *Signed = CGF.EmitScalarExpr(SignedOp);
  llvm::Value *Unsigned = CGF.EmitScalarExpr(UnsignedOp);

  llvm::Type *OpTy = Signed->getType();
  llvm::Value *Zero = llvm::Constant::getNullValue(OpTy);
  Address ResultPtr = CGF.EmitPointerWithAlignment(ResultArg);
  llvm::Type *ResTy = ResultPtr.getElementType();

  // Take the absolute value of the signed operand.
  llvm::Value *IsNegative = CGF.Builder.CreateICmpSLT(Signed, Zero);
  llvm::Value *AbsOfNegative = CGF.Builder.CreateSub(Zero, Signed);
  llvm::Value *AbsSigned =
      CGF.Builder.CreateSelect(IsNegative, AbsOfNegative, Signed);

  // Perform a checked unsigned multiplication.
  llvm::Value *UnsignedOverflow;
  llvm::Value *UnsignedResult =
      EmitOverflowIntrinsic(CGF, llvm::Intrinsic::umul_with_overflow, AbsSigned,
                            Unsigned, UnsignedOverflow);

  llvm::Value *Overflow, *Result;
  if (ResultInfo.Signed) {
    // Signed overflow occurs if the result is greater than INT_MAX or lesser
    // than INT_MIN, i.e when |Result| > (INT_MAX + IsNegative).
    auto IntMax = llvm::APInt::getSignedMaxValue(ResultInfo.Width)
                      .zextOrSelf(Op1Info.Width);
    llvm::Value *MaxResult =
        CGF.Builder.CreateAdd(llvm::ConstantInt::get(OpTy, IntMax),
                              CGF.Builder.CreateZExt(IsNegative, OpTy));
    llvm::Value *SignedOverflow =
        CGF.Builder.CreateICmpUGT(UnsignedResult, MaxResult);
    Overflow = CGF.Builder.CreateOr(UnsignedOverflow, SignedOverflow);

    // Prepare the signed result (possibly by negating it).
    llvm::Value *NegativeResult = CGF.Builder.CreateNeg(UnsignedResult);
    llvm::Value *SignedResult =
        CGF.Builder.CreateSelect(IsNegative, NegativeResult, UnsignedResult);
    Result = CGF.Builder.CreateTrunc(SignedResult, ResTy);
  } else {
    // Unsigned overflow occurs if the result is < 0 or greater than UINT_MAX.
    // "< 0" here means the signed operand was negative and the product is
    // nonzero (negating a nonzero value cannot yield a valid unsigned result).
    llvm::Value *Underflow = CGF.Builder.CreateAnd(
        IsNegative, CGF.Builder.CreateIsNotNull(UnsignedResult));
    Overflow = CGF.Builder.CreateOr(UnsignedOverflow, Underflow);
    if (ResultInfo.Width < Op1Info.Width) {
      // Also overflow if the product doesn't fit in the narrower result type.
      auto IntMax =
          llvm::APInt::getMaxValue(ResultInfo.Width).zext(Op1Info.Width);
      llvm::Value *TruncOverflow = CGF.Builder.CreateICmpUGT(
          UnsignedResult, llvm::ConstantInt::get(OpTy, IntMax));
      Overflow = CGF.Builder.CreateOr(Overflow, TruncOverflow);
    }

    // Negate the product if it would be negative in infinite precision.
    Result = CGF.Builder.CreateSelect(
        IsNegative, CGF.Builder.CreateNeg(UnsignedResult), UnsignedResult);

    Result = CGF.Builder.CreateTrunc(Result, ResTy);
  }
  assert(Overflow && Result && "Missing overflow or result");

  bool isVolatile =
      ResultArg->getType()->getPointeeType().isVolatileQualified();
  CGF.Builder.CreateStore(CGF.EmitToMemory(Result, ResultQTy), ResultPtr,
                          isVolatile);
  return RValue::get(Overflow);
}
1321 
1322 static llvm::Value *dumpRecord(CodeGenFunction &CGF, QualType RType,
1323                                Value *&RecordPtr, CharUnits Align, Value *Func,
1324                                int Lvl) {
1325   const auto *RT = RType->getAs<RecordType>();
1326   ASTContext &Context = CGF.getContext();
1327   RecordDecl *RD = RT->getDecl()->getDefinition();
1328   ASTContext &Ctx = RD->getASTContext();
1329   const ASTRecordLayout &RL = Ctx.getASTRecordLayout(RD);
1330   std::string Pad = std::string(Lvl * 4, ' ');
1331 
1332   Value *GString =
1333       CGF.Builder.CreateGlobalStringPtr(RType.getAsString() + " {\n");
1334   Value *Res = CGF.Builder.CreateCall(Func, {GString});
1335 
1336   static llvm::DenseMap<QualType, const char *> Types;
1337   if (Types.empty()) {
1338     Types[Context.CharTy] = "%c";
1339     Types[Context.BoolTy] = "%d";
1340     Types[Context.SignedCharTy] = "%hhd";
1341     Types[Context.UnsignedCharTy] = "%hhu";
1342     Types[Context.IntTy] = "%d";
1343     Types[Context.UnsignedIntTy] = "%u";
1344     Types[Context.LongTy] = "%ld";
1345     Types[Context.UnsignedLongTy] = "%lu";
1346     Types[Context.LongLongTy] = "%lld";
1347     Types[Context.UnsignedLongLongTy] = "%llu";
1348     Types[Context.ShortTy] = "%hd";
1349     Types[Context.UnsignedShortTy] = "%hu";
1350     Types[Context.VoidPtrTy] = "%p";
1351     Types[Context.FloatTy] = "%f";
1352     Types[Context.DoubleTy] = "%f";
1353     Types[Context.LongDoubleTy] = "%Lf";
1354     Types[Context.getPointerType(Context.CharTy)] = "%s";
1355     Types[Context.getPointerType(Context.getConstType(Context.CharTy))] = "%s";
1356   }
1357 
1358   for (const auto *FD : RD->fields()) {
1359     uint64_t Off = RL.getFieldOffset(FD->getFieldIndex());
1360     Off = Ctx.toCharUnitsFromBits(Off).getQuantity();
1361 
1362     Value *FieldPtr = RecordPtr;
1363     if (RD->isUnion())
1364       FieldPtr = CGF.Builder.CreatePointerCast(
1365           FieldPtr, CGF.ConvertType(Context.getPointerType(FD->getType())));
1366     else
1367       FieldPtr = CGF.Builder.CreateStructGEP(CGF.ConvertType(RType), FieldPtr,
1368                                              FD->getFieldIndex());
1369 
1370     GString = CGF.Builder.CreateGlobalStringPtr(
1371         llvm::Twine(Pad)
1372             .concat(FD->getType().getAsString())
1373             .concat(llvm::Twine(' '))
1374             .concat(FD->getNameAsString())
1375             .concat(" : ")
1376             .str());
1377     Value *TmpRes = CGF.Builder.CreateCall(Func, {GString});
1378     Res = CGF.Builder.CreateAdd(Res, TmpRes);
1379 
1380     QualType CanonicalType =
1381         FD->getType().getUnqualifiedType().getCanonicalType();
1382 
1383     // We check whether we are in a recursive type
1384     if (CanonicalType->isRecordType()) {
1385       Value *TmpRes =
1386           dumpRecord(CGF, CanonicalType, FieldPtr, Align, Func, Lvl + 1);
1387       Res = CGF.Builder.CreateAdd(TmpRes, Res);
1388       continue;
1389     }
1390 
1391     // We try to determine the best format to print the current field
1392     llvm::Twine Format = Types.find(CanonicalType) == Types.end()
1393                              ? Types[Context.VoidPtrTy]
1394                              : Types[CanonicalType];
1395 
1396     Address FieldAddress = Address(FieldPtr, Align);
1397     FieldPtr = CGF.Builder.CreateLoad(FieldAddress);
1398 
1399     // FIXME Need to handle bitfield here
1400     GString = CGF.Builder.CreateGlobalStringPtr(
1401         Format.concat(llvm::Twine('\n')).str());
1402     TmpRes = CGF.Builder.CreateCall(Func, {GString, FieldPtr});
1403     Res = CGF.Builder.CreateAdd(Res, TmpRes);
1404   }
1405 
1406   GString = CGF.Builder.CreateGlobalStringPtr(Pad + "}\n");
1407   Value *TmpRes = CGF.Builder.CreateCall(Func, {GString});
1408   Res = CGF.Builder.CreateAdd(Res, TmpRes);
1409   return Res;
1410 }
1411 
1412 RValue CodeGenFunction::emitRotate(const CallExpr *E, bool IsRotateRight) {
1413   llvm::Value *Src = EmitScalarExpr(E->getArg(0));
1414   llvm::Value *ShiftAmt = EmitScalarExpr(E->getArg(1));
1415 
1416   // The builtin's shift arg may have a different type than the source arg and
1417   // result, but the LLVM intrinsic uses the same type for all values.
1418   llvm::Type *Ty = Src->getType();
1419   ShiftAmt = Builder.CreateIntCast(ShiftAmt, Ty, false);
1420 
1421   // Rotate is a special case of LLVM funnel shift - 1st 2 args are the same.
1422   unsigned IID = IsRotateRight ? Intrinsic::fshr : Intrinsic::fshl;
1423   Value *F = CGM.getIntrinsic(IID, Ty);
1424   return RValue::get(Builder.CreateCall(F, { Src, Src, ShiftAmt }));
1425 }
1426 
1427 RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
1428                                         const CallExpr *E,
1429                                         ReturnValueSlot ReturnValue) {
1430   const FunctionDecl *FD = GD.getDecl()->getAsFunction();
1431   // See if we can constant fold this builtin.  If so, don't emit it at all.
1432   Expr::EvalResult Result;
1433   if (E->EvaluateAsRValue(Result, CGM.getContext()) &&
1434       !Result.hasSideEffects()) {
1435     if (Result.Val.isInt())
1436       return RValue::get(llvm::ConstantInt::get(getLLVMContext(),
1437                                                 Result.Val.getInt()));
1438     if (Result.Val.isFloat())
1439       return RValue::get(llvm::ConstantFP::get(getLLVMContext(),
1440                                                Result.Val.getFloat()));
1441   }
1442 
1443   // There are LLVM math intrinsics/instructions corresponding to math library
1444   // functions except the LLVM op will never set errno while the math library
1445   // might. Also, math builtins have the same semantics as their math library
1446   // twins. Thus, we can transform math library and builtin calls to their
1447   // LLVM counterparts if the call is marked 'const' (known to never set errno).
1448   if (FD->hasAttr<ConstAttr>()) {
1449     switch (BuiltinID) {
1450     case Builtin::BIceil:
1451     case Builtin::BIceilf:
1452     case Builtin::BIceill:
1453     case Builtin::BI__builtin_ceil:
1454     case Builtin::BI__builtin_ceilf:
1455     case Builtin::BI__builtin_ceill:
1456       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::ceil));
1457 
1458     case Builtin::BIcopysign:
1459     case Builtin::BIcopysignf:
1460     case Builtin::BIcopysignl:
1461     case Builtin::BI__builtin_copysign:
1462     case Builtin::BI__builtin_copysignf:
1463     case Builtin::BI__builtin_copysignl:
1464     case Builtin::BI__builtin_copysignf128:
1465       return RValue::get(emitBinaryBuiltin(*this, E, Intrinsic::copysign));
1466 
1467     case Builtin::BIcos:
1468     case Builtin::BIcosf:
1469     case Builtin::BIcosl:
1470     case Builtin::BI__builtin_cos:
1471     case Builtin::BI__builtin_cosf:
1472     case Builtin::BI__builtin_cosl:
1473       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::cos));
1474 
1475     case Builtin::BIexp:
1476     case Builtin::BIexpf:
1477     case Builtin::BIexpl:
1478     case Builtin::BI__builtin_exp:
1479     case Builtin::BI__builtin_expf:
1480     case Builtin::BI__builtin_expl:
1481       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::exp));
1482 
1483     case Builtin::BIexp2:
1484     case Builtin::BIexp2f:
1485     case Builtin::BIexp2l:
1486     case Builtin::BI__builtin_exp2:
1487     case Builtin::BI__builtin_exp2f:
1488     case Builtin::BI__builtin_exp2l:
1489       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::exp2));
1490 
1491     case Builtin::BIfabs:
1492     case Builtin::BIfabsf:
1493     case Builtin::BIfabsl:
1494     case Builtin::BI__builtin_fabs:
1495     case Builtin::BI__builtin_fabsf:
1496     case Builtin::BI__builtin_fabsl:
1497     case Builtin::BI__builtin_fabsf128:
1498       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::fabs));
1499 
1500     case Builtin::BIfloor:
1501     case Builtin::BIfloorf:
1502     case Builtin::BIfloorl:
1503     case Builtin::BI__builtin_floor:
1504     case Builtin::BI__builtin_floorf:
1505     case Builtin::BI__builtin_floorl:
1506       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::floor));
1507 
1508     case Builtin::BIfma:
1509     case Builtin::BIfmaf:
1510     case Builtin::BIfmal:
1511     case Builtin::BI__builtin_fma:
1512     case Builtin::BI__builtin_fmaf:
1513     case Builtin::BI__builtin_fmal:
1514       return RValue::get(emitTernaryBuiltin(*this, E, Intrinsic::fma));
1515 
1516     case Builtin::BIfmax:
1517     case Builtin::BIfmaxf:
1518     case Builtin::BIfmaxl:
1519     case Builtin::BI__builtin_fmax:
1520     case Builtin::BI__builtin_fmaxf:
1521     case Builtin::BI__builtin_fmaxl:
1522       return RValue::get(emitBinaryBuiltin(*this, E, Intrinsic::maxnum));
1523 
1524     case Builtin::BIfmin:
1525     case Builtin::BIfminf:
1526     case Builtin::BIfminl:
1527     case Builtin::BI__builtin_fmin:
1528     case Builtin::BI__builtin_fminf:
1529     case Builtin::BI__builtin_fminl:
1530       return RValue::get(emitBinaryBuiltin(*this, E, Intrinsic::minnum));
1531 
1532     // fmod() is a special-case. It maps to the frem instruction rather than an
1533     // LLVM intrinsic.
1534     case Builtin::BIfmod:
1535     case Builtin::BIfmodf:
1536     case Builtin::BIfmodl:
1537     case Builtin::BI__builtin_fmod:
1538     case Builtin::BI__builtin_fmodf:
1539     case Builtin::BI__builtin_fmodl: {
1540       Value *Arg1 = EmitScalarExpr(E->getArg(0));
1541       Value *Arg2 = EmitScalarExpr(E->getArg(1));
1542       return RValue::get(Builder.CreateFRem(Arg1, Arg2, "fmod"));
1543     }
1544 
1545     case Builtin::BIlog:
1546     case Builtin::BIlogf:
1547     case Builtin::BIlogl:
1548     case Builtin::BI__builtin_log:
1549     case Builtin::BI__builtin_logf:
1550     case Builtin::BI__builtin_logl:
1551       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::log));
1552 
1553     case Builtin::BIlog10:
1554     case Builtin::BIlog10f:
1555     case Builtin::BIlog10l:
1556     case Builtin::BI__builtin_log10:
1557     case Builtin::BI__builtin_log10f:
1558     case Builtin::BI__builtin_log10l:
1559       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::log10));
1560 
1561     case Builtin::BIlog2:
1562     case Builtin::BIlog2f:
1563     case Builtin::BIlog2l:
1564     case Builtin::BI__builtin_log2:
1565     case Builtin::BI__builtin_log2f:
1566     case Builtin::BI__builtin_log2l:
1567       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::log2));
1568 
1569     case Builtin::BInearbyint:
1570     case Builtin::BInearbyintf:
1571     case Builtin::BInearbyintl:
1572     case Builtin::BI__builtin_nearbyint:
1573     case Builtin::BI__builtin_nearbyintf:
1574     case Builtin::BI__builtin_nearbyintl:
1575       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::nearbyint));
1576 
1577     case Builtin::BIpow:
1578     case Builtin::BIpowf:
1579     case Builtin::BIpowl:
1580     case Builtin::BI__builtin_pow:
1581     case Builtin::BI__builtin_powf:
1582     case Builtin::BI__builtin_powl:
1583       return RValue::get(emitBinaryBuiltin(*this, E, Intrinsic::pow));
1584 
1585     case Builtin::BIrint:
1586     case Builtin::BIrintf:
1587     case Builtin::BIrintl:
1588     case Builtin::BI__builtin_rint:
1589     case Builtin::BI__builtin_rintf:
1590     case Builtin::BI__builtin_rintl:
1591       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::rint));
1592 
1593     case Builtin::BIround:
1594     case Builtin::BIroundf:
1595     case Builtin::BIroundl:
1596     case Builtin::BI__builtin_round:
1597     case Builtin::BI__builtin_roundf:
1598     case Builtin::BI__builtin_roundl:
1599       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::round));
1600 
1601     case Builtin::BIsin:
1602     case Builtin::BIsinf:
1603     case Builtin::BIsinl:
1604     case Builtin::BI__builtin_sin:
1605     case Builtin::BI__builtin_sinf:
1606     case Builtin::BI__builtin_sinl:
1607       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::sin));
1608 
1609     case Builtin::BIsqrt:
1610     case Builtin::BIsqrtf:
1611     case Builtin::BIsqrtl:
1612     case Builtin::BI__builtin_sqrt:
1613     case Builtin::BI__builtin_sqrtf:
1614     case Builtin::BI__builtin_sqrtl:
1615       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::sqrt));
1616 
1617     case Builtin::BItrunc:
1618     case Builtin::BItruncf:
1619     case Builtin::BItruncl:
1620     case Builtin::BI__builtin_trunc:
1621     case Builtin::BI__builtin_truncf:
1622     case Builtin::BI__builtin_truncl:
1623       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::trunc));
1624 
1625     default:
1626       break;
1627     }
1628   }
1629 
1630   switch (BuiltinID) {
1631   default: break;
1632   case Builtin::BI__builtin___CFStringMakeConstantString:
1633   case Builtin::BI__builtin___NSStringMakeConstantString:
1634     return RValue::get(ConstantEmitter(*this).emitAbstract(E, E->getType()));
1635   case Builtin::BI__builtin_stdarg_start:
1636   case Builtin::BI__builtin_va_start:
1637   case Builtin::BI__va_start:
1638   case Builtin::BI__builtin_va_end:
1639     return RValue::get(
1640         EmitVAStartEnd(BuiltinID == Builtin::BI__va_start
1641                            ? EmitScalarExpr(E->getArg(0))
1642                            : EmitVAListRef(E->getArg(0)).getPointer(),
1643                        BuiltinID != Builtin::BI__builtin_va_end));
1644   case Builtin::BI__builtin_va_copy: {
1645     Value *DstPtr = EmitVAListRef(E->getArg(0)).getPointer();
1646     Value *SrcPtr = EmitVAListRef(E->getArg(1)).getPointer();
1647 
1648     llvm::Type *Type = Int8PtrTy;
1649 
1650     DstPtr = Builder.CreateBitCast(DstPtr, Type);
1651     SrcPtr = Builder.CreateBitCast(SrcPtr, Type);
1652     return RValue::get(Builder.CreateCall(CGM.getIntrinsic(Intrinsic::vacopy),
1653                                           {DstPtr, SrcPtr}));
1654   }
1655   case Builtin::BI__builtin_abs:
1656   case Builtin::BI__builtin_labs:
1657   case Builtin::BI__builtin_llabs: {
1658     // X < 0 ? -X : X
1659     // The negation has 'nsw' because abs of INT_MIN is undefined.
1660     Value *ArgValue = EmitScalarExpr(E->getArg(0));
1661     Value *NegOp = Builder.CreateNSWNeg(ArgValue, "neg");
1662     Constant *Zero = llvm::Constant::getNullValue(ArgValue->getType());
1663     Value *CmpResult = Builder.CreateICmpSLT(ArgValue, Zero, "abscond");
1664     Value *Result = Builder.CreateSelect(CmpResult, NegOp, ArgValue, "abs");
1665     return RValue::get(Result);
1666   }
1667   case Builtin::BI__builtin_conj:
1668   case Builtin::BI__builtin_conjf:
1669   case Builtin::BI__builtin_conjl: {
1670     ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
1671     Value *Real = ComplexVal.first;
1672     Value *Imag = ComplexVal.second;
1673     Value *Zero =
1674       Imag->getType()->isFPOrFPVectorTy()
1675         ? llvm::ConstantFP::getZeroValueForNegation(Imag->getType())
1676         : llvm::Constant::getNullValue(Imag->getType());
1677 
1678     Imag = Builder.CreateFSub(Zero, Imag, "sub");
1679     return RValue::getComplex(std::make_pair(Real, Imag));
1680   }
1681   case Builtin::BI__builtin_creal:
1682   case Builtin::BI__builtin_crealf:
1683   case Builtin::BI__builtin_creall:
1684   case Builtin::BIcreal:
1685   case Builtin::BIcrealf:
1686   case Builtin::BIcreall: {
1687     ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
1688     return RValue::get(ComplexVal.first);
1689   }
1690 
1691   case Builtin::BI__builtin_dump_struct: {
1692     Value *Func = EmitScalarExpr(E->getArg(1)->IgnoreImpCasts());
1693     CharUnits Arg0Align = EmitPointerWithAlignment(E->getArg(0)).getAlignment();
1694 
1695     const Expr *Arg0 = E->getArg(0)->IgnoreImpCasts();
1696     QualType Arg0Type = Arg0->getType()->getPointeeType();
1697 
1698     Value *RecordPtr = EmitScalarExpr(Arg0);
1699     Value *Res = dumpRecord(*this, Arg0Type, RecordPtr, Arg0Align, Func, 0);
1700     return RValue::get(Res);
1701   }
1702 
1703   case Builtin::BI__builtin_cimag:
1704   case Builtin::BI__builtin_cimagf:
1705   case Builtin::BI__builtin_cimagl:
1706   case Builtin::BIcimag:
1707   case Builtin::BIcimagf:
1708   case Builtin::BIcimagl: {
1709     ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
1710     return RValue::get(ComplexVal.second);
1711   }
1712 
1713   case Builtin::BI__builtin_clrsb:
1714   case Builtin::BI__builtin_clrsbl:
1715   case Builtin::BI__builtin_clrsbll: {
1716     // clrsb(x) -> clz(x < 0 ? ~x : x) - 1 or
1717     Value *ArgValue = EmitScalarExpr(E->getArg(0));
1718 
1719     llvm::Type *ArgType = ArgValue->getType();
1720     Value *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
1721 
1722     llvm::Type *ResultType = ConvertType(E->getType());
1723     Value *Zero = llvm::Constant::getNullValue(ArgType);
1724     Value *IsNeg = Builder.CreateICmpSLT(ArgValue, Zero, "isneg");
1725     Value *Inverse = Builder.CreateNot(ArgValue, "not");
1726     Value *Tmp = Builder.CreateSelect(IsNeg, Inverse, ArgValue);
1727     Value *Ctlz = Builder.CreateCall(F, {Tmp, Builder.getFalse()});
1728     Value *Result = Builder.CreateSub(Ctlz, llvm::ConstantInt::get(ArgType, 1));
1729     Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
1730                                    "cast");
1731     return RValue::get(Result);
1732   }
1733   case Builtin::BI__builtin_ctzs:
1734   case Builtin::BI__builtin_ctz:
1735   case Builtin::BI__builtin_ctzl:
1736   case Builtin::BI__builtin_ctzll: {
1737     Value *ArgValue = EmitCheckedArgForBuiltin(E->getArg(0), BCK_CTZPassedZero);
1738 
1739     llvm::Type *ArgType = ArgValue->getType();
1740     Value *F = CGM.getIntrinsic(Intrinsic::cttz, ArgType);
1741 
1742     llvm::Type *ResultType = ConvertType(E->getType());
1743     Value *ZeroUndef = Builder.getInt1(getTarget().isCLZForZeroUndef());
1744     Value *Result = Builder.CreateCall(F, {ArgValue, ZeroUndef});
1745     if (Result->getType() != ResultType)
1746       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
1747                                      "cast");
1748     return RValue::get(Result);
1749   }
1750   case Builtin::BI__builtin_clzs:
1751   case Builtin::BI__builtin_clz:
1752   case Builtin::BI__builtin_clzl:
1753   case Builtin::BI__builtin_clzll: {
1754     Value *ArgValue = EmitCheckedArgForBuiltin(E->getArg(0), BCK_CLZPassedZero);
1755 
1756     llvm::Type *ArgType = ArgValue->getType();
1757     Value *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
1758 
1759     llvm::Type *ResultType = ConvertType(E->getType());
1760     Value *ZeroUndef = Builder.getInt1(getTarget().isCLZForZeroUndef());
1761     Value *Result = Builder.CreateCall(F, {ArgValue, ZeroUndef});
1762     if (Result->getType() != ResultType)
1763       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
1764                                      "cast");
1765     return RValue::get(Result);
1766   }
1767   case Builtin::BI__builtin_ffs:
1768   case Builtin::BI__builtin_ffsl:
1769   case Builtin::BI__builtin_ffsll: {
1770     // ffs(x) -> x ? cttz(x) + 1 : 0
1771     Value *ArgValue = EmitScalarExpr(E->getArg(0));
1772 
1773     llvm::Type *ArgType = ArgValue->getType();
1774     Value *F = CGM.getIntrinsic(Intrinsic::cttz, ArgType);
1775 
1776     llvm::Type *ResultType = ConvertType(E->getType());
1777     Value *Tmp =
1778         Builder.CreateAdd(Builder.CreateCall(F, {ArgValue, Builder.getTrue()}),
1779                           llvm::ConstantInt::get(ArgType, 1));
1780     Value *Zero = llvm::Constant::getNullValue(ArgType);
1781     Value *IsZero = Builder.CreateICmpEQ(ArgValue, Zero, "iszero");
1782     Value *Result = Builder.CreateSelect(IsZero, Zero, Tmp, "ffs");
1783     if (Result->getType() != ResultType)
1784       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
1785                                      "cast");
1786     return RValue::get(Result);
1787   }
1788   case Builtin::BI__builtin_parity:
1789   case Builtin::BI__builtin_parityl:
1790   case Builtin::BI__builtin_parityll: {
1791     // parity(x) -> ctpop(x) & 1
1792     Value *ArgValue = EmitScalarExpr(E->getArg(0));
1793 
1794     llvm::Type *ArgType = ArgValue->getType();
1795     Value *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType);
1796 
1797     llvm::Type *ResultType = ConvertType(E->getType());
1798     Value *Tmp = Builder.CreateCall(F, ArgValue);
1799     Value *Result = Builder.CreateAnd(Tmp, llvm::ConstantInt::get(ArgType, 1));
1800     if (Result->getType() != ResultType)
1801       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
1802                                      "cast");
1803     return RValue::get(Result);
1804   }
1805   case Builtin::BI__popcnt16:
1806   case Builtin::BI__popcnt:
1807   case Builtin::BI__popcnt64:
1808   case Builtin::BI__builtin_popcount:
1809   case Builtin::BI__builtin_popcountl:
1810   case Builtin::BI__builtin_popcountll: {
1811     Value *ArgValue = EmitScalarExpr(E->getArg(0));
1812 
1813     llvm::Type *ArgType = ArgValue->getType();
1814     Value *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType);
1815 
1816     llvm::Type *ResultType = ConvertType(E->getType());
1817     Value *Result = Builder.CreateCall(F, ArgValue);
1818     if (Result->getType() != ResultType)
1819       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
1820                                      "cast");
1821     return RValue::get(Result);
1822   }
1823   case Builtin::BI_rotr8:
1824   case Builtin::BI_rotr16:
1825   case Builtin::BI_rotr:
1826   case Builtin::BI_lrotr:
1827   case Builtin::BI_rotr64: {
1828     Value *Val = EmitScalarExpr(E->getArg(0));
1829     Value *Shift = EmitScalarExpr(E->getArg(1));
1830 
1831     llvm::Type *ArgType = Val->getType();
1832     Shift = Builder.CreateIntCast(Shift, ArgType, false);
1833     unsigned ArgWidth = ArgType->getIntegerBitWidth();
1834     Value *Mask = llvm::ConstantInt::get(ArgType, ArgWidth - 1);
1835 
1836     Value *RightShiftAmt = Builder.CreateAnd(Shift, Mask);
1837     Value *RightShifted = Builder.CreateLShr(Val, RightShiftAmt);
1838     Value *LeftShiftAmt = Builder.CreateAnd(Builder.CreateNeg(Shift), Mask);
1839     Value *LeftShifted = Builder.CreateShl(Val, LeftShiftAmt);
1840     Value *Result = Builder.CreateOr(LeftShifted, RightShifted);
1841     return RValue::get(Result);
1842   }
1843   case Builtin::BI_rotl8:
1844   case Builtin::BI_rotl16:
1845   case Builtin::BI_rotl:
1846   case Builtin::BI_lrotl:
1847   case Builtin::BI_rotl64: {
1848     Value *Val = EmitScalarExpr(E->getArg(0));
1849     Value *Shift = EmitScalarExpr(E->getArg(1));
1850 
1851     llvm::Type *ArgType = Val->getType();
1852     Shift = Builder.CreateIntCast(Shift, ArgType, false);
1853     unsigned ArgWidth = ArgType->getIntegerBitWidth();
1854     Value *Mask = llvm::ConstantInt::get(ArgType, ArgWidth - 1);
1855 
1856     Value *LeftShiftAmt = Builder.CreateAnd(Shift, Mask);
1857     Value *LeftShifted = Builder.CreateShl(Val, LeftShiftAmt);
1858     Value *RightShiftAmt = Builder.CreateAnd(Builder.CreateNeg(Shift), Mask);
1859     Value *RightShifted = Builder.CreateLShr(Val, RightShiftAmt);
1860     Value *Result = Builder.CreateOr(LeftShifted, RightShifted);
1861     return RValue::get(Result);
1862   }
1863   case Builtin::BI__builtin_unpredictable: {
1864     // Always return the argument of __builtin_unpredictable. LLVM does not
1865     // handle this builtin. Metadata for this builtin should be added directly
1866     // to instructions such as branches or switches that use it.
1867     return RValue::get(EmitScalarExpr(E->getArg(0)));
1868   }
1869   case Builtin::BI__builtin_expect: {
1870     Value *ArgValue = EmitScalarExpr(E->getArg(0));
1871     llvm::Type *ArgType = ArgValue->getType();
1872 
1873     Value *ExpectedValue = EmitScalarExpr(E->getArg(1));
1874     // Don't generate llvm.expect on -O0 as the backend won't use it for
1875     // anything.
1876     // Note, we still IRGen ExpectedValue because it could have side-effects.
1877     if (CGM.getCodeGenOpts().OptimizationLevel == 0)
1878       return RValue::get(ArgValue);
1879 
1880     Value *FnExpect = CGM.getIntrinsic(Intrinsic::expect, ArgType);
1881     Value *Result =
1882         Builder.CreateCall(FnExpect, {ArgValue, ExpectedValue}, "expval");
1883     return RValue::get(Result);
1884   }
1885   case Builtin::BI__builtin_assume_aligned: {
1886     Value *PtrValue = EmitScalarExpr(E->getArg(0));
1887     Value *OffsetValue =
1888       (E->getNumArgs() > 2) ? EmitScalarExpr(E->getArg(2)) : nullptr;
1889 
1890     Value *AlignmentValue = EmitScalarExpr(E->getArg(1));
1891     ConstantInt *AlignmentCI = cast<ConstantInt>(AlignmentValue);
1892     unsigned Alignment = (unsigned) AlignmentCI->getZExtValue();
1893 
1894     EmitAlignmentAssumption(PtrValue, Alignment, OffsetValue);
1895     return RValue::get(PtrValue);
1896   }
1897   case Builtin::BI__assume:
1898   case Builtin::BI__builtin_assume: {
1899     if (E->getArg(0)->HasSideEffects(getContext()))
1900       return RValue::get(nullptr);
1901 
1902     Value *ArgValue = EmitScalarExpr(E->getArg(0));
1903     Value *FnAssume = CGM.getIntrinsic(Intrinsic::assume);
1904     return RValue::get(Builder.CreateCall(FnAssume, ArgValue));
1905   }
1906   case Builtin::BI__builtin_bswap16:
1907   case Builtin::BI__builtin_bswap32:
1908   case Builtin::BI__builtin_bswap64: {
1909     return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::bswap));
1910   }
1911   case Builtin::BI__builtin_bitreverse8:
1912   case Builtin::BI__builtin_bitreverse16:
1913   case Builtin::BI__builtin_bitreverse32:
1914   case Builtin::BI__builtin_bitreverse64: {
1915     return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::bitreverse));
1916   }
1917   case Builtin::BI__builtin_rotateleft8:
1918   case Builtin::BI__builtin_rotateleft16:
1919   case Builtin::BI__builtin_rotateleft32:
1920   case Builtin::BI__builtin_rotateleft64:
1921     return emitRotate(E, false);
1922 
1923   case Builtin::BI__builtin_rotateright8:
1924   case Builtin::BI__builtin_rotateright16:
1925   case Builtin::BI__builtin_rotateright32:
1926   case Builtin::BI__builtin_rotateright64:
1927     return emitRotate(E, true);
1928 
1929   case Builtin::BI__builtin_object_size: {
1930     unsigned Type =
1931         E->getArg(1)->EvaluateKnownConstInt(getContext()).getZExtValue();
1932     auto *ResType = cast<llvm::IntegerType>(ConvertType(E->getType()));
1933 
1934     // We pass this builtin onto the optimizer so that it can figure out the
1935     // object size in more complex cases.
1936     return RValue::get(emitBuiltinObjectSize(E->getArg(0), Type, ResType,
1937                                              /*EmittedE=*/nullptr));
1938   }
1939   case Builtin::BI__builtin_prefetch: {
1940     Value *Locality, *RW, *Address = EmitScalarExpr(E->getArg(0));
1941     // FIXME: Technically these constants should of type 'int', yes?
1942     RW = (E->getNumArgs() > 1) ? EmitScalarExpr(E->getArg(1)) :
1943       llvm::ConstantInt::get(Int32Ty, 0);
1944     Locality = (E->getNumArgs() > 2) ? EmitScalarExpr(E->getArg(2)) :
1945       llvm::ConstantInt::get(Int32Ty, 3);
1946     Value *Data = llvm::ConstantInt::get(Int32Ty, 1);
1947     Value *F = CGM.getIntrinsic(Intrinsic::prefetch);
1948     return RValue::get(Builder.CreateCall(F, {Address, RW, Locality, Data}));
1949   }
1950   case Builtin::BI__builtin_readcyclecounter: {
1951     Value *F = CGM.getIntrinsic(Intrinsic::readcyclecounter);
1952     return RValue::get(Builder.CreateCall(F));
1953   }
1954   case Builtin::BI__builtin___clear_cache: {
1955     Value *Begin = EmitScalarExpr(E->getArg(0));
1956     Value *End = EmitScalarExpr(E->getArg(1));
1957     Value *F = CGM.getIntrinsic(Intrinsic::clear_cache);
1958     return RValue::get(Builder.CreateCall(F, {Begin, End}));
1959   }
1960   case Builtin::BI__builtin_trap:
1961     return RValue::get(EmitTrapCall(Intrinsic::trap));
1962   case Builtin::BI__debugbreak:
1963     return RValue::get(EmitTrapCall(Intrinsic::debugtrap));
1964   case Builtin::BI__builtin_unreachable: {
1965     EmitUnreachable(E->getExprLoc());
1966 
1967     // We do need to preserve an insertion point.
1968     EmitBlock(createBasicBlock("unreachable.cont"));
1969 
1970     return RValue::get(nullptr);
1971   }
1972 
1973   case Builtin::BI__builtin_powi:
1974   case Builtin::BI__builtin_powif:
1975   case Builtin::BI__builtin_powil: {
1976     Value *Base = EmitScalarExpr(E->getArg(0));
1977     Value *Exponent = EmitScalarExpr(E->getArg(1));
1978     llvm::Type *ArgType = Base->getType();
1979     Value *F = CGM.getIntrinsic(Intrinsic::powi, ArgType);
1980     return RValue::get(Builder.CreateCall(F, {Base, Exponent}));
1981   }
1982 
1983   case Builtin::BI__builtin_isgreater:
1984   case Builtin::BI__builtin_isgreaterequal:
1985   case Builtin::BI__builtin_isless:
1986   case Builtin::BI__builtin_islessequal:
1987   case Builtin::BI__builtin_islessgreater:
1988   case Builtin::BI__builtin_isunordered: {
1989     // Ordered comparisons: we know the arguments to these are matching scalar
1990     // floating point values.
1991     Value *LHS = EmitScalarExpr(E->getArg(0));
1992     Value *RHS = EmitScalarExpr(E->getArg(1));
1993 
1994     switch (BuiltinID) {
1995     default: llvm_unreachable("Unknown ordered comparison");
1996     case Builtin::BI__builtin_isgreater:
1997       LHS = Builder.CreateFCmpOGT(LHS, RHS, "cmp");
1998       break;
1999     case Builtin::BI__builtin_isgreaterequal:
2000       LHS = Builder.CreateFCmpOGE(LHS, RHS, "cmp");
2001       break;
2002     case Builtin::BI__builtin_isless:
2003       LHS = Builder.CreateFCmpOLT(LHS, RHS, "cmp");
2004       break;
2005     case Builtin::BI__builtin_islessequal:
2006       LHS = Builder.CreateFCmpOLE(LHS, RHS, "cmp");
2007       break;
2008     case Builtin::BI__builtin_islessgreater:
2009       LHS = Builder.CreateFCmpONE(LHS, RHS, "cmp");
2010       break;
2011     case Builtin::BI__builtin_isunordered:
2012       LHS = Builder.CreateFCmpUNO(LHS, RHS, "cmp");
2013       break;
2014     }
2015     // ZExt bool to int type.
2016     return RValue::get(Builder.CreateZExt(LHS, ConvertType(E->getType())));
2017   }
2018   case Builtin::BI__builtin_isnan: {
2019     Value *V = EmitScalarExpr(E->getArg(0));
2020     V = Builder.CreateFCmpUNO(V, V, "cmp");
2021     return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType())));
2022   }
2023 
2024   case Builtin::BIfinite:
2025   case Builtin::BI__finite:
2026   case Builtin::BIfinitef:
2027   case Builtin::BI__finitef:
2028   case Builtin::BIfinitel:
2029   case Builtin::BI__finitel:
2030   case Builtin::BI__builtin_isinf:
2031   case Builtin::BI__builtin_isfinite: {
2032     // isinf(x)    --> fabs(x) == infinity
2033     // isfinite(x) --> fabs(x) != infinity
2034     // x != NaN via the ordered compare in either case.
2035     Value *V = EmitScalarExpr(E->getArg(0));
2036     Value *Fabs = EmitFAbs(*this, V);
2037     Constant *Infinity = ConstantFP::getInfinity(V->getType());
2038     CmpInst::Predicate Pred = (BuiltinID == Builtin::BI__builtin_isinf)
2039                                   ? CmpInst::FCMP_OEQ
2040                                   : CmpInst::FCMP_ONE;
2041     Value *FCmp = Builder.CreateFCmp(Pred, Fabs, Infinity, "cmpinf");
2042     return RValue::get(Builder.CreateZExt(FCmp, ConvertType(E->getType())));
2043   }
2044 
2045   case Builtin::BI__builtin_isinf_sign: {
2046     // isinf_sign(x) -> fabs(x) == infinity ? (signbit(x) ? -1 : 1) : 0
2047     Value *Arg = EmitScalarExpr(E->getArg(0));
2048     Value *AbsArg = EmitFAbs(*this, Arg);
2049     Value *IsInf = Builder.CreateFCmpOEQ(
2050         AbsArg, ConstantFP::getInfinity(Arg->getType()), "isinf");
2051     Value *IsNeg = EmitSignBit(*this, Arg);
2052 
2053     llvm::Type *IntTy = ConvertType(E->getType());
2054     Value *Zero = Constant::getNullValue(IntTy);
2055     Value *One = ConstantInt::get(IntTy, 1);
2056     Value *NegativeOne = ConstantInt::get(IntTy, -1);
2057     Value *SignResult = Builder.CreateSelect(IsNeg, NegativeOne, One);
2058     Value *Result = Builder.CreateSelect(IsInf, SignResult, Zero);
2059     return RValue::get(Result);
2060   }
2061 
2062   case Builtin::BI__builtin_isnormal: {
2063     // isnormal(x) --> x == x && fabsf(x) < infinity && fabsf(x) >= float_min
2064     Value *V = EmitScalarExpr(E->getArg(0));
2065     Value *Eq = Builder.CreateFCmpOEQ(V, V, "iseq");
2066 
2067     Value *Abs = EmitFAbs(*this, V);
2068     Value *IsLessThanInf =
2069       Builder.CreateFCmpULT(Abs, ConstantFP::getInfinity(V->getType()),"isinf");
2070     APFloat Smallest = APFloat::getSmallestNormalized(
2071                    getContext().getFloatTypeSemantics(E->getArg(0)->getType()));
2072     Value *IsNormal =
2073       Builder.CreateFCmpUGE(Abs, ConstantFP::get(V->getContext(), Smallest),
2074                             "isnormal");
2075     V = Builder.CreateAnd(Eq, IsLessThanInf, "and");
2076     V = Builder.CreateAnd(V, IsNormal, "and");
2077     return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType())));
2078   }
2079 
2080   case Builtin::BI__builtin_fpclassify: {
2081     Value *V = EmitScalarExpr(E->getArg(5));
2082     llvm::Type *Ty = ConvertType(E->getArg(5)->getType());
2083 
2084     // Create Result
2085     BasicBlock *Begin = Builder.GetInsertBlock();
2086     BasicBlock *End = createBasicBlock("fpclassify_end", this->CurFn);
2087     Builder.SetInsertPoint(End);
2088     PHINode *Result =
2089       Builder.CreatePHI(ConvertType(E->getArg(0)->getType()), 4,
2090                         "fpclassify_result");
2091 
2092     // if (V==0) return FP_ZERO
2093     Builder.SetInsertPoint(Begin);
2094     Value *IsZero = Builder.CreateFCmpOEQ(V, Constant::getNullValue(Ty),
2095                                           "iszero");
2096     Value *ZeroLiteral = EmitScalarExpr(E->getArg(4));
2097     BasicBlock *NotZero = createBasicBlock("fpclassify_not_zero", this->CurFn);
2098     Builder.CreateCondBr(IsZero, End, NotZero);
2099     Result->addIncoming(ZeroLiteral, Begin);
2100 
2101     // if (V != V) return FP_NAN
2102     Builder.SetInsertPoint(NotZero);
2103     Value *IsNan = Builder.CreateFCmpUNO(V, V, "cmp");
2104     Value *NanLiteral = EmitScalarExpr(E->getArg(0));
2105     BasicBlock *NotNan = createBasicBlock("fpclassify_not_nan", this->CurFn);
2106     Builder.CreateCondBr(IsNan, End, NotNan);
2107     Result->addIncoming(NanLiteral, NotZero);
2108 
2109     // if (fabs(V) == infinity) return FP_INFINITY
2110     Builder.SetInsertPoint(NotNan);
2111     Value *VAbs = EmitFAbs(*this, V);
2112     Value *IsInf =
2113       Builder.CreateFCmpOEQ(VAbs, ConstantFP::getInfinity(V->getType()),
2114                             "isinf");
2115     Value *InfLiteral = EmitScalarExpr(E->getArg(1));
2116     BasicBlock *NotInf = createBasicBlock("fpclassify_not_inf", this->CurFn);
2117     Builder.CreateCondBr(IsInf, End, NotInf);
2118     Result->addIncoming(InfLiteral, NotNan);
2119 
2120     // if (fabs(V) >= MIN_NORMAL) return FP_NORMAL else FP_SUBNORMAL
2121     Builder.SetInsertPoint(NotInf);
2122     APFloat Smallest = APFloat::getSmallestNormalized(
2123         getContext().getFloatTypeSemantics(E->getArg(5)->getType()));
2124     Value *IsNormal =
2125       Builder.CreateFCmpUGE(VAbs, ConstantFP::get(V->getContext(), Smallest),
2126                             "isnormal");
2127     Value *NormalResult =
2128       Builder.CreateSelect(IsNormal, EmitScalarExpr(E->getArg(2)),
2129                            EmitScalarExpr(E->getArg(3)));
2130     Builder.CreateBr(End);
2131     Result->addIncoming(NormalResult, NotInf);
2132 
2133     // return Result
2134     Builder.SetInsertPoint(End);
2135     return RValue::get(Result);
2136   }
2137 
  case Builtin::BIalloca:
  case Builtin::BI_alloca:
  case Builtin::BI__builtin_alloca: {
    // Emit a dynamically-sized i8 alloca whose byte count is the (runtime)
    // value of the first argument.
    Value *Size = EmitScalarExpr(E->getArg(0));
    const TargetInfo &TI = getContext().getTargetInfo();
    // The alignment of the alloca should correspond to __BIGGEST_ALIGNMENT__.
    unsigned SuitableAlignmentInBytes =
        CGM.getContext()
            .toCharUnitsFromBits(TI.getSuitableAlign())
            .getQuantity();
    AllocaInst *AI = Builder.CreateAlloca(Builder.getInt8Ty(), Size);
    AI->setAlignment(SuitableAlignmentInBytes);
    // The raw alloca pointer is the builtin's result.
    return RValue::get(AI);
  }

  case Builtin::BI__builtin_alloca_with_align: {
    // Like alloca, but the second argument carries the requested alignment
    // in *bits* as an integer-constant expression.
    Value *Size = EmitScalarExpr(E->getArg(0));
    Value *AlignmentInBitsValue = EmitScalarExpr(E->getArg(1));
    // Sema guarantees the alignment argument is a constant, hence the cast<>.
    auto *AlignmentInBitsCI = cast<ConstantInt>(AlignmentInBitsValue);
    unsigned AlignmentInBits = AlignmentInBitsCI->getZExtValue();
    // Convert bits -> bytes for the IR alignment attribute.
    unsigned AlignmentInBytes =
        CGM.getContext().toCharUnitsFromBits(AlignmentInBits).getQuantity();
    AllocaInst *AI = Builder.CreateAlloca(Builder.getInt8Ty(), Size);
    AI->setAlignment(AlignmentInBytes);
    return RValue::get(AI);
  }
2164 
  case Builtin::BIbzero:
  case Builtin::BI__builtin_bzero: {
    // bzero(p, n) -> memset(p, 0, n). Emits a nonnull check on the pointer
    // argument (when sanitizers request it) before the memset.
    Address Dest = EmitPointerWithAlignment(E->getArg(0));
    Value *SizeVal = EmitScalarExpr(E->getArg(1));
    EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
                        E->getArg(0)->getExprLoc(), FD, 0);
    Builder.CreateMemSet(Dest, Builder.getInt8(0), SizeVal, false);
    // bzero returns void.
    return RValue::get(nullptr);
  }
  case Builtin::BImemcpy:
  case Builtin::BI__builtin_memcpy: {
    // Lower directly to the llvm.memcpy intrinsic; alignment comes from the
    // pointer expressions via EmitPointerWithAlignment.
    Address Dest = EmitPointerWithAlignment(E->getArg(0));
    Address Src = EmitPointerWithAlignment(E->getArg(1));
    Value *SizeVal = EmitScalarExpr(E->getArg(2));
    EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
                        E->getArg(0)->getExprLoc(), FD, 0);
    EmitNonNullArgCheck(RValue::get(Src.getPointer()), E->getArg(1)->getType(),
                        E->getArg(1)->getExprLoc(), FD, 1);
    Builder.CreateMemCpy(Dest, Src, SizeVal, false);
    // memcpy returns the destination pointer.
    return RValue::get(Dest.getPointer());
  }

  case Builtin::BI__builtin_char_memchr:
    // Same lowering as memchr; retarget the builtin ID and fall through to
    // the generic library-call emission after the switch.
    BuiltinID = Builtin::BI__builtin_memchr;
    break;

  case Builtin::BI__builtin___memcpy_chk: {
    // fold __builtin_memcpy_chk(x, y, cst1, cst2) to memcpy iff cst1<=cst2.
    llvm::APSInt Size, DstSize;
    // Both the copy size and the object size must be compile-time constants
    // to fold; otherwise break out to the default (library call) path.
    if (!E->getArg(2)->EvaluateAsInt(Size, CGM.getContext()) ||
        !E->getArg(3)->EvaluateAsInt(DstSize, CGM.getContext()))
      break;
    if (Size.ugt(DstSize))
      break;
    Address Dest = EmitPointerWithAlignment(E->getArg(0));
    Address Src = EmitPointerWithAlignment(E->getArg(1));
    Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size);
    Builder.CreateMemCpy(Dest, Src, SizeVal, false);
    return RValue::get(Dest.getPointer());
  }
2205 
  case Builtin::BI__builtin_objc_memmove_collectable: {
    // Objective-C GC-aware memmove: delegate to the ObjC runtime so write
    // barriers are emitted for collectable memory.
    Address DestAddr = EmitPointerWithAlignment(E->getArg(0));
    Address SrcAddr = EmitPointerWithAlignment(E->getArg(1));
    Value *SizeVal = EmitScalarExpr(E->getArg(2));
    CGM.getObjCRuntime().EmitGCMemmoveCollectable(*this,
                                                  DestAddr, SrcAddr, SizeVal);
    return RValue::get(DestAddr.getPointer());
  }

  case Builtin::BI__builtin___memmove_chk: {
    // fold __builtin_memmove_chk(x, y, cst1, cst2) to memmove iff cst1<=cst2.
    llvm::APSInt Size, DstSize;
    // Requires constant size and object size; otherwise fall back to the
    // checked library call via the default path.
    if (!E->getArg(2)->EvaluateAsInt(Size, CGM.getContext()) ||
        !E->getArg(3)->EvaluateAsInt(DstSize, CGM.getContext()))
      break;
    if (Size.ugt(DstSize))
      break;
    Address Dest = EmitPointerWithAlignment(E->getArg(0));
    Address Src = EmitPointerWithAlignment(E->getArg(1));
    Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size);
    Builder.CreateMemMove(Dest, Src, SizeVal, false);
    return RValue::get(Dest.getPointer());
  }

  case Builtin::BImemmove:
  case Builtin::BI__builtin_memmove: {
    // Lower to llvm.memmove with nonnull checks on both pointer arguments.
    Address Dest = EmitPointerWithAlignment(E->getArg(0));
    Address Src = EmitPointerWithAlignment(E->getArg(1));
    Value *SizeVal = EmitScalarExpr(E->getArg(2));
    EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
                        E->getArg(0)->getExprLoc(), FD, 0);
    EmitNonNullArgCheck(RValue::get(Src.getPointer()), E->getArg(1)->getType(),
                        E->getArg(1)->getExprLoc(), FD, 1);
    Builder.CreateMemMove(Dest, Src, SizeVal, false);
    return RValue::get(Dest.getPointer());
  }
  case Builtin::BImemset:
  case Builtin::BI__builtin_memset: {
    Address Dest = EmitPointerWithAlignment(E->getArg(0));
    // The fill value arrives as an int; memset only uses its low byte.
    Value *ByteVal = Builder.CreateTrunc(EmitScalarExpr(E->getArg(1)),
                                         Builder.getInt8Ty());
    Value *SizeVal = EmitScalarExpr(E->getArg(2));
    EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
                        E->getArg(0)->getExprLoc(), FD, 0);
    Builder.CreateMemSet(Dest, ByteVal, SizeVal, false);
    return RValue::get(Dest.getPointer());
  }
  case Builtin::BI__builtin___memset_chk: {
    // fold __builtin_memset_chk(x, y, cst1, cst2) to memset iff cst1<=cst2.
    llvm::APSInt Size, DstSize;
    if (!E->getArg(2)->EvaluateAsInt(Size, CGM.getContext()) ||
        !E->getArg(3)->EvaluateAsInt(DstSize, CGM.getContext()))
      break;
    if (Size.ugt(DstSize))
      break;
    Address Dest = EmitPointerWithAlignment(E->getArg(0));
    Value *ByteVal = Builder.CreateTrunc(EmitScalarExpr(E->getArg(1)),
                                         Builder.getInt8Ty());
    Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size);
    Builder.CreateMemSet(Dest, ByteVal, SizeVal, false);
    return RValue::get(Dest.getPointer());
  }
  case Builtin::BI__builtin_wmemcmp: {
    // The MSVC runtime library does not provide a definition of wmemcmp, so we
    // need an inline implementation.
    if (!getTarget().getTriple().isOSMSVCRT())
      break;

    llvm::Type *WCharTy = ConvertType(getContext().WCharTy);

    Value *Dst = EmitScalarExpr(E->getArg(0));
    Value *Src = EmitScalarExpr(E->getArg(1));
    Value *Size = EmitScalarExpr(E->getArg(2));

    // Hand-build a small loop:
    //   Entry: if (size == 0) goto Exit (result 0)
    //   CmpGT: load *dst/*src; if (*dst > *src) goto Exit (result 1)
    //   CmpLT: if (*dst < *src) goto Exit (result -1)
    //   Next : ++dst; ++src; if (--size == 0) goto Exit (result 0) else CmpGT
    BasicBlock *Entry = Builder.GetInsertBlock();
    BasicBlock *CmpGT = createBasicBlock("wmemcmp.gt");
    BasicBlock *CmpLT = createBasicBlock("wmemcmp.lt");
    BasicBlock *Next = createBasicBlock("wmemcmp.next");
    BasicBlock *Exit = createBasicBlock("wmemcmp.exit");
    Value *SizeEq0 = Builder.CreateICmpEQ(Size, ConstantInt::get(SizeTy, 0));
    Builder.CreateCondBr(SizeEq0, Exit, CmpGT);

    EmitBlock(CmpGT);
    // Loop-carried values: current dst/src pointers and remaining count.
    PHINode *DstPhi = Builder.CreatePHI(Dst->getType(), 2);
    DstPhi->addIncoming(Dst, Entry);
    PHINode *SrcPhi = Builder.CreatePHI(Src->getType(), 2);
    SrcPhi->addIncoming(Src, Entry);
    PHINode *SizePhi = Builder.CreatePHI(SizeTy, 2);
    SizePhi->addIncoming(Size, Entry);
    CharUnits WCharAlign =
        getContext().getTypeAlignInChars(getContext().WCharTy);
    Value *DstCh = Builder.CreateAlignedLoad(WCharTy, DstPhi, WCharAlign);
    Value *SrcCh = Builder.CreateAlignedLoad(WCharTy, SrcPhi, WCharAlign);
    // Elements are compared as unsigned (wchar_t is presumably unsigned on
    // MSVC targets; confirm against the target's WCharTy).
    Value *DstGtSrc = Builder.CreateICmpUGT(DstCh, SrcCh);
    Builder.CreateCondBr(DstGtSrc, Exit, CmpLT);

    EmitBlock(CmpLT);
    Value *DstLtSrc = Builder.CreateICmpULT(DstCh, SrcCh);
    Builder.CreateCondBr(DstLtSrc, Exit, Next);

    EmitBlock(Next);
    // Advance both pointers by one element and decrement the count.
    Value *NextDst = Builder.CreateConstInBoundsGEP1_32(WCharTy, DstPhi, 1);
    Value *NextSrc = Builder.CreateConstInBoundsGEP1_32(WCharTy, SrcPhi, 1);
    Value *NextSize = Builder.CreateSub(SizePhi, ConstantInt::get(SizeTy, 1));
    Value *NextSizeEq0 =
        Builder.CreateICmpEQ(NextSize, ConstantInt::get(SizeTy, 0));
    Builder.CreateCondBr(NextSizeEq0, Exit, CmpGT);
    DstPhi->addIncoming(NextDst, Next);
    SrcPhi->addIncoming(NextSrc, Next);
    SizePhi->addIncoming(NextSize, Next);

    EmitBlock(Exit);
    // Merge the four exits: equal-so-far paths yield 0, the element
    // comparisons yield 1 / -1.
    PHINode *Ret = Builder.CreatePHI(IntTy, 4);
    Ret->addIncoming(ConstantInt::get(IntTy, 0), Entry);
    Ret->addIncoming(ConstantInt::get(IntTy, 1), CmpGT);
    Ret->addIncoming(ConstantInt::get(IntTy, -1), CmpLT);
    Ret->addIncoming(ConstantInt::get(IntTy, 0), Next);
    return RValue::get(Ret);
  }
  case Builtin::BI__builtin_dwarf_cfa: {
    // The offset in bytes from the first argument to the CFA.
    //
    // Why on earth is this in the frontend?  Is there any reason at
    // all that the backend can't reasonably determine this while
    // lowering llvm.eh.dwarf.cfa()?
    //
    // TODO: If there's a satisfactory reason, add a target hook for
    // this instead of hard-coding 0, which is correct for most targets.
    int32_t Offset = 0;

    Value *F = CGM.getIntrinsic(Intrinsic::eh_dwarf_cfa);
    return RValue::get(Builder.CreateCall(F,
                                      llvm::ConstantInt::get(Int32Ty, Offset)));
  }
  case Builtin::BI__builtin_return_address: {
    // The depth argument must be a constant; evaluate it abstractly and pass
    // it to llvm.returnaddress.
    Value *Depth = ConstantEmitter(*this).emitAbstract(E->getArg(0),
                                                   getContext().UnsignedIntTy);
    Value *F = CGM.getIntrinsic(Intrinsic::returnaddress);
    return RValue::get(Builder.CreateCall(F, Depth));
  }
  case Builtin::BI_ReturnAddress: {
    // MSVC's _ReturnAddress takes no argument; it is always depth 0.
    Value *F = CGM.getIntrinsic(Intrinsic::returnaddress);
    return RValue::get(Builder.CreateCall(F, Builder.getInt32(0)));
  }
  case Builtin::BI__builtin_frame_address: {
    Value *Depth = ConstantEmitter(*this).emitAbstract(E->getArg(0),
                                                   getContext().UnsignedIntTy);
    Value *F = CGM.getIntrinsic(Intrinsic::frameaddress);
    return RValue::get(Builder.CreateCall(F, Depth));
  }
  case Builtin::BI__builtin_extract_return_addr: {
    // Target hook: strip any target-specific encoding from a return address.
    Value *Address = EmitScalarExpr(E->getArg(0));
    Value *Result = getTargetHooks().decodeReturnAddress(*this, Address);
    return RValue::get(Result);
  }
  case Builtin::BI__builtin_frob_return_addr: {
    // Target hook: apply the target-specific return-address encoding.
    Value *Address = EmitScalarExpr(E->getArg(0));
    Value *Result = getTargetHooks().encodeReturnAddress(*this, Address);
    return RValue::get(Result);
  }
  case Builtin::BI__builtin_dwarf_sp_column: {
    // DWARF register number of the stack pointer, from the target hooks.
    llvm::IntegerType *Ty
      = cast<llvm::IntegerType>(ConvertType(E->getType()));
    int Column = getTargetHooks().getDwarfEHStackPointer(CGM);
    if (Column == -1) {
      // Target doesn't provide one: diagnose and return undef.
      CGM.ErrorUnsupported(E, "__builtin_dwarf_sp_column");
      return RValue::get(llvm::UndefValue::get(Ty));
    }
    return RValue::get(llvm::ConstantInt::get(Ty, Column, true));
  }
  case Builtin::BI__builtin_init_dwarf_reg_size_table: {
    Value *Address = EmitScalarExpr(E->getArg(0));
    if (getTargetHooks().initDwarfEHRegSizeTable(*this, Address))
      CGM.ErrorUnsupported(E, "__builtin_init_dwarf_reg_size_table");
    return RValue::get(llvm::UndefValue::get(ConvertType(E->getType())));
  }
  case Builtin::BI__builtin_eh_return: {
    Value *Int = EmitScalarExpr(E->getArg(0));
    Value *Ptr = EmitScalarExpr(E->getArg(1));

    // Pick the i32 or i64 flavor of the intrinsic from the offset's width.
    llvm::IntegerType *IntTy = cast<llvm::IntegerType>(Int->getType());
    assert((IntTy->getBitWidth() == 32 || IntTy->getBitWidth() == 64) &&
           "LLVM's __builtin_eh_return only supports 32- and 64-bit variants");
    Value *F = CGM.getIntrinsic(IntTy->getBitWidth() == 32
                                  ? Intrinsic::eh_return_i32
                                  : Intrinsic::eh_return_i64);
    Builder.CreateCall(F, {Int, Ptr});
    // eh_return never returns to the caller.
    Builder.CreateUnreachable();

    // We do need to preserve an insertion point.
    EmitBlock(createBasicBlock("builtin_eh_return.cont"));

    return RValue::get(nullptr);
  }
  case Builtin::BI__builtin_unwind_init: {
    // Force the prologue to save all callee-saved registers for unwinding.
    Value *F = CGM.getIntrinsic(Intrinsic::eh_unwind_init);
    return RValue::get(Builder.CreateCall(F));
  }
  case Builtin::BI__builtin_extend_pointer: {
    // Extends a pointer to the size of an _Unwind_Word, which is
    // uint64_t on all platforms.  Generally this gets poked into a
    // register and eventually used as an address, so if the
    // addressing registers are wider than pointers and the platform
    // doesn't implicitly ignore high-order bits when doing
    // addressing, we need to make sure we zext / sext based on
    // the platform's expectations.
    //
    // See: http://gcc.gnu.org/ml/gcc-bugs/2002-02/msg00237.html

    // Cast the pointer to intptr_t.
    Value *Ptr = EmitScalarExpr(E->getArg(0));
    Value *Result = Builder.CreatePtrToInt(Ptr, IntPtrTy, "extend.cast");

    // If that's 64 bits, we're done.
    if (IntPtrTy->getBitWidth() == 64)
      return RValue::get(Result);

    // Otherwise, ask the codegen data what to do.
    if (getTargetHooks().extendPointerWithSExt())
      return RValue::get(Builder.CreateSExt(Result, Int64Ty, "extend.sext"));
    else
      return RValue::get(Builder.CreateZExt(Result, Int64Ty, "extend.zext"));
  }
  case Builtin::BI__builtin_setjmp: {
    // Buffer is a void**.
    Address Buf = EmitPointerWithAlignment(E->getArg(0));

    // Store the frame pointer to the setjmp buffer.
    Value *FrameAddr =
      Builder.CreateCall(CGM.getIntrinsic(Intrinsic::frameaddress),
                         ConstantInt::get(Int32Ty, 0));
    Builder.CreateStore(FrameAddr, Buf);

    // Store the stack pointer to the setjmp buffer.
    Value *StackAddr =
        Builder.CreateCall(CGM.getIntrinsic(Intrinsic::stacksave));
    // Slot 2 of the buffer (slot 1 is reserved for the target).
    Address StackSaveSlot =
      Builder.CreateConstInBoundsGEP(Buf, 2, getPointerSize());
    Builder.CreateStore(StackAddr, StackSaveSlot);

    // Call LLVM's EH setjmp, which is lightweight.
    Value *F = CGM.getIntrinsic(Intrinsic::eh_sjlj_setjmp);
    Buf = Builder.CreateBitCast(Buf, Int8PtrTy);
    return RValue::get(Builder.CreateCall(F, Buf.getPointer()));
  }
  case Builtin::BI__builtin_longjmp: {
    Value *Buf = EmitScalarExpr(E->getArg(0));
    Buf = Builder.CreateBitCast(Buf, Int8PtrTy);

    // Call LLVM's EH longjmp, which is lightweight.
    Builder.CreateCall(CGM.getIntrinsic(Intrinsic::eh_sjlj_longjmp), Buf);

    // longjmp doesn't return; mark this as unreachable.
    Builder.CreateUnreachable();

    // We do need to preserve an insertion point.
    EmitBlock(createBasicBlock("longjmp.cont"));

    return RValue::get(nullptr);
  }
  // The generic (unsuffixed) __sync_* forms are rewritten by Sema into the
  // size-suffixed variants below, so they should never reach CodeGen.
  case Builtin::BI__sync_fetch_and_add:
  case Builtin::BI__sync_fetch_and_sub:
  case Builtin::BI__sync_fetch_and_or:
  case Builtin::BI__sync_fetch_and_and:
  case Builtin::BI__sync_fetch_and_xor:
  case Builtin::BI__sync_fetch_and_nand:
  case Builtin::BI__sync_add_and_fetch:
  case Builtin::BI__sync_sub_and_fetch:
  case Builtin::BI__sync_and_and_fetch:
  case Builtin::BI__sync_or_and_fetch:
  case Builtin::BI__sync_xor_and_fetch:
  case Builtin::BI__sync_nand_and_fetch:
  case Builtin::BI__sync_val_compare_and_swap:
  case Builtin::BI__sync_bool_compare_and_swap:
  case Builtin::BI__sync_lock_test_and_set:
  case Builtin::BI__sync_lock_release:
  case Builtin::BI__sync_swap:
    llvm_unreachable("Shouldn't make it through sema");
  // Each size-suffixed fetch-and-op lowers to a single atomicrmw that
  // returns the *old* value.
  case Builtin::BI__sync_fetch_and_add_1:
  case Builtin::BI__sync_fetch_and_add_2:
  case Builtin::BI__sync_fetch_and_add_4:
  case Builtin::BI__sync_fetch_and_add_8:
  case Builtin::BI__sync_fetch_and_add_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Add, E);
  case Builtin::BI__sync_fetch_and_sub_1:
  case Builtin::BI__sync_fetch_and_sub_2:
  case Builtin::BI__sync_fetch_and_sub_4:
  case Builtin::BI__sync_fetch_and_sub_8:
  case Builtin::BI__sync_fetch_and_sub_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Sub, E);
  case Builtin::BI__sync_fetch_and_or_1:
  case Builtin::BI__sync_fetch_and_or_2:
  case Builtin::BI__sync_fetch_and_or_4:
  case Builtin::BI__sync_fetch_and_or_8:
  case Builtin::BI__sync_fetch_and_or_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Or, E);
  case Builtin::BI__sync_fetch_and_and_1:
  case Builtin::BI__sync_fetch_and_and_2:
  case Builtin::BI__sync_fetch_and_and_4:
  case Builtin::BI__sync_fetch_and_and_8:
  case Builtin::BI__sync_fetch_and_and_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::And, E);
  case Builtin::BI__sync_fetch_and_xor_1:
  case Builtin::BI__sync_fetch_and_xor_2:
  case Builtin::BI__sync_fetch_and_xor_4:
  case Builtin::BI__sync_fetch_and_xor_8:
  case Builtin::BI__sync_fetch_and_xor_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xor, E);
  case Builtin::BI__sync_fetch_and_nand_1:
  case Builtin::BI__sync_fetch_and_nand_2:
  case Builtin::BI__sync_fetch_and_nand_4:
  case Builtin::BI__sync_fetch_and_nand_8:
  case Builtin::BI__sync_fetch_and_nand_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Nand, E);

  // Clang extensions: not overloaded yet.
  case Builtin::BI__sync_fetch_and_min:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Min, E);
  case Builtin::BI__sync_fetch_and_max:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Max, E);
  case Builtin::BI__sync_fetch_and_umin:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::UMin, E);
  case Builtin::BI__sync_fetch_and_umax:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::UMax, E);
2530 
  // The op-and-fetch forms return the *new* value: emit the atomicrmw and
  // then re-apply the operation (the extra Instruction argument) to its
  // result. nand additionally passes Invert=true (nand = ~(old & val)).
  case Builtin::BI__sync_add_and_fetch_1:
  case Builtin::BI__sync_add_and_fetch_2:
  case Builtin::BI__sync_add_and_fetch_4:
  case Builtin::BI__sync_add_and_fetch_8:
  case Builtin::BI__sync_add_and_fetch_16:
    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Add, E,
                                llvm::Instruction::Add);
  case Builtin::BI__sync_sub_and_fetch_1:
  case Builtin::BI__sync_sub_and_fetch_2:
  case Builtin::BI__sync_sub_and_fetch_4:
  case Builtin::BI__sync_sub_and_fetch_8:
  case Builtin::BI__sync_sub_and_fetch_16:
    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Sub, E,
                                llvm::Instruction::Sub);
  case Builtin::BI__sync_and_and_fetch_1:
  case Builtin::BI__sync_and_and_fetch_2:
  case Builtin::BI__sync_and_and_fetch_4:
  case Builtin::BI__sync_and_and_fetch_8:
  case Builtin::BI__sync_and_and_fetch_16:
    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::And, E,
                                llvm::Instruction::And);
  case Builtin::BI__sync_or_and_fetch_1:
  case Builtin::BI__sync_or_and_fetch_2:
  case Builtin::BI__sync_or_and_fetch_4:
  case Builtin::BI__sync_or_and_fetch_8:
  case Builtin::BI__sync_or_and_fetch_16:
    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Or, E,
                                llvm::Instruction::Or);
  case Builtin::BI__sync_xor_and_fetch_1:
  case Builtin::BI__sync_xor_and_fetch_2:
  case Builtin::BI__sync_xor_and_fetch_4:
  case Builtin::BI__sync_xor_and_fetch_8:
  case Builtin::BI__sync_xor_and_fetch_16:
    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Xor, E,
                                llvm::Instruction::Xor);
  case Builtin::BI__sync_nand_and_fetch_1:
  case Builtin::BI__sync_nand_and_fetch_2:
  case Builtin::BI__sync_nand_and_fetch_4:
  case Builtin::BI__sync_nand_and_fetch_8:
  case Builtin::BI__sync_nand_and_fetch_16:
    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Nand, E,
                                llvm::Instruction::And, true);

  // val_compare_and_swap returns the old value, bool_compare_and_swap the
  // success flag; the ReturnBool parameter selects between the two.
  case Builtin::BI__sync_val_compare_and_swap_1:
  case Builtin::BI__sync_val_compare_and_swap_2:
  case Builtin::BI__sync_val_compare_and_swap_4:
  case Builtin::BI__sync_val_compare_and_swap_8:
  case Builtin::BI__sync_val_compare_and_swap_16:
    return RValue::get(MakeAtomicCmpXchgValue(*this, E, false));

  case Builtin::BI__sync_bool_compare_and_swap_1:
  case Builtin::BI__sync_bool_compare_and_swap_2:
  case Builtin::BI__sync_bool_compare_and_swap_4:
  case Builtin::BI__sync_bool_compare_and_swap_8:
  case Builtin::BI__sync_bool_compare_and_swap_16:
    return RValue::get(MakeAtomicCmpXchgValue(*this, E, true));

  case Builtin::BI__sync_swap_1:
  case Builtin::BI__sync_swap_2:
  case Builtin::BI__sync_swap_4:
  case Builtin::BI__sync_swap_8:
  case Builtin::BI__sync_swap_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xchg, E);

  case Builtin::BI__sync_lock_test_and_set_1:
  case Builtin::BI__sync_lock_test_and_set_2:
  case Builtin::BI__sync_lock_test_and_set_4:
  case Builtin::BI__sync_lock_test_and_set_8:
  case Builtin::BI__sync_lock_test_and_set_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xchg, E);

  case Builtin::BI__sync_lock_release_1:
  case Builtin::BI__sync_lock_release_2:
  case Builtin::BI__sync_lock_release_4:
  case Builtin::BI__sync_lock_release_8:
  case Builtin::BI__sync_lock_release_16: {
    // Lower to an atomic release store of zero, sized to the pointee type.
    Value *Ptr = EmitScalarExpr(E->getArg(0));
    QualType ElTy = E->getArg(0)->getType()->getPointeeType();
    CharUnits StoreSize = getContext().getTypeSizeInChars(ElTy);
    llvm::Type *ITy = llvm::IntegerType::get(getLLVMContext(),
                                             StoreSize.getQuantity() * 8);
    Ptr = Builder.CreateBitCast(Ptr, ITy->getPointerTo());
    llvm::StoreInst *Store =
      Builder.CreateAlignedStore(llvm::Constant::getNullValue(ITy), Ptr,
                                 StoreSize);
    Store->setAtomic(llvm::AtomicOrdering::Release);
    return RValue::get(nullptr);
  }

  case Builtin::BI__sync_synchronize: {
    // We assume this is supposed to correspond to a C++0x-style
    // sequentially-consistent fence (i.e. this is only usable for
    // synchronization, not device I/O or anything like that). This intrinsic
    // is really badly designed in the sense that in theory, there isn't
    // any way to safely use it... but in practice, it mostly works
    // to use it with non-atomic loads and stores to get acquire/release
    // semantics.
    Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent);
    return RValue::get(nullptr);
  }
2631 
  case Builtin::BI__builtin_nontemporal_load:
    return RValue::get(EmitNontemporalLoad(*this, E));
  case Builtin::BI__builtin_nontemporal_store:
    return RValue::get(EmitNontemporalStore(*this, E));
  case Builtin::BI__c11_atomic_is_lock_free:
  case Builtin::BI__atomic_is_lock_free: {
    // Call "bool __atomic_is_lock_free(size_t size, void *ptr)". For the
    // __c11 builtin, ptr is 0 (indicating a properly-aligned object), since
    // _Atomic(T) is always properly-aligned.
    const char *LibCallName = "__atomic_is_lock_free";
    CallArgList Args;
    Args.add(RValue::get(EmitScalarExpr(E->getArg(0))),
             getContext().getSizeType());
    if (BuiltinID == Builtin::BI__atomic_is_lock_free)
      Args.add(RValue::get(EmitScalarExpr(E->getArg(1))),
               getContext().VoidPtrTy)
    else
      Args.add(RValue::get(llvm::Constant::getNullValue(VoidPtrTy)),
               getContext().VoidPtrTy);
    // Build the runtime function with the platform calling convention for
    // (size_t, void*) -> bool and emit a direct call to it.
    const CGFunctionInfo &FuncInfo =
        CGM.getTypes().arrangeBuiltinFunctionCall(E->getType(), Args);
    llvm::FunctionType *FTy = CGM.getTypes().GetFunctionType(FuncInfo);
    llvm::Constant *Func = CGM.CreateRuntimeFunction(FTy, LibCallName);
    return EmitCall(FuncInfo, CGCallee::forDirect(Func),
                    ReturnValueSlot(), Args);
  }
2658 
  case Builtin::BI__atomic_test_and_set: {
    // Look at the argument type to determine whether this is a volatile
    // operation. The parameter type is always volatile.
    QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
    bool Volatile =
        PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();

    // The operation is an i8 exchange with 1; the result is "was it set".
    Value *Ptr = EmitScalarExpr(E->getArg(0));
    unsigned AddrSpace = Ptr->getType()->getPointerAddressSpace();
    Ptr = Builder.CreateBitCast(Ptr, Int8Ty->getPointerTo(AddrSpace));
    Value *NewVal = Builder.getInt8(1);
    Value *Order = EmitScalarExpr(E->getArg(1));
    if (isa<llvm::ConstantInt>(Order)) {
      // Constant memory order: emit exactly one atomicrmw with the mapped
      // LLVM ordering.
      int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
      AtomicRMWInst *Result = nullptr;
      switch (ord) {
      case 0:  // memory_order_relaxed
      default: // invalid order
        Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
                                         llvm::AtomicOrdering::Monotonic);
        break;
      case 1: // memory_order_consume
      case 2: // memory_order_acquire
        Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
                                         llvm::AtomicOrdering::Acquire);
        break;
      case 3: // memory_order_release
        Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
                                         llvm::AtomicOrdering::Release);
        break;
      case 4: // memory_order_acq_rel

        Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
                                         llvm::AtomicOrdering::AcquireRelease);
        break;
      case 5: // memory_order_seq_cst
        Result = Builder.CreateAtomicRMW(
            llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
            llvm::AtomicOrdering::SequentiallyConsistent);
        break;
      }
      Result->setVolatile(Volatile);
      return RValue::get(Builder.CreateIsNotNull(Result, "tobool"));
    }

    // Runtime memory order: switch over the order value, with one block per
    // LLVM ordering, all merging into ContBB through a phi.
    llvm::BasicBlock *ContBB = createBasicBlock("atomic.continue", CurFn);

    llvm::BasicBlock *BBs[5] = {
      createBasicBlock("monotonic", CurFn),
      createBasicBlock("acquire", CurFn),
      createBasicBlock("release", CurFn),
      createBasicBlock("acqrel", CurFn),
      createBasicBlock("seqcst", CurFn)
    };
    llvm::AtomicOrdering Orders[5] = {
        llvm::AtomicOrdering::Monotonic, llvm::AtomicOrdering::Acquire,
        llvm::AtomicOrdering::Release, llvm::AtomicOrdering::AcquireRelease,
        llvm::AtomicOrdering::SequentiallyConsistent};

    // Invalid orders hit the switch default, which is the monotonic block.
    Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
    llvm::SwitchInst *SI = Builder.CreateSwitch(Order, BBs[0]);

    Builder.SetInsertPoint(ContBB);
    PHINode *Result = Builder.CreatePHI(Int8Ty, 5, "was_set");

    for (unsigned i = 0; i < 5; ++i) {
      Builder.SetInsertPoint(BBs[i]);
      AtomicRMWInst *RMW = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg,
                                                   Ptr, NewVal, Orders[i]);
      RMW->setVolatile(Volatile);
      Result->addIncoming(RMW, BBs[i]);
      Builder.CreateBr(ContBB);
    }

    // consume (1) and acquire (2) share the acquire block.
    SI->addCase(Builder.getInt32(0), BBs[0]);
    SI->addCase(Builder.getInt32(1), BBs[1]);
    SI->addCase(Builder.getInt32(2), BBs[1]);
    SI->addCase(Builder.getInt32(3), BBs[2]);
    SI->addCase(Builder.getInt32(4), BBs[3]);
    SI->addCase(Builder.getInt32(5), BBs[4]);

    Builder.SetInsertPoint(ContBB);
    return RValue::get(Builder.CreateIsNotNull(Result, "tobool"));
  }
2743 
  case Builtin::BI__atomic_clear: {
    // Volatility comes from the (pre-implicit-cast) argument type, as in
    // __atomic_test_and_set above.
    QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
    bool Volatile =
        PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();

    // Clearing is an atomic i8 store of 0.
    Address Ptr = EmitPointerWithAlignment(E->getArg(0));
    unsigned AddrSpace = Ptr.getPointer()->getType()->getPointerAddressSpace();
    Ptr = Builder.CreateBitCast(Ptr, Int8Ty->getPointerTo(AddrSpace));
    Value *NewVal = Builder.getInt8(0);
    Value *Order = EmitScalarExpr(E->getArg(1));
    if (isa<llvm::ConstantInt>(Order)) {
      // Constant memory order: emit a single store with the mapped ordering.
      // Only relaxed, release and seq_cst are valid for an atomic store.
      int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
      StoreInst *Store = Builder.CreateStore(NewVal, Ptr, Volatile);
      switch (ord) {
      case 0:  // memory_order_relaxed
      default: // invalid order
        Store->setOrdering(llvm::AtomicOrdering::Monotonic);
        break;
      case 3:  // memory_order_release
        Store->setOrdering(llvm::AtomicOrdering::Release);
        break;
      case 5:  // memory_order_seq_cst
        Store->setOrdering(llvm::AtomicOrdering::SequentiallyConsistent);
        break;
      }
      return RValue::get(nullptr);
    }

    // Runtime memory order: one block per valid ordering, selected by a
    // switch; invalid orders fall into the monotonic default block.
    llvm::BasicBlock *ContBB = createBasicBlock("atomic.continue", CurFn);

    llvm::BasicBlock *BBs[3] = {
      createBasicBlock("monotonic", CurFn),
      createBasicBlock("release", CurFn),
      createBasicBlock("seqcst", CurFn)
    };
    llvm::AtomicOrdering Orders[3] = {
        llvm::AtomicOrdering::Monotonic, llvm::AtomicOrdering::Release,
        llvm::AtomicOrdering::SequentiallyConsistent};

    Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
    llvm::SwitchInst *SI = Builder.CreateSwitch(Order, BBs[0]);

    for (unsigned i = 0; i < 3; ++i) {
      Builder.SetInsertPoint(BBs[i]);
      StoreInst *Store = Builder.CreateStore(NewVal, Ptr, Volatile);
      Store->setOrdering(Orders[i]);
      Builder.CreateBr(ContBB);
    }

    SI->addCase(Builder.getInt32(0), BBs[0]);
    SI->addCase(Builder.getInt32(3), BBs[1]);
    SI->addCase(Builder.getInt32(5), BBs[2]);

    Builder.SetInsertPoint(ContBB);
    return RValue::get(nullptr);
  }
2800 
  case Builtin::BI__atomic_thread_fence:
  case Builtin::BI__atomic_signal_fence:
  case Builtin::BI__c11_atomic_thread_fence:
  case Builtin::BI__c11_atomic_signal_fence: {
    // Signal fences only need to order against the current thread (e.g. a
    // signal handler), so they use the single-thread sync scope; thread
    // fences use the system-wide scope.
    llvm::SyncScope::ID SSID;
    if (BuiltinID == Builtin::BI__atomic_signal_fence ||
        BuiltinID == Builtin::BI__c11_atomic_signal_fence)
      SSID = llvm::SyncScope::SingleThread;
    else
      SSID = llvm::SyncScope::System;
    Value *Order = EmitScalarExpr(E->getArg(0));
    if (isa<llvm::ConstantInt>(Order)) {
      // Constant memory order: emit at most one fence with the matching LLVM
      // atomic ordering.  Relaxed (and invalid) orders need no fence at all.
      int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
      switch (ord) {
      case 0:  // memory_order_relaxed
      default: // invalid order
        break;
      case 1:  // memory_order_consume
      case 2:  // memory_order_acquire
        // Consume is strengthened to acquire.
        Builder.CreateFence(llvm::AtomicOrdering::Acquire, SSID);
        break;
      case 3:  // memory_order_release
        Builder.CreateFence(llvm::AtomicOrdering::Release, SSID);
        break;
      case 4:  // memory_order_acq_rel
        Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, SSID);
        break;
      case 5:  // memory_order_seq_cst
        Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, SSID);
        break;
      }
      return RValue::get(nullptr);
    }

    // Runtime memory order: branch on the order value, with one basic block
    // per distinct fence kind.  Relaxed/invalid orders take the switch
    // default straight to the continuation block and emit no fence.
    llvm::BasicBlock *AcquireBB, *ReleaseBB, *AcqRelBB, *SeqCstBB;
    AcquireBB = createBasicBlock("acquire", CurFn);
    ReleaseBB = createBasicBlock("release", CurFn);
    AcqRelBB = createBasicBlock("acqrel", CurFn);
    SeqCstBB = createBasicBlock("seqcst", CurFn);
    llvm::BasicBlock *ContBB = createBasicBlock("atomic.continue", CurFn);

    Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
    llvm::SwitchInst *SI = Builder.CreateSwitch(Order, ContBB);

    // memory_order_consume (1) and memory_order_acquire (2) share a block.
    Builder.SetInsertPoint(AcquireBB);
    Builder.CreateFence(llvm::AtomicOrdering::Acquire, SSID);
    Builder.CreateBr(ContBB);
    SI->addCase(Builder.getInt32(1), AcquireBB);
    SI->addCase(Builder.getInt32(2), AcquireBB);

    Builder.SetInsertPoint(ReleaseBB);
    Builder.CreateFence(llvm::AtomicOrdering::Release, SSID);
    Builder.CreateBr(ContBB);
    SI->addCase(Builder.getInt32(3), ReleaseBB);

    Builder.SetInsertPoint(AcqRelBB);
    Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, SSID);
    Builder.CreateBr(ContBB);
    SI->addCase(Builder.getInt32(4), AcqRelBB);

    Builder.SetInsertPoint(SeqCstBB);
    Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, SSID);
    Builder.CreateBr(ContBB);
    SI->addCase(Builder.getInt32(5), SeqCstBB);

    Builder.SetInsertPoint(ContBB);
    return RValue::get(nullptr);
  }
2869 
  case Builtin::BI__builtin_signbit:
  case Builtin::BI__builtin_signbitf:
  case Builtin::BI__builtin_signbitl: {
    // Extract the sign bit of the floating-point argument and zero-extend
    // the resulting i1 to the builtin's integer return type.
    return RValue::get(
        Builder.CreateZExt(EmitSignBit(*this, EmitScalarExpr(E->getArg(0))),
                           ConvertType(E->getType())));
  }
  case Builtin::BI__annotation: {
    // Each argument is expected to be a two-byte-per-character (UTF-16)
    // string literal; the whole call is lowered to a single
    // llvm.codeview.annotation intrinsic carrying the strings as metadata.
    // Re-encode each wide string to UTF8 and make an MDString.
    SmallVector<Metadata *, 1> Strings;
    for (const Expr *Arg : E->arguments()) {
      const auto *Str = cast<StringLiteral>(Arg->IgnoreParenCasts());
      assert(Str->getCharByteWidth() == 2);
      StringRef WideBytes = Str->getBytes();
      std::string StrUtf8;
      if (!convertUTF16ToUTF8String(
              makeArrayRef(WideBytes.data(), WideBytes.size()), StrUtf8)) {
        // Diagnose but keep going so the remaining arguments still emit.
        CGM.ErrorUnsupported(E, "non-UTF16 __annotation argument");
        continue;
      }
      Strings.push_back(llvm::MDString::get(getLLVMContext(), StrUtf8));
    }

    // Build an MDTuple of MDStrings and emit the intrinsic call.
    llvm::Value *F = CGM.getIntrinsic(llvm::Intrinsic::codeview_annotation, {});
    MDTuple *StrTuple = MDTuple::get(getLLVMContext(), Strings);
    Builder.CreateCall(F, MetadataAsValue::get(getLLVMContext(), StrTuple));
    // The builtin produces no value.
    return RValue::getIgnored();
  }
  case Builtin::BI__builtin_annotation: {
    // __builtin_annotation(value, "string"): attach an annotation string to
    // an integer value via the llvm.annotation intrinsic, overloaded on the
    // value's type; the annotated value is the result.
    llvm::Value *AnnVal = EmitScalarExpr(E->getArg(0));
    llvm::Value *F = CGM.getIntrinsic(llvm::Intrinsic::annotation,
                                      AnnVal->getType());

    // Get the annotation string, go through casts. Sema requires this to be a
    // non-wide string literal, potentially casted, so the cast<> is safe.
    const Expr *AnnotationStrExpr = E->getArg(1)->IgnoreParenCasts();
    StringRef Str = cast<StringLiteral>(AnnotationStrExpr)->getString();
    return RValue::get(EmitAnnotationCall(F, AnnVal, Str, E->getExprLoc()));
  }
  case Builtin::BI__builtin_addcb:
  case Builtin::BI__builtin_addcs:
  case Builtin::BI__builtin_addc:
  case Builtin::BI__builtin_addcl:
  case Builtin::BI__builtin_addcll:
  case Builtin::BI__builtin_subcb:
  case Builtin::BI__builtin_subcs:
  case Builtin::BI__builtin_subc:
  case Builtin::BI__builtin_subcl:
  case Builtin::BI__builtin_subcll: {

    // We translate all of these builtins from expressions of the form:
    //   int x = ..., y = ..., carryin = ..., carryout, result;
    //   result = __builtin_addc(x, y, carryin, &carryout);
    //
    // to LLVM IR of the form:
    //
    //   %tmp1 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)
    //   %tmpsum1 = extractvalue {i32, i1} %tmp1, 0
    //   %carry1 = extractvalue {i32, i1} %tmp1, 1
    //   %tmp2 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %tmpsum1,
    //                                                       i32 %carryin)
    //   %result = extractvalue {i32, i1} %tmp2, 0
    //   %carry2 = extractvalue {i32, i1} %tmp2, 1
    //   %tmp3 = or i1 %carry1, %carry2
    //   %tmp4 = zext i1 %tmp3 to i32
    //   store i32 %tmp4, i32* %carryout

    // Scalarize our inputs.
    llvm::Value *X = EmitScalarExpr(E->getArg(0));
    llvm::Value *Y = EmitScalarExpr(E->getArg(1));
    llvm::Value *Carryin = EmitScalarExpr(E->getArg(2));
    Address CarryOutPtr = EmitPointerWithAlignment(E->getArg(3));

    // Decide if we are lowering to a uadd.with.overflow or usub.with.overflow.
    llvm::Intrinsic::ID IntrinsicId;
    switch (BuiltinID) {
    default: llvm_unreachable("Unknown multiprecision builtin id.");
    case Builtin::BI__builtin_addcb:
    case Builtin::BI__builtin_addcs:
    case Builtin::BI__builtin_addc:
    case Builtin::BI__builtin_addcl:
    case Builtin::BI__builtin_addcll:
      IntrinsicId = llvm::Intrinsic::uadd_with_overflow;
      break;
    case Builtin::BI__builtin_subcb:
    case Builtin::BI__builtin_subcs:
    case Builtin::BI__builtin_subc:
    case Builtin::BI__builtin_subcl:
    case Builtin::BI__builtin_subcll:
      IntrinsicId = llvm::Intrinsic::usub_with_overflow;
      break;
    }

    // Construct our resulting LLVM IR expression.
    // First op: x op y; second op folds in the carry-in.
    llvm::Value *Carry1;
    llvm::Value *Sum1 = EmitOverflowIntrinsic(*this, IntrinsicId,
                                              X, Y, Carry1);
    llvm::Value *Carry2;
    llvm::Value *Sum2 = EmitOverflowIntrinsic(*this, IntrinsicId,
                                              Sum1, Carryin, Carry2);
    // The carry-out is set if either step overflowed; widen the i1 to the
    // operand type before storing through the carry-out pointer.
    llvm::Value *CarryOut = Builder.CreateZExt(Builder.CreateOr(Carry1, Carry2),
                                               X->getType());
    Builder.CreateStore(CarryOut, CarryOutPtr);
    return RValue::get(Sum2);
  }
2976 
  case Builtin::BI__builtin_add_overflow:
  case Builtin::BI__builtin_sub_overflow:
  case Builtin::BI__builtin_mul_overflow: {
    // Generic overflow builtins: the two operands and the result pointee may
    // all have different integer types.  The operation is performed in an
    // "encompassing" integer type wide enough to hold every value of all
    // three types, then truncated to the result type with an extra overflow
    // check for the truncation.  The builtin returns the i1 overflow flag.
    const clang::Expr *LeftArg = E->getArg(0);
    const clang::Expr *RightArg = E->getArg(1);
    const clang::Expr *ResultArg = E->getArg(2);

    clang::QualType ResultQTy =
        ResultArg->getType()->castAs<PointerType>()->getPointeeType();

    WidthAndSignedness LeftInfo =
        getIntegerWidthAndSignedness(CGM.getContext(), LeftArg->getType());
    WidthAndSignedness RightInfo =
        getIntegerWidthAndSignedness(CGM.getContext(), RightArg->getType());
    WidthAndSignedness ResultInfo =
        getIntegerWidthAndSignedness(CGM.getContext(), ResultQTy);

    // Handle mixed-sign multiplication as a special case, because adding
    // runtime or backend support for our generic irgen would be too expensive.
    if (isSpecialMixedSignMultiply(BuiltinID, LeftInfo, RightInfo, ResultInfo))
      return EmitCheckedMixedSignMultiply(*this, LeftArg, LeftInfo, RightArg,
                                          RightInfo, ResultArg, ResultQTy,
                                          ResultInfo);

    WidthAndSignedness EncompassingInfo =
        EncompassingIntegerType({LeftInfo, RightInfo, ResultInfo});

    llvm::Type *EncompassingLLVMTy =
        llvm::IntegerType::get(CGM.getLLVMContext(), EncompassingInfo.Width);

    llvm::Type *ResultLLVMTy = CGM.getTypes().ConvertType(ResultQTy);

    // Pick the signed or unsigned with.overflow intrinsic based on the
    // signedness of the encompassing type.
    llvm::Intrinsic::ID IntrinsicId;
    switch (BuiltinID) {
    default:
      llvm_unreachable("Unknown overflow builtin id.");
    case Builtin::BI__builtin_add_overflow:
      IntrinsicId = EncompassingInfo.Signed
                        ? llvm::Intrinsic::sadd_with_overflow
                        : llvm::Intrinsic::uadd_with_overflow;
      break;
    case Builtin::BI__builtin_sub_overflow:
      IntrinsicId = EncompassingInfo.Signed
                        ? llvm::Intrinsic::ssub_with_overflow
                        : llvm::Intrinsic::usub_with_overflow;
      break;
    case Builtin::BI__builtin_mul_overflow:
      IntrinsicId = EncompassingInfo.Signed
                        ? llvm::Intrinsic::smul_with_overflow
                        : llvm::Intrinsic::umul_with_overflow;
      break;
    }

    llvm::Value *Left = EmitScalarExpr(LeftArg);
    llvm::Value *Right = EmitScalarExpr(RightArg);
    Address ResultPtr = EmitPointerWithAlignment(ResultArg);

    // Extend each operand to the encompassing type.
    Left = Builder.CreateIntCast(Left, EncompassingLLVMTy, LeftInfo.Signed);
    Right = Builder.CreateIntCast(Right, EncompassingLLVMTy, RightInfo.Signed);

    // Perform the operation on the extended values.
    llvm::Value *Overflow, *Result;
    Result = EmitOverflowIntrinsic(*this, IntrinsicId, Left, Right, Overflow);

    if (EncompassingInfo.Width > ResultInfo.Width) {
      // The encompassing type is wider than the result type, so we need to
      // truncate it.
      llvm::Value *ResultTrunc = Builder.CreateTrunc(Result, ResultLLVMTy);

      // To see if the truncation caused an overflow, we will extend
      // the result and then compare it to the original result.
      llvm::Value *ResultTruncExt = Builder.CreateIntCast(
          ResultTrunc, EncompassingLLVMTy, ResultInfo.Signed);
      llvm::Value *TruncationOverflow =
          Builder.CreateICmpNE(Result, ResultTruncExt);

      Overflow = Builder.CreateOr(Overflow, TruncationOverflow);
      Result = ResultTrunc;
    }

    // Finally, store the result using the pointer.
    // Honor a volatile-qualified result pointee.
    bool isVolatile =
      ResultArg->getType()->getPointeeType().isVolatileQualified();
    Builder.CreateStore(EmitToMemory(Result, ResultQTy), ResultPtr, isVolatile);

    return RValue::get(Overflow);
  }
3065 
  case Builtin::BI__builtin_uadd_overflow:
  case Builtin::BI__builtin_uaddl_overflow:
  case Builtin::BI__builtin_uaddll_overflow:
  case Builtin::BI__builtin_usub_overflow:
  case Builtin::BI__builtin_usubl_overflow:
  case Builtin::BI__builtin_usubll_overflow:
  case Builtin::BI__builtin_umul_overflow:
  case Builtin::BI__builtin_umull_overflow:
  case Builtin::BI__builtin_umulll_overflow:
  case Builtin::BI__builtin_sadd_overflow:
  case Builtin::BI__builtin_saddl_overflow:
  case Builtin::BI__builtin_saddll_overflow:
  case Builtin::BI__builtin_ssub_overflow:
  case Builtin::BI__builtin_ssubl_overflow:
  case Builtin::BI__builtin_ssubll_overflow:
  case Builtin::BI__builtin_smul_overflow:
  case Builtin::BI__builtin_smull_overflow:
  case Builtin::BI__builtin_smulll_overflow: {

    // We translate all of these builtins directly to the relevant llvm IR node.
    // Unlike the generic __builtin_*_overflow forms above, these variants fix
    // the operand type in the builtin name, so no widening or truncation is
    // needed: sum goes through the out-pointer, the i1 carry is the result.

    // Scalarize our inputs.
    llvm::Value *X = EmitScalarExpr(E->getArg(0));
    llvm::Value *Y = EmitScalarExpr(E->getArg(1));
    Address SumOutPtr = EmitPointerWithAlignment(E->getArg(2));

    // Decide which of the overflow intrinsics we are lowering to:
    llvm::Intrinsic::ID IntrinsicId;
    switch (BuiltinID) {
    default: llvm_unreachable("Unknown overflow builtin id.");
    case Builtin::BI__builtin_uadd_overflow:
    case Builtin::BI__builtin_uaddl_overflow:
    case Builtin::BI__builtin_uaddll_overflow:
      IntrinsicId = llvm::Intrinsic::uadd_with_overflow;
      break;
    case Builtin::BI__builtin_usub_overflow:
    case Builtin::BI__builtin_usubl_overflow:
    case Builtin::BI__builtin_usubll_overflow:
      IntrinsicId = llvm::Intrinsic::usub_with_overflow;
      break;
    case Builtin::BI__builtin_umul_overflow:
    case Builtin::BI__builtin_umull_overflow:
    case Builtin::BI__builtin_umulll_overflow:
      IntrinsicId = llvm::Intrinsic::umul_with_overflow;
      break;
    case Builtin::BI__builtin_sadd_overflow:
    case Builtin::BI__builtin_saddl_overflow:
    case Builtin::BI__builtin_saddll_overflow:
      IntrinsicId = llvm::Intrinsic::sadd_with_overflow;
      break;
    case Builtin::BI__builtin_ssub_overflow:
    case Builtin::BI__builtin_ssubl_overflow:
    case Builtin::BI__builtin_ssubll_overflow:
      IntrinsicId = llvm::Intrinsic::ssub_with_overflow;
      break;
    case Builtin::BI__builtin_smul_overflow:
    case Builtin::BI__builtin_smull_overflow:
    case Builtin::BI__builtin_smulll_overflow:
      IntrinsicId = llvm::Intrinsic::smul_with_overflow;
      break;
    }


    llvm::Value *Carry;
    llvm::Value *Sum = EmitOverflowIntrinsic(*this, IntrinsicId, X, Y, Carry);
    Builder.CreateStore(Sum, SumOutPtr);

    return RValue::get(Carry);
  }
  case Builtin::BI__builtin_addressof:
    // Form the address of the argument lvalue directly.
    return RValue::get(EmitLValue(E->getArg(0)).getPointer());
  case Builtin::BI__builtin_operator_new:
    // IsDelete = false: emit a call to the usual allocation function.
    return EmitBuiltinNewDeleteCall(
        E->getCallee()->getType()->castAs<FunctionProtoType>(), E, false);
  case Builtin::BI__builtin_operator_delete:
    // IsDelete = true: emit a call to the usual deallocation function.
    return EmitBuiltinNewDeleteCall(
        E->getCallee()->getType()->castAs<FunctionProtoType>(), E, true);

  case Builtin::BI__noop:
    // __noop always evaluates to an integer literal zero.
    return RValue::get(ConstantInt::get(IntTy, 0));
  case Builtin::BI__builtin_call_with_static_chain: {
    // Emit the nested call (arg 0), passing arg 1 as the static chain value.
    const CallExpr *Call = cast<CallExpr>(E->getArg(0));
    const Expr *Chain = E->getArg(1);
    return EmitCall(Call->getCallee()->getType(),
                    EmitCallee(Call->getCallee()), Call, ReturnValue,
                    EmitScalarExpr(Chain));
  }
  case Builtin::BI_InterlockedExchange8:
  case Builtin::BI_InterlockedExchange16:
  case Builtin::BI_InterlockedExchange:
  case Builtin::BI_InterlockedExchangePointer:
    // MSVC interlocked exchange family: shared helper picks the width.
    return RValue::get(
        EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchange, E));
  case Builtin::BI_InterlockedCompareExchangePointer:
  case Builtin::BI_InterlockedCompareExchangePointer_nf: {
    // Pointer-sized compare-exchange: cast the destination to a pointer to a
    // pointer-width integer, convert the exchange/comparand pointers to that
    // integer type, and emit an integer cmpxchg.
    llvm::Type *RTy;
    llvm::IntegerType *IntType =
      IntegerType::get(getLLVMContext(),
                       getContext().getTypeSize(E->getType()));
    llvm::Type *IntPtrType = IntType->getPointerTo();

    llvm::Value *Destination =
      Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), IntPtrType);

    // Remember the original pointer type so the old value can be cast back.
    llvm::Value *Exchange = EmitScalarExpr(E->getArg(1));
    RTy = Exchange->getType();
    Exchange = Builder.CreatePtrToInt(Exchange, IntType);

    llvm::Value *Comparand =
      Builder.CreatePtrToInt(EmitScalarExpr(E->getArg(2)), IntType);

    // The "_nf" ("no fence") variant relaxes both orderings to monotonic;
    // the plain variant is sequentially consistent.
    auto Ordering =
      BuiltinID == Builtin::BI_InterlockedCompareExchangePointer_nf ?
      AtomicOrdering::Monotonic : AtomicOrdering::SequentiallyConsistent;

    auto Result = Builder.CreateAtomicCmpXchg(Destination, Comparand, Exchange,
                                              Ordering, Ordering);
    Result->setVolatile(true);

    // The builtin returns the previous value, cast back to its pointer type.
    return RValue::get(Builder.CreateIntToPtr(Builder.CreateExtractValue(Result,
                                                                         0),
                                              RTy));
  }
  case Builtin::BI_InterlockedCompareExchange8:
  case Builtin::BI_InterlockedCompareExchange16:
  case Builtin::BI_InterlockedCompareExchange:
  case Builtin::BI_InterlockedCompareExchange64:
    return RValue::get(EmitAtomicCmpXchgForMSIntrin(*this, E));
  case Builtin::BI_InterlockedIncrement16:
  case Builtin::BI_InterlockedIncrement:
    return RValue::get(
        EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedIncrement, E));
  case Builtin::BI_InterlockedDecrement16:
  case Builtin::BI_InterlockedDecrement:
    return RValue::get(
        EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedDecrement, E));
  case Builtin::BI_InterlockedAnd8:
  case Builtin::BI_InterlockedAnd16:
  case Builtin::BI_InterlockedAnd:
    return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedAnd, E));
  case Builtin::BI_InterlockedExchangeAdd8:
  case Builtin::BI_InterlockedExchangeAdd16:
  case Builtin::BI_InterlockedExchangeAdd:
    return RValue::get(
        EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeAdd, E));
  case Builtin::BI_InterlockedExchangeSub8:
  case Builtin::BI_InterlockedExchangeSub16:
  case Builtin::BI_InterlockedExchangeSub:
    return RValue::get(
        EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeSub, E));
  case Builtin::BI_InterlockedOr8:
  case Builtin::BI_InterlockedOr16:
  case Builtin::BI_InterlockedOr:
    return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedOr, E));
  case Builtin::BI_InterlockedXor8:
  case Builtin::BI_InterlockedXor16:
  case Builtin::BI_InterlockedXor:
    return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedXor, E));
3225 
  // MSVC bit-test intrinsics: plain, interlocked, and the
  // acquire/release/no-fence ordering variants all share one lowering helper.
  case Builtin::BI_bittest64:
  case Builtin::BI_bittest:
  case Builtin::BI_bittestandcomplement64:
  case Builtin::BI_bittestandcomplement:
  case Builtin::BI_bittestandreset64:
  case Builtin::BI_bittestandreset:
  case Builtin::BI_bittestandset64:
  case Builtin::BI_bittestandset:
  case Builtin::BI_interlockedbittestandreset:
  case Builtin::BI_interlockedbittestandreset64:
  case Builtin::BI_interlockedbittestandset64:
  case Builtin::BI_interlockedbittestandset:
  case Builtin::BI_interlockedbittestandset_acq:
  case Builtin::BI_interlockedbittestandset_rel:
  case Builtin::BI_interlockedbittestandset_nf:
  case Builtin::BI_interlockedbittestandreset_acq:
  case Builtin::BI_interlockedbittestandreset_rel:
  case Builtin::BI_interlockedbittestandreset_nf:
    return RValue::get(EmitBitTestIntrinsic(*this, BuiltinID, E));
3245 
  // SEH (structured exception handling) accessor builtins.
  case Builtin::BI__exception_code:
  case Builtin::BI_exception_code:
    return RValue::get(EmitSEHExceptionCode());
  case Builtin::BI__exception_info:
  case Builtin::BI_exception_info:
    return RValue::get(EmitSEHExceptionInfo());
  case Builtin::BI__abnormal_termination:
  case Builtin::BI_abnormal_termination:
    return RValue::get(EmitSEHAbnormalTermination());
  case Builtin::BI_setjmpex:
    // Only handled specially for the MSVC runtime; otherwise fall out of the
    // switch to the default lowering.
    if (getTarget().getTriple().isOSMSVCRT())
      return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmpex, E);
    break;
  case Builtin::BI_setjmp:
    if (getTarget().getTriple().isOSMSVCRT()) {
      // Pick the MSVCRT entry point by architecture: _setjmp3 on x86,
      // _setjmpex on AArch64, plain _setjmp elsewhere.
      if (getTarget().getTriple().getArch() == llvm::Triple::x86)
        return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmp3, E);
      else if (getTarget().getTriple().getArch() == llvm::Triple::aarch64)
        return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmpex, E);
      return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmp, E);
    }
    break;

  case Builtin::BI__GetExceptionInfo: {
    // Ask the C++ ABI for the throw info of the parameter's type; if there is
    // none, fall out of the switch to the default lowering.
    if (llvm::GlobalVariable *GV =
            CGM.getCXXABI().getThrowInfo(FD->getParamDecl(0)->getType()))
      return RValue::get(llvm::ConstantExpr::getBitCast(GV, CGM.Int8PtrTy));
    break;
  }

  case Builtin::BI__fastfail:
    return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::__fastfail, E));
3278 
  case Builtin::BI__builtin_coro_size: {
    // Call llvm.coro.size overloaded on an integer type matching the
    // target's size_t.
    auto & Context = getContext();
    auto SizeTy = Context.getSizeType();
    auto T = Builder.getIntNTy(Context.getTypeSize(SizeTy));
    Value *F = CGM.getIntrinsic(Intrinsic::coro_size, T);
    return RValue::get(Builder.CreateCall(F));
  }

  // The remaining coroutine builtins map one-to-one onto llvm.coro.*
  // intrinsics via a shared helper.
  case Builtin::BI__builtin_coro_id:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_id);
  case Builtin::BI__builtin_coro_promise:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_promise);
  case Builtin::BI__builtin_coro_resume:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_resume);
  case Builtin::BI__builtin_coro_frame:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_frame);
  case Builtin::BI__builtin_coro_noop:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_noop);
  case Builtin::BI__builtin_coro_free:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_free);
  case Builtin::BI__builtin_coro_destroy:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_destroy);
  case Builtin::BI__builtin_coro_done:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_done);
  case Builtin::BI__builtin_coro_alloc:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_alloc);
  case Builtin::BI__builtin_coro_begin:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_begin);
  case Builtin::BI__builtin_coro_end:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_end);
  case Builtin::BI__builtin_coro_suspend:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_suspend);
  case Builtin::BI__builtin_coro_param:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_param);
3313 
  // OpenCL v2.0 s6.13.16.2, Built-in pipe read and write functions
  case Builtin::BIread_pipe:
  case Builtin::BIwrite_pipe: {
    Value *Arg0 = EmitScalarExpr(E->getArg(0)),
          *Arg1 = EmitScalarExpr(E->getArg(1));
    CGOpenCLRuntime OpenCLRT(CGM);
    // Packet size/alignment are derived from the pipe's element type and
    // passed as trailing arguments to every pipe runtime call.
    Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
    Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));

    // Type of the generic packet parameter.
    unsigned GenericAS =
        getContext().getTargetAddressSpace(LangAS::opencl_generic);
    llvm::Type *I8PTy = llvm::PointerType::get(
        llvm::Type::getInt8Ty(getLLVMContext()), GenericAS);

    // Testing which overloaded version we should generate the call for.
    if (2U == E->getNumArgs()) {
      // 2-argument form: (pipe, packet pointer).
      const char *Name = (BuiltinID == Builtin::BIread_pipe) ? "__read_pipe_2"
                                                             : "__write_pipe_2";
      // Creating a generic function type to be able to call with any builtin or
      // user defined type.
      llvm::Type *ArgTys[] = {Arg0->getType(), I8PTy, Int32Ty, Int32Ty};
      llvm::FunctionType *FTy = llvm::FunctionType::get(
          Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
      Value *BCast = Builder.CreatePointerCast(Arg1, I8PTy);
      return RValue::get(
          Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name),
                             {Arg0, BCast, PacketSize, PacketAlign}));
    } else {
      // 4-argument (reservation) form: (pipe, reserve_id, index, packet ptr).
      assert(4 == E->getNumArgs() &&
             "Illegal number of parameters to pipe function");
      const char *Name = (BuiltinID == Builtin::BIread_pipe) ? "__read_pipe_4"
                                                             : "__write_pipe_4";

      llvm::Type *ArgTys[] = {Arg0->getType(), Arg1->getType(), Int32Ty, I8PTy,
                              Int32Ty, Int32Ty};
      Value *Arg2 = EmitScalarExpr(E->getArg(2)),
            *Arg3 = EmitScalarExpr(E->getArg(3));
      llvm::FunctionType *FTy = llvm::FunctionType::get(
          Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
      Value *BCast = Builder.CreatePointerCast(Arg3, I8PTy);
      // We know the third argument is an integer type, but we may need to cast
      // it to i32.
      if (Arg2->getType() != Int32Ty)
        Arg2 = Builder.CreateZExtOrTrunc(Arg2, Int32Ty);
      return RValue::get(Builder.CreateCall(
          CGM.CreateRuntimeFunction(FTy, Name),
          {Arg0, Arg1, Arg2, BCast, PacketSize, PacketAlign}));
    }
  }
  // OpenCL v2.0 s6.13.16 ,s9.17.3.5 - Built-in pipe reserve read and write
  // functions
  case Builtin::BIreserve_read_pipe:
  case Builtin::BIreserve_write_pipe:
  case Builtin::BIwork_group_reserve_read_pipe:
  case Builtin::BIwork_group_reserve_write_pipe:
  case Builtin::BIsub_group_reserve_read_pipe:
  case Builtin::BIsub_group_reserve_write_pipe: {
    // Composing the mangled name for the function.
    const char *Name;
    if (BuiltinID == Builtin::BIreserve_read_pipe)
      Name = "__reserve_read_pipe";
    else if (BuiltinID == Builtin::BIreserve_write_pipe)
      Name = "__reserve_write_pipe";
    else if (BuiltinID == Builtin::BIwork_group_reserve_read_pipe)
      Name = "__work_group_reserve_read_pipe";
    else if (BuiltinID == Builtin::BIwork_group_reserve_write_pipe)
      Name = "__work_group_reserve_write_pipe";
    else if (BuiltinID == Builtin::BIsub_group_reserve_read_pipe)
      Name = "__sub_group_reserve_read_pipe";
    else
      Name = "__sub_group_reserve_write_pipe";

    Value *Arg0 = EmitScalarExpr(E->getArg(0)),
          *Arg1 = EmitScalarExpr(E->getArg(1));
    // The runtime call returns a reservation id.
    llvm::Type *ReservedIDTy = ConvertType(getContext().OCLReserveIDTy);
    CGOpenCLRuntime OpenCLRT(CGM);
    Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
    Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));

    // Building the generic function prototype.
    llvm::Type *ArgTys[] = {Arg0->getType(), Int32Ty, Int32Ty, Int32Ty};
    llvm::FunctionType *FTy = llvm::FunctionType::get(
        ReservedIDTy, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
    // We know the second argument is an integer type, but we may need to cast
    // it to i32.
    if (Arg1->getType() != Int32Ty)
      Arg1 = Builder.CreateZExtOrTrunc(Arg1, Int32Ty);
    return RValue::get(
        Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name),
                           {Arg0, Arg1, PacketSize, PacketAlign}));
  }
  // OpenCL v2.0 s6.13.16, s9.17.3.5 - Built-in pipe commit read and write
  // functions
  case Builtin::BIcommit_read_pipe:
  case Builtin::BIcommit_write_pipe:
  case Builtin::BIwork_group_commit_read_pipe:
  case Builtin::BIwork_group_commit_write_pipe:
  case Builtin::BIsub_group_commit_read_pipe:
  case Builtin::BIsub_group_commit_write_pipe: {
    // Composing the mangled name for the runtime function.
    const char *Name;
    if (BuiltinID == Builtin::BIcommit_read_pipe)
      Name = "__commit_read_pipe";
    else if (BuiltinID == Builtin::BIcommit_write_pipe)
      Name = "__commit_write_pipe";
    else if (BuiltinID == Builtin::BIwork_group_commit_read_pipe)
      Name = "__work_group_commit_read_pipe";
    else if (BuiltinID == Builtin::BIwork_group_commit_write_pipe)
      Name = "__work_group_commit_write_pipe";
    else if (BuiltinID == Builtin::BIsub_group_commit_read_pipe)
      Name = "__sub_group_commit_read_pipe";
    else
      Name = "__sub_group_commit_write_pipe";

    Value *Arg0 = EmitScalarExpr(E->getArg(0)),
          *Arg1 = EmitScalarExpr(E->getArg(1));
    CGOpenCLRuntime OpenCLRT(CGM);
    Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
    Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));

    // Building the generic function prototype.  Commit returns void.
    llvm::Type *ArgTys[] = {Arg0->getType(), Arg1->getType(), Int32Ty, Int32Ty};
    llvm::FunctionType *FTy =
        llvm::FunctionType::get(llvm::Type::getVoidTy(getLLVMContext()),
                                llvm::ArrayRef<llvm::Type *>(ArgTys), false);

    return RValue::get(
        Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name),
                           {Arg0, Arg1, PacketSize, PacketAlign}));
  }
  // OpenCL v2.0 s6.13.16.4 Built-in pipe query functions
  case Builtin::BIget_pipe_num_packets:
  case Builtin::BIget_pipe_max_packets: {
    const char *BaseName;
    const PipeType *PipeTy = E->getArg(0)->getType()->getAs<PipeType>();
    if (BuiltinID == Builtin::BIget_pipe_num_packets)
      BaseName = "__get_pipe_num_packets";
    else
      BaseName = "__get_pipe_max_packets";
    // The runtime function name also encodes the pipe's access qualifier
    // (read-only vs. write-only).
    auto Name = std::string(BaseName) +
                std::string(PipeTy->isReadOnly() ? "_ro" : "_wo");

    // Building the generic function prototype.
    Value *Arg0 = EmitScalarExpr(E->getArg(0));
    CGOpenCLRuntime OpenCLRT(CGM);
    Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
    Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));
    llvm::Type *ArgTys[] = {Arg0->getType(), Int32Ty, Int32Ty};
    llvm::FunctionType *FTy = llvm::FunctionType::get(
        Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);

    return RValue::get(Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name),
                                          {Arg0, PacketSize, PacketAlign}));
  }
3468 
  // OpenCL v2.0 s6.13.9 - Address space qualifier functions.
  case Builtin::BIto_global:
  case Builtin::BIto_local:
  case Builtin::BIto_private: {
    auto Arg0 = EmitScalarExpr(E->getArg(0));
    // The runtime functions take a generic i8* and return an i8* in the
    // builtin's result address space.
    auto NewArgT = llvm::PointerType::get(Int8Ty,
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_generic));
    auto NewRetT = llvm::PointerType::get(Int8Ty,
      CGM.getContext().getTargetAddressSpace(
        E->getType()->getPointeeType().getAddressSpace()));
    auto FTy = llvm::FunctionType::get(NewRetT, {NewArgT}, false);
    llvm::Value *NewArg;
    // An addrspacecast is required only when the argument isn't already in
    // the generic address space; otherwise a plain pointer cast suffices.
    if (Arg0->getType()->getPointerAddressSpace() !=
        NewArgT->getPointerAddressSpace())
      NewArg = Builder.CreateAddrSpaceCast(Arg0, NewArgT);
    else
      NewArg = Builder.CreateBitOrPointerCast(Arg0, NewArgT);
    // e.g. to_global -> __to_global.
    auto NewName = std::string("__") + E->getDirectCallee()->getName().str();
    auto NewCall =
        Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, NewName), {NewArg});
    // Cast the i8* result back to the builtin's declared return type.
    return RValue::get(Builder.CreateBitOrPointerCast(NewCall,
      ConvertType(E->getType())));
  }
3492 
3493   // OpenCL v2.0, s6.13.17 - Enqueue kernel function.
3494   // It contains four different overload formats specified in Table 6.13.17.1.
3495   case Builtin::BIenqueue_kernel: {
3496     StringRef Name; // Generated function call name
3497     unsigned NumArgs = E->getNumArgs();
3498 
3499     llvm::Type *QueueTy = ConvertType(getContext().OCLQueueTy);
3500     llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy(
3501         getContext().getTargetAddressSpace(LangAS::opencl_generic));
3502 
3503     llvm::Value *Queue = EmitScalarExpr(E->getArg(0));
3504     llvm::Value *Flags = EmitScalarExpr(E->getArg(1));
3505     LValue NDRangeL = EmitAggExprToLValue(E->getArg(2));
3506     llvm::Value *Range = NDRangeL.getAddress().getPointer();
3507     llvm::Type *RangeTy = NDRangeL.getAddress().getType();
3508 
3509     if (NumArgs == 4) {
3510       // The most basic form of the call with parameters:
3511       // queue_t, kernel_enqueue_flags_t, ndrange_t, block(void)
3512       Name = "__enqueue_kernel_basic";
3513       llvm::Type *ArgTys[] = {QueueTy, Int32Ty, RangeTy, GenericVoidPtrTy,
3514                               GenericVoidPtrTy};
3515       llvm::FunctionType *FTy = llvm::FunctionType::get(
3516           Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
3517 
3518       auto Info =
3519           CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(3));
3520       llvm::Value *Kernel =
3521           Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy);
3522       llvm::Value *Block =
3523           Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
3524 
3525       AttrBuilder B;
3526       B.addAttribute(Attribute::ByVal);
3527       llvm::AttributeList ByValAttrSet =
3528           llvm::AttributeList::get(CGM.getModule().getContext(), 3U, B);
3529 
3530       auto RTCall =
3531           Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name, ByValAttrSet),
3532                              {Queue, Flags, Range, Kernel, Block});
3533       RTCall->setAttributes(ByValAttrSet);
3534       return RValue::get(RTCall);
3535     }
3536     assert(NumArgs >= 5 && "Invalid enqueue_kernel signature");
3537 
3538     // Create a temporary array to hold the sizes of local pointer arguments
3539     // for the block. \p First is the position of the first size argument.
3540     auto CreateArrayForSizeVar = [=](unsigned First)
3541         -> std::tuple<llvm::Value *, llvm::Value *, llvm::Value *> {
3542       llvm::APInt ArraySize(32, NumArgs - First);
3543       QualType SizeArrayTy = getContext().getConstantArrayType(
3544           getContext().getSizeType(), ArraySize, ArrayType::Normal,
3545           /*IndexTypeQuals=*/0);
3546       auto Tmp = CreateMemTemp(SizeArrayTy, "block_sizes");
3547       llvm::Value *TmpPtr = Tmp.getPointer();
3548       llvm::Value *TmpSize = EmitLifetimeStart(
3549           CGM.getDataLayout().getTypeAllocSize(Tmp.getElementType()), TmpPtr);
3550       llvm::Value *ElemPtr;
3551       // Each of the following arguments specifies the size of the corresponding
3552       // argument passed to the enqueued block.
3553       auto *Zero = llvm::ConstantInt::get(IntTy, 0);
3554       for (unsigned I = First; I < NumArgs; ++I) {
3555         auto *Index = llvm::ConstantInt::get(IntTy, I - First);
3556         auto *GEP = Builder.CreateGEP(TmpPtr, {Zero, Index});
3557         if (I == First)
3558           ElemPtr = GEP;
3559         auto *V =
3560             Builder.CreateZExtOrTrunc(EmitScalarExpr(E->getArg(I)), SizeTy);
3561         Builder.CreateAlignedStore(
3562             V, GEP, CGM.getDataLayout().getPrefTypeAlignment(SizeTy));
3563       }
3564       return std::tie(ElemPtr, TmpSize, TmpPtr);
3565     };
3566 
3567     // Could have events and/or varargs.
3568     if (E->getArg(3)->getType()->isBlockPointerType()) {
3569       // No events passed, but has variadic arguments.
3570       Name = "__enqueue_kernel_varargs";
3571       auto Info =
3572           CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(3));
3573       llvm::Value *Kernel =
3574           Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy);
3575       auto *Block = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
3576       llvm::Value *ElemPtr, *TmpSize, *TmpPtr;
3577       std::tie(ElemPtr, TmpSize, TmpPtr) = CreateArrayForSizeVar(4);
3578 
3579       // Create a vector of the arguments, as well as a constant value to
3580       // express to the runtime the number of variadic arguments.
3581       std::vector<llvm::Value *> Args = {
3582           Queue,  Flags, Range,
3583           Kernel, Block, ConstantInt::get(IntTy, NumArgs - 4),
3584           ElemPtr};
3585       std::vector<llvm::Type *> ArgTys = {
3586           QueueTy,          IntTy, RangeTy,           GenericVoidPtrTy,
3587           GenericVoidPtrTy, IntTy, ElemPtr->getType()};
3588 
3589       llvm::FunctionType *FTy = llvm::FunctionType::get(
3590           Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
3591       auto Call =
3592           RValue::get(Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name),
3593                                          llvm::ArrayRef<llvm::Value *>(Args)));
3594       if (TmpSize)
3595         EmitLifetimeEnd(TmpSize, TmpPtr);
3596       return Call;
3597     }
3598     // Any calls now have event arguments passed.
3599     if (NumArgs >= 7) {
3600       llvm::Type *EventTy = ConvertType(getContext().OCLClkEventTy);
3601       llvm::Type *EventPtrTy = EventTy->getPointerTo(
3602           CGM.getContext().getTargetAddressSpace(LangAS::opencl_generic));
3603 
3604       llvm::Value *NumEvents =
3605           Builder.CreateZExtOrTrunc(EmitScalarExpr(E->getArg(3)), Int32Ty);
3606       llvm::Value *EventList =
3607           E->getArg(4)->getType()->isArrayType()
3608               ? EmitArrayToPointerDecay(E->getArg(4)).getPointer()
3609               : EmitScalarExpr(E->getArg(4));
3610       llvm::Value *ClkEvent = EmitScalarExpr(E->getArg(5));
3611       // Convert to generic address space.
3612       EventList = Builder.CreatePointerCast(EventList, EventPtrTy);
3613       ClkEvent = ClkEvent->getType()->isIntegerTy()
3614                    ? Builder.CreateBitOrPointerCast(ClkEvent, EventPtrTy)
3615                    : Builder.CreatePointerCast(ClkEvent, EventPtrTy);
3616       auto Info =
3617           CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(6));
3618       llvm::Value *Kernel =
3619           Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy);
3620       llvm::Value *Block =
3621           Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
3622 
3623       std::vector<llvm::Type *> ArgTys = {
3624           QueueTy,    Int32Ty,    RangeTy,          Int32Ty,
3625           EventPtrTy, EventPtrTy, GenericVoidPtrTy, GenericVoidPtrTy};
3626 
3627       std::vector<llvm::Value *> Args = {Queue,     Flags,    Range,  NumEvents,
3628                                          EventList, ClkEvent, Kernel, Block};
3629 
3630       if (NumArgs == 7) {
3631         // Has events but no variadics.
3632         Name = "__enqueue_kernel_basic_events";
3633         llvm::FunctionType *FTy = llvm::FunctionType::get(
3634             Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
3635         return RValue::get(
3636             Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name),
3637                                llvm::ArrayRef<llvm::Value *>(Args)));
3638       }
3639       // Has event info and variadics
3640       // Pass the number of variadics to the runtime function too.
3641       Args.push_back(ConstantInt::get(Int32Ty, NumArgs - 7));
3642       ArgTys.push_back(Int32Ty);
3643       Name = "__enqueue_kernel_events_varargs";
3644 
3645       llvm::Value *ElemPtr, *TmpSize, *TmpPtr;
3646       std::tie(ElemPtr, TmpSize, TmpPtr) = CreateArrayForSizeVar(7);
3647       Args.push_back(ElemPtr);
3648       ArgTys.push_back(ElemPtr->getType());
3649 
3650       llvm::FunctionType *FTy = llvm::FunctionType::get(
3651           Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
3652       auto Call =
3653           RValue::get(Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name),
3654                                          llvm::ArrayRef<llvm::Value *>(Args)));
3655       if (TmpSize)
3656         EmitLifetimeEnd(TmpSize, TmpPtr);
3657       return Call;
3658     }
3659     LLVM_FALLTHROUGH;
3660   }
3661   // OpenCL v2.0 s6.13.17.6 - Kernel query functions need bitcast of block
3662   // parameter.
3663   case Builtin::BIget_kernel_work_group_size: {
3664     llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy(
3665         getContext().getTargetAddressSpace(LangAS::opencl_generic));
3666     auto Info =
3667         CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(0));
3668     Value *Kernel = Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy);
3669     Value *Arg = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
3670     return RValue::get(Builder.CreateCall(
3671         CGM.CreateRuntimeFunction(
3672             llvm::FunctionType::get(IntTy, {GenericVoidPtrTy, GenericVoidPtrTy},
3673                                     false),
3674             "__get_kernel_work_group_size_impl"),
3675         {Kernel, Arg}));
3676   }
3677   case Builtin::BIget_kernel_preferred_work_group_size_multiple: {
3678     llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy(
3679         getContext().getTargetAddressSpace(LangAS::opencl_generic));
3680     auto Info =
3681         CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(0));
3682     Value *Kernel = Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy);
3683     Value *Arg = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
3684     return RValue::get(Builder.CreateCall(
3685         CGM.CreateRuntimeFunction(
3686             llvm::FunctionType::get(IntTy, {GenericVoidPtrTy, GenericVoidPtrTy},
3687                                     false),
3688             "__get_kernel_preferred_work_group_size_multiple_impl"),
3689         {Kernel, Arg}));
3690   }
3691   case Builtin::BIget_kernel_max_sub_group_size_for_ndrange:
3692   case Builtin::BIget_kernel_sub_group_count_for_ndrange: {
3693     llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy(
3694         getContext().getTargetAddressSpace(LangAS::opencl_generic));
3695     LValue NDRangeL = EmitAggExprToLValue(E->getArg(0));
3696     llvm::Value *NDRange = NDRangeL.getAddress().getPointer();
3697     auto Info =
3698         CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(1));
3699     Value *Kernel = Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy);
3700     Value *Block = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
3701     const char *Name =
3702         BuiltinID == Builtin::BIget_kernel_max_sub_group_size_for_ndrange
3703             ? "__get_kernel_max_sub_group_size_for_ndrange_impl"
3704             : "__get_kernel_sub_group_count_for_ndrange_impl";
3705     return RValue::get(Builder.CreateCall(
3706         CGM.CreateRuntimeFunction(
3707             llvm::FunctionType::get(
3708                 IntTy, {NDRange->getType(), GenericVoidPtrTy, GenericVoidPtrTy},
3709                 false),
3710             Name),
3711         {NDRange, Kernel, Block}));
3712   }
3713 
3714   case Builtin::BI__builtin_store_half:
3715   case Builtin::BI__builtin_store_halff: {
3716     Value *Val = EmitScalarExpr(E->getArg(0));
3717     Address Address = EmitPointerWithAlignment(E->getArg(1));
3718     Value *HalfVal = Builder.CreateFPTrunc(Val, Builder.getHalfTy());
3719     return RValue::get(Builder.CreateStore(HalfVal, Address));
3720   }
3721   case Builtin::BI__builtin_load_half: {
3722     Address Address = EmitPointerWithAlignment(E->getArg(0));
3723     Value *HalfVal = Builder.CreateLoad(Address);
3724     return RValue::get(Builder.CreateFPExt(HalfVal, Builder.getDoubleTy()));
3725   }
3726   case Builtin::BI__builtin_load_halff: {
3727     Address Address = EmitPointerWithAlignment(E->getArg(0));
3728     Value *HalfVal = Builder.CreateLoad(Address);
3729     return RValue::get(Builder.CreateFPExt(HalfVal, Builder.getFloatTy()));
3730   }
3731   case Builtin::BIprintf:
3732     if (getTarget().getTriple().isNVPTX())
3733       return EmitNVPTXDevicePrintfCallExpr(E, ReturnValue);
3734     break;
3735   case Builtin::BI__builtin_canonicalize:
3736   case Builtin::BI__builtin_canonicalizef:
3737   case Builtin::BI__builtin_canonicalizel:
3738     return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::canonicalize));
3739 
3740   case Builtin::BI__builtin_thread_pointer: {
3741     if (!getContext().getTargetInfo().isTLSSupported())
3742       CGM.ErrorUnsupported(E, "__builtin_thread_pointer");
3743     // Fall through - it's already mapped to the intrinsic by GCCBuiltin.
3744     break;
3745   }
3746   case Builtin::BI__builtin_os_log_format:
3747     return emitBuiltinOSLogFormat(*E);
3748 
3749   case Builtin::BI__xray_customevent: {
3750     if (!ShouldXRayInstrumentFunction())
3751       return RValue::getIgnored();
3752 
3753     if (!CGM.getCodeGenOpts().XRayInstrumentationBundle.has(
3754             XRayInstrKind::Custom))
3755       return RValue::getIgnored();
3756 
3757     if (const auto *XRayAttr = CurFuncDecl->getAttr<XRayInstrumentAttr>())
3758       if (XRayAttr->neverXRayInstrument() && !AlwaysEmitXRayCustomEvents())
3759         return RValue::getIgnored();
3760 
3761     Function *F = CGM.getIntrinsic(Intrinsic::xray_customevent);
3762     auto FTy = F->getFunctionType();
3763     auto Arg0 = E->getArg(0);
3764     auto Arg0Val = EmitScalarExpr(Arg0);
3765     auto Arg0Ty = Arg0->getType();
3766     auto PTy0 = FTy->getParamType(0);
3767     if (PTy0 != Arg0Val->getType()) {
3768       if (Arg0Ty->isArrayType())
3769         Arg0Val = EmitArrayToPointerDecay(Arg0).getPointer();
3770       else
3771         Arg0Val = Builder.CreatePointerCast(Arg0Val, PTy0);
3772     }
3773     auto Arg1 = EmitScalarExpr(E->getArg(1));
3774     auto PTy1 = FTy->getParamType(1);
3775     if (PTy1 != Arg1->getType())
3776       Arg1 = Builder.CreateTruncOrBitCast(Arg1, PTy1);
3777     return RValue::get(Builder.CreateCall(F, {Arg0Val, Arg1}));
3778   }
3779 
3780   case Builtin::BI__xray_typedevent: {
3781     // TODO: There should be a way to always emit events even if the current
3782     // function is not instrumented. Losing events in a stream can cripple
3783     // a trace.
3784     if (!ShouldXRayInstrumentFunction())
3785       return RValue::getIgnored();
3786 
3787     if (!CGM.getCodeGenOpts().XRayInstrumentationBundle.has(
3788             XRayInstrKind::Typed))
3789       return RValue::getIgnored();
3790 
3791     if (const auto *XRayAttr = CurFuncDecl->getAttr<XRayInstrumentAttr>())
3792       if (XRayAttr->neverXRayInstrument() && !AlwaysEmitXRayTypedEvents())
3793         return RValue::getIgnored();
3794 
3795     Function *F = CGM.getIntrinsic(Intrinsic::xray_typedevent);
3796     auto FTy = F->getFunctionType();
3797     auto Arg0 = EmitScalarExpr(E->getArg(0));
3798     auto PTy0 = FTy->getParamType(0);
3799     if (PTy0 != Arg0->getType())
3800       Arg0 = Builder.CreateTruncOrBitCast(Arg0, PTy0);
3801     auto Arg1 = E->getArg(1);
3802     auto Arg1Val = EmitScalarExpr(Arg1);
3803     auto Arg1Ty = Arg1->getType();
3804     auto PTy1 = FTy->getParamType(1);
3805     if (PTy1 != Arg1Val->getType()) {
3806       if (Arg1Ty->isArrayType())
3807         Arg1Val = EmitArrayToPointerDecay(Arg1).getPointer();
3808       else
3809         Arg1Val = Builder.CreatePointerCast(Arg1Val, PTy1);
3810     }
3811     auto Arg2 = EmitScalarExpr(E->getArg(2));
3812     auto PTy2 = FTy->getParamType(2);
3813     if (PTy2 != Arg2->getType())
3814       Arg2 = Builder.CreateTruncOrBitCast(Arg2, PTy2);
3815     return RValue::get(Builder.CreateCall(F, {Arg0, Arg1Val, Arg2}));
3816   }
3817 
3818   case Builtin::BI__builtin_ms_va_start:
3819   case Builtin::BI__builtin_ms_va_end:
3820     return RValue::get(
3821         EmitVAStartEnd(EmitMSVAListRef(E->getArg(0)).getPointer(),
3822                        BuiltinID == Builtin::BI__builtin_ms_va_start));
3823 
3824   case Builtin::BI__builtin_ms_va_copy: {
3825     // Lower this manually. We can't reliably determine whether or not any
3826     // given va_copy() is for a Win64 va_list from the calling convention
3827     // alone, because it's legal to do this from a System V ABI function.
3828     // With opaque pointer types, we won't have enough information in LLVM
3829     // IR to determine this from the argument types, either. Best to do it
3830     // now, while we have enough information.
3831     Address DestAddr = EmitMSVAListRef(E->getArg(0));
3832     Address SrcAddr = EmitMSVAListRef(E->getArg(1));
3833 
3834     llvm::Type *BPP = Int8PtrPtrTy;
3835 
3836     DestAddr = Address(Builder.CreateBitCast(DestAddr.getPointer(), BPP, "cp"),
3837                        DestAddr.getAlignment());
3838     SrcAddr = Address(Builder.CreateBitCast(SrcAddr.getPointer(), BPP, "ap"),
3839                       SrcAddr.getAlignment());
3840 
3841     Value *ArgPtr = Builder.CreateLoad(SrcAddr, "ap.val");
3842     return RValue::get(Builder.CreateStore(ArgPtr, DestAddr));
3843   }
3844   }
3845 
3846   // If this is an alias for a lib function (e.g. __builtin_sin), emit
3847   // the call using the normal call path, but using the unmangled
3848   // version of the function name.
3849   if (getContext().BuiltinInfo.isLibFunction(BuiltinID))
3850     return emitLibraryCall(*this, FD, E,
3851                            CGM.getBuiltinLibFunction(FD, BuiltinID));
3852 
3853   // If this is a predefined lib function (e.g. malloc), emit the call
3854   // using exactly the normal call path.
3855   if (getContext().BuiltinInfo.isPredefinedLibFunction(BuiltinID))
3856     return emitLibraryCall(*this, FD, E,
3857                       cast<llvm::Constant>(EmitScalarExpr(E->getCallee())));
3858 
3859   // Check that a call to a target specific builtin has the correct target
3860   // features.
3861   // This is down here to avoid non-target specific builtins, however, if
3862   // generic builtins start to require generic target features then we
3863   // can move this up to the beginning of the function.
3864   checkTargetFeatures(E, FD);
3865 
3866   if (unsigned VectorWidth = getContext().BuiltinInfo.getRequiredVectorWidth(BuiltinID))
3867     LargestVectorWidth = std::max(LargestVectorWidth, VectorWidth);
3868 
3869   // See if we have a target specific intrinsic.
3870   const char *Name = getContext().BuiltinInfo.getName(BuiltinID);
3871   Intrinsic::ID IntrinsicID = Intrinsic::not_intrinsic;
3872   StringRef Prefix =
3873       llvm::Triple::getArchTypePrefix(getTarget().getTriple().getArch());
3874   if (!Prefix.empty()) {
3875     IntrinsicID = Intrinsic::getIntrinsicForGCCBuiltin(Prefix.data(), Name);
3876     // NOTE we don't need to perform a compatibility flag check here since the
3877     // intrinsics are declared in Builtins*.def via LANGBUILTIN which filter the
3878     // MS builtins via ALL_MS_LANGUAGES and are filtered earlier.
3879     if (IntrinsicID == Intrinsic::not_intrinsic)
3880       IntrinsicID = Intrinsic::getIntrinsicForMSBuiltin(Prefix.data(), Name);
3881   }
3882 
3883   if (IntrinsicID != Intrinsic::not_intrinsic) {
3884     SmallVector<Value*, 16> Args;
3885 
3886     // Find out if any arguments are required to be integer constant
3887     // expressions.
3888     unsigned ICEArguments = 0;
3889     ASTContext::GetBuiltinTypeError Error;
3890     getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
3891     assert(Error == ASTContext::GE_None && "Should not codegen an error");
3892 
3893     Function *F = CGM.getIntrinsic(IntrinsicID);
3894     llvm::FunctionType *FTy = F->getFunctionType();
3895 
3896     for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) {
3897       Value *ArgValue;
3898       // If this is a normal argument, just emit it as a scalar.
3899       if ((ICEArguments & (1 << i)) == 0) {
3900         ArgValue = EmitScalarExpr(E->getArg(i));
3901       } else {
3902         // If this is required to be a constant, constant fold it so that we
3903         // know that the generated intrinsic gets a ConstantInt.
3904         llvm::APSInt Result;
3905         bool IsConst = E->getArg(i)->isIntegerConstantExpr(Result,getContext());
3906         assert(IsConst && "Constant arg isn't actually constant?");
3907         (void)IsConst;
3908         ArgValue = llvm::ConstantInt::get(getLLVMContext(), Result);
3909       }
3910 
3911       // If the intrinsic arg type is different from the builtin arg type
3912       // we need to do a bit cast.
3913       llvm::Type *PTy = FTy->getParamType(i);
3914       if (PTy != ArgValue->getType()) {
3915         // XXX - vector of pointers?
3916         if (auto *PtrTy = dyn_cast<llvm::PointerType>(PTy)) {
3917           if (PtrTy->getAddressSpace() !=
3918               ArgValue->getType()->getPointerAddressSpace()) {
3919             ArgValue = Builder.CreateAddrSpaceCast(
3920               ArgValue,
3921               ArgValue->getType()->getPointerTo(PtrTy->getAddressSpace()));
3922           }
3923         }
3924 
3925         assert(PTy->canLosslesslyBitCastTo(FTy->getParamType(i)) &&
3926                "Must be able to losslessly bit cast to param");
3927         ArgValue = Builder.CreateBitCast(ArgValue, PTy);
3928       }
3929 
3930       Args.push_back(ArgValue);
3931     }
3932 
3933     Value *V = Builder.CreateCall(F, Args);
3934     QualType BuiltinRetType = E->getType();
3935 
3936     llvm::Type *RetTy = VoidTy;
3937     if (!BuiltinRetType->isVoidType())
3938       RetTy = ConvertType(BuiltinRetType);
3939 
3940     if (RetTy != V->getType()) {
3941       // XXX - vector of pointers?
3942       if (auto *PtrTy = dyn_cast<llvm::PointerType>(RetTy)) {
3943         if (PtrTy->getAddressSpace() != V->getType()->getPointerAddressSpace()) {
3944           V = Builder.CreateAddrSpaceCast(
3945             V, V->getType()->getPointerTo(PtrTy->getAddressSpace()));
3946         }
3947       }
3948 
3949       assert(V->getType()->canLosslesslyBitCastTo(RetTy) &&
3950              "Must be able to losslessly bit cast result type");
3951       V = Builder.CreateBitCast(V, RetTy);
3952     }
3953 
3954     return RValue::get(V);
3955   }
3956 
3957   // See if we have a target specific builtin that needs to be lowered.
3958   if (Value *V = EmitTargetBuiltinExpr(BuiltinID, E))
3959     return RValue::get(V);
3960 
3961   ErrorUnsupported(E, "builtin function");
3962 
3963   // Unknown builtin, for now just dump it out and return undef.
3964   return GetUndefRValue(E->getType());
3965 }
3966 
3967 static Value *EmitTargetArchBuiltinExpr(CodeGenFunction *CGF,
3968                                         unsigned BuiltinID, const CallExpr *E,
3969                                         llvm::Triple::ArchType Arch) {
3970   switch (Arch) {
3971   case llvm::Triple::arm:
3972   case llvm::Triple::armeb:
3973   case llvm::Triple::thumb:
3974   case llvm::Triple::thumbeb:
3975     return CGF->EmitARMBuiltinExpr(BuiltinID, E, Arch);
3976   case llvm::Triple::aarch64:
3977   case llvm::Triple::aarch64_be:
3978     return CGF->EmitAArch64BuiltinExpr(BuiltinID, E, Arch);
3979   case llvm::Triple::x86:
3980   case llvm::Triple::x86_64:
3981     return CGF->EmitX86BuiltinExpr(BuiltinID, E);
3982   case llvm::Triple::ppc:
3983   case llvm::Triple::ppc64:
3984   case llvm::Triple::ppc64le:
3985     return CGF->EmitPPCBuiltinExpr(BuiltinID, E);
3986   case llvm::Triple::r600:
3987   case llvm::Triple::amdgcn:
3988     return CGF->EmitAMDGPUBuiltinExpr(BuiltinID, E);
3989   case llvm::Triple::systemz:
3990     return CGF->EmitSystemZBuiltinExpr(BuiltinID, E);
3991   case llvm::Triple::nvptx:
3992   case llvm::Triple::nvptx64:
3993     return CGF->EmitNVPTXBuiltinExpr(BuiltinID, E);
3994   case llvm::Triple::wasm32:
3995   case llvm::Triple::wasm64:
3996     return CGF->EmitWebAssemblyBuiltinExpr(BuiltinID, E);
3997   case llvm::Triple::hexagon:
3998     return CGF->EmitHexagonBuiltinExpr(BuiltinID, E);
3999   default:
4000     return nullptr;
4001   }
4002 }
4003 
4004 Value *CodeGenFunction::EmitTargetBuiltinExpr(unsigned BuiltinID,
4005                                               const CallExpr *E) {
4006   if (getContext().BuiltinInfo.isAuxBuiltinID(BuiltinID)) {
4007     assert(getContext().getAuxTargetInfo() && "Missing aux target info");
4008     return EmitTargetArchBuiltinExpr(
4009         this, getContext().BuiltinInfo.getAuxBuiltinID(BuiltinID), E,
4010         getContext().getAuxTargetInfo()->getTriple().getArch());
4011   }
4012 
4013   return EmitTargetArchBuiltinExpr(this, BuiltinID, E,
4014                                    getTarget().getTriple().getArch());
4015 }
4016 
4017 static llvm::VectorType *GetNeonType(CodeGenFunction *CGF,
4018                                      NeonTypeFlags TypeFlags,
4019                                      bool HasLegalHalfType=true,
4020                                      bool V1Ty=false) {
4021   int IsQuad = TypeFlags.isQuad();
4022   switch (TypeFlags.getEltType()) {
4023   case NeonTypeFlags::Int8:
4024   case NeonTypeFlags::Poly8:
4025     return llvm::VectorType::get(CGF->Int8Ty, V1Ty ? 1 : (8 << IsQuad));
4026   case NeonTypeFlags::Int16:
4027   case NeonTypeFlags::Poly16:
4028     return llvm::VectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
4029   case NeonTypeFlags::Float16:
4030     if (HasLegalHalfType)
4031       return llvm::VectorType::get(CGF->HalfTy, V1Ty ? 1 : (4 << IsQuad));
4032     else
4033       return llvm::VectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
4034   case NeonTypeFlags::Int32:
4035     return llvm::VectorType::get(CGF->Int32Ty, V1Ty ? 1 : (2 << IsQuad));
4036   case NeonTypeFlags::Int64:
4037   case NeonTypeFlags::Poly64:
4038     return llvm::VectorType::get(CGF->Int64Ty, V1Ty ? 1 : (1 << IsQuad));
4039   case NeonTypeFlags::Poly128:
4040     // FIXME: i128 and f128 doesn't get fully support in Clang and llvm.
4041     // There is a lot of i128 and f128 API missing.
4042     // so we use v16i8 to represent poly128 and get pattern matched.
4043     return llvm::VectorType::get(CGF->Int8Ty, 16);
4044   case NeonTypeFlags::Float32:
4045     return llvm::VectorType::get(CGF->FloatTy, V1Ty ? 1 : (2 << IsQuad));
4046   case NeonTypeFlags::Float64:
4047     return llvm::VectorType::get(CGF->DoubleTy, V1Ty ? 1 : (1 << IsQuad));
4048   }
4049   llvm_unreachable("Unknown vector element type!");
4050 }
4051 
4052 static llvm::VectorType *GetFloatNeonType(CodeGenFunction *CGF,
4053                                           NeonTypeFlags IntTypeFlags) {
4054   int IsQuad = IntTypeFlags.isQuad();
4055   switch (IntTypeFlags.getEltType()) {
4056   case NeonTypeFlags::Int16:
4057     return llvm::VectorType::get(CGF->HalfTy, (4 << IsQuad));
4058   case NeonTypeFlags::Int32:
4059     return llvm::VectorType::get(CGF->FloatTy, (2 << IsQuad));
4060   case NeonTypeFlags::Int64:
4061     return llvm::VectorType::get(CGF->DoubleTy, (1 << IsQuad));
4062   default:
4063     llvm_unreachable("Type can't be converted to floating-point!");
4064   }
4065 }
4066 
4067 Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C) {
4068   unsigned nElts = V->getType()->getVectorNumElements();
4069   Value* SV = llvm::ConstantVector::getSplat(nElts, C);
4070   return Builder.CreateShuffleVector(V, V, SV, "lane");
4071 }
4072 
4073 Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value*> &Ops,
4074                                      const char *name,
4075                                      unsigned shift, bool rightshift) {
4076   unsigned j = 0;
4077   for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
4078        ai != ae; ++ai, ++j)
4079     if (shift > 0 && shift == j)
4080       Ops[j] = EmitNeonShiftVector(Ops[j], ai->getType(), rightshift);
4081     else
4082       Ops[j] = Builder.CreateBitCast(Ops[j], ai->getType(), name);
4083 
4084   return Builder.CreateCall(F, Ops, name);
4085 }
4086 
4087 Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
4088                                             bool neg) {
4089   int SV = cast<ConstantInt>(V)->getSExtValue();
4090   return ConstantInt::get(Ty, neg ? -SV : SV);
4091 }
4092 
4093 // Right-shift a vector by a constant.
4094 Value *CodeGenFunction::EmitNeonRShiftImm(Value *Vec, Value *Shift,
4095                                           llvm::Type *Ty, bool usgn,
4096                                           const char *name) {
4097   llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);
4098 
4099   int ShiftAmt = cast<ConstantInt>(Shift)->getSExtValue();
4100   int EltSize = VTy->getScalarSizeInBits();
4101 
4102   Vec = Builder.CreateBitCast(Vec, Ty);
4103 
4104   // lshr/ashr are undefined when the shift amount is equal to the vector
4105   // element size.
4106   if (ShiftAmt == EltSize) {
4107     if (usgn) {
4108       // Right-shifting an unsigned value by its size yields 0.
4109       return llvm::ConstantAggregateZero::get(VTy);
4110     } else {
4111       // Right-shifting a signed value by its size is equivalent
4112       // to a shift of size-1.
4113       --ShiftAmt;
4114       Shift = ConstantInt::get(VTy->getElementType(), ShiftAmt);
4115     }
4116   }
4117 
4118   Shift = EmitNeonShiftVector(Shift, Ty, false);
4119   if (usgn)
4120     return Builder.CreateLShr(Vec, Shift, name);
4121   else
4122     return Builder.CreateAShr(Vec, Shift, name);
4123 }
4124 
// Flags stored in NeonIntrinsicInfo::TypeModifier. They tell the generic
// NEON emitters later in this file how to build an intrinsic's overloaded
// type list from the builtin call.
// NOTE(review): per-flag meanings below are summarized from the flag names
// and the combination aliases; the consuming code is outside this excerpt —
// confirm against the NEON emitter before relying on them.
enum {
  AddRetType = (1 << 0),        // Include the return type in the overload list.
  Add1ArgType = (1 << 1),       // Include the first argument's type.
  Add2ArgTypes = (1 << 2),      // Include the first two arguments' types.

  VectorizeRetType = (1 << 3),  // Vectorize the return type first.
  VectorizeArgTypes = (1 << 4), // Vectorize the argument types first.

  InventFloatType = (1 << 5),   // Presumably substitute a same-width FP type.
  UnsignedAlts = (1 << 6),      // An unsigned alternate intrinsic exists.

  Use64BitVectors = (1 << 7),   // Force 64-bit (D-register) vector types.
  Use128BitVectors = (1 << 8),  // Force 128-bit (Q-register) vector types.

  // Frequently-used combinations of the flags above.
  Vectorize1ArgType = Add1ArgType | VectorizeArgTypes,
  VectorRet = AddRetType | VectorizeRetType,
  VectorRetGetArgs01 =
      AddRetType | Add2ArgTypes | VectorizeRetType | VectorizeArgTypes,
  FpCmpzModifiers =
      AddRetType | VectorizeRetType | Add1ArgType | InventFloatType
};
4146 
namespace {
// One row of the builtin -> LLVM intrinsic mapping tables that follow
// (e.g. ARMSIMDIntrinsicMap). Entries are aggregate-initialized by the
// NEONMAP* macros below, so member order is load-bearing. The two
// operator< overloads allow tables sorted by BuiltinID to be binary
// searched (std::lower_bound-style) — NOTE(review): the lookup code is
// outside this excerpt; confirm the tables are kept sorted.
struct NeonIntrinsicInfo {
  // Name fragment for the mapped builtin — presumably used as the
  // instruction name hint when emitting; confirm at the use site.
  const char *NameHint;
  // The clang builtin being mapped (NEON::BI__builtin_neon_*).
  unsigned BuiltinID;
  // LLVM intrinsic ID to emit, or 0 when the builtin is handled specially.
  unsigned LLVMIntrinsic;
  // Alternate intrinsic ID (e.g. the signed/unsigned counterpart paired via
  // UnsignedAlts); 0 if none.
  unsigned AltLLVMIntrinsic;
  // Bitmask of the signature-modifier flags defined above.
  unsigned TypeModifier;

  // Compare against a bare builtin ID (enables heterogeneous lower_bound).
  bool operator<(unsigned RHSBuiltinID) const {
    return BuiltinID < RHSBuiltinID;
  }
  // Order table entries by builtin ID.
  bool operator<(const NeonIntrinsicInfo &TE) const {
    return BuiltinID < TE.BuiltinID;
  }
};
} // end anonymous namespace
4163 
// Convenience builders for NeonIntrinsicInfo table entries:
//   NEONMAP0 - builtin with no direct intrinsic (custom-lowered; all the
//              intrinsic/modifier fields are zero).
//   NEONMAP1 - builtin mapped to a single LLVM intrinsic.
//   NEONMAP2 - builtin mapped to an intrinsic plus an alternate one (e.g.
//              an unsigned counterpart selected via UnsignedAlts).
#define NEONMAP0(NameBase) \
  { #NameBase, NEON::BI__builtin_neon_ ## NameBase, 0, 0, 0 }

#define NEONMAP1(NameBase, LLVMIntrinsic, TypeModifier) \
  { #NameBase, NEON:: BI__builtin_neon_ ## NameBase, \
      Intrinsic::LLVMIntrinsic, 0, TypeModifier }

#define NEONMAP2(NameBase, LLVMIntrinsic, AltLLVMIntrinsic, TypeModifier) \
  { #NameBase, NEON:: BI__builtin_neon_ ## NameBase, \
      Intrinsic::LLVMIntrinsic, Intrinsic::AltLLVMIntrinsic, \
      TypeModifier }
4175 
// Map from 32-bit ARM NEON builtins to arm_neon_* (or generic) LLVM
// intrinsics.  NOTE(review): entries appear in BuiltinID order and the
// NeonIntrinsicInfo comparators suggest this table is binary-searched —
// keep it sorted when adding entries.
static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = {
  NEONMAP2(vabd_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
  NEONMAP2(vabdq_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
  NEONMAP1(vabs_v, arm_neon_vabs, 0),
  NEONMAP1(vabsq_v, arm_neon_vabs, 0),
  NEONMAP0(vaddhn_v),
  NEONMAP1(vaesdq_v, arm_neon_aesd, 0),
  NEONMAP1(vaeseq_v, arm_neon_aese, 0),
  NEONMAP1(vaesimcq_v, arm_neon_aesimc, 0),
  NEONMAP1(vaesmcq_v, arm_neon_aesmc, 0),
  NEONMAP1(vbsl_v, arm_neon_vbsl, AddRetType),
  NEONMAP1(vbslq_v, arm_neon_vbsl, AddRetType),
  NEONMAP1(vcage_v, arm_neon_vacge, 0),
  NEONMAP1(vcageq_v, arm_neon_vacge, 0),
  NEONMAP1(vcagt_v, arm_neon_vacgt, 0),
  NEONMAP1(vcagtq_v, arm_neon_vacgt, 0),
  NEONMAP1(vcale_v, arm_neon_vacge, 0),
  NEONMAP1(vcaleq_v, arm_neon_vacge, 0),
  NEONMAP1(vcalt_v, arm_neon_vacgt, 0),
  NEONMAP1(vcaltq_v, arm_neon_vacgt, 0),
  NEONMAP0(vceqz_v),
  NEONMAP0(vceqzq_v),
  NEONMAP0(vcgez_v),
  NEONMAP0(vcgezq_v),
  NEONMAP0(vcgtz_v),
  NEONMAP0(vcgtzq_v),
  NEONMAP0(vclez_v),
  NEONMAP0(vclezq_v),
  NEONMAP1(vcls_v, arm_neon_vcls, Add1ArgType),
  NEONMAP1(vclsq_v, arm_neon_vcls, Add1ArgType),
  NEONMAP0(vcltz_v),
  NEONMAP0(vcltzq_v),
  NEONMAP1(vclz_v, ctlz, Add1ArgType),
  NEONMAP1(vclzq_v, ctlz, Add1ArgType),
  NEONMAP1(vcnt_v, ctpop, Add1ArgType),
  NEONMAP1(vcntq_v, ctpop, Add1ArgType),
  NEONMAP1(vcvt_f16_f32, arm_neon_vcvtfp2hf, 0),
  NEONMAP0(vcvt_f16_v),
  NEONMAP1(vcvt_f32_f16, arm_neon_vcvthf2fp, 0),
  NEONMAP0(vcvt_f32_v),
  NEONMAP2(vcvt_n_f16_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
  NEONMAP2(vcvt_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvt_n_s16_v, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_s32_v, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_s64_v, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_u16_v, arm_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvt_n_u32_v, arm_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvt_n_u64_v, arm_neon_vcvtfp2fxu, 0),
  NEONMAP0(vcvt_s16_v),
  NEONMAP0(vcvt_s32_v),
  NEONMAP0(vcvt_s64_v),
  NEONMAP0(vcvt_u16_v),
  NEONMAP0(vcvt_u32_v),
  NEONMAP0(vcvt_u64_v),
  NEONMAP1(vcvta_s16_v, arm_neon_vcvtas, 0),
  NEONMAP1(vcvta_s32_v, arm_neon_vcvtas, 0),
  NEONMAP1(vcvta_s64_v, arm_neon_vcvtas, 0),
  NEONMAP1(vcvta_u16_v, arm_neon_vcvtau, 0),
  NEONMAP1(vcvta_u32_v, arm_neon_vcvtau, 0),
  NEONMAP1(vcvta_u64_v, arm_neon_vcvtau, 0),
  NEONMAP1(vcvtaq_s16_v, arm_neon_vcvtas, 0),
  NEONMAP1(vcvtaq_s32_v, arm_neon_vcvtas, 0),
  NEONMAP1(vcvtaq_s64_v, arm_neon_vcvtas, 0),
  NEONMAP1(vcvtaq_u16_v, arm_neon_vcvtau, 0),
  NEONMAP1(vcvtaq_u32_v, arm_neon_vcvtau, 0),
  NEONMAP1(vcvtaq_u64_v, arm_neon_vcvtau, 0),
  NEONMAP1(vcvtm_s16_v, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtm_s32_v, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtm_s64_v, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtm_u16_v, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtm_u32_v, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtm_u64_v, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtmq_s16_v, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtmq_s32_v, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtmq_s64_v, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtmq_u16_v, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtmq_u32_v, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtmq_u64_v, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtn_s16_v, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtn_s32_v, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtn_s64_v, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtn_u16_v, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtn_u32_v, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtn_u64_v, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtnq_s16_v, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtnq_s32_v, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtnq_s64_v, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtnq_u16_v, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtnq_u32_v, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtnq_u64_v, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtp_s16_v, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtp_s32_v, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtp_s64_v, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtp_u16_v, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtp_u32_v, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtp_u64_v, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtpq_s16_v, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtpq_s32_v, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtpq_s64_v, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtpq_u16_v, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtpq_u32_v, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtpq_u64_v, arm_neon_vcvtpu, 0),
  NEONMAP0(vcvtq_f16_v),
  NEONMAP0(vcvtq_f32_v),
  NEONMAP2(vcvtq_n_f16_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
  NEONMAP2(vcvtq_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvtq_n_s16_v, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_s32_v, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_s64_v, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_u16_v, arm_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtq_n_u32_v, arm_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtq_n_u64_v, arm_neon_vcvtfp2fxu, 0),
  NEONMAP0(vcvtq_s16_v),
  NEONMAP0(vcvtq_s32_v),
  NEONMAP0(vcvtq_s64_v),
  NEONMAP0(vcvtq_u16_v),
  NEONMAP0(vcvtq_u32_v),
  NEONMAP0(vcvtq_u64_v),
  NEONMAP2(vdot_v, arm_neon_udot, arm_neon_sdot, 0),
  NEONMAP2(vdotq_v, arm_neon_udot, arm_neon_sdot, 0),
  NEONMAP0(vext_v),
  NEONMAP0(vextq_v),
  NEONMAP0(vfma_v),
  NEONMAP0(vfmaq_v),
  NEONMAP2(vhadd_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhaddq_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhsub_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhsubq_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
  NEONMAP0(vld1_dup_v),
  NEONMAP1(vld1_v, arm_neon_vld1, 0),
  NEONMAP1(vld1_x2_v, arm_neon_vld1x2, 0),
  NEONMAP1(vld1_x3_v, arm_neon_vld1x3, 0),
  NEONMAP1(vld1_x4_v, arm_neon_vld1x4, 0),
  NEONMAP0(vld1q_dup_v),
  NEONMAP1(vld1q_v, arm_neon_vld1, 0),
  NEONMAP1(vld1q_x2_v, arm_neon_vld1x2, 0),
  NEONMAP1(vld1q_x3_v, arm_neon_vld1x3, 0),
  NEONMAP1(vld1q_x4_v, arm_neon_vld1x4, 0),
  NEONMAP1(vld2_dup_v, arm_neon_vld2dup, 0),
  NEONMAP1(vld2_lane_v, arm_neon_vld2lane, 0),
  NEONMAP1(vld2_v, arm_neon_vld2, 0),
  NEONMAP1(vld2q_dup_v, arm_neon_vld2dup, 0),
  NEONMAP1(vld2q_lane_v, arm_neon_vld2lane, 0),
  NEONMAP1(vld2q_v, arm_neon_vld2, 0),
  NEONMAP1(vld3_dup_v, arm_neon_vld3dup, 0),
  NEONMAP1(vld3_lane_v, arm_neon_vld3lane, 0),
  NEONMAP1(vld3_v, arm_neon_vld3, 0),
  NEONMAP1(vld3q_dup_v, arm_neon_vld3dup, 0),
  NEONMAP1(vld3q_lane_v, arm_neon_vld3lane, 0),
  NEONMAP1(vld3q_v, arm_neon_vld3, 0),
  NEONMAP1(vld4_dup_v, arm_neon_vld4dup, 0),
  NEONMAP1(vld4_lane_v, arm_neon_vld4lane, 0),
  NEONMAP1(vld4_v, arm_neon_vld4, 0),
  NEONMAP1(vld4q_dup_v, arm_neon_vld4dup, 0),
  NEONMAP1(vld4q_lane_v, arm_neon_vld4lane, 0),
  NEONMAP1(vld4q_v, arm_neon_vld4, 0),
  NEONMAP2(vmax_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
  NEONMAP1(vmaxnm_v, arm_neon_vmaxnm, Add1ArgType),
  NEONMAP1(vmaxnmq_v, arm_neon_vmaxnm, Add1ArgType),
  NEONMAP2(vmaxq_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vmin_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
  NEONMAP1(vminnm_v, arm_neon_vminnm, Add1ArgType),
  NEONMAP1(vminnmq_v, arm_neon_vminnm, Add1ArgType),
  NEONMAP2(vminq_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
  NEONMAP0(vmovl_v),
  NEONMAP0(vmovn_v),
  NEONMAP1(vmul_v, arm_neon_vmulp, Add1ArgType),
  NEONMAP0(vmull_v),
  NEONMAP1(vmulq_v, arm_neon_vmulp, Add1ArgType),
  NEONMAP2(vpadal_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
  NEONMAP2(vpadalq_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
  NEONMAP1(vpadd_v, arm_neon_vpadd, Add1ArgType),
  NEONMAP2(vpaddl_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
  NEONMAP2(vpaddlq_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
  NEONMAP1(vpaddq_v, arm_neon_vpadd, Add1ArgType),
  NEONMAP2(vpmax_v, arm_neon_vpmaxu, arm_neon_vpmaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vpmin_v, arm_neon_vpminu, arm_neon_vpmins, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqabs_v, arm_neon_vqabs, Add1ArgType),
  NEONMAP1(vqabsq_v, arm_neon_vqabs, Add1ArgType),
  NEONMAP2(vqadd_v, arm_neon_vqaddu, arm_neon_vqadds, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqaddq_v, arm_neon_vqaddu, arm_neon_vqadds, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqdmlal_v, arm_neon_vqdmull, arm_neon_vqadds, 0),
  NEONMAP2(vqdmlsl_v, arm_neon_vqdmull, arm_neon_vqsubs, 0),
  NEONMAP1(vqdmulh_v, arm_neon_vqdmulh, Add1ArgType),
  NEONMAP1(vqdmulhq_v, arm_neon_vqdmulh, Add1ArgType),
  NEONMAP1(vqdmull_v, arm_neon_vqdmull, Add1ArgType),
  NEONMAP2(vqmovn_v, arm_neon_vqmovnu, arm_neon_vqmovns, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqmovun_v, arm_neon_vqmovnsu, Add1ArgType),
  NEONMAP1(vqneg_v, arm_neon_vqneg, Add1ArgType),
  NEONMAP1(vqnegq_v, arm_neon_vqneg, Add1ArgType),
  NEONMAP1(vqrdmulh_v, arm_neon_vqrdmulh, Add1ArgType),
  NEONMAP1(vqrdmulhq_v, arm_neon_vqrdmulh, Add1ArgType),
  NEONMAP2(vqrshl_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqrshlq_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqshl_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
  NEONMAP2(vqshl_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqshlq_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
  NEONMAP2(vqshlq_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqshlu_n_v, arm_neon_vqshiftsu, 0),
  NEONMAP1(vqshluq_n_v, arm_neon_vqshiftsu, 0),
  NEONMAP2(vqsub_v, arm_neon_vqsubu, arm_neon_vqsubs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqsubq_v, arm_neon_vqsubu, arm_neon_vqsubs, Add1ArgType | UnsignedAlts),
  NEONMAP1(vraddhn_v, arm_neon_vraddhn, Add1ArgType),
  NEONMAP2(vrecpe_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
  NEONMAP2(vrecpeq_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
  NEONMAP1(vrecps_v, arm_neon_vrecps, Add1ArgType),
  NEONMAP1(vrecpsq_v, arm_neon_vrecps, Add1ArgType),
  NEONMAP2(vrhadd_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrhaddq_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
  NEONMAP1(vrnd_v, arm_neon_vrintz, Add1ArgType),
  NEONMAP1(vrnda_v, arm_neon_vrinta, Add1ArgType),
  NEONMAP1(vrndaq_v, arm_neon_vrinta, Add1ArgType),
  NEONMAP0(vrndi_v),
  NEONMAP0(vrndiq_v),
  NEONMAP1(vrndm_v, arm_neon_vrintm, Add1ArgType),
  NEONMAP1(vrndmq_v, arm_neon_vrintm, Add1ArgType),
  NEONMAP1(vrndn_v, arm_neon_vrintn, Add1ArgType),
  NEONMAP1(vrndnq_v, arm_neon_vrintn, Add1ArgType),
  NEONMAP1(vrndp_v, arm_neon_vrintp, Add1ArgType),
  NEONMAP1(vrndpq_v, arm_neon_vrintp, Add1ArgType),
  NEONMAP1(vrndq_v, arm_neon_vrintz, Add1ArgType),
  NEONMAP1(vrndx_v, arm_neon_vrintx, Add1ArgType),
  NEONMAP1(vrndxq_v, arm_neon_vrintx, Add1ArgType),
  NEONMAP2(vrshl_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrshlq_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrshr_n_v, arm_neon_vrshiftu, arm_neon_vrshifts, UnsignedAlts),
  NEONMAP2(vrshrq_n_v, arm_neon_vrshiftu, arm_neon_vrshifts, UnsignedAlts),
  NEONMAP2(vrsqrte_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
  NEONMAP2(vrsqrteq_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
  NEONMAP1(vrsqrts_v, arm_neon_vrsqrts, Add1ArgType),
  NEONMAP1(vrsqrtsq_v, arm_neon_vrsqrts, Add1ArgType),
  NEONMAP1(vrsubhn_v, arm_neon_vrsubhn, Add1ArgType),
  NEONMAP1(vsha1su0q_v, arm_neon_sha1su0, 0),
  NEONMAP1(vsha1su1q_v, arm_neon_sha1su1, 0),
  NEONMAP1(vsha256h2q_v, arm_neon_sha256h2, 0),
  NEONMAP1(vsha256hq_v, arm_neon_sha256h, 0),
  NEONMAP1(vsha256su0q_v, arm_neon_sha256su0, 0),
  NEONMAP1(vsha256su1q_v, arm_neon_sha256su1, 0),
  NEONMAP0(vshl_n_v),
  NEONMAP2(vshl_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
  NEONMAP0(vshll_n_v),
  NEONMAP0(vshlq_n_v),
  NEONMAP2(vshlq_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
  NEONMAP0(vshr_n_v),
  NEONMAP0(vshrn_n_v),
  NEONMAP0(vshrq_n_v),
  NEONMAP1(vst1_v, arm_neon_vst1, 0),
  NEONMAP1(vst1_x2_v, arm_neon_vst1x2, 0),
  NEONMAP1(vst1_x3_v, arm_neon_vst1x3, 0),
  NEONMAP1(vst1_x4_v, arm_neon_vst1x4, 0),
  NEONMAP1(vst1q_v, arm_neon_vst1, 0),
  NEONMAP1(vst1q_x2_v, arm_neon_vst1x2, 0),
  NEONMAP1(vst1q_x3_v, arm_neon_vst1x3, 0),
  NEONMAP1(vst1q_x4_v, arm_neon_vst1x4, 0),
  NEONMAP1(vst2_lane_v, arm_neon_vst2lane, 0),
  NEONMAP1(vst2_v, arm_neon_vst2, 0),
  NEONMAP1(vst2q_lane_v, arm_neon_vst2lane, 0),
  NEONMAP1(vst2q_v, arm_neon_vst2, 0),
  NEONMAP1(vst3_lane_v, arm_neon_vst3lane, 0),
  NEONMAP1(vst3_v, arm_neon_vst3, 0),
  NEONMAP1(vst3q_lane_v, arm_neon_vst3lane, 0),
  NEONMAP1(vst3q_v, arm_neon_vst3, 0),
  NEONMAP1(vst4_lane_v, arm_neon_vst4lane, 0),
  NEONMAP1(vst4_v, arm_neon_vst4, 0),
  NEONMAP1(vst4q_lane_v, arm_neon_vst4lane, 0),
  NEONMAP1(vst4q_v, arm_neon_vst4, 0),
  NEONMAP0(vsubhn_v),
  NEONMAP0(vtrn_v),
  NEONMAP0(vtrnq_v),
  NEONMAP0(vtst_v),
  NEONMAP0(vtstq_v),
  NEONMAP0(vuzp_v),
  NEONMAP0(vuzpq_v),
  NEONMAP0(vzip_v),
  NEONMAP0(vzipq_v)
};
4452 
// Map from AArch64 NEON (vector) builtins to aarch64_neon_* / aarch64_crypto_*
// (or generic) LLVM intrinsics.  NOTE(review): entries appear in BuiltinID
// order and the NeonIntrinsicInfo comparators suggest this table is
// binary-searched — keep it sorted when adding entries.
static const NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
  NEONMAP1(vabs_v, aarch64_neon_abs, 0),
  NEONMAP1(vabsq_v, aarch64_neon_abs, 0),
  NEONMAP0(vaddhn_v),
  NEONMAP1(vaesdq_v, aarch64_crypto_aesd, 0),
  NEONMAP1(vaeseq_v, aarch64_crypto_aese, 0),
  NEONMAP1(vaesimcq_v, aarch64_crypto_aesimc, 0),
  NEONMAP1(vaesmcq_v, aarch64_crypto_aesmc, 0),
  NEONMAP1(vcage_v, aarch64_neon_facge, 0),
  NEONMAP1(vcageq_v, aarch64_neon_facge, 0),
  NEONMAP1(vcagt_v, aarch64_neon_facgt, 0),
  NEONMAP1(vcagtq_v, aarch64_neon_facgt, 0),
  NEONMAP1(vcale_v, aarch64_neon_facge, 0),
  NEONMAP1(vcaleq_v, aarch64_neon_facge, 0),
  NEONMAP1(vcalt_v, aarch64_neon_facgt, 0),
  NEONMAP1(vcaltq_v, aarch64_neon_facgt, 0),
  NEONMAP0(vceqz_v),
  NEONMAP0(vceqzq_v),
  NEONMAP0(vcgez_v),
  NEONMAP0(vcgezq_v),
  NEONMAP0(vcgtz_v),
  NEONMAP0(vcgtzq_v),
  NEONMAP0(vclez_v),
  NEONMAP0(vclezq_v),
  NEONMAP1(vcls_v, aarch64_neon_cls, Add1ArgType),
  NEONMAP1(vclsq_v, aarch64_neon_cls, Add1ArgType),
  NEONMAP0(vcltz_v),
  NEONMAP0(vcltzq_v),
  NEONMAP1(vclz_v, ctlz, Add1ArgType),
  NEONMAP1(vclzq_v, ctlz, Add1ArgType),
  NEONMAP1(vcnt_v, ctpop, Add1ArgType),
  NEONMAP1(vcntq_v, ctpop, Add1ArgType),
  NEONMAP1(vcvt_f16_f32, aarch64_neon_vcvtfp2hf, 0),
  NEONMAP0(vcvt_f16_v),
  NEONMAP1(vcvt_f32_f16, aarch64_neon_vcvthf2fp, 0),
  NEONMAP0(vcvt_f32_v),
  NEONMAP2(vcvt_n_f16_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP2(vcvt_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP2(vcvt_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvt_n_s16_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_u16_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvt_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvt_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP0(vcvtq_f16_v),
  NEONMAP0(vcvtq_f32_v),
  NEONMAP2(vcvtq_n_f16_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP2(vcvtq_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvtq_n_s16_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_u16_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtq_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtq_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtx_f32_v, aarch64_neon_fcvtxn, AddRetType | Add1ArgType),
  NEONMAP2(vdot_v, aarch64_neon_udot, aarch64_neon_sdot, 0),
  NEONMAP2(vdotq_v, aarch64_neon_udot, aarch64_neon_sdot, 0),
  NEONMAP0(vext_v),
  NEONMAP0(vextq_v),
  NEONMAP0(vfma_v),
  NEONMAP0(vfmaq_v),
  NEONMAP1(vfmlal_high_v, aarch64_neon_fmlal2, 0),
  NEONMAP1(vfmlal_low_v, aarch64_neon_fmlal, 0),
  NEONMAP1(vfmlalq_high_v, aarch64_neon_fmlal2, 0),
  NEONMAP1(vfmlalq_low_v, aarch64_neon_fmlal, 0),
  NEONMAP1(vfmlsl_high_v, aarch64_neon_fmlsl2, 0),
  NEONMAP1(vfmlsl_low_v, aarch64_neon_fmlsl, 0),
  NEONMAP1(vfmlslq_high_v, aarch64_neon_fmlsl2, 0),
  NEONMAP1(vfmlslq_low_v, aarch64_neon_fmlsl, 0),
  NEONMAP2(vhadd_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhaddq_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhsub_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhsubq_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
  NEONMAP1(vld1_x2_v, aarch64_neon_ld1x2, 0),
  NEONMAP1(vld1_x3_v, aarch64_neon_ld1x3, 0),
  NEONMAP1(vld1_x4_v, aarch64_neon_ld1x4, 0),
  NEONMAP1(vld1q_x2_v, aarch64_neon_ld1x2, 0),
  NEONMAP1(vld1q_x3_v, aarch64_neon_ld1x3, 0),
  NEONMAP1(vld1q_x4_v, aarch64_neon_ld1x4, 0),
  NEONMAP0(vmovl_v),
  NEONMAP0(vmovn_v),
  NEONMAP1(vmul_v, aarch64_neon_pmul, Add1ArgType),
  NEONMAP1(vmulq_v, aarch64_neon_pmul, Add1ArgType),
  NEONMAP1(vpadd_v, aarch64_neon_addp, Add1ArgType),
  NEONMAP2(vpaddl_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
  NEONMAP2(vpaddlq_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
  NEONMAP1(vpaddq_v, aarch64_neon_addp, Add1ArgType),
  NEONMAP1(vqabs_v, aarch64_neon_sqabs, Add1ArgType),
  NEONMAP1(vqabsq_v, aarch64_neon_sqabs, Add1ArgType),
  NEONMAP2(vqadd_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqaddq_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqdmlal_v, aarch64_neon_sqdmull, aarch64_neon_sqadd, 0),
  NEONMAP2(vqdmlsl_v, aarch64_neon_sqdmull, aarch64_neon_sqsub, 0),
  NEONMAP1(vqdmulh_v, aarch64_neon_sqdmulh, Add1ArgType),
  NEONMAP1(vqdmulhq_v, aarch64_neon_sqdmulh, Add1ArgType),
  NEONMAP1(vqdmull_v, aarch64_neon_sqdmull, Add1ArgType),
  NEONMAP2(vqmovn_v, aarch64_neon_uqxtn, aarch64_neon_sqxtn, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqmovun_v, aarch64_neon_sqxtun, Add1ArgType),
  NEONMAP1(vqneg_v, aarch64_neon_sqneg, Add1ArgType),
  NEONMAP1(vqnegq_v, aarch64_neon_sqneg, Add1ArgType),
  NEONMAP1(vqrdmulh_v, aarch64_neon_sqrdmulh, Add1ArgType),
  NEONMAP1(vqrdmulhq_v, aarch64_neon_sqrdmulh, Add1ArgType),
  NEONMAP2(vqrshl_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqrshlq_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqshl_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl, UnsignedAlts),
  NEONMAP2(vqshl_v, aarch64_neon_uqshl, aarch64_neon_sqshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqshlq_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl,UnsignedAlts),
  NEONMAP2(vqshlq_v, aarch64_neon_uqshl, aarch64_neon_sqshl, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqshlu_n_v, aarch64_neon_sqshlu, 0),
  NEONMAP1(vqshluq_n_v, aarch64_neon_sqshlu, 0),
  NEONMAP2(vqsub_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqsubq_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts),
  NEONMAP1(vraddhn_v, aarch64_neon_raddhn, Add1ArgType),
  NEONMAP2(vrecpe_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
  NEONMAP2(vrecpeq_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
  NEONMAP1(vrecps_v, aarch64_neon_frecps, Add1ArgType),
  NEONMAP1(vrecpsq_v, aarch64_neon_frecps, Add1ArgType),
  NEONMAP2(vrhadd_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrhaddq_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts),
  NEONMAP0(vrndi_v),
  NEONMAP0(vrndiq_v),
  NEONMAP2(vrshl_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrshlq_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrshr_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
  NEONMAP2(vrshrq_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
  NEONMAP2(vrsqrte_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
  NEONMAP2(vrsqrteq_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
  NEONMAP1(vrsqrts_v, aarch64_neon_frsqrts, Add1ArgType),
  NEONMAP1(vrsqrtsq_v, aarch64_neon_frsqrts, Add1ArgType),
  NEONMAP1(vrsubhn_v, aarch64_neon_rsubhn, Add1ArgType),
  NEONMAP1(vsha1su0q_v, aarch64_crypto_sha1su0, 0),
  NEONMAP1(vsha1su1q_v, aarch64_crypto_sha1su1, 0),
  NEONMAP1(vsha256h2q_v, aarch64_crypto_sha256h2, 0),
  NEONMAP1(vsha256hq_v, aarch64_crypto_sha256h, 0),
  NEONMAP1(vsha256su0q_v, aarch64_crypto_sha256su0, 0),
  NEONMAP1(vsha256su1q_v, aarch64_crypto_sha256su1, 0),
  NEONMAP0(vshl_n_v),
  NEONMAP2(vshl_v, aarch64_neon_ushl, aarch64_neon_sshl, Add1ArgType | UnsignedAlts),
  NEONMAP0(vshll_n_v),
  NEONMAP0(vshlq_n_v),
  NEONMAP2(vshlq_v, aarch64_neon_ushl, aarch64_neon_sshl, Add1ArgType | UnsignedAlts),
  NEONMAP0(vshr_n_v),
  NEONMAP0(vshrn_n_v),
  NEONMAP0(vshrq_n_v),
  NEONMAP1(vst1_x2_v, aarch64_neon_st1x2, 0),
  NEONMAP1(vst1_x3_v, aarch64_neon_st1x3, 0),
  NEONMAP1(vst1_x4_v, aarch64_neon_st1x4, 0),
  NEONMAP1(vst1q_x2_v, aarch64_neon_st1x2, 0),
  NEONMAP1(vst1q_x3_v, aarch64_neon_st1x3, 0),
  NEONMAP1(vst1q_x4_v, aarch64_neon_st1x4, 0),
  NEONMAP0(vsubhn_v),
  NEONMAP0(vtst_v),
  NEONMAP0(vtstq_v),
};
4609 
4610 static const NeonIntrinsicInfo AArch64SISDIntrinsicMap[] = {
4611   NEONMAP1(vabdd_f64, aarch64_sisd_fabd, Add1ArgType),
4612   NEONMAP1(vabds_f32, aarch64_sisd_fabd, Add1ArgType),
4613   NEONMAP1(vabsd_s64, aarch64_neon_abs, Add1ArgType),
4614   NEONMAP1(vaddlv_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType),
4615   NEONMAP1(vaddlv_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType),
4616   NEONMAP1(vaddlvq_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType),
4617   NEONMAP1(vaddlvq_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType),
4618   NEONMAP1(vaddv_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
4619   NEONMAP1(vaddv_s32, aarch64_neon_saddv, AddRetType | Add1ArgType),
4620   NEONMAP1(vaddv_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType),
4621   NEONMAP1(vaddvq_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
4622   NEONMAP1(vaddvq_f64, aarch64_neon_faddv, AddRetType | Add1ArgType),
4623   NEONMAP1(vaddvq_s32, aarch64_neon_saddv, AddRetType | Add1ArgType),
4624   NEONMAP1(vaddvq_s64, aarch64_neon_saddv, AddRetType | Add1ArgType),
4625   NEONMAP1(vaddvq_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType),
4626   NEONMAP1(vaddvq_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
4627   NEONMAP1(vcaged_f64, aarch64_neon_facge, AddRetType | Add1ArgType),
4628   NEONMAP1(vcages_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
4629   NEONMAP1(vcagtd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
4630   NEONMAP1(vcagts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType),
4631   NEONMAP1(vcaled_f64, aarch64_neon_facge, AddRetType | Add1ArgType),
4632   NEONMAP1(vcales_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
4633   NEONMAP1(vcaltd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
4634   NEONMAP1(vcalts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType),
4635   NEONMAP1(vcvtad_s64_f64, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
4636   NEONMAP1(vcvtad_u64_f64, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
4637   NEONMAP1(vcvtas_s32_f32, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
4638   NEONMAP1(vcvtas_u32_f32, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
4639   NEONMAP1(vcvtd_n_f64_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
4640   NEONMAP1(vcvtd_n_f64_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
4641   NEONMAP1(vcvtd_n_s64_f64, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
4642   NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
4643   NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
4644   NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
4645   NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
4646   NEONMAP1(vcvtms_u32_f32, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
4647   NEONMAP1(vcvtnd_s64_f64, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
4648   NEONMAP1(vcvtnd_u64_f64, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
4649   NEONMAP1(vcvtns_s32_f32, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
4650   NEONMAP1(vcvtns_u32_f32, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
4651   NEONMAP1(vcvtpd_s64_f64, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
4652   NEONMAP1(vcvtpd_u64_f64, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
4653   NEONMAP1(vcvtps_s32_f32, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
4654   NEONMAP1(vcvtps_u32_f32, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
4655   NEONMAP1(vcvts_n_f32_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
4656   NEONMAP1(vcvts_n_f32_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
4657   NEONMAP1(vcvts_n_s32_f32, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
4658   NEONMAP1(vcvts_n_u32_f32, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
4659   NEONMAP1(vcvtxd_f32_f64, aarch64_sisd_fcvtxn, 0),
4660   NEONMAP1(vmaxnmv_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
4661   NEONMAP1(vmaxnmvq_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
4662   NEONMAP1(vmaxnmvq_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
4663   NEONMAP1(vmaxv_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
4664   NEONMAP1(vmaxv_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType),
4665   NEONMAP1(vmaxv_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType),
4666   NEONMAP1(vmaxvq_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
4667   NEONMAP1(vmaxvq_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
4668   NEONMAP1(vmaxvq_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType),
4669   NEONMAP1(vmaxvq_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType),
4670   NEONMAP1(vminnmv_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
4671   NEONMAP1(vminnmvq_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
4672   NEONMAP1(vminnmvq_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
4673   NEONMAP1(vminv_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
4674   NEONMAP1(vminv_s32, aarch64_neon_sminv, AddRetType | Add1ArgType),
4675   NEONMAP1(vminv_u32, aarch64_neon_uminv, AddRetType | Add1ArgType),
4676   NEONMAP1(vminvq_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
4677   NEONMAP1(vminvq_f64, aarch64_neon_fminv, AddRetType | Add1ArgType),
4678   NEONMAP1(vminvq_s32, aarch64_neon_sminv, AddRetType | Add1ArgType),
4679   NEONMAP1(vminvq_u32, aarch64_neon_uminv, AddRetType | Add1ArgType),
4680   NEONMAP1(vmull_p64, aarch64_neon_pmull64, 0),
4681   NEONMAP1(vmulxd_f64, aarch64_neon_fmulx, Add1ArgType),
4682   NEONMAP1(vmulxs_f32, aarch64_neon_fmulx, Add1ArgType),
4683   NEONMAP1(vpaddd_s64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
4684   NEONMAP1(vpaddd_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
4685   NEONMAP1(vpmaxnmqd_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
4686   NEONMAP1(vpmaxnms_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
4687   NEONMAP1(vpmaxqd_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
4688   NEONMAP1(vpmaxs_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
4689   NEONMAP1(vpminnmqd_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
4690   NEONMAP1(vpminnms_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
4691   NEONMAP1(vpminqd_f64, aarch64_neon_fminv, AddRetType | Add1ArgType),
4692   NEONMAP1(vpmins_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
4693   NEONMAP1(vqabsb_s8, aarch64_neon_sqabs, Vectorize1ArgType | Use64BitVectors),
4694   NEONMAP1(vqabsd_s64, aarch64_neon_sqabs, Add1ArgType),
4695   NEONMAP1(vqabsh_s16, aarch64_neon_sqabs, Vectorize1ArgType | Use64BitVectors),
4696   NEONMAP1(vqabss_s32, aarch64_neon_sqabs, Add1ArgType),
4697   NEONMAP1(vqaddb_s8, aarch64_neon_sqadd, Vectorize1ArgType | Use64BitVectors),
4698   NEONMAP1(vqaddb_u8, aarch64_neon_uqadd, Vectorize1ArgType | Use64BitVectors),
4699   NEONMAP1(vqaddd_s64, aarch64_neon_sqadd, Add1ArgType),
4700   NEONMAP1(vqaddd_u64, aarch64_neon_uqadd, Add1ArgType),
4701   NEONMAP1(vqaddh_s16, aarch64_neon_sqadd, Vectorize1ArgType | Use64BitVectors),
4702   NEONMAP1(vqaddh_u16, aarch64_neon_uqadd, Vectorize1ArgType | Use64BitVectors),
4703   NEONMAP1(vqadds_s32, aarch64_neon_sqadd, Add1ArgType),
4704   NEONMAP1(vqadds_u32, aarch64_neon_uqadd, Add1ArgType),
4705   NEONMAP1(vqdmulhh_s16, aarch64_neon_sqdmulh, Vectorize1ArgType | Use64BitVectors),
4706   NEONMAP1(vqdmulhs_s32, aarch64_neon_sqdmulh, Add1ArgType),
4707   NEONMAP1(vqdmullh_s16, aarch64_neon_sqdmull, VectorRet | Use128BitVectors),
4708   NEONMAP1(vqdmulls_s32, aarch64_neon_sqdmulls_scalar, 0),
4709   NEONMAP1(vqmovnd_s64, aarch64_neon_scalar_sqxtn, AddRetType | Add1ArgType),
4710   NEONMAP1(vqmovnd_u64, aarch64_neon_scalar_uqxtn, AddRetType | Add1ArgType),
4711   NEONMAP1(vqmovnh_s16, aarch64_neon_sqxtn, VectorRet | Use64BitVectors),
4712   NEONMAP1(vqmovnh_u16, aarch64_neon_uqxtn, VectorRet | Use64BitVectors),
4713   NEONMAP1(vqmovns_s32, aarch64_neon_sqxtn, VectorRet | Use64BitVectors),
4714   NEONMAP1(vqmovns_u32, aarch64_neon_uqxtn, VectorRet | Use64BitVectors),
4715   NEONMAP1(vqmovund_s64, aarch64_neon_scalar_sqxtun, AddRetType | Add1ArgType),
4716   NEONMAP1(vqmovunh_s16, aarch64_neon_sqxtun, VectorRet | Use64BitVectors),
4717   NEONMAP1(vqmovuns_s32, aarch64_neon_sqxtun, VectorRet | Use64BitVectors),
4718   NEONMAP1(vqnegb_s8, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors),
4719   NEONMAP1(vqnegd_s64, aarch64_neon_sqneg, Add1ArgType),
4720   NEONMAP1(vqnegh_s16, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors),
4721   NEONMAP1(vqnegs_s32, aarch64_neon_sqneg, Add1ArgType),
4722   NEONMAP1(vqrdmulhh_s16, aarch64_neon_sqrdmulh, Vectorize1ArgType | Use64BitVectors),
4723   NEONMAP1(vqrdmulhs_s32, aarch64_neon_sqrdmulh, Add1ArgType),
4724   NEONMAP1(vqrshlb_s8, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors),
4725   NEONMAP1(vqrshlb_u8, aarch64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors),
4726   NEONMAP1(vqrshld_s64, aarch64_neon_sqrshl, Add1ArgType),
4727   NEONMAP1(vqrshld_u64, aarch64_neon_uqrshl, Add1ArgType),
4728   NEONMAP1(vqrshlh_s16, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors),
4729   NEONMAP1(vqrshlh_u16, aarch64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors),
4730   NEONMAP1(vqrshls_s32, aarch64_neon_sqrshl, Add1ArgType),
4731   NEONMAP1(vqrshls_u32, aarch64_neon_uqrshl, Add1ArgType),
4732   NEONMAP1(vqrshrnd_n_s64, aarch64_neon_sqrshrn, AddRetType),
4733   NEONMAP1(vqrshrnd_n_u64, aarch64_neon_uqrshrn, AddRetType),
4734   NEONMAP1(vqrshrnh_n_s16, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors),
4735   NEONMAP1(vqrshrnh_n_u16, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors),
4736   NEONMAP1(vqrshrns_n_s32, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors),
4737   NEONMAP1(vqrshrns_n_u32, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors),
4738   NEONMAP1(vqrshrund_n_s64, aarch64_neon_sqrshrun, AddRetType),
4739   NEONMAP1(vqrshrunh_n_s16, aarch64_neon_sqrshrun, VectorRet | Use64BitVectors),
4740   NEONMAP1(vqrshruns_n_s32, aarch64_neon_sqrshrun, VectorRet | Use64BitVectors),
4741   NEONMAP1(vqshlb_n_s8, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
4742   NEONMAP1(vqshlb_n_u8, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
4743   NEONMAP1(vqshlb_s8, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
4744   NEONMAP1(vqshlb_u8, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
4745   NEONMAP1(vqshld_s64, aarch64_neon_sqshl, Add1ArgType),
4746   NEONMAP1(vqshld_u64, aarch64_neon_uqshl, Add1ArgType),
4747   NEONMAP1(vqshlh_n_s16, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
4748   NEONMAP1(vqshlh_n_u16, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
4749   NEONMAP1(vqshlh_s16, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
4750   NEONMAP1(vqshlh_u16, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
4751   NEONMAP1(vqshls_n_s32, aarch64_neon_sqshl, Add1ArgType),
4752   NEONMAP1(vqshls_n_u32, aarch64_neon_uqshl, Add1ArgType),
4753   NEONMAP1(vqshls_s32, aarch64_neon_sqshl, Add1ArgType),
4754   NEONMAP1(vqshls_u32, aarch64_neon_uqshl, Add1ArgType),
4755   NEONMAP1(vqshlub_n_s8, aarch64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors),
4756   NEONMAP1(vqshluh_n_s16, aarch64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors),
4757   NEONMAP1(vqshlus_n_s32, aarch64_neon_sqshlu, Add1ArgType),
4758   NEONMAP1(vqshrnd_n_s64, aarch64_neon_sqshrn, AddRetType),
4759   NEONMAP1(vqshrnd_n_u64, aarch64_neon_uqshrn, AddRetType),
4760   NEONMAP1(vqshrnh_n_s16, aarch64_neon_sqshrn, VectorRet | Use64BitVectors),
4761   NEONMAP1(vqshrnh_n_u16, aarch64_neon_uqshrn, VectorRet | Use64BitVectors),
4762   NEONMAP1(vqshrns_n_s32, aarch64_neon_sqshrn, VectorRet | Use64BitVectors),
4763   NEONMAP1(vqshrns_n_u32, aarch64_neon_uqshrn, VectorRet | Use64BitVectors),
4764   NEONMAP1(vqshrund_n_s64, aarch64_neon_sqshrun, AddRetType),
4765   NEONMAP1(vqshrunh_n_s16, aarch64_neon_sqshrun, VectorRet | Use64BitVectors),
4766   NEONMAP1(vqshruns_n_s32, aarch64_neon_sqshrun, VectorRet | Use64BitVectors),
4767   NEONMAP1(vqsubb_s8, aarch64_neon_sqsub, Vectorize1ArgType | Use64BitVectors),
4768   NEONMAP1(vqsubb_u8, aarch64_neon_uqsub, Vectorize1ArgType | Use64BitVectors),
4769   NEONMAP1(vqsubd_s64, aarch64_neon_sqsub, Add1ArgType),
4770   NEONMAP1(vqsubd_u64, aarch64_neon_uqsub, Add1ArgType),
4771   NEONMAP1(vqsubh_s16, aarch64_neon_sqsub, Vectorize1ArgType | Use64BitVectors),
4772   NEONMAP1(vqsubh_u16, aarch64_neon_uqsub, Vectorize1ArgType | Use64BitVectors),
4773   NEONMAP1(vqsubs_s32, aarch64_neon_sqsub, Add1ArgType),
4774   NEONMAP1(vqsubs_u32, aarch64_neon_uqsub, Add1ArgType),
4775   NEONMAP1(vrecped_f64, aarch64_neon_frecpe, Add1ArgType),
4776   NEONMAP1(vrecpes_f32, aarch64_neon_frecpe, Add1ArgType),
4777   NEONMAP1(vrecpxd_f64, aarch64_neon_frecpx, Add1ArgType),
4778   NEONMAP1(vrecpxs_f32, aarch64_neon_frecpx, Add1ArgType),
4779   NEONMAP1(vrshld_s64, aarch64_neon_srshl, Add1ArgType),
4780   NEONMAP1(vrshld_u64, aarch64_neon_urshl, Add1ArgType),
4781   NEONMAP1(vrsqrted_f64, aarch64_neon_frsqrte, Add1ArgType),
4782   NEONMAP1(vrsqrtes_f32, aarch64_neon_frsqrte, Add1ArgType),
4783   NEONMAP1(vrsqrtsd_f64, aarch64_neon_frsqrts, Add1ArgType),
4784   NEONMAP1(vrsqrtss_f32, aarch64_neon_frsqrts, Add1ArgType),
4785   NEONMAP1(vsha1cq_u32, aarch64_crypto_sha1c, 0),
4786   NEONMAP1(vsha1h_u32, aarch64_crypto_sha1h, 0),
4787   NEONMAP1(vsha1mq_u32, aarch64_crypto_sha1m, 0),
4788   NEONMAP1(vsha1pq_u32, aarch64_crypto_sha1p, 0),
4789   NEONMAP1(vshld_s64, aarch64_neon_sshl, Add1ArgType),
4790   NEONMAP1(vshld_u64, aarch64_neon_ushl, Add1ArgType),
4791   NEONMAP1(vslid_n_s64, aarch64_neon_vsli, Vectorize1ArgType),
4792   NEONMAP1(vslid_n_u64, aarch64_neon_vsli, Vectorize1ArgType),
4793   NEONMAP1(vsqaddb_u8, aarch64_neon_usqadd, Vectorize1ArgType | Use64BitVectors),
4794   NEONMAP1(vsqaddd_u64, aarch64_neon_usqadd, Add1ArgType),
4795   NEONMAP1(vsqaddh_u16, aarch64_neon_usqadd, Vectorize1ArgType | Use64BitVectors),
4796   NEONMAP1(vsqadds_u32, aarch64_neon_usqadd, Add1ArgType),
4797   NEONMAP1(vsrid_n_s64, aarch64_neon_vsri, Vectorize1ArgType),
4798   NEONMAP1(vsrid_n_u64, aarch64_neon_vsri, Vectorize1ArgType),
4799   NEONMAP1(vuqaddb_s8, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
4800   NEONMAP1(vuqaddd_s64, aarch64_neon_suqadd, Add1ArgType),
4801   NEONMAP1(vuqaddh_s16, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
4802   NEONMAP1(vuqadds_s32, aarch64_neon_suqadd, Add1ArgType),
  // FP16 scalar intrinsics go here.
4804   NEONMAP1(vabdh_f16, aarch64_sisd_fabd, Add1ArgType),
4805   NEONMAP1(vcvtah_s32_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
4806   NEONMAP1(vcvtah_s64_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
4807   NEONMAP1(vcvtah_u32_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
4808   NEONMAP1(vcvtah_u64_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
4809   NEONMAP1(vcvth_n_f16_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
4810   NEONMAP1(vcvth_n_f16_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
4811   NEONMAP1(vcvth_n_f16_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
4812   NEONMAP1(vcvth_n_f16_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
4813   NEONMAP1(vcvth_n_s32_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
4814   NEONMAP1(vcvth_n_s64_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
4815   NEONMAP1(vcvth_n_u32_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
4816   NEONMAP1(vcvth_n_u64_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
4817   NEONMAP1(vcvtmh_s32_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
4818   NEONMAP1(vcvtmh_s64_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
4819   NEONMAP1(vcvtmh_u32_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
4820   NEONMAP1(vcvtmh_u64_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
4821   NEONMAP1(vcvtnh_s32_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
4822   NEONMAP1(vcvtnh_s64_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
4823   NEONMAP1(vcvtnh_u32_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
4824   NEONMAP1(vcvtnh_u64_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
4825   NEONMAP1(vcvtph_s32_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
4826   NEONMAP1(vcvtph_s64_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
4827   NEONMAP1(vcvtph_u32_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
4828   NEONMAP1(vcvtph_u64_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
4829   NEONMAP1(vmulxh_f16, aarch64_neon_fmulx, Add1ArgType),
4830   NEONMAP1(vrecpeh_f16, aarch64_neon_frecpe, Add1ArgType),
4831   NEONMAP1(vrecpxh_f16, aarch64_neon_frecpx, Add1ArgType),
4832   NEONMAP1(vrsqrteh_f16, aarch64_neon_frsqrte, Add1ArgType),
4833   NEONMAP1(vrsqrtsh_f16, aarch64_neon_frsqrts, Add1ArgType),
4834 };
4835 
4836 #undef NEONMAP0
4837 #undef NEONMAP1
4838 #undef NEONMAP2
4839 
// Lazily-set "table is proven sorted" flags, one per intrinsic map above.
// findNeonIntrinsicInMap reads and sets these (in asserts builds only) so the
// std::is_sorted check runs at most once per table per process.
static bool NEONSIMDIntrinsicsProvenSorted = false;

static bool AArch64SIMDIntrinsicsProvenSorted = false;
static bool AArch64SISDIntrinsicsProvenSorted = false;
4844 
4845 
4846 static const NeonIntrinsicInfo *
4847 findNeonIntrinsicInMap(ArrayRef<NeonIntrinsicInfo> IntrinsicMap,
4848                        unsigned BuiltinID, bool &MapProvenSorted) {
4849 
4850 #ifndef NDEBUG
4851   if (!MapProvenSorted) {
4852     assert(std::is_sorted(std::begin(IntrinsicMap), std::end(IntrinsicMap)));
4853     MapProvenSorted = true;
4854   }
4855 #endif
4856 
4857   const NeonIntrinsicInfo *Builtin =
4858       std::lower_bound(IntrinsicMap.begin(), IntrinsicMap.end(), BuiltinID);
4859 
4860   if (Builtin != IntrinsicMap.end() && Builtin->BuiltinID == BuiltinID)
4861     return Builtin;
4862 
4863   return nullptr;
4864 }
4865 
4866 Function *CodeGenFunction::LookupNeonLLVMIntrinsic(unsigned IntrinsicID,
4867                                                    unsigned Modifier,
4868                                                    llvm::Type *ArgType,
4869                                                    const CallExpr *E) {
4870   int VectorSize = 0;
4871   if (Modifier & Use64BitVectors)
4872     VectorSize = 64;
4873   else if (Modifier & Use128BitVectors)
4874     VectorSize = 128;
4875 
4876   // Return type.
4877   SmallVector<llvm::Type *, 3> Tys;
4878   if (Modifier & AddRetType) {
4879     llvm::Type *Ty = ConvertType(E->getCallReturnType(getContext()));
4880     if (Modifier & VectorizeRetType)
4881       Ty = llvm::VectorType::get(
4882           Ty, VectorSize ? VectorSize / Ty->getPrimitiveSizeInBits() : 1);
4883 
4884     Tys.push_back(Ty);
4885   }
4886 
4887   // Arguments.
4888   if (Modifier & VectorizeArgTypes) {
4889     int Elts = VectorSize ? VectorSize / ArgType->getPrimitiveSizeInBits() : 1;
4890     ArgType = llvm::VectorType::get(ArgType, Elts);
4891   }
4892 
4893   if (Modifier & (Add1ArgType | Add2ArgTypes))
4894     Tys.push_back(ArgType);
4895 
4896   if (Modifier & Add2ArgTypes)
4897     Tys.push_back(ArgType);
4898 
4899   if (Modifier & InventFloatType)
4900     Tys.push_back(FloatTy);
4901 
4902   return CGM.getIntrinsic(IntrinsicID, Tys);
4903 }
4904 
/// Emit a call to a scalar (SISD) NEON builtin via its table-mapped LLVM
/// intrinsic. Scalar operands are promoted to single-element vectors where
/// the intrinsic signature requires it, and an over-wide result is narrowed
/// back by extracting element 0.
static Value *EmitCommonNeonSISDBuiltinExpr(CodeGenFunction &CGF,
                                            const NeonIntrinsicInfo &SISDInfo,
                                            SmallVectorImpl<Value *> &Ops,
                                            const CallExpr *E) {
  unsigned BuiltinID = SISDInfo.BuiltinID;
  unsigned int Int = SISDInfo.LLVMIntrinsic;
  unsigned Modifier = SISDInfo.TypeModifier;
  const char *s = SISDInfo.NameHint;

  switch (BuiltinID) {
  case NEON::BI__builtin_neon_vcled_s64:
  case NEON::BI__builtin_neon_vcled_u64:
  case NEON::BI__builtin_neon_vcles_f32:
  case NEON::BI__builtin_neon_vcled_f64:
  case NEON::BI__builtin_neon_vcltd_s64:
  case NEON::BI__builtin_neon_vcltd_u64:
  case NEON::BI__builtin_neon_vclts_f32:
  case NEON::BI__builtin_neon_vcltd_f64:
  case NEON::BI__builtin_neon_vcales_f32:
  case NEON::BI__builtin_neon_vcaled_f64:
  case NEON::BI__builtin_neon_vcalts_f32:
  case NEON::BI__builtin_neon_vcaltd_f64:
    // Only one direction of comparisons actually exist, cmle is actually a cmge
    // with swapped operands. The table gives us the right intrinsic but we
    // still need to do the swap.
    std::swap(Ops[0], Ops[1]);
    break;
  }

  assert(Int && "Generic code assumes a valid intrinsic");

  // Determine the type(s) of this overloaded AArch64 intrinsic.
  const Expr *Arg = E->getArg(0);
  llvm::Type *ArgTy = CGF.ConvertType(Arg->getType());
  Function *F = CGF.LookupNeonLLVMIntrinsic(Int, Modifier, ArgTy, E);

  // Walk the intrinsic's formal parameters; any operand narrower than its
  // formal (a scalar vs. a 1-element vector) is promoted by inserting it
  // into element 0 of an undef vector of the formal's type.
  int j = 0;
  ConstantInt *C0 = ConstantInt::get(CGF.SizeTy, 0);
  for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
       ai != ae; ++ai, ++j) {
    llvm::Type *ArgTy = ai->getType();
    // Same bit-width: the operand already matches (possibly via bitcast later).
    if (Ops[j]->getType()->getPrimitiveSizeInBits() ==
             ArgTy->getPrimitiveSizeInBits())
      continue;

    assert(ArgTy->isVectorTy() && !Ops[j]->getType()->isVectorTy());
    // The constant argument to an _n_ intrinsic always has Int32Ty, so truncate
    // it before inserting.
    Ops[j] =
        CGF.Builder.CreateTruncOrBitCast(Ops[j], ArgTy->getVectorElementType());
    Ops[j] =
        CGF.Builder.CreateInsertElement(UndefValue::get(ArgTy), Ops[j], C0);
  }

  Value *Result = CGF.EmitNeonCall(F, Ops, s);
  llvm::Type *ResultType = CGF.ConvertType(E->getType());
  // If the intrinsic returned a wider (vector) value than the builtin's
  // scalar result type, extract lane 0; otherwise bitcast to the final type.
  if (ResultType->getPrimitiveSizeInBits() <
      Result->getType()->getPrimitiveSizeInBits())
    return CGF.Builder.CreateExtractElement(Result, C0);

  return CGF.Builder.CreateBitCast(Result, ResultType, s);
}
4967 
4968 Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
4969     unsigned BuiltinID, unsigned LLVMIntrinsic, unsigned AltLLVMIntrinsic,
4970     const char *NameHint, unsigned Modifier, const CallExpr *E,
4971     SmallVectorImpl<llvm::Value *> &Ops, Address PtrOp0, Address PtrOp1,
4972     llvm::Triple::ArchType Arch) {
4973   // Get the last argument, which specifies the vector type.
4974   llvm::APSInt NeonTypeConst;
4975   const Expr *Arg = E->getArg(E->getNumArgs() - 1);
4976   if (!Arg->isIntegerConstantExpr(NeonTypeConst, getContext()))
4977     return nullptr;
4978 
4979   // Determine the type of this overloaded NEON intrinsic.
4980   NeonTypeFlags Type(NeonTypeConst.getZExtValue());
4981   bool Usgn = Type.isUnsigned();
4982   bool Quad = Type.isQuad();
4983   const bool HasLegalHalfType = getTarget().hasLegalHalfType();
4984 
4985   llvm::VectorType *VTy = GetNeonType(this, Type, HasLegalHalfType);
4986   llvm::Type *Ty = VTy;
4987   if (!Ty)
4988     return nullptr;
4989 
4990   auto getAlignmentValue32 = [&](Address addr) -> Value* {
4991     return Builder.getInt32(addr.getAlignment().getQuantity());
4992   };
4993 
4994   unsigned Int = LLVMIntrinsic;
4995   if ((Modifier & UnsignedAlts) && !Usgn)
4996     Int = AltLLVMIntrinsic;
4997 
4998   switch (BuiltinID) {
4999   default: break;
5000   case NEON::BI__builtin_neon_vabs_v:
5001   case NEON::BI__builtin_neon_vabsq_v:
5002     if (VTy->getElementType()->isFloatingPointTy())
5003       return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, Ty), Ops, "vabs");
5004     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), Ops, "vabs");
5005   case NEON::BI__builtin_neon_vaddhn_v: {
5006     llvm::VectorType *SrcTy =
5007         llvm::VectorType::getExtendedElementVectorType(VTy);
5008 
5009     // %sum = add <4 x i32> %lhs, %rhs
5010     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
5011     Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy);
5012     Ops[0] = Builder.CreateAdd(Ops[0], Ops[1], "vaddhn");
5013 
5014     // %high = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
5015     Constant *ShiftAmt =
5016         ConstantInt::get(SrcTy, SrcTy->getScalarSizeInBits() / 2);
5017     Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vaddhn");
5018 
5019     // %res = trunc <4 x i32> %high to <4 x i16>
5020     return Builder.CreateTrunc(Ops[0], VTy, "vaddhn");
5021   }
5022   case NEON::BI__builtin_neon_vcale_v:
5023   case NEON::BI__builtin_neon_vcaleq_v:
5024   case NEON::BI__builtin_neon_vcalt_v:
5025   case NEON::BI__builtin_neon_vcaltq_v:
5026     std::swap(Ops[0], Ops[1]);
5027     LLVM_FALLTHROUGH;
5028   case NEON::BI__builtin_neon_vcage_v:
5029   case NEON::BI__builtin_neon_vcageq_v:
5030   case NEON::BI__builtin_neon_vcagt_v:
5031   case NEON::BI__builtin_neon_vcagtq_v: {
5032     llvm::Type *Ty;
5033     switch (VTy->getScalarSizeInBits()) {
5034     default: llvm_unreachable("unexpected type");
5035     case 32:
5036       Ty = FloatTy;
5037       break;
5038     case 64:
5039       Ty = DoubleTy;
5040       break;
5041     case 16:
5042       Ty = HalfTy;
5043       break;
5044     }
5045     llvm::Type *VecFlt = llvm::VectorType::get(Ty, VTy->getNumElements());
5046     llvm::Type *Tys[] = { VTy, VecFlt };
5047     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
5048     return EmitNeonCall(F, Ops, NameHint);
5049   }
5050   case NEON::BI__builtin_neon_vceqz_v:
5051   case NEON::BI__builtin_neon_vceqzq_v:
5052     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OEQ,
5053                                          ICmpInst::ICMP_EQ, "vceqz");
5054   case NEON::BI__builtin_neon_vcgez_v:
5055   case NEON::BI__builtin_neon_vcgezq_v:
5056     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGE,
5057                                          ICmpInst::ICMP_SGE, "vcgez");
5058   case NEON::BI__builtin_neon_vclez_v:
5059   case NEON::BI__builtin_neon_vclezq_v:
5060     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLE,
5061                                          ICmpInst::ICMP_SLE, "vclez");
5062   case NEON::BI__builtin_neon_vcgtz_v:
5063   case NEON::BI__builtin_neon_vcgtzq_v:
5064     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGT,
5065                                          ICmpInst::ICMP_SGT, "vcgtz");
5066   case NEON::BI__builtin_neon_vcltz_v:
5067   case NEON::BI__builtin_neon_vcltzq_v:
5068     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLT,
5069                                          ICmpInst::ICMP_SLT, "vcltz");
5070   case NEON::BI__builtin_neon_vclz_v:
5071   case NEON::BI__builtin_neon_vclzq_v:
5072     // We generate target-independent intrinsic, which needs a second argument
5073     // for whether or not clz of zero is undefined; on ARM it isn't.
5074     Ops.push_back(Builder.getInt1(getTarget().isCLZForZeroUndef()));
5075     break;
5076   case NEON::BI__builtin_neon_vcvt_f32_v:
5077   case NEON::BI__builtin_neon_vcvtq_f32_v:
5078     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
5079     Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, Quad),
5080                      HasLegalHalfType);
5081     return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
5082                 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
5083   case NEON::BI__builtin_neon_vcvt_f16_v:
5084   case NEON::BI__builtin_neon_vcvtq_f16_v:
5085     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
5086     Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float16, false, Quad),
5087                      HasLegalHalfType);
5088     return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
5089                 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
5090   case NEON::BI__builtin_neon_vcvt_n_f16_v:
5091   case NEON::BI__builtin_neon_vcvt_n_f32_v:
5092   case NEON::BI__builtin_neon_vcvt_n_f64_v:
5093   case NEON::BI__builtin_neon_vcvtq_n_f16_v:
5094   case NEON::BI__builtin_neon_vcvtq_n_f32_v:
5095   case NEON::BI__builtin_neon_vcvtq_n_f64_v: {
5096     llvm::Type *Tys[2] = { GetFloatNeonType(this, Type), Ty };
5097     Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
5098     Function *F = CGM.getIntrinsic(Int, Tys);
5099     return EmitNeonCall(F, Ops, "vcvt_n");
5100   }
5101   case NEON::BI__builtin_neon_vcvt_n_s16_v:
5102   case NEON::BI__builtin_neon_vcvt_n_s32_v:
5103   case NEON::BI__builtin_neon_vcvt_n_u16_v:
5104   case NEON::BI__builtin_neon_vcvt_n_u32_v:
5105   case NEON::BI__builtin_neon_vcvt_n_s64_v:
5106   case NEON::BI__builtin_neon_vcvt_n_u64_v:
5107   case NEON::BI__builtin_neon_vcvtq_n_s16_v:
5108   case NEON::BI__builtin_neon_vcvtq_n_s32_v:
5109   case NEON::BI__builtin_neon_vcvtq_n_u16_v:
5110   case NEON::BI__builtin_neon_vcvtq_n_u32_v:
5111   case NEON::BI__builtin_neon_vcvtq_n_s64_v:
5112   case NEON::BI__builtin_neon_vcvtq_n_u64_v: {
5113     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
5114     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
5115     return EmitNeonCall(F, Ops, "vcvt_n");
5116   }
5117   case NEON::BI__builtin_neon_vcvt_s32_v:
5118   case NEON::BI__builtin_neon_vcvt_u32_v:
5119   case NEON::BI__builtin_neon_vcvt_s64_v:
5120   case NEON::BI__builtin_neon_vcvt_u64_v:
5121   case NEON::BI__builtin_neon_vcvt_s16_v:
5122   case NEON::BI__builtin_neon_vcvt_u16_v:
5123   case NEON::BI__builtin_neon_vcvtq_s32_v:
5124   case NEON::BI__builtin_neon_vcvtq_u32_v:
5125   case NEON::BI__builtin_neon_vcvtq_s64_v:
5126   case NEON::BI__builtin_neon_vcvtq_u64_v:
5127   case NEON::BI__builtin_neon_vcvtq_s16_v:
5128   case NEON::BI__builtin_neon_vcvtq_u16_v: {
5129     Ops[0] = Builder.CreateBitCast(Ops[0], GetFloatNeonType(this, Type));
5130     return Usgn ? Builder.CreateFPToUI(Ops[0], Ty, "vcvt")
5131                 : Builder.CreateFPToSI(Ops[0], Ty, "vcvt");
5132   }
5133   case NEON::BI__builtin_neon_vcvta_s16_v:
5134   case NEON::BI__builtin_neon_vcvta_s32_v:
5135   case NEON::BI__builtin_neon_vcvta_s64_v:
5136   case NEON::BI__builtin_neon_vcvta_u16_v:
5137   case NEON::BI__builtin_neon_vcvta_u32_v:
5138   case NEON::BI__builtin_neon_vcvta_u64_v:
5139   case NEON::BI__builtin_neon_vcvtaq_s16_v:
5140   case NEON::BI__builtin_neon_vcvtaq_s32_v:
5141   case NEON::BI__builtin_neon_vcvtaq_s64_v:
5142   case NEON::BI__builtin_neon_vcvtaq_u16_v:
5143   case NEON::BI__builtin_neon_vcvtaq_u32_v:
5144   case NEON::BI__builtin_neon_vcvtaq_u64_v:
5145   case NEON::BI__builtin_neon_vcvtn_s16_v:
5146   case NEON::BI__builtin_neon_vcvtn_s32_v:
5147   case NEON::BI__builtin_neon_vcvtn_s64_v:
5148   case NEON::BI__builtin_neon_vcvtn_u16_v:
5149   case NEON::BI__builtin_neon_vcvtn_u32_v:
5150   case NEON::BI__builtin_neon_vcvtn_u64_v:
5151   case NEON::BI__builtin_neon_vcvtnq_s16_v:
5152   case NEON::BI__builtin_neon_vcvtnq_s32_v:
5153   case NEON::BI__builtin_neon_vcvtnq_s64_v:
5154   case NEON::BI__builtin_neon_vcvtnq_u16_v:
5155   case NEON::BI__builtin_neon_vcvtnq_u32_v:
5156   case NEON::BI__builtin_neon_vcvtnq_u64_v:
5157   case NEON::BI__builtin_neon_vcvtp_s16_v:
5158   case NEON::BI__builtin_neon_vcvtp_s32_v:
5159   case NEON::BI__builtin_neon_vcvtp_s64_v:
5160   case NEON::BI__builtin_neon_vcvtp_u16_v:
5161   case NEON::BI__builtin_neon_vcvtp_u32_v:
5162   case NEON::BI__builtin_neon_vcvtp_u64_v:
5163   case NEON::BI__builtin_neon_vcvtpq_s16_v:
5164   case NEON::BI__builtin_neon_vcvtpq_s32_v:
5165   case NEON::BI__builtin_neon_vcvtpq_s64_v:
5166   case NEON::BI__builtin_neon_vcvtpq_u16_v:
5167   case NEON::BI__builtin_neon_vcvtpq_u32_v:
5168   case NEON::BI__builtin_neon_vcvtpq_u64_v:
5169   case NEON::BI__builtin_neon_vcvtm_s16_v:
5170   case NEON::BI__builtin_neon_vcvtm_s32_v:
5171   case NEON::BI__builtin_neon_vcvtm_s64_v:
5172   case NEON::BI__builtin_neon_vcvtm_u16_v:
5173   case NEON::BI__builtin_neon_vcvtm_u32_v:
5174   case NEON::BI__builtin_neon_vcvtm_u64_v:
5175   case NEON::BI__builtin_neon_vcvtmq_s16_v:
5176   case NEON::BI__builtin_neon_vcvtmq_s32_v:
5177   case NEON::BI__builtin_neon_vcvtmq_s64_v:
5178   case NEON::BI__builtin_neon_vcvtmq_u16_v:
5179   case NEON::BI__builtin_neon_vcvtmq_u32_v:
5180   case NEON::BI__builtin_neon_vcvtmq_u64_v: {
5181     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
5182     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, NameHint);
5183   }
5184   case NEON::BI__builtin_neon_vext_v:
5185   case NEON::BI__builtin_neon_vextq_v: {
5186     int CV = cast<ConstantInt>(Ops[2])->getSExtValue();
5187     SmallVector<uint32_t, 16> Indices;
5188     for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
5189       Indices.push_back(i+CV);
5190 
5191     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
5192     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
5193     return Builder.CreateShuffleVector(Ops[0], Ops[1], Indices, "vext");
5194   }
5195   case NEON::BI__builtin_neon_vfma_v:
5196   case NEON::BI__builtin_neon_vfmaq_v: {
5197     Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
5198     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
5199     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
5200     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
5201 
5202     // NEON intrinsic puts accumulator first, unlike the LLVM fma.
5203     return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0]});
5204   }
5205   case NEON::BI__builtin_neon_vld1_v:
5206   case NEON::BI__builtin_neon_vld1q_v: {
5207     llvm::Type *Tys[] = {Ty, Int8PtrTy};
5208     Ops.push_back(getAlignmentValue32(PtrOp0));
5209     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "vld1");
5210   }
5211   case NEON::BI__builtin_neon_vld1_x2_v:
5212   case NEON::BI__builtin_neon_vld1q_x2_v:
5213   case NEON::BI__builtin_neon_vld1_x3_v:
5214   case NEON::BI__builtin_neon_vld1q_x3_v:
5215   case NEON::BI__builtin_neon_vld1_x4_v:
5216   case NEON::BI__builtin_neon_vld1q_x4_v: {
5217     llvm::Type *PTy = llvm::PointerType::getUnqual(VTy->getVectorElementType());
5218     Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
5219     llvm::Type *Tys[2] = { VTy, PTy };
5220     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
5221     Ops[1] = Builder.CreateCall(F, Ops[1], "vld1xN");
5222     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
5223     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
5224     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
5225   }
5226   case NEON::BI__builtin_neon_vld2_v:
5227   case NEON::BI__builtin_neon_vld2q_v:
5228   case NEON::BI__builtin_neon_vld3_v:
5229   case NEON::BI__builtin_neon_vld3q_v:
5230   case NEON::BI__builtin_neon_vld4_v:
5231   case NEON::BI__builtin_neon_vld4q_v:
5232   case NEON::BI__builtin_neon_vld2_dup_v:
5233   case NEON::BI__builtin_neon_vld2q_dup_v:
5234   case NEON::BI__builtin_neon_vld3_dup_v:
5235   case NEON::BI__builtin_neon_vld3q_dup_v:
5236   case NEON::BI__builtin_neon_vld4_dup_v:
5237   case NEON::BI__builtin_neon_vld4q_dup_v: {
5238     llvm::Type *Tys[] = {Ty, Int8PtrTy};
5239     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
5240     Value *Align = getAlignmentValue32(PtrOp1);
5241     Ops[1] = Builder.CreateCall(F, {Ops[1], Align}, NameHint);
5242     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
5243     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
5244     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
5245   }
5246   case NEON::BI__builtin_neon_vld1_dup_v:
5247   case NEON::BI__builtin_neon_vld1q_dup_v: {
5248     Value *V = UndefValue::get(Ty);
5249     Ty = llvm::PointerType::getUnqual(VTy->getElementType());
5250     PtrOp0 = Builder.CreateBitCast(PtrOp0, Ty);
5251     LoadInst *Ld = Builder.CreateLoad(PtrOp0);
5252     llvm::Constant *CI = ConstantInt::get(SizeTy, 0);
5253     Ops[0] = Builder.CreateInsertElement(V, Ld, CI);
5254     return EmitNeonSplat(Ops[0], CI);
5255   }
5256   case NEON::BI__builtin_neon_vld2_lane_v:
5257   case NEON::BI__builtin_neon_vld2q_lane_v:
5258   case NEON::BI__builtin_neon_vld3_lane_v:
5259   case NEON::BI__builtin_neon_vld3q_lane_v:
5260   case NEON::BI__builtin_neon_vld4_lane_v:
5261   case NEON::BI__builtin_neon_vld4q_lane_v: {
5262     llvm::Type *Tys[] = {Ty, Int8PtrTy};
5263     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
5264     for (unsigned I = 2; I < Ops.size() - 1; ++I)
5265       Ops[I] = Builder.CreateBitCast(Ops[I], Ty);
5266     Ops.push_back(getAlignmentValue32(PtrOp1));
5267     Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), NameHint);
5268     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
5269     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
5270     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
5271   }
5272   case NEON::BI__builtin_neon_vmovl_v: {
5273     llvm::Type *DTy =llvm::VectorType::getTruncatedElementVectorType(VTy);
5274     Ops[0] = Builder.CreateBitCast(Ops[0], DTy);
5275     if (Usgn)
5276       return Builder.CreateZExt(Ops[0], Ty, "vmovl");
5277     return Builder.CreateSExt(Ops[0], Ty, "vmovl");
5278   }
5279   case NEON::BI__builtin_neon_vmovn_v: {
5280     llvm::Type *QTy = llvm::VectorType::getExtendedElementVectorType(VTy);
5281     Ops[0] = Builder.CreateBitCast(Ops[0], QTy);
5282     return Builder.CreateTrunc(Ops[0], Ty, "vmovn");
5283   }
5284   case NEON::BI__builtin_neon_vmull_v:
5285     // FIXME: the integer vmull operations could be emitted in terms of pure
5286     // LLVM IR (2 exts followed by a mul). Unfortunately LLVM has a habit of
5287     // hoisting the exts outside loops. Until global ISel comes along that can
5288     // see through such movement this leads to bad CodeGen. So we need an
5289     // intrinsic for now.
5290     Int = Usgn ? Intrinsic::arm_neon_vmullu : Intrinsic::arm_neon_vmulls;
5291     Int = Type.isPoly() ? (unsigned)Intrinsic::arm_neon_vmullp : Int;
5292     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull");
5293   case NEON::BI__builtin_neon_vpadal_v:
5294   case NEON::BI__builtin_neon_vpadalq_v: {
5295     // The source operand type has twice as many elements of half the size.
5296     unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
5297     llvm::Type *EltTy =
5298       llvm::IntegerType::get(getLLVMContext(), EltBits / 2);
5299     llvm::Type *NarrowTy =
5300       llvm::VectorType::get(EltTy, VTy->getNumElements() * 2);
5301     llvm::Type *Tys[2] = { Ty, NarrowTy };
5302     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
5303   }
5304   case NEON::BI__builtin_neon_vpaddl_v:
5305   case NEON::BI__builtin_neon_vpaddlq_v: {
5306     // The source operand type has twice as many elements of half the size.
5307     unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
5308     llvm::Type *EltTy = llvm::IntegerType::get(getLLVMContext(), EltBits / 2);
5309     llvm::Type *NarrowTy =
5310       llvm::VectorType::get(EltTy, VTy->getNumElements() * 2);
5311     llvm::Type *Tys[2] = { Ty, NarrowTy };
5312     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vpaddl");
5313   }
5314   case NEON::BI__builtin_neon_vqdmlal_v:
5315   case NEON::BI__builtin_neon_vqdmlsl_v: {
5316     SmallVector<Value *, 2> MulOps(Ops.begin() + 1, Ops.end());
5317     Ops[1] =
5318         EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), MulOps, "vqdmlal");
5319     Ops.resize(2);
5320     return EmitNeonCall(CGM.getIntrinsic(AltLLVMIntrinsic, Ty), Ops, NameHint);
5321   }
5322   case NEON::BI__builtin_neon_vqshl_n_v:
5323   case NEON::BI__builtin_neon_vqshlq_n_v:
5324     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshl_n",
5325                         1, false);
5326   case NEON::BI__builtin_neon_vqshlu_n_v:
5327   case NEON::BI__builtin_neon_vqshluq_n_v:
5328     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshlu_n",
5329                         1, false);
5330   case NEON::BI__builtin_neon_vrecpe_v:
5331   case NEON::BI__builtin_neon_vrecpeq_v:
5332   case NEON::BI__builtin_neon_vrsqrte_v:
5333   case NEON::BI__builtin_neon_vrsqrteq_v:
5334     Int = Ty->isFPOrFPVectorTy() ? LLVMIntrinsic : AltLLVMIntrinsic;
5335     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint);
5336   case NEON::BI__builtin_neon_vrndi_v:
5337   case NEON::BI__builtin_neon_vrndiq_v:
5338     Int = Intrinsic::nearbyint;
5339     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint);
5340   case NEON::BI__builtin_neon_vrshr_n_v:
5341   case NEON::BI__builtin_neon_vrshrq_n_v:
5342     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshr_n",
5343                         1, true);
5344   case NEON::BI__builtin_neon_vshl_n_v:
5345   case NEON::BI__builtin_neon_vshlq_n_v:
5346     Ops[1] = EmitNeonShiftVector(Ops[1], Ty, false);
5347     return Builder.CreateShl(Builder.CreateBitCast(Ops[0],Ty), Ops[1],
5348                              "vshl_n");
5349   case NEON::BI__builtin_neon_vshll_n_v: {
5350     llvm::Type *SrcTy = llvm::VectorType::getTruncatedElementVectorType(VTy);
5351     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
5352     if (Usgn)
5353       Ops[0] = Builder.CreateZExt(Ops[0], VTy);
5354     else
5355       Ops[0] = Builder.CreateSExt(Ops[0], VTy);
5356     Ops[1] = EmitNeonShiftVector(Ops[1], VTy, false);
5357     return Builder.CreateShl(Ops[0], Ops[1], "vshll_n");
5358   }
5359   case NEON::BI__builtin_neon_vshrn_n_v: {
5360     llvm::Type *SrcTy = llvm::VectorType::getExtendedElementVectorType(VTy);
5361     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
5362     Ops[1] = EmitNeonShiftVector(Ops[1], SrcTy, false);
5363     if (Usgn)
5364       Ops[0] = Builder.CreateLShr(Ops[0], Ops[1]);
5365     else
5366       Ops[0] = Builder.CreateAShr(Ops[0], Ops[1]);
5367     return Builder.CreateTrunc(Ops[0], Ty, "vshrn_n");
5368   }
5369   case NEON::BI__builtin_neon_vshr_n_v:
5370   case NEON::BI__builtin_neon_vshrq_n_v:
5371     return EmitNeonRShiftImm(Ops[0], Ops[1], Ty, Usgn, "vshr_n");
5372   case NEON::BI__builtin_neon_vst1_v:
5373   case NEON::BI__builtin_neon_vst1q_v:
5374   case NEON::BI__builtin_neon_vst2_v:
5375   case NEON::BI__builtin_neon_vst2q_v:
5376   case NEON::BI__builtin_neon_vst3_v:
5377   case NEON::BI__builtin_neon_vst3q_v:
5378   case NEON::BI__builtin_neon_vst4_v:
5379   case NEON::BI__builtin_neon_vst4q_v:
5380   case NEON::BI__builtin_neon_vst2_lane_v:
5381   case NEON::BI__builtin_neon_vst2q_lane_v:
5382   case NEON::BI__builtin_neon_vst3_lane_v:
5383   case NEON::BI__builtin_neon_vst3q_lane_v:
5384   case NEON::BI__builtin_neon_vst4_lane_v:
5385   case NEON::BI__builtin_neon_vst4q_lane_v: {
5386     llvm::Type *Tys[] = {Int8PtrTy, Ty};
5387     Ops.push_back(getAlignmentValue32(PtrOp0));
5388     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "");
5389   }
5390   case NEON::BI__builtin_neon_vst1_x2_v:
5391   case NEON::BI__builtin_neon_vst1q_x2_v:
5392   case NEON::BI__builtin_neon_vst1_x3_v:
5393   case NEON::BI__builtin_neon_vst1q_x3_v:
5394   case NEON::BI__builtin_neon_vst1_x4_v:
5395   case NEON::BI__builtin_neon_vst1q_x4_v: {
5396     llvm::Type *PTy = llvm::PointerType::getUnqual(VTy->getVectorElementType());
5397     // TODO: Currently in AArch32 mode the pointer operand comes first, whereas
5398     // in AArch64 it comes last. We may want to stick to one or another.
5399     if (Arch == llvm::Triple::aarch64 || Arch == llvm::Triple::aarch64_be) {
5400       llvm::Type *Tys[2] = { VTy, PTy };
5401       std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
5402       return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "");
5403     }
5404     llvm::Type *Tys[2] = { PTy, VTy };
5405     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "");
5406   }
5407   case NEON::BI__builtin_neon_vsubhn_v: {
5408     llvm::VectorType *SrcTy =
5409         llvm::VectorType::getExtendedElementVectorType(VTy);
5410 
5411     // %sum = add <4 x i32> %lhs, %rhs
5412     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
5413     Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy);
5414     Ops[0] = Builder.CreateSub(Ops[0], Ops[1], "vsubhn");
5415 
5416     // %high = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
5417     Constant *ShiftAmt =
5418         ConstantInt::get(SrcTy, SrcTy->getScalarSizeInBits() / 2);
5419     Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vsubhn");
5420 
5421     // %res = trunc <4 x i32> %high to <4 x i16>
5422     return Builder.CreateTrunc(Ops[0], VTy, "vsubhn");
5423   }
5424   case NEON::BI__builtin_neon_vtrn_v:
5425   case NEON::BI__builtin_neon_vtrnq_v: {
5426     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
5427     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
5428     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
5429     Value *SV = nullptr;
5430 
5431     for (unsigned vi = 0; vi != 2; ++vi) {
5432       SmallVector<uint32_t, 16> Indices;
5433       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
5434         Indices.push_back(i+vi);
5435         Indices.push_back(i+e+vi);
5436       }
5437       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
5438       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vtrn");
5439       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
5440     }
5441     return SV;
5442   }
5443   case NEON::BI__builtin_neon_vtst_v:
5444   case NEON::BI__builtin_neon_vtstq_v: {
5445     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
5446     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
5447     Ops[0] = Builder.CreateAnd(Ops[0], Ops[1]);
5448     Ops[0] = Builder.CreateICmp(ICmpInst::ICMP_NE, Ops[0],
5449                                 ConstantAggregateZero::get(Ty));
5450     return Builder.CreateSExt(Ops[0], Ty, "vtst");
5451   }
5452   case NEON::BI__builtin_neon_vuzp_v:
5453   case NEON::BI__builtin_neon_vuzpq_v: {
5454     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
5455     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
5456     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
5457     Value *SV = nullptr;
5458 
5459     for (unsigned vi = 0; vi != 2; ++vi) {
5460       SmallVector<uint32_t, 16> Indices;
5461       for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
5462         Indices.push_back(2*i+vi);
5463 
5464       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
5465       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vuzp");
5466       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
5467     }
5468     return SV;
5469   }
5470   case NEON::BI__builtin_neon_vzip_v:
5471   case NEON::BI__builtin_neon_vzipq_v: {
5472     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
5473     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
5474     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
5475     Value *SV = nullptr;
5476 
5477     for (unsigned vi = 0; vi != 2; ++vi) {
5478       SmallVector<uint32_t, 16> Indices;
5479       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
5480         Indices.push_back((i + vi*e) >> 1);
5481         Indices.push_back(((i + vi*e) >> 1)+e);
5482       }
5483       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
5484       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vzip");
5485       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
5486     }
5487     return SV;
5488   }
5489   case NEON::BI__builtin_neon_vdot_v:
5490   case NEON::BI__builtin_neon_vdotq_v: {
5491     llvm::Type *InputTy =
5492         llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
5493     llvm::Type *Tys[2] = { Ty, InputTy };
5494     Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
5495     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vdot");
5496   }
5497   case NEON::BI__builtin_neon_vfmlal_low_v:
5498   case NEON::BI__builtin_neon_vfmlalq_low_v: {
5499     llvm::Type *InputTy =
5500         llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
5501     llvm::Type *Tys[2] = { Ty, InputTy };
5502     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_low");
5503   }
5504   case NEON::BI__builtin_neon_vfmlsl_low_v:
5505   case NEON::BI__builtin_neon_vfmlslq_low_v: {
5506     llvm::Type *InputTy =
5507         llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
5508     llvm::Type *Tys[2] = { Ty, InputTy };
5509     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_low");
5510   }
5511   case NEON::BI__builtin_neon_vfmlal_high_v:
5512   case NEON::BI__builtin_neon_vfmlalq_high_v: {
5513     llvm::Type *InputTy =
5514            llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
5515     llvm::Type *Tys[2] = { Ty, InputTy };
5516     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_high");
5517   }
5518   case NEON::BI__builtin_neon_vfmlsl_high_v:
5519   case NEON::BI__builtin_neon_vfmlslq_high_v: {
5520     llvm::Type *InputTy =
5521            llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
5522     llvm::Type *Tys[2] = { Ty, InputTy };
5523     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_high");
5524   }
5525   }
5526 
5527   assert(Int && "Expected valid intrinsic number");
5528 
5529   // Determine the type(s) of this overloaded AArch64 intrinsic.
5530   Function *F = LookupNeonLLVMIntrinsic(Int, Modifier, Ty, E);
5531 
5532   Value *Result = EmitNeonCall(F, Ops, NameHint);
5533   llvm::Type *ResultType = ConvertType(E->getType());
5534   // AArch64 intrinsic one-element vector type cast to
5535   // scalar type expected by the builtin
5536   return Builder.CreateBitCast(Result, ResultType, NameHint);
5537 }
5538 
5539 Value *CodeGenFunction::EmitAArch64CompareBuiltinExpr(
5540     Value *Op, llvm::Type *Ty, const CmpInst::Predicate Fp,
5541     const CmpInst::Predicate Ip, const Twine &Name) {
5542   llvm::Type *OTy = Op->getType();
5543 
5544   // FIXME: this is utterly horrific. We should not be looking at previous
5545   // codegen context to find out what needs doing. Unfortunately TableGen
5546   // currently gives us exactly the same calls for vceqz_f32 and vceqz_s32
5547   // (etc).
5548   if (BitCastInst *BI = dyn_cast<BitCastInst>(Op))
5549     OTy = BI->getOperand(0)->getType();
5550 
5551   Op = Builder.CreateBitCast(Op, OTy);
5552   if (OTy->getScalarType()->isFloatingPointTy()) {
5553     Op = Builder.CreateFCmp(Fp, Op, Constant::getNullValue(OTy));
5554   } else {
5555     Op = Builder.CreateICmp(Ip, Op, Constant::getNullValue(OTy));
5556   }
5557   return Builder.CreateSExt(Op, Ty, Name);
5558 }
5559 
5560 static Value *packTBLDVectorList(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
5561                                  Value *ExtOp, Value *IndexOp,
5562                                  llvm::Type *ResTy, unsigned IntID,
5563                                  const char *Name) {
5564   SmallVector<Value *, 2> TblOps;
5565   if (ExtOp)
5566     TblOps.push_back(ExtOp);
5567 
5568   // Build a vector containing sequential number like (0, 1, 2, ..., 15)
5569   SmallVector<uint32_t, 16> Indices;
5570   llvm::VectorType *TblTy = cast<llvm::VectorType>(Ops[0]->getType());
5571   for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) {
5572     Indices.push_back(2*i);
5573     Indices.push_back(2*i+1);
5574   }
5575 
5576   int PairPos = 0, End = Ops.size() - 1;
5577   while (PairPos < End) {
5578     TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos],
5579                                                      Ops[PairPos+1], Indices,
5580                                                      Name));
5581     PairPos += 2;
5582   }
5583 
5584   // If there's an odd number of 64-bit lookup table, fill the high 64-bit
5585   // of the 128-bit lookup table with zero.
5586   if (PairPos == End) {
5587     Value *ZeroTbl = ConstantAggregateZero::get(TblTy);
5588     TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos],
5589                                                      ZeroTbl, Indices, Name));
5590   }
5591 
5592   Function *TblF;
5593   TblOps.push_back(IndexOp);
5594   TblF = CGF.CGM.getIntrinsic(IntID, ResTy);
5595 
5596   return CGF.EmitNeonCall(TblF, TblOps, Name);
5597 }
5598 
5599 Value *CodeGenFunction::GetValueForARMHint(unsigned BuiltinID) {
5600   unsigned Value;
5601   switch (BuiltinID) {
5602   default:
5603     return nullptr;
5604   case ARM::BI__builtin_arm_nop:
5605     Value = 0;
5606     break;
5607   case ARM::BI__builtin_arm_yield:
5608   case ARM::BI__yield:
5609     Value = 1;
5610     break;
5611   case ARM::BI__builtin_arm_wfe:
5612   case ARM::BI__wfe:
5613     Value = 2;
5614     break;
5615   case ARM::BI__builtin_arm_wfi:
5616   case ARM::BI__wfi:
5617     Value = 3;
5618     break;
5619   case ARM::BI__builtin_arm_sev:
5620   case ARM::BI__sev:
5621     Value = 4;
5622     break;
5623   case ARM::BI__builtin_arm_sevl:
5624   case ARM::BI__sevl:
5625     Value = 5;
5626     break;
5627   }
5628 
5629   return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_hint),
5630                             llvm::ConstantInt::get(Int32Ty, Value));
5631 }
5632 
5633 // Generates the IR for the read/write special register builtin,
5634 // ValueType is the type of the value that is to be written or read,
5635 // RegisterType is the type of the register being written to or read from.
5636 static Value *EmitSpecialRegisterBuiltin(CodeGenFunction &CGF,
5637                                          const CallExpr *E,
5638                                          llvm::Type *RegisterType,
5639                                          llvm::Type *ValueType,
5640                                          bool IsRead,
5641                                          StringRef SysReg = "") {
5642   // write and register intrinsics only support 32 and 64 bit operations.
5643   assert((RegisterType->isIntegerTy(32) || RegisterType->isIntegerTy(64))
5644           && "Unsupported size for register.");
5645 
5646   CodeGen::CGBuilderTy &Builder = CGF.Builder;
5647   CodeGen::CodeGenModule &CGM = CGF.CGM;
5648   LLVMContext &Context = CGM.getLLVMContext();
5649 
5650   if (SysReg.empty()) {
5651     const Expr *SysRegStrExpr = E->getArg(0)->IgnoreParenCasts();
5652     SysReg = cast<clang::StringLiteral>(SysRegStrExpr)->getString();
5653   }
5654 
5655   llvm::Metadata *Ops[] = { llvm::MDString::get(Context, SysReg) };
5656   llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
5657   llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
5658 
5659   llvm::Type *Types[] = { RegisterType };
5660 
5661   bool MixedTypes = RegisterType->isIntegerTy(64) && ValueType->isIntegerTy(32);
5662   assert(!(RegisterType->isIntegerTy(32) && ValueType->isIntegerTy(64))
5663             && "Can't fit 64-bit value in 32-bit register");
5664 
5665   if (IsRead) {
5666     llvm::Value *F = CGM.getIntrinsic(llvm::Intrinsic::read_register, Types);
5667     llvm::Value *Call = Builder.CreateCall(F, Metadata);
5668 
5669     if (MixedTypes)
5670       // Read into 64 bit register and then truncate result to 32 bit.
5671       return Builder.CreateTrunc(Call, ValueType);
5672 
5673     if (ValueType->isPointerTy())
5674       // Have i32/i64 result (Call) but want to return a VoidPtrTy (i8*).
5675       return Builder.CreateIntToPtr(Call, ValueType);
5676 
5677     return Call;
5678   }
5679 
5680   llvm::Value *F = CGM.getIntrinsic(llvm::Intrinsic::write_register, Types);
5681   llvm::Value *ArgValue = CGF.EmitScalarExpr(E->getArg(1));
5682   if (MixedTypes) {
5683     // Extend 32 bit write value to 64 bit to pass to write.
5684     ArgValue = Builder.CreateZExt(ArgValue, RegisterType);
5685     return Builder.CreateCall(F, { Metadata, ArgValue });
5686   }
5687 
5688   if (ValueType->isPointerTy()) {
5689     // Have VoidPtrTy ArgValue but want to return an i32/i64.
5690     ArgValue = Builder.CreatePtrToInt(ArgValue, RegisterType);
5691     return Builder.CreateCall(F, { Metadata, ArgValue });
5692   }
5693 
5694   return Builder.CreateCall(F, { Metadata, ArgValue });
5695 }
5696 
5697 /// Return true if BuiltinID is an overloaded Neon intrinsic with an extra
5698 /// argument that specifies the vector type.
5699 static bool HasExtraNeonArgument(unsigned BuiltinID) {
5700   switch (BuiltinID) {
5701   default: break;
5702   case NEON::BI__builtin_neon_vget_lane_i8:
5703   case NEON::BI__builtin_neon_vget_lane_i16:
5704   case NEON::BI__builtin_neon_vget_lane_i32:
5705   case NEON::BI__builtin_neon_vget_lane_i64:
5706   case NEON::BI__builtin_neon_vget_lane_f32:
5707   case NEON::BI__builtin_neon_vgetq_lane_i8:
5708   case NEON::BI__builtin_neon_vgetq_lane_i16:
5709   case NEON::BI__builtin_neon_vgetq_lane_i32:
5710   case NEON::BI__builtin_neon_vgetq_lane_i64:
5711   case NEON::BI__builtin_neon_vgetq_lane_f32:
5712   case NEON::BI__builtin_neon_vset_lane_i8:
5713   case NEON::BI__builtin_neon_vset_lane_i16:
5714   case NEON::BI__builtin_neon_vset_lane_i32:
5715   case NEON::BI__builtin_neon_vset_lane_i64:
5716   case NEON::BI__builtin_neon_vset_lane_f32:
5717   case NEON::BI__builtin_neon_vsetq_lane_i8:
5718   case NEON::BI__builtin_neon_vsetq_lane_i16:
5719   case NEON::BI__builtin_neon_vsetq_lane_i32:
5720   case NEON::BI__builtin_neon_vsetq_lane_i64:
5721   case NEON::BI__builtin_neon_vsetq_lane_f32:
5722   case NEON::BI__builtin_neon_vsha1h_u32:
5723   case NEON::BI__builtin_neon_vsha1cq_u32:
5724   case NEON::BI__builtin_neon_vsha1pq_u32:
5725   case NEON::BI__builtin_neon_vsha1mq_u32:
5726   case clang::ARM::BI_MoveToCoprocessor:
5727   case clang::ARM::BI_MoveToCoprocessor2:
5728     return false;
5729   }
5730   return true;
5731 }
5732 
5733 Value *CodeGenFunction::EmitISOVolatileLoad(const CallExpr *E) {
5734   Value *Ptr = EmitScalarExpr(E->getArg(0));
5735   QualType ElTy = E->getArg(0)->getType()->getPointeeType();
5736   CharUnits LoadSize = getContext().getTypeSizeInChars(ElTy);
5737   llvm::Type *ITy = llvm::IntegerType::get(getLLVMContext(),
5738                                            LoadSize.getQuantity() * 8);
5739   Ptr = Builder.CreateBitCast(Ptr, ITy->getPointerTo());
5740   llvm::LoadInst *Load =
5741     Builder.CreateAlignedLoad(Ptr, LoadSize);
5742   Load->setVolatile(true);
5743   return Load;
5744 }
5745 
5746 Value *CodeGenFunction::EmitISOVolatileStore(const CallExpr *E) {
5747   Value *Ptr = EmitScalarExpr(E->getArg(0));
5748   Value *Value = EmitScalarExpr(E->getArg(1));
5749   QualType ElTy = E->getArg(0)->getType()->getPointeeType();
5750   CharUnits StoreSize = getContext().getTypeSizeInChars(ElTy);
5751   llvm::Type *ITy = llvm::IntegerType::get(getLLVMContext(),
5752                                            StoreSize.getQuantity() * 8);
5753   Ptr = Builder.CreateBitCast(Ptr, ITy->getPointerTo());
5754   llvm::StoreInst *Store =
5755     Builder.CreateAlignedStore(Value, Ptr,
5756                                StoreSize);
5757   Store->setVolatile(true);
5758   return Store;
5759 }
5760 
5761 Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
5762                                            const CallExpr *E,
5763                                            llvm::Triple::ArchType Arch) {
5764   if (auto Hint = GetValueForARMHint(BuiltinID))
5765     return Hint;
5766 
5767   if (BuiltinID == ARM::BI__emit) {
5768     bool IsThumb = getTarget().getTriple().getArch() == llvm::Triple::thumb;
5769     llvm::FunctionType *FTy =
5770         llvm::FunctionType::get(VoidTy, /*Variadic=*/false);
5771 
5772     APSInt Value;
5773     if (!E->getArg(0)->EvaluateAsInt(Value, CGM.getContext()))
5774       llvm_unreachable("Sema will ensure that the parameter is constant");
5775 
5776     uint64_t ZExtValue = Value.zextOrTrunc(IsThumb ? 16 : 32).getZExtValue();
5777 
5778     llvm::InlineAsm *Emit =
5779         IsThumb ? InlineAsm::get(FTy, ".inst.n 0x" + utohexstr(ZExtValue), "",
5780                                  /*SideEffects=*/true)
5781                 : InlineAsm::get(FTy, ".inst 0x" + utohexstr(ZExtValue), "",
5782                                  /*SideEffects=*/true);
5783 
5784     return Builder.CreateCall(Emit);
5785   }
5786 
5787   if (BuiltinID == ARM::BI__builtin_arm_dbg) {
5788     Value *Option = EmitScalarExpr(E->getArg(0));
5789     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_dbg), Option);
5790   }
5791 
5792   if (BuiltinID == ARM::BI__builtin_arm_prefetch) {
5793     Value *Address = EmitScalarExpr(E->getArg(0));
5794     Value *RW      = EmitScalarExpr(E->getArg(1));
5795     Value *IsData  = EmitScalarExpr(E->getArg(2));
5796 
5797     // Locality is not supported on ARM target
5798     Value *Locality = llvm::ConstantInt::get(Int32Ty, 3);
5799 
5800     Value *F = CGM.getIntrinsic(Intrinsic::prefetch);
5801     return Builder.CreateCall(F, {Address, RW, Locality, IsData});
5802   }
5803 
5804   if (BuiltinID == ARM::BI__builtin_arm_rbit) {
5805     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
5806     return Builder.CreateCall(
5807         CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
5808   }
5809 
5810   if (BuiltinID == ARM::BI__clear_cache) {
5811     assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
5812     const FunctionDecl *FD = E->getDirectCallee();
5813     Value *Ops[2];
5814     for (unsigned i = 0; i < 2; i++)
5815       Ops[i] = EmitScalarExpr(E->getArg(i));
5816     llvm::Type *Ty = CGM.getTypes().ConvertType(FD->getType());
5817     llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
5818     StringRef Name = FD->getName();
5819     return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops);
5820   }
5821 
5822   if (BuiltinID == ARM::BI__builtin_arm_mcrr ||
5823       BuiltinID == ARM::BI__builtin_arm_mcrr2) {
5824     Function *F;
5825 
5826     switch (BuiltinID) {
5827     default: llvm_unreachable("unexpected builtin");
5828     case ARM::BI__builtin_arm_mcrr:
5829       F = CGM.getIntrinsic(Intrinsic::arm_mcrr);
5830       break;
5831     case ARM::BI__builtin_arm_mcrr2:
5832       F = CGM.getIntrinsic(Intrinsic::arm_mcrr2);
5833       break;
5834     }
5835 
5836     // MCRR{2} instruction has 5 operands but
5837     // the intrinsic has 4 because Rt and Rt2
5838     // are represented as a single unsigned 64
5839     // bit integer in the intrinsic definition
5840     // but internally it's represented as 2 32
5841     // bit integers.
5842 
5843     Value *Coproc = EmitScalarExpr(E->getArg(0));
5844     Value *Opc1 = EmitScalarExpr(E->getArg(1));
5845     Value *RtAndRt2 = EmitScalarExpr(E->getArg(2));
5846     Value *CRm = EmitScalarExpr(E->getArg(3));
5847 
5848     Value *C1 = llvm::ConstantInt::get(Int64Ty, 32);
5849     Value *Rt = Builder.CreateTruncOrBitCast(RtAndRt2, Int32Ty);
5850     Value *Rt2 = Builder.CreateLShr(RtAndRt2, C1);
5851     Rt2 = Builder.CreateTruncOrBitCast(Rt2, Int32Ty);
5852 
5853     return Builder.CreateCall(F, {Coproc, Opc1, Rt, Rt2, CRm});
5854   }
5855 
5856   if (BuiltinID == ARM::BI__builtin_arm_mrrc ||
5857       BuiltinID == ARM::BI__builtin_arm_mrrc2) {
5858     Function *F;
5859 
5860     switch (BuiltinID) {
5861     default: llvm_unreachable("unexpected builtin");
5862     case ARM::BI__builtin_arm_mrrc:
5863       F = CGM.getIntrinsic(Intrinsic::arm_mrrc);
5864       break;
5865     case ARM::BI__builtin_arm_mrrc2:
5866       F = CGM.getIntrinsic(Intrinsic::arm_mrrc2);
5867       break;
5868     }
5869 
5870     Value *Coproc = EmitScalarExpr(E->getArg(0));
5871     Value *Opc1 = EmitScalarExpr(E->getArg(1));
5872     Value *CRm  = EmitScalarExpr(E->getArg(2));
5873     Value *RtAndRt2 = Builder.CreateCall(F, {Coproc, Opc1, CRm});
5874 
5875     // Returns an unsigned 64 bit integer, represented
5876     // as two 32 bit integers.
5877 
5878     Value *Rt = Builder.CreateExtractValue(RtAndRt2, 1);
5879     Value *Rt1 = Builder.CreateExtractValue(RtAndRt2, 0);
5880     Rt = Builder.CreateZExt(Rt, Int64Ty);
5881     Rt1 = Builder.CreateZExt(Rt1, Int64Ty);
5882 
5883     Value *ShiftCast = llvm::ConstantInt::get(Int64Ty, 32);
5884     RtAndRt2 = Builder.CreateShl(Rt, ShiftCast, "shl", true);
5885     RtAndRt2 = Builder.CreateOr(RtAndRt2, Rt1);
5886 
5887     return Builder.CreateBitCast(RtAndRt2, ConvertType(E->getType()));
5888   }
5889 
5890   if (BuiltinID == ARM::BI__builtin_arm_ldrexd ||
5891       ((BuiltinID == ARM::BI__builtin_arm_ldrex ||
5892         BuiltinID == ARM::BI__builtin_arm_ldaex) &&
5893        getContext().getTypeSize(E->getType()) == 64) ||
5894       BuiltinID == ARM::BI__ldrexd) {
5895     Function *F;
5896 
5897     switch (BuiltinID) {
5898     default: llvm_unreachable("unexpected builtin");
5899     case ARM::BI__builtin_arm_ldaex:
5900       F = CGM.getIntrinsic(Intrinsic::arm_ldaexd);
5901       break;
5902     case ARM::BI__builtin_arm_ldrexd:
5903     case ARM::BI__builtin_arm_ldrex:
5904     case ARM::BI__ldrexd:
5905       F = CGM.getIntrinsic(Intrinsic::arm_ldrexd);
5906       break;
5907     }
5908 
5909     Value *LdPtr = EmitScalarExpr(E->getArg(0));
5910     Value *Val = Builder.CreateCall(F, Builder.CreateBitCast(LdPtr, Int8PtrTy),
5911                                     "ldrexd");
5912 
5913     Value *Val0 = Builder.CreateExtractValue(Val, 1);
5914     Value *Val1 = Builder.CreateExtractValue(Val, 0);
5915     Val0 = Builder.CreateZExt(Val0, Int64Ty);
5916     Val1 = Builder.CreateZExt(Val1, Int64Ty);
5917 
5918     Value *ShiftCst = llvm::ConstantInt::get(Int64Ty, 32);
5919     Val = Builder.CreateShl(Val0, ShiftCst, "shl", true /* nuw */);
5920     Val = Builder.CreateOr(Val, Val1);
5921     return Builder.CreateBitCast(Val, ConvertType(E->getType()));
5922   }
5923 
5924   if (BuiltinID == ARM::BI__builtin_arm_ldrex ||
5925       BuiltinID == ARM::BI__builtin_arm_ldaex) {
5926     Value *LoadAddr = EmitScalarExpr(E->getArg(0));
5927 
5928     QualType Ty = E->getType();
5929     llvm::Type *RealResTy = ConvertType(Ty);
5930     llvm::Type *PtrTy = llvm::IntegerType::get(
5931         getLLVMContext(), getContext().getTypeSize(Ty))->getPointerTo();
5932     LoadAddr = Builder.CreateBitCast(LoadAddr, PtrTy);
5933 
5934     Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI__builtin_arm_ldaex
5935                                        ? Intrinsic::arm_ldaex
5936                                        : Intrinsic::arm_ldrex,
5937                                    PtrTy);
5938     Value *Val = Builder.CreateCall(F, LoadAddr, "ldrex");
5939 
5940     if (RealResTy->isPointerTy())
5941       return Builder.CreateIntToPtr(Val, RealResTy);
5942     else {
5943       llvm::Type *IntResTy = llvm::IntegerType::get(
5944           getLLVMContext(), CGM.getDataLayout().getTypeSizeInBits(RealResTy));
5945       Val = Builder.CreateTruncOrBitCast(Val, IntResTy);
5946       return Builder.CreateBitCast(Val, RealResTy);
5947     }
5948   }
5949 
5950   if (BuiltinID == ARM::BI__builtin_arm_strexd ||
5951       ((BuiltinID == ARM::BI__builtin_arm_stlex ||
5952         BuiltinID == ARM::BI__builtin_arm_strex) &&
5953        getContext().getTypeSize(E->getArg(0)->getType()) == 64)) {
5954     Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI__builtin_arm_stlex
5955                                        ? Intrinsic::arm_stlexd
5956                                        : Intrinsic::arm_strexd);
5957     llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty);
5958 
5959     Address Tmp = CreateMemTemp(E->getArg(0)->getType());
5960     Value *Val = EmitScalarExpr(E->getArg(0));
5961     Builder.CreateStore(Val, Tmp);
5962 
5963     Address LdPtr = Builder.CreateBitCast(Tmp,llvm::PointerType::getUnqual(STy));
5964     Val = Builder.CreateLoad(LdPtr);
5965 
5966     Value *Arg0 = Builder.CreateExtractValue(Val, 0);
5967     Value *Arg1 = Builder.CreateExtractValue(Val, 1);
5968     Value *StPtr = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)), Int8PtrTy);
5969     return Builder.CreateCall(F, {Arg0, Arg1, StPtr}, "strexd");
5970   }
5971 
5972   if (BuiltinID == ARM::BI__builtin_arm_strex ||
5973       BuiltinID == ARM::BI__builtin_arm_stlex) {
5974     Value *StoreVal = EmitScalarExpr(E->getArg(0));
5975     Value *StoreAddr = EmitScalarExpr(E->getArg(1));
5976 
5977     QualType Ty = E->getArg(0)->getType();
5978     llvm::Type *StoreTy = llvm::IntegerType::get(getLLVMContext(),
5979                                                  getContext().getTypeSize(Ty));
5980     StoreAddr = Builder.CreateBitCast(StoreAddr, StoreTy->getPointerTo());
5981 
5982     if (StoreVal->getType()->isPointerTy())
5983       StoreVal = Builder.CreatePtrToInt(StoreVal, Int32Ty);
5984     else {
5985       llvm::Type *IntTy = llvm::IntegerType::get(
5986           getLLVMContext(),
5987           CGM.getDataLayout().getTypeSizeInBits(StoreVal->getType()));
5988       StoreVal = Builder.CreateBitCast(StoreVal, IntTy);
5989       StoreVal = Builder.CreateZExtOrBitCast(StoreVal, Int32Ty);
5990     }
5991 
5992     Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI__builtin_arm_stlex
5993                                        ? Intrinsic::arm_stlex
5994                                        : Intrinsic::arm_strex,
5995                                    StoreAddr->getType());
5996     return Builder.CreateCall(F, {StoreVal, StoreAddr}, "strex");
5997   }
5998 
5999   switch (BuiltinID) {
6000   case ARM::BI__iso_volatile_load8:
6001   case ARM::BI__iso_volatile_load16:
6002   case ARM::BI__iso_volatile_load32:
6003   case ARM::BI__iso_volatile_load64:
6004     return EmitISOVolatileLoad(E);
6005   case ARM::BI__iso_volatile_store8:
6006   case ARM::BI__iso_volatile_store16:
6007   case ARM::BI__iso_volatile_store32:
6008   case ARM::BI__iso_volatile_store64:
6009     return EmitISOVolatileStore(E);
6010   }
6011 
6012   if (BuiltinID == ARM::BI__builtin_arm_clrex) {
6013     Function *F = CGM.getIntrinsic(Intrinsic::arm_clrex);
6014     return Builder.CreateCall(F);
6015   }
6016 
6017   // CRC32
6018   Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic;
6019   switch (BuiltinID) {
6020   case ARM::BI__builtin_arm_crc32b:
6021     CRCIntrinsicID = Intrinsic::arm_crc32b; break;
6022   case ARM::BI__builtin_arm_crc32cb:
6023     CRCIntrinsicID = Intrinsic::arm_crc32cb; break;
6024   case ARM::BI__builtin_arm_crc32h:
6025     CRCIntrinsicID = Intrinsic::arm_crc32h; break;
6026   case ARM::BI__builtin_arm_crc32ch:
6027     CRCIntrinsicID = Intrinsic::arm_crc32ch; break;
6028   case ARM::BI__builtin_arm_crc32w:
6029   case ARM::BI__builtin_arm_crc32d:
6030     CRCIntrinsicID = Intrinsic::arm_crc32w; break;
6031   case ARM::BI__builtin_arm_crc32cw:
6032   case ARM::BI__builtin_arm_crc32cd:
6033     CRCIntrinsicID = Intrinsic::arm_crc32cw; break;
6034   }
6035 
6036   if (CRCIntrinsicID != Intrinsic::not_intrinsic) {
6037     Value *Arg0 = EmitScalarExpr(E->getArg(0));
6038     Value *Arg1 = EmitScalarExpr(E->getArg(1));
6039 
6040     // crc32{c,}d intrinsics are implemnted as two calls to crc32{c,}w
6041     // intrinsics, hence we need different codegen for these cases.
6042     if (BuiltinID == ARM::BI__builtin_arm_crc32d ||
6043         BuiltinID == ARM::BI__builtin_arm_crc32cd) {
6044       Value *C1 = llvm::ConstantInt::get(Int64Ty, 32);
6045       Value *Arg1a = Builder.CreateTruncOrBitCast(Arg1, Int32Ty);
6046       Value *Arg1b = Builder.CreateLShr(Arg1, C1);
6047       Arg1b = Builder.CreateTruncOrBitCast(Arg1b, Int32Ty);
6048 
6049       Function *F = CGM.getIntrinsic(CRCIntrinsicID);
6050       Value *Res = Builder.CreateCall(F, {Arg0, Arg1a});
6051       return Builder.CreateCall(F, {Res, Arg1b});
6052     } else {
6053       Arg1 = Builder.CreateZExtOrBitCast(Arg1, Int32Ty);
6054 
6055       Function *F = CGM.getIntrinsic(CRCIntrinsicID);
6056       return Builder.CreateCall(F, {Arg0, Arg1});
6057     }
6058   }
6059 
6060   if (BuiltinID == ARM::BI__builtin_arm_rsr ||
6061       BuiltinID == ARM::BI__builtin_arm_rsr64 ||
6062       BuiltinID == ARM::BI__builtin_arm_rsrp ||
6063       BuiltinID == ARM::BI__builtin_arm_wsr ||
6064       BuiltinID == ARM::BI__builtin_arm_wsr64 ||
6065       BuiltinID == ARM::BI__builtin_arm_wsrp) {
6066 
6067     bool IsRead = BuiltinID == ARM::BI__builtin_arm_rsr ||
6068                   BuiltinID == ARM::BI__builtin_arm_rsr64 ||
6069                   BuiltinID == ARM::BI__builtin_arm_rsrp;
6070 
6071     bool IsPointerBuiltin = BuiltinID == ARM::BI__builtin_arm_rsrp ||
6072                             BuiltinID == ARM::BI__builtin_arm_wsrp;
6073 
6074     bool Is64Bit = BuiltinID == ARM::BI__builtin_arm_rsr64 ||
6075                    BuiltinID == ARM::BI__builtin_arm_wsr64;
6076 
6077     llvm::Type *ValueType;
6078     llvm::Type *RegisterType;
6079     if (IsPointerBuiltin) {
6080       ValueType = VoidPtrTy;
6081       RegisterType = Int32Ty;
6082     } else if (Is64Bit) {
6083       ValueType = RegisterType = Int64Ty;
6084     } else {
6085       ValueType = RegisterType = Int32Ty;
6086     }
6087 
6088     return EmitSpecialRegisterBuiltin(*this, E, RegisterType, ValueType, IsRead);
6089   }
6090 
6091   // Find out if any arguments are required to be integer constant
6092   // expressions.
6093   unsigned ICEArguments = 0;
6094   ASTContext::GetBuiltinTypeError Error;
6095   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
6096   assert(Error == ASTContext::GE_None && "Should not codegen an error");
6097 
6098   auto getAlignmentValue32 = [&](Address addr) -> Value* {
6099     return Builder.getInt32(addr.getAlignment().getQuantity());
6100   };
6101 
6102   Address PtrOp0 = Address::invalid();
6103   Address PtrOp1 = Address::invalid();
6104   SmallVector<Value*, 4> Ops;
6105   bool HasExtraArg = HasExtraNeonArgument(BuiltinID);
6106   unsigned NumArgs = E->getNumArgs() - (HasExtraArg ? 1 : 0);
6107   for (unsigned i = 0, e = NumArgs; i != e; i++) {
6108     if (i == 0) {
6109       switch (BuiltinID) {
6110       case NEON::BI__builtin_neon_vld1_v:
6111       case NEON::BI__builtin_neon_vld1q_v:
6112       case NEON::BI__builtin_neon_vld1q_lane_v:
6113       case NEON::BI__builtin_neon_vld1_lane_v:
6114       case NEON::BI__builtin_neon_vld1_dup_v:
6115       case NEON::BI__builtin_neon_vld1q_dup_v:
6116       case NEON::BI__builtin_neon_vst1_v:
6117       case NEON::BI__builtin_neon_vst1q_v:
6118       case NEON::BI__builtin_neon_vst1q_lane_v:
6119       case NEON::BI__builtin_neon_vst1_lane_v:
6120       case NEON::BI__builtin_neon_vst2_v:
6121       case NEON::BI__builtin_neon_vst2q_v:
6122       case NEON::BI__builtin_neon_vst2_lane_v:
6123       case NEON::BI__builtin_neon_vst2q_lane_v:
6124       case NEON::BI__builtin_neon_vst3_v:
6125       case NEON::BI__builtin_neon_vst3q_v:
6126       case NEON::BI__builtin_neon_vst3_lane_v:
6127       case NEON::BI__builtin_neon_vst3q_lane_v:
6128       case NEON::BI__builtin_neon_vst4_v:
6129       case NEON::BI__builtin_neon_vst4q_v:
6130       case NEON::BI__builtin_neon_vst4_lane_v:
6131       case NEON::BI__builtin_neon_vst4q_lane_v:
6132         // Get the alignment for the argument in addition to the value;
6133         // we'll use it later.
6134         PtrOp0 = EmitPointerWithAlignment(E->getArg(0));
6135         Ops.push_back(PtrOp0.getPointer());
6136         continue;
6137       }
6138     }
6139     if (i == 1) {
6140       switch (BuiltinID) {
6141       case NEON::BI__builtin_neon_vld2_v:
6142       case NEON::BI__builtin_neon_vld2q_v:
6143       case NEON::BI__builtin_neon_vld3_v:
6144       case NEON::BI__builtin_neon_vld3q_v:
6145       case NEON::BI__builtin_neon_vld4_v:
6146       case NEON::BI__builtin_neon_vld4q_v:
6147       case NEON::BI__builtin_neon_vld2_lane_v:
6148       case NEON::BI__builtin_neon_vld2q_lane_v:
6149       case NEON::BI__builtin_neon_vld3_lane_v:
6150       case NEON::BI__builtin_neon_vld3q_lane_v:
6151       case NEON::BI__builtin_neon_vld4_lane_v:
6152       case NEON::BI__builtin_neon_vld4q_lane_v:
6153       case NEON::BI__builtin_neon_vld2_dup_v:
6154       case NEON::BI__builtin_neon_vld2q_dup_v:
6155       case NEON::BI__builtin_neon_vld3_dup_v:
6156       case NEON::BI__builtin_neon_vld3q_dup_v:
6157       case NEON::BI__builtin_neon_vld4_dup_v:
6158       case NEON::BI__builtin_neon_vld4q_dup_v:
6159         // Get the alignment for the argument in addition to the value;
6160         // we'll use it later.
6161         PtrOp1 = EmitPointerWithAlignment(E->getArg(1));
6162         Ops.push_back(PtrOp1.getPointer());
6163         continue;
6164       }
6165     }
6166 
6167     if ((ICEArguments & (1 << i)) == 0) {
6168       Ops.push_back(EmitScalarExpr(E->getArg(i)));
6169     } else {
6170       // If this is required to be a constant, constant fold it so that we know
6171       // that the generated intrinsic gets a ConstantInt.
6172       llvm::APSInt Result;
6173       bool IsConst = E->getArg(i)->isIntegerConstantExpr(Result, getContext());
6174       assert(IsConst && "Constant arg isn't actually constant?"); (void)IsConst;
6175       Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), Result));
6176     }
6177   }
6178 
6179   switch (BuiltinID) {
6180   default: break;
6181 
6182   case NEON::BI__builtin_neon_vget_lane_i8:
6183   case NEON::BI__builtin_neon_vget_lane_i16:
6184   case NEON::BI__builtin_neon_vget_lane_i32:
6185   case NEON::BI__builtin_neon_vget_lane_i64:
6186   case NEON::BI__builtin_neon_vget_lane_f32:
6187   case NEON::BI__builtin_neon_vgetq_lane_i8:
6188   case NEON::BI__builtin_neon_vgetq_lane_i16:
6189   case NEON::BI__builtin_neon_vgetq_lane_i32:
6190   case NEON::BI__builtin_neon_vgetq_lane_i64:
6191   case NEON::BI__builtin_neon_vgetq_lane_f32:
6192     return Builder.CreateExtractElement(Ops[0], Ops[1], "vget_lane");
6193 
6194   case NEON::BI__builtin_neon_vrndns_f32: {
6195     Value *Arg = EmitScalarExpr(E->getArg(0));
6196     llvm::Type *Tys[] = {Arg->getType()};
6197     Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vrintn, Tys);
6198     return Builder.CreateCall(F, {Arg}, "vrndn"); }
6199 
6200   case NEON::BI__builtin_neon_vset_lane_i8:
6201   case NEON::BI__builtin_neon_vset_lane_i16:
6202   case NEON::BI__builtin_neon_vset_lane_i32:
6203   case NEON::BI__builtin_neon_vset_lane_i64:
6204   case NEON::BI__builtin_neon_vset_lane_f32:
6205   case NEON::BI__builtin_neon_vsetq_lane_i8:
6206   case NEON::BI__builtin_neon_vsetq_lane_i16:
6207   case NEON::BI__builtin_neon_vsetq_lane_i32:
6208   case NEON::BI__builtin_neon_vsetq_lane_i64:
6209   case NEON::BI__builtin_neon_vsetq_lane_f32:
6210     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
6211 
6212   case NEON::BI__builtin_neon_vsha1h_u32:
6213     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1h), Ops,
6214                         "vsha1h");
6215   case NEON::BI__builtin_neon_vsha1cq_u32:
6216     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1c), Ops,
6217                         "vsha1h");
6218   case NEON::BI__builtin_neon_vsha1pq_u32:
6219     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1p), Ops,
6220                         "vsha1h");
6221   case NEON::BI__builtin_neon_vsha1mq_u32:
6222     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1m), Ops,
6223                         "vsha1h");
6224 
6225   // The ARM _MoveToCoprocessor builtins put the input register value as
6226   // the first argument, but the LLVM intrinsic expects it as the third one.
6227   case ARM::BI_MoveToCoprocessor:
6228   case ARM::BI_MoveToCoprocessor2: {
6229     Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI_MoveToCoprocessor ?
6230                                    Intrinsic::arm_mcr : Intrinsic::arm_mcr2);
6231     return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0],
6232                                   Ops[3], Ops[4], Ops[5]});
6233   }
6234   case ARM::BI_BitScanForward:
6235   case ARM::BI_BitScanForward64:
6236     return EmitMSVCBuiltinExpr(MSVCIntrin::_BitScanForward, E);
6237   case ARM::BI_BitScanReverse:
6238   case ARM::BI_BitScanReverse64:
6239     return EmitMSVCBuiltinExpr(MSVCIntrin::_BitScanReverse, E);
6240 
6241   case ARM::BI_InterlockedAnd64:
6242     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedAnd, E);
6243   case ARM::BI_InterlockedExchange64:
6244     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchange, E);
6245   case ARM::BI_InterlockedExchangeAdd64:
6246     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeAdd, E);
6247   case ARM::BI_InterlockedExchangeSub64:
6248     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeSub, E);
6249   case ARM::BI_InterlockedOr64:
6250     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedOr, E);
6251   case ARM::BI_InterlockedXor64:
6252     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedXor, E);
6253   case ARM::BI_InterlockedDecrement64:
6254     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedDecrement, E);
6255   case ARM::BI_InterlockedIncrement64:
6256     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedIncrement, E);
6257   case ARM::BI_InterlockedExchangeAdd8_acq:
6258   case ARM::BI_InterlockedExchangeAdd16_acq:
6259   case ARM::BI_InterlockedExchangeAdd_acq:
6260   case ARM::BI_InterlockedExchangeAdd64_acq:
6261     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeAdd_acq, E);
6262   case ARM::BI_InterlockedExchangeAdd8_rel:
6263   case ARM::BI_InterlockedExchangeAdd16_rel:
6264   case ARM::BI_InterlockedExchangeAdd_rel:
6265   case ARM::BI_InterlockedExchangeAdd64_rel:
6266     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeAdd_rel, E);
6267   case ARM::BI_InterlockedExchangeAdd8_nf:
6268   case ARM::BI_InterlockedExchangeAdd16_nf:
6269   case ARM::BI_InterlockedExchangeAdd_nf:
6270   case ARM::BI_InterlockedExchangeAdd64_nf:
6271     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeAdd_nf, E);
6272   case ARM::BI_InterlockedExchange8_acq:
6273   case ARM::BI_InterlockedExchange16_acq:
6274   case ARM::BI_InterlockedExchange_acq:
6275   case ARM::BI_InterlockedExchange64_acq:
6276     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchange_acq, E);
6277   case ARM::BI_InterlockedExchange8_rel:
6278   case ARM::BI_InterlockedExchange16_rel:
6279   case ARM::BI_InterlockedExchange_rel:
6280   case ARM::BI_InterlockedExchange64_rel:
6281     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchange_rel, E);
6282   case ARM::BI_InterlockedExchange8_nf:
6283   case ARM::BI_InterlockedExchange16_nf:
6284   case ARM::BI_InterlockedExchange_nf:
6285   case ARM::BI_InterlockedExchange64_nf:
6286     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchange_nf, E);
6287   case ARM::BI_InterlockedCompareExchange8_acq:
6288   case ARM::BI_InterlockedCompareExchange16_acq:
6289   case ARM::BI_InterlockedCompareExchange_acq:
6290   case ARM::BI_InterlockedCompareExchange64_acq:
6291     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedCompareExchange_acq, E);
6292   case ARM::BI_InterlockedCompareExchange8_rel:
6293   case ARM::BI_InterlockedCompareExchange16_rel:
6294   case ARM::BI_InterlockedCompareExchange_rel:
6295   case ARM::BI_InterlockedCompareExchange64_rel:
6296     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedCompareExchange_rel, E);
6297   case ARM::BI_InterlockedCompareExchange8_nf:
6298   case ARM::BI_InterlockedCompareExchange16_nf:
6299   case ARM::BI_InterlockedCompareExchange_nf:
6300   case ARM::BI_InterlockedCompareExchange64_nf:
6301     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedCompareExchange_nf, E);
6302   case ARM::BI_InterlockedOr8_acq:
6303   case ARM::BI_InterlockedOr16_acq:
6304   case ARM::BI_InterlockedOr_acq:
6305   case ARM::BI_InterlockedOr64_acq:
6306     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedOr_acq, E);
6307   case ARM::BI_InterlockedOr8_rel:
6308   case ARM::BI_InterlockedOr16_rel:
6309   case ARM::BI_InterlockedOr_rel:
6310   case ARM::BI_InterlockedOr64_rel:
6311     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedOr_rel, E);
6312   case ARM::BI_InterlockedOr8_nf:
6313   case ARM::BI_InterlockedOr16_nf:
6314   case ARM::BI_InterlockedOr_nf:
6315   case ARM::BI_InterlockedOr64_nf:
6316     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedOr_nf, E);
6317   case ARM::BI_InterlockedXor8_acq:
6318   case ARM::BI_InterlockedXor16_acq:
6319   case ARM::BI_InterlockedXor_acq:
6320   case ARM::BI_InterlockedXor64_acq:
6321     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedXor_acq, E);
6322   case ARM::BI_InterlockedXor8_rel:
6323   case ARM::BI_InterlockedXor16_rel:
6324   case ARM::BI_InterlockedXor_rel:
6325   case ARM::BI_InterlockedXor64_rel:
6326     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedXor_rel, E);
6327   case ARM::BI_InterlockedXor8_nf:
6328   case ARM::BI_InterlockedXor16_nf:
6329   case ARM::BI_InterlockedXor_nf:
6330   case ARM::BI_InterlockedXor64_nf:
6331     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedXor_nf, E);
6332   case ARM::BI_InterlockedAnd8_acq:
6333   case ARM::BI_InterlockedAnd16_acq:
6334   case ARM::BI_InterlockedAnd_acq:
6335   case ARM::BI_InterlockedAnd64_acq:
6336     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedAnd_acq, E);
6337   case ARM::BI_InterlockedAnd8_rel:
6338   case ARM::BI_InterlockedAnd16_rel:
6339   case ARM::BI_InterlockedAnd_rel:
6340   case ARM::BI_InterlockedAnd64_rel:
6341     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedAnd_rel, E);
6342   case ARM::BI_InterlockedAnd8_nf:
6343   case ARM::BI_InterlockedAnd16_nf:
6344   case ARM::BI_InterlockedAnd_nf:
6345   case ARM::BI_InterlockedAnd64_nf:
6346     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedAnd_nf, E);
6347   case ARM::BI_InterlockedIncrement16_acq:
6348   case ARM::BI_InterlockedIncrement_acq:
6349   case ARM::BI_InterlockedIncrement64_acq:
6350     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedIncrement_acq, E);
6351   case ARM::BI_InterlockedIncrement16_rel:
6352   case ARM::BI_InterlockedIncrement_rel:
6353   case ARM::BI_InterlockedIncrement64_rel:
6354     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedIncrement_rel, E);
6355   case ARM::BI_InterlockedIncrement16_nf:
6356   case ARM::BI_InterlockedIncrement_nf:
6357   case ARM::BI_InterlockedIncrement64_nf:
6358     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedIncrement_nf, E);
6359   case ARM::BI_InterlockedDecrement16_acq:
6360   case ARM::BI_InterlockedDecrement_acq:
6361   case ARM::BI_InterlockedDecrement64_acq:
6362     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedDecrement_acq, E);
6363   case ARM::BI_InterlockedDecrement16_rel:
6364   case ARM::BI_InterlockedDecrement_rel:
6365   case ARM::BI_InterlockedDecrement64_rel:
6366     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedDecrement_rel, E);
6367   case ARM::BI_InterlockedDecrement16_nf:
6368   case ARM::BI_InterlockedDecrement_nf:
6369   case ARM::BI_InterlockedDecrement64_nf:
6370     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedDecrement_nf, E);
6371   }
6372 
6373   // Get the last argument, which specifies the vector type.
6374   assert(HasExtraArg);
6375   llvm::APSInt Result;
6376   const Expr *Arg = E->getArg(E->getNumArgs()-1);
6377   if (!Arg->isIntegerConstantExpr(Result, getContext()))
6378     return nullptr;
6379 
6380   if (BuiltinID == ARM::BI__builtin_arm_vcvtr_f ||
6381       BuiltinID == ARM::BI__builtin_arm_vcvtr_d) {
6382     // Determine the overloaded type of this builtin.
6383     llvm::Type *Ty;
6384     if (BuiltinID == ARM::BI__builtin_arm_vcvtr_f)
6385       Ty = FloatTy;
6386     else
6387       Ty = DoubleTy;
6388 
6389     // Determine whether this is an unsigned conversion or not.
6390     bool usgn = Result.getZExtValue() == 1;
6391     unsigned Int = usgn ? Intrinsic::arm_vcvtru : Intrinsic::arm_vcvtr;
6392 
6393     // Call the appropriate intrinsic.
6394     Function *F = CGM.getIntrinsic(Int, Ty);
6395     return Builder.CreateCall(F, Ops, "vcvtr");
6396   }
6397 
6398   // Determine the type of this overloaded NEON intrinsic.
6399   NeonTypeFlags Type(Result.getZExtValue());
6400   bool usgn = Type.isUnsigned();
6401   bool rightShift = false;
6402 
6403   llvm::VectorType *VTy = GetNeonType(this, Type,
6404                                       getTarget().hasLegalHalfType());
6405   llvm::Type *Ty = VTy;
6406   if (!Ty)
6407     return nullptr;
6408 
6409   // Many NEON builtins have identical semantics and uses in ARM and
6410   // AArch64. Emit these in a single function.
6411   auto IntrinsicMap = makeArrayRef(ARMSIMDIntrinsicMap);
6412   const NeonIntrinsicInfo *Builtin = findNeonIntrinsicInMap(
6413       IntrinsicMap, BuiltinID, NEONSIMDIntrinsicsProvenSorted);
6414   if (Builtin)
6415     return EmitCommonNeonBuiltinExpr(
6416         Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
6417         Builtin->NameHint, Builtin->TypeModifier, E, Ops, PtrOp0, PtrOp1, Arch);
6418 
6419   unsigned Int;
6420   switch (BuiltinID) {
6421   default: return nullptr;
6422   case NEON::BI__builtin_neon_vld1q_lane_v:
6423     // Handle 64-bit integer elements as a special case.  Use shuffles of
6424     // one-element vectors to avoid poor code for i64 in the backend.
6425     if (VTy->getElementType()->isIntegerTy(64)) {
6426       // Extract the other lane.
6427       Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
6428       uint32_t Lane = cast<ConstantInt>(Ops[2])->getZExtValue();
6429       Value *SV = llvm::ConstantVector::get(ConstantInt::get(Int32Ty, 1-Lane));
6430       Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV);
6431       // Load the value as a one-element vector.
6432       Ty = llvm::VectorType::get(VTy->getElementType(), 1);
6433       llvm::Type *Tys[] = {Ty, Int8PtrTy};
6434       Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld1, Tys);
6435       Value *Align = getAlignmentValue32(PtrOp0);
6436       Value *Ld = Builder.CreateCall(F, {Ops[0], Align});
6437       // Combine them.
6438       uint32_t Indices[] = {1 - Lane, Lane};
6439       SV = llvm::ConstantDataVector::get(getLLVMContext(), Indices);
6440       return Builder.CreateShuffleVector(Ops[1], Ld, SV, "vld1q_lane");
6441     }
6442     LLVM_FALLTHROUGH;
6443   case NEON::BI__builtin_neon_vld1_lane_v: {
6444     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
6445     PtrOp0 = Builder.CreateElementBitCast(PtrOp0, VTy->getElementType());
6446     Value *Ld = Builder.CreateLoad(PtrOp0);
6447     return Builder.CreateInsertElement(Ops[1], Ld, Ops[2], "vld1_lane");
6448   }
6449   case NEON::BI__builtin_neon_vqrshrn_n_v:
6450     Int =
6451       usgn ? Intrinsic::arm_neon_vqrshiftnu : Intrinsic::arm_neon_vqrshiftns;
6452     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n",
6453                         1, true);
6454   case NEON::BI__builtin_neon_vqrshrun_n_v:
6455     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqrshiftnsu, Ty),
6456                         Ops, "vqrshrun_n", 1, true);
6457   case NEON::BI__builtin_neon_vqshrn_n_v:
6458     Int = usgn ? Intrinsic::arm_neon_vqshiftnu : Intrinsic::arm_neon_vqshiftns;
6459     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n",
6460                         1, true);
6461   case NEON::BI__builtin_neon_vqshrun_n_v:
6462     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqshiftnsu, Ty),
6463                         Ops, "vqshrun_n", 1, true);
6464   case NEON::BI__builtin_neon_vrecpe_v:
6465   case NEON::BI__builtin_neon_vrecpeq_v:
6466     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrecpe, Ty),
6467                         Ops, "vrecpe");
6468   case NEON::BI__builtin_neon_vrshrn_n_v:
6469     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrshiftn, Ty),
6470                         Ops, "vrshrn_n", 1, true);
6471   case NEON::BI__builtin_neon_vrsra_n_v:
6472   case NEON::BI__builtin_neon_vrsraq_n_v:
6473     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
6474     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
6475     Ops[2] = EmitNeonShiftVector(Ops[2], Ty, true);
6476     Int = usgn ? Intrinsic::arm_neon_vrshiftu : Intrinsic::arm_neon_vrshifts;
6477     Ops[1] = Builder.CreateCall(CGM.getIntrinsic(Int, Ty), {Ops[1], Ops[2]});
6478     return Builder.CreateAdd(Ops[0], Ops[1], "vrsra_n");
6479   case NEON::BI__builtin_neon_vsri_n_v:
6480   case NEON::BI__builtin_neon_vsriq_n_v:
6481     rightShift = true;
6482     LLVM_FALLTHROUGH;
6483   case NEON::BI__builtin_neon_vsli_n_v:
6484   case NEON::BI__builtin_neon_vsliq_n_v:
6485     Ops[2] = EmitNeonShiftVector(Ops[2], Ty, rightShift);
6486     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vshiftins, Ty),
6487                         Ops, "vsli_n");
6488   case NEON::BI__builtin_neon_vsra_n_v:
6489   case NEON::BI__builtin_neon_vsraq_n_v:
6490     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
6491     Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n");
6492     return Builder.CreateAdd(Ops[0], Ops[1]);
6493   case NEON::BI__builtin_neon_vst1q_lane_v:
6494     // Handle 64-bit integer elements as a special case.  Use a shuffle to get
6495     // a one-element vector and avoid poor code for i64 in the backend.
6496     if (VTy->getElementType()->isIntegerTy(64)) {
6497       Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
6498       Value *SV = llvm::ConstantVector::get(cast<llvm::Constant>(Ops[2]));
6499       Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV);
6500       Ops[2] = getAlignmentValue32(PtrOp0);
6501       llvm::Type *Tys[] = {Int8PtrTy, Ops[1]->getType()};
6502       return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst1,
6503                                                  Tys), Ops);
6504     }
6505     LLVM_FALLTHROUGH;
6506   case NEON::BI__builtin_neon_vst1_lane_v: {
6507     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
6508     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
6509     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
6510     auto St = Builder.CreateStore(Ops[1], Builder.CreateBitCast(PtrOp0, Ty));
6511     return St;
6512   }
6513   case NEON::BI__builtin_neon_vtbl1_v:
6514     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl1),
6515                         Ops, "vtbl1");
6516   case NEON::BI__builtin_neon_vtbl2_v:
6517     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl2),
6518                         Ops, "vtbl2");
6519   case NEON::BI__builtin_neon_vtbl3_v:
6520     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl3),
6521                         Ops, "vtbl3");
6522   case NEON::BI__builtin_neon_vtbl4_v:
6523     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl4),
6524                         Ops, "vtbl4");
6525   case NEON::BI__builtin_neon_vtbx1_v:
6526     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx1),
6527                         Ops, "vtbx1");
6528   case NEON::BI__builtin_neon_vtbx2_v:
6529     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx2),
6530                         Ops, "vtbx2");
6531   case NEON::BI__builtin_neon_vtbx3_v:
6532     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx3),
6533                         Ops, "vtbx3");
6534   case NEON::BI__builtin_neon_vtbx4_v:
6535     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx4),
6536                         Ops, "vtbx4");
6537   }
6538 }
6539 
6540 static Value *EmitAArch64TblBuiltinExpr(CodeGenFunction &CGF, unsigned BuiltinID,
6541                                       const CallExpr *E,
6542                                       SmallVectorImpl<Value *> &Ops,
6543                                       llvm::Triple::ArchType Arch) {
6544   unsigned int Int = 0;
6545   const char *s = nullptr;
6546 
6547   switch (BuiltinID) {
6548   default:
6549     return nullptr;
6550   case NEON::BI__builtin_neon_vtbl1_v:
6551   case NEON::BI__builtin_neon_vqtbl1_v:
6552   case NEON::BI__builtin_neon_vqtbl1q_v:
6553   case NEON::BI__builtin_neon_vtbl2_v:
6554   case NEON::BI__builtin_neon_vqtbl2_v:
6555   case NEON::BI__builtin_neon_vqtbl2q_v:
6556   case NEON::BI__builtin_neon_vtbl3_v:
6557   case NEON::BI__builtin_neon_vqtbl3_v:
6558   case NEON::BI__builtin_neon_vqtbl3q_v:
6559   case NEON::BI__builtin_neon_vtbl4_v:
6560   case NEON::BI__builtin_neon_vqtbl4_v:
6561   case NEON::BI__builtin_neon_vqtbl4q_v:
6562     break;
6563   case NEON::BI__builtin_neon_vtbx1_v:
6564   case NEON::BI__builtin_neon_vqtbx1_v:
6565   case NEON::BI__builtin_neon_vqtbx1q_v:
6566   case NEON::BI__builtin_neon_vtbx2_v:
6567   case NEON::BI__builtin_neon_vqtbx2_v:
6568   case NEON::BI__builtin_neon_vqtbx2q_v:
6569   case NEON::BI__builtin_neon_vtbx3_v:
6570   case NEON::BI__builtin_neon_vqtbx3_v:
6571   case NEON::BI__builtin_neon_vqtbx3q_v:
6572   case NEON::BI__builtin_neon_vtbx4_v:
6573   case NEON::BI__builtin_neon_vqtbx4_v:
6574   case NEON::BI__builtin_neon_vqtbx4q_v:
6575     break;
6576   }
6577 
6578   assert(E->getNumArgs() >= 3);
6579 
6580   // Get the last argument, which specifies the vector type.
6581   llvm::APSInt Result;
6582   const Expr *Arg = E->getArg(E->getNumArgs() - 1);
6583   if (!Arg->isIntegerConstantExpr(Result, CGF.getContext()))
6584     return nullptr;
6585 
6586   // Determine the type of this overloaded NEON intrinsic.
6587   NeonTypeFlags Type(Result.getZExtValue());
6588   llvm::VectorType *Ty = GetNeonType(&CGF, Type);
6589   if (!Ty)
6590     return nullptr;
6591 
6592   CodeGen::CGBuilderTy &Builder = CGF.Builder;
6593 
6594   // AArch64 scalar builtins are not overloaded, they do not have an extra
6595   // argument that specifies the vector type, need to handle each case.
6596   switch (BuiltinID) {
6597   case NEON::BI__builtin_neon_vtbl1_v: {
6598     return packTBLDVectorList(CGF, makeArrayRef(Ops).slice(0, 1), nullptr,
6599                               Ops[1], Ty, Intrinsic::aarch64_neon_tbl1,
6600                               "vtbl1");
6601   }
6602   case NEON::BI__builtin_neon_vtbl2_v: {
6603     return packTBLDVectorList(CGF, makeArrayRef(Ops).slice(0, 2), nullptr,
6604                               Ops[2], Ty, Intrinsic::aarch64_neon_tbl1,
6605                               "vtbl1");
6606   }
6607   case NEON::BI__builtin_neon_vtbl3_v: {
6608     return packTBLDVectorList(CGF, makeArrayRef(Ops).slice(0, 3), nullptr,
6609                               Ops[3], Ty, Intrinsic::aarch64_neon_tbl2,
6610                               "vtbl2");
6611   }
6612   case NEON::BI__builtin_neon_vtbl4_v: {
6613     return packTBLDVectorList(CGF, makeArrayRef(Ops).slice(0, 4), nullptr,
6614                               Ops[4], Ty, Intrinsic::aarch64_neon_tbl2,
6615                               "vtbl2");
6616   }
6617   case NEON::BI__builtin_neon_vtbx1_v: {
6618     Value *TblRes =
6619         packTBLDVectorList(CGF, makeArrayRef(Ops).slice(1, 1), nullptr, Ops[2],
6620                            Ty, Intrinsic::aarch64_neon_tbl1, "vtbl1");
6621 
6622     llvm::Constant *EightV = ConstantInt::get(Ty, 8);
6623     Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[2], EightV);
6624     CmpRes = Builder.CreateSExt(CmpRes, Ty);
6625 
6626     Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]);
6627     Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes);
6628     return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx");
6629   }
6630   case NEON::BI__builtin_neon_vtbx2_v: {
6631     return packTBLDVectorList(CGF, makeArrayRef(Ops).slice(1, 2), Ops[0],
6632                               Ops[3], Ty, Intrinsic::aarch64_neon_tbx1,
6633                               "vtbx1");
6634   }
6635   case NEON::BI__builtin_neon_vtbx3_v: {
6636     Value *TblRes =
6637         packTBLDVectorList(CGF, makeArrayRef(Ops).slice(1, 3), nullptr, Ops[4],
6638                            Ty, Intrinsic::aarch64_neon_tbl2, "vtbl2");
6639 
6640     llvm::Constant *TwentyFourV = ConstantInt::get(Ty, 24);
6641     Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[4],
6642                                            TwentyFourV);
6643     CmpRes = Builder.CreateSExt(CmpRes, Ty);
6644 
6645     Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]);
6646     Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes);
6647     return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx");
6648   }
6649   case NEON::BI__builtin_neon_vtbx4_v: {
6650     return packTBLDVectorList(CGF, makeArrayRef(Ops).slice(1, 4), Ops[0],
6651                               Ops[5], Ty, Intrinsic::aarch64_neon_tbx2,
6652                               "vtbx2");
6653   }
6654   case NEON::BI__builtin_neon_vqtbl1_v:
6655   case NEON::BI__builtin_neon_vqtbl1q_v:
6656     Int = Intrinsic::aarch64_neon_tbl1; s = "vtbl1"; break;
6657   case NEON::BI__builtin_neon_vqtbl2_v:
6658   case NEON::BI__builtin_neon_vqtbl2q_v: {
6659     Int = Intrinsic::aarch64_neon_tbl2; s = "vtbl2"; break;
6660   case NEON::BI__builtin_neon_vqtbl3_v:
6661   case NEON::BI__builtin_neon_vqtbl3q_v:
6662     Int = Intrinsic::aarch64_neon_tbl3; s = "vtbl3"; break;
6663   case NEON::BI__builtin_neon_vqtbl4_v:
6664   case NEON::BI__builtin_neon_vqtbl4q_v:
6665     Int = Intrinsic::aarch64_neon_tbl4; s = "vtbl4"; break;
6666   case NEON::BI__builtin_neon_vqtbx1_v:
6667   case NEON::BI__builtin_neon_vqtbx1q_v:
6668     Int = Intrinsic::aarch64_neon_tbx1; s = "vtbx1"; break;
6669   case NEON::BI__builtin_neon_vqtbx2_v:
6670   case NEON::BI__builtin_neon_vqtbx2q_v:
6671     Int = Intrinsic::aarch64_neon_tbx2; s = "vtbx2"; break;
6672   case NEON::BI__builtin_neon_vqtbx3_v:
6673   case NEON::BI__builtin_neon_vqtbx3q_v:
6674     Int = Intrinsic::aarch64_neon_tbx3; s = "vtbx3"; break;
6675   case NEON::BI__builtin_neon_vqtbx4_v:
6676   case NEON::BI__builtin_neon_vqtbx4q_v:
6677     Int = Intrinsic::aarch64_neon_tbx4; s = "vtbx4"; break;
6678   }
6679   }
6680 
6681   if (!Int)
6682     return nullptr;
6683 
6684   Function *F = CGF.CGM.getIntrinsic(Int, Ty);
6685   return CGF.EmitNeonCall(F, Ops, s);
6686 }
6687 
6688 Value *CodeGenFunction::vectorWrapScalar16(Value *Op) {
6689   llvm::Type *VTy = llvm::VectorType::get(Int16Ty, 4);
6690   Op = Builder.CreateBitCast(Op, Int16Ty);
6691   Value *V = UndefValue::get(VTy);
6692   llvm::Constant *CI = ConstantInt::get(SizeTy, 0);
6693   Op = Builder.CreateInsertElement(V, Op, CI);
6694   return Op;
6695 }
6696 
6697 Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
6698                                                const CallExpr *E,
6699                                                llvm::Triple::ArchType Arch) {
6700   unsigned HintID = static_cast<unsigned>(-1);
6701   switch (BuiltinID) {
6702   default: break;
6703   case AArch64::BI__builtin_arm_nop:
6704     HintID = 0;
6705     break;
6706   case AArch64::BI__builtin_arm_yield:
6707   case AArch64::BI__yield:
6708     HintID = 1;
6709     break;
6710   case AArch64::BI__builtin_arm_wfe:
6711   case AArch64::BI__wfe:
6712     HintID = 2;
6713     break;
6714   case AArch64::BI__builtin_arm_wfi:
6715   case AArch64::BI__wfi:
6716     HintID = 3;
6717     break;
6718   case AArch64::BI__builtin_arm_sev:
6719   case AArch64::BI__sev:
6720     HintID = 4;
6721     break;
6722   case AArch64::BI__builtin_arm_sevl:
6723   case AArch64::BI__sevl:
6724     HintID = 5;
6725     break;
6726   }
6727 
6728   if (HintID != static_cast<unsigned>(-1)) {
6729     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_hint);
6730     return Builder.CreateCall(F, llvm::ConstantInt::get(Int32Ty, HintID));
6731   }
6732 
6733   if (BuiltinID == AArch64::BI__builtin_arm_prefetch) {
6734     Value *Address         = EmitScalarExpr(E->getArg(0));
6735     Value *RW              = EmitScalarExpr(E->getArg(1));
6736     Value *CacheLevel      = EmitScalarExpr(E->getArg(2));
6737     Value *RetentionPolicy = EmitScalarExpr(E->getArg(3));
6738     Value *IsData          = EmitScalarExpr(E->getArg(4));
6739 
6740     Value *Locality = nullptr;
6741     if (cast<llvm::ConstantInt>(RetentionPolicy)->isZero()) {
6742       // Temporal fetch, needs to convert cache level to locality.
6743       Locality = llvm::ConstantInt::get(Int32Ty,
6744         -cast<llvm::ConstantInt>(CacheLevel)->getValue() + 3);
6745     } else {
6746       // Streaming fetch.
6747       Locality = llvm::ConstantInt::get(Int32Ty, 0);
6748     }
6749 
6750     // FIXME: We need AArch64 specific LLVM intrinsic if we want to specify
6751     // PLDL3STRM or PLDL2STRM.
6752     Value *F = CGM.getIntrinsic(Intrinsic::prefetch);
6753     return Builder.CreateCall(F, {Address, RW, Locality, IsData});
6754   }
6755 
6756   if (BuiltinID == AArch64::BI__builtin_arm_rbit) {
6757     assert((getContext().getTypeSize(E->getType()) == 32) &&
6758            "rbit of unusual size!");
6759     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
6760     return Builder.CreateCall(
6761         CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
6762   }
6763   if (BuiltinID == AArch64::BI__builtin_arm_rbit64) {
6764     assert((getContext().getTypeSize(E->getType()) == 64) &&
6765            "rbit of unusual size!");
6766     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
6767     return Builder.CreateCall(
6768         CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
6769   }
6770 
6771   if (BuiltinID == AArch64::BI__clear_cache) {
6772     assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
6773     const FunctionDecl *FD = E->getDirectCallee();
6774     Value *Ops[2];
6775     for (unsigned i = 0; i < 2; i++)
6776       Ops[i] = EmitScalarExpr(E->getArg(i));
6777     llvm::Type *Ty = CGM.getTypes().ConvertType(FD->getType());
6778     llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
6779     StringRef Name = FD->getName();
6780     return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops);
6781   }
6782 
6783   if ((BuiltinID == AArch64::BI__builtin_arm_ldrex ||
6784       BuiltinID == AArch64::BI__builtin_arm_ldaex) &&
6785       getContext().getTypeSize(E->getType()) == 128) {
6786     Function *F = CGM.getIntrinsic(BuiltinID == AArch64::BI__builtin_arm_ldaex
6787                                        ? Intrinsic::aarch64_ldaxp
6788                                        : Intrinsic::aarch64_ldxp);
6789 
6790     Value *LdPtr = EmitScalarExpr(E->getArg(0));
6791     Value *Val = Builder.CreateCall(F, Builder.CreateBitCast(LdPtr, Int8PtrTy),
6792                                     "ldxp");
6793 
6794     Value *Val0 = Builder.CreateExtractValue(Val, 1);
6795     Value *Val1 = Builder.CreateExtractValue(Val, 0);
6796     llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
6797     Val0 = Builder.CreateZExt(Val0, Int128Ty);
6798     Val1 = Builder.CreateZExt(Val1, Int128Ty);
6799 
6800     Value *ShiftCst = llvm::ConstantInt::get(Int128Ty, 64);
6801     Val = Builder.CreateShl(Val0, ShiftCst, "shl", true /* nuw */);
6802     Val = Builder.CreateOr(Val, Val1);
6803     return Builder.CreateBitCast(Val, ConvertType(E->getType()));
6804   } else if (BuiltinID == AArch64::BI__builtin_arm_ldrex ||
6805              BuiltinID == AArch64::BI__builtin_arm_ldaex) {
6806     Value *LoadAddr = EmitScalarExpr(E->getArg(0));
6807 
6808     QualType Ty = E->getType();
6809     llvm::Type *RealResTy = ConvertType(Ty);
6810     llvm::Type *PtrTy = llvm::IntegerType::get(
6811         getLLVMContext(), getContext().getTypeSize(Ty))->getPointerTo();
6812     LoadAddr = Builder.CreateBitCast(LoadAddr, PtrTy);
6813 
6814     Function *F = CGM.getIntrinsic(BuiltinID == AArch64::BI__builtin_arm_ldaex
6815                                        ? Intrinsic::aarch64_ldaxr
6816                                        : Intrinsic::aarch64_ldxr,
6817                                    PtrTy);
6818     Value *Val = Builder.CreateCall(F, LoadAddr, "ldxr");
6819 
6820     if (RealResTy->isPointerTy())
6821       return Builder.CreateIntToPtr(Val, RealResTy);
6822 
6823     llvm::Type *IntResTy = llvm::IntegerType::get(
6824         getLLVMContext(), CGM.getDataLayout().getTypeSizeInBits(RealResTy));
6825     Val = Builder.CreateTruncOrBitCast(Val, IntResTy);
6826     return Builder.CreateBitCast(Val, RealResTy);
6827   }
6828 
6829   if ((BuiltinID == AArch64::BI__builtin_arm_strex ||
6830        BuiltinID == AArch64::BI__builtin_arm_stlex) &&
6831       getContext().getTypeSize(E->getArg(0)->getType()) == 128) {
6832     Function *F = CGM.getIntrinsic(BuiltinID == AArch64::BI__builtin_arm_stlex
6833                                        ? Intrinsic::aarch64_stlxp
6834                                        : Intrinsic::aarch64_stxp);
6835     llvm::Type *STy = llvm::StructType::get(Int64Ty, Int64Ty);
6836 
6837     Address Tmp = CreateMemTemp(E->getArg(0)->getType());
6838     EmitAnyExprToMem(E->getArg(0), Tmp, Qualifiers(), /*init*/ true);
6839 
6840     Tmp = Builder.CreateBitCast(Tmp, llvm::PointerType::getUnqual(STy));
6841     llvm::Value *Val = Builder.CreateLoad(Tmp);
6842 
6843     Value *Arg0 = Builder.CreateExtractValue(Val, 0);
6844     Value *Arg1 = Builder.CreateExtractValue(Val, 1);
6845     Value *StPtr = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)),
6846                                          Int8PtrTy);
6847     return Builder.CreateCall(F, {Arg0, Arg1, StPtr}, "stxp");
6848   }
6849 
6850   if (BuiltinID == AArch64::BI__builtin_arm_strex ||
6851       BuiltinID == AArch64::BI__builtin_arm_stlex) {
6852     Value *StoreVal = EmitScalarExpr(E->getArg(0));
6853     Value *StoreAddr = EmitScalarExpr(E->getArg(1));
6854 
6855     QualType Ty = E->getArg(0)->getType();
6856     llvm::Type *StoreTy = llvm::IntegerType::get(getLLVMContext(),
6857                                                  getContext().getTypeSize(Ty));
6858     StoreAddr = Builder.CreateBitCast(StoreAddr, StoreTy->getPointerTo());
6859 
6860     if (StoreVal->getType()->isPointerTy())
6861       StoreVal = Builder.CreatePtrToInt(StoreVal, Int64Ty);
6862     else {
6863       llvm::Type *IntTy = llvm::IntegerType::get(
6864           getLLVMContext(),
6865           CGM.getDataLayout().getTypeSizeInBits(StoreVal->getType()));
6866       StoreVal = Builder.CreateBitCast(StoreVal, IntTy);
6867       StoreVal = Builder.CreateZExtOrBitCast(StoreVal, Int64Ty);
6868     }
6869 
6870     Function *F = CGM.getIntrinsic(BuiltinID == AArch64::BI__builtin_arm_stlex
6871                                        ? Intrinsic::aarch64_stlxr
6872                                        : Intrinsic::aarch64_stxr,
6873                                    StoreAddr->getType());
6874     return Builder.CreateCall(F, {StoreVal, StoreAddr}, "stxr");
6875   }
6876 
6877   if (BuiltinID == AArch64::BI__getReg) {
6878     APSInt Value;
6879     if (!E->getArg(0)->EvaluateAsInt(Value, CGM.getContext()))
6880       llvm_unreachable("Sema will ensure that the parameter is constant");
6881 
6882     LLVMContext &Context = CGM.getLLVMContext();
6883     std::string Reg = Value == 31 ? "sp" : "x" + Value.toString(10);
6884 
6885     llvm::Metadata *Ops[] = {llvm::MDString::get(Context, Reg)};
6886     llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
6887     llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
6888 
6889     llvm::Value *F =
6890         CGM.getIntrinsic(llvm::Intrinsic::read_register, {Int64Ty});
6891     return Builder.CreateCall(F, Metadata);
6892   }
6893 
6894   if (BuiltinID == AArch64::BI__builtin_arm_clrex) {
6895     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_clrex);
6896     return Builder.CreateCall(F);
6897   }
6898 
6899   if (BuiltinID == AArch64::BI_ReadWriteBarrier)
6900     return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
6901                                llvm::SyncScope::SingleThread);
6902 
6903   // CRC32
6904   Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic;
6905   switch (BuiltinID) {
6906   case AArch64::BI__builtin_arm_crc32b:
6907     CRCIntrinsicID = Intrinsic::aarch64_crc32b; break;
6908   case AArch64::BI__builtin_arm_crc32cb:
6909     CRCIntrinsicID = Intrinsic::aarch64_crc32cb; break;
6910   case AArch64::BI__builtin_arm_crc32h:
6911     CRCIntrinsicID = Intrinsic::aarch64_crc32h; break;
6912   case AArch64::BI__builtin_arm_crc32ch:
6913     CRCIntrinsicID = Intrinsic::aarch64_crc32ch; break;
6914   case AArch64::BI__builtin_arm_crc32w:
6915     CRCIntrinsicID = Intrinsic::aarch64_crc32w; break;
6916   case AArch64::BI__builtin_arm_crc32cw:
6917     CRCIntrinsicID = Intrinsic::aarch64_crc32cw; break;
6918   case AArch64::BI__builtin_arm_crc32d:
6919     CRCIntrinsicID = Intrinsic::aarch64_crc32x; break;
6920   case AArch64::BI__builtin_arm_crc32cd:
6921     CRCIntrinsicID = Intrinsic::aarch64_crc32cx; break;
6922   }
6923 
6924   if (CRCIntrinsicID != Intrinsic::not_intrinsic) {
6925     Value *Arg0 = EmitScalarExpr(E->getArg(0));
6926     Value *Arg1 = EmitScalarExpr(E->getArg(1));
6927     Function *F = CGM.getIntrinsic(CRCIntrinsicID);
6928 
6929     llvm::Type *DataTy = F->getFunctionType()->getParamType(1);
6930     Arg1 = Builder.CreateZExtOrBitCast(Arg1, DataTy);
6931 
6932     return Builder.CreateCall(F, {Arg0, Arg1});
6933   }
6934 
6935   if (BuiltinID == AArch64::BI__builtin_arm_rsr ||
6936       BuiltinID == AArch64::BI__builtin_arm_rsr64 ||
6937       BuiltinID == AArch64::BI__builtin_arm_rsrp ||
6938       BuiltinID == AArch64::BI__builtin_arm_wsr ||
6939       BuiltinID == AArch64::BI__builtin_arm_wsr64 ||
6940       BuiltinID == AArch64::BI__builtin_arm_wsrp) {
6941 
6942     bool IsRead = BuiltinID == AArch64::BI__builtin_arm_rsr ||
6943                   BuiltinID == AArch64::BI__builtin_arm_rsr64 ||
6944                   BuiltinID == AArch64::BI__builtin_arm_rsrp;
6945 
6946     bool IsPointerBuiltin = BuiltinID == AArch64::BI__builtin_arm_rsrp ||
6947                             BuiltinID == AArch64::BI__builtin_arm_wsrp;
6948 
6949     bool Is64Bit = BuiltinID != AArch64::BI__builtin_arm_rsr &&
6950                    BuiltinID != AArch64::BI__builtin_arm_wsr;
6951 
6952     llvm::Type *ValueType;
6953     llvm::Type *RegisterType = Int64Ty;
6954     if (IsPointerBuiltin) {
6955       ValueType = VoidPtrTy;
6956     } else if (Is64Bit) {
6957       ValueType = Int64Ty;
6958     } else {
6959       ValueType = Int32Ty;
6960     }
6961 
6962     return EmitSpecialRegisterBuiltin(*this, E, RegisterType, ValueType, IsRead);
6963   }
6964 
6965   if (BuiltinID == AArch64::BI_ReadStatusReg ||
6966       BuiltinID == AArch64::BI_WriteStatusReg) {
6967     LLVMContext &Context = CGM.getLLVMContext();
6968 
6969     unsigned SysReg =
6970       E->getArg(0)->EvaluateKnownConstInt(getContext()).getZExtValue();
6971 
6972     std::string SysRegStr;
6973     llvm::raw_string_ostream(SysRegStr) <<
6974                        ((1 << 1) | ((SysReg >> 14) & 1))  << ":" <<
6975                        ((SysReg >> 11) & 7)               << ":" <<
6976                        ((SysReg >> 7)  & 15)              << ":" <<
6977                        ((SysReg >> 3)  & 15)              << ":" <<
6978                        ( SysReg        & 7);
6979 
6980     llvm::Metadata *Ops[] = { llvm::MDString::get(Context, SysRegStr) };
6981     llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
6982     llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
6983 
6984     llvm::Type *RegisterType = Int64Ty;
6985     llvm::Type *ValueType = Int32Ty;
6986     llvm::Type *Types[] = { RegisterType };
6987 
6988     if (BuiltinID == AArch64::BI_ReadStatusReg) {
6989       llvm::Value *F = CGM.getIntrinsic(llvm::Intrinsic::read_register, Types);
6990       llvm::Value *Call = Builder.CreateCall(F, Metadata);
6991 
6992       return Builder.CreateTrunc(Call, ValueType);
6993     }
6994 
6995     llvm::Value *F = CGM.getIntrinsic(llvm::Intrinsic::write_register, Types);
6996     llvm::Value *ArgValue = EmitScalarExpr(E->getArg(1));
6997     ArgValue = Builder.CreateZExt(ArgValue, RegisterType);
6998 
6999     return Builder.CreateCall(F, { Metadata, ArgValue });
7000   }
7001 
7002   if (BuiltinID == AArch64::BI_AddressOfReturnAddress) {
7003     llvm::Value *F = CGM.getIntrinsic(Intrinsic::addressofreturnaddress);
7004     return Builder.CreateCall(F);
7005   }
7006 
7007   // Find out if any arguments are required to be integer constant
7008   // expressions.
7009   unsigned ICEArguments = 0;
7010   ASTContext::GetBuiltinTypeError Error;
7011   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
7012   assert(Error == ASTContext::GE_None && "Should not codegen an error");
7013 
7014   llvm::SmallVector<Value*, 4> Ops;
7015   for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++) {
7016     if ((ICEArguments & (1 << i)) == 0) {
7017       Ops.push_back(EmitScalarExpr(E->getArg(i)));
7018     } else {
7019       // If this is required to be a constant, constant fold it so that we know
7020       // that the generated intrinsic gets a ConstantInt.
7021       llvm::APSInt Result;
7022       bool IsConst = E->getArg(i)->isIntegerConstantExpr(Result, getContext());
7023       assert(IsConst && "Constant arg isn't actually constant?");
7024       (void)IsConst;
7025       Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), Result));
7026     }
7027   }
7028 
7029   auto SISDMap = makeArrayRef(AArch64SISDIntrinsicMap);
7030   const NeonIntrinsicInfo *Builtin = findNeonIntrinsicInMap(
7031       SISDMap, BuiltinID, AArch64SISDIntrinsicsProvenSorted);
7032 
7033   if (Builtin) {
7034     Ops.push_back(EmitScalarExpr(E->getArg(E->getNumArgs() - 1)));
7035     Value *Result = EmitCommonNeonSISDBuiltinExpr(*this, *Builtin, Ops, E);
7036     assert(Result && "SISD intrinsic should have been handled");
7037     return Result;
7038   }
7039 
7040   llvm::APSInt Result;
7041   const Expr *Arg = E->getArg(E->getNumArgs()-1);
7042   NeonTypeFlags Type(0);
7043   if (Arg->isIntegerConstantExpr(Result, getContext()))
7044     // Determine the type of this overloaded NEON intrinsic.
7045     Type = NeonTypeFlags(Result.getZExtValue());
7046 
7047   bool usgn = Type.isUnsigned();
7048   bool quad = Type.isQuad();
7049 
7050   // Handle non-overloaded intrinsics first.
7051   switch (BuiltinID) {
7052   default: break;
7053   case NEON::BI__builtin_neon_vabsh_f16:
7054     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7055     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, HalfTy), Ops, "vabs");
7056   case NEON::BI__builtin_neon_vldrq_p128: {
7057     llvm::Type *Int128Ty = llvm::Type::getIntNTy(getLLVMContext(), 128);
7058     llvm::Type *Int128PTy = llvm::PointerType::get(Int128Ty, 0);
7059     Value *Ptr = Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), Int128PTy);
7060     return Builder.CreateAlignedLoad(Int128Ty, Ptr,
7061                                      CharUnits::fromQuantity(16));
7062   }
7063   case NEON::BI__builtin_neon_vstrq_p128: {
7064     llvm::Type *Int128PTy = llvm::Type::getIntNPtrTy(getLLVMContext(), 128);
7065     Value *Ptr = Builder.CreateBitCast(Ops[0], Int128PTy);
7066     return Builder.CreateDefaultAlignedStore(EmitScalarExpr(E->getArg(1)), Ptr);
7067   }
7068   case NEON::BI__builtin_neon_vcvts_u32_f32:
7069   case NEON::BI__builtin_neon_vcvtd_u64_f64:
7070     usgn = true;
7071     LLVM_FALLTHROUGH;
7072   case NEON::BI__builtin_neon_vcvts_s32_f32:
7073   case NEON::BI__builtin_neon_vcvtd_s64_f64: {
7074     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7075     bool Is64 = Ops[0]->getType()->getPrimitiveSizeInBits() == 64;
7076     llvm::Type *InTy = Is64 ? Int64Ty : Int32Ty;
7077     llvm::Type *FTy = Is64 ? DoubleTy : FloatTy;
7078     Ops[0] = Builder.CreateBitCast(Ops[0], FTy);
7079     if (usgn)
7080       return Builder.CreateFPToUI(Ops[0], InTy);
7081     return Builder.CreateFPToSI(Ops[0], InTy);
7082   }
7083   case NEON::BI__builtin_neon_vcvts_f32_u32:
7084   case NEON::BI__builtin_neon_vcvtd_f64_u64:
7085     usgn = true;
7086     LLVM_FALLTHROUGH;
7087   case NEON::BI__builtin_neon_vcvts_f32_s32:
7088   case NEON::BI__builtin_neon_vcvtd_f64_s64: {
7089     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7090     bool Is64 = Ops[0]->getType()->getPrimitiveSizeInBits() == 64;
7091     llvm::Type *InTy = Is64 ? Int64Ty : Int32Ty;
7092     llvm::Type *FTy = Is64 ? DoubleTy : FloatTy;
7093     Ops[0] = Builder.CreateBitCast(Ops[0], InTy);
7094     if (usgn)
7095       return Builder.CreateUIToFP(Ops[0], FTy);
7096     return Builder.CreateSIToFP(Ops[0], FTy);
7097   }
7098   case NEON::BI__builtin_neon_vcvth_f16_u16:
7099   case NEON::BI__builtin_neon_vcvth_f16_u32:
7100   case NEON::BI__builtin_neon_vcvth_f16_u64:
7101     usgn = true;
7102     LLVM_FALLTHROUGH;
7103   case NEON::BI__builtin_neon_vcvth_f16_s16:
7104   case NEON::BI__builtin_neon_vcvth_f16_s32:
7105   case NEON::BI__builtin_neon_vcvth_f16_s64: {
7106     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7107     llvm::Type *FTy = HalfTy;
7108     llvm::Type *InTy;
7109     if (Ops[0]->getType()->getPrimitiveSizeInBits() == 64)
7110       InTy = Int64Ty;
7111     else if (Ops[0]->getType()->getPrimitiveSizeInBits() == 32)
7112       InTy = Int32Ty;
7113     else
7114       InTy = Int16Ty;
7115     Ops[0] = Builder.CreateBitCast(Ops[0], InTy);
7116     if (usgn)
7117       return Builder.CreateUIToFP(Ops[0], FTy);
7118     return Builder.CreateSIToFP(Ops[0], FTy);
7119   }
7120   case NEON::BI__builtin_neon_vcvth_u16_f16:
7121     usgn = true;
7122     LLVM_FALLTHROUGH;
7123   case NEON::BI__builtin_neon_vcvth_s16_f16: {
7124     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7125     Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
7126     if (usgn)
7127       return Builder.CreateFPToUI(Ops[0], Int16Ty);
7128     return Builder.CreateFPToSI(Ops[0], Int16Ty);
7129   }
7130   case NEON::BI__builtin_neon_vcvth_u32_f16:
7131     usgn = true;
7132     LLVM_FALLTHROUGH;
7133   case NEON::BI__builtin_neon_vcvth_s32_f16: {
7134     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7135     Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
7136     if (usgn)
7137       return Builder.CreateFPToUI(Ops[0], Int32Ty);
7138     return Builder.CreateFPToSI(Ops[0], Int32Ty);
7139   }
7140   case NEON::BI__builtin_neon_vcvth_u64_f16:
7141     usgn = true;
7142     LLVM_FALLTHROUGH;
7143   case NEON::BI__builtin_neon_vcvth_s64_f16: {
7144     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7145     Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
7146     if (usgn)
7147       return Builder.CreateFPToUI(Ops[0], Int64Ty);
7148     return Builder.CreateFPToSI(Ops[0], Int64Ty);
7149   }
7150   case NEON::BI__builtin_neon_vcvtah_u16_f16:
7151   case NEON::BI__builtin_neon_vcvtmh_u16_f16:
7152   case NEON::BI__builtin_neon_vcvtnh_u16_f16:
7153   case NEON::BI__builtin_neon_vcvtph_u16_f16:
7154   case NEON::BI__builtin_neon_vcvtah_s16_f16:
7155   case NEON::BI__builtin_neon_vcvtmh_s16_f16:
7156   case NEON::BI__builtin_neon_vcvtnh_s16_f16:
7157   case NEON::BI__builtin_neon_vcvtph_s16_f16: {
7158     unsigned Int;
7159     llvm::Type* InTy = Int32Ty;
7160     llvm::Type* FTy  = HalfTy;
7161     llvm::Type *Tys[2] = {InTy, FTy};
7162     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7163     switch (BuiltinID) {
7164     default: llvm_unreachable("missing builtin ID in switch!");
7165     case NEON::BI__builtin_neon_vcvtah_u16_f16:
7166       Int = Intrinsic::aarch64_neon_fcvtau; break;
7167     case NEON::BI__builtin_neon_vcvtmh_u16_f16:
7168       Int = Intrinsic::aarch64_neon_fcvtmu; break;
7169     case NEON::BI__builtin_neon_vcvtnh_u16_f16:
7170       Int = Intrinsic::aarch64_neon_fcvtnu; break;
7171     case NEON::BI__builtin_neon_vcvtph_u16_f16:
7172       Int = Intrinsic::aarch64_neon_fcvtpu; break;
7173     case NEON::BI__builtin_neon_vcvtah_s16_f16:
7174       Int = Intrinsic::aarch64_neon_fcvtas; break;
7175     case NEON::BI__builtin_neon_vcvtmh_s16_f16:
7176       Int = Intrinsic::aarch64_neon_fcvtms; break;
7177     case NEON::BI__builtin_neon_vcvtnh_s16_f16:
7178       Int = Intrinsic::aarch64_neon_fcvtns; break;
7179     case NEON::BI__builtin_neon_vcvtph_s16_f16:
7180       Int = Intrinsic::aarch64_neon_fcvtps; break;
7181     }
7182     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvt");
7183     return Builder.CreateTrunc(Ops[0], Int16Ty);
7184   }
7185   case NEON::BI__builtin_neon_vcaleh_f16:
7186   case NEON::BI__builtin_neon_vcalth_f16:
7187   case NEON::BI__builtin_neon_vcageh_f16:
7188   case NEON::BI__builtin_neon_vcagth_f16: {
7189     unsigned Int;
7190     llvm::Type* InTy = Int32Ty;
7191     llvm::Type* FTy  = HalfTy;
7192     llvm::Type *Tys[2] = {InTy, FTy};
7193     Ops.push_back(EmitScalarExpr(E->getArg(1)));
7194     switch (BuiltinID) {
7195     default: llvm_unreachable("missing builtin ID in switch!");
7196     case NEON::BI__builtin_neon_vcageh_f16:
7197       Int = Intrinsic::aarch64_neon_facge; break;
7198     case NEON::BI__builtin_neon_vcagth_f16:
7199       Int = Intrinsic::aarch64_neon_facgt; break;
7200     case NEON::BI__builtin_neon_vcaleh_f16:
7201       Int = Intrinsic::aarch64_neon_facge; std::swap(Ops[0], Ops[1]); break;
7202     case NEON::BI__builtin_neon_vcalth_f16:
7203       Int = Intrinsic::aarch64_neon_facgt; std::swap(Ops[0], Ops[1]); break;
7204     }
7205     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "facg");
7206     return Builder.CreateTrunc(Ops[0], Int16Ty);
7207   }
7208   case NEON::BI__builtin_neon_vcvth_n_s16_f16:
7209   case NEON::BI__builtin_neon_vcvth_n_u16_f16: {
7210     unsigned Int;
7211     llvm::Type* InTy = Int32Ty;
7212     llvm::Type* FTy  = HalfTy;
7213     llvm::Type *Tys[2] = {InTy, FTy};
7214     Ops.push_back(EmitScalarExpr(E->getArg(1)));
7215     switch (BuiltinID) {
7216     default: llvm_unreachable("missing builtin ID in switch!");
7217     case NEON::BI__builtin_neon_vcvth_n_s16_f16:
7218       Int = Intrinsic::aarch64_neon_vcvtfp2fxs; break;
7219     case NEON::BI__builtin_neon_vcvth_n_u16_f16:
7220       Int = Intrinsic::aarch64_neon_vcvtfp2fxu; break;
7221     }
7222     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvth_n");
7223     return Builder.CreateTrunc(Ops[0], Int16Ty);
7224   }
7225   case NEON::BI__builtin_neon_vcvth_n_f16_s16:
7226   case NEON::BI__builtin_neon_vcvth_n_f16_u16: {
7227     unsigned Int;
7228     llvm::Type* FTy  = HalfTy;
7229     llvm::Type* InTy = Int32Ty;
7230     llvm::Type *Tys[2] = {FTy, InTy};
7231     Ops.push_back(EmitScalarExpr(E->getArg(1)));
7232     switch (BuiltinID) {
7233     default: llvm_unreachable("missing builtin ID in switch!");
7234     case NEON::BI__builtin_neon_vcvth_n_f16_s16:
7235       Int = Intrinsic::aarch64_neon_vcvtfxs2fp;
7236       Ops[0] = Builder.CreateSExt(Ops[0], InTy, "sext");
7237       break;
7238     case NEON::BI__builtin_neon_vcvth_n_f16_u16:
7239       Int = Intrinsic::aarch64_neon_vcvtfxu2fp;
7240       Ops[0] = Builder.CreateZExt(Ops[0], InTy);
7241       break;
7242     }
7243     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvth_n");
7244   }
7245   case NEON::BI__builtin_neon_vpaddd_s64: {
7246     llvm::Type *Ty = llvm::VectorType::get(Int64Ty, 2);
7247     Value *Vec = EmitScalarExpr(E->getArg(0));
7248     // The vector is v2f64, so make sure it's bitcast to that.
7249     Vec = Builder.CreateBitCast(Vec, Ty, "v2i64");
7250     llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
7251     llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
7252     Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
7253     Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
7254     // Pairwise addition of a v2f64 into a scalar f64.
7255     return Builder.CreateAdd(Op0, Op1, "vpaddd");
7256   }
7257   case NEON::BI__builtin_neon_vpaddd_f64: {
7258     llvm::Type *Ty =
7259       llvm::VectorType::get(DoubleTy, 2);
7260     Value *Vec = EmitScalarExpr(E->getArg(0));
7261     // The vector is v2f64, so make sure it's bitcast to that.
7262     Vec = Builder.CreateBitCast(Vec, Ty, "v2f64");
7263     llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
7264     llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
7265     Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
7266     Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
7267     // Pairwise addition of a v2f64 into a scalar f64.
7268     return Builder.CreateFAdd(Op0, Op1, "vpaddd");
7269   }
7270   case NEON::BI__builtin_neon_vpadds_f32: {
7271     llvm::Type *Ty =
7272       llvm::VectorType::get(FloatTy, 2);
7273     Value *Vec = EmitScalarExpr(E->getArg(0));
7274     // The vector is v2f32, so make sure it's bitcast to that.
7275     Vec = Builder.CreateBitCast(Vec, Ty, "v2f32");
7276     llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
7277     llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
7278     Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
7279     Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
7280     // Pairwise addition of a v2f32 into a scalar f32.
7281     return Builder.CreateFAdd(Op0, Op1, "vpaddd");
7282   }
7283   case NEON::BI__builtin_neon_vceqzd_s64:
7284   case NEON::BI__builtin_neon_vceqzd_f64:
7285   case NEON::BI__builtin_neon_vceqzs_f32:
7286   case NEON::BI__builtin_neon_vceqzh_f16:
7287     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7288     return EmitAArch64CompareBuiltinExpr(
7289         Ops[0], ConvertType(E->getCallReturnType(getContext())),
7290         ICmpInst::FCMP_OEQ, ICmpInst::ICMP_EQ, "vceqz");
7291   case NEON::BI__builtin_neon_vcgezd_s64:
7292   case NEON::BI__builtin_neon_vcgezd_f64:
7293   case NEON::BI__builtin_neon_vcgezs_f32:
7294   case NEON::BI__builtin_neon_vcgezh_f16:
7295     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7296     return EmitAArch64CompareBuiltinExpr(
7297         Ops[0], ConvertType(E->getCallReturnType(getContext())),
7298         ICmpInst::FCMP_OGE, ICmpInst::ICMP_SGE, "vcgez");
7299   case NEON::BI__builtin_neon_vclezd_s64:
7300   case NEON::BI__builtin_neon_vclezd_f64:
7301   case NEON::BI__builtin_neon_vclezs_f32:
7302   case NEON::BI__builtin_neon_vclezh_f16:
7303     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7304     return EmitAArch64CompareBuiltinExpr(
7305         Ops[0], ConvertType(E->getCallReturnType(getContext())),
7306         ICmpInst::FCMP_OLE, ICmpInst::ICMP_SLE, "vclez");
7307   case NEON::BI__builtin_neon_vcgtzd_s64:
7308   case NEON::BI__builtin_neon_vcgtzd_f64:
7309   case NEON::BI__builtin_neon_vcgtzs_f32:
7310   case NEON::BI__builtin_neon_vcgtzh_f16:
7311     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7312     return EmitAArch64CompareBuiltinExpr(
7313         Ops[0], ConvertType(E->getCallReturnType(getContext())),
7314         ICmpInst::FCMP_OGT, ICmpInst::ICMP_SGT, "vcgtz");
7315   case NEON::BI__builtin_neon_vcltzd_s64:
7316   case NEON::BI__builtin_neon_vcltzd_f64:
7317   case NEON::BI__builtin_neon_vcltzs_f32:
7318   case NEON::BI__builtin_neon_vcltzh_f16:
7319     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7320     return EmitAArch64CompareBuiltinExpr(
7321         Ops[0], ConvertType(E->getCallReturnType(getContext())),
7322         ICmpInst::FCMP_OLT, ICmpInst::ICMP_SLT, "vcltz");
7323 
7324   case NEON::BI__builtin_neon_vceqzd_u64: {
7325     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7326     Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
7327     Ops[0] =
7328         Builder.CreateICmpEQ(Ops[0], llvm::Constant::getNullValue(Int64Ty));
7329     return Builder.CreateSExt(Ops[0], Int64Ty, "vceqzd");
7330   }
7331   case NEON::BI__builtin_neon_vceqd_f64:
7332   case NEON::BI__builtin_neon_vcled_f64:
7333   case NEON::BI__builtin_neon_vcltd_f64:
7334   case NEON::BI__builtin_neon_vcged_f64:
7335   case NEON::BI__builtin_neon_vcgtd_f64: {
7336     llvm::CmpInst::Predicate P;
7337     switch (BuiltinID) {
7338     default: llvm_unreachable("missing builtin ID in switch!");
7339     case NEON::BI__builtin_neon_vceqd_f64: P = llvm::FCmpInst::FCMP_OEQ; break;
7340     case NEON::BI__builtin_neon_vcled_f64: P = llvm::FCmpInst::FCMP_OLE; break;
7341     case NEON::BI__builtin_neon_vcltd_f64: P = llvm::FCmpInst::FCMP_OLT; break;
7342     case NEON::BI__builtin_neon_vcged_f64: P = llvm::FCmpInst::FCMP_OGE; break;
7343     case NEON::BI__builtin_neon_vcgtd_f64: P = llvm::FCmpInst::FCMP_OGT; break;
7344     }
7345     Ops.push_back(EmitScalarExpr(E->getArg(1)));
7346     Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
7347     Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy);
7348     Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
7349     return Builder.CreateSExt(Ops[0], Int64Ty, "vcmpd");
7350   }
7351   case NEON::BI__builtin_neon_vceqs_f32:
7352   case NEON::BI__builtin_neon_vcles_f32:
7353   case NEON::BI__builtin_neon_vclts_f32:
7354   case NEON::BI__builtin_neon_vcges_f32:
7355   case NEON::BI__builtin_neon_vcgts_f32: {
7356     llvm::CmpInst::Predicate P;
7357     switch (BuiltinID) {
7358     default: llvm_unreachable("missing builtin ID in switch!");
7359     case NEON::BI__builtin_neon_vceqs_f32: P = llvm::FCmpInst::FCMP_OEQ; break;
7360     case NEON::BI__builtin_neon_vcles_f32: P = llvm::FCmpInst::FCMP_OLE; break;
7361     case NEON::BI__builtin_neon_vclts_f32: P = llvm::FCmpInst::FCMP_OLT; break;
7362     case NEON::BI__builtin_neon_vcges_f32: P = llvm::FCmpInst::FCMP_OGE; break;
7363     case NEON::BI__builtin_neon_vcgts_f32: P = llvm::FCmpInst::FCMP_OGT; break;
7364     }
7365     Ops.push_back(EmitScalarExpr(E->getArg(1)));
7366     Ops[0] = Builder.CreateBitCast(Ops[0], FloatTy);
7367     Ops[1] = Builder.CreateBitCast(Ops[1], FloatTy);
7368     Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
7369     return Builder.CreateSExt(Ops[0], Int32Ty, "vcmpd");
7370   }
7371   case NEON::BI__builtin_neon_vceqh_f16:
7372   case NEON::BI__builtin_neon_vcleh_f16:
7373   case NEON::BI__builtin_neon_vclth_f16:
7374   case NEON::BI__builtin_neon_vcgeh_f16:
7375   case NEON::BI__builtin_neon_vcgth_f16: {
7376     llvm::CmpInst::Predicate P;
7377     switch (BuiltinID) {
7378     default: llvm_unreachable("missing builtin ID in switch!");
7379     case NEON::BI__builtin_neon_vceqh_f16: P = llvm::FCmpInst::FCMP_OEQ; break;
7380     case NEON::BI__builtin_neon_vcleh_f16: P = llvm::FCmpInst::FCMP_OLE; break;
7381     case NEON::BI__builtin_neon_vclth_f16: P = llvm::FCmpInst::FCMP_OLT; break;
7382     case NEON::BI__builtin_neon_vcgeh_f16: P = llvm::FCmpInst::FCMP_OGE; break;
7383     case NEON::BI__builtin_neon_vcgth_f16: P = llvm::FCmpInst::FCMP_OGT; break;
7384     }
7385     Ops.push_back(EmitScalarExpr(E->getArg(1)));
7386     Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
7387     Ops[1] = Builder.CreateBitCast(Ops[1], HalfTy);
7388     Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
7389     return Builder.CreateSExt(Ops[0], Int16Ty, "vcmpd");
7390   }
7391   case NEON::BI__builtin_neon_vceqd_s64:
7392   case NEON::BI__builtin_neon_vceqd_u64:
7393   case NEON::BI__builtin_neon_vcgtd_s64:
7394   case NEON::BI__builtin_neon_vcgtd_u64:
7395   case NEON::BI__builtin_neon_vcltd_s64:
7396   case NEON::BI__builtin_neon_vcltd_u64:
7397   case NEON::BI__builtin_neon_vcged_u64:
7398   case NEON::BI__builtin_neon_vcged_s64:
7399   case NEON::BI__builtin_neon_vcled_u64:
7400   case NEON::BI__builtin_neon_vcled_s64: {
7401     llvm::CmpInst::Predicate P;
7402     switch (BuiltinID) {
7403     default: llvm_unreachable("missing builtin ID in switch!");
7404     case NEON::BI__builtin_neon_vceqd_s64:
7405     case NEON::BI__builtin_neon_vceqd_u64:P = llvm::ICmpInst::ICMP_EQ;break;
7406     case NEON::BI__builtin_neon_vcgtd_s64:P = llvm::ICmpInst::ICMP_SGT;break;
7407     case NEON::BI__builtin_neon_vcgtd_u64:P = llvm::ICmpInst::ICMP_UGT;break;
7408     case NEON::BI__builtin_neon_vcltd_s64:P = llvm::ICmpInst::ICMP_SLT;break;
7409     case NEON::BI__builtin_neon_vcltd_u64:P = llvm::ICmpInst::ICMP_ULT;break;
7410     case NEON::BI__builtin_neon_vcged_u64:P = llvm::ICmpInst::ICMP_UGE;break;
7411     case NEON::BI__builtin_neon_vcged_s64:P = llvm::ICmpInst::ICMP_SGE;break;
7412     case NEON::BI__builtin_neon_vcled_u64:P = llvm::ICmpInst::ICMP_ULE;break;
7413     case NEON::BI__builtin_neon_vcled_s64:P = llvm::ICmpInst::ICMP_SLE;break;
7414     }
7415     Ops.push_back(EmitScalarExpr(E->getArg(1)));
7416     Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
7417     Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
7418     Ops[0] = Builder.CreateICmp(P, Ops[0], Ops[1]);
7419     return Builder.CreateSExt(Ops[0], Int64Ty, "vceqd");
7420   }
  case NEON::BI__builtin_neon_vtstd_s64:
  case NEON::BI__builtin_neon_vtstd_u64: {
    // Scalar bit test: AND the two i64 operands, compare the result against
    // zero, and sign-extend the i1 to an i64 mask (all-ones if any common bit
    // was set).
    Ops.push_back(EmitScalarExpr(E->getArg(1)));
    Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
    Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
    Ops[0] = Builder.CreateAnd(Ops[0], Ops[1]);
    Ops[0] = Builder.CreateICmp(ICmpInst::ICMP_NE, Ops[0],
                                llvm::Constant::getNullValue(Int64Ty));
    return Builder.CreateSExt(Ops[0], Int64Ty, "vtstd");
  }
  // Lane writes: insert the scalar (Ops[0]) into the vector (Ops[1]) at the
  // lane index given by the third argument. For the integer and f32 variants
  // the vector operand already has a usable type; the f64 variants need an
  // explicit bitcast to the v1f64/v2f64 vector type first.
  case NEON::BI__builtin_neon_vset_lane_i8:
  case NEON::BI__builtin_neon_vset_lane_i16:
  case NEON::BI__builtin_neon_vset_lane_i32:
  case NEON::BI__builtin_neon_vset_lane_i64:
  case NEON::BI__builtin_neon_vset_lane_f32:
  case NEON::BI__builtin_neon_vsetq_lane_i8:
  case NEON::BI__builtin_neon_vsetq_lane_i16:
  case NEON::BI__builtin_neon_vsetq_lane_i32:
  case NEON::BI__builtin_neon_vsetq_lane_i64:
  case NEON::BI__builtin_neon_vsetq_lane_f32:
    Ops.push_back(EmitScalarExpr(E->getArg(2)));
    return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
  case NEON::BI__builtin_neon_vset_lane_f64:
    // The vector type needs a cast for the v1f64 variant.
    Ops[1] = Builder.CreateBitCast(Ops[1],
                                   llvm::VectorType::get(DoubleTy, 1));
    Ops.push_back(EmitScalarExpr(E->getArg(2)));
    return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
  case NEON::BI__builtin_neon_vsetq_lane_f64:
    // The vector type needs a cast for the v2f64 variant.
    Ops[1] = Builder.CreateBitCast(Ops[1],
        llvm::VectorType::get(DoubleTy, 2));
    Ops.push_back(EmitScalarExpr(E->getArg(2)));
    return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");

  // Lane reads (vget_lane / vdup<b,h,s,d>_lane[q]): bitcast the operand to
  // the concrete vector type implied by the builtin's element type and width
  // (64-bit "d" vs 128-bit "q" register), then extract the element at the
  // lane index from the second argument.
  case NEON::BI__builtin_neon_vget_lane_i8:
  case NEON::BI__builtin_neon_vdupb_lane_i8:
    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int8Ty, 8));
    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                        "vget_lane");
  case NEON::BI__builtin_neon_vgetq_lane_i8:
  case NEON::BI__builtin_neon_vdupb_laneq_i8:
    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int8Ty, 16));
    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                        "vgetq_lane");
  case NEON::BI__builtin_neon_vget_lane_i16:
  case NEON::BI__builtin_neon_vduph_lane_i16:
    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int16Ty, 4));
    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                        "vget_lane");
  case NEON::BI__builtin_neon_vgetq_lane_i16:
  case NEON::BI__builtin_neon_vduph_laneq_i16:
    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int16Ty, 8));
    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                        "vgetq_lane");
  case NEON::BI__builtin_neon_vget_lane_i32:
  case NEON::BI__builtin_neon_vdups_lane_i32:
    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int32Ty, 2));
    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                        "vget_lane");
  case NEON::BI__builtin_neon_vdups_lane_f32:
    Ops[0] = Builder.CreateBitCast(Ops[0],
        llvm::VectorType::get(FloatTy, 2));
    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                        "vdups_lane");
  case NEON::BI__builtin_neon_vgetq_lane_i32:
  case NEON::BI__builtin_neon_vdups_laneq_i32:
    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int32Ty, 4));
    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                        "vgetq_lane");
  case NEON::BI__builtin_neon_vget_lane_i64:
  case NEON::BI__builtin_neon_vdupd_lane_i64:
    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int64Ty, 1));
    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                        "vget_lane");
  case NEON::BI__builtin_neon_vdupd_lane_f64:
    Ops[0] = Builder.CreateBitCast(Ops[0],
        llvm::VectorType::get(DoubleTy, 1));
    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                        "vdupd_lane");
  case NEON::BI__builtin_neon_vgetq_lane_i64:
  case NEON::BI__builtin_neon_vdupd_laneq_i64:
    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int64Ty, 2));
    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                        "vgetq_lane");
  case NEON::BI__builtin_neon_vget_lane_f32:
    Ops[0] = Builder.CreateBitCast(Ops[0],
        llvm::VectorType::get(FloatTy, 2));
    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                        "vget_lane");
  case NEON::BI__builtin_neon_vget_lane_f64:
    Ops[0] = Builder.CreateBitCast(Ops[0],
        llvm::VectorType::get(DoubleTy, 1));
    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                        "vget_lane");
  case NEON::BI__builtin_neon_vgetq_lane_f32:
  case NEON::BI__builtin_neon_vdups_laneq_f32:
    Ops[0] = Builder.CreateBitCast(Ops[0],
        llvm::VectorType::get(FloatTy, 4));
    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                        "vgetq_lane");
  case NEON::BI__builtin_neon_vgetq_lane_f64:
  case NEON::BI__builtin_neon_vdupd_laneq_f64:
    Ops[0] = Builder.CreateBitCast(Ops[0],
        llvm::VectorType::get(DoubleTy, 2));
    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                        "vgetq_lane");
  // Scalar half-precision arithmetic: emit the second operand lazily and map
  // directly onto the corresponding LLVM FP instruction.
  case NEON::BI__builtin_neon_vaddh_f16:
    Ops.push_back(EmitScalarExpr(E->getArg(1)));
    return Builder.CreateFAdd(Ops[0], Ops[1], "vaddh");
  case NEON::BI__builtin_neon_vsubh_f16:
    Ops.push_back(EmitScalarExpr(E->getArg(1)));
    return Builder.CreateFSub(Ops[0], Ops[1], "vsubh");
  case NEON::BI__builtin_neon_vmulh_f16:
    Ops.push_back(EmitScalarExpr(E->getArg(1)));
    return Builder.CreateFMul(Ops[0], Ops[1], "vmulh");
  case NEON::BI__builtin_neon_vdivh_f16:
    Ops.push_back(EmitScalarExpr(E->getArg(1)));
    return Builder.CreateFDiv(Ops[0], Ops[1], "vdivh");
  case NEON::BI__builtin_neon_vfmah_f16: {
    Value *F = CGM.getIntrinsic(Intrinsic::fma, HalfTy);
    // NEON intrinsic puts accumulator first, unlike the LLVM fma.
    return Builder.CreateCall(F,
      {EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)), Ops[0]});
  }
  case NEON::BI__builtin_neon_vfmsh_f16: {
    // Fused multiply-subtract: negate the first multiplicand (via 0 - x) and
    // reuse llvm.fma, i.e. acc + (-b) * c.
    Value *F = CGM.getIntrinsic(Intrinsic::fma, HalfTy);
    Value *Zero = llvm::ConstantFP::getZeroValueForNegation(HalfTy);
    Value* Sub = Builder.CreateFSub(Zero, EmitScalarExpr(E->getArg(1)), "vsubh");
    // NEON intrinsic puts accumulator first, unlike the LLVM fma.
    return Builder.CreateCall(F, {Sub, EmitScalarExpr(E->getArg(2)), Ops[0]});
  }
  case NEON::BI__builtin_neon_vaddd_s64:
  case NEON::BI__builtin_neon_vaddd_u64:
    // Scalar 64-bit add/sub map straight onto integer add/sub.
    return Builder.CreateAdd(Ops[0], EmitScalarExpr(E->getArg(1)), "vaddd");
  case NEON::BI__builtin_neon_vsubd_s64:
  case NEON::BI__builtin_neon_vsubd_u64:
    return Builder.CreateSub(Ops[0], EmitScalarExpr(E->getArg(1)), "vsubd");
  case NEON::BI__builtin_neon_vqdmlalh_s16:
  case NEON::BI__builtin_neon_vqdmlslh_s16: {
    // Saturating doubling multiply-accumulate on i16 scalars. There is no
    // scalar sqdmull for i16, so wrap both operands into <4 x i16> vectors,
    // run the vector sqdmull to <4 x i32>, and pull lane 0 back out.
    SmallVector<Value *, 2> ProductOps;
    ProductOps.push_back(vectorWrapScalar16(Ops[1]));
    ProductOps.push_back(vectorWrapScalar16(EmitScalarExpr(E->getArg(2))));
    llvm::Type *VTy = llvm::VectorType::get(Int32Ty, 4);
    Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmull, VTy),
                          ProductOps, "vqdmlXl");
    Constant *CI = ConstantInt::get(SizeTy, 0);
    Ops[1] = Builder.CreateExtractElement(Ops[1], CI, "lane0");

    // Accumulate (or subtract) the product into Ops[0] with saturation.
    unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlalh_s16
                                        ? Intrinsic::aarch64_neon_sqadd
                                        : Intrinsic::aarch64_neon_sqsub;
    return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int32Ty), Ops, "vqdmlXl");
  }
  case NEON::BI__builtin_neon_vqshlud_n_s64: {
    // Saturating shift-left-unsigned by immediate: widen the shift amount to
    // i64 and call the target intrinsic.
    Ops.push_back(EmitScalarExpr(E->getArg(1)));
    Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty);
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqshlu, Int64Ty),
                        Ops, "vqshlu_n");
  }
  case NEON::BI__builtin_neon_vqshld_n_u64:
  case NEON::BI__builtin_neon_vqshld_n_s64: {
    // Saturating shift left by immediate, unsigned vs signed intrinsic
    // selected by the builtin variant.
    unsigned Int = BuiltinID == NEON::BI__builtin_neon_vqshld_n_u64
                                   ? Intrinsic::aarch64_neon_uqshl
                                   : Intrinsic::aarch64_neon_sqshl;
    Ops.push_back(EmitScalarExpr(E->getArg(1)));
    Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty);
    return EmitNeonCall(CGM.getIntrinsic(Int, Int64Ty), Ops, "vqshl_n");
  }
  case NEON::BI__builtin_neon_vrshrd_n_u64:
  case NEON::BI__builtin_neon_vrshrd_n_s64: {
    // Rounding shift right by immediate is expressed through the rounding
    // shift *left* intrinsic with a negated (constant) shift amount.
    unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrshrd_n_u64
                                   ? Intrinsic::aarch64_neon_urshl
                                   : Intrinsic::aarch64_neon_srshl;
    Ops.push_back(EmitScalarExpr(E->getArg(1)));
    int SV = cast<ConstantInt>(Ops[1])->getSExtValue();
    Ops[1] = ConstantInt::get(Int64Ty, -SV);
    return EmitNeonCall(CGM.getIntrinsic(Int, Int64Ty), Ops, "vrshr_n");
  }
  case NEON::BI__builtin_neon_vrsrad_n_u64:
  case NEON::BI__builtin_neon_vrsrad_n_s64: {
    // Rounding shift right and accumulate: right shift via *rshl with a
    // negated amount, then add the result into the accumulator Ops[0].
    unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrsrad_n_u64
                                   ? Intrinsic::aarch64_neon_urshl
                                   : Intrinsic::aarch64_neon_srshl;
    Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
    Ops.push_back(Builder.CreateNeg(EmitScalarExpr(E->getArg(2))));
    Ops[1] = Builder.CreateCall(CGM.getIntrinsic(Int, Int64Ty),
                                {Ops[1], Builder.CreateSExt(Ops[2], Int64Ty)});
    return Builder.CreateAdd(Ops[0], Builder.CreateBitCast(Ops[1], Int64Ty));
  }
  case NEON::BI__builtin_neon_vshld_n_s64:
  case NEON::BI__builtin_neon_vshld_n_u64: {
    // Plain shift left by constant immediate.
    llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
    return Builder.CreateShl(
        Ops[0], ConstantInt::get(Int64Ty, Amt->getZExtValue()), "shld_n");
  }
  case NEON::BI__builtin_neon_vshrd_n_s64: {
    // Signed shift right: clamp the amount to 63 since an LLVM ashr by 64
    // would be poison, while the NEON semantics of shifting by the full
    // width match an arithmetic shift by 63 (sign-fill).
    llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
    return Builder.CreateAShr(
        Ops[0], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
                                                   Amt->getZExtValue())),
        "shrd_n");
  }
  case NEON::BI__builtin_neon_vshrd_n_u64: {
    llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
    uint64_t ShiftAmt = Amt->getZExtValue();
    // Right-shifting an unsigned value by its size yields 0.
    if (ShiftAmt == 64)
      return ConstantInt::get(Int64Ty, 0);
    return Builder.CreateLShr(Ops[0], ConstantInt::get(Int64Ty, ShiftAmt),
                              "shrd_n");
  }
  case NEON::BI__builtin_neon_vsrad_n_s64: {
    // Signed shift right and accumulate; same 63-clamp as vshrd_n_s64.
    llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(2)));
    Ops[1] = Builder.CreateAShr(
        Ops[1], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
                                                   Amt->getZExtValue())),
        "shrd_n");
    return Builder.CreateAdd(Ops[0], Ops[1]);
  }
  case NEON::BI__builtin_neon_vsrad_n_u64: {
    llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(2)));
    uint64_t ShiftAmt = Amt->getZExtValue();
    // Right-shifting an unsigned value by its size yields 0.
    // As Op + 0 = Op, return Ops[0] directly.
    if (ShiftAmt == 64)
      return Ops[0];
    Ops[1] = Builder.CreateLShr(Ops[1], ConstantInt::get(Int64Ty, ShiftAmt),
                                "shrd_n");
    return Builder.CreateAdd(Ops[0], Ops[1]);
  }
  case NEON::BI__builtin_neon_vqdmlalh_lane_s16:
  case NEON::BI__builtin_neon_vqdmlalh_laneq_s16:
  case NEON::BI__builtin_neon_vqdmlslh_lane_s16:
  case NEON::BI__builtin_neon_vqdmlslh_laneq_s16: {
    // Lane variants of the i16 saturating doubling multiply-accumulate:
    // first extract the selected lane from the vector operand, then proceed
    // exactly as the scalar vqdmlalh/vqdmlslh path (wrap to <4 x i16>,
    // vector sqdmull, take lane 0 of the <4 x i32> product).
    Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)),
                                          "lane");
    SmallVector<Value *, 2> ProductOps;
    ProductOps.push_back(vectorWrapScalar16(Ops[1]));
    ProductOps.push_back(vectorWrapScalar16(Ops[2]));
    llvm::Type *VTy = llvm::VectorType::get(Int32Ty, 4);
    Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmull, VTy),
                          ProductOps, "vqdmlXl");
    Constant *CI = ConstantInt::get(SizeTy, 0);
    Ops[1] = Builder.CreateExtractElement(Ops[1], CI, "lane0");
    // Drop the consumed lane operand so Ops is {accumulator, product} for
    // the saturating accumulate call below.
    Ops.pop_back();

    unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlalh_lane_s16 ||
                       BuiltinID == NEON::BI__builtin_neon_vqdmlalh_laneq_s16)
                          ? Intrinsic::aarch64_neon_sqadd
                          : Intrinsic::aarch64_neon_sqsub;
    return EmitNeonCall(CGM.getIntrinsic(AccInt, Int32Ty), Ops, "vqdmlXl");
  }
  case NEON::BI__builtin_neon_vqdmlals_s32:
  case NEON::BI__builtin_neon_vqdmlsls_s32: {
    // i32 variant: a scalar sqdmull intrinsic exists here, so no vector
    // wrapping is required.
    SmallVector<Value *, 2> ProductOps;
    ProductOps.push_back(Ops[1]);
    ProductOps.push_back(EmitScalarExpr(E->getArg(2)));
    Ops[1] =
        EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmulls_scalar),
                     ProductOps, "vqdmlXl");

    unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlals_s32
                                        ? Intrinsic::aarch64_neon_sqadd
                                        : Intrinsic::aarch64_neon_sqsub;
    return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int64Ty), Ops, "vqdmlXl");
  }
  case NEON::BI__builtin_neon_vqdmlals_lane_s32:
  case NEON::BI__builtin_neon_vqdmlals_laneq_s32:
  case NEON::BI__builtin_neon_vqdmlsls_lane_s32:
  case NEON::BI__builtin_neon_vqdmlsls_laneq_s32: {
    // Lane variants of the i32 case: extract the lane, form the scalar
    // product, then saturating add/sub into the accumulator.
    Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)),
                                          "lane");
    SmallVector<Value *, 2> ProductOps;
    ProductOps.push_back(Ops[1]);
    ProductOps.push_back(Ops[2]);
    Ops[1] =
        EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmulls_scalar),
                     ProductOps, "vqdmlXl");
    Ops.pop_back();

    unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlals_lane_s32 ||
                       BuiltinID == NEON::BI__builtin_neon_vqdmlals_laneq_s32)
                          ? Intrinsic::aarch64_neon_sqadd
                          : Intrinsic::aarch64_neon_sqsub;
    return EmitNeonCall(CGM.getIntrinsic(AccInt, Int64Ty), Ops, "vqdmlXl");
  }
7708   }
7709 
  // Past this point every remaining builtin operates on a full NEON vector
  // type; bail out if the type flags don't describe a valid one.
  llvm::VectorType *VTy = GetNeonType(this, Type);
  llvm::Type *Ty = VTy;
  if (!Ty)
    return nullptr;

  // Not all intrinsics handled by the common case work for AArch64 yet, so only
  // defer to common code if it's been added to our special map.
  Builtin = findNeonIntrinsicInMap(AArch64SIMDIntrinsicMap, BuiltinID,
                                   AArch64SIMDIntrinsicsProvenSorted);

  if (Builtin)
    return EmitCommonNeonBuiltinExpr(
        Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
        Builtin->NameHint, Builtin->TypeModifier, E, Ops,
        /*never use addresses*/ Address::invalid(), Address::invalid(), Arch);

  // Table-lookup builtins (vtbl/vtbx family) have their own emitter.
  if (Value *V = EmitAArch64TblBuiltinExpr(*this, BuiltinID, E, Ops, Arch))
    return V;
7728 
7729   unsigned Int;
7730   switch (BuiltinID) {
7731   default: return nullptr;
  case NEON::BI__builtin_neon_vbsl_v:
  case NEON::BI__builtin_neon_vbslq_v: {
    // Bitwise select: (mask & a) | (~mask & b). Done on the integer view of
    // the vector type so FP element types work too, with a bitcast back to
    // the original type at the end.
    llvm::Type *BitTy = llvm::VectorType::getInteger(VTy);
    Ops[0] = Builder.CreateBitCast(Ops[0], BitTy, "vbsl");
    Ops[1] = Builder.CreateBitCast(Ops[1], BitTy, "vbsl");
    Ops[2] = Builder.CreateBitCast(Ops[2], BitTy, "vbsl");

    Ops[1] = Builder.CreateAnd(Ops[0], Ops[1], "vbsl");
    Ops[2] = Builder.CreateAnd(Builder.CreateNot(Ops[0]), Ops[2], "vbsl");
    Ops[0] = Builder.CreateOr(Ops[1], Ops[2], "vbsl");
    return Builder.CreateBitCast(Ops[0], Ty);
  }
  case NEON::BI__builtin_neon_vfma_lane_v:
  case NEON::BI__builtin_neon_vfmaq_lane_v: { // Only used for FP types
    // The ARM builtins (and instructions) have the addend as the first
    // operand, but the 'fma' intrinsics have it last. Swap it around here.
    Value *Addend = Ops[0];
    Value *Multiplicand = Ops[1];
    Value *LaneSource = Ops[2];
    Ops[0] = Multiplicand;
    Ops[1] = LaneSource;
    Ops[2] = Addend;

    // Now adjust things to handle the lane access.
    // For the "q" variant the lane source is the 64-bit (half-width) vector,
    // so derive its type from the result type; splat the chosen lane across
    // the result width with a shuffle.
    llvm::Type *SourceTy = BuiltinID == NEON::BI__builtin_neon_vfmaq_lane_v ?
      llvm::VectorType::get(VTy->getElementType(), VTy->getNumElements() / 2) :
      VTy;
    llvm::Constant *cst = cast<Constant>(Ops[3]);
    Value *SV = llvm::ConstantVector::getSplat(VTy->getNumElements(), cst);
    Ops[1] = Builder.CreateBitCast(Ops[1], SourceTy);
    Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV, "lane");

    // Drop the consumed lane index before the call.
    Ops.pop_back();
    Int = Intrinsic::fma;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fmla");
  }
  case NEON::BI__builtin_neon_vfma_laneq_v: {
    llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);
    // v1f64 fma should be mapped to Neon scalar f64 fma
    if (VTy && VTy->getElementType() == DoubleTy) {
      Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
      Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy);
      llvm::Type *VTy = GetNeonType(this,
        NeonTypeFlags(NeonTypeFlags::Float64, false, true));
      Ops[2] = Builder.CreateBitCast(Ops[2], VTy);
      Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
      Value *F = CGM.getIntrinsic(Intrinsic::fma, DoubleTy);
      Value *Result = Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0]});
      return Builder.CreateBitCast(Result, Ty);
    }
    // General case: the lane source is the double-width ("q") vector; splat
    // the lane down to the result width, then call llvm.fma with the addend
    // moved to the last position.
    Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);

    llvm::Type *STy = llvm::VectorType::get(VTy->getElementType(),
                                            VTy->getNumElements() * 2);
    Ops[2] = Builder.CreateBitCast(Ops[2], STy);
    Value* SV = llvm::ConstantVector::getSplat(VTy->getNumElements(),
                                               cast<ConstantInt>(Ops[3]));
    Ops[2] = Builder.CreateShuffleVector(Ops[2], Ops[2], SV, "lane");

    return Builder.CreateCall(F, {Ops[2], Ops[1], Ops[0]});
  }
  case NEON::BI__builtin_neon_vfmaq_laneq_v: {
    // Both operands are full-width here, so a simple splat of the selected
    // lane suffices.
    Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);

    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
    Ops[2] = EmitNeonSplat(Ops[2], cast<ConstantInt>(Ops[3]));
    return Builder.CreateCall(F, {Ops[2], Ops[1], Ops[0]});
  }
  case NEON::BI__builtin_neon_vfmah_lane_f16:
  case NEON::BI__builtin_neon_vfmas_lane_f32:
  case NEON::BI__builtin_neon_vfmah_laneq_f16:
  case NEON::BI__builtin_neon_vfmas_laneq_f32:
  case NEON::BI__builtin_neon_vfmad_lane_f64:
  case NEON::BI__builtin_neon_vfmad_laneq_f64: {
    // Scalar-by-lane fma: extract the lane from the vector operand and call
    // the scalar llvm.fma at the builtin's return type (accumulator last).
    Ops.push_back(EmitScalarExpr(E->getArg(3)));
    llvm::Type *Ty = ConvertType(E->getCallReturnType(getContext()));
    Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
    Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
    return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0]});
  }
  case NEON::BI__builtin_neon_vmull_v:
    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
    // Pick the widening multiply variant by signedness, with the polynomial
    // form overriding both.
    Int = usgn ? Intrinsic::aarch64_neon_umull : Intrinsic::aarch64_neon_smull;
    if (Type.isPoly()) Int = Intrinsic::aarch64_neon_pmull;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull");
  case NEON::BI__builtin_neon_vmax_v:
  case NEON::BI__builtin_neon_vmaxq_v:
    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
    Int = usgn ? Intrinsic::aarch64_neon_umax : Intrinsic::aarch64_neon_smax;
    if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmax;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmax");
  case NEON::BI__builtin_neon_vmaxh_f16: {
    Ops.push_back(EmitScalarExpr(E->getArg(1)));
    Int = Intrinsic::aarch64_neon_fmax;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmax");
  }
  case NEON::BI__builtin_neon_vmin_v:
  case NEON::BI__builtin_neon_vminq_v:
    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
    Int = usgn ? Intrinsic::aarch64_neon_umin : Intrinsic::aarch64_neon_smin;
    if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmin;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmin");
  case NEON::BI__builtin_neon_vminh_f16: {
    Ops.push_back(EmitScalarExpr(E->getArg(1)));
    Int = Intrinsic::aarch64_neon_fmin;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmin");
  }
  case NEON::BI__builtin_neon_vabd_v:
  case NEON::BI__builtin_neon_vabdq_v:
    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
    Int = usgn ? Intrinsic::aarch64_neon_uabd : Intrinsic::aarch64_neon_sabd;
    if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fabd;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vabd");
  case NEON::BI__builtin_neon_vpadal_v:
  case NEON::BI__builtin_neon_vpadalq_v: {
    // Pairwise add-long-and-accumulate. The target intrinsic only performs
    // the pairwise long add ([us]addlp) of the half-width source vector, so
    // build that argument type explicitly, call it on Ops[1], and then add
    // the accumulator (Ops[0]) with an ordinary vector add.
    unsigned ArgElts = VTy->getNumElements();
    llvm::IntegerType *EltTy = cast<IntegerType>(VTy->getElementType());
    unsigned BitWidth = EltTy->getBitWidth();
    llvm::Type *ArgTy = llvm::VectorType::get(
        llvm::IntegerType::get(getLLVMContext(), BitWidth/2), 2*ArgElts);
    llvm::Type* Tys[2] = { VTy, ArgTy };
    Int = usgn ? Intrinsic::aarch64_neon_uaddlp : Intrinsic::aarch64_neon_saddlp;
    SmallVector<llvm::Value*, 1> TmpOps;
    TmpOps.push_back(Ops[1]);
    Function *F = CGM.getIntrinsic(Int, Tys);
    llvm::Value *tmp = EmitNeonCall(F, TmpOps, "vpadal");
    llvm::Value *addend = Builder.CreateBitCast(Ops[0], tmp->getType());
    return Builder.CreateAdd(tmp, addend);
  }
  case NEON::BI__builtin_neon_vpmin_v:
  case NEON::BI__builtin_neon_vpminq_v:
    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
    // Pairwise min/max: intrinsic chosen by signedness, FP overrides both.
    Int = usgn ? Intrinsic::aarch64_neon_uminp : Intrinsic::aarch64_neon_sminp;
    if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fminp;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmin");
  case NEON::BI__builtin_neon_vpmax_v:
  case NEON::BI__builtin_neon_vpmaxq_v:
    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
    Int = usgn ? Intrinsic::aarch64_neon_umaxp : Intrinsic::aarch64_neon_smaxp;
    if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmaxp;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmax");
  // "NM" (NaN-propagation-aware number min/max) variants; the scalar f16
  // forms emit their second operand lazily first.
  case NEON::BI__builtin_neon_vminnm_v:
  case NEON::BI__builtin_neon_vminnmq_v:
    Int = Intrinsic::aarch64_neon_fminnm;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vminnm");
  case NEON::BI__builtin_neon_vminnmh_f16:
    Ops.push_back(EmitScalarExpr(E->getArg(1)));
    Int = Intrinsic::aarch64_neon_fminnm;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vminnm");
  case NEON::BI__builtin_neon_vmaxnm_v:
  case NEON::BI__builtin_neon_vmaxnmq_v:
    Int = Intrinsic::aarch64_neon_fmaxnm;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmaxnm");
  case NEON::BI__builtin_neon_vmaxnmh_f16:
    Ops.push_back(EmitScalarExpr(E->getArg(1)));
    Int = Intrinsic::aarch64_neon_fmaxnm;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmaxnm");
  // Scalar reciprocal step, one case per element type (f32/f64/f16).
  case NEON::BI__builtin_neon_vrecpss_f32: {
    Ops.push_back(EmitScalarExpr(E->getArg(1)));
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, FloatTy),
                        Ops, "vrecps");
  }
  case NEON::BI__builtin_neon_vrecpsd_f64:
    Ops.push_back(EmitScalarExpr(E->getArg(1)));
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, DoubleTy),
                        Ops, "vrecps");
  case NEON::BI__builtin_neon_vrecpsh_f16:
    Ops.push_back(EmitScalarExpr(E->getArg(1)));
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, HalfTy),
                        Ops, "vrecps");
  // Narrowing shift-right-by-immediate family; signedness selects the
  // intrinsic where both forms exist.
  case NEON::BI__builtin_neon_vqshrun_n_v:
    Int = Intrinsic::aarch64_neon_sqshrun;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrun_n");
  case NEON::BI__builtin_neon_vqrshrun_n_v:
    Int = Intrinsic::aarch64_neon_sqrshrun;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrun_n");
  case NEON::BI__builtin_neon_vqshrn_n_v:
    Int = usgn ? Intrinsic::aarch64_neon_uqshrn : Intrinsic::aarch64_neon_sqshrn;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n");
  case NEON::BI__builtin_neon_vrshrn_n_v:
    Int = Intrinsic::aarch64_neon_rshrn;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshrn_n");
  case NEON::BI__builtin_neon_vqrshrn_n_v:
    Int = usgn ? Intrinsic::aarch64_neon_uqrshrn : Intrinsic::aarch64_neon_sqrshrn;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n");
  // Rounding family. Each vrndX maps onto a generic LLVM rounding intrinsic
  // (round/nearbyint/floor/ceil/rint/trunc), except round-to-nearest-even
  // (vrndn) which uses the target-specific frintn. Scalar f16/f32 variants
  // emit their single operand lazily before the call.
  case NEON::BI__builtin_neon_vrndah_f16: {
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Int = Intrinsic::round;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrnda");
  }
  case NEON::BI__builtin_neon_vrnda_v:
  case NEON::BI__builtin_neon_vrndaq_v: {
    Int = Intrinsic::round;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnda");
  }
  case NEON::BI__builtin_neon_vrndih_f16: {
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Int = Intrinsic::nearbyint;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndi");
  }
  case NEON::BI__builtin_neon_vrndmh_f16: {
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Int = Intrinsic::floor;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndm");
  }
  case NEON::BI__builtin_neon_vrndm_v:
  case NEON::BI__builtin_neon_vrndmq_v: {
    Int = Intrinsic::floor;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndm");
  }
  case NEON::BI__builtin_neon_vrndnh_f16: {
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Int = Intrinsic::aarch64_neon_frintn;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndn");
  }
  case NEON::BI__builtin_neon_vrndn_v:
  case NEON::BI__builtin_neon_vrndnq_v: {
    Int = Intrinsic::aarch64_neon_frintn;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndn");
  }
  case NEON::BI__builtin_neon_vrndns_f32: {
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Int = Intrinsic::aarch64_neon_frintn;
    return EmitNeonCall(CGM.getIntrinsic(Int, FloatTy), Ops, "vrndn");
  }
  case NEON::BI__builtin_neon_vrndph_f16: {
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Int = Intrinsic::ceil;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndp");
  }
  case NEON::BI__builtin_neon_vrndp_v:
  case NEON::BI__builtin_neon_vrndpq_v: {
    Int = Intrinsic::ceil;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndp");
  }
  case NEON::BI__builtin_neon_vrndxh_f16: {
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Int = Intrinsic::rint;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndx");
  }
  case NEON::BI__builtin_neon_vrndx_v:
  case NEON::BI__builtin_neon_vrndxq_v: {
    Int = Intrinsic::rint;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndx");
  }
  case NEON::BI__builtin_neon_vrndh_f16: {
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Int = Intrinsic::trunc;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndz");
  }
  case NEON::BI__builtin_neon_vrnd_v:
  case NEON::BI__builtin_neon_vrndq_v: {
    Int = Intrinsic::trunc;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndz");
  }
  // vcvt(q)_f64: integer vector -> f64 vector. Bitcast the operand to the
  // matching integer vector type, then emit a signed or unsigned int-to-fp
  // conversion depending on the builtin's signedness flag.
  case NEON::BI__builtin_neon_vcvt_f64_v:
  case NEON::BI__builtin_neon_vcvtq_f64_v:
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad));
    return usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
                : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
  // vcvt_f64_f32: widen a v2f32 to v2f64 with an fpext. The destination type
  // (Ty) was computed from the builtin's result type before the switch.
  case NEON::BI__builtin_neon_vcvt_f64_f32: {
    assert(Type.getEltType() == NeonTypeFlags::Float64 && quad &&
           "unexpected vcvt_f64_f32 builtin");
    NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float32, false, false);
    Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));

    return Builder.CreateFPExt(Ops[0], Ty, "vcvt");
  }
  // vcvt_f32_f64: narrow a v2f64 to v2f32 with an fptrunc.
  case NEON::BI__builtin_neon_vcvt_f32_f64: {
    assert(Type.getEltType() == NeonTypeFlags::Float32 &&
           "unexpected vcvt_f32_f64 builtin");
    NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float64, false, true);
    Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));

    return Builder.CreateFPTrunc(Ops[0], Ty, "vcvt");
  }
  // vcvt(q)_{s,u}{16,32,64}: fp vector -> integer vector using the default
  // (round-toward-zero) fp-to-int conversion; signedness selects FPToUI/FPToSI.
  case NEON::BI__builtin_neon_vcvt_s32_v:
  case NEON::BI__builtin_neon_vcvt_u32_v:
  case NEON::BI__builtin_neon_vcvt_s64_v:
  case NEON::BI__builtin_neon_vcvt_u64_v:
  case NEON::BI__builtin_neon_vcvt_s16_v:
  case NEON::BI__builtin_neon_vcvt_u16_v:
  case NEON::BI__builtin_neon_vcvtq_s32_v:
  case NEON::BI__builtin_neon_vcvtq_u32_v:
  case NEON::BI__builtin_neon_vcvtq_s64_v:
  case NEON::BI__builtin_neon_vcvtq_u64_v:
  case NEON::BI__builtin_neon_vcvtq_s16_v:
  case NEON::BI__builtin_neon_vcvtq_u16_v: {
    Ops[0] = Builder.CreateBitCast(Ops[0], GetFloatNeonType(this, Type));
    if (usgn)
      return Builder.CreateFPToUI(Ops[0], Ty);
    return Builder.CreateFPToSI(Ops[0], Ty);
  }
  // vcvta: fp -> int with explicit rounding, mapping to the AArch64 FCVTA[US]
  // instructions (round to nearest, ties away from zero). The intrinsic is
  // overloaded on both the integer result type and the fp source type.
  case NEON::BI__builtin_neon_vcvta_s16_v:
  case NEON::BI__builtin_neon_vcvta_u16_v:
  case NEON::BI__builtin_neon_vcvta_s32_v:
  case NEON::BI__builtin_neon_vcvtaq_s16_v:
  case NEON::BI__builtin_neon_vcvtaq_s32_v:
  case NEON::BI__builtin_neon_vcvta_u32_v:
  case NEON::BI__builtin_neon_vcvtaq_u16_v:
  case NEON::BI__builtin_neon_vcvtaq_u32_v:
  case NEON::BI__builtin_neon_vcvta_s64_v:
  case NEON::BI__builtin_neon_vcvtaq_s64_v:
  case NEON::BI__builtin_neon_vcvta_u64_v:
  case NEON::BI__builtin_neon_vcvtaq_u64_v: {
    Int = usgn ? Intrinsic::aarch64_neon_fcvtau : Intrinsic::aarch64_neon_fcvtas;
    llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvta");
  }
  // vcvtm: fp -> int rounding toward minus infinity (FCVTM[US]).
  case NEON::BI__builtin_neon_vcvtm_s16_v:
  case NEON::BI__builtin_neon_vcvtm_s32_v:
  case NEON::BI__builtin_neon_vcvtmq_s16_v:
  case NEON::BI__builtin_neon_vcvtmq_s32_v:
  case NEON::BI__builtin_neon_vcvtm_u16_v:
  case NEON::BI__builtin_neon_vcvtm_u32_v:
  case NEON::BI__builtin_neon_vcvtmq_u16_v:
  case NEON::BI__builtin_neon_vcvtmq_u32_v:
  case NEON::BI__builtin_neon_vcvtm_s64_v:
  case NEON::BI__builtin_neon_vcvtmq_s64_v:
  case NEON::BI__builtin_neon_vcvtm_u64_v:
  case NEON::BI__builtin_neon_vcvtmq_u64_v: {
    Int = usgn ? Intrinsic::aarch64_neon_fcvtmu : Intrinsic::aarch64_neon_fcvtms;
    llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtm");
  }
  // vcvtn: fp -> int rounding to nearest, ties to even (FCVTN[US]).
  case NEON::BI__builtin_neon_vcvtn_s16_v:
  case NEON::BI__builtin_neon_vcvtn_s32_v:
  case NEON::BI__builtin_neon_vcvtnq_s16_v:
  case NEON::BI__builtin_neon_vcvtnq_s32_v:
  case NEON::BI__builtin_neon_vcvtn_u16_v:
  case NEON::BI__builtin_neon_vcvtn_u32_v:
  case NEON::BI__builtin_neon_vcvtnq_u16_v:
  case NEON::BI__builtin_neon_vcvtnq_u32_v:
  case NEON::BI__builtin_neon_vcvtn_s64_v:
  case NEON::BI__builtin_neon_vcvtnq_s64_v:
  case NEON::BI__builtin_neon_vcvtn_u64_v:
  case NEON::BI__builtin_neon_vcvtnq_u64_v: {
    Int = usgn ? Intrinsic::aarch64_neon_fcvtnu : Intrinsic::aarch64_neon_fcvtns;
    llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtn");
  }
  // vcvtp: fp -> int rounding toward plus infinity (FCVTP[US]).
  case NEON::BI__builtin_neon_vcvtp_s16_v:
  case NEON::BI__builtin_neon_vcvtp_s32_v:
  case NEON::BI__builtin_neon_vcvtpq_s16_v:
  case NEON::BI__builtin_neon_vcvtpq_s32_v:
  case NEON::BI__builtin_neon_vcvtp_u16_v:
  case NEON::BI__builtin_neon_vcvtp_u32_v:
  case NEON::BI__builtin_neon_vcvtpq_u16_v:
  case NEON::BI__builtin_neon_vcvtpq_u32_v:
  case NEON::BI__builtin_neon_vcvtp_s64_v:
  case NEON::BI__builtin_neon_vcvtpq_s64_v:
  case NEON::BI__builtin_neon_vcvtp_u64_v:
  case NEON::BI__builtin_neon_vcvtpq_u64_v: {
    Int = usgn ? Intrinsic::aarch64_neon_fcvtpu : Intrinsic::aarch64_neon_fcvtps;
    llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtp");
  }
  // vmulx(q): extended floating-point multiply -> aarch64.neon.fmulx.
  case NEON::BI__builtin_neon_vmulx_v:
  case NEON::BI__builtin_neon_vmulxq_v: {
    Int = Intrinsic::aarch64_neon_fmulx;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx");
  }
  case NEON::BI__builtin_neon_vmulxh_lane_f16:
  case NEON::BI__builtin_neon_vmulxh_laneq_f16: {
    // vmulx_lane should be mapped to Neon scalar mulx after
    // extracting the scalar element
    // (the lane index arrives as arg 2; it is consumed by the extract and
    // then popped so only the two scalar operands reach the intrinsic).
    Ops.push_back(EmitScalarExpr(E->getArg(2)));
    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
    Ops.pop_back();
    Int = Intrinsic::aarch64_neon_fmulx;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmulx");
  }
  case NEON::BI__builtin_neon_vmul_lane_v:
  case NEON::BI__builtin_neon_vmul_laneq_v: {
    // v1f64 vmul_lane should be mapped to Neon scalar mul lane
    // (laneq variants index into a 128-bit v2f64, hence Quad = true).
    bool Quad = false;
    if (BuiltinID == NEON::BI__builtin_neon_vmul_laneq_v)
      Quad = true;
    Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
    llvm::Type *VTy = GetNeonType(this,
      NeonTypeFlags(NeonTypeFlags::Float64, false, Quad));
    Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
    Value *Result = Builder.CreateFMul(Ops[0], Ops[1]);
    // Cast the scalar result back to the builtin's v1f64 result type.
    return Builder.CreateBitCast(Result, Ty);
  }
  // Scalar negations: integer neg for s64, fp neg for f16.
  case NEON::BI__builtin_neon_vnegd_s64:
    return Builder.CreateNeg(EmitScalarExpr(E->getArg(0)), "vnegd");
  case NEON::BI__builtin_neon_vnegh_f16:
    return Builder.CreateFNeg(EmitScalarExpr(E->getArg(0)), "vnegh");
  // Pairwise fp max/min honoring the numeric (NaN-propagation) semantics of
  // FMAXNMP/FMINNMP.
  case NEON::BI__builtin_neon_vpmaxnm_v:
  case NEON::BI__builtin_neon_vpmaxnmq_v: {
    Int = Intrinsic::aarch64_neon_fmaxnmp;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmaxnm");
  }
  case NEON::BI__builtin_neon_vpminnm_v:
  case NEON::BI__builtin_neon_vpminnmq_v: {
    Int = Intrinsic::aarch64_neon_fminnmp;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpminnm");
  }
  // Scalar f16 square root; operand pushed manually as for other scalar cases.
  case NEON::BI__builtin_neon_vsqrth_f16: {
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Int = Intrinsic::sqrt;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vsqrt");
  }
  // Vector square root -> llvm.sqrt on the builtin's vector type.
  case NEON::BI__builtin_neon_vsqrt_v:
  case NEON::BI__builtin_neon_vsqrtq_v: {
    Int = Intrinsic::sqrt;
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqrt");
  }
  // vrbit: bit reversal within each element (RBIT).
  case NEON::BI__builtin_neon_vrbit_v:
  case NEON::BI__builtin_neon_vrbitq_v: {
    Int = Intrinsic::aarch64_neon_rbit;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrbit");
  }
  // vaddv / vaddvq: add across all lanes of the vector. The [su]addv
  // intrinsic always yields an i32, which is then truncated back to the
  // element width of the source vector. Unsigned entries fall through to
  // their signed sibling after setting usgn.
  case NEON::BI__builtin_neon_vaddv_u8:
    // FIXME: These are handled by the AArch64 scalar code.
    usgn = true;
    LLVM_FALLTHROUGH;
  case NEON::BI__builtin_neon_vaddv_s8: {
    Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vaddv_u16:
    usgn = true;
    LLVM_FALLTHROUGH;
  case NEON::BI__builtin_neon_vaddv_s16: {
    Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vaddvq_u8:
    usgn = true;
    LLVM_FALLTHROUGH;
  case NEON::BI__builtin_neon_vaddvq_s8: {
    Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 16);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vaddvq_u16:
    usgn = true;
    LLVM_FALLTHROUGH;
  case NEON::BI__builtin_neon_vaddvq_s16: {
    Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  // vmaxv / vmaxvq: maximum across all lanes. As with vaddv, the integer
  // intrinsics return i32 and the result is truncated to the element width;
  // the f16 variants return half (the trailing CreateTrunc is a no-op there
  // since source and destination types already match).
  case NEON::BI__builtin_neon_vmaxv_u8: {
    Int = Intrinsic::aarch64_neon_umaxv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vmaxv_u16: {
    Int = Intrinsic::aarch64_neon_umaxv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vmaxvq_u8: {
    Int = Intrinsic::aarch64_neon_umaxv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 16);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vmaxvq_u16: {
    Int = Intrinsic::aarch64_neon_umaxv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vmaxv_s8: {
    Int = Intrinsic::aarch64_neon_smaxv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vmaxv_s16: {
    Int = Intrinsic::aarch64_neon_smaxv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vmaxvq_s8: {
    Int = Intrinsic::aarch64_neon_smaxv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 16);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vmaxvq_s16: {
    Int = Intrinsic::aarch64_neon_smaxv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vmaxv_f16: {
    Int = Intrinsic::aarch64_neon_fmaxv;
    Ty = HalfTy;
    VTy = llvm::VectorType::get(HalfTy, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  case NEON::BI__builtin_neon_vmaxvq_f16: {
    Int = Intrinsic::aarch64_neon_fmaxv;
    Ty = HalfTy;
    VTy = llvm::VectorType::get(HalfTy, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  // vminv / vminvq: minimum across all lanes; structure mirrors the vmaxv
  // cases above (i32 intrinsic result truncated to element width; the f16
  // trunc is an identity conversion).
  case NEON::BI__builtin_neon_vminv_u8: {
    Int = Intrinsic::aarch64_neon_uminv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vminv_u16: {
    Int = Intrinsic::aarch64_neon_uminv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vminvq_u8: {
    Int = Intrinsic::aarch64_neon_uminv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 16);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vminvq_u16: {
    Int = Intrinsic::aarch64_neon_uminv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vminv_s8: {
    Int = Intrinsic::aarch64_neon_sminv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vminv_s16: {
    Int = Intrinsic::aarch64_neon_sminv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vminvq_s8: {
    Int = Intrinsic::aarch64_neon_sminv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 16);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vminvq_s16: {
    Int = Intrinsic::aarch64_neon_sminv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vminv_f16: {
    Int = Intrinsic::aarch64_neon_fminv;
    Ty = HalfTy;
    VTy = llvm::VectorType::get(HalfTy, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  case NEON::BI__builtin_neon_vminvq_f16: {
    Int = Intrinsic::aarch64_neon_fminv;
    Ty = HalfTy;
    VTy = llvm::VectorType::get(HalfTy, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  // vmaxnmv / vminnmv (f16): across-vector fp max/min with FMAXNMV/FMINNMV
  // numeric NaN handling. The CreateTrunc is an identity cast (half -> half).
  case NEON::BI__builtin_neon_vmaxnmv_f16: {
    Int = Intrinsic::aarch64_neon_fmaxnmv;
    Ty = HalfTy;
    VTy = llvm::VectorType::get(HalfTy, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  case NEON::BI__builtin_neon_vmaxnmvq_f16: {
    Int = Intrinsic::aarch64_neon_fmaxnmv;
    Ty = HalfTy;
    VTy = llvm::VectorType::get(HalfTy, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  case NEON::BI__builtin_neon_vminnmv_f16: {
    Int = Intrinsic::aarch64_neon_fminnmv;
    Ty = HalfTy;
    VTy = llvm::VectorType::get(HalfTy, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  case NEON::BI__builtin_neon_vminnmvq_f16: {
    Int = Intrinsic::aarch64_neon_fminnmv;
    Ty = HalfTy;
    VTy = llvm::VectorType::get(HalfTy, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  // vmul_n_f64: multiply a v1f64 by a scalar double; lowered as a plain
  // scalar fmul after bitcasting both operands to double.
  case NEON::BI__builtin_neon_vmul_n_f64: {
    Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
    Value *RHS = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)), DoubleTy);
    return Builder.CreateFMul(Ops[0], RHS);
  }
  // vaddlv / vaddlvq: widening add across all lanes. The [su]addlv intrinsic
  // yields i32; for i8-element sources the builtin's result is 16 bits wide,
  // so those cases truncate, while i16-element sources return the i32
  // directly.
  case NEON::BI__builtin_neon_vaddlv_u8: {
    Int = Intrinsic::aarch64_neon_uaddlv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vaddlv_u16: {
    Int = Intrinsic::aarch64_neon_uaddlv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
  }
  case NEON::BI__builtin_neon_vaddlvq_u8: {
    Int = Intrinsic::aarch64_neon_uaddlv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 16);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vaddlvq_u16: {
    Int = Intrinsic::aarch64_neon_uaddlv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
  }
  case NEON::BI__builtin_neon_vaddlv_s8: {
    Int = Intrinsic::aarch64_neon_saddlv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vaddlv_s16: {
    Int = Intrinsic::aarch64_neon_saddlv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
  }
  case NEON::BI__builtin_neon_vaddlvq_s8: {
    Int = Intrinsic::aarch64_neon_saddlv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 16);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vaddlvq_s16: {
    Int = Intrinsic::aarch64_neon_saddlv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
  }
  // vsri_n: shift right and insert (SRI) -> target intrinsic.
  case NEON::BI__builtin_neon_vsri_n_v:
  case NEON::BI__builtin_neon_vsriq_n_v: {
    Int = Intrinsic::aarch64_neon_vsri;
    llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty);
    return EmitNeonCall(Intrin, Ops, "vsri_n");
  }
  // vsli_n: shift left and insert (SLI) -> target intrinsic.
  case NEON::BI__builtin_neon_vsli_n_v:
  case NEON::BI__builtin_neon_vsliq_n_v: {
    Int = Intrinsic::aarch64_neon_vsli;
    llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty);
    return EmitNeonCall(Intrin, Ops, "vsli_n");
  }
  // vsra_n: shift right (arithmetic or logical per usgn) then accumulate.
  case NEON::BI__builtin_neon_vsra_n_v:
  case NEON::BI__builtin_neon_vsraq_n_v:
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n");
    return Builder.CreateAdd(Ops[0], Ops[1]);
  // vrsra_n: rounding shift right then accumulate. The rounding shift is
  // emitted via [su]rshl with the shift amount negated (the trailing
  // EmitNeonCall arguments: shift operand index 1, negate = true).
  case NEON::BI__builtin_neon_vrsra_n_v:
  case NEON::BI__builtin_neon_vrsraq_n_v: {
    Int = usgn ? Intrinsic::aarch64_neon_urshl : Intrinsic::aarch64_neon_srshl;
    SmallVector<llvm::Value*,2> TmpOps;
    TmpOps.push_back(Ops[1]);
    TmpOps.push_back(Ops[2]);
    Function* F = CGM.getIntrinsic(Int, Ty);
    llvm::Value *tmp = EmitNeonCall(F, TmpOps, "vrshr_n", 1, true);
    Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
    return Builder.CreateAdd(Ops[0], tmp);
  }
  // vld1(q): whole-vector load. Alignment is hard-coded to the register
  // width (8 bytes for 64-bit d-form, 16 for 128-bit q-form).
  // NOTE(review): this assumes the pointer is register-width aligned rather
  // than using the operand's natural alignment — confirm against callers.
  case NEON::BI__builtin_neon_vld1_v:
  case NEON::BI__builtin_neon_vld1q_v: {
    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(VTy));
    auto Alignment = CharUnits::fromQuantity(
        BuiltinID == NEON::BI__builtin_neon_vld1_v ? 8 : 16);
    return Builder.CreateAlignedLoad(VTy, Ops[0], Alignment);
  }
  // vst1(q): whole-vector store through a pointer bitcast to the vector type.
  case NEON::BI__builtin_neon_vst1_v:
  case NEON::BI__builtin_neon_vst1q_v:
    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(VTy));
    Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  // vld1_lane: load one element and insert it into an existing vector at the
  // lane given by Ops[2].
  case NEON::BI__builtin_neon_vld1_lane_v:
  case NEON::BI__builtin_neon_vld1q_lane_v: {
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ty = llvm::PointerType::getUnqual(VTy->getElementType());
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    auto Alignment = CharUnits::fromQuantity(
        BuiltinID == NEON::BI__builtin_neon_vld1_lane_v ? 8 : 16);
    Ops[0] =
        Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0], Alignment);
    return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vld1_lane");
  }
  // vld1_dup: load one element and splat it across all lanes (insert into
  // lane 0 of an undef vector, then shuffle-broadcast via EmitNeonSplat).
  case NEON::BI__builtin_neon_vld1_dup_v:
  case NEON::BI__builtin_neon_vld1q_dup_v: {
    Value *V = UndefValue::get(Ty);
    Ty = llvm::PointerType::getUnqual(VTy->getElementType());
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    auto Alignment = CharUnits::fromQuantity(
        BuiltinID == NEON::BI__builtin_neon_vld1_dup_v ? 8 : 16);
    Ops[0] =
        Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0], Alignment);
    llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
    Ops[0] = Builder.CreateInsertElement(V, Ops[0], CI);
    return EmitNeonSplat(Ops[0], CI);
  }
  // vst1_lane: extract the requested lane and store just that element.
  case NEON::BI__builtin_neon_vst1_lane_v:
  case NEON::BI__builtin_neon_vst1q_lane_v:
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
    return Builder.CreateDefaultAlignedStore(Ops[1],
                                             Builder.CreateBitCast(Ops[0], Ty));
  // vld2/vld3/vld4 (and their _dup variants): multi-vector structured loads.
  // Ops[0] is the address of the sret-style result aggregate and Ops[1] the
  // source pointer; the ldN intrinsic returns a struct of vectors which is
  // stored back through Ops[0]. The _dup variants use the ldNr ("replicate")
  // intrinsics, which take a pointer to a single element.
  case NEON::BI__builtin_neon_vld2_v:
  case NEON::BI__builtin_neon_vld2q_v: {
    llvm::Type *PTy = llvm::PointerType::getUnqual(VTy);
    Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
    llvm::Type *Tys[2] = { VTy, PTy };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
    Ops[0] = Builder.CreateBitCast(Ops[0],
                llvm::PointerType::getUnqual(Ops[1]->getType()));
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld3_v:
  case NEON::BI__builtin_neon_vld3q_v: {
    llvm::Type *PTy = llvm::PointerType::getUnqual(VTy);
    Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
    llvm::Type *Tys[2] = { VTy, PTy };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
    Ops[0] = Builder.CreateBitCast(Ops[0],
                llvm::PointerType::getUnqual(Ops[1]->getType()));
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld4_v:
  case NEON::BI__builtin_neon_vld4q_v: {
    llvm::Type *PTy = llvm::PointerType::getUnqual(VTy);
    Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
    llvm::Type *Tys[2] = { VTy, PTy };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
    Ops[0] = Builder.CreateBitCast(Ops[0],
                llvm::PointerType::getUnqual(Ops[1]->getType()));
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld2_dup_v:
  case NEON::BI__builtin_neon_vld2q_dup_v: {
    llvm::Type *PTy =
      llvm::PointerType::getUnqual(VTy->getElementType());
    Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
    llvm::Type *Tys[2] = { VTy, PTy };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2r, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
    Ops[0] = Builder.CreateBitCast(Ops[0],
                llvm::PointerType::getUnqual(Ops[1]->getType()));
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld3_dup_v:
  case NEON::BI__builtin_neon_vld3q_dup_v: {
    llvm::Type *PTy =
      llvm::PointerType::getUnqual(VTy->getElementType());
    Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
    llvm::Type *Tys[2] = { VTy, PTy };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3r, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
    Ops[0] = Builder.CreateBitCast(Ops[0],
                llvm::PointerType::getUnqual(Ops[1]->getType()));
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld4_dup_v:
  case NEON::BI__builtin_neon_vld4q_dup_v: {
    llvm::Type *PTy =
      llvm::PointerType::getUnqual(VTy->getElementType());
    Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
    llvm::Type *Tys[2] = { VTy, PTy };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4r, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
    Ops[0] = Builder.CreateBitCast(Ops[0],
                llvm::PointerType::getUnqual(Ops[1]->getType()));
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  // vld2/3/4_lane: load one element into a given lane of N existing vectors.
  // The push_back/erase pair rotates the source pointer (originally Ops[1])
  // to the back, because the ldNlane intrinsic takes the vectors and the
  // lane index first and the pointer last. The lane index is zero-extended
  // to i64 as the intrinsic requires, the call result struct is stored
  // through the sret pointer in Ops[0].
  case NEON::BI__builtin_neon_vld2_lane_v:
  case NEON::BI__builtin_neon_vld2q_lane_v: {
    llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2lane, Tys);
    Ops.push_back(Ops[1]);
    Ops.erase(Ops.begin()+1);
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
    Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty);
    Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), "vld2_lane");
    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld3_lane_v:
  case NEON::BI__builtin_neon_vld3q_lane_v: {
    llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3lane, Tys);
    Ops.push_back(Ops[1]);
    Ops.erase(Ops.begin()+1);
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
    Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
    Ops[4] = Builder.CreateZExt(Ops[4], Int64Ty);
    Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), "vld3_lane");
    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld4_lane_v:
  case NEON::BI__builtin_neon_vld4q_lane_v: {
    llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4lane, Tys);
    Ops.push_back(Ops[1]);
    Ops.erase(Ops.begin()+1);
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
    Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
    Ops[4] = Builder.CreateBitCast(Ops[4], Ty);
    Ops[5] = Builder.CreateZExt(Ops[5], Int64Ty);
    Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), "vld4_lane");
    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  // vst2/3/4 (and _lane variants): multi-vector structured stores. The
  // push_back/erase pair rotates the destination pointer from the front of
  // Ops to the back, matching the stN intrinsic's operand order (vectors
  // [, lane] , pointer). Lane variants also zero-extend the lane index to
  // i64 before the call.
  case NEON::BI__builtin_neon_vst2_v:
  case NEON::BI__builtin_neon_vst2q_v: {
    Ops.push_back(Ops[0]);
    Ops.erase(Ops.begin());
    llvm::Type *Tys[2] = { VTy, Ops[2]->getType() };
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st2, Tys),
                        Ops, "");
  }
  case NEON::BI__builtin_neon_vst2_lane_v:
  case NEON::BI__builtin_neon_vst2q_lane_v: {
    Ops.push_back(Ops[0]);
    Ops.erase(Ops.begin());
    Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
    llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st2lane, Tys),
                        Ops, "");
  }
  case NEON::BI__builtin_neon_vst3_v:
  case NEON::BI__builtin_neon_vst3q_v: {
    Ops.push_back(Ops[0]);
    Ops.erase(Ops.begin());
    llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st3, Tys),
                        Ops, "");
  }
  case NEON::BI__builtin_neon_vst3_lane_v:
  case NEON::BI__builtin_neon_vst3q_lane_v: {
    Ops.push_back(Ops[0]);
    Ops.erase(Ops.begin());
    Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty);
    llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st3lane, Tys),
                        Ops, "");
  }
  case NEON::BI__builtin_neon_vst4_v:
  case NEON::BI__builtin_neon_vst4q_v: {
    Ops.push_back(Ops[0]);
    Ops.erase(Ops.begin());
    llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st4, Tys),
                        Ops, "");
  }
  case NEON::BI__builtin_neon_vst4_lane_v:
  case NEON::BI__builtin_neon_vst4q_lane_v: {
    Ops.push_back(Ops[0]);
    Ops.erase(Ops.begin());
    Ops[4] = Builder.CreateZExt(Ops[4], Int64Ty);
    llvm::Type *Tys[2] = { VTy, Ops[5]->getType() };
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st4lane, Tys),
                        Ops, "");
  }
8727   case NEON::BI__builtin_neon_vtrn_v:
8728   case NEON::BI__builtin_neon_vtrnq_v: {
8729     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
8730     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8731     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
8732     Value *SV = nullptr;
8733 
8734     for (unsigned vi = 0; vi != 2; ++vi) {
8735       SmallVector<uint32_t, 16> Indices;
8736       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
8737         Indices.push_back(i+vi);
8738         Indices.push_back(i+e+vi);
8739       }
8740       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
8741       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vtrn");
8742       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
8743     }
8744     return SV;
8745   }
8746   case NEON::BI__builtin_neon_vuzp_v:
8747   case NEON::BI__builtin_neon_vuzpq_v: {
8748     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
8749     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8750     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
8751     Value *SV = nullptr;
8752 
8753     for (unsigned vi = 0; vi != 2; ++vi) {
8754       SmallVector<uint32_t, 16> Indices;
8755       for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
8756         Indices.push_back(2*i+vi);
8757 
8758       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
8759       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vuzp");
8760       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
8761     }
8762     return SV;
8763   }
8764   case NEON::BI__builtin_neon_vzip_v:
8765   case NEON::BI__builtin_neon_vzipq_v: {
8766     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
8767     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8768     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
8769     Value *SV = nullptr;
8770 
8771     for (unsigned vi = 0; vi != 2; ++vi) {
8772       SmallVector<uint32_t, 16> Indices;
8773       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
8774         Indices.push_back((i + vi*e) >> 1);
8775         Indices.push_back(((i + vi*e) >> 1)+e);
8776       }
8777       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
8778       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vzip");
8779       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
8780     }
8781     return SV;
8782   }
8783   case NEON::BI__builtin_neon_vqtbl1q_v: {
8784     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl1, Ty),
8785                         Ops, "vtbl1");
8786   }
8787   case NEON::BI__builtin_neon_vqtbl2q_v: {
8788     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl2, Ty),
8789                         Ops, "vtbl2");
8790   }
8791   case NEON::BI__builtin_neon_vqtbl3q_v: {
8792     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl3, Ty),
8793                         Ops, "vtbl3");
8794   }
8795   case NEON::BI__builtin_neon_vqtbl4q_v: {
8796     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl4, Ty),
8797                         Ops, "vtbl4");
8798   }
8799   case NEON::BI__builtin_neon_vqtbx1q_v: {
8800     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx1, Ty),
8801                         Ops, "vtbx1");
8802   }
8803   case NEON::BI__builtin_neon_vqtbx2q_v: {
8804     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx2, Ty),
8805                         Ops, "vtbx2");
8806   }
8807   case NEON::BI__builtin_neon_vqtbx3q_v: {
8808     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx3, Ty),
8809                         Ops, "vtbx3");
8810   }
8811   case NEON::BI__builtin_neon_vqtbx4q_v: {
8812     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx4, Ty),
8813                         Ops, "vtbx4");
8814   }
8815   case NEON::BI__builtin_neon_vsqadd_v:
8816   case NEON::BI__builtin_neon_vsqaddq_v: {
8817     Int = Intrinsic::aarch64_neon_usqadd;
8818     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqadd");
8819   }
8820   case NEON::BI__builtin_neon_vuqadd_v:
8821   case NEON::BI__builtin_neon_vuqaddq_v: {
8822     Int = Intrinsic::aarch64_neon_suqadd;
8823     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd");
8824   }
8825   case AArch64::BI__iso_volatile_load8:
8826   case AArch64::BI__iso_volatile_load16:
8827   case AArch64::BI__iso_volatile_load32:
8828   case AArch64::BI__iso_volatile_load64:
8829     return EmitISOVolatileLoad(E);
8830   case AArch64::BI__iso_volatile_store8:
8831   case AArch64::BI__iso_volatile_store16:
8832   case AArch64::BI__iso_volatile_store32:
8833   case AArch64::BI__iso_volatile_store64:
8834     return EmitISOVolatileStore(E);
8835   case AArch64::BI_BitScanForward:
8836   case AArch64::BI_BitScanForward64:
8837     return EmitMSVCBuiltinExpr(MSVCIntrin::_BitScanForward, E);
8838   case AArch64::BI_BitScanReverse:
8839   case AArch64::BI_BitScanReverse64:
8840     return EmitMSVCBuiltinExpr(MSVCIntrin::_BitScanReverse, E);
8841   case AArch64::BI_InterlockedAnd64:
8842     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedAnd, E);
8843   case AArch64::BI_InterlockedExchange64:
8844     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchange, E);
8845   case AArch64::BI_InterlockedExchangeAdd64:
8846     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeAdd, E);
8847   case AArch64::BI_InterlockedExchangeSub64:
8848     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeSub, E);
8849   case AArch64::BI_InterlockedOr64:
8850     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedOr, E);
8851   case AArch64::BI_InterlockedXor64:
8852     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedXor, E);
8853   case AArch64::BI_InterlockedDecrement64:
8854     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedDecrement, E);
8855   case AArch64::BI_InterlockedIncrement64:
8856     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedIncrement, E);
8857   case AArch64::BI_InterlockedExchangeAdd8_acq:
8858   case AArch64::BI_InterlockedExchangeAdd16_acq:
8859   case AArch64::BI_InterlockedExchangeAdd_acq:
8860   case AArch64::BI_InterlockedExchangeAdd64_acq:
8861     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeAdd_acq, E);
8862   case AArch64::BI_InterlockedExchangeAdd8_rel:
8863   case AArch64::BI_InterlockedExchangeAdd16_rel:
8864   case AArch64::BI_InterlockedExchangeAdd_rel:
8865   case AArch64::BI_InterlockedExchangeAdd64_rel:
8866     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeAdd_rel, E);
8867   case AArch64::BI_InterlockedExchangeAdd8_nf:
8868   case AArch64::BI_InterlockedExchangeAdd16_nf:
8869   case AArch64::BI_InterlockedExchangeAdd_nf:
8870   case AArch64::BI_InterlockedExchangeAdd64_nf:
8871     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeAdd_nf, E);
8872   case AArch64::BI_InterlockedExchange8_acq:
8873   case AArch64::BI_InterlockedExchange16_acq:
8874   case AArch64::BI_InterlockedExchange_acq:
8875   case AArch64::BI_InterlockedExchange64_acq:
8876     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchange_acq, E);
8877   case AArch64::BI_InterlockedExchange8_rel:
8878   case AArch64::BI_InterlockedExchange16_rel:
8879   case AArch64::BI_InterlockedExchange_rel:
8880   case AArch64::BI_InterlockedExchange64_rel:
8881     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchange_rel, E);
8882   case AArch64::BI_InterlockedExchange8_nf:
8883   case AArch64::BI_InterlockedExchange16_nf:
8884   case AArch64::BI_InterlockedExchange_nf:
8885   case AArch64::BI_InterlockedExchange64_nf:
8886     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchange_nf, E);
8887   case AArch64::BI_InterlockedCompareExchange8_acq:
8888   case AArch64::BI_InterlockedCompareExchange16_acq:
8889   case AArch64::BI_InterlockedCompareExchange_acq:
8890   case AArch64::BI_InterlockedCompareExchange64_acq:
8891     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedCompareExchange_acq, E);
8892   case AArch64::BI_InterlockedCompareExchange8_rel:
8893   case AArch64::BI_InterlockedCompareExchange16_rel:
8894   case AArch64::BI_InterlockedCompareExchange_rel:
8895   case AArch64::BI_InterlockedCompareExchange64_rel:
8896     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedCompareExchange_rel, E);
8897   case AArch64::BI_InterlockedCompareExchange8_nf:
8898   case AArch64::BI_InterlockedCompareExchange16_nf:
8899   case AArch64::BI_InterlockedCompareExchange_nf:
8900   case AArch64::BI_InterlockedCompareExchange64_nf:
8901     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedCompareExchange_nf, E);
8902   case AArch64::BI_InterlockedOr8_acq:
8903   case AArch64::BI_InterlockedOr16_acq:
8904   case AArch64::BI_InterlockedOr_acq:
8905   case AArch64::BI_InterlockedOr64_acq:
8906     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedOr_acq, E);
8907   case AArch64::BI_InterlockedOr8_rel:
8908   case AArch64::BI_InterlockedOr16_rel:
8909   case AArch64::BI_InterlockedOr_rel:
8910   case AArch64::BI_InterlockedOr64_rel:
8911     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedOr_rel, E);
8912   case AArch64::BI_InterlockedOr8_nf:
8913   case AArch64::BI_InterlockedOr16_nf:
8914   case AArch64::BI_InterlockedOr_nf:
8915   case AArch64::BI_InterlockedOr64_nf:
8916     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedOr_nf, E);
8917   case AArch64::BI_InterlockedXor8_acq:
8918   case AArch64::BI_InterlockedXor16_acq:
8919   case AArch64::BI_InterlockedXor_acq:
8920   case AArch64::BI_InterlockedXor64_acq:
8921     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedXor_acq, E);
8922   case AArch64::BI_InterlockedXor8_rel:
8923   case AArch64::BI_InterlockedXor16_rel:
8924   case AArch64::BI_InterlockedXor_rel:
8925   case AArch64::BI_InterlockedXor64_rel:
8926     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedXor_rel, E);
8927   case AArch64::BI_InterlockedXor8_nf:
8928   case AArch64::BI_InterlockedXor16_nf:
8929   case AArch64::BI_InterlockedXor_nf:
8930   case AArch64::BI_InterlockedXor64_nf:
8931     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedXor_nf, E);
8932   case AArch64::BI_InterlockedAnd8_acq:
8933   case AArch64::BI_InterlockedAnd16_acq:
8934   case AArch64::BI_InterlockedAnd_acq:
8935   case AArch64::BI_InterlockedAnd64_acq:
8936     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedAnd_acq, E);
8937   case AArch64::BI_InterlockedAnd8_rel:
8938   case AArch64::BI_InterlockedAnd16_rel:
8939   case AArch64::BI_InterlockedAnd_rel:
8940   case AArch64::BI_InterlockedAnd64_rel:
8941     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedAnd_rel, E);
8942   case AArch64::BI_InterlockedAnd8_nf:
8943   case AArch64::BI_InterlockedAnd16_nf:
8944   case AArch64::BI_InterlockedAnd_nf:
8945   case AArch64::BI_InterlockedAnd64_nf:
8946     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedAnd_nf, E);
8947   case AArch64::BI_InterlockedIncrement16_acq:
8948   case AArch64::BI_InterlockedIncrement_acq:
8949   case AArch64::BI_InterlockedIncrement64_acq:
8950     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedIncrement_acq, E);
8951   case AArch64::BI_InterlockedIncrement16_rel:
8952   case AArch64::BI_InterlockedIncrement_rel:
8953   case AArch64::BI_InterlockedIncrement64_rel:
8954     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedIncrement_rel, E);
8955   case AArch64::BI_InterlockedIncrement16_nf:
8956   case AArch64::BI_InterlockedIncrement_nf:
8957   case AArch64::BI_InterlockedIncrement64_nf:
8958     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedIncrement_nf, E);
8959   case AArch64::BI_InterlockedDecrement16_acq:
8960   case AArch64::BI_InterlockedDecrement_acq:
8961   case AArch64::BI_InterlockedDecrement64_acq:
8962     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedDecrement_acq, E);
8963   case AArch64::BI_InterlockedDecrement16_rel:
8964   case AArch64::BI_InterlockedDecrement_rel:
8965   case AArch64::BI_InterlockedDecrement64_rel:
8966     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedDecrement_rel, E);
8967   case AArch64::BI_InterlockedDecrement16_nf:
8968   case AArch64::BI_InterlockedDecrement_nf:
8969   case AArch64::BI_InterlockedDecrement64_nf:
8970     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedDecrement_nf, E);
8971 
8972   case AArch64::BI_InterlockedAdd: {
8973     Value *Arg0 = EmitScalarExpr(E->getArg(0));
8974     Value *Arg1 = EmitScalarExpr(E->getArg(1));
8975     AtomicRMWInst *RMWI = Builder.CreateAtomicRMW(
8976       AtomicRMWInst::Add, Arg0, Arg1,
8977       llvm::AtomicOrdering::SequentiallyConsistent);
8978     return Builder.CreateAdd(RMWI, Arg1);
8979   }
8980   }
8981 }
8982 
8983 llvm::Value *CodeGenFunction::
8984 BuildVector(ArrayRef<llvm::Value*> Ops) {
8985   assert((Ops.size() & (Ops.size() - 1)) == 0 &&
8986          "Not a power-of-two sized vector!");
8987   bool AllConstants = true;
8988   for (unsigned i = 0, e = Ops.size(); i != e && AllConstants; ++i)
8989     AllConstants &= isa<Constant>(Ops[i]);
8990 
8991   // If this is a constant vector, create a ConstantVector.
8992   if (AllConstants) {
8993     SmallVector<llvm::Constant*, 16> CstOps;
8994     for (unsigned i = 0, e = Ops.size(); i != e; ++i)
8995       CstOps.push_back(cast<Constant>(Ops[i]));
8996     return llvm::ConstantVector::get(CstOps);
8997   }
8998 
8999   // Otherwise, insertelement the values to build the vector.
9000   Value *Result =
9001     llvm::UndefValue::get(llvm::VectorType::get(Ops[0]->getType(), Ops.size()));
9002 
9003   for (unsigned i = 0, e = Ops.size(); i != e; ++i)
9004     Result = Builder.CreateInsertElement(Result, Ops[i], Builder.getInt32(i));
9005 
9006   return Result;
9007 }
9008 
9009 // Convert the mask from an integer type to a vector of i1.
9010 static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask,
9011                               unsigned NumElts) {
9012 
9013   llvm::VectorType *MaskTy = llvm::VectorType::get(CGF.Builder.getInt1Ty(),
9014                          cast<IntegerType>(Mask->getType())->getBitWidth());
9015   Value *MaskVec = CGF.Builder.CreateBitCast(Mask, MaskTy);
9016 
9017   // If we have less than 8 elements, then the starting mask was an i8 and
9018   // we need to extract down to the right number of elements.
9019   if (NumElts < 8) {
9020     uint32_t Indices[4];
9021     for (unsigned i = 0; i != NumElts; ++i)
9022       Indices[i] = i;
9023     MaskVec = CGF.Builder.CreateShuffleVector(MaskVec, MaskVec,
9024                                              makeArrayRef(Indices, NumElts),
9025                                              "extract");
9026   }
9027   return MaskVec;
9028 }
9029 
9030 static Value *EmitX86MaskedStore(CodeGenFunction &CGF,
9031                                  ArrayRef<Value *> Ops,
9032                                  unsigned Align) {
9033   // Cast the pointer to right type.
9034   Value *Ptr = CGF.Builder.CreateBitCast(Ops[0],
9035                                llvm::PointerType::getUnqual(Ops[1]->getType()));
9036 
9037   Value *MaskVec = getMaskVecValue(CGF, Ops[2],
9038                                    Ops[1]->getType()->getVectorNumElements());
9039 
9040   return CGF.Builder.CreateMaskedStore(Ops[1], Ptr, Align, MaskVec);
9041 }
9042 
9043 static Value *EmitX86MaskedLoad(CodeGenFunction &CGF,
9044                                 ArrayRef<Value *> Ops, unsigned Align) {
9045   // Cast the pointer to right type.
9046   Value *Ptr = CGF.Builder.CreateBitCast(Ops[0],
9047                                llvm::PointerType::getUnqual(Ops[1]->getType()));
9048 
9049   Value *MaskVec = getMaskVecValue(CGF, Ops[2],
9050                                    Ops[1]->getType()->getVectorNumElements());
9051 
9052   return CGF.Builder.CreateMaskedLoad(Ptr, Align, MaskVec, Ops[1]);
9053 }
9054 
9055 static Value *EmitX86ExpandLoad(CodeGenFunction &CGF,
9056                                 ArrayRef<Value *> Ops) {
9057   llvm::Type *ResultTy = Ops[1]->getType();
9058   llvm::Type *PtrTy = ResultTy->getVectorElementType();
9059 
9060   // Cast the pointer to element type.
9061   Value *Ptr = CGF.Builder.CreateBitCast(Ops[0],
9062                                          llvm::PointerType::getUnqual(PtrTy));
9063 
9064   Value *MaskVec = getMaskVecValue(CGF, Ops[2],
9065                                    ResultTy->getVectorNumElements());
9066 
9067   llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_expandload,
9068                                            ResultTy);
9069   return CGF.Builder.CreateCall(F, { Ptr, MaskVec, Ops[1] });
9070 }
9071 
9072 static Value *EmitX86CompressStore(CodeGenFunction &CGF,
9073                                    ArrayRef<Value *> Ops) {
9074   llvm::Type *ResultTy = Ops[1]->getType();
9075   llvm::Type *PtrTy = ResultTy->getVectorElementType();
9076 
9077   // Cast the pointer to element type.
9078   Value *Ptr = CGF.Builder.CreateBitCast(Ops[0],
9079                                          llvm::PointerType::getUnqual(PtrTy));
9080 
9081   Value *MaskVec = getMaskVecValue(CGF, Ops[2],
9082                                    ResultTy->getVectorNumElements());
9083 
9084   llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_compressstore,
9085                                            ResultTy);
9086   return CGF.Builder.CreateCall(F, { Ops[1], Ptr, MaskVec });
9087 }
9088 
9089 static Value *EmitX86MaskLogic(CodeGenFunction &CGF, Instruction::BinaryOps Opc,
9090                               ArrayRef<Value *> Ops,
9091                               bool InvertLHS = false) {
9092   unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
9093   Value *LHS = getMaskVecValue(CGF, Ops[0], NumElts);
9094   Value *RHS = getMaskVecValue(CGF, Ops[1], NumElts);
9095 
9096   if (InvertLHS)
9097     LHS = CGF.Builder.CreateNot(LHS);
9098 
9099   return CGF.Builder.CreateBitCast(CGF.Builder.CreateBinOp(Opc, LHS, RHS),
9100                                    Ops[0]->getType());
9101 }
9102 
9103 static Value *EmitX86Select(CodeGenFunction &CGF,
9104                             Value *Mask, Value *Op0, Value *Op1) {
9105 
9106   // If the mask is all ones just return first argument.
9107   if (const auto *C = dyn_cast<Constant>(Mask))
9108     if (C->isAllOnesValue())
9109       return Op0;
9110 
9111   Mask = getMaskVecValue(CGF, Mask, Op0->getType()->getVectorNumElements());
9112 
9113   return CGF.Builder.CreateSelect(Mask, Op0, Op1);
9114 }
9115 
9116 static Value *EmitX86ScalarSelect(CodeGenFunction &CGF,
9117                                   Value *Mask, Value *Op0, Value *Op1) {
9118   // If the mask is all ones just return first argument.
9119   if (const auto *C = dyn_cast<Constant>(Mask))
9120     if (C->isAllOnesValue())
9121       return Op0;
9122 
9123   llvm::VectorType *MaskTy =
9124     llvm::VectorType::get(CGF.Builder.getInt1Ty(),
9125                           Mask->getType()->getIntegerBitWidth());
9126   Mask = CGF.Builder.CreateBitCast(Mask, MaskTy);
9127   Mask = CGF.Builder.CreateExtractElement(Mask, (uint64_t)0);
9128   return CGF.Builder.CreateSelect(Mask, Op0, Op1);
9129 }
9130 
static Value *EmitX86MaskedCompareResult(CodeGenFunction &CGF, Value *Cmp,
                                         unsigned NumElts, Value *MaskIn) {
  // Apply the optional merge mask: lanes cleared in MaskIn force the compare
  // result to 0.  The AND is skipped when the mask is known all-ones.
  if (MaskIn) {
    const auto *C = dyn_cast<Constant>(MaskIn);
    if (!C || !C->isAllOnesValue())
      Cmp = CGF.Builder.CreateAnd(Cmp, getMaskVecValue(CGF, MaskIn, NumElts));
  }

  // Mask results are returned as at least an i8.  If the compare produced
  // fewer than 8 lanes, widen to 8 by shuffling in lanes from a zero vector
  // (indices >= NumElts select from the null second shuffle operand).
  if (NumElts < 8) {
    uint32_t Indices[8];
    for (unsigned i = 0; i != NumElts; ++i)
      Indices[i] = i;
    for (unsigned i = NumElts; i != 8; ++i)
      Indices[i] = i % NumElts + NumElts;
    Cmp = CGF.Builder.CreateShuffleVector(
        Cmp, llvm::Constant::getNullValue(Cmp->getType()), Indices);
  }

  // Pack the i1 vector into an integer mask of at least 8 bits.
  return CGF.Builder.CreateBitCast(Cmp,
                                   IntegerType::get(CGF.getLLVMContext(),
                                                    std::max(NumElts, 8U)));
}
9153 
static Value *EmitX86MaskedCompare(CodeGenFunction &CGF, unsigned CC,
                                   bool Signed, ArrayRef<Value *> Ops) {
  // Ops is either {LHS, RHS} or a 4-operand masked form in which Ops[3] is
  // the merge mask.  (Ops[2] is not consumed here — presumably the compare
  // immediate at the call site; confirm against callers.)
  assert((Ops.size() == 2 || Ops.size() == 4) &&
         "Unexpected number of arguments");
  unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
  Value *Cmp;

  // CC encodes the predicate: 0=eq, 1=lt, 2=le, 3=always-false, 4=ne,
  // 5=ge, 6=gt, 7=always-true.  Signed selects signed vs unsigned orderings.
  // The constant-true/false cases need no icmp at all.
  if (CC == 3) {
    Cmp = Constant::getNullValue(
                       llvm::VectorType::get(CGF.Builder.getInt1Ty(), NumElts));
  } else if (CC == 7) {
    Cmp = Constant::getAllOnesValue(
                       llvm::VectorType::get(CGF.Builder.getInt1Ty(), NumElts));
  } else {
    ICmpInst::Predicate Pred;
    switch (CC) {
    default: llvm_unreachable("Unknown condition code");
    case 0: Pred = ICmpInst::ICMP_EQ;  break;
    case 1: Pred = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break;
    case 2: Pred = Signed ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break;
    case 4: Pred = ICmpInst::ICMP_NE;  break;
    case 5: Pred = Signed ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break;
    case 6: Pred = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break;
    }
    Cmp = CGF.Builder.CreateICmp(Pred, Ops[0], Ops[1]);
  }

  // Apply the optional merge mask and pack the result into an integer mask.
  Value *MaskIn = nullptr;
  if (Ops.size() == 4)
    MaskIn = Ops[3];

  return EmitX86MaskedCompareResult(CGF, Cmp, NumElts, MaskIn);
}
9187 
9188 static Value *EmitX86ConvertToMask(CodeGenFunction &CGF, Value *In) {
9189   Value *Zero = Constant::getNullValue(In->getType());
9190   return EmitX86MaskedCompare(CGF, 1, true, { In, Zero });
9191 }
9192 
9193 static Value *EmitX86Abs(CodeGenFunction &CGF, ArrayRef<Value *> Ops) {
9194 
9195   llvm::Type *Ty = Ops[0]->getType();
9196   Value *Zero = llvm::Constant::getNullValue(Ty);
9197   Value *Sub = CGF.Builder.CreateSub(Zero, Ops[0]);
9198   Value *Cmp = CGF.Builder.CreateICmp(ICmpInst::ICMP_SGT, Ops[0], Zero);
9199   Value *Res = CGF.Builder.CreateSelect(Cmp, Ops[0], Sub);
9200   return Res;
9201 }
9202 
9203 static Value *EmitX86MinMax(CodeGenFunction &CGF, ICmpInst::Predicate Pred,
9204                             ArrayRef<Value *> Ops) {
9205   Value *Cmp = CGF.Builder.CreateICmp(Pred, Ops[0], Ops[1]);
9206   Value *Res = CGF.Builder.CreateSelect(Cmp, Ops[0], Ops[1]);
9207 
9208   assert(Ops.size() == 2);
9209   return Res;
9210 }
9211 
9212 // Lowers X86 FMA intrinsics to IR.
static Value *EmitX86FMAExpr(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
                             unsigned BuiltinID, bool IsAddSub) {

  // Map the builtin to the rounding-capable AVX-512 intrinsic (if any) and
  // note whether the accumulator must be negated (the *fmsub* mask3 forms).
  bool Subtract = false;
  Intrinsic::ID IID = Intrinsic::not_intrinsic;
  switch (BuiltinID) {
  default: break;
  case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
    Subtract = true;
    LLVM_FALLTHROUGH;
  case clang::X86::BI__builtin_ia32_vfmaddps512_mask:
  case clang::X86::BI__builtin_ia32_vfmaddps512_maskz:
  case clang::X86::BI__builtin_ia32_vfmaddps512_mask3:
    IID = llvm::Intrinsic::x86_avx512_vfmadd_ps_512; break;
  case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3:
    Subtract = true;
    LLVM_FALLTHROUGH;
  case clang::X86::BI__builtin_ia32_vfmaddpd512_mask:
  case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz:
  case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3:
    IID = llvm::Intrinsic::x86_avx512_vfmadd_pd_512; break;
  case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3:
    Subtract = true;
    LLVM_FALLTHROUGH;
  case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
  case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
  case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
    IID = llvm::Intrinsic::x86_avx512_vfmaddsub_ps_512;
    break;
  case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
    Subtract = true;
    LLVM_FALLTHROUGH;
  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
    IID = llvm::Intrinsic::x86_avx512_vfmaddsub_pd_512;
    break;
  }

  Value *A = Ops[0];
  Value *B = Ops[1];
  Value *C = Ops[2];

  // fmsub(a, b, c) == fma(a, b, -c).
  if (Subtract)
    C = CGF.Builder.CreateFNeg(C);

  Value *Res;

  // If a rounding-mode operand is present and it is not
  // _MM_FROUND_CUR_DIRECTION (4), we must use the target intrinsic that
  // carries the rounding operand; otherwise emit the generic llvm.fma.
  if (IID != Intrinsic::not_intrinsic &&
      cast<llvm::ConstantInt>(Ops.back())->getZExtValue() != (uint64_t)4) {
    Function *Intr = CGF.CGM.getIntrinsic(IID);
    Res = CGF.Builder.CreateCall(Intr, {A, B, C, Ops.back() });
  } else {
    llvm::Type *Ty = A->getType();
    Function *FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty);
    Res = CGF.Builder.CreateCall(FMA, {A, B, C} );

    if (IsAddSub) {
      // Negate even elts in C using a mask.
      unsigned NumElts = Ty->getVectorNumElements();
      SmallVector<uint32_t, 16> Indices(NumElts);
      for (unsigned i = 0; i != NumElts; ++i)
        Indices[i] = i + (i % 2) * NumElts;

      // Compute fma(a, b, -c) too and interleave: even lanes take the
      // subtracted result, odd lanes the added result.
      Value *NegC = CGF.Builder.CreateFNeg(C);
      Value *FMSub = CGF.Builder.CreateCall(FMA, {A, B, NegC} );
      Res = CGF.Builder.CreateShuffleVector(FMSub, Res, Indices);
    }
  }

  // Handle any required masking: _mask blends with the first source,
  // _maskz zeroes masked-off lanes, _mask3 blends with the accumulator.
  Value *MaskFalseVal = nullptr;
  switch (BuiltinID) {
  case clang::X86::BI__builtin_ia32_vfmaddps512_mask:
  case clang::X86::BI__builtin_ia32_vfmaddpd512_mask:
  case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
    MaskFalseVal = Ops[0];
    break;
  case clang::X86::BI__builtin_ia32_vfmaddps512_maskz:
  case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz:
  case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
    MaskFalseVal = Constant::getNullValue(Ops[0]->getType());
    break;
  case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
  case clang::X86::BI__builtin_ia32_vfmaddps512_mask3:
  case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3:
  case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3:
  case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3:
  case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
  case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
    MaskFalseVal = Ops[2];
    break;
  }

  if (MaskFalseVal)
    return EmitX86Select(CGF, Ops[3], Res, MaskFalseVal);

  return Res;
}
9316 
static Value *
EmitScalarFMAExpr(CodeGenFunction &CGF, MutableArrayRef<Value *> Ops,
                  Value *Upper, bool ZeroMask = false, unsigned PTIdx = 0,
                  bool NegAcc = false) {
  // Default to _MM_FROUND_CUR_DIRECTION (4) unless an explicit rounding
  // operand was supplied.
  unsigned Rnd = 4;
  if (Ops.size() > 4)
    Rnd = cast<llvm::ConstantInt>(Ops[4])->getZExtValue();

  // fmsub forms negate the accumulator before the fma.
  if (NegAcc)
    Ops[2] = CGF.Builder.CreateFNeg(Ops[2]);

  // Scalar FMA operates on element 0 of each vector operand.
  Ops[0] = CGF.Builder.CreateExtractElement(Ops[0], (uint64_t)0);
  Ops[1] = CGF.Builder.CreateExtractElement(Ops[1], (uint64_t)0);
  Ops[2] = CGF.Builder.CreateExtractElement(Ops[2], (uint64_t)0);
  Value *Res;
  if (Rnd != 4) {
    // A non-default rounding mode requires the AVX-512 scalar intrinsic
    // that carries the rounding operand.
    Intrinsic::ID IID = Ops[0]->getType()->getPrimitiveSizeInBits() == 32 ?
                        Intrinsic::x86_avx512_vfmadd_f32 :
                        Intrinsic::x86_avx512_vfmadd_f64;
    Res = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID),
                                 {Ops[0], Ops[1], Ops[2], Ops[4]});
  } else {
    Function *FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ops[0]->getType());
    Res = CGF.Builder.CreateCall(FMA, Ops.slice(0, 3));
  }
  // If we have more than 3 arguments, we need to do masking.
  if (Ops.size() > 3) {
    Value *PassThru = ZeroMask ? Constant::getNullValue(Res->getType())
                               : Ops[PTIdx];

    // If we negated the accumulator and it is also the PassThru value, we
    // need to bypass the negate. Conveniently Upper should be the same thing
    // in this case.
    if (NegAcc && PTIdx == 2)
      PassThru = CGF.Builder.CreateExtractElement(Upper, (uint64_t)0);

    Res = EmitX86ScalarSelect(CGF, Ops[3], Res, PassThru);
  }
  // Reinsert the scalar result into lane 0 of the upper source vector.
  return CGF.Builder.CreateInsertElement(Upper, Res, (uint64_t)0);
}
9357 
9358 static Value *EmitX86Muldq(CodeGenFunction &CGF, bool IsSigned,
9359                            ArrayRef<Value *> Ops) {
9360   llvm::Type *Ty = Ops[0]->getType();
9361   // Arguments have a vXi32 type so cast to vXi64.
9362   Ty = llvm::VectorType::get(CGF.Int64Ty,
9363                              Ty->getPrimitiveSizeInBits() / 64);
9364   Value *LHS = CGF.Builder.CreateBitCast(Ops[0], Ty);
9365   Value *RHS = CGF.Builder.CreateBitCast(Ops[1], Ty);
9366 
9367   if (IsSigned) {
9368     // Shift left then arithmetic shift right.
9369     Constant *ShiftAmt = ConstantInt::get(Ty, 32);
9370     LHS = CGF.Builder.CreateShl(LHS, ShiftAmt);
9371     LHS = CGF.Builder.CreateAShr(LHS, ShiftAmt);
9372     RHS = CGF.Builder.CreateShl(RHS, ShiftAmt);
9373     RHS = CGF.Builder.CreateAShr(RHS, ShiftAmt);
9374   } else {
9375     // Clear the upper bits.
9376     Constant *Mask = ConstantInt::get(Ty, 0xffffffff);
9377     LHS = CGF.Builder.CreateAnd(LHS, Mask);
9378     RHS = CGF.Builder.CreateAnd(RHS, Mask);
9379   }
9380 
9381   return CGF.Builder.CreateMul(LHS, RHS);
9382 }
9383 
9384 // Emit a masked pternlog intrinsic. This only exists because the header has to
9385 // use a macro and we aren't able to pass the input argument to a pternlog
9386 // builtin and a select builtin without evaluating it twice.
9387 static Value *EmitX86Ternlog(CodeGenFunction &CGF, bool ZeroMask,
9388                              ArrayRef<Value *> Ops) {
9389   llvm::Type *Ty = Ops[0]->getType();
9390 
9391   unsigned VecWidth = Ty->getPrimitiveSizeInBits();
9392   unsigned EltWidth = Ty->getScalarSizeInBits();
9393   Intrinsic::ID IID;
9394   if (VecWidth == 128 && EltWidth == 32)
9395     IID = Intrinsic::x86_avx512_pternlog_d_128;
9396   else if (VecWidth == 256 && EltWidth == 32)
9397     IID = Intrinsic::x86_avx512_pternlog_d_256;
9398   else if (VecWidth == 512 && EltWidth == 32)
9399     IID = Intrinsic::x86_avx512_pternlog_d_512;
9400   else if (VecWidth == 128 && EltWidth == 64)
9401     IID = Intrinsic::x86_avx512_pternlog_q_128;
9402   else if (VecWidth == 256 && EltWidth == 64)
9403     IID = Intrinsic::x86_avx512_pternlog_q_256;
9404   else if (VecWidth == 512 && EltWidth == 64)
9405     IID = Intrinsic::x86_avx512_pternlog_q_512;
9406   else
9407     llvm_unreachable("Unexpected intrinsic");
9408 
9409   Value *Ternlog = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID),
9410                                           Ops.drop_back());
9411   Value *PassThru = ZeroMask ? ConstantAggregateZero::get(Ty) : Ops[0];
9412   return EmitX86Select(CGF, Ops[4], Ternlog, PassThru);
9413 }
9414 
9415 static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op,
9416                               llvm::Type *DstTy) {
9417   unsigned NumberOfElements = DstTy->getVectorNumElements();
9418   Value *Mask = getMaskVecValue(CGF, Op, NumberOfElements);
9419   return CGF.Builder.CreateSExt(Mask, DstTy, "vpmovm2");
9420 }
9421 
9422 // Emit addition or subtraction with saturation.
9423 // Handles both signed and unsigned intrinsics.
9424 static Value *EmitX86AddSubSatExpr(CodeGenFunction &CGF, const CallExpr *E,
9425                                    SmallVectorImpl<Value *> &Ops,
9426                                    bool IsAddition) {
9427 
9428   // Collect vector elements and type data.
9429   llvm::Type *ResultType = CGF.ConvertType(E->getType());
9430 
9431   Value *Res;
9432   if (IsAddition) {
9433     // ADDUS: a > (a+b) ? ~0 : (a+b)
9434     // If Ops[0] > Add, overflow occured.
9435     Value *Add = CGF.Builder.CreateAdd(Ops[0], Ops[1]);
9436     Value *ICmp = CGF.Builder.CreateICmp(ICmpInst::ICMP_UGT, Ops[0], Add);
9437     Value *Max = llvm::Constant::getAllOnesValue(ResultType);
9438     Res = CGF.Builder.CreateSelect(ICmp, Max, Add);
9439   } else {
9440     // SUBUS: max(a, b) - b
9441     Value *ICmp = CGF.Builder.CreateICmp(ICmpInst::ICMP_UGT, Ops[0], Ops[1]);
9442     Value *Select = CGF.Builder.CreateSelect(ICmp, Ops[0], Ops[1]);
9443     Res = CGF.Builder.CreateSub(Select, Ops[1]);
9444   }
9445 
9446   return Res;
9447 }
9448 
9449 Value *CodeGenFunction::EmitX86CpuIs(const CallExpr *E) {
9450   const Expr *CPUExpr = E->getArg(0)->IgnoreParenCasts();
9451   StringRef CPUStr = cast<clang::StringLiteral>(CPUExpr)->getString();
9452   return EmitX86CpuIs(CPUStr);
9453 }
9454 
// Emit the runtime check for __builtin_cpu_is("CPUStr"): compare the relevant
// field of the libgcc/compiler-rt __cpu_model global against the value the
// target parser assigns to CPUStr. Returns an i1.
Value *CodeGenFunction::EmitX86CpuIs(StringRef CPUStr) {

  llvm::Type *Int32Ty = Builder.getInt32Ty();

  // Matching the struct layout from the compiler-rt/libgcc structure that is
  // filled in:
  // unsigned int __cpu_vendor;
  // unsigned int __cpu_type;
  // unsigned int __cpu_subtype;
  // unsigned int __cpu_features[1];
  llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty,
                                          llvm::ArrayType::get(Int32Ty, 1));

  // Grab the global __cpu_model.
  llvm::Constant *CpuModel = CGM.CreateRuntimeVariable(STy, "__cpu_model");

  // Calculate the index needed to access the correct field based on the
  // range. Also adjust the expected value.
  // Index 0 selects __cpu_vendor, 1 selects __cpu_type, 2 selects
  // __cpu_subtype; the (field, value) mapping for each CPU name comes from
  // X86TargetParser.def.
  unsigned Index;
  unsigned Value;
  std::tie(Index, Value) = StringSwitch<std::pair<unsigned, unsigned>>(CPUStr)
#define X86_VENDOR(ENUM, STRING)                                               \
  .Case(STRING, {0u, static_cast<unsigned>(llvm::X86::ENUM)})
#define X86_CPU_TYPE_COMPAT_WITH_ALIAS(ARCHNAME, ENUM, STR, ALIAS)             \
  .Cases(STR, ALIAS, {1u, static_cast<unsigned>(llvm::X86::ENUM)})
#define X86_CPU_TYPE_COMPAT(ARCHNAME, ENUM, STR)                               \
  .Case(STR, {1u, static_cast<unsigned>(llvm::X86::ENUM)})
#define X86_CPU_SUBTYPE_COMPAT(ARCHNAME, ENUM, STR)                            \
  .Case(STR, {2u, static_cast<unsigned>(llvm::X86::ENUM)})
#include "llvm/Support/X86TargetParser.def"
                               .Default({0, 0});
  // A zero Value means CPUStr matched nothing in the table above; Sema is
  // expected to have rejected such strings before codegen.
  assert(Value != 0 && "Invalid CPUStr passed to CpuIs");

  // Grab the appropriate field from __cpu_model.
  llvm::Value *Idxs[] = {ConstantInt::get(Int32Ty, 0),
                         ConstantInt::get(Int32Ty, Index)};
  llvm::Value *CpuValue = Builder.CreateGEP(STy, CpuModel, Idxs);
  // The struct fields are 4-byte aligned ints.
  CpuValue = Builder.CreateAlignedLoad(CpuValue, CharUnits::fromQuantity(4));

  // Check the value of the field against the requested value.
  return Builder.CreateICmpEQ(CpuValue,
                                  llvm::ConstantInt::get(Int32Ty, Value));
}
9498 
9499 Value *CodeGenFunction::EmitX86CpuSupports(const CallExpr *E) {
9500   const Expr *FeatureExpr = E->getArg(0)->IgnoreParenCasts();
9501   StringRef FeatureStr = cast<StringLiteral>(FeatureExpr)->getString();
9502   return EmitX86CpuSupports(FeatureStr);
9503 }
9504 
// Fold a list of feature names into a single 64-bit mask where each feature's
// bit position is its compiler-rt feature enum value.
uint64_t
CodeGenFunction::GetX86CpuSupportsMask(ArrayRef<StringRef> FeatureStrs) {
  // Processor features and mapping to processor feature value.
  uint64_t FeaturesMask = 0;
  for (const StringRef &FeatureStr : FeatureStrs) {
    // Map the feature name to its bit position via the compat table in
    // X86TargetParser.def.
    // NOTE(review): this StringSwitch has no .Default — FeatureStr is assumed
    // to always match a known feature (presumably enforced before codegen);
    // verify against callers.
    unsigned Feature =
        StringSwitch<unsigned>(FeatureStr)
#define X86_FEATURE_COMPAT(VAL, ENUM, STR) .Case(STR, VAL)
#include "llvm/Support/X86TargetParser.def"
        ;
    FeaturesMask |= (1ULL << Feature);
  }
  return FeaturesMask;
}
9519 
9520 Value *CodeGenFunction::EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs) {
9521   return EmitX86CpuSupports(GetX86CpuSupportsMask(FeatureStrs));
9522 }
9523 
9524 llvm::Value *CodeGenFunction::EmitX86CpuSupports(uint64_t FeaturesMask) {
9525   uint32_t Features1 = Lo_32(FeaturesMask);
9526   uint32_t Features2 = Hi_32(FeaturesMask);
9527 
9528   Value *Result = Builder.getTrue();
9529 
9530   if (Features1 != 0) {
9531     // Matching the struct layout from the compiler-rt/libgcc structure that is
9532     // filled in:
9533     // unsigned int __cpu_vendor;
9534     // unsigned int __cpu_type;
9535     // unsigned int __cpu_subtype;
9536     // unsigned int __cpu_features[1];
9537     llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty,
9538                                             llvm::ArrayType::get(Int32Ty, 1));
9539 
9540     // Grab the global __cpu_model.
9541     llvm::Constant *CpuModel = CGM.CreateRuntimeVariable(STy, "__cpu_model");
9542 
9543     // Grab the first (0th) element from the field __cpu_features off of the
9544     // global in the struct STy.
9545     Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(3),
9546                      Builder.getInt32(0)};
9547     Value *CpuFeatures = Builder.CreateGEP(STy, CpuModel, Idxs);
9548     Value *Features =
9549         Builder.CreateAlignedLoad(CpuFeatures, CharUnits::fromQuantity(4));
9550 
9551     // Check the value of the bit corresponding to the feature requested.
9552     Value *Mask = Builder.getInt32(Features1);
9553     Value *Bitset = Builder.CreateAnd(Features, Mask);
9554     Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
9555     Result = Builder.CreateAnd(Result, Cmp);
9556   }
9557 
9558   if (Features2 != 0) {
9559     llvm::Constant *CpuFeatures2 = CGM.CreateRuntimeVariable(Int32Ty,
9560                                                              "__cpu_features2");
9561     Value *Features =
9562         Builder.CreateAlignedLoad(CpuFeatures2, CharUnits::fromQuantity(4));
9563 
9564     // Check the value of the bit corresponding to the feature requested.
9565     Value *Mask = Builder.getInt32(Features2);
9566     Value *Bitset = Builder.CreateAnd(Features, Mask);
9567     Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
9568     Result = Builder.CreateAnd(Result, Cmp);
9569   }
9570 
9571   return Result;
9572 }
9573 
9574 Value *CodeGenFunction::EmitX86CpuInit() {
9575   llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy,
9576                                                     /*Variadic*/ false);
9577   llvm::Constant *Func = CGM.CreateRuntimeFunction(FTy, "__cpu_indicator_init");
9578   return Builder.CreateCall(Func);
9579 }
9580 
9581 Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
9582                                            const CallExpr *E) {
9583   if (BuiltinID == X86::BI__builtin_cpu_is)
9584     return EmitX86CpuIs(E);
9585   if (BuiltinID == X86::BI__builtin_cpu_supports)
9586     return EmitX86CpuSupports(E);
9587   if (BuiltinID == X86::BI__builtin_cpu_init)
9588     return EmitX86CpuInit();
9589 
9590   SmallVector<Value*, 4> Ops;
9591 
9592   // Find out if any arguments are required to be integer constant expressions.
9593   unsigned ICEArguments = 0;
9594   ASTContext::GetBuiltinTypeError Error;
9595   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
9596   assert(Error == ASTContext::GE_None && "Should not codegen an error");
9597 
9598   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
9599     // If this is a normal argument, just emit it as a scalar.
9600     if ((ICEArguments & (1 << i)) == 0) {
9601       Ops.push_back(EmitScalarExpr(E->getArg(i)));
9602       continue;
9603     }
9604 
9605     // If this is required to be a constant, constant fold it so that we know
9606     // that the generated intrinsic gets a ConstantInt.
9607     llvm::APSInt Result;
9608     bool IsConst = E->getArg(i)->isIntegerConstantExpr(Result, getContext());
9609     assert(IsConst && "Constant arg isn't actually constant?"); (void)IsConst;
9610     Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), Result));
9611   }
9612 
9613   // These exist so that the builtin that takes an immediate can be bounds
9614   // checked by clang to avoid passing bad immediates to the backend. Since
9615   // AVX has a larger immediate than SSE we would need separate builtins to
9616   // do the different bounds checking. Rather than create a clang specific
9617   // SSE only builtin, this implements eight separate builtins to match gcc
9618   // implementation.
9619   auto getCmpIntrinsicCall = [this, &Ops](Intrinsic::ID ID, unsigned Imm) {
9620     Ops.push_back(llvm::ConstantInt::get(Int8Ty, Imm));
9621     llvm::Function *F = CGM.getIntrinsic(ID);
9622     return Builder.CreateCall(F, Ops);
9623   };
9624 
9625   // For the vector forms of FP comparisons, translate the builtins directly to
9626   // IR.
9627   // TODO: The builtins could be removed if the SSE header files used vector
9628   // extension comparisons directly (vector ordered/unordered may need
9629   // additional support via __builtin_isnan()).
9630   auto getVectorFCmpIR = [this, &Ops](CmpInst::Predicate Pred) {
9631     Value *Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]);
9632     llvm::VectorType *FPVecTy = cast<llvm::VectorType>(Ops[0]->getType());
9633     llvm::VectorType *IntVecTy = llvm::VectorType::getInteger(FPVecTy);
9634     Value *Sext = Builder.CreateSExt(Cmp, IntVecTy);
9635     return Builder.CreateBitCast(Sext, FPVecTy);
9636   };
9637 
9638   switch (BuiltinID) {
9639   default: return nullptr;
9640   case X86::BI_mm_prefetch: {
9641     Value *Address = Ops[0];
9642     ConstantInt *C = cast<ConstantInt>(Ops[1]);
9643     Value *RW = ConstantInt::get(Int32Ty, (C->getZExtValue() >> 2) & 0x1);
9644     Value *Locality = ConstantInt::get(Int32Ty, C->getZExtValue() & 0x3);
9645     Value *Data = ConstantInt::get(Int32Ty, 1);
9646     Value *F = CGM.getIntrinsic(Intrinsic::prefetch);
9647     return Builder.CreateCall(F, {Address, RW, Locality, Data});
9648   }
9649   case X86::BI_mm_clflush: {
9650     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_clflush),
9651                               Ops[0]);
9652   }
9653   case X86::BI_mm_lfence: {
9654     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_lfence));
9655   }
9656   case X86::BI_mm_mfence: {
9657     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_mfence));
9658   }
9659   case X86::BI_mm_sfence: {
9660     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_sfence));
9661   }
9662   case X86::BI_mm_pause: {
9663     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_pause));
9664   }
9665   case X86::BI__rdtsc: {
9666     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_rdtsc));
9667   }
9668   case X86::BI__builtin_ia32_rdtscp: {
9669     Value *Call = Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_rdtscp));
9670     Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 1),
9671                                       Ops[0]);
9672     return Builder.CreateExtractValue(Call, 0);
9673   }
9674   case X86::BI__builtin_ia32_lzcnt_u16:
9675   case X86::BI__builtin_ia32_lzcnt_u32:
9676   case X86::BI__builtin_ia32_lzcnt_u64: {
9677     Value *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType());
9678     return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
9679   }
9680   case X86::BI__builtin_ia32_tzcnt_u16:
9681   case X86::BI__builtin_ia32_tzcnt_u32:
9682   case X86::BI__builtin_ia32_tzcnt_u64: {
9683     Value *F = CGM.getIntrinsic(Intrinsic::cttz, Ops[0]->getType());
9684     return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
9685   }
9686   case X86::BI__builtin_ia32_undef128:
9687   case X86::BI__builtin_ia32_undef256:
9688   case X86::BI__builtin_ia32_undef512:
9689     // The x86 definition of "undef" is not the same as the LLVM definition
9690     // (PR32176). We leave optimizing away an unnecessary zero constant to the
9691     // IR optimizer and backend.
9692     // TODO: If we had a "freeze" IR instruction to generate a fixed undef
9693     // value, we should use that here instead of a zero.
9694     return llvm::Constant::getNullValue(ConvertType(E->getType()));
9695   case X86::BI__builtin_ia32_vec_init_v8qi:
9696   case X86::BI__builtin_ia32_vec_init_v4hi:
9697   case X86::BI__builtin_ia32_vec_init_v2si:
9698     return Builder.CreateBitCast(BuildVector(Ops),
9699                                  llvm::Type::getX86_MMXTy(getLLVMContext()));
9700   case X86::BI__builtin_ia32_vec_ext_v2si:
9701   case X86::BI__builtin_ia32_vec_ext_v16qi:
9702   case X86::BI__builtin_ia32_vec_ext_v8hi:
9703   case X86::BI__builtin_ia32_vec_ext_v4si:
9704   case X86::BI__builtin_ia32_vec_ext_v4sf:
9705   case X86::BI__builtin_ia32_vec_ext_v2di:
9706   case X86::BI__builtin_ia32_vec_ext_v32qi:
9707   case X86::BI__builtin_ia32_vec_ext_v16hi:
9708   case X86::BI__builtin_ia32_vec_ext_v8si:
9709   case X86::BI__builtin_ia32_vec_ext_v4di: {
9710     unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
9711     uint64_t Index = cast<ConstantInt>(Ops[1])->getZExtValue();
9712     Index &= NumElts - 1;
9713     // These builtins exist so we can ensure the index is an ICE and in range.
9714     // Otherwise we could just do this in the header file.
9715     return Builder.CreateExtractElement(Ops[0], Index);
9716   }
9717   case X86::BI__builtin_ia32_vec_set_v16qi:
9718   case X86::BI__builtin_ia32_vec_set_v8hi:
9719   case X86::BI__builtin_ia32_vec_set_v4si:
9720   case X86::BI__builtin_ia32_vec_set_v2di:
9721   case X86::BI__builtin_ia32_vec_set_v32qi:
9722   case X86::BI__builtin_ia32_vec_set_v16hi:
9723   case X86::BI__builtin_ia32_vec_set_v8si:
9724   case X86::BI__builtin_ia32_vec_set_v4di: {
9725     unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
9726     unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue();
9727     Index &= NumElts - 1;
9728     // These builtins exist so we can ensure the index is an ICE and in range.
9729     // Otherwise we could just do this in the header file.
9730     return Builder.CreateInsertElement(Ops[0], Ops[1], Index);
9731   }
9732   case X86::BI_mm_setcsr:
9733   case X86::BI__builtin_ia32_ldmxcsr: {
9734     Address Tmp = CreateMemTemp(E->getArg(0)->getType());
9735     Builder.CreateStore(Ops[0], Tmp);
9736     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_ldmxcsr),
9737                           Builder.CreateBitCast(Tmp.getPointer(), Int8PtrTy));
9738   }
9739   case X86::BI_mm_getcsr:
9740   case X86::BI__builtin_ia32_stmxcsr: {
9741     Address Tmp = CreateMemTemp(E->getType());
9742     Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_stmxcsr),
9743                        Builder.CreateBitCast(Tmp.getPointer(), Int8PtrTy));
9744     return Builder.CreateLoad(Tmp, "stmxcsr");
9745   }
9746   case X86::BI__builtin_ia32_xsave:
9747   case X86::BI__builtin_ia32_xsave64:
9748   case X86::BI__builtin_ia32_xrstor:
9749   case X86::BI__builtin_ia32_xrstor64:
9750   case X86::BI__builtin_ia32_xsaveopt:
9751   case X86::BI__builtin_ia32_xsaveopt64:
9752   case X86::BI__builtin_ia32_xrstors:
9753   case X86::BI__builtin_ia32_xrstors64:
9754   case X86::BI__builtin_ia32_xsavec:
9755   case X86::BI__builtin_ia32_xsavec64:
9756   case X86::BI__builtin_ia32_xsaves:
9757   case X86::BI__builtin_ia32_xsaves64: {
9758     Intrinsic::ID ID;
9759 #define INTRINSIC_X86_XSAVE_ID(NAME) \
9760     case X86::BI__builtin_ia32_##NAME: \
9761       ID = Intrinsic::x86_##NAME; \
9762       break
9763     switch (BuiltinID) {
9764     default: llvm_unreachable("Unsupported intrinsic!");
9765     INTRINSIC_X86_XSAVE_ID(xsave);
9766     INTRINSIC_X86_XSAVE_ID(xsave64);
9767     INTRINSIC_X86_XSAVE_ID(xrstor);
9768     INTRINSIC_X86_XSAVE_ID(xrstor64);
9769     INTRINSIC_X86_XSAVE_ID(xsaveopt);
9770     INTRINSIC_X86_XSAVE_ID(xsaveopt64);
9771     INTRINSIC_X86_XSAVE_ID(xrstors);
9772     INTRINSIC_X86_XSAVE_ID(xrstors64);
9773     INTRINSIC_X86_XSAVE_ID(xsavec);
9774     INTRINSIC_X86_XSAVE_ID(xsavec64);
9775     INTRINSIC_X86_XSAVE_ID(xsaves);
9776     INTRINSIC_X86_XSAVE_ID(xsaves64);
9777     }
9778 #undef INTRINSIC_X86_XSAVE_ID
9779     Value *Mhi = Builder.CreateTrunc(
9780       Builder.CreateLShr(Ops[1], ConstantInt::get(Int64Ty, 32)), Int32Ty);
9781     Value *Mlo = Builder.CreateTrunc(Ops[1], Int32Ty);
9782     Ops[1] = Mhi;
9783     Ops.push_back(Mlo);
9784     return Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
9785   }
9786   case X86::BI__builtin_ia32_storedqudi128_mask:
9787   case X86::BI__builtin_ia32_storedqusi128_mask:
9788   case X86::BI__builtin_ia32_storedquhi128_mask:
9789   case X86::BI__builtin_ia32_storedquqi128_mask:
9790   case X86::BI__builtin_ia32_storeupd128_mask:
9791   case X86::BI__builtin_ia32_storeups128_mask:
9792   case X86::BI__builtin_ia32_storedqudi256_mask:
9793   case X86::BI__builtin_ia32_storedqusi256_mask:
9794   case X86::BI__builtin_ia32_storedquhi256_mask:
9795   case X86::BI__builtin_ia32_storedquqi256_mask:
9796   case X86::BI__builtin_ia32_storeupd256_mask:
9797   case X86::BI__builtin_ia32_storeups256_mask:
9798   case X86::BI__builtin_ia32_storedqudi512_mask:
9799   case X86::BI__builtin_ia32_storedqusi512_mask:
9800   case X86::BI__builtin_ia32_storedquhi512_mask:
9801   case X86::BI__builtin_ia32_storedquqi512_mask:
9802   case X86::BI__builtin_ia32_storeupd512_mask:
9803   case X86::BI__builtin_ia32_storeups512_mask:
9804     return EmitX86MaskedStore(*this, Ops, 1);
9805 
9806   case X86::BI__builtin_ia32_storess128_mask:
9807   case X86::BI__builtin_ia32_storesd128_mask: {
9808     return EmitX86MaskedStore(*this, Ops, 1);
9809   }
9810   case X86::BI__builtin_ia32_vpopcntb_128:
9811   case X86::BI__builtin_ia32_vpopcntd_128:
9812   case X86::BI__builtin_ia32_vpopcntq_128:
9813   case X86::BI__builtin_ia32_vpopcntw_128:
9814   case X86::BI__builtin_ia32_vpopcntb_256:
9815   case X86::BI__builtin_ia32_vpopcntd_256:
9816   case X86::BI__builtin_ia32_vpopcntq_256:
9817   case X86::BI__builtin_ia32_vpopcntw_256:
9818   case X86::BI__builtin_ia32_vpopcntb_512:
9819   case X86::BI__builtin_ia32_vpopcntd_512:
9820   case X86::BI__builtin_ia32_vpopcntq_512:
9821   case X86::BI__builtin_ia32_vpopcntw_512: {
9822     llvm::Type *ResultType = ConvertType(E->getType());
9823     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ResultType);
9824     return Builder.CreateCall(F, Ops);
9825   }
9826   case X86::BI__builtin_ia32_cvtmask2b128:
9827   case X86::BI__builtin_ia32_cvtmask2b256:
9828   case X86::BI__builtin_ia32_cvtmask2b512:
9829   case X86::BI__builtin_ia32_cvtmask2w128:
9830   case X86::BI__builtin_ia32_cvtmask2w256:
9831   case X86::BI__builtin_ia32_cvtmask2w512:
9832   case X86::BI__builtin_ia32_cvtmask2d128:
9833   case X86::BI__builtin_ia32_cvtmask2d256:
9834   case X86::BI__builtin_ia32_cvtmask2d512:
9835   case X86::BI__builtin_ia32_cvtmask2q128:
9836   case X86::BI__builtin_ia32_cvtmask2q256:
9837   case X86::BI__builtin_ia32_cvtmask2q512:
9838     return EmitX86SExtMask(*this, Ops[0], ConvertType(E->getType()));
9839 
9840   case X86::BI__builtin_ia32_cvtb2mask128:
9841   case X86::BI__builtin_ia32_cvtb2mask256:
9842   case X86::BI__builtin_ia32_cvtb2mask512:
9843   case X86::BI__builtin_ia32_cvtw2mask128:
9844   case X86::BI__builtin_ia32_cvtw2mask256:
9845   case X86::BI__builtin_ia32_cvtw2mask512:
9846   case X86::BI__builtin_ia32_cvtd2mask128:
9847   case X86::BI__builtin_ia32_cvtd2mask256:
9848   case X86::BI__builtin_ia32_cvtd2mask512:
9849   case X86::BI__builtin_ia32_cvtq2mask128:
9850   case X86::BI__builtin_ia32_cvtq2mask256:
9851   case X86::BI__builtin_ia32_cvtq2mask512:
9852     return EmitX86ConvertToMask(*this, Ops[0]);
9853 
9854   case X86::BI__builtin_ia32_vfmaddss3:
9855   case X86::BI__builtin_ia32_vfmaddsd3:
9856   case X86::BI__builtin_ia32_vfmaddss3_mask:
9857   case X86::BI__builtin_ia32_vfmaddsd3_mask:
9858     return EmitScalarFMAExpr(*this, Ops, Ops[0]);
9859   case X86::BI__builtin_ia32_vfmaddss:
9860   case X86::BI__builtin_ia32_vfmaddsd:
9861     return EmitScalarFMAExpr(*this, Ops,
9862                              Constant::getNullValue(Ops[0]->getType()));
9863   case X86::BI__builtin_ia32_vfmaddss3_maskz:
9864   case X86::BI__builtin_ia32_vfmaddsd3_maskz:
9865     return EmitScalarFMAExpr(*this, Ops, Ops[0], /*ZeroMask*/true);
9866   case X86::BI__builtin_ia32_vfmaddss3_mask3:
9867   case X86::BI__builtin_ia32_vfmaddsd3_mask3:
9868     return EmitScalarFMAExpr(*this, Ops, Ops[2], /*ZeroMask*/false, 2);
9869   case X86::BI__builtin_ia32_vfmsubss3_mask3:
9870   case X86::BI__builtin_ia32_vfmsubsd3_mask3:
9871     return EmitScalarFMAExpr(*this, Ops, Ops[2], /*ZeroMask*/false, 2,
9872                              /*NegAcc*/true);
9873   case X86::BI__builtin_ia32_vfmaddps:
9874   case X86::BI__builtin_ia32_vfmaddpd:
9875   case X86::BI__builtin_ia32_vfmaddps256:
9876   case X86::BI__builtin_ia32_vfmaddpd256:
9877   case X86::BI__builtin_ia32_vfmaddps512_mask:
9878   case X86::BI__builtin_ia32_vfmaddps512_maskz:
9879   case X86::BI__builtin_ia32_vfmaddps512_mask3:
9880   case X86::BI__builtin_ia32_vfmsubps512_mask3:
9881   case X86::BI__builtin_ia32_vfmaddpd512_mask:
9882   case X86::BI__builtin_ia32_vfmaddpd512_maskz:
9883   case X86::BI__builtin_ia32_vfmaddpd512_mask3:
9884   case X86::BI__builtin_ia32_vfmsubpd512_mask3:
9885     return EmitX86FMAExpr(*this, Ops, BuiltinID, /*IsAddSub*/false);
9886   case X86::BI__builtin_ia32_vfmaddsubps:
9887   case X86::BI__builtin_ia32_vfmaddsubpd:
9888   case X86::BI__builtin_ia32_vfmaddsubps256:
9889   case X86::BI__builtin_ia32_vfmaddsubpd256:
9890   case X86::BI__builtin_ia32_vfmaddsubps512_mask:
9891   case X86::BI__builtin_ia32_vfmaddsubps512_maskz:
9892   case X86::BI__builtin_ia32_vfmaddsubps512_mask3:
9893   case X86::BI__builtin_ia32_vfmsubaddps512_mask3:
9894   case X86::BI__builtin_ia32_vfmaddsubpd512_mask:
9895   case X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
9896   case X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
9897   case X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
9898     return EmitX86FMAExpr(*this, Ops, BuiltinID, /*IsAddSub*/true);
9899 
9900   case X86::BI__builtin_ia32_movdqa32store128_mask:
9901   case X86::BI__builtin_ia32_movdqa64store128_mask:
9902   case X86::BI__builtin_ia32_storeaps128_mask:
9903   case X86::BI__builtin_ia32_storeapd128_mask:
9904   case X86::BI__builtin_ia32_movdqa32store256_mask:
9905   case X86::BI__builtin_ia32_movdqa64store256_mask:
9906   case X86::BI__builtin_ia32_storeaps256_mask:
9907   case X86::BI__builtin_ia32_storeapd256_mask:
9908   case X86::BI__builtin_ia32_movdqa32store512_mask:
9909   case X86::BI__builtin_ia32_movdqa64store512_mask:
9910   case X86::BI__builtin_ia32_storeaps512_mask:
9911   case X86::BI__builtin_ia32_storeapd512_mask: {
9912     unsigned Align =
9913       getContext().getTypeAlignInChars(E->getArg(1)->getType()).getQuantity();
9914     return EmitX86MaskedStore(*this, Ops, Align);
9915   }
9916   case X86::BI__builtin_ia32_loadups128_mask:
9917   case X86::BI__builtin_ia32_loadups256_mask:
9918   case X86::BI__builtin_ia32_loadups512_mask:
9919   case X86::BI__builtin_ia32_loadupd128_mask:
9920   case X86::BI__builtin_ia32_loadupd256_mask:
9921   case X86::BI__builtin_ia32_loadupd512_mask:
9922   case X86::BI__builtin_ia32_loaddquqi128_mask:
9923   case X86::BI__builtin_ia32_loaddquqi256_mask:
9924   case X86::BI__builtin_ia32_loaddquqi512_mask:
9925   case X86::BI__builtin_ia32_loaddquhi128_mask:
9926   case X86::BI__builtin_ia32_loaddquhi256_mask:
9927   case X86::BI__builtin_ia32_loaddquhi512_mask:
9928   case X86::BI__builtin_ia32_loaddqusi128_mask:
9929   case X86::BI__builtin_ia32_loaddqusi256_mask:
9930   case X86::BI__builtin_ia32_loaddqusi512_mask:
9931   case X86::BI__builtin_ia32_loaddqudi128_mask:
9932   case X86::BI__builtin_ia32_loaddqudi256_mask:
9933   case X86::BI__builtin_ia32_loaddqudi512_mask:
9934     return EmitX86MaskedLoad(*this, Ops, 1);
9935 
9936   case X86::BI__builtin_ia32_loadss128_mask:
9937   case X86::BI__builtin_ia32_loadsd128_mask:
9938     return EmitX86MaskedLoad(*this, Ops, 1);
9939 
9940   case X86::BI__builtin_ia32_loadaps128_mask:
9941   case X86::BI__builtin_ia32_loadaps256_mask:
9942   case X86::BI__builtin_ia32_loadaps512_mask:
9943   case X86::BI__builtin_ia32_loadapd128_mask:
9944   case X86::BI__builtin_ia32_loadapd256_mask:
9945   case X86::BI__builtin_ia32_loadapd512_mask:
9946   case X86::BI__builtin_ia32_movdqa32load128_mask:
9947   case X86::BI__builtin_ia32_movdqa32load256_mask:
9948   case X86::BI__builtin_ia32_movdqa32load512_mask:
9949   case X86::BI__builtin_ia32_movdqa64load128_mask:
9950   case X86::BI__builtin_ia32_movdqa64load256_mask:
9951   case X86::BI__builtin_ia32_movdqa64load512_mask: {
9952     unsigned Align =
9953       getContext().getTypeAlignInChars(E->getArg(1)->getType()).getQuantity();
9954     return EmitX86MaskedLoad(*this, Ops, Align);
9955   }
9956 
9957   case X86::BI__builtin_ia32_expandloaddf128_mask:
9958   case X86::BI__builtin_ia32_expandloaddf256_mask:
9959   case X86::BI__builtin_ia32_expandloaddf512_mask:
9960   case X86::BI__builtin_ia32_expandloadsf128_mask:
9961   case X86::BI__builtin_ia32_expandloadsf256_mask:
9962   case X86::BI__builtin_ia32_expandloadsf512_mask:
9963   case X86::BI__builtin_ia32_expandloaddi128_mask:
9964   case X86::BI__builtin_ia32_expandloaddi256_mask:
9965   case X86::BI__builtin_ia32_expandloaddi512_mask:
9966   case X86::BI__builtin_ia32_expandloadsi128_mask:
9967   case X86::BI__builtin_ia32_expandloadsi256_mask:
9968   case X86::BI__builtin_ia32_expandloadsi512_mask:
9969   case X86::BI__builtin_ia32_expandloadhi128_mask:
9970   case X86::BI__builtin_ia32_expandloadhi256_mask:
9971   case X86::BI__builtin_ia32_expandloadhi512_mask:
9972   case X86::BI__builtin_ia32_expandloadqi128_mask:
9973   case X86::BI__builtin_ia32_expandloadqi256_mask:
9974   case X86::BI__builtin_ia32_expandloadqi512_mask:
9975     return EmitX86ExpandLoad(*this, Ops);
9976 
9977   case X86::BI__builtin_ia32_compressstoredf128_mask:
9978   case X86::BI__builtin_ia32_compressstoredf256_mask:
9979   case X86::BI__builtin_ia32_compressstoredf512_mask:
9980   case X86::BI__builtin_ia32_compressstoresf128_mask:
9981   case X86::BI__builtin_ia32_compressstoresf256_mask:
9982   case X86::BI__builtin_ia32_compressstoresf512_mask:
9983   case X86::BI__builtin_ia32_compressstoredi128_mask:
9984   case X86::BI__builtin_ia32_compressstoredi256_mask:
9985   case X86::BI__builtin_ia32_compressstoredi512_mask:
9986   case X86::BI__builtin_ia32_compressstoresi128_mask:
9987   case X86::BI__builtin_ia32_compressstoresi256_mask:
9988   case X86::BI__builtin_ia32_compressstoresi512_mask:
9989   case X86::BI__builtin_ia32_compressstorehi128_mask:
9990   case X86::BI__builtin_ia32_compressstorehi256_mask:
9991   case X86::BI__builtin_ia32_compressstorehi512_mask:
9992   case X86::BI__builtin_ia32_compressstoreqi128_mask:
9993   case X86::BI__builtin_ia32_compressstoreqi256_mask:
9994   case X86::BI__builtin_ia32_compressstoreqi512_mask:
9995     return EmitX86CompressStore(*this, Ops);
9996 
9997   case X86::BI__builtin_ia32_storehps:
9998   case X86::BI__builtin_ia32_storelps: {
9999     llvm::Type *PtrTy = llvm::PointerType::getUnqual(Int64Ty);
10000     llvm::Type *VecTy = llvm::VectorType::get(Int64Ty, 2);
10001 
10002     // cast val v2i64
10003     Ops[1] = Builder.CreateBitCast(Ops[1], VecTy, "cast");
10004 
10005     // extract (0, 1)
10006     unsigned Index = BuiltinID == X86::BI__builtin_ia32_storelps ? 0 : 1;
10007     Ops[1] = Builder.CreateExtractElement(Ops[1], Index, "extract");
10008 
10009     // cast pointer to i64 & store
10010     Ops[0] = Builder.CreateBitCast(Ops[0], PtrTy);
10011     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
10012   }
10013   case X86::BI__builtin_ia32_vextractf128_pd256:
10014   case X86::BI__builtin_ia32_vextractf128_ps256:
10015   case X86::BI__builtin_ia32_vextractf128_si256:
10016   case X86::BI__builtin_ia32_extract128i256:
10017   case X86::BI__builtin_ia32_extractf64x4_mask:
10018   case X86::BI__builtin_ia32_extractf32x4_mask:
10019   case X86::BI__builtin_ia32_extracti64x4_mask:
10020   case X86::BI__builtin_ia32_extracti32x4_mask:
10021   case X86::BI__builtin_ia32_extractf32x8_mask:
10022   case X86::BI__builtin_ia32_extracti32x8_mask:
10023   case X86::BI__builtin_ia32_extractf32x4_256_mask:
10024   case X86::BI__builtin_ia32_extracti32x4_256_mask:
10025   case X86::BI__builtin_ia32_extractf64x2_256_mask:
10026   case X86::BI__builtin_ia32_extracti64x2_256_mask:
10027   case X86::BI__builtin_ia32_extractf64x2_512_mask:
10028   case X86::BI__builtin_ia32_extracti64x2_512_mask: {
10029     llvm::Type *DstTy = ConvertType(E->getType());
10030     unsigned NumElts = DstTy->getVectorNumElements();
10031     unsigned SrcNumElts = Ops[0]->getType()->getVectorNumElements();
10032     unsigned SubVectors = SrcNumElts / NumElts;
10033     unsigned Index = cast<ConstantInt>(Ops[1])->getZExtValue();
10034     assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors");
10035     Index &= SubVectors - 1; // Remove any extra bits.
10036     Index *= NumElts;
10037 
10038     uint32_t Indices[16];
10039     for (unsigned i = 0; i != NumElts; ++i)
10040       Indices[i] = i + Index;
10041 
10042     Value *Res = Builder.CreateShuffleVector(Ops[0],
10043                                              UndefValue::get(Ops[0]->getType()),
10044                                              makeArrayRef(Indices, NumElts),
10045                                              "extract");
10046 
10047     if (Ops.size() == 4)
10048       Res = EmitX86Select(*this, Ops[3], Res, Ops[2]);
10049 
10050     return Res;
10051   }
  case X86::BI__builtin_ia32_vinsertf128_pd256:
  case X86::BI__builtin_ia32_vinsertf128_ps256:
  case X86::BI__builtin_ia32_vinsertf128_si256:
  case X86::BI__builtin_ia32_insert128i256:
  case X86::BI__builtin_ia32_insertf64x4:
  case X86::BI__builtin_ia32_insertf32x4:
  case X86::BI__builtin_ia32_inserti64x4:
  case X86::BI__builtin_ia32_inserti32x4:
  case X86::BI__builtin_ia32_insertf32x8:
  case X86::BI__builtin_ia32_inserti32x8:
  case X86::BI__builtin_ia32_insertf32x4_256:
  case X86::BI__builtin_ia32_inserti32x4_256:
  case X86::BI__builtin_ia32_insertf64x2_256:
  case X86::BI__builtin_ia32_inserti64x2_256:
  case X86::BI__builtin_ia32_insertf64x2_512:
  case X86::BI__builtin_ia32_inserti64x2_512: {
    // Lower subvector inserts as two shuffles: widen the small source
    // Ops[1] up to the destination width, then blend it into Ops[0] at the
    // element offset selected by the immediate.
    unsigned DstNumElts = Ops[0]->getType()->getVectorNumElements();
    unsigned SrcNumElts = Ops[1]->getType()->getVectorNumElements();
    unsigned SubVectors = DstNumElts / SrcNumElts;
    unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue();
    assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors");
    Index &= SubVectors - 1; // Remove any extra bits.
    Index *= SrcNumElts;     // Scale subvector index to a starting element.

    uint32_t Indices[16];
    // Widen Ops[1]: its elements first, then undef padding.
    for (unsigned i = 0; i != DstNumElts; ++i)
      Indices[i] = (i >= SrcNumElts) ? SrcNumElts + (i % SrcNumElts) : i;

    Value *Op1 = Builder.CreateShuffleVector(Ops[1],
                                             UndefValue::get(Ops[1]->getType()),
                                             makeArrayRef(Indices, DstNumElts),
                                             "widen");

    // Blend: take the widened source for the inserted window, Ops[0] elsewhere.
    for (unsigned i = 0; i != DstNumElts; ++i) {
      if (i >= Index && i < (Index + SrcNumElts))
        Indices[i] = (i - Index) + DstNumElts;
      else
        Indices[i] = i;
    }

    return Builder.CreateShuffleVector(Ops[0], Op1,
                                       makeArrayRef(Indices, DstNumElts),
                                       "insert");
  }
  case X86::BI__builtin_ia32_pmovqd512_mask:
  case X86::BI__builtin_ia32_pmovwb512_mask: {
    // These truncations are emitted as a generic trunc plus a masked select
    // against the passthru value Ops[1].
    Value *Res = Builder.CreateTrunc(Ops[0], Ops[1]->getType());
    return EmitX86Select(*this, Ops[2], Res, Ops[1]);
  }
  case X86::BI__builtin_ia32_pmovdb512_mask:
  case X86::BI__builtin_ia32_pmovdw512_mask:
  case X86::BI__builtin_ia32_pmovqw512_mask: {
    // Only the unmasked form (mask Ops[2] all-ones) is emitted as a generic
    // trunc; otherwise fall back to the target intrinsic, which applies the
    // mask itself.
    if (const auto *C = dyn_cast<Constant>(Ops[2]))
      if (C->isAllOnesValue())
        return Builder.CreateTrunc(Ops[0], Ops[1]->getType());

    Intrinsic::ID IID;
    switch (BuiltinID) {
    default: llvm_unreachable("Unsupported intrinsic!");
    case X86::BI__builtin_ia32_pmovdb512_mask:
      IID = Intrinsic::x86_avx512_mask_pmov_db_512;
      break;
    case X86::BI__builtin_ia32_pmovdw512_mask:
      IID = Intrinsic::x86_avx512_mask_pmov_dw_512;
      break;
    case X86::BI__builtin_ia32_pmovqw512_mask:
      IID = Intrinsic::x86_avx512_mask_pmov_qw_512;
      break;
    }

    Function *Intr = CGM.getIntrinsic(IID);
    return Builder.CreateCall(Intr, Ops);
  }
  case X86::BI__builtin_ia32_pblendw128:
  case X86::BI__builtin_ia32_blendpd:
  case X86::BI__builtin_ia32_blendps:
  case X86::BI__builtin_ia32_blendpd256:
  case X86::BI__builtin_ia32_blendps256:
  case X86::BI__builtin_ia32_pblendw256:
  case X86::BI__builtin_ia32_pblendd128:
  case X86::BI__builtin_ia32_pblendd256: {
    // Lower immediate blends to a shuffle: bit i of the immediate picks
    // element i from Ops[1] (bit set) or Ops[0] (bit clear).
    unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
    unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();

    uint32_t Indices[16];
    // If there are more than 8 elements, the immediate is used twice so make
    // sure we handle that.
    for (unsigned i = 0; i != NumElts; ++i)
      Indices[i] = ((Imm >> (i % 8)) & 0x1) ? NumElts + i : i;

    return Builder.CreateShuffleVector(Ops[0], Ops[1],
                                       makeArrayRef(Indices, NumElts),
                                       "blend");
  }
  case X86::BI__builtin_ia32_pshuflw:
  case X86::BI__builtin_ia32_pshuflw256:
  case X86::BI__builtin_ia32_pshuflw512: {
    // pshuflw: within each 128-bit lane (8 x i16), shuffle the low four
    // words per the immediate and pass the high four words through.
    uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
    llvm::Type *Ty = Ops[0]->getType();
    unsigned NumElts = Ty->getVectorNumElements();

    // Splat the 8-bits of immediate 4 times to help the loop wrap around.
    Imm = (Imm & 0xff) * 0x01010101;

    uint32_t Indices[32];
    for (unsigned l = 0; l != NumElts; l += 8) {
      // Low half: two immediate bits select each word.
      for (unsigned i = 0; i != 4; ++i) {
        Indices[l + i] = l + (Imm & 3);
        Imm >>= 2;
      }
      // High half: identity.
      for (unsigned i = 4; i != 8; ++i)
        Indices[l + i] = l + i;
    }

    return Builder.CreateShuffleVector(Ops[0], UndefValue::get(Ty),
                                       makeArrayRef(Indices, NumElts),
                                       "pshuflw");
  }
  case X86::BI__builtin_ia32_pshufhw:
  case X86::BI__builtin_ia32_pshufhw256:
  case X86::BI__builtin_ia32_pshufhw512: {
    // pshufhw: mirror of pshuflw — shuffle the high four words of each
    // 128-bit lane per the immediate, pass the low four through.
    uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
    llvm::Type *Ty = Ops[0]->getType();
    unsigned NumElts = Ty->getVectorNumElements();

    // Splat the 8-bits of immediate 4 times to help the loop wrap around.
    Imm = (Imm & 0xff) * 0x01010101;

    uint32_t Indices[32];
    for (unsigned l = 0; l != NumElts; l += 8) {
      // Low half: identity.
      for (unsigned i = 0; i != 4; ++i)
        Indices[l + i] = l + i;
      // High half: two immediate bits select each word.
      for (unsigned i = 4; i != 8; ++i) {
        Indices[l + i] = l + 4 + (Imm & 3);
        Imm >>= 2;
      }
    }

    return Builder.CreateShuffleVector(Ops[0], UndefValue::get(Ty),
                                       makeArrayRef(Indices, NumElts),
                                       "pshufhw");
  }
  case X86::BI__builtin_ia32_pshufd:
  case X86::BI__builtin_ia32_pshufd256:
  case X86::BI__builtin_ia32_pshufd512:
  case X86::BI__builtin_ia32_vpermilpd:
  case X86::BI__builtin_ia32_vpermilps:
  case X86::BI__builtin_ia32_vpermilpd256:
  case X86::BI__builtin_ia32_vpermilps256:
  case X86::BI__builtin_ia32_vpermilpd512:
  case X86::BI__builtin_ia32_vpermilps512: {
    // In-lane permutes: the immediate is decoded log2(NumLaneElts) bits at a
    // time, and the same pattern is applied to every 128-bit lane.
    uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
    llvm::Type *Ty = Ops[0]->getType();
    unsigned NumElts = Ty->getVectorNumElements();
    unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128;
    unsigned NumLaneElts = NumElts / NumLanes;

    // Splat the 8-bits of immediate 4 times to help the loop wrap around.
    Imm = (Imm & 0xff) * 0x01010101;

    uint32_t Indices[16];
    for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
      for (unsigned i = 0; i != NumLaneElts; ++i) {
        // Consume the next immediate field and keep the index in this lane.
        Indices[i + l] = (Imm % NumLaneElts) + l;
        Imm /= NumLaneElts;
      }
    }

    return Builder.CreateShuffleVector(Ops[0], UndefValue::get(Ty),
                                       makeArrayRef(Indices, NumElts),
                                       "permil");
  }
  case X86::BI__builtin_ia32_shufpd:
  case X86::BI__builtin_ia32_shufpd256:
  case X86::BI__builtin_ia32_shufpd512:
  case X86::BI__builtin_ia32_shufps:
  case X86::BI__builtin_ia32_shufps256:
  case X86::BI__builtin_ia32_shufps512: {
    // shufps/shufpd: per 128-bit lane, the low half of the result is chosen
    // from Ops[0] and the high half from Ops[1], indices from the immediate.
    uint32_t Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
    llvm::Type *Ty = Ops[0]->getType();
    unsigned NumElts = Ty->getVectorNumElements();
    unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128;
    unsigned NumLaneElts = NumElts / NumLanes;

    // Splat the 8-bits of immediate 4 times to help the loop wrap around.
    Imm = (Imm & 0xff) * 0x01010101;

    uint32_t Indices[16];
    for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
      for (unsigned i = 0; i != NumLaneElts; ++i) {
        unsigned Index = Imm % NumLaneElts;
        Imm /= NumLaneElts;
        // High half of the lane reads from the second operand.
        if (i >= (NumLaneElts / 2))
          Index += NumElts;
        Indices[l + i] = l + Index;
      }
    }

    return Builder.CreateShuffleVector(Ops[0], Ops[1],
                                       makeArrayRef(Indices, NumElts),
                                       "shufp");
  }
  case X86::BI__builtin_ia32_permdi256:
  case X86::BI__builtin_ia32_permdf256:
  case X86::BI__builtin_ia32_permdi512:
  case X86::BI__builtin_ia32_permdf512: {
    // vpermq/vpermpd: permute 64-bit elements; two immediate bits select
    // each element, same pattern repeated per 256-bit group.
    unsigned Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
    llvm::Type *Ty = Ops[0]->getType();
    unsigned NumElts = Ty->getVectorNumElements();

    // These intrinsics operate on 256-bit lanes of four 64-bit elements.
    uint32_t Indices[8];
    for (unsigned l = 0; l != NumElts; l += 4)
      for (unsigned i = 0; i != 4; ++i)
        Indices[l + i] = l + ((Imm >> (2 * i)) & 0x3);

    return Builder.CreateShuffleVector(Ops[0], UndefValue::get(Ty),
                                       makeArrayRef(Indices, NumElts),
                                       "perm");
  }
  case X86::BI__builtin_ia32_palignr128:
  case X86::BI__builtin_ia32_palignr256:
  case X86::BI__builtin_ia32_palignr512: {
    // palignr: byte-wise right shift of the concatenation Ops[0]:Ops[1],
    // performed independently in each 128-bit lane, lowered as a shuffle.
    unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0xff;

    unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
    assert(NumElts % 16 == 0);

    // If palignr is shifting the pair of vectors more than the size of two
    // lanes, emit zero.
    if (ShiftVal >= 32)
      return llvm::Constant::getNullValue(ConvertType(E->getType()));

    // If palignr is shifting the pair of input vectors more than one lane,
    // but less than two lanes, convert to shifting in zeroes.
    if (ShiftVal > 16) {
      ShiftVal -= 16;
      Ops[1] = Ops[0];
      Ops[0] = llvm::Constant::getNullValue(Ops[0]->getType());
    }

    uint32_t Indices[64];
    // 256-bit palignr operates on 128-bit lanes so we need to handle that
    for (unsigned l = 0; l != NumElts; l += 16) {
      for (unsigned i = 0; i != 16; ++i) {
        unsigned Idx = ShiftVal + i;
        if (Idx >= 16)
          Idx += NumElts - 16; // End of lane, switch operand.
        Indices[l + i] = Idx + l;
      }
    }

    // Note operand order: Ops[1] supplies the low bytes of the concatenation.
    return Builder.CreateShuffleVector(Ops[1], Ops[0],
                                       makeArrayRef(Indices, NumElts),
                                       "palignr");
  }
  case X86::BI__builtin_ia32_alignd128:
  case X86::BI__builtin_ia32_alignd256:
  case X86::BI__builtin_ia32_alignd512:
  case X86::BI__builtin_ia32_alignq128:
  case X86::BI__builtin_ia32_alignq256:
  case X86::BI__builtin_ia32_alignq512: {
    // valignd/valignq: element-wise right shift of the full (non-laned)
    // concatenation Ops[0]:Ops[1], lowered as a single shuffle.
    unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
    unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0xff;

    // Mask the shift amount to width of two vectors.
    ShiftVal &= (2 * NumElts) - 1;

    uint32_t Indices[16];
    for (unsigned i = 0; i != NumElts; ++i)
      Indices[i] = i + ShiftVal;

    // Note operand order: Ops[1] supplies the low elements of the concat.
    return Builder.CreateShuffleVector(Ops[1], Ops[0],
                                       makeArrayRef(Indices, NumElts),
                                       "valign");
  }
  case X86::BI__builtin_ia32_shuf_f32x4_256:
  case X86::BI__builtin_ia32_shuf_f64x2_256:
  case X86::BI__builtin_ia32_shuf_i32x4_256:
  case X86::BI__builtin_ia32_shuf_i64x2_256:
  case X86::BI__builtin_ia32_shuf_f32x4:
  case X86::BI__builtin_ia32_shuf_f64x2:
  case X86::BI__builtin_ia32_shuf_i32x4:
  case X86::BI__builtin_ia32_shuf_i64x2: {
    // vshuff/vshufi: shuffle whole 128-bit lanes. The low half of the result
    // takes lanes from Ops[0], the high half from Ops[1], per the immediate.
    unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
    llvm::Type *Ty = Ops[0]->getType();
    unsigned NumElts = Ty->getVectorNumElements();
    unsigned NumLanes = Ty->getPrimitiveSizeInBits() == 512 ? 4 : 2;
    unsigned NumLaneElts = NumElts / NumLanes;

    uint32_t Indices[16];
    for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
      unsigned Index = (Imm % NumLanes) * NumLaneElts;
      Imm /= NumLanes; // Discard the bits we just used.
      if (l >= (NumElts / 2))
        Index += NumElts; // Switch to other source.
      for (unsigned i = 0; i != NumLaneElts; ++i) {
        Indices[l + i] = Index + i;
      }
    }

    return Builder.CreateShuffleVector(Ops[0], Ops[1],
                                       makeArrayRef(Indices, NumElts),
                                       "shuf");
  }

  case X86::BI__builtin_ia32_vperm2f128_pd256:
  case X86::BI__builtin_ia32_vperm2f128_ps256:
  case X86::BI__builtin_ia32_vperm2f128_si256:
  case X86::BI__builtin_ia32_permti256: {
    // vperm2f128/vperm2i128: each output 128-bit lane is independently one
    // of {low/high lane of Ops[0], low/high lane of Ops[1], zero}, selected
    // by a 4-bit immediate field per lane.
    unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
    unsigned NumElts = Ops[0]->getType()->getVectorNumElements();

    // This takes a very simple approach since there are two lanes and a
    // shuffle can have 2 inputs. So we reserve the first input for the first
    // lane and the second input for the second lane. This may result in
    // duplicate sources, but this can be dealt with in the backend.

    Value *OutOps[2];
    uint32_t Indices[8];
    for (unsigned l = 0; l != 2; ++l) {
      // Determine the source for this lane.
      // Bit 3 of the immediate field zeroes the lane entirely.
      if (Imm & (1 << ((l * 4) + 3)))
        OutOps[l] = llvm::ConstantAggregateZero::get(Ops[0]->getType());
      else if (Imm & (1 << ((l * 4) + 1)))
        OutOps[l] = Ops[1];
      else
        OutOps[l] = Ops[0];

      for (unsigned i = 0; i != NumElts/2; ++i) {
        // Start with ith element of the source for this lane.
        unsigned Idx = (l * NumElts) + i;
        // If bit 0 of the immediate half is set, switch to the high half of
        // the source.
        if (Imm & (1 << (l * 4)))
          Idx += NumElts/2;
        Indices[(l * (NumElts/2)) + i] = Idx;
      }
    }

    return Builder.CreateShuffleVector(OutOps[0], OutOps[1],
                                       makeArrayRef(Indices, NumElts),
                                       "vperm");
  }

  case X86::BI__builtin_ia32_pslldqi128_byteshift:
  case X86::BI__builtin_ia32_pslldqi256_byteshift:
  case X86::BI__builtin_ia32_pslldqi512_byteshift: {
    // pslldq: per-128-bit-lane byte shift left, lowered by bitcasting to
    // vXi8 and shuffling bytes in from a zero vector.
    unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
    llvm::Type *ResultType = Ops[0]->getType();
    // Builtin type is vXi64 so multiply by 8 to get bytes.
    unsigned NumElts = ResultType->getVectorNumElements() * 8;

    // If pslldq is shifting the vector more than 15 bytes, emit zero.
    if (ShiftVal >= 16)
      return llvm::Constant::getNullValue(ResultType);

    uint32_t Indices[64];
    // 256/512-bit pslldq operates on 128-bit lanes so we need to handle that
    for (unsigned l = 0; l != NumElts; l += 16) {
      for (unsigned i = 0; i != 16; ++i) {
        unsigned Idx = NumElts + i - ShiftVal;
        if (Idx < NumElts) Idx -= NumElts - 16; // end of lane, switch operand.
        Indices[l + i] = Idx + l;
      }
    }

    llvm::Type *VecTy = llvm::VectorType::get(Int8Ty, NumElts);
    Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast");
    Value *Zero = llvm::Constant::getNullValue(VecTy);
    Value *SV = Builder.CreateShuffleVector(Zero, Cast,
                                            makeArrayRef(Indices, NumElts),
                                            "pslldq");
    return Builder.CreateBitCast(SV, Ops[0]->getType(), "cast");
  }
  case X86::BI__builtin_ia32_psrldqi128_byteshift:
  case X86::BI__builtin_ia32_psrldqi256_byteshift:
  case X86::BI__builtin_ia32_psrldqi512_byteshift: {
    // psrldq: per-128-bit-lane byte shift right; same approach as pslldq
    // with the zero vector as the second shuffle operand.
    unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
    llvm::Type *ResultType = Ops[0]->getType();
    // Builtin type is vXi64 so multiply by 8 to get bytes.
    unsigned NumElts = ResultType->getVectorNumElements() * 8;

    // If psrldq is shifting the vector more than 15 bytes, emit zero.
    if (ShiftVal >= 16)
      return llvm::Constant::getNullValue(ResultType);

    uint32_t Indices[64];
    // 256/512-bit psrldq operates on 128-bit lanes so we need to handle that
    for (unsigned l = 0; l != NumElts; l += 16) {
      for (unsigned i = 0; i != 16; ++i) {
        unsigned Idx = i + ShiftVal;
        if (Idx >= 16) Idx += NumElts - 16; // end of lane, switch operand.
        Indices[l + i] = Idx + l;
      }
    }

    llvm::Type *VecTy = llvm::VectorType::get(Int8Ty, NumElts);
    Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast");
    Value *Zero = llvm::Constant::getNullValue(VecTy);
    Value *SV = Builder.CreateShuffleVector(Cast, Zero,
                                            makeArrayRef(Indices, NumElts),
                                            "psrldq");
    return Builder.CreateBitCast(SV, ResultType, "cast");
  }
  case X86::BI__builtin_ia32_kshiftliqi:
  case X86::BI__builtin_ia32_kshiftlihi:
  case X86::BI__builtin_ia32_kshiftlisi:
  case X86::BI__builtin_ia32_kshiftlidi: {
    // kshiftl: shift a mask register left, modeled as a vXi1 shuffle that
    // pulls zero bits in from a null vector.
    unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
    unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();

    // Shifting by the full width or more yields zero.
    if (ShiftVal >= NumElts)
      return llvm::Constant::getNullValue(Ops[0]->getType());

    Value *In = getMaskVecValue(*this, Ops[0], NumElts);

    uint32_t Indices[64];
    for (unsigned i = 0; i != NumElts; ++i)
      Indices[i] = NumElts + i - ShiftVal;

    Value *Zero = llvm::Constant::getNullValue(In->getType());
    Value *SV = Builder.CreateShuffleVector(Zero, In,
                                            makeArrayRef(Indices, NumElts),
                                            "kshiftl");
    return Builder.CreateBitCast(SV, Ops[0]->getType());
  }
  case X86::BI__builtin_ia32_kshiftriqi:
  case X86::BI__builtin_ia32_kshiftrihi:
  case X86::BI__builtin_ia32_kshiftrisi:
  case X86::BI__builtin_ia32_kshiftridi: {
    // kshiftr: mask-register shift right; mirror of kshiftl above.
    unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
    unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();

    // Shifting by the full width or more yields zero.
    if (ShiftVal >= NumElts)
      return llvm::Constant::getNullValue(Ops[0]->getType());

    Value *In = getMaskVecValue(*this, Ops[0], NumElts);

    uint32_t Indices[64];
    for (unsigned i = 0; i != NumElts; ++i)
      Indices[i] = i + ShiftVal;

    Value *Zero = llvm::Constant::getNullValue(In->getType());
    Value *SV = Builder.CreateShuffleVector(In, Zero,
                                            makeArrayRef(Indices, NumElts),
                                            "kshiftr");
    return Builder.CreateBitCast(SV, Ops[0]->getType());
  }
  case X86::BI__builtin_ia32_movnti:
  case X86::BI__builtin_ia32_movnti64:
  case X86::BI__builtin_ia32_movntsd:
  case X86::BI__builtin_ia32_movntss: {
    // Non-temporal scalar stores: emit a store tagged with !nontemporal
    // metadata so the backend selects a streaming store.
    llvm::MDNode *Node = llvm::MDNode::get(
        getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));

    Value *Ptr = Ops[0];
    Value *Src = Ops[1];

    // Extract the 0'th element of the source vector.
    if (BuiltinID == X86::BI__builtin_ia32_movntsd ||
        BuiltinID == X86::BI__builtin_ia32_movntss)
      Src = Builder.CreateExtractElement(Src, (uint64_t)0, "extract");

    // Convert the type of the pointer to a pointer to the stored type.
    Value *BC = Builder.CreateBitCast(
        Ptr, llvm::PointerType::getUnqual(Src->getType()), "cast");

    // Unaligned nontemporal store of the scalar value.
    StoreInst *SI = Builder.CreateDefaultAlignedStore(Src, BC);
    SI->setMetadata(CGM.getModule().getMDKindID("nontemporal"), Node);
    SI->setAlignment(1); // Force alignment 1: the pointer may be unaligned.
    return SI;
  }

  case X86::BI__builtin_ia32_selectb_128:
  case X86::BI__builtin_ia32_selectb_256:
  case X86::BI__builtin_ia32_selectb_512:
  case X86::BI__builtin_ia32_selectw_128:
  case X86::BI__builtin_ia32_selectw_256:
  case X86::BI__builtin_ia32_selectw_512:
  case X86::BI__builtin_ia32_selectd_128:
  case X86::BI__builtin_ia32_selectd_256:
  case X86::BI__builtin_ia32_selectd_512:
  case X86::BI__builtin_ia32_selectq_128:
  case X86::BI__builtin_ia32_selectq_256:
  case X86::BI__builtin_ia32_selectq_512:
  case X86::BI__builtin_ia32_selectps_128:
  case X86::BI__builtin_ia32_selectps_256:
  case X86::BI__builtin_ia32_selectps_512:
  case X86::BI__builtin_ia32_selectpd_128:
  case X86::BI__builtin_ia32_selectpd_256:
  case X86::BI__builtin_ia32_selectpd_512:
    // Vector mask select: mask Ops[0] chooses between Ops[1] and Ops[2].
    return EmitX86Select(*this, Ops[0], Ops[1], Ops[2]);
  case X86::BI__builtin_ia32_selectss_128:
  case X86::BI__builtin_ia32_selectsd_128: {
    // Scalar select: operate on element 0 only, then reinsert into Ops[1].
    Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
    Value *B = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
    A = EmitX86ScalarSelect(*this, Ops[0], A, B);
    return Builder.CreateInsertElement(Ops[1], A, (uint64_t)0);
  }
  case X86::BI__builtin_ia32_cmpb128_mask:
  case X86::BI__builtin_ia32_cmpb256_mask:
  case X86::BI__builtin_ia32_cmpb512_mask:
  case X86::BI__builtin_ia32_cmpw128_mask:
  case X86::BI__builtin_ia32_cmpw256_mask:
  case X86::BI__builtin_ia32_cmpw512_mask:
  case X86::BI__builtin_ia32_cmpd128_mask:
  case X86::BI__builtin_ia32_cmpd256_mask:
  case X86::BI__builtin_ia32_cmpd512_mask:
  case X86::BI__builtin_ia32_cmpq128_mask:
  case X86::BI__builtin_ia32_cmpq256_mask:
  case X86::BI__builtin_ia32_cmpq512_mask: {
    // Signed integer compares producing a mask; the low 3 immediate bits
    // encode the comparison predicate.
    unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
    return EmitX86MaskedCompare(*this, CC, true, Ops);
  }
  case X86::BI__builtin_ia32_ucmpb128_mask:
  case X86::BI__builtin_ia32_ucmpb256_mask:
  case X86::BI__builtin_ia32_ucmpb512_mask:
  case X86::BI__builtin_ia32_ucmpw128_mask:
  case X86::BI__builtin_ia32_ucmpw256_mask:
  case X86::BI__builtin_ia32_ucmpw512_mask:
  case X86::BI__builtin_ia32_ucmpd128_mask:
  case X86::BI__builtin_ia32_ucmpd256_mask:
  case X86::BI__builtin_ia32_ucmpd512_mask:
  case X86::BI__builtin_ia32_ucmpq128_mask:
  case X86::BI__builtin_ia32_ucmpq256_mask:
  case X86::BI__builtin_ia32_ucmpq512_mask: {
    // Unsigned variants of the masked compares above.
    unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
    return EmitX86MaskedCompare(*this, CC, false, Ops);
  }

  case X86::BI__builtin_ia32_kortestcqi:
  case X86::BI__builtin_ia32_kortestchi:
  case X86::BI__builtin_ia32_kortestcsi:
  case X86::BI__builtin_ia32_kortestcdi: {
    // kortestc: OR the two masks and test whether the result is all-ones.
    Value *Or = EmitX86MaskLogic(*this, Instruction::Or, Ops);
    Value *C = llvm::Constant::getAllOnesValue(Ops[0]->getType());
    Value *Cmp = Builder.CreateICmpEQ(Or, C);
    return Builder.CreateZExt(Cmp, ConvertType(E->getType()));
  }
  case X86::BI__builtin_ia32_kortestzqi:
  case X86::BI__builtin_ia32_kortestzhi:
  case X86::BI__builtin_ia32_kortestzsi:
  case X86::BI__builtin_ia32_kortestzdi: {
    // kortestz: OR the two masks and test whether the result is zero.
    Value *Or = EmitX86MaskLogic(*this, Instruction::Or, Ops);
    Value *C = llvm::Constant::getNullValue(Ops[0]->getType());
    Value *Cmp = Builder.CreateICmpEQ(Or, C);
    return Builder.CreateZExt(Cmp, ConvertType(E->getType()));
  }

  case X86::BI__builtin_ia32_ktestcqi:
  case X86::BI__builtin_ia32_ktestzqi:
  case X86::BI__builtin_ia32_ktestchi:
  case X86::BI__builtin_ia32_ktestzhi:
  case X86::BI__builtin_ia32_ktestcsi:
  case X86::BI__builtin_ia32_ktestzsi:
  case X86::BI__builtin_ia32_ktestcdi:
  case X86::BI__builtin_ia32_ktestzdi: {
    // ktestc/ktestz: map each mask width directly to the corresponding
    // AVX-512 ktest intrinsic on vXi1 operands.
    Intrinsic::ID IID;
    switch (BuiltinID) {
    default: llvm_unreachable("Unsupported intrinsic!");
    case X86::BI__builtin_ia32_ktestcqi:
      IID = Intrinsic::x86_avx512_ktestc_b;
      break;
    case X86::BI__builtin_ia32_ktestzqi:
      IID = Intrinsic::x86_avx512_ktestz_b;
      break;
    case X86::BI__builtin_ia32_ktestchi:
      IID = Intrinsic::x86_avx512_ktestc_w;
      break;
    case X86::BI__builtin_ia32_ktestzhi:
      IID = Intrinsic::x86_avx512_ktestz_w;
      break;
    case X86::BI__builtin_ia32_ktestcsi:
      IID = Intrinsic::x86_avx512_ktestc_d;
      break;
    case X86::BI__builtin_ia32_ktestzsi:
      IID = Intrinsic::x86_avx512_ktestz_d;
      break;
    case X86::BI__builtin_ia32_ktestcdi:
      IID = Intrinsic::x86_avx512_ktestc_q;
      break;
    case X86::BI__builtin_ia32_ktestzdi:
      IID = Intrinsic::x86_avx512_ktestz_q;
      break;
    }

    unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
    Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
    Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
    Function *Intr = CGM.getIntrinsic(IID);
    return Builder.CreateCall(Intr, {LHS, RHS});
  }

  case X86::BI__builtin_ia32_kaddqi:
  case X86::BI__builtin_ia32_kaddhi:
  case X86::BI__builtin_ia32_kaddsi:
  case X86::BI__builtin_ia32_kadddi: {
    // kadd: map each mask width to the AVX-512 kadd intrinsic; operate on
    // vXi1 vectors and bitcast back to the integer mask type.
    Intrinsic::ID IID;
    switch (BuiltinID) {
    default: llvm_unreachable("Unsupported intrinsic!");
    case X86::BI__builtin_ia32_kaddqi:
      IID = Intrinsic::x86_avx512_kadd_b;
      break;
    case X86::BI__builtin_ia32_kaddhi:
      IID = Intrinsic::x86_avx512_kadd_w;
      break;
    case X86::BI__builtin_ia32_kaddsi:
      IID = Intrinsic::x86_avx512_kadd_d;
      break;
    case X86::BI__builtin_ia32_kadddi:
      IID = Intrinsic::x86_avx512_kadd_q;
      break;
    }

    unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
    Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
    Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
    Function *Intr = CGM.getIntrinsic(IID);
    Value *Res = Builder.CreateCall(Intr, {LHS, RHS});
    return Builder.CreateBitCast(Res, Ops[0]->getType());
  }
  case X86::BI__builtin_ia32_kandqi:
  case X86::BI__builtin_ia32_kandhi:
  case X86::BI__builtin_ia32_kandsi:
  case X86::BI__builtin_ia32_kanddi:
    // kand: plain AND of two mask registers.
    return EmitX86MaskLogic(*this, Instruction::And, Ops);
  case X86::BI__builtin_ia32_kandnqi:
  case X86::BI__builtin_ia32_kandnhi:
  case X86::BI__builtin_ia32_kandnsi:
  case X86::BI__builtin_ia32_kandndi:
    // kandn: AND with the first operand inverted (the trailing 'true').
    return EmitX86MaskLogic(*this, Instruction::And, Ops, true);
  case X86::BI__builtin_ia32_korqi:
  case X86::BI__builtin_ia32_korhi:
  case X86::BI__builtin_ia32_korsi:
  case X86::BI__builtin_ia32_kordi:
    // kor: plain OR of two mask registers.
    return EmitX86MaskLogic(*this, Instruction::Or, Ops);
  case X86::BI__builtin_ia32_kxnorqi:
  case X86::BI__builtin_ia32_kxnorhi:
  case X86::BI__builtin_ia32_kxnorsi:
  case X86::BI__builtin_ia32_kxnordi:
    // kxnor: XOR with one operand inverted (the trailing 'true').
    return EmitX86MaskLogic(*this, Instruction::Xor, Ops, true);
  case X86::BI__builtin_ia32_kxorqi:
  case X86::BI__builtin_ia32_kxorhi:
  case X86::BI__builtin_ia32_kxorsi:
  case X86::BI__builtin_ia32_kxordi:
    // kxor: plain XOR of two mask registers.
    return EmitX86MaskLogic(*this, Instruction::Xor,  Ops);
  case X86::BI__builtin_ia32_knotqi:
  case X86::BI__builtin_ia32_knothi:
  case X86::BI__builtin_ia32_knotsi:
  case X86::BI__builtin_ia32_knotdi: {
    // knot: bitwise NOT via the vXi1 form, then back to the integer mask.
    unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
    Value *Res = getMaskVecValue(*this, Ops[0], NumElts);
    return Builder.CreateBitCast(Builder.CreateNot(Res),
                                 Ops[0]->getType());
  }
  case X86::BI__builtin_ia32_kmovb:
  case X86::BI__builtin_ia32_kmovw:
  case X86::BI__builtin_ia32_kmovd:
  case X86::BI__builtin_ia32_kmovq: {
    // Bitcast to vXi1 type and then back to integer. This gets the mask
    // register type into the IR, but might be optimized out depending on
    // what's around it.
    unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
    Value *Res = getMaskVecValue(*this, Ops[0], NumElts);
    return Builder.CreateBitCast(Res, Ops[0]->getType());
  }

  case X86::BI__builtin_ia32_kunpckdi:
  case X86::BI__builtin_ia32_kunpcksi:
  case X86::BI__builtin_ia32_kunpckhi: {
    // kunpck: concatenate the low halves of two mask registers.
    unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
    Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
    Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
    // Identity indices; reused below for both the half-extract shuffles
    // (first NumElts/2 entries) and the final concat (all NumElts entries).
    uint32_t Indices[64];
    for (unsigned i = 0; i != NumElts; ++i)
      Indices[i] = i;

    // First extract half of each vector. This gives better codegen than
    // doing it in a single shuffle.
    LHS = Builder.CreateShuffleVector(LHS, LHS,
                                      makeArrayRef(Indices, NumElts / 2));
    RHS = Builder.CreateShuffleVector(RHS, RHS,
                                      makeArrayRef(Indices, NumElts / 2));
    // Concat the vectors.
    // NOTE: Operands are swapped to match the intrinsic definition.
    Value *Res = Builder.CreateShuffleVector(RHS, LHS,
                                             makeArrayRef(Indices, NumElts));
    return Builder.CreateBitCast(Res, Ops[0]->getType());
  }

  case X86::BI__builtin_ia32_vplzcntd_128:
  case X86::BI__builtin_ia32_vplzcntd_256:
  case X86::BI__builtin_ia32_vplzcntd_512:
  case X86::BI__builtin_ia32_vplzcntq_128:
  case X86::BI__builtin_ia32_vplzcntq_256:
  case X86::BI__builtin_ia32_vplzcntq_512: {
    // vplzcnt: lower to the generic ctlz intrinsic. The 'false' flag means
    // a zero input is defined (yields the bit width), matching lzcnt.
    Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType());
    return Builder.CreateCall(F, {Ops[0],Builder.getInt1(false)});
  }
  case X86::BI__builtin_ia32_sqrtss:
  case X86::BI__builtin_ia32_sqrtsd: {
    // Scalar sqrt: take sqrt of element 0 and reinsert it into the vector.
    Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0);
    Function *F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
    A = Builder.CreateCall(F, {A});
    return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
  }
10760   case X86::BI__builtin_ia32_sqrtsd_round_mask:
10761   case X86::BI__builtin_ia32_sqrtss_round_mask: {
10762     unsigned CC = cast<llvm::ConstantInt>(Ops[4])->getZExtValue();
10763     // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
10764     // otherwise keep the intrinsic.
10765     if (CC != 4) {
10766       Intrinsic::ID IID = BuiltinID == X86::BI__builtin_ia32_sqrtsd_round_mask ?
10767                           Intrinsic::x86_avx512_mask_sqrt_sd :
10768                           Intrinsic::x86_avx512_mask_sqrt_ss;
10769       return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
10770     }
10771     Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
10772     Function *F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
10773     A = Builder.CreateCall(F, A);
10774     Value *Src = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
10775     A = EmitX86ScalarSelect(*this, Ops[3], A, Src);
10776     return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
10777   }
10778   case X86::BI__builtin_ia32_sqrtpd256:
10779   case X86::BI__builtin_ia32_sqrtpd:
10780   case X86::BI__builtin_ia32_sqrtps256:
10781   case X86::BI__builtin_ia32_sqrtps:
10782   case X86::BI__builtin_ia32_sqrtps512:
10783   case X86::BI__builtin_ia32_sqrtpd512: {
10784     if (Ops.size() == 2) {
10785       unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
10786       // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
10787       // otherwise keep the intrinsic.
10788       if (CC != 4) {
10789         Intrinsic::ID IID = BuiltinID == X86::BI__builtin_ia32_sqrtps512 ?
10790                             Intrinsic::x86_avx512_sqrt_ps_512 :
10791                             Intrinsic::x86_avx512_sqrt_pd_512;
10792         return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
10793       }
10794     }
10795     Function *F = CGM.getIntrinsic(Intrinsic::sqrt, Ops[0]->getType());
10796     return Builder.CreateCall(F, Ops[0]);
10797   }
10798   case X86::BI__builtin_ia32_pabsb128:
10799   case X86::BI__builtin_ia32_pabsw128:
10800   case X86::BI__builtin_ia32_pabsd128:
10801   case X86::BI__builtin_ia32_pabsb256:
10802   case X86::BI__builtin_ia32_pabsw256:
10803   case X86::BI__builtin_ia32_pabsd256:
10804   case X86::BI__builtin_ia32_pabsq128:
10805   case X86::BI__builtin_ia32_pabsq256:
10806   case X86::BI__builtin_ia32_pabsb512:
10807   case X86::BI__builtin_ia32_pabsw512:
10808   case X86::BI__builtin_ia32_pabsd512:
10809   case X86::BI__builtin_ia32_pabsq512:
10810     return EmitX86Abs(*this, Ops);
10811 
10812   case X86::BI__builtin_ia32_pmaxsb128:
10813   case X86::BI__builtin_ia32_pmaxsw128:
10814   case X86::BI__builtin_ia32_pmaxsd128:
10815   case X86::BI__builtin_ia32_pmaxsq128:
10816   case X86::BI__builtin_ia32_pmaxsb256:
10817   case X86::BI__builtin_ia32_pmaxsw256:
10818   case X86::BI__builtin_ia32_pmaxsd256:
10819   case X86::BI__builtin_ia32_pmaxsq256:
10820   case X86::BI__builtin_ia32_pmaxsb512:
10821   case X86::BI__builtin_ia32_pmaxsw512:
10822   case X86::BI__builtin_ia32_pmaxsd512:
10823   case X86::BI__builtin_ia32_pmaxsq512:
10824     return EmitX86MinMax(*this, ICmpInst::ICMP_SGT, Ops);
10825   case X86::BI__builtin_ia32_pmaxub128:
10826   case X86::BI__builtin_ia32_pmaxuw128:
10827   case X86::BI__builtin_ia32_pmaxud128:
10828   case X86::BI__builtin_ia32_pmaxuq128:
10829   case X86::BI__builtin_ia32_pmaxub256:
10830   case X86::BI__builtin_ia32_pmaxuw256:
10831   case X86::BI__builtin_ia32_pmaxud256:
10832   case X86::BI__builtin_ia32_pmaxuq256:
10833   case X86::BI__builtin_ia32_pmaxub512:
10834   case X86::BI__builtin_ia32_pmaxuw512:
10835   case X86::BI__builtin_ia32_pmaxud512:
10836   case X86::BI__builtin_ia32_pmaxuq512:
10837     return EmitX86MinMax(*this, ICmpInst::ICMP_UGT, Ops);
10838   case X86::BI__builtin_ia32_pminsb128:
10839   case X86::BI__builtin_ia32_pminsw128:
10840   case X86::BI__builtin_ia32_pminsd128:
10841   case X86::BI__builtin_ia32_pminsq128:
10842   case X86::BI__builtin_ia32_pminsb256:
10843   case X86::BI__builtin_ia32_pminsw256:
10844   case X86::BI__builtin_ia32_pminsd256:
10845   case X86::BI__builtin_ia32_pminsq256:
10846   case X86::BI__builtin_ia32_pminsb512:
10847   case X86::BI__builtin_ia32_pminsw512:
10848   case X86::BI__builtin_ia32_pminsd512:
10849   case X86::BI__builtin_ia32_pminsq512:
10850     return EmitX86MinMax(*this, ICmpInst::ICMP_SLT, Ops);
10851   case X86::BI__builtin_ia32_pminub128:
10852   case X86::BI__builtin_ia32_pminuw128:
10853   case X86::BI__builtin_ia32_pminud128:
10854   case X86::BI__builtin_ia32_pminuq128:
10855   case X86::BI__builtin_ia32_pminub256:
10856   case X86::BI__builtin_ia32_pminuw256:
10857   case X86::BI__builtin_ia32_pminud256:
10858   case X86::BI__builtin_ia32_pminuq256:
10859   case X86::BI__builtin_ia32_pminub512:
10860   case X86::BI__builtin_ia32_pminuw512:
10861   case X86::BI__builtin_ia32_pminud512:
10862   case X86::BI__builtin_ia32_pminuq512:
10863     return EmitX86MinMax(*this, ICmpInst::ICMP_ULT, Ops);
10864 
10865   case X86::BI__builtin_ia32_pmuludq128:
10866   case X86::BI__builtin_ia32_pmuludq256:
10867   case X86::BI__builtin_ia32_pmuludq512:
10868     return EmitX86Muldq(*this, /*IsSigned*/false, Ops);
10869 
10870   case X86::BI__builtin_ia32_pmuldq128:
10871   case X86::BI__builtin_ia32_pmuldq256:
10872   case X86::BI__builtin_ia32_pmuldq512:
10873     return EmitX86Muldq(*this, /*IsSigned*/true, Ops);
10874 
10875   case X86::BI__builtin_ia32_pternlogd512_mask:
10876   case X86::BI__builtin_ia32_pternlogq512_mask:
10877   case X86::BI__builtin_ia32_pternlogd128_mask:
10878   case X86::BI__builtin_ia32_pternlogd256_mask:
10879   case X86::BI__builtin_ia32_pternlogq128_mask:
10880   case X86::BI__builtin_ia32_pternlogq256_mask:
10881     return EmitX86Ternlog(*this, /*ZeroMask*/false, Ops);
10882 
10883   case X86::BI__builtin_ia32_pternlogd512_maskz:
10884   case X86::BI__builtin_ia32_pternlogq512_maskz:
10885   case X86::BI__builtin_ia32_pternlogd128_maskz:
10886   case X86::BI__builtin_ia32_pternlogd256_maskz:
10887   case X86::BI__builtin_ia32_pternlogq128_maskz:
10888   case X86::BI__builtin_ia32_pternlogq256_maskz:
10889     return EmitX86Ternlog(*this, /*ZeroMask*/true, Ops);
10890 
10891   // 3DNow!
10892   case X86::BI__builtin_ia32_pswapdsf:
10893   case X86::BI__builtin_ia32_pswapdsi: {
10894     llvm::Type *MMXTy = llvm::Type::getX86_MMXTy(getLLVMContext());
10895     Ops[0] = Builder.CreateBitCast(Ops[0], MMXTy, "cast");
10896     llvm::Function *F = CGM.getIntrinsic(Intrinsic::x86_3dnowa_pswapd);
10897     return Builder.CreateCall(F, Ops, "pswapd");
10898   }
10899   case X86::BI__builtin_ia32_rdrand16_step:
10900   case X86::BI__builtin_ia32_rdrand32_step:
10901   case X86::BI__builtin_ia32_rdrand64_step:
10902   case X86::BI__builtin_ia32_rdseed16_step:
10903   case X86::BI__builtin_ia32_rdseed32_step:
10904   case X86::BI__builtin_ia32_rdseed64_step: {
10905     Intrinsic::ID ID;
10906     switch (BuiltinID) {
10907     default: llvm_unreachable("Unsupported intrinsic!");
10908     case X86::BI__builtin_ia32_rdrand16_step:
10909       ID = Intrinsic::x86_rdrand_16;
10910       break;
10911     case X86::BI__builtin_ia32_rdrand32_step:
10912       ID = Intrinsic::x86_rdrand_32;
10913       break;
10914     case X86::BI__builtin_ia32_rdrand64_step:
10915       ID = Intrinsic::x86_rdrand_64;
10916       break;
10917     case X86::BI__builtin_ia32_rdseed16_step:
10918       ID = Intrinsic::x86_rdseed_16;
10919       break;
10920     case X86::BI__builtin_ia32_rdseed32_step:
10921       ID = Intrinsic::x86_rdseed_32;
10922       break;
10923     case X86::BI__builtin_ia32_rdseed64_step:
10924       ID = Intrinsic::x86_rdseed_64;
10925       break;
10926     }
10927 
10928     Value *Call = Builder.CreateCall(CGM.getIntrinsic(ID));
10929     Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 0),
10930                                       Ops[0]);
10931     return Builder.CreateExtractValue(Call, 1);
10932   }
10933   case X86::BI__builtin_ia32_addcarryx_u32:
10934   case X86::BI__builtin_ia32_addcarryx_u64:
10935   case X86::BI__builtin_ia32_addcarry_u32:
10936   case X86::BI__builtin_ia32_addcarry_u64:
10937   case X86::BI__builtin_ia32_subborrow_u32:
10938   case X86::BI__builtin_ia32_subborrow_u64: {
10939     Intrinsic::ID IID;
10940     switch (BuiltinID) {
10941     default: llvm_unreachable("Unsupported intrinsic!");
10942     case X86::BI__builtin_ia32_addcarryx_u32:
10943       IID = Intrinsic::x86_addcarryx_u32;
10944       break;
10945     case X86::BI__builtin_ia32_addcarryx_u64:
10946       IID = Intrinsic::x86_addcarryx_u64;
10947       break;
10948     case X86::BI__builtin_ia32_addcarry_u32:
10949       IID = Intrinsic::x86_addcarry_u32;
10950       break;
10951     case X86::BI__builtin_ia32_addcarry_u64:
10952       IID = Intrinsic::x86_addcarry_u64;
10953       break;
10954     case X86::BI__builtin_ia32_subborrow_u32:
10955       IID = Intrinsic::x86_subborrow_u32;
10956       break;
10957     case X86::BI__builtin_ia32_subborrow_u64:
10958       IID = Intrinsic::x86_subborrow_u64;
10959       break;
10960     }
10961 
10962     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID),
10963                                      { Ops[0], Ops[1], Ops[2] });
10964     Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 1),
10965                                       Ops[3]);
10966     return Builder.CreateExtractValue(Call, 0);
10967   }
10968 
10969   case X86::BI__builtin_ia32_fpclassps128_mask:
10970   case X86::BI__builtin_ia32_fpclassps256_mask:
10971   case X86::BI__builtin_ia32_fpclassps512_mask:
10972   case X86::BI__builtin_ia32_fpclasspd128_mask:
10973   case X86::BI__builtin_ia32_fpclasspd256_mask:
10974   case X86::BI__builtin_ia32_fpclasspd512_mask: {
10975     unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
10976     Value *MaskIn = Ops[2];
10977     Ops.erase(&Ops[2]);
10978 
10979     Intrinsic::ID ID;
10980     switch (BuiltinID) {
10981     default: llvm_unreachable("Unsupported intrinsic!");
10982     case X86::BI__builtin_ia32_fpclassps128_mask:
10983       ID = Intrinsic::x86_avx512_fpclass_ps_128;
10984       break;
10985     case X86::BI__builtin_ia32_fpclassps256_mask:
10986       ID = Intrinsic::x86_avx512_fpclass_ps_256;
10987       break;
10988     case X86::BI__builtin_ia32_fpclassps512_mask:
10989       ID = Intrinsic::x86_avx512_fpclass_ps_512;
10990       break;
10991     case X86::BI__builtin_ia32_fpclasspd128_mask:
10992       ID = Intrinsic::x86_avx512_fpclass_pd_128;
10993       break;
10994     case X86::BI__builtin_ia32_fpclasspd256_mask:
10995       ID = Intrinsic::x86_avx512_fpclass_pd_256;
10996       break;
10997     case X86::BI__builtin_ia32_fpclasspd512_mask:
10998       ID = Intrinsic::x86_avx512_fpclass_pd_512;
10999       break;
11000     }
11001 
11002     Value *Fpclass = Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
11003     return EmitX86MaskedCompareResult(*this, Fpclass, NumElts, MaskIn);
11004   }
11005 
11006   // packed comparison intrinsics
11007   case X86::BI__builtin_ia32_cmpeqps:
11008   case X86::BI__builtin_ia32_cmpeqpd:
11009     return getVectorFCmpIR(CmpInst::FCMP_OEQ);
11010   case X86::BI__builtin_ia32_cmpltps:
11011   case X86::BI__builtin_ia32_cmpltpd:
11012     return getVectorFCmpIR(CmpInst::FCMP_OLT);
11013   case X86::BI__builtin_ia32_cmpleps:
11014   case X86::BI__builtin_ia32_cmplepd:
11015     return getVectorFCmpIR(CmpInst::FCMP_OLE);
11016   case X86::BI__builtin_ia32_cmpunordps:
11017   case X86::BI__builtin_ia32_cmpunordpd:
11018     return getVectorFCmpIR(CmpInst::FCMP_UNO);
11019   case X86::BI__builtin_ia32_cmpneqps:
11020   case X86::BI__builtin_ia32_cmpneqpd:
11021     return getVectorFCmpIR(CmpInst::FCMP_UNE);
11022   case X86::BI__builtin_ia32_cmpnltps:
11023   case X86::BI__builtin_ia32_cmpnltpd:
11024     return getVectorFCmpIR(CmpInst::FCMP_UGE);
11025   case X86::BI__builtin_ia32_cmpnleps:
11026   case X86::BI__builtin_ia32_cmpnlepd:
11027     return getVectorFCmpIR(CmpInst::FCMP_UGT);
11028   case X86::BI__builtin_ia32_cmpordps:
11029   case X86::BI__builtin_ia32_cmpordpd:
11030     return getVectorFCmpIR(CmpInst::FCMP_ORD);
11031   case X86::BI__builtin_ia32_cmpps:
11032   case X86::BI__builtin_ia32_cmpps256:
11033   case X86::BI__builtin_ia32_cmppd:
11034   case X86::BI__builtin_ia32_cmppd256:
11035   case X86::BI__builtin_ia32_cmpps128_mask:
11036   case X86::BI__builtin_ia32_cmpps256_mask:
11037   case X86::BI__builtin_ia32_cmpps512_mask:
11038   case X86::BI__builtin_ia32_cmppd128_mask:
11039   case X86::BI__builtin_ia32_cmppd256_mask:
11040   case X86::BI__builtin_ia32_cmppd512_mask: {
11041     // Lowering vector comparisons to fcmp instructions, while
11042     // ignoring signalling behaviour requested
11043     // ignoring rounding mode requested
    // This is only possible as long as FENV_ACCESS is not implemented.
11045     // See also: https://reviews.llvm.org/D45616
11046 
    // The third argument is the comparison condition, an integer in the
    // range [0, 31].
11049     unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x1f;
11050 
11051     // Lowering to IR fcmp instruction.
11052     // Ignoring requested signaling behaviour,
11053     // e.g. both _CMP_GT_OS & _CMP_GT_OQ are translated to FCMP_OGT.
11054     FCmpInst::Predicate Pred;
11055     switch (CC) {
11056     case 0x00: Pred = FCmpInst::FCMP_OEQ;   break;
11057     case 0x01: Pred = FCmpInst::FCMP_OLT;   break;
11058     case 0x02: Pred = FCmpInst::FCMP_OLE;   break;
11059     case 0x03: Pred = FCmpInst::FCMP_UNO;   break;
11060     case 0x04: Pred = FCmpInst::FCMP_UNE;   break;
11061     case 0x05: Pred = FCmpInst::FCMP_UGE;   break;
11062     case 0x06: Pred = FCmpInst::FCMP_UGT;   break;
11063     case 0x07: Pred = FCmpInst::FCMP_ORD;   break;
11064     case 0x08: Pred = FCmpInst::FCMP_UEQ;   break;
11065     case 0x09: Pred = FCmpInst::FCMP_ULT;   break;
11066     case 0x0a: Pred = FCmpInst::FCMP_ULE;   break;
11067     case 0x0b: Pred = FCmpInst::FCMP_FALSE; break;
11068     case 0x0c: Pred = FCmpInst::FCMP_ONE;   break;
11069     case 0x0d: Pred = FCmpInst::FCMP_OGE;   break;
11070     case 0x0e: Pred = FCmpInst::FCMP_OGT;   break;
11071     case 0x0f: Pred = FCmpInst::FCMP_TRUE;  break;
11072     case 0x10: Pred = FCmpInst::FCMP_OEQ;   break;
11073     case 0x11: Pred = FCmpInst::FCMP_OLT;   break;
11074     case 0x12: Pred = FCmpInst::FCMP_OLE;   break;
11075     case 0x13: Pred = FCmpInst::FCMP_UNO;   break;
11076     case 0x14: Pred = FCmpInst::FCMP_UNE;   break;
11077     case 0x15: Pred = FCmpInst::FCMP_UGE;   break;
11078     case 0x16: Pred = FCmpInst::FCMP_UGT;   break;
11079     case 0x17: Pred = FCmpInst::FCMP_ORD;   break;
11080     case 0x18: Pred = FCmpInst::FCMP_UEQ;   break;
11081     case 0x19: Pred = FCmpInst::FCMP_ULT;   break;
11082     case 0x1a: Pred = FCmpInst::FCMP_ULE;   break;
11083     case 0x1b: Pred = FCmpInst::FCMP_FALSE; break;
11084     case 0x1c: Pred = FCmpInst::FCMP_ONE;   break;
11085     case 0x1d: Pred = FCmpInst::FCMP_OGE;   break;
11086     case 0x1e: Pred = FCmpInst::FCMP_OGT;   break;
11087     case 0x1f: Pred = FCmpInst::FCMP_TRUE;  break;
11088     default: llvm_unreachable("Unhandled CC");
11089     }
11090 
11091     // Builtins without the _mask suffix return a vector of integers
11092     // of the same width as the input vectors
11093     switch (BuiltinID) {
11094     case X86::BI__builtin_ia32_cmpps512_mask:
11095     case X86::BI__builtin_ia32_cmppd512_mask:
11096     case X86::BI__builtin_ia32_cmpps128_mask:
11097     case X86::BI__builtin_ia32_cmpps256_mask:
11098     case X86::BI__builtin_ia32_cmppd128_mask:
11099     case X86::BI__builtin_ia32_cmppd256_mask: {
11100       unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
11101       Value *Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]);
11102       return EmitX86MaskedCompareResult(*this, Cmp, NumElts, Ops[3]);
11103     }
11104     default:
11105       return getVectorFCmpIR(Pred);
11106     }
11107   }
11108 
11109   // SSE scalar comparison intrinsics
11110   case X86::BI__builtin_ia32_cmpeqss:
11111     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 0);
11112   case X86::BI__builtin_ia32_cmpltss:
11113     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 1);
11114   case X86::BI__builtin_ia32_cmpless:
11115     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 2);
11116   case X86::BI__builtin_ia32_cmpunordss:
11117     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 3);
11118   case X86::BI__builtin_ia32_cmpneqss:
11119     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 4);
11120   case X86::BI__builtin_ia32_cmpnltss:
11121     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 5);
11122   case X86::BI__builtin_ia32_cmpnless:
11123     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 6);
11124   case X86::BI__builtin_ia32_cmpordss:
11125     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 7);
11126   case X86::BI__builtin_ia32_cmpeqsd:
11127     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 0);
11128   case X86::BI__builtin_ia32_cmpltsd:
11129     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 1);
11130   case X86::BI__builtin_ia32_cmplesd:
11131     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 2);
11132   case X86::BI__builtin_ia32_cmpunordsd:
11133     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 3);
11134   case X86::BI__builtin_ia32_cmpneqsd:
11135     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 4);
11136   case X86::BI__builtin_ia32_cmpnltsd:
11137     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 5);
11138   case X86::BI__builtin_ia32_cmpnlesd:
11139     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 6);
11140   case X86::BI__builtin_ia32_cmpordsd:
11141     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 7);
11142 
11143   case X86::BI__emul:
11144   case X86::BI__emulu: {
11145     llvm::Type *Int64Ty = llvm::IntegerType::get(getLLVMContext(), 64);
11146     bool isSigned = (BuiltinID == X86::BI__emul);
11147     Value *LHS = Builder.CreateIntCast(Ops[0], Int64Ty, isSigned);
11148     Value *RHS = Builder.CreateIntCast(Ops[1], Int64Ty, isSigned);
11149     return Builder.CreateMul(LHS, RHS, "", !isSigned, isSigned);
11150   }
11151   case X86::BI__mulh:
11152   case X86::BI__umulh:
11153   case X86::BI_mul128:
11154   case X86::BI_umul128: {
11155     llvm::Type *ResType = ConvertType(E->getType());
11156     llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
11157 
11158     bool IsSigned = (BuiltinID == X86::BI__mulh || BuiltinID == X86::BI_mul128);
11159     Value *LHS = Builder.CreateIntCast(Ops[0], Int128Ty, IsSigned);
11160     Value *RHS = Builder.CreateIntCast(Ops[1], Int128Ty, IsSigned);
11161 
11162     Value *MulResult, *HigherBits;
11163     if (IsSigned) {
11164       MulResult = Builder.CreateNSWMul(LHS, RHS);
11165       HigherBits = Builder.CreateAShr(MulResult, 64);
11166     } else {
11167       MulResult = Builder.CreateNUWMul(LHS, RHS);
11168       HigherBits = Builder.CreateLShr(MulResult, 64);
11169     }
11170     HigherBits = Builder.CreateIntCast(HigherBits, ResType, IsSigned);
11171 
11172     if (BuiltinID == X86::BI__mulh || BuiltinID == X86::BI__umulh)
11173       return HigherBits;
11174 
11175     Address HighBitsAddress = EmitPointerWithAlignment(E->getArg(2));
11176     Builder.CreateStore(HigherBits, HighBitsAddress);
11177     return Builder.CreateIntCast(MulResult, ResType, IsSigned);
11178   }
11179 
11180   case X86::BI__faststorefence: {
11181     return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
11182                                llvm::SyncScope::System);
11183   }
11184   case X86::BI__shiftleft128:
11185   case X86::BI__shiftright128: {
11186     // FIXME: Once fshl/fshr no longer add an unneeded and and cmov, do this:
11187     // llvm::Function *F = CGM.getIntrinsic(
11188     //   BuiltinID == X86::BI__shiftleft128 ? Intrinsic::fshl : Intrinsic::fshr,
11189     //   Int64Ty);
11190     // Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
11191     // return Builder.CreateCall(F, Ops);
11192     llvm::Type *Int128Ty = Builder.getInt128Ty();
11193     Value *Val = Builder.CreateOr(
11194         Builder.CreateShl(Builder.CreateZExt(Ops[1], Int128Ty), 64),
11195         Builder.CreateZExt(Ops[0], Int128Ty));
11196     Value *Amt = Builder.CreateAnd(Builder.CreateZExt(Ops[2], Int128Ty),
11197                                    llvm::ConstantInt::get(Int128Ty, 0x3f));
11198     Value *Res;
11199     if (BuiltinID == X86::BI__shiftleft128)
11200       Res = Builder.CreateLShr(Builder.CreateShl(Val, Amt), 64);
11201     else
11202       Res = Builder.CreateLShr(Val, Amt);
11203     return Builder.CreateTrunc(Res, Int64Ty);
11204   }
11205   case X86::BI_ReadWriteBarrier:
11206   case X86::BI_ReadBarrier:
11207   case X86::BI_WriteBarrier: {
11208     return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
11209                                llvm::SyncScope::SingleThread);
11210   }
11211   case X86::BI_BitScanForward:
11212   case X86::BI_BitScanForward64:
11213     return EmitMSVCBuiltinExpr(MSVCIntrin::_BitScanForward, E);
11214   case X86::BI_BitScanReverse:
11215   case X86::BI_BitScanReverse64:
11216     return EmitMSVCBuiltinExpr(MSVCIntrin::_BitScanReverse, E);
11217 
11218   case X86::BI_InterlockedAnd64:
11219     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedAnd, E);
11220   case X86::BI_InterlockedExchange64:
11221     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchange, E);
11222   case X86::BI_InterlockedExchangeAdd64:
11223     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeAdd, E);
11224   case X86::BI_InterlockedExchangeSub64:
11225     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeSub, E);
11226   case X86::BI_InterlockedOr64:
11227     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedOr, E);
11228   case X86::BI_InterlockedXor64:
11229     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedXor, E);
11230   case X86::BI_InterlockedDecrement64:
11231     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedDecrement, E);
11232   case X86::BI_InterlockedIncrement64:
11233     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedIncrement, E);
11234   case X86::BI_InterlockedCompareExchange128: {
11235     // InterlockedCompareExchange128 doesn't directly refer to 128bit ints,
11236     // instead it takes pointers to 64bit ints for Destination and
11237     // ComparandResult, and exchange is taken as two 64bit ints (high & low).
11238     // The previous value is written to ComparandResult, and success is
11239     // returned.
11240 
11241     llvm::Type *Int128Ty = Builder.getInt128Ty();
11242     llvm::Type *Int128PtrTy = Int128Ty->getPointerTo();
11243 
11244     Value *Destination =
11245         Builder.CreateBitCast(Ops[0], Int128PtrTy);
11246     Value *ExchangeHigh128 = Builder.CreateZExt(Ops[1], Int128Ty);
11247     Value *ExchangeLow128 = Builder.CreateZExt(Ops[2], Int128Ty);
11248     Address ComparandResult(Builder.CreateBitCast(Ops[3], Int128PtrTy),
11249                             getContext().toCharUnitsFromBits(128));
11250 
11251     Value *Exchange = Builder.CreateOr(
11252         Builder.CreateShl(ExchangeHigh128, 64, "", false, false),
11253         ExchangeLow128);
11254 
11255     Value *Comparand = Builder.CreateLoad(ComparandResult);
11256 
11257     AtomicCmpXchgInst *CXI =
11258         Builder.CreateAtomicCmpXchg(Destination, Comparand, Exchange,
11259                                     AtomicOrdering::SequentiallyConsistent,
11260                                     AtomicOrdering::SequentiallyConsistent);
11261     CXI->setVolatile(true);
11262 
11263     // Write the result back to the inout pointer.
11264     Builder.CreateStore(Builder.CreateExtractValue(CXI, 0), ComparandResult);
11265 
11266     // Get the success boolean and zero extend it to i8.
11267     Value *Success = Builder.CreateExtractValue(CXI, 1);
11268     return Builder.CreateZExt(Success, ConvertType(E->getType()));
11269   }
11270 
11271   case X86::BI_AddressOfReturnAddress: {
11272     Value *F = CGM.getIntrinsic(Intrinsic::addressofreturnaddress);
11273     return Builder.CreateCall(F);
11274   }
11275   case X86::BI__stosb: {
11276     // We treat __stosb as a volatile memset - it may not generate "rep stosb"
11277     // instruction, but it will create a memset that won't be optimized away.
11278     return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], 1, true);
11279   }
11280   case X86::BI__ud2:
11281     // llvm.trap makes a ud2a instruction on x86.
11282     return EmitTrapCall(Intrinsic::trap);
11283   case X86::BI__int2c: {
11284     // This syscall signals a driver assertion failure in x86 NT kernels.
11285     llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, false);
11286     llvm::InlineAsm *IA =
11287         llvm::InlineAsm::get(FTy, "int $$0x2c", "", /*SideEffects=*/true);
11288     llvm::AttributeList NoReturnAttr = llvm::AttributeList::get(
11289         getLLVMContext(), llvm::AttributeList::FunctionIndex,
11290         llvm::Attribute::NoReturn);
11291     CallSite CS = Builder.CreateCall(IA);
11292     CS.setAttributes(NoReturnAttr);
11293     return CS.getInstruction();
11294   }
11295   case X86::BI__readfsbyte:
11296   case X86::BI__readfsword:
11297   case X86::BI__readfsdword:
11298   case X86::BI__readfsqword: {
11299     llvm::Type *IntTy = ConvertType(E->getType());
11300     Value *Ptr =
11301         Builder.CreateIntToPtr(Ops[0], llvm::PointerType::get(IntTy, 257));
11302     LoadInst *Load = Builder.CreateAlignedLoad(
11303         IntTy, Ptr, getContext().getTypeAlignInChars(E->getType()));
11304     Load->setVolatile(true);
11305     return Load;
11306   }
11307   case X86::BI__readgsbyte:
11308   case X86::BI__readgsword:
11309   case X86::BI__readgsdword:
11310   case X86::BI__readgsqword: {
11311     llvm::Type *IntTy = ConvertType(E->getType());
11312     Value *Ptr =
11313         Builder.CreateIntToPtr(Ops[0], llvm::PointerType::get(IntTy, 256));
11314     LoadInst *Load = Builder.CreateAlignedLoad(
11315         IntTy, Ptr, getContext().getTypeAlignInChars(E->getType()));
11316     Load->setVolatile(true);
11317     return Load;
11318   }
11319   case X86::BI__builtin_ia32_paddusb512:
11320   case X86::BI__builtin_ia32_paddusw512:
11321   case X86::BI__builtin_ia32_paddusb256:
11322   case X86::BI__builtin_ia32_paddusw256:
11323   case X86::BI__builtin_ia32_paddusb128:
11324   case X86::BI__builtin_ia32_paddusw128:
11325     return EmitX86AddSubSatExpr(*this, E, Ops, true /* IsAddition */);
11326   case X86::BI__builtin_ia32_psubusb512:
11327   case X86::BI__builtin_ia32_psubusw512:
11328   case X86::BI__builtin_ia32_psubusb256:
11329   case X86::BI__builtin_ia32_psubusw256:
11330   case X86::BI__builtin_ia32_psubusb128:
11331   case X86::BI__builtin_ia32_psubusw128:
11332     return EmitX86AddSubSatExpr(*this, E, Ops, false /* IsAddition */);
11333   }
11334 }
11335 
11336 Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
11337                                            const CallExpr *E) {
11338   SmallVector<Value*, 4> Ops;
11339 
11340   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++)
11341     Ops.push_back(EmitScalarExpr(E->getArg(i)));
11342 
11343   Intrinsic::ID ID = Intrinsic::not_intrinsic;
11344 
11345   switch (BuiltinID) {
11346   default: return nullptr;
11347 
11348   // __builtin_ppc_get_timebase is GCC 4.8+'s PowerPC-specific name for what we
11349   // call __builtin_readcyclecounter.
11350   case PPC::BI__builtin_ppc_get_timebase:
11351     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::readcyclecounter));
11352 
11353   // vec_ld, vec_xl_be, vec_lvsl, vec_lvsr
11354   case PPC::BI__builtin_altivec_lvx:
11355   case PPC::BI__builtin_altivec_lvxl:
11356   case PPC::BI__builtin_altivec_lvebx:
11357   case PPC::BI__builtin_altivec_lvehx:
11358   case PPC::BI__builtin_altivec_lvewx:
11359   case PPC::BI__builtin_altivec_lvsl:
11360   case PPC::BI__builtin_altivec_lvsr:
11361   case PPC::BI__builtin_vsx_lxvd2x:
11362   case PPC::BI__builtin_vsx_lxvw4x:
11363   case PPC::BI__builtin_vsx_lxvd2x_be:
11364   case PPC::BI__builtin_vsx_lxvw4x_be:
11365   case PPC::BI__builtin_vsx_lxvl:
11366   case PPC::BI__builtin_vsx_lxvll:
11367   {
11368     if(BuiltinID == PPC::BI__builtin_vsx_lxvl ||
11369        BuiltinID == PPC::BI__builtin_vsx_lxvll){
11370       Ops[0] = Builder.CreateBitCast(Ops[0], Int8PtrTy);
11371     }else {
11372       Ops[1] = Builder.CreateBitCast(Ops[1], Int8PtrTy);
11373       Ops[0] = Builder.CreateGEP(Ops[1], Ops[0]);
11374       Ops.pop_back();
11375     }
11376 
11377     switch (BuiltinID) {
11378     default: llvm_unreachable("Unsupported ld/lvsl/lvsr intrinsic!");
11379     case PPC::BI__builtin_altivec_lvx:
11380       ID = Intrinsic::ppc_altivec_lvx;
11381       break;
11382     case PPC::BI__builtin_altivec_lvxl:
11383       ID = Intrinsic::ppc_altivec_lvxl;
11384       break;
11385     case PPC::BI__builtin_altivec_lvebx:
11386       ID = Intrinsic::ppc_altivec_lvebx;
11387       break;
11388     case PPC::BI__builtin_altivec_lvehx:
11389       ID = Intrinsic::ppc_altivec_lvehx;
11390       break;
11391     case PPC::BI__builtin_altivec_lvewx:
11392       ID = Intrinsic::ppc_altivec_lvewx;
11393       break;
11394     case PPC::BI__builtin_altivec_lvsl:
11395       ID = Intrinsic::ppc_altivec_lvsl;
11396       break;
11397     case PPC::BI__builtin_altivec_lvsr:
11398       ID = Intrinsic::ppc_altivec_lvsr;
11399       break;
11400     case PPC::BI__builtin_vsx_lxvd2x:
11401       ID = Intrinsic::ppc_vsx_lxvd2x;
11402       break;
11403     case PPC::BI__builtin_vsx_lxvw4x:
11404       ID = Intrinsic::ppc_vsx_lxvw4x;
11405       break;
11406     case PPC::BI__builtin_vsx_lxvd2x_be:
11407       ID = Intrinsic::ppc_vsx_lxvd2x_be;
11408       break;
11409     case PPC::BI__builtin_vsx_lxvw4x_be:
11410       ID = Intrinsic::ppc_vsx_lxvw4x_be;
11411       break;
11412     case PPC::BI__builtin_vsx_lxvl:
11413       ID = Intrinsic::ppc_vsx_lxvl;
11414       break;
11415     case PPC::BI__builtin_vsx_lxvll:
11416       ID = Intrinsic::ppc_vsx_lxvll;
11417       break;
11418     }
11419     llvm::Function *F = CGM.getIntrinsic(ID);
11420     return Builder.CreateCall(F, Ops, "");
11421   }
11422 
11423   // vec_st, vec_xst_be
11424   case PPC::BI__builtin_altivec_stvx:
11425   case PPC::BI__builtin_altivec_stvxl:
11426   case PPC::BI__builtin_altivec_stvebx:
11427   case PPC::BI__builtin_altivec_stvehx:
11428   case PPC::BI__builtin_altivec_stvewx:
11429   case PPC::BI__builtin_vsx_stxvd2x:
11430   case PPC::BI__builtin_vsx_stxvw4x:
11431   case PPC::BI__builtin_vsx_stxvd2x_be:
11432   case PPC::BI__builtin_vsx_stxvw4x_be:
11433   case PPC::BI__builtin_vsx_stxvl:
11434   case PPC::BI__builtin_vsx_stxvll:
11435   {
11436     if(BuiltinID == PPC::BI__builtin_vsx_stxvl ||
11437       BuiltinID == PPC::BI__builtin_vsx_stxvll ){
11438       Ops[1] = Builder.CreateBitCast(Ops[1], Int8PtrTy);
11439     }else {
11440       Ops[2] = Builder.CreateBitCast(Ops[2], Int8PtrTy);
11441       Ops[1] = Builder.CreateGEP(Ops[2], Ops[1]);
11442       Ops.pop_back();
11443     }
11444 
11445     switch (BuiltinID) {
11446     default: llvm_unreachable("Unsupported st intrinsic!");
11447     case PPC::BI__builtin_altivec_stvx:
11448       ID = Intrinsic::ppc_altivec_stvx;
11449       break;
11450     case PPC::BI__builtin_altivec_stvxl:
11451       ID = Intrinsic::ppc_altivec_stvxl;
11452       break;
11453     case PPC::BI__builtin_altivec_stvebx:
11454       ID = Intrinsic::ppc_altivec_stvebx;
11455       break;
11456     case PPC::BI__builtin_altivec_stvehx:
11457       ID = Intrinsic::ppc_altivec_stvehx;
11458       break;
11459     case PPC::BI__builtin_altivec_stvewx:
11460       ID = Intrinsic::ppc_altivec_stvewx;
11461       break;
11462     case PPC::BI__builtin_vsx_stxvd2x:
11463       ID = Intrinsic::ppc_vsx_stxvd2x;
11464       break;
11465     case PPC::BI__builtin_vsx_stxvw4x:
11466       ID = Intrinsic::ppc_vsx_stxvw4x;
11467       break;
11468     case PPC::BI__builtin_vsx_stxvd2x_be:
11469       ID = Intrinsic::ppc_vsx_stxvd2x_be;
11470       break;
11471     case PPC::BI__builtin_vsx_stxvw4x_be:
11472       ID = Intrinsic::ppc_vsx_stxvw4x_be;
11473       break;
11474     case PPC::BI__builtin_vsx_stxvl:
11475       ID = Intrinsic::ppc_vsx_stxvl;
11476       break;
11477     case PPC::BI__builtin_vsx_stxvll:
11478       ID = Intrinsic::ppc_vsx_stxvll;
11479       break;
11480     }
11481     llvm::Function *F = CGM.getIntrinsic(ID);
11482     return Builder.CreateCall(F, Ops, "");
11483   }
11484   // Square root
11485   case PPC::BI__builtin_vsx_xvsqrtsp:
11486   case PPC::BI__builtin_vsx_xvsqrtdp: {
11487     llvm::Type *ResultType = ConvertType(E->getType());
11488     Value *X = EmitScalarExpr(E->getArg(0));
11489     ID = Intrinsic::sqrt;
11490     llvm::Function *F = CGM.getIntrinsic(ID, ResultType);
11491     return Builder.CreateCall(F, X);
11492   }
11493   // Count leading zeros
11494   case PPC::BI__builtin_altivec_vclzb:
11495   case PPC::BI__builtin_altivec_vclzh:
11496   case PPC::BI__builtin_altivec_vclzw:
11497   case PPC::BI__builtin_altivec_vclzd: {
11498     llvm::Type *ResultType = ConvertType(E->getType());
11499     Value *X = EmitScalarExpr(E->getArg(0));
11500     Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
11501     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ResultType);
11502     return Builder.CreateCall(F, {X, Undef});
11503   }
11504   case PPC::BI__builtin_altivec_vctzb:
11505   case PPC::BI__builtin_altivec_vctzh:
11506   case PPC::BI__builtin_altivec_vctzw:
11507   case PPC::BI__builtin_altivec_vctzd: {
11508     llvm::Type *ResultType = ConvertType(E->getType());
11509     Value *X = EmitScalarExpr(E->getArg(0));
11510     Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
11511     Function *F = CGM.getIntrinsic(Intrinsic::cttz, ResultType);
11512     return Builder.CreateCall(F, {X, Undef});
11513   }
11514   case PPC::BI__builtin_altivec_vpopcntb:
11515   case PPC::BI__builtin_altivec_vpopcnth:
11516   case PPC::BI__builtin_altivec_vpopcntw:
11517   case PPC::BI__builtin_altivec_vpopcntd: {
11518     llvm::Type *ResultType = ConvertType(E->getType());
11519     Value *X = EmitScalarExpr(E->getArg(0));
11520     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ResultType);
11521     return Builder.CreateCall(F, X);
11522   }
11523   // Copy sign
11524   case PPC::BI__builtin_vsx_xvcpsgnsp:
11525   case PPC::BI__builtin_vsx_xvcpsgndp: {
11526     llvm::Type *ResultType = ConvertType(E->getType());
11527     Value *X = EmitScalarExpr(E->getArg(0));
11528     Value *Y = EmitScalarExpr(E->getArg(1));
11529     ID = Intrinsic::copysign;
11530     llvm::Function *F = CGM.getIntrinsic(ID, ResultType);
11531     return Builder.CreateCall(F, {X, Y});
11532   }
11533   // Rounding/truncation
11534   case PPC::BI__builtin_vsx_xvrspip:
11535   case PPC::BI__builtin_vsx_xvrdpip:
11536   case PPC::BI__builtin_vsx_xvrdpim:
11537   case PPC::BI__builtin_vsx_xvrspim:
11538   case PPC::BI__builtin_vsx_xvrdpi:
11539   case PPC::BI__builtin_vsx_xvrspi:
11540   case PPC::BI__builtin_vsx_xvrdpic:
11541   case PPC::BI__builtin_vsx_xvrspic:
11542   case PPC::BI__builtin_vsx_xvrdpiz:
11543   case PPC::BI__builtin_vsx_xvrspiz: {
11544     llvm::Type *ResultType = ConvertType(E->getType());
11545     Value *X = EmitScalarExpr(E->getArg(0));
11546     if (BuiltinID == PPC::BI__builtin_vsx_xvrdpim ||
11547         BuiltinID == PPC::BI__builtin_vsx_xvrspim)
11548       ID = Intrinsic::floor;
11549     else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpi ||
11550              BuiltinID == PPC::BI__builtin_vsx_xvrspi)
11551       ID = Intrinsic::round;
11552     else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpic ||
11553              BuiltinID == PPC::BI__builtin_vsx_xvrspic)
11554       ID = Intrinsic::nearbyint;
11555     else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpip ||
11556              BuiltinID == PPC::BI__builtin_vsx_xvrspip)
11557       ID = Intrinsic::ceil;
11558     else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpiz ||
11559              BuiltinID == PPC::BI__builtin_vsx_xvrspiz)
11560       ID = Intrinsic::trunc;
11561     llvm::Function *F = CGM.getIntrinsic(ID, ResultType);
11562     return Builder.CreateCall(F, X);
11563   }
11564 
11565   // Absolute value
11566   case PPC::BI__builtin_vsx_xvabsdp:
11567   case PPC::BI__builtin_vsx_xvabssp: {
11568     llvm::Type *ResultType = ConvertType(E->getType());
11569     Value *X = EmitScalarExpr(E->getArg(0));
11570     llvm::Function *F = CGM.getIntrinsic(Intrinsic::fabs, ResultType);
11571     return Builder.CreateCall(F, X);
11572   }
11573 
11574   // FMA variations
11575   case PPC::BI__builtin_vsx_xvmaddadp:
11576   case PPC::BI__builtin_vsx_xvmaddasp:
11577   case PPC::BI__builtin_vsx_xvnmaddadp:
11578   case PPC::BI__builtin_vsx_xvnmaddasp:
11579   case PPC::BI__builtin_vsx_xvmsubadp:
11580   case PPC::BI__builtin_vsx_xvmsubasp:
11581   case PPC::BI__builtin_vsx_xvnmsubadp:
11582   case PPC::BI__builtin_vsx_xvnmsubasp: {
11583     llvm::Type *ResultType = ConvertType(E->getType());
11584     Value *X = EmitScalarExpr(E->getArg(0));
11585     Value *Y = EmitScalarExpr(E->getArg(1));
11586     Value *Z = EmitScalarExpr(E->getArg(2));
11587     Value *Zero = llvm::ConstantFP::getZeroValueForNegation(ResultType);
11588     llvm::Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
11589     switch (BuiltinID) {
11590       case PPC::BI__builtin_vsx_xvmaddadp:
11591       case PPC::BI__builtin_vsx_xvmaddasp:
11592         return Builder.CreateCall(F, {X, Y, Z});
11593       case PPC::BI__builtin_vsx_xvnmaddadp:
11594       case PPC::BI__builtin_vsx_xvnmaddasp:
11595         return Builder.CreateFSub(Zero,
11596                                   Builder.CreateCall(F, {X, Y, Z}), "sub");
11597       case PPC::BI__builtin_vsx_xvmsubadp:
11598       case PPC::BI__builtin_vsx_xvmsubasp:
11599         return Builder.CreateCall(F,
11600                                   {X, Y, Builder.CreateFSub(Zero, Z, "sub")});
11601       case PPC::BI__builtin_vsx_xvnmsubadp:
11602       case PPC::BI__builtin_vsx_xvnmsubasp:
11603         Value *FsubRes =
11604           Builder.CreateCall(F, {X, Y, Builder.CreateFSub(Zero, Z, "sub")});
11605         return Builder.CreateFSub(Zero, FsubRes, "sub");
11606     }
11607     llvm_unreachable("Unknown FMA operation");
11608     return nullptr; // Suppress no-return warning
11609   }
11610 
11611   case PPC::BI__builtin_vsx_insertword: {
11612     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_vsx_xxinsertw);
11613 
11614     // Third argument is a compile time constant int. It must be clamped to
11615     // to the range [0, 12].
11616     ConstantInt *ArgCI = dyn_cast<ConstantInt>(Ops[2]);
11617     assert(ArgCI &&
11618            "Third arg to xxinsertw intrinsic must be constant integer");
11619     const int64_t MaxIndex = 12;
11620     int64_t Index = clamp(ArgCI->getSExtValue(), 0, MaxIndex);
11621 
11622     // The builtin semantics don't exactly match the xxinsertw instructions
11623     // semantics (which ppc_vsx_xxinsertw follows). The builtin extracts the
11624     // word from the first argument, and inserts it in the second argument. The
11625     // instruction extracts the word from its second input register and inserts
11626     // it into its first input register, so swap the first and second arguments.
11627     std::swap(Ops[0], Ops[1]);
11628 
11629     // Need to cast the second argument from a vector of unsigned int to a
11630     // vector of long long.
11631     Ops[1] = Builder.CreateBitCast(Ops[1], llvm::VectorType::get(Int64Ty, 2));
11632 
11633     if (getTarget().isLittleEndian()) {
11634       // Create a shuffle mask of (1, 0)
11635       Constant *ShuffleElts[2] = { ConstantInt::get(Int32Ty, 1),
11636                                    ConstantInt::get(Int32Ty, 0)
11637                                  };
11638       Constant *ShuffleMask = llvm::ConstantVector::get(ShuffleElts);
11639 
11640       // Reverse the double words in the vector we will extract from.
11641       Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int64Ty, 2));
11642       Ops[0] = Builder.CreateShuffleVector(Ops[0], Ops[0], ShuffleMask);
11643 
11644       // Reverse the index.
11645       Index = MaxIndex - Index;
11646     }
11647 
11648     // Intrinsic expects the first arg to be a vector of int.
11649     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int32Ty, 4));
11650     Ops[2] = ConstantInt::getSigned(Int32Ty, Index);
11651     return Builder.CreateCall(F, Ops);
11652   }
11653 
11654   case PPC::BI__builtin_vsx_extractuword: {
11655     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_vsx_xxextractuw);
11656 
11657     // Intrinsic expects the first argument to be a vector of doublewords.
11658     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int64Ty, 2));
11659 
11660     // The second argument is a compile time constant int that needs to
11661     // be clamped to the range [0, 12].
11662     ConstantInt *ArgCI = dyn_cast<ConstantInt>(Ops[1]);
11663     assert(ArgCI &&
11664            "Second Arg to xxextractuw intrinsic must be a constant integer!");
11665     const int64_t MaxIndex = 12;
11666     int64_t Index = clamp(ArgCI->getSExtValue(), 0, MaxIndex);
11667 
11668     if (getTarget().isLittleEndian()) {
11669       // Reverse the index.
11670       Index = MaxIndex - Index;
11671       Ops[1] = ConstantInt::getSigned(Int32Ty, Index);
11672 
11673       // Emit the call, then reverse the double words of the results vector.
11674       Value *Call = Builder.CreateCall(F, Ops);
11675 
11676       // Create a shuffle mask of (1, 0)
11677       Constant *ShuffleElts[2] = { ConstantInt::get(Int32Ty, 1),
11678                                    ConstantInt::get(Int32Ty, 0)
11679                                  };
11680       Constant *ShuffleMask = llvm::ConstantVector::get(ShuffleElts);
11681 
11682       Value *ShuffleCall = Builder.CreateShuffleVector(Call, Call, ShuffleMask);
11683       return ShuffleCall;
11684     } else {
11685       Ops[1] = ConstantInt::getSigned(Int32Ty, Index);
11686       return Builder.CreateCall(F, Ops);
11687     }
11688   }
11689 
11690   case PPC::BI__builtin_vsx_xxpermdi: {
11691     ConstantInt *ArgCI = dyn_cast<ConstantInt>(Ops[2]);
11692     assert(ArgCI && "Third arg must be constant integer!");
11693 
11694     unsigned Index = ArgCI->getZExtValue();
11695     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int64Ty, 2));
11696     Ops[1] = Builder.CreateBitCast(Ops[1], llvm::VectorType::get(Int64Ty, 2));
11697 
11698     // Account for endianness by treating this as just a shuffle. So we use the
11699     // same indices for both LE and BE in order to produce expected results in
11700     // both cases.
11701     unsigned ElemIdx0 = (Index & 2) >> 1;
11702     unsigned ElemIdx1 = 2 + (Index & 1);
11703 
11704     Constant *ShuffleElts[2] = {ConstantInt::get(Int32Ty, ElemIdx0),
11705                                 ConstantInt::get(Int32Ty, ElemIdx1)};
11706     Constant *ShuffleMask = llvm::ConstantVector::get(ShuffleElts);
11707 
11708     Value *ShuffleCall =
11709         Builder.CreateShuffleVector(Ops[0], Ops[1], ShuffleMask);
11710     QualType BIRetType = E->getType();
11711     auto RetTy = ConvertType(BIRetType);
11712     return Builder.CreateBitCast(ShuffleCall, RetTy);
11713   }
11714 
11715   case PPC::BI__builtin_vsx_xxsldwi: {
11716     ConstantInt *ArgCI = dyn_cast<ConstantInt>(Ops[2]);
11717     assert(ArgCI && "Third argument must be a compile time constant");
11718     unsigned Index = ArgCI->getZExtValue() & 0x3;
11719     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int32Ty, 4));
11720     Ops[1] = Builder.CreateBitCast(Ops[1], llvm::VectorType::get(Int32Ty, 4));
11721 
11722     // Create a shuffle mask
11723     unsigned ElemIdx0;
11724     unsigned ElemIdx1;
11725     unsigned ElemIdx2;
11726     unsigned ElemIdx3;
11727     if (getTarget().isLittleEndian()) {
11728       // Little endian element N comes from element 8+N-Index of the
11729       // concatenated wide vector (of course, using modulo arithmetic on
11730       // the total number of elements).
11731       ElemIdx0 = (8 - Index) % 8;
11732       ElemIdx1 = (9 - Index) % 8;
11733       ElemIdx2 = (10 - Index) % 8;
11734       ElemIdx3 = (11 - Index) % 8;
11735     } else {
11736       // Big endian ElemIdx<N> = Index + N
11737       ElemIdx0 = Index;
11738       ElemIdx1 = Index + 1;
11739       ElemIdx2 = Index + 2;
11740       ElemIdx3 = Index + 3;
11741     }
11742 
11743     Constant *ShuffleElts[4] = {ConstantInt::get(Int32Ty, ElemIdx0),
11744                                 ConstantInt::get(Int32Ty, ElemIdx1),
11745                                 ConstantInt::get(Int32Ty, ElemIdx2),
11746                                 ConstantInt::get(Int32Ty, ElemIdx3)};
11747 
11748     Constant *ShuffleMask = llvm::ConstantVector::get(ShuffleElts);
11749     Value *ShuffleCall =
11750         Builder.CreateShuffleVector(Ops[0], Ops[1], ShuffleMask);
11751     QualType BIRetType = E->getType();
11752     auto RetTy = ConvertType(BIRetType);
11753     return Builder.CreateBitCast(ShuffleCall, RetTy);
11754   }
11755 
11756   case PPC::BI__builtin_pack_vector_int128: {
11757     bool isLittleEndian = getTarget().isLittleEndian();
11758     Value *UndefValue =
11759         llvm::UndefValue::get(llvm::VectorType::get(Ops[0]->getType(), 2));
11760     Value *Res = Builder.CreateInsertElement(
11761         UndefValue, Ops[0], (uint64_t)(isLittleEndian ? 1 : 0));
11762     Res = Builder.CreateInsertElement(Res, Ops[1],
11763                                       (uint64_t)(isLittleEndian ? 0 : 1));
11764     return Builder.CreateBitCast(Res, ConvertType(E->getType()));
11765   }
11766 
11767   case PPC::BI__builtin_unpack_vector_int128: {
11768     ConstantInt *Index = cast<ConstantInt>(Ops[1]);
11769     Value *Unpacked = Builder.CreateBitCast(
11770         Ops[0], llvm::VectorType::get(ConvertType(E->getType()), 2));
11771 
11772     if (getTarget().isLittleEndian())
11773       Index = ConstantInt::get(Index->getType(), 1 - Index->getZExtValue());
11774 
11775     return Builder.CreateExtractElement(Unpacked, Index);
11776   }
11777   }
11778 }
11779 
/// Emit LLVM IR for an AMDGPU target builtin (amdgcn_* and r600_*).
///
/// \param BuiltinID the frontend builtin being lowered.
/// \param E the call expression supplying the builtin's arguments.
/// \returns the IR value of the lowered call, or nullptr for builtins
/// that have no special handling here.
///
/// Most cases are a direct 1:1 mapping onto a target intrinsic via the
/// emitUnary/Binary/Ternary/FPInt helpers; the hand-written cases below
/// need argument or result massaging that the helpers cannot express.
Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
                                              const CallExpr *E) {
  switch (BuiltinID) {
  case AMDGPU::BI__builtin_amdgcn_div_scale:
  case AMDGPU::BI__builtin_amdgcn_div_scalef: {
    // Translate from the intrinsics's struct return to the builtin's out
    // argument.

    Address FlagOutPtr = EmitPointerWithAlignment(E->getArg(3));

    llvm::Value *X = EmitScalarExpr(E->getArg(0));
    llvm::Value *Y = EmitScalarExpr(E->getArg(1));
    llvm::Value *Z = EmitScalarExpr(E->getArg(2));

    llvm::Value *Callee = CGM.getIntrinsic(Intrinsic::amdgcn_div_scale,
                                           X->getType());

    // The intrinsic returns a {result, flag} aggregate; split it apart.
    llvm::Value *Tmp = Builder.CreateCall(Callee, {X, Y, Z});

    llvm::Value *Result = Builder.CreateExtractValue(Tmp, 0);
    llvm::Value *Flag = Builder.CreateExtractValue(Tmp, 1);

    // Widen the i1 flag to whatever integer type the out-pointer
    // actually points at before storing it.
    llvm::Type *RealFlagType
      = FlagOutPtr.getPointer()->getType()->getPointerElementType();

    llvm::Value *FlagExt = Builder.CreateZExt(Flag, RealFlagType);
    Builder.CreateStore(FlagExt, FlagOutPtr);
    return Result;
  }
  case AMDGPU::BI__builtin_amdgcn_div_fmas:
  case AMDGPU::BI__builtin_amdgcn_div_fmasf: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
    llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));

    // The intrinsic takes an i1 as its last operand, so lower the int
    // argument of the builtin with an (arg != 0) comparison.
    llvm::Value *F = CGM.getIntrinsic(Intrinsic::amdgcn_div_fmas,
                                      Src0->getType());
    llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Src3);
    return Builder.CreateCall(F, {Src0, Src1, Src2, Src3ToBool});
  }

  case AMDGPU::BI__builtin_amdgcn_ds_swizzle:
    return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_ds_swizzle);
  case AMDGPU::BI__builtin_amdgcn_mov_dpp:
  case AMDGPU::BI__builtin_amdgcn_update_dpp: {
    llvm::SmallVector<llvm::Value *, 6> Args;
    for (unsigned I = 0; I != E->getNumArgs(); ++I)
      Args.push_back(EmitScalarExpr(E->getArg(I)));
    // mov_dpp takes 5 arguments (it has no "old value" operand), while
    // update_dpp takes 6.  Both lower to amdgcn.update.dpp, so give
    // mov_dpp an undef old-value operand at the front.
    assert(Args.size() == 5 || Args.size() == 6);
    if (Args.size() == 5)
      Args.insert(Args.begin(), llvm::UndefValue::get(Args[0]->getType()));
    Value *F =
        CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
    return Builder.CreateCall(F, Args);
  }
  case AMDGPU::BI__builtin_amdgcn_div_fixup:
  case AMDGPU::BI__builtin_amdgcn_div_fixupf:
  case AMDGPU::BI__builtin_amdgcn_div_fixuph:
    return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_div_fixup);
  case AMDGPU::BI__builtin_amdgcn_trig_preop:
  case AMDGPU::BI__builtin_amdgcn_trig_preopf:
    return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_trig_preop);
  case AMDGPU::BI__builtin_amdgcn_rcp:
  case AMDGPU::BI__builtin_amdgcn_rcpf:
  case AMDGPU::BI__builtin_amdgcn_rcph:
    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rcp);
  case AMDGPU::BI__builtin_amdgcn_rsq:
  case AMDGPU::BI__builtin_amdgcn_rsqf:
  case AMDGPU::BI__builtin_amdgcn_rsqh:
    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rsq);
  case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
  case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rsq_clamp);
  case AMDGPU::BI__builtin_amdgcn_sinf:
  case AMDGPU::BI__builtin_amdgcn_sinh:
    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_sin);
  case AMDGPU::BI__builtin_amdgcn_cosf:
  case AMDGPU::BI__builtin_amdgcn_cosh:
    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_cos);
  case AMDGPU::BI__builtin_amdgcn_log_clampf:
    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log_clamp);
  case AMDGPU::BI__builtin_amdgcn_ldexp:
  case AMDGPU::BI__builtin_amdgcn_ldexpf:
  case AMDGPU::BI__builtin_amdgcn_ldexph:
    return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_ldexp);
  case AMDGPU::BI__builtin_amdgcn_frexp_mant:
  case AMDGPU::BI__builtin_amdgcn_frexp_mantf:
  case AMDGPU::BI__builtin_amdgcn_frexp_manth:
    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_frexp_mant);
  case AMDGPU::BI__builtin_amdgcn_frexp_exp:
  case AMDGPU::BI__builtin_amdgcn_frexp_expf: {
    // frexp_exp's result type depends on the overload: i32 for the
    // f32/f64 forms, i16 for the f16 form below.
    Value *Src0 = EmitScalarExpr(E->getArg(0));
    Value *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
                                { Builder.getInt32Ty(), Src0->getType() });
    return Builder.CreateCall(F, Src0);
  }
  case AMDGPU::BI__builtin_amdgcn_frexp_exph: {
    Value *Src0 = EmitScalarExpr(E->getArg(0));
    Value *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
                                { Builder.getInt16Ty(), Src0->getType() });
    return Builder.CreateCall(F, Src0);
  }
  case AMDGPU::BI__builtin_amdgcn_fract:
  case AMDGPU::BI__builtin_amdgcn_fractf:
  case AMDGPU::BI__builtin_amdgcn_fracth:
    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_fract);
  case AMDGPU::BI__builtin_amdgcn_lerp:
    return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_lerp);
  case AMDGPU::BI__builtin_amdgcn_uicmp:
  case AMDGPU::BI__builtin_amdgcn_uicmpl:
  case AMDGPU::BI__builtin_amdgcn_sicmp:
  case AMDGPU::BI__builtin_amdgcn_sicmpl:
    return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_icmp);
  case AMDGPU::BI__builtin_amdgcn_fcmp:
  case AMDGPU::BI__builtin_amdgcn_fcmpf:
    return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_fcmp);
  case AMDGPU::BI__builtin_amdgcn_class:
  case AMDGPU::BI__builtin_amdgcn_classf:
  case AMDGPU::BI__builtin_amdgcn_classh:
    return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_class);
  case AMDGPU::BI__builtin_amdgcn_fmed3f:
  case AMDGPU::BI__builtin_amdgcn_fmed3h:
    return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_fmed3);
  case AMDGPU::BI__builtin_amdgcn_read_exec: {
    // Reading the exec mask is emitted as a special-register read and
    // marked convergent (the value depends on which lanes are active).
    CallInst *CI = cast<CallInst>(
      EmitSpecialRegisterBuiltin(*this, E, Int64Ty, Int64Ty, true, "exec"));
    CI->setConvergent();
    return CI;
  }
  case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
  case AMDGPU::BI__builtin_amdgcn_read_exec_hi: {
    StringRef RegName = BuiltinID == AMDGPU::BI__builtin_amdgcn_read_exec_lo ?
      "exec_lo" : "exec_hi";
    CallInst *CI = cast<CallInst>(
      EmitSpecialRegisterBuiltin(*this, E, Int32Ty, Int32Ty, true, RegName));
    CI->setConvergent();
    return CI;
  }
  // amdgcn workitem
  // Workitem IDs are annotated (via emitRangedBuiltin) with the value
  // range [0, 1024).
  case AMDGPU::BI__builtin_amdgcn_workitem_id_x:
    return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_x, 0, 1024);
  case AMDGPU::BI__builtin_amdgcn_workitem_id_y:
    return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_y, 0, 1024);
  case AMDGPU::BI__builtin_amdgcn_workitem_id_z:
    return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_z, 0, 1024);

  // r600 intrinsics
  case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
  case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
    return emitUnaryBuiltin(*this, E, Intrinsic::r600_recipsqrt_ieee);
  case AMDGPU::BI__builtin_r600_read_tidig_x:
    return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_x, 0, 1024);
  case AMDGPU::BI__builtin_r600_read_tidig_y:
    return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_y, 0, 1024);
  case AMDGPU::BI__builtin_r600_read_tidig_z:
    return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_z, 0, 1024);
  default:
    // No special lowering for this builtin.
    return nullptr;
  }
}
11941 
11942 /// Handle a SystemZ function in which the final argument is a pointer
11943 /// to an int that receives the post-instruction CC value.  At the LLVM level
11944 /// this is represented as a function that returns a {result, cc} pair.
11945 static Value *EmitSystemZIntrinsicWithCC(CodeGenFunction &CGF,
11946                                          unsigned IntrinsicID,
11947                                          const CallExpr *E) {
11948   unsigned NumArgs = E->getNumArgs() - 1;
11949   SmallVector<Value *, 8> Args(NumArgs);
11950   for (unsigned I = 0; I < NumArgs; ++I)
11951     Args[I] = CGF.EmitScalarExpr(E->getArg(I));
11952   Address CCPtr = CGF.EmitPointerWithAlignment(E->getArg(NumArgs));
11953   Value *F = CGF.CGM.getIntrinsic(IntrinsicID);
11954   Value *Call = CGF.Builder.CreateCall(F, Args);
11955   Value *CC = CGF.Builder.CreateExtractValue(Call, 1);
11956   CGF.Builder.CreateStore(CC, CCPtr);
11957   return CGF.Builder.CreateExtractValue(Call, 0);
11958 }
11959 
11960 Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID,
11961                                                const CallExpr *E) {
11962   switch (BuiltinID) {
11963   case SystemZ::BI__builtin_tbegin: {
11964     Value *TDB = EmitScalarExpr(E->getArg(0));
11965     Value *Control = llvm::ConstantInt::get(Int32Ty, 0xff0c);
11966     Value *F = CGM.getIntrinsic(Intrinsic::s390_tbegin);
11967     return Builder.CreateCall(F, {TDB, Control});
11968   }
11969   case SystemZ::BI__builtin_tbegin_nofloat: {
11970     Value *TDB = EmitScalarExpr(E->getArg(0));
11971     Value *Control = llvm::ConstantInt::get(Int32Ty, 0xff0c);
11972     Value *F = CGM.getIntrinsic(Intrinsic::s390_tbegin_nofloat);
11973     return Builder.CreateCall(F, {TDB, Control});
11974   }
11975   case SystemZ::BI__builtin_tbeginc: {
11976     Value *TDB = llvm::ConstantPointerNull::get(Int8PtrTy);
11977     Value *Control = llvm::ConstantInt::get(Int32Ty, 0xff08);
11978     Value *F = CGM.getIntrinsic(Intrinsic::s390_tbeginc);
11979     return Builder.CreateCall(F, {TDB, Control});
11980   }
11981   case SystemZ::BI__builtin_tabort: {
11982     Value *Data = EmitScalarExpr(E->getArg(0));
11983     Value *F = CGM.getIntrinsic(Intrinsic::s390_tabort);
11984     return Builder.CreateCall(F, Builder.CreateSExt(Data, Int64Ty, "tabort"));
11985   }
11986   case SystemZ::BI__builtin_non_tx_store: {
11987     Value *Address = EmitScalarExpr(E->getArg(0));
11988     Value *Data = EmitScalarExpr(E->getArg(1));
11989     Value *F = CGM.getIntrinsic(Intrinsic::s390_ntstg);
11990     return Builder.CreateCall(F, {Data, Address});
11991   }
11992 
11993   // Vector builtins.  Note that most vector builtins are mapped automatically
11994   // to target-specific LLVM intrinsics.  The ones handled specially here can
11995   // be represented via standard LLVM IR, which is preferable to enable common
11996   // LLVM optimizations.
11997 
11998   case SystemZ::BI__builtin_s390_vpopctb:
11999   case SystemZ::BI__builtin_s390_vpopcth:
12000   case SystemZ::BI__builtin_s390_vpopctf:
12001   case SystemZ::BI__builtin_s390_vpopctg: {
12002     llvm::Type *ResultType = ConvertType(E->getType());
12003     Value *X = EmitScalarExpr(E->getArg(0));
12004     Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ResultType);
12005     return Builder.CreateCall(F, X);
12006   }
12007 
12008   case SystemZ::BI__builtin_s390_vclzb:
12009   case SystemZ::BI__builtin_s390_vclzh:
12010   case SystemZ::BI__builtin_s390_vclzf:
12011   case SystemZ::BI__builtin_s390_vclzg: {
12012     llvm::Type *ResultType = ConvertType(E->getType());
12013     Value *X = EmitScalarExpr(E->getArg(0));
12014     Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
12015     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ResultType);
12016     return Builder.CreateCall(F, {X, Undef});
12017   }
12018 
12019   case SystemZ::BI__builtin_s390_vctzb:
12020   case SystemZ::BI__builtin_s390_vctzh:
12021   case SystemZ::BI__builtin_s390_vctzf:
12022   case SystemZ::BI__builtin_s390_vctzg: {
12023     llvm::Type *ResultType = ConvertType(E->getType());
12024     Value *X = EmitScalarExpr(E->getArg(0));
12025     Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
12026     Function *F = CGM.getIntrinsic(Intrinsic::cttz, ResultType);
12027     return Builder.CreateCall(F, {X, Undef});
12028   }
12029 
12030   case SystemZ::BI__builtin_s390_vfsqsb:
12031   case SystemZ::BI__builtin_s390_vfsqdb: {
12032     llvm::Type *ResultType = ConvertType(E->getType());
12033     Value *X = EmitScalarExpr(E->getArg(0));
12034     Function *F = CGM.getIntrinsic(Intrinsic::sqrt, ResultType);
12035     return Builder.CreateCall(F, X);
12036   }
12037   case SystemZ::BI__builtin_s390_vfmasb:
12038   case SystemZ::BI__builtin_s390_vfmadb: {
12039     llvm::Type *ResultType = ConvertType(E->getType());
12040     Value *X = EmitScalarExpr(E->getArg(0));
12041     Value *Y = EmitScalarExpr(E->getArg(1));
12042     Value *Z = EmitScalarExpr(E->getArg(2));
12043     Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
12044     return Builder.CreateCall(F, {X, Y, Z});
12045   }
12046   case SystemZ::BI__builtin_s390_vfmssb:
12047   case SystemZ::BI__builtin_s390_vfmsdb: {
12048     llvm::Type *ResultType = ConvertType(E->getType());
12049     Value *X = EmitScalarExpr(E->getArg(0));
12050     Value *Y = EmitScalarExpr(E->getArg(1));
12051     Value *Z = EmitScalarExpr(E->getArg(2));
12052     Value *Zero = llvm::ConstantFP::getZeroValueForNegation(ResultType);
12053     Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
12054     return Builder.CreateCall(F, {X, Y, Builder.CreateFSub(Zero, Z, "sub")});
12055   }
12056   case SystemZ::BI__builtin_s390_vfnmasb:
12057   case SystemZ::BI__builtin_s390_vfnmadb: {
12058     llvm::Type *ResultType = ConvertType(E->getType());
12059     Value *X = EmitScalarExpr(E->getArg(0));
12060     Value *Y = EmitScalarExpr(E->getArg(1));
12061     Value *Z = EmitScalarExpr(E->getArg(2));
12062     Value *Zero = llvm::ConstantFP::getZeroValueForNegation(ResultType);
12063     Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
12064     return Builder.CreateFSub(Zero, Builder.CreateCall(F, {X, Y, Z}), "sub");
12065   }
12066   case SystemZ::BI__builtin_s390_vfnmssb:
12067   case SystemZ::BI__builtin_s390_vfnmsdb: {
12068     llvm::Type *ResultType = ConvertType(E->getType());
12069     Value *X = EmitScalarExpr(E->getArg(0));
12070     Value *Y = EmitScalarExpr(E->getArg(1));
12071     Value *Z = EmitScalarExpr(E->getArg(2));
12072     Value *Zero = llvm::ConstantFP::getZeroValueForNegation(ResultType);
12073     Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
12074     Value *NegZ = Builder.CreateFSub(Zero, Z, "sub");
12075     return Builder.CreateFSub(Zero, Builder.CreateCall(F, {X, Y, NegZ}));
12076   }
12077   case SystemZ::BI__builtin_s390_vflpsb:
12078   case SystemZ::BI__builtin_s390_vflpdb: {
12079     llvm::Type *ResultType = ConvertType(E->getType());
12080     Value *X = EmitScalarExpr(E->getArg(0));
12081     Function *F = CGM.getIntrinsic(Intrinsic::fabs, ResultType);
12082     return Builder.CreateCall(F, X);
12083   }
12084   case SystemZ::BI__builtin_s390_vflnsb:
12085   case SystemZ::BI__builtin_s390_vflndb: {
12086     llvm::Type *ResultType = ConvertType(E->getType());
12087     Value *X = EmitScalarExpr(E->getArg(0));
12088     Value *Zero = llvm::ConstantFP::getZeroValueForNegation(ResultType);
12089     Function *F = CGM.getIntrinsic(Intrinsic::fabs, ResultType);
12090     return Builder.CreateFSub(Zero, Builder.CreateCall(F, X), "sub");
12091   }
12092   case SystemZ::BI__builtin_s390_vfisb:
12093   case SystemZ::BI__builtin_s390_vfidb: {
12094     llvm::Type *ResultType = ConvertType(E->getType());
12095     Value *X = EmitScalarExpr(E->getArg(0));
12096     // Constant-fold the M4 and M5 mask arguments.
12097     llvm::APSInt M4, M5;
12098     bool IsConstM4 = E->getArg(1)->isIntegerConstantExpr(M4, getContext());
12099     bool IsConstM5 = E->getArg(2)->isIntegerConstantExpr(M5, getContext());
12100     assert(IsConstM4 && IsConstM5 && "Constant arg isn't actually constant?");
12101     (void)IsConstM4; (void)IsConstM5;
12102     // Check whether this instance can be represented via a LLVM standard
12103     // intrinsic.  We only support some combinations of M4 and M5.
12104     Intrinsic::ID ID = Intrinsic::not_intrinsic;
12105     switch (M4.getZExtValue()) {
12106     default: break;
12107     case 0:  // IEEE-inexact exception allowed
12108       switch (M5.getZExtValue()) {
12109       default: break;
12110       case 0: ID = Intrinsic::rint; break;
12111       }
12112       break;
12113     case 4:  // IEEE-inexact exception suppressed
12114       switch (M5.getZExtValue()) {
12115       default: break;
12116       case 0: ID = Intrinsic::nearbyint; break;
12117       case 1: ID = Intrinsic::round; break;
12118       case 5: ID = Intrinsic::trunc; break;
12119       case 6: ID = Intrinsic::ceil; break;
12120       case 7: ID = Intrinsic::floor; break;
12121       }
12122       break;
12123     }
12124     if (ID != Intrinsic::not_intrinsic) {
12125       Function *F = CGM.getIntrinsic(ID, ResultType);
12126       return Builder.CreateCall(F, X);
12127     }
12128     switch (BuiltinID) {
12129       case SystemZ::BI__builtin_s390_vfisb: ID = Intrinsic::s390_vfisb; break;
12130       case SystemZ::BI__builtin_s390_vfidb: ID = Intrinsic::s390_vfidb; break;
12131       default: llvm_unreachable("Unknown BuiltinID");
12132     }
12133     Function *F = CGM.getIntrinsic(ID);
12134     Value *M4Value = llvm::ConstantInt::get(getLLVMContext(), M4);
12135     Value *M5Value = llvm::ConstantInt::get(getLLVMContext(), M5);
12136     return Builder.CreateCall(F, {X, M4Value, M5Value});
12137   }
12138   case SystemZ::BI__builtin_s390_vfmaxsb:
12139   case SystemZ::BI__builtin_s390_vfmaxdb: {
12140     llvm::Type *ResultType = ConvertType(E->getType());
12141     Value *X = EmitScalarExpr(E->getArg(0));
12142     Value *Y = EmitScalarExpr(E->getArg(1));
12143     // Constant-fold the M4 mask argument.
12144     llvm::APSInt M4;
12145     bool IsConstM4 = E->getArg(2)->isIntegerConstantExpr(M4, getContext());
12146     assert(IsConstM4 && "Constant arg isn't actually constant?");
12147     (void)IsConstM4;
12148     // Check whether this instance can be represented via an LLVM standard
12149     // intrinsic.  We only support some values of M4.
12150     Intrinsic::ID ID = Intrinsic::not_intrinsic;
12151     switch (M4.getZExtValue()) {
12152     default: break;
12153     case 4: ID = Intrinsic::maxnum; break;
12154     }
12155     if (ID != Intrinsic::not_intrinsic) {
12156       Function *F = CGM.getIntrinsic(ID, ResultType);
12157       return Builder.CreateCall(F, {X, Y});
12158     }
12159     switch (BuiltinID) {
12160       case SystemZ::BI__builtin_s390_vfmaxsb: ID = Intrinsic::s390_vfmaxsb; break;
12161       case SystemZ::BI__builtin_s390_vfmaxdb: ID = Intrinsic::s390_vfmaxdb; break;
12162       default: llvm_unreachable("Unknown BuiltinID");
12163     }
12164     Function *F = CGM.getIntrinsic(ID);
12165     Value *M4Value = llvm::ConstantInt::get(getLLVMContext(), M4);
12166     return Builder.CreateCall(F, {X, Y, M4Value});
12167   }
12168   case SystemZ::BI__builtin_s390_vfminsb:
12169   case SystemZ::BI__builtin_s390_vfmindb: {
12170     llvm::Type *ResultType = ConvertType(E->getType());
12171     Value *X = EmitScalarExpr(E->getArg(0));
12172     Value *Y = EmitScalarExpr(E->getArg(1));
12173     // Constant-fold the M4 mask argument.
12174     llvm::APSInt M4;
12175     bool IsConstM4 = E->getArg(2)->isIntegerConstantExpr(M4, getContext());
12176     assert(IsConstM4 && "Constant arg isn't actually constant?");
12177     (void)IsConstM4;
12178     // Check whether this instance can be represented via an LLVM standard
12179     // intrinsic.  We only support some values of M4.
12180     Intrinsic::ID ID = Intrinsic::not_intrinsic;
12181     switch (M4.getZExtValue()) {
12182     default: break;
12183     case 4: ID = Intrinsic::minnum; break;
12184     }
12185     if (ID != Intrinsic::not_intrinsic) {
12186       Function *F = CGM.getIntrinsic(ID, ResultType);
12187       return Builder.CreateCall(F, {X, Y});
12188     }
12189     switch (BuiltinID) {
12190       case SystemZ::BI__builtin_s390_vfminsb: ID = Intrinsic::s390_vfminsb; break;
12191       case SystemZ::BI__builtin_s390_vfmindb: ID = Intrinsic::s390_vfmindb; break;
12192       default: llvm_unreachable("Unknown BuiltinID");
12193     }
12194     Function *F = CGM.getIntrinsic(ID);
12195     Value *M4Value = llvm::ConstantInt::get(getLLVMContext(), M4);
12196     return Builder.CreateCall(F, {X, Y, M4Value});
12197   }
12198 
12199   // Vector intrinsics that output the post-instruction CC value.
12200 
12201 #define INTRINSIC_WITH_CC(NAME) \
12202     case SystemZ::BI__builtin_##NAME: \
12203       return EmitSystemZIntrinsicWithCC(*this, Intrinsic::NAME, E)
12204 
12205   INTRINSIC_WITH_CC(s390_vpkshs);
12206   INTRINSIC_WITH_CC(s390_vpksfs);
12207   INTRINSIC_WITH_CC(s390_vpksgs);
12208 
12209   INTRINSIC_WITH_CC(s390_vpklshs);
12210   INTRINSIC_WITH_CC(s390_vpklsfs);
12211   INTRINSIC_WITH_CC(s390_vpklsgs);
12212 
12213   INTRINSIC_WITH_CC(s390_vceqbs);
12214   INTRINSIC_WITH_CC(s390_vceqhs);
12215   INTRINSIC_WITH_CC(s390_vceqfs);
12216   INTRINSIC_WITH_CC(s390_vceqgs);
12217 
12218   INTRINSIC_WITH_CC(s390_vchbs);
12219   INTRINSIC_WITH_CC(s390_vchhs);
12220   INTRINSIC_WITH_CC(s390_vchfs);
12221   INTRINSIC_WITH_CC(s390_vchgs);
12222 
12223   INTRINSIC_WITH_CC(s390_vchlbs);
12224   INTRINSIC_WITH_CC(s390_vchlhs);
12225   INTRINSIC_WITH_CC(s390_vchlfs);
12226   INTRINSIC_WITH_CC(s390_vchlgs);
12227 
12228   INTRINSIC_WITH_CC(s390_vfaebs);
12229   INTRINSIC_WITH_CC(s390_vfaehs);
12230   INTRINSIC_WITH_CC(s390_vfaefs);
12231 
12232   INTRINSIC_WITH_CC(s390_vfaezbs);
12233   INTRINSIC_WITH_CC(s390_vfaezhs);
12234   INTRINSIC_WITH_CC(s390_vfaezfs);
12235 
12236   INTRINSIC_WITH_CC(s390_vfeebs);
12237   INTRINSIC_WITH_CC(s390_vfeehs);
12238   INTRINSIC_WITH_CC(s390_vfeefs);
12239 
12240   INTRINSIC_WITH_CC(s390_vfeezbs);
12241   INTRINSIC_WITH_CC(s390_vfeezhs);
12242   INTRINSIC_WITH_CC(s390_vfeezfs);
12243 
12244   INTRINSIC_WITH_CC(s390_vfenebs);
12245   INTRINSIC_WITH_CC(s390_vfenehs);
12246   INTRINSIC_WITH_CC(s390_vfenefs);
12247 
12248   INTRINSIC_WITH_CC(s390_vfenezbs);
12249   INTRINSIC_WITH_CC(s390_vfenezhs);
12250   INTRINSIC_WITH_CC(s390_vfenezfs);
12251 
12252   INTRINSIC_WITH_CC(s390_vistrbs);
12253   INTRINSIC_WITH_CC(s390_vistrhs);
12254   INTRINSIC_WITH_CC(s390_vistrfs);
12255 
12256   INTRINSIC_WITH_CC(s390_vstrcbs);
12257   INTRINSIC_WITH_CC(s390_vstrchs);
12258   INTRINSIC_WITH_CC(s390_vstrcfs);
12259 
12260   INTRINSIC_WITH_CC(s390_vstrczbs);
12261   INTRINSIC_WITH_CC(s390_vstrczhs);
12262   INTRINSIC_WITH_CC(s390_vstrczfs);
12263 
12264   INTRINSIC_WITH_CC(s390_vfcesbs);
12265   INTRINSIC_WITH_CC(s390_vfcedbs);
12266   INTRINSIC_WITH_CC(s390_vfchsbs);
12267   INTRINSIC_WITH_CC(s390_vfchdbs);
12268   INTRINSIC_WITH_CC(s390_vfchesbs);
12269   INTRINSIC_WITH_CC(s390_vfchedbs);
12270 
12271   INTRINSIC_WITH_CC(s390_vftcisb);
12272   INTRINSIC_WITH_CC(s390_vftcidb);
12273 
12274 #undef INTRINSIC_WITH_CC
12275 
12276   default:
12277     return nullptr;
12278   }
12279 }
12280 
12281 Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
12282                                              const CallExpr *E) {
12283   auto MakeLdg = [&](unsigned IntrinsicID) {
12284     Value *Ptr = EmitScalarExpr(E->getArg(0));
12285     clang::CharUnits Align =
12286         getNaturalPointeeTypeAlignment(E->getArg(0)->getType());
12287     return Builder.CreateCall(
12288         CGM.getIntrinsic(IntrinsicID, {Ptr->getType()->getPointerElementType(),
12289                                        Ptr->getType()}),
12290         {Ptr, ConstantInt::get(Builder.getInt32Ty(), Align.getQuantity())});
12291   };
12292   auto MakeScopedAtomic = [&](unsigned IntrinsicID) {
12293     Value *Ptr = EmitScalarExpr(E->getArg(0));
12294     return Builder.CreateCall(
12295         CGM.getIntrinsic(IntrinsicID, {Ptr->getType()->getPointerElementType(),
12296                                        Ptr->getType()}),
12297         {Ptr, EmitScalarExpr(E->getArg(1))});
12298   };
12299   switch (BuiltinID) {
12300   case NVPTX::BI__nvvm_atom_add_gen_i:
12301   case NVPTX::BI__nvvm_atom_add_gen_l:
12302   case NVPTX::BI__nvvm_atom_add_gen_ll:
12303     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Add, E);
12304 
12305   case NVPTX::BI__nvvm_atom_sub_gen_i:
12306   case NVPTX::BI__nvvm_atom_sub_gen_l:
12307   case NVPTX::BI__nvvm_atom_sub_gen_ll:
12308     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Sub, E);
12309 
12310   case NVPTX::BI__nvvm_atom_and_gen_i:
12311   case NVPTX::BI__nvvm_atom_and_gen_l:
12312   case NVPTX::BI__nvvm_atom_and_gen_ll:
12313     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::And, E);
12314 
12315   case NVPTX::BI__nvvm_atom_or_gen_i:
12316   case NVPTX::BI__nvvm_atom_or_gen_l:
12317   case NVPTX::BI__nvvm_atom_or_gen_ll:
12318     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Or, E);
12319 
12320   case NVPTX::BI__nvvm_atom_xor_gen_i:
12321   case NVPTX::BI__nvvm_atom_xor_gen_l:
12322   case NVPTX::BI__nvvm_atom_xor_gen_ll:
12323     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xor, E);
12324 
12325   case NVPTX::BI__nvvm_atom_xchg_gen_i:
12326   case NVPTX::BI__nvvm_atom_xchg_gen_l:
12327   case NVPTX::BI__nvvm_atom_xchg_gen_ll:
12328     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xchg, E);
12329 
12330   case NVPTX::BI__nvvm_atom_max_gen_i:
12331   case NVPTX::BI__nvvm_atom_max_gen_l:
12332   case NVPTX::BI__nvvm_atom_max_gen_ll:
12333     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Max, E);
12334 
12335   case NVPTX::BI__nvvm_atom_max_gen_ui:
12336   case NVPTX::BI__nvvm_atom_max_gen_ul:
12337   case NVPTX::BI__nvvm_atom_max_gen_ull:
12338     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMax, E);
12339 
12340   case NVPTX::BI__nvvm_atom_min_gen_i:
12341   case NVPTX::BI__nvvm_atom_min_gen_l:
12342   case NVPTX::BI__nvvm_atom_min_gen_ll:
12343     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Min, E);
12344 
12345   case NVPTX::BI__nvvm_atom_min_gen_ui:
12346   case NVPTX::BI__nvvm_atom_min_gen_ul:
12347   case NVPTX::BI__nvvm_atom_min_gen_ull:
12348     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMin, E);
12349 
12350   case NVPTX::BI__nvvm_atom_cas_gen_i:
12351   case NVPTX::BI__nvvm_atom_cas_gen_l:
12352   case NVPTX::BI__nvvm_atom_cas_gen_ll:
12353     // __nvvm_atom_cas_gen_* should return the old value rather than the
12354     // success flag.
12355     return MakeAtomicCmpXchgValue(*this, E, /*ReturnBool=*/false);
12356 
12357   case NVPTX::BI__nvvm_atom_add_gen_f: {
12358     Value *Ptr = EmitScalarExpr(E->getArg(0));
12359     Value *Val = EmitScalarExpr(E->getArg(1));
12360     // atomicrmw only deals with integer arguments so we need to use
12361     // LLVM's nvvm_atomic_load_add_f32 intrinsic for that.
12362     Value *FnALAF32 =
12363         CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_add_f32, Ptr->getType());
12364     return Builder.CreateCall(FnALAF32, {Ptr, Val});
12365   }
12366 
12367   case NVPTX::BI__nvvm_atom_add_gen_d: {
12368     Value *Ptr = EmitScalarExpr(E->getArg(0));
12369     Value *Val = EmitScalarExpr(E->getArg(1));
12370     // atomicrmw only deals with integer arguments, so we need to use
12371     // LLVM's nvvm_atomic_load_add_f64 intrinsic.
12372     Value *FnALAF64 =
12373         CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_add_f64, Ptr->getType());
12374     return Builder.CreateCall(FnALAF64, {Ptr, Val});
12375   }
12376 
12377   case NVPTX::BI__nvvm_atom_inc_gen_ui: {
12378     Value *Ptr = EmitScalarExpr(E->getArg(0));
12379     Value *Val = EmitScalarExpr(E->getArg(1));
12380     Value *FnALI32 =
12381         CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_inc_32, Ptr->getType());
12382     return Builder.CreateCall(FnALI32, {Ptr, Val});
12383   }
12384 
12385   case NVPTX::BI__nvvm_atom_dec_gen_ui: {
12386     Value *Ptr = EmitScalarExpr(E->getArg(0));
12387     Value *Val = EmitScalarExpr(E->getArg(1));
12388     Value *FnALD32 =
12389         CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_dec_32, Ptr->getType());
12390     return Builder.CreateCall(FnALD32, {Ptr, Val});
12391   }
12392 
12393   case NVPTX::BI__nvvm_ldg_c:
12394   case NVPTX::BI__nvvm_ldg_c2:
12395   case NVPTX::BI__nvvm_ldg_c4:
12396   case NVPTX::BI__nvvm_ldg_s:
12397   case NVPTX::BI__nvvm_ldg_s2:
12398   case NVPTX::BI__nvvm_ldg_s4:
12399   case NVPTX::BI__nvvm_ldg_i:
12400   case NVPTX::BI__nvvm_ldg_i2:
12401   case NVPTX::BI__nvvm_ldg_i4:
12402   case NVPTX::BI__nvvm_ldg_l:
12403   case NVPTX::BI__nvvm_ldg_ll:
12404   case NVPTX::BI__nvvm_ldg_ll2:
12405   case NVPTX::BI__nvvm_ldg_uc:
12406   case NVPTX::BI__nvvm_ldg_uc2:
12407   case NVPTX::BI__nvvm_ldg_uc4:
12408   case NVPTX::BI__nvvm_ldg_us:
12409   case NVPTX::BI__nvvm_ldg_us2:
12410   case NVPTX::BI__nvvm_ldg_us4:
12411   case NVPTX::BI__nvvm_ldg_ui:
12412   case NVPTX::BI__nvvm_ldg_ui2:
12413   case NVPTX::BI__nvvm_ldg_ui4:
12414   case NVPTX::BI__nvvm_ldg_ul:
12415   case NVPTX::BI__nvvm_ldg_ull:
12416   case NVPTX::BI__nvvm_ldg_ull2:
12417     // PTX Interoperability section 2.2: "For a vector with an even number of
12418     // elements, its alignment is set to number of elements times the alignment
12419     // of its member: n*alignof(t)."
12420     return MakeLdg(Intrinsic::nvvm_ldg_global_i);
12421   case NVPTX::BI__nvvm_ldg_f:
12422   case NVPTX::BI__nvvm_ldg_f2:
12423   case NVPTX::BI__nvvm_ldg_f4:
12424   case NVPTX::BI__nvvm_ldg_d:
12425   case NVPTX::BI__nvvm_ldg_d2:
12426     return MakeLdg(Intrinsic::nvvm_ldg_global_f);
12427 
12428   case NVPTX::BI__nvvm_atom_cta_add_gen_i:
12429   case NVPTX::BI__nvvm_atom_cta_add_gen_l:
12430   case NVPTX::BI__nvvm_atom_cta_add_gen_ll:
12431     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_cta);
12432   case NVPTX::BI__nvvm_atom_sys_add_gen_i:
12433   case NVPTX::BI__nvvm_atom_sys_add_gen_l:
12434   case NVPTX::BI__nvvm_atom_sys_add_gen_ll:
12435     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_sys);
12436   case NVPTX::BI__nvvm_atom_cta_add_gen_f:
12437   case NVPTX::BI__nvvm_atom_cta_add_gen_d:
12438     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_cta);
12439   case NVPTX::BI__nvvm_atom_sys_add_gen_f:
12440   case NVPTX::BI__nvvm_atom_sys_add_gen_d:
12441     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_sys);
12442   case NVPTX::BI__nvvm_atom_cta_xchg_gen_i:
12443   case NVPTX::BI__nvvm_atom_cta_xchg_gen_l:
12444   case NVPTX::BI__nvvm_atom_cta_xchg_gen_ll:
12445     return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_cta);
12446   case NVPTX::BI__nvvm_atom_sys_xchg_gen_i:
12447   case NVPTX::BI__nvvm_atom_sys_xchg_gen_l:
12448   case NVPTX::BI__nvvm_atom_sys_xchg_gen_ll:
12449     return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_sys);
12450   case NVPTX::BI__nvvm_atom_cta_max_gen_i:
12451   case NVPTX::BI__nvvm_atom_cta_max_gen_ui:
12452   case NVPTX::BI__nvvm_atom_cta_max_gen_l:
12453   case NVPTX::BI__nvvm_atom_cta_max_gen_ul:
12454   case NVPTX::BI__nvvm_atom_cta_max_gen_ll:
12455   case NVPTX::BI__nvvm_atom_cta_max_gen_ull:
12456     return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_cta);
12457   case NVPTX::BI__nvvm_atom_sys_max_gen_i:
12458   case NVPTX::BI__nvvm_atom_sys_max_gen_ui:
12459   case NVPTX::BI__nvvm_atom_sys_max_gen_l:
12460   case NVPTX::BI__nvvm_atom_sys_max_gen_ul:
12461   case NVPTX::BI__nvvm_atom_sys_max_gen_ll:
12462   case NVPTX::BI__nvvm_atom_sys_max_gen_ull:
12463     return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_sys);
12464   case NVPTX::BI__nvvm_atom_cta_min_gen_i:
12465   case NVPTX::BI__nvvm_atom_cta_min_gen_ui:
12466   case NVPTX::BI__nvvm_atom_cta_min_gen_l:
12467   case NVPTX::BI__nvvm_atom_cta_min_gen_ul:
12468   case NVPTX::BI__nvvm_atom_cta_min_gen_ll:
12469   case NVPTX::BI__nvvm_atom_cta_min_gen_ull:
12470     return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_cta);
12471   case NVPTX::BI__nvvm_atom_sys_min_gen_i:
12472   case NVPTX::BI__nvvm_atom_sys_min_gen_ui:
12473   case NVPTX::BI__nvvm_atom_sys_min_gen_l:
12474   case NVPTX::BI__nvvm_atom_sys_min_gen_ul:
12475   case NVPTX::BI__nvvm_atom_sys_min_gen_ll:
12476   case NVPTX::BI__nvvm_atom_sys_min_gen_ull:
12477     return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_sys);
12478   case NVPTX::BI__nvvm_atom_cta_inc_gen_ui:
12479     return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_cta);
12480   case NVPTX::BI__nvvm_atom_cta_dec_gen_ui:
12481     return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_cta);
12482   case NVPTX::BI__nvvm_atom_sys_inc_gen_ui:
12483     return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_sys);
12484   case NVPTX::BI__nvvm_atom_sys_dec_gen_ui:
12485     return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_sys);
12486   case NVPTX::BI__nvvm_atom_cta_and_gen_i:
12487   case NVPTX::BI__nvvm_atom_cta_and_gen_l:
12488   case NVPTX::BI__nvvm_atom_cta_and_gen_ll:
12489     return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_cta);
12490   case NVPTX::BI__nvvm_atom_sys_and_gen_i:
12491   case NVPTX::BI__nvvm_atom_sys_and_gen_l:
12492   case NVPTX::BI__nvvm_atom_sys_and_gen_ll:
12493     return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_sys);
12494   case NVPTX::BI__nvvm_atom_cta_or_gen_i:
12495   case NVPTX::BI__nvvm_atom_cta_or_gen_l:
12496   case NVPTX::BI__nvvm_atom_cta_or_gen_ll:
12497     return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_cta);
12498   case NVPTX::BI__nvvm_atom_sys_or_gen_i:
12499   case NVPTX::BI__nvvm_atom_sys_or_gen_l:
12500   case NVPTX::BI__nvvm_atom_sys_or_gen_ll:
12501     return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_sys);
12502   case NVPTX::BI__nvvm_atom_cta_xor_gen_i:
12503   case NVPTX::BI__nvvm_atom_cta_xor_gen_l:
12504   case NVPTX::BI__nvvm_atom_cta_xor_gen_ll:
12505     return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_cta);
12506   case NVPTX::BI__nvvm_atom_sys_xor_gen_i:
12507   case NVPTX::BI__nvvm_atom_sys_xor_gen_l:
12508   case NVPTX::BI__nvvm_atom_sys_xor_gen_ll:
12509     return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_sys);
12510   case NVPTX::BI__nvvm_atom_cta_cas_gen_i:
12511   case NVPTX::BI__nvvm_atom_cta_cas_gen_l:
12512   case NVPTX::BI__nvvm_atom_cta_cas_gen_ll: {
12513     Value *Ptr = EmitScalarExpr(E->getArg(0));
12514     return Builder.CreateCall(
12515         CGM.getIntrinsic(
12516             Intrinsic::nvvm_atomic_cas_gen_i_cta,
12517             {Ptr->getType()->getPointerElementType(), Ptr->getType()}),
12518         {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
12519   }
12520   case NVPTX::BI__nvvm_atom_sys_cas_gen_i:
12521   case NVPTX::BI__nvvm_atom_sys_cas_gen_l:
12522   case NVPTX::BI__nvvm_atom_sys_cas_gen_ll: {
12523     Value *Ptr = EmitScalarExpr(E->getArg(0));
12524     return Builder.CreateCall(
12525         CGM.getIntrinsic(
12526             Intrinsic::nvvm_atomic_cas_gen_i_sys,
12527             {Ptr->getType()->getPointerElementType(), Ptr->getType()}),
12528         {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
12529   }
12530   case NVPTX::BI__nvvm_match_all_sync_i32p:
12531   case NVPTX::BI__nvvm_match_all_sync_i64p: {
12532     Value *Mask = EmitScalarExpr(E->getArg(0));
12533     Value *Val = EmitScalarExpr(E->getArg(1));
12534     Address PredOutPtr = EmitPointerWithAlignment(E->getArg(2));
12535     Value *ResultPair = Builder.CreateCall(
12536         CGM.getIntrinsic(BuiltinID == NVPTX::BI__nvvm_match_all_sync_i32p
12537                              ? Intrinsic::nvvm_match_all_sync_i32p
12538                              : Intrinsic::nvvm_match_all_sync_i64p),
12539         {Mask, Val});
12540     Value *Pred = Builder.CreateZExt(Builder.CreateExtractValue(ResultPair, 1),
12541                                      PredOutPtr.getElementType());
12542     Builder.CreateStore(Pred, PredOutPtr);
12543     return Builder.CreateExtractValue(ResultPair, 0);
12544   }
12545   case NVPTX::BI__hmma_m16n16k16_ld_a:
12546   case NVPTX::BI__hmma_m16n16k16_ld_b:
12547   case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
12548   case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
12549   case NVPTX::BI__hmma_m32n8k16_ld_a:
12550   case NVPTX::BI__hmma_m32n8k16_ld_b:
12551   case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
12552   case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
12553   case NVPTX::BI__hmma_m8n32k16_ld_a:
12554   case NVPTX::BI__hmma_m8n32k16_ld_b:
12555   case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
12556   case NVPTX::BI__hmma_m8n32k16_ld_c_f32: {
12557     Address Dst = EmitPointerWithAlignment(E->getArg(0));
12558     Value *Src = EmitScalarExpr(E->getArg(1));
12559     Value *Ldm = EmitScalarExpr(E->getArg(2));
12560     llvm::APSInt isColMajorArg;
12561     if (!E->getArg(3)->isIntegerConstantExpr(isColMajorArg, getContext()))
12562       return nullptr;
12563     bool isColMajor = isColMajorArg.getSExtValue();
12564     unsigned IID;
12565     unsigned NumResults;
12566     switch (BuiltinID) {
12567     case NVPTX::BI__hmma_m16n16k16_ld_a:
12568       IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride
12569                        : Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride;
12570       NumResults = 8;
12571       break;
12572     case NVPTX::BI__hmma_m16n16k16_ld_b:
12573       IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride
12574                        : Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride;
12575       NumResults = 8;
12576       break;
12577     case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
12578       IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride
12579                        : Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride;
12580       NumResults = 4;
12581       break;
12582     case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
12583       IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride
12584                        : Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride;
12585       NumResults = 8;
12586       break;
12587     case NVPTX::BI__hmma_m32n8k16_ld_a:
12588       IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride
12589                        : Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride;
12590       NumResults = 8;
12591       break;
12592     case NVPTX::BI__hmma_m32n8k16_ld_b:
12593       IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride
12594                        : Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride;
12595       NumResults = 8;
12596       break;
12597     case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
12598       IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride
12599                        : Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride;
12600       NumResults = 4;
12601       break;
12602     case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
12603       IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride
12604                        : Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride;
12605       NumResults = 8;
12606       break;
12607     case NVPTX::BI__hmma_m8n32k16_ld_a:
12608       IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride
12609                        : Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride;
12610       NumResults = 8;
12611       break;
12612     case NVPTX::BI__hmma_m8n32k16_ld_b:
12613       IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride
12614                        : Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride;
12615       NumResults = 8;
12616       break;
12617     case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
12618       IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride
12619                        : Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride;
12620       NumResults = 4;
12621       break;
12622     case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
12623       IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride
12624                        : Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride;
12625       NumResults = 8;
12626       break;
12627     default:
12628       llvm_unreachable("Unexpected builtin ID.");
12629     }
12630     Value *Result =
12631         Builder.CreateCall(CGM.getIntrinsic(IID, Src->getType()), {Src, Ldm});
12632 
12633     // Save returned values.
12634     for (unsigned i = 0; i < NumResults; ++i) {
12635       Builder.CreateAlignedStore(
12636           Builder.CreateBitCast(Builder.CreateExtractValue(Result, i),
12637                                 Dst.getElementType()),
12638           Builder.CreateGEP(Dst.getPointer(), llvm::ConstantInt::get(IntTy, i)),
12639           CharUnits::fromQuantity(4));
12640     }
12641     return Result;
12642   }
12643 
12644   case NVPTX::BI__hmma_m16n16k16_st_c_f16:
12645   case NVPTX::BI__hmma_m16n16k16_st_c_f32:
12646   case NVPTX::BI__hmma_m32n8k16_st_c_f16:
12647   case NVPTX::BI__hmma_m32n8k16_st_c_f32:
12648   case NVPTX::BI__hmma_m8n32k16_st_c_f16:
12649   case NVPTX::BI__hmma_m8n32k16_st_c_f32: {
12650     Value *Dst = EmitScalarExpr(E->getArg(0));
12651     Address Src = EmitPointerWithAlignment(E->getArg(1));
12652     Value *Ldm = EmitScalarExpr(E->getArg(2));
12653     llvm::APSInt isColMajorArg;
12654     if (!E->getArg(3)->isIntegerConstantExpr(isColMajorArg, getContext()))
12655       return nullptr;
12656     bool isColMajor = isColMajorArg.getSExtValue();
12657     unsigned IID;
12658     unsigned NumResults = 8;
12659     // PTX instructions (and LLVM intrinsics) are defined for slice _d_, yet
12660     // for some reason nvcc builtins use _c_.
12661     switch (BuiltinID) {
12662     case NVPTX::BI__hmma_m16n16k16_st_c_f16:
12663       IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride
12664                        : Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride;
12665       NumResults = 4;
12666       break;
12667     case NVPTX::BI__hmma_m16n16k16_st_c_f32:
12668       IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride
12669                        : Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride;
12670       break;
12671     case NVPTX::BI__hmma_m32n8k16_st_c_f16:
12672       IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride
12673                        : Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride;
12674       NumResults = 4;
12675       break;
12676     case NVPTX::BI__hmma_m32n8k16_st_c_f32:
12677       IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride
12678                        : Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride;
12679       break;
12680     case NVPTX::BI__hmma_m8n32k16_st_c_f16:
12681       IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride
12682                        : Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride;
12683       NumResults = 4;
12684       break;
12685     case NVPTX::BI__hmma_m8n32k16_st_c_f32:
12686       IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride
12687                        : Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride;
12688       break;
12689     default:
12690       llvm_unreachable("Unexpected builtin ID.");
12691     }
12692     Function *Intrinsic = CGM.getIntrinsic(IID, Dst->getType());
12693     llvm::Type *ParamType = Intrinsic->getFunctionType()->getParamType(1);
12694     SmallVector<Value *, 10> Values = {Dst};
12695     for (unsigned i = 0; i < NumResults; ++i) {
12696       Value *V = Builder.CreateAlignedLoad(
12697           Builder.CreateGEP(Src.getPointer(), llvm::ConstantInt::get(IntTy, i)),
12698           CharUnits::fromQuantity(4));
12699       Values.push_back(Builder.CreateBitCast(V, ParamType));
12700     }
12701     Values.push_back(Ldm);
12702     Value *Result = Builder.CreateCall(Intrinsic, Values);
12703     return Result;
12704   }
12705 
12706   // BI__hmma_m16n16k16_mma_<Dtype><CType>(d, a, b, c, layout, satf) -->
12707   // Intrinsic::nvvm_wmma_m16n16k16_mma_sync<layout A,B><DType><CType><Satf>
12708   case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
12709   case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
12710   case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
12711   case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
12712   case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
12713   case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
12714   case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
12715   case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
12716   case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
12717   case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
12718   case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
12719   case NVPTX::BI__hmma_m8n32k16_mma_f16f32: {
12720     Address Dst = EmitPointerWithAlignment(E->getArg(0));
12721     Address SrcA = EmitPointerWithAlignment(E->getArg(1));
12722     Address SrcB = EmitPointerWithAlignment(E->getArg(2));
12723     Address SrcC = EmitPointerWithAlignment(E->getArg(3));
12724     llvm::APSInt LayoutArg;
12725     if (!E->getArg(4)->isIntegerConstantExpr(LayoutArg, getContext()))
12726       return nullptr;
12727     int Layout = LayoutArg.getSExtValue();
12728     if (Layout < 0 || Layout > 3)
12729       return nullptr;
12730     llvm::APSInt SatfArg;
12731     if (!E->getArg(5)->isIntegerConstantExpr(SatfArg, getContext()))
12732       return nullptr;
12733     bool Satf = SatfArg.getSExtValue();
12734 
12735     // clang-format off
12736 #define MMA_VARIANTS(geom, type) {{                                 \
12737       Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type,             \
12738       Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type##_satfinite, \
12739       Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type,             \
12740       Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
12741       Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type,             \
12742       Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type##_satfinite, \
12743       Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type,             \
12744       Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type##_satfinite  \
12745     }}
12746     // clang-format on
12747 
12748     auto getMMAIntrinsic = [Layout, Satf](std::array<unsigned, 8> Variants) {
12749       unsigned Index = Layout * 2 + Satf;
12750       assert(Index < 8);
12751       return Variants[Index];
12752     };
12753     unsigned IID;
12754     unsigned NumEltsC;
12755     unsigned NumEltsD;
12756     switch (BuiltinID) {
12757     case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
12758       IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f16_f16));
12759       NumEltsC = 4;
12760       NumEltsD = 4;
12761       break;
12762     case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
12763       IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f32_f16));
12764       NumEltsC = 4;
12765       NumEltsD = 8;
12766       break;
12767     case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
12768       IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f16_f32));
12769       NumEltsC = 8;
12770       NumEltsD = 4;
12771       break;
12772     case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
12773       IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f32_f32));
12774       NumEltsC = 8;
12775       NumEltsD = 8;
12776       break;
12777     case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
12778       IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f16_f16));
12779       NumEltsC = 4;
12780       NumEltsD = 4;
12781       break;
12782     case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
12783       IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f32_f16));
12784       NumEltsC = 4;
12785       NumEltsD = 8;
12786       break;
12787     case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
12788       IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f16_f32));
12789       NumEltsC = 8;
12790       NumEltsD = 4;
12791       break;
12792     case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
12793       IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f32_f32));
12794       NumEltsC = 8;
12795       NumEltsD = 8;
12796       break;
12797     case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
12798       IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f16_f16));
12799       NumEltsC = 4;
12800       NumEltsD = 4;
12801       break;
12802     case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
12803       IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f32_f16));
12804       NumEltsC = 4;
12805       NumEltsD = 8;
12806       break;
12807     case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
12808       IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f16_f32));
12809       NumEltsC = 8;
12810       NumEltsD = 4;
12811       break;
12812     case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
12813       IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f32_f32));
12814       NumEltsC = 8;
12815       NumEltsD = 8;
12816       break;
12817     default:
12818       llvm_unreachable("Unexpected builtin ID.");
12819     }
12820 #undef MMA_VARIANTS
12821 
12822     SmallVector<Value *, 24> Values;
12823     Function *Intrinsic = CGM.getIntrinsic(IID);
12824     llvm::Type *ABType = Intrinsic->getFunctionType()->getParamType(0);
12825     // Load A
12826     for (unsigned i = 0; i < 8; ++i) {
12827       Value *V = Builder.CreateAlignedLoad(
12828           Builder.CreateGEP(SrcA.getPointer(),
12829                             llvm::ConstantInt::get(IntTy, i)),
12830           CharUnits::fromQuantity(4));
12831       Values.push_back(Builder.CreateBitCast(V, ABType));
12832     }
12833     // Load B
12834     for (unsigned i = 0; i < 8; ++i) {
12835       Value *V = Builder.CreateAlignedLoad(
12836           Builder.CreateGEP(SrcB.getPointer(),
12837                             llvm::ConstantInt::get(IntTy, i)),
12838           CharUnits::fromQuantity(4));
12839       Values.push_back(Builder.CreateBitCast(V, ABType));
12840     }
12841     // Load C
12842     llvm::Type *CType = Intrinsic->getFunctionType()->getParamType(16);
12843     for (unsigned i = 0; i < NumEltsC; ++i) {
12844       Value *V = Builder.CreateAlignedLoad(
12845           Builder.CreateGEP(SrcC.getPointer(),
12846                             llvm::ConstantInt::get(IntTy, i)),
12847           CharUnits::fromQuantity(4));
12848       Values.push_back(Builder.CreateBitCast(V, CType));
12849     }
12850     Value *Result = Builder.CreateCall(Intrinsic, Values);
12851     llvm::Type *DType = Dst.getElementType();
12852     for (unsigned i = 0; i < NumEltsD; ++i)
12853       Builder.CreateAlignedStore(
12854           Builder.CreateBitCast(Builder.CreateExtractValue(Result, i), DType),
12855           Builder.CreateGEP(Dst.getPointer(), llvm::ConstantInt::get(IntTy, i)),
12856           CharUnits::fromQuantity(4));
12857     return Result;
12858   }
12859   default:
12860     return nullptr;
12861   }
12862 }
12863 
/// EmitWebAssemblyBuiltinExpr - Lower a WebAssembly target builtin call to
/// LLVM IR. Most builtins map directly onto a wasm-specific or generic LLVM
/// intrinsic; the SIMD lane builtins are lowered straight to vector element
/// operations. Returns the resulting Value, or nullptr if BuiltinID is not a
/// WebAssembly builtin handled here.
Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
                                                   const CallExpr *E) {
  switch (BuiltinID) {
  case WebAssembly::BI__builtin_wasm_memory_size: {
    // The result type (i32 or i64) selects the intrinsic overload.
    llvm::Type *ResultType = ConvertType(E->getType());
    Value *I = EmitScalarExpr(E->getArg(0));
    Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_memory_size, ResultType);
    return Builder.CreateCall(Callee, I);
  }
  case WebAssembly::BI__builtin_wasm_memory_grow: {
    llvm::Type *ResultType = ConvertType(E->getType());
    // Operands: memory index, page delta.
    Value *Args[] = {
      EmitScalarExpr(E->getArg(0)),
      EmitScalarExpr(E->getArg(1))
    };
    Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_memory_grow, ResultType);
    return Builder.CreateCall(Callee, Args);
  }
  // Older spellings of the memory.size/memory.grow builtins, lowered the
  // same way but onto their correspondingly-named intrinsics.
  case WebAssembly::BI__builtin_wasm_mem_size: {
    llvm::Type *ResultType = ConvertType(E->getType());
    Value *I = EmitScalarExpr(E->getArg(0));
    Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_mem_size, ResultType);
    return Builder.CreateCall(Callee, I);
  }
  case WebAssembly::BI__builtin_wasm_mem_grow: {
    llvm::Type *ResultType = ConvertType(E->getType());
    Value *Args[] = {
      EmitScalarExpr(E->getArg(0)),
      EmitScalarExpr(E->getArg(1))
    };
    Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_mem_grow, ResultType);
    return Builder.CreateCall(Callee, Args);
  }
  // Legacy current_memory/grow_memory forms; these take no memory index.
  case WebAssembly::BI__builtin_wasm_current_memory: {
    llvm::Type *ResultType = ConvertType(E->getType());
    Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_current_memory, ResultType);
    return Builder.CreateCall(Callee);
  }
  case WebAssembly::BI__builtin_wasm_grow_memory: {
    Value *X = EmitScalarExpr(E->getArg(0));
    Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_grow_memory, X->getType());
    return Builder.CreateCall(Callee, X);
  }
  // Exception-handling builtins: throw takes a tag and an exception object.
  case WebAssembly::BI__builtin_wasm_throw: {
    Value *Tag = EmitScalarExpr(E->getArg(0));
    Value *Obj = EmitScalarExpr(E->getArg(1));
    Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_throw);
    return Builder.CreateCall(Callee, {Tag, Obj});
  }
  case WebAssembly::BI__builtin_wasm_rethrow: {
    Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_rethrow);
    return Builder.CreateCall(Callee);
  }
  // Atomic wait/notify: operands are (address, expected, timeout) and
  // (address, count) respectively.
  case WebAssembly::BI__builtin_wasm_atomic_wait_i32: {
    Value *Addr = EmitScalarExpr(E->getArg(0));
    Value *Expected = EmitScalarExpr(E->getArg(1));
    Value *Timeout = EmitScalarExpr(E->getArg(2));
    Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_atomic_wait_i32);
    return Builder.CreateCall(Callee, {Addr, Expected, Timeout});
  }
  case WebAssembly::BI__builtin_wasm_atomic_wait_i64: {
    Value *Addr = EmitScalarExpr(E->getArg(0));
    Value *Expected = EmitScalarExpr(E->getArg(1));
    Value *Timeout = EmitScalarExpr(E->getArg(2));
    Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_atomic_wait_i64);
    return Builder.CreateCall(Callee, {Addr, Expected, Timeout});
  }
  case WebAssembly::BI__builtin_wasm_atomic_notify: {
    Value *Addr = EmitScalarExpr(E->getArg(0));
    Value *Count = EmitScalarExpr(E->getArg(1));
    Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_atomic_notify);
    return Builder.CreateCall(Callee, {Addr, Count});
  }
  // Saturating float-to-int truncations. The intrinsic is overloaded on
  // both the result and the source type, so one intrinsic ID covers the
  // scalar and SIMD variants for each signedness.
  case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32_f32:
  case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32_f64:
  case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i64_f32:
  case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i64_f64:
  case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32x4_f32x4:
  case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i64x2_f64x2: {
    Value *Src = EmitScalarExpr(E->getArg(0));
    llvm::Type *ResT = ConvertType(E->getType());
    Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_trunc_saturate_signed,
                                     {ResT, Src->getType()});
    return Builder.CreateCall(Callee, {Src});
  }
  case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32_f32:
  case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32_f64:
  case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i64_f32:
  case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i64_f64:
  case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32x4_f32x4:
  case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i64x2_f64x2: {
    Value *Src = EmitScalarExpr(E->getArg(0));
    llvm::Type *ResT = ConvertType(E->getType());
    Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_trunc_saturate_unsigned,
                                     {ResT, Src->getType()});
    return Builder.CreateCall(Callee, {Src});
  }
  // min/max are lowered to the generic llvm.minimum/llvm.maximum
  // intrinsics, overloaded on the (scalar or vector) result type.
  case WebAssembly::BI__builtin_wasm_min_f32:
  case WebAssembly::BI__builtin_wasm_min_f64:
  case WebAssembly::BI__builtin_wasm_min_f32x4:
  case WebAssembly::BI__builtin_wasm_min_f64x2: {
    Value *LHS = EmitScalarExpr(E->getArg(0));
    Value *RHS = EmitScalarExpr(E->getArg(1));
    Value *Callee = CGM.getIntrinsic(Intrinsic::minimum,
                                     ConvertType(E->getType()));
    return Builder.CreateCall(Callee, {LHS, RHS});
  }
  case WebAssembly::BI__builtin_wasm_max_f32:
  case WebAssembly::BI__builtin_wasm_max_f64:
  case WebAssembly::BI__builtin_wasm_max_f32x4:
  case WebAssembly::BI__builtin_wasm_max_f64x2: {
    Value *LHS = EmitScalarExpr(E->getArg(0));
    Value *RHS = EmitScalarExpr(E->getArg(1));
    Value *Callee = CGM.getIntrinsic(Intrinsic::maximum,
                                     ConvertType(E->getType()));
    return Builder.CreateCall(Callee, {LHS, RHS});
  }
  // SIMD lane extraction: lowered directly to extractelement, followed by
  // sign/zero extension for the sub-word (i8x16/i16x8) variants.
  case WebAssembly::BI__builtin_wasm_extract_lane_s_i8x16:
  case WebAssembly::BI__builtin_wasm_extract_lane_u_i8x16:
  case WebAssembly::BI__builtin_wasm_extract_lane_s_i16x8:
  case WebAssembly::BI__builtin_wasm_extract_lane_u_i16x8:
  case WebAssembly::BI__builtin_wasm_extract_lane_i32x4:
  case WebAssembly::BI__builtin_wasm_extract_lane_i64x2:
  case WebAssembly::BI__builtin_wasm_extract_lane_f32x4:
  case WebAssembly::BI__builtin_wasm_extract_lane_f64x2: {
    // The lane index (arg 1) is assumed to have been checked for
    // constant-ness before codegen; hence the unreachable on failure.
    llvm::APSInt LaneConst;
    if (!E->getArg(1)->isIntegerConstantExpr(LaneConst, getContext()))
      llvm_unreachable("Constant arg isn't actually constant?");
    Value *Vec = EmitScalarExpr(E->getArg(0));
    Value *Lane = llvm::ConstantInt::get(getLLVMContext(), LaneConst);
    Value *Extract = Builder.CreateExtractElement(Vec, Lane);
    switch (BuiltinID) {
    case WebAssembly::BI__builtin_wasm_extract_lane_s_i8x16:
    case WebAssembly::BI__builtin_wasm_extract_lane_s_i16x8:
      return Builder.CreateSExt(Extract, ConvertType(E->getType()));
    case WebAssembly::BI__builtin_wasm_extract_lane_u_i8x16:
    case WebAssembly::BI__builtin_wasm_extract_lane_u_i16x8:
      return Builder.CreateZExt(Extract, ConvertType(E->getType()));
    // Full-width lanes need no extension.
    case WebAssembly::BI__builtin_wasm_extract_lane_i32x4:
    case WebAssembly::BI__builtin_wasm_extract_lane_i64x2:
    case WebAssembly::BI__builtin_wasm_extract_lane_f32x4:
    case WebAssembly::BI__builtin_wasm_extract_lane_f64x2:
      return Extract;
    default:
      llvm_unreachable("unexpected builtin ID");
    }
  }
  // SIMD lane replacement: lowered to insertelement; the scalar value is
  // truncated first for sub-word lanes since the builtin takes an i32.
  case WebAssembly::BI__builtin_wasm_replace_lane_i8x16:
  case WebAssembly::BI__builtin_wasm_replace_lane_i16x8:
  case WebAssembly::BI__builtin_wasm_replace_lane_i32x4:
  case WebAssembly::BI__builtin_wasm_replace_lane_i64x2:
  case WebAssembly::BI__builtin_wasm_replace_lane_f32x4:
  case WebAssembly::BI__builtin_wasm_replace_lane_f64x2: {
    llvm::APSInt LaneConst;
    if (!E->getArg(1)->isIntegerConstantExpr(LaneConst, getContext()))
      llvm_unreachable("Constant arg isn't actually constant?");
    Value *Vec = EmitScalarExpr(E->getArg(0));
    Value *Lane = llvm::ConstantInt::get(getLLVMContext(), LaneConst);
    Value *Val = EmitScalarExpr(E->getArg(2));
    switch (BuiltinID) {
    case WebAssembly::BI__builtin_wasm_replace_lane_i8x16:
    case WebAssembly::BI__builtin_wasm_replace_lane_i16x8: {
      llvm::Type *ElemType = ConvertType(E->getType())->getVectorElementType();
      Value *Trunc = Builder.CreateTrunc(Val, ElemType);
      return Builder.CreateInsertElement(Vec, Trunc, Lane);
    }
    case WebAssembly::BI__builtin_wasm_replace_lane_i32x4:
    case WebAssembly::BI__builtin_wasm_replace_lane_i64x2:
    case WebAssembly::BI__builtin_wasm_replace_lane_f32x4:
    case WebAssembly::BI__builtin_wasm_replace_lane_f64x2:
      return Builder.CreateInsertElement(Vec, Val, Lane);
    default:
      llvm_unreachable("unexpected builtin ID");
    }
  }
  // Saturating vector add/sub. Additions use the generic LLVM
  // sadd.sat/uadd.sat intrinsics; subtractions use wasm-specific ones.
  case WebAssembly::BI__builtin_wasm_add_saturate_s_i8x16:
  case WebAssembly::BI__builtin_wasm_add_saturate_u_i8x16:
  case WebAssembly::BI__builtin_wasm_add_saturate_s_i16x8:
  case WebAssembly::BI__builtin_wasm_add_saturate_u_i16x8:
  case WebAssembly::BI__builtin_wasm_sub_saturate_s_i8x16:
  case WebAssembly::BI__builtin_wasm_sub_saturate_u_i8x16:
  case WebAssembly::BI__builtin_wasm_sub_saturate_s_i16x8:
  case WebAssembly::BI__builtin_wasm_sub_saturate_u_i16x8: {
    unsigned IntNo;
    switch (BuiltinID) {
    case WebAssembly::BI__builtin_wasm_add_saturate_s_i8x16:
    case WebAssembly::BI__builtin_wasm_add_saturate_s_i16x8:
      IntNo = Intrinsic::sadd_sat;
      break;
    case WebAssembly::BI__builtin_wasm_add_saturate_u_i8x16:
    case WebAssembly::BI__builtin_wasm_add_saturate_u_i16x8:
      IntNo = Intrinsic::uadd_sat;
      break;
    case WebAssembly::BI__builtin_wasm_sub_saturate_s_i8x16:
    case WebAssembly::BI__builtin_wasm_sub_saturate_s_i16x8:
      IntNo = Intrinsic::wasm_sub_saturate_signed;
      break;
    case WebAssembly::BI__builtin_wasm_sub_saturate_u_i8x16:
    case WebAssembly::BI__builtin_wasm_sub_saturate_u_i16x8:
      IntNo = Intrinsic::wasm_sub_saturate_unsigned;
      break;
    default:
      llvm_unreachable("unexpected builtin ID");
    }
    Value *LHS = EmitScalarExpr(E->getArg(0));
    Value *RHS = EmitScalarExpr(E->getArg(1));
    Value *Callee = CGM.getIntrinsic(IntNo, ConvertType(E->getType()));
    return Builder.CreateCall(Callee, {LHS, RHS});
  }
  case WebAssembly::BI__builtin_wasm_bitselect: {
    // Operands: two value vectors and the bit mask vector.
    Value *V1 = EmitScalarExpr(E->getArg(0));
    Value *V2 = EmitScalarExpr(E->getArg(1));
    Value *C = EmitScalarExpr(E->getArg(2));
    Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_bitselect,
                                     ConvertType(E->getType()));
    return Builder.CreateCall(Callee, {V1, V2, C});
  }
  // Vector reductions: anytrue/alltrue, overloaded on the operand type.
  case WebAssembly::BI__builtin_wasm_any_true_i8x16:
  case WebAssembly::BI__builtin_wasm_any_true_i16x8:
  case WebAssembly::BI__builtin_wasm_any_true_i32x4:
  case WebAssembly::BI__builtin_wasm_any_true_i64x2:
  case WebAssembly::BI__builtin_wasm_all_true_i8x16:
  case WebAssembly::BI__builtin_wasm_all_true_i16x8:
  case WebAssembly::BI__builtin_wasm_all_true_i32x4:
  case WebAssembly::BI__builtin_wasm_all_true_i64x2: {
    unsigned IntNo;
    switch (BuiltinID) {
    case WebAssembly::BI__builtin_wasm_any_true_i8x16:
    case WebAssembly::BI__builtin_wasm_any_true_i16x8:
    case WebAssembly::BI__builtin_wasm_any_true_i32x4:
    case WebAssembly::BI__builtin_wasm_any_true_i64x2:
      IntNo = Intrinsic::wasm_anytrue;
      break;
    case WebAssembly::BI__builtin_wasm_all_true_i8x16:
    case WebAssembly::BI__builtin_wasm_all_true_i16x8:
    case WebAssembly::BI__builtin_wasm_all_true_i32x4:
    case WebAssembly::BI__builtin_wasm_all_true_i64x2:
      IntNo = Intrinsic::wasm_alltrue;
      break;
    default:
      llvm_unreachable("unexpected builtin ID");
    }
    Value *Vec = EmitScalarExpr(E->getArg(0));
    Value *Callee = CGM.getIntrinsic(IntNo, Vec->getType());
    return Builder.CreateCall(Callee, {Vec});
  }
  // abs/sqrt map onto the generic llvm.fabs/llvm.sqrt intrinsics.
  case WebAssembly::BI__builtin_wasm_abs_f32x4:
  case WebAssembly::BI__builtin_wasm_abs_f64x2: {
    Value *Vec = EmitScalarExpr(E->getArg(0));
    Value *Callee = CGM.getIntrinsic(Intrinsic::fabs, Vec->getType());
    return Builder.CreateCall(Callee, {Vec});
  }
  case WebAssembly::BI__builtin_wasm_sqrt_f32x4:
  case WebAssembly::BI__builtin_wasm_sqrt_f64x2: {
    Value *Vec = EmitScalarExpr(E->getArg(0));
    Value *Callee = CGM.getIntrinsic(Intrinsic::sqrt, Vec->getType());
    return Builder.CreateCall(Callee, {Vec});
  }

  default:
    // Not a WebAssembly builtin handled here.
    return nullptr;
  }
}
13127 
/// EmitHexagonBuiltinExpr - Lower a Hexagon target builtin call to LLVM IR.
/// Handles the HVX add/sub-with-carry builtins, the circular-addressing
/// load/store builtins, and the bit-reverse load builtins; returns nullptr
/// for any other BuiltinID.
Value *CodeGenFunction::EmitHexagonBuiltinExpr(unsigned BuiltinID,
                                               const CallExpr *E) {
  SmallVector<llvm::Value *, 4> Ops;
  Intrinsic::ID ID = Intrinsic::not_intrinsic;

  // Emits a circular-addressing load. HasImm distinguishes the
  // immediate-increment (_pci) form, which carries an extra operand, from
  // the register (_pcr) form.
  auto MakeCircLd = [&](unsigned IntID, bool HasImm) {
    // The base pointer is passed by address, so it needs to be loaded.
    Address BP = EmitPointerWithAlignment(E->getArg(0));
    BP = Address(Builder.CreateBitCast(BP.getPointer(), Int8PtrPtrTy),
                 BP.getAlignment());
    llvm::Value *Base = Builder.CreateLoad(BP);
    // Operands are Base, Increment, Modifier, Start.
    if (HasImm)
      Ops = { Base, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)),
              EmitScalarExpr(E->getArg(3)) };
    else
      Ops = { Base, EmitScalarExpr(E->getArg(1)),
              EmitScalarExpr(E->getArg(2)) };

    llvm::Value *Result = Builder.CreateCall(CGM.getIntrinsic(IntID), Ops);
    // NOTE(review): E->getArg(0) is emitted a second time here (and again in
    // EmitPointerWithAlignment below); a side-effecting argument expression
    // would be evaluated more than once — confirm intended.
    llvm::Value *NewBase = Builder.CreateExtractValue(Result, 1);
    llvm::Value *LV = Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)),
                                            NewBase->getType()->getPointerTo());
    Address Dest = EmitPointerWithAlignment(E->getArg(0));
    // The intrinsic generates two results. The new value for the base pointer
    // needs to be stored.
    Builder.CreateAlignedStore(NewBase, LV, Dest.getAlignment());
    // Result 0 is the loaded value.
    return Builder.CreateExtractValue(Result, 0);
  };

  // Emits a circular-addressing store; same _pci/_pcr split as MakeCircLd,
  // with the value-to-store as an extra operand.
  auto MakeCircSt = [&](unsigned IntID, bool HasImm) {
    // The base pointer is passed by address, so it needs to be loaded.
    Address BP = EmitPointerWithAlignment(E->getArg(0));
    BP = Address(Builder.CreateBitCast(BP.getPointer(), Int8PtrPtrTy),
                 BP.getAlignment());
    llvm::Value *Base = Builder.CreateLoad(BP);
    // Operands are Base, Increment, Modifier, Value, Start.
    if (HasImm)
      Ops = { Base, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)),
              EmitScalarExpr(E->getArg(3)), EmitScalarExpr(E->getArg(4)) };
    else
      Ops = { Base, EmitScalarExpr(E->getArg(1)),
              EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3)) };

    llvm::Value *NewBase = Builder.CreateCall(CGM.getIntrinsic(IntID), Ops);
    // NOTE(review): as in MakeCircLd, arg(0) is re-emitted here — a
    // side-effecting argument would be evaluated more than once.
    llvm::Value *LV = Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)),
                                            NewBase->getType()->getPointerTo());
    Address Dest = EmitPointerWithAlignment(E->getArg(0));
    // The intrinsic generates one result, which is the new value for the base
    // pointer. It needs to be stored.
    return Builder.CreateAlignedStore(NewBase, LV, Dest.getAlignment());
  };

  // Handle the conversion of bit-reverse load intrinsics to bit code.
  // The intrinsic call after this function only reads from memory and the
  // write to memory is dealt by the store instruction.
  auto MakeBrevLd = [&](unsigned IntID, llvm::Type *DestTy) {
    // The intrinsic generates one result, which is the new value for the base
    // pointer. It needs to be returned. The result of the load instruction is
    // passed to intrinsic by address, so the value needs to be stored.
    llvm::Value *BaseAddress =
        Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), Int8PtrTy);

    // Expressions like &(*pt++) will be incremented per evaluation.
    // EmitPointerWithAlignment and EmitScalarExpr evaluates the expression
    // per call.
    Address DestAddr = EmitPointerWithAlignment(E->getArg(1));
    DestAddr = Address(Builder.CreateBitCast(DestAddr.getPointer(), Int8PtrTy),
                       DestAddr.getAlignment());
    llvm::Value *DestAddress = DestAddr.getPointer();

    // Operands are Base, Dest, Modifier.
    // The intrinsic format in LLVM IR is defined as
    // { ValueType, i8* } (i8*, i32).
    Ops = {BaseAddress, EmitScalarExpr(E->getArg(2))};

    llvm::Value *Result = Builder.CreateCall(CGM.getIntrinsic(IntID), Ops);
    // The value needs to be stored as the variable is passed by reference.
    llvm::Value *DestVal = Builder.CreateExtractValue(Result, 0);

    // The store needs to be truncated to fit the destination type.
    // While i32 and i64 are natively supported on Hexagon, i8 and i16 needs
    // to be handled with stores of respective destination type.
    DestVal = Builder.CreateTrunc(DestVal, DestTy);

    llvm::Value *DestForStore =
        Builder.CreateBitCast(DestAddress, DestVal->getType()->getPointerTo());
    Builder.CreateAlignedStore(DestVal, DestForStore, DestAddr.getAlignment());
    // The updated value of the base pointer is returned.
    return Builder.CreateExtractValue(Result, 1);
  };

  switch (BuiltinID) {
  case Hexagon::BI__builtin_HEXAGON_V6_vaddcarry:
  case Hexagon::BI__builtin_HEXAGON_V6_vaddcarry_128B: {
    // The carry predicate is read from and written back through arg(2),
    // reinterpreted as a vector of i1 (512 or 1024 bits wide).
    Address Dest = EmitPointerWithAlignment(E->getArg(2));
    unsigned Size;
    if (BuiltinID == Hexagon::BI__builtin_HEXAGON_V6_vaddcarry) {
      Size = 512;
      ID = Intrinsic::hexagon_V6_vaddcarry;
    } else {
      Size = 1024;
      ID = Intrinsic::hexagon_V6_vaddcarry_128B;
    }
    Dest = Builder.CreateBitCast(Dest,
        llvm::VectorType::get(Builder.getInt1Ty(), Size)->getPointerTo(0));
    LoadInst *QLd = Builder.CreateLoad(Dest);
    Ops = { EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)), QLd };
    llvm::Value *Result = Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
    // Result 1 is the carry-out predicate; store it back through arg(2).
    llvm::Value *Vprd = Builder.CreateExtractValue(Result, 1);
    llvm::Value *Base = Builder.CreateBitCast(EmitScalarExpr(E->getArg(2)),
                                              Vprd->getType()->getPointerTo());
    Builder.CreateAlignedStore(Vprd, Base, Dest.getAlignment());
    // Result 0 is the sum vector.
    return Builder.CreateExtractValue(Result, 0);
  }
  case Hexagon::BI__builtin_HEXAGON_V6_vsubcarry:
  case Hexagon::BI__builtin_HEXAGON_V6_vsubcarry_128B: {
    // Same shape as vaddcarry above, with the subtract intrinsics.
    Address Dest = EmitPointerWithAlignment(E->getArg(2));
    unsigned Size;
    if (BuiltinID == Hexagon::BI__builtin_HEXAGON_V6_vsubcarry) {
      Size = 512;
      ID = Intrinsic::hexagon_V6_vsubcarry;
    } else {
      Size = 1024;
      ID = Intrinsic::hexagon_V6_vsubcarry_128B;
    }
    Dest = Builder.CreateBitCast(Dest,
        llvm::VectorType::get(Builder.getInt1Ty(), Size)->getPointerTo(0));
    LoadInst *QLd = Builder.CreateLoad(Dest);
    Ops = { EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)), QLd };
    llvm::Value *Result = Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
    llvm::Value *Vprd = Builder.CreateExtractValue(Result, 1);
    llvm::Value *Base = Builder.CreateBitCast(EmitScalarExpr(E->getArg(2)),
                                              Vprd->getType()->getPointerTo(0));
    Builder.CreateAlignedStore(Vprd, Base, Dest.getAlignment());
    return Builder.CreateExtractValue(Result, 0);
  }
  // Circular loads: _pci forms take an immediate increment, _pcr forms a
  // register modifier only.
  case Hexagon::BI__builtin_HEXAGON_L2_loadrub_pci:
    return MakeCircLd(Intrinsic::hexagon_L2_loadrub_pci, /*HasImm*/true);
  case Hexagon::BI__builtin_HEXAGON_L2_loadrb_pci:
    return MakeCircLd(Intrinsic::hexagon_L2_loadrb_pci,  /*HasImm*/true);
  case Hexagon::BI__builtin_HEXAGON_L2_loadruh_pci:
    return MakeCircLd(Intrinsic::hexagon_L2_loadruh_pci, /*HasImm*/true);
  case Hexagon::BI__builtin_HEXAGON_L2_loadrh_pci:
    return MakeCircLd(Intrinsic::hexagon_L2_loadrh_pci,  /*HasImm*/true);
  case Hexagon::BI__builtin_HEXAGON_L2_loadri_pci:
    return MakeCircLd(Intrinsic::hexagon_L2_loadri_pci,  /*HasImm*/true);
  case Hexagon::BI__builtin_HEXAGON_L2_loadrd_pci:
    return MakeCircLd(Intrinsic::hexagon_L2_loadrd_pci,  /*HasImm*/true);
  case Hexagon::BI__builtin_HEXAGON_L2_loadrub_pcr:
    return MakeCircLd(Intrinsic::hexagon_L2_loadrub_pcr, /*HasImm*/false);
  case Hexagon::BI__builtin_HEXAGON_L2_loadrb_pcr:
    return MakeCircLd(Intrinsic::hexagon_L2_loadrb_pcr,  /*HasImm*/false);
  case Hexagon::BI__builtin_HEXAGON_L2_loadruh_pcr:
    return MakeCircLd(Intrinsic::hexagon_L2_loadruh_pcr, /*HasImm*/false);
  case Hexagon::BI__builtin_HEXAGON_L2_loadrh_pcr:
    return MakeCircLd(Intrinsic::hexagon_L2_loadrh_pcr,  /*HasImm*/false);
  case Hexagon::BI__builtin_HEXAGON_L2_loadri_pcr:
    return MakeCircLd(Intrinsic::hexagon_L2_loadri_pcr,  /*HasImm*/false);
  case Hexagon::BI__builtin_HEXAGON_L2_loadrd_pcr:
    return MakeCircLd(Intrinsic::hexagon_L2_loadrd_pcr,  /*HasImm*/false);
  // Circular stores, same _pci/_pcr split.
  case Hexagon::BI__builtin_HEXAGON_S2_storerb_pci:
    return MakeCircSt(Intrinsic::hexagon_S2_storerb_pci, /*HasImm*/true);
  case Hexagon::BI__builtin_HEXAGON_S2_storerh_pci:
    return MakeCircSt(Intrinsic::hexagon_S2_storerh_pci, /*HasImm*/true);
  case Hexagon::BI__builtin_HEXAGON_S2_storerf_pci:
    return MakeCircSt(Intrinsic::hexagon_S2_storerf_pci, /*HasImm*/true);
  case Hexagon::BI__builtin_HEXAGON_S2_storeri_pci:
    return MakeCircSt(Intrinsic::hexagon_S2_storeri_pci, /*HasImm*/true);
  case Hexagon::BI__builtin_HEXAGON_S2_storerd_pci:
    return MakeCircSt(Intrinsic::hexagon_S2_storerd_pci, /*HasImm*/true);
  case Hexagon::BI__builtin_HEXAGON_S2_storerb_pcr:
    return MakeCircSt(Intrinsic::hexagon_S2_storerb_pcr, /*HasImm*/false);
  case Hexagon::BI__builtin_HEXAGON_S2_storerh_pcr:
    return MakeCircSt(Intrinsic::hexagon_S2_storerh_pcr, /*HasImm*/false);
  case Hexagon::BI__builtin_HEXAGON_S2_storerf_pcr:
    return MakeCircSt(Intrinsic::hexagon_S2_storerf_pcr, /*HasImm*/false);
  case Hexagon::BI__builtin_HEXAGON_S2_storeri_pcr:
    return MakeCircSt(Intrinsic::hexagon_S2_storeri_pcr, /*HasImm*/false);
  case Hexagon::BI__builtin_HEXAGON_S2_storerd_pcr:
    return MakeCircSt(Intrinsic::hexagon_S2_storerd_pcr, /*HasImm*/false);
  // Bit-reverse loads; the second argument gives the store's element type.
  case Hexagon::BI__builtin_brev_ldub:
    return MakeBrevLd(Intrinsic::hexagon_L2_loadrub_pbr, Int8Ty);
  case Hexagon::BI__builtin_brev_ldb:
    return MakeBrevLd(Intrinsic::hexagon_L2_loadrb_pbr, Int8Ty);
  case Hexagon::BI__builtin_brev_lduh:
    return MakeBrevLd(Intrinsic::hexagon_L2_loadruh_pbr, Int16Ty);
  case Hexagon::BI__builtin_brev_ldh:
    return MakeBrevLd(Intrinsic::hexagon_L2_loadrh_pbr, Int16Ty);
  case Hexagon::BI__builtin_brev_ldw:
    return MakeBrevLd(Intrinsic::hexagon_L2_loadri_pbr, Int32Ty);
  case Hexagon::BI__builtin_brev_ldd:
    return MakeBrevLd(Intrinsic::hexagon_L2_loadrd_pbr, Int64Ty);
  default:
    break;
  } // switch

  // Not a Hexagon builtin handled here.
  return nullptr;
}
13327