1 //===---- CGBuiltin.cpp - Emit LLVM Code for builtins ---------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This contains code to emit Builtin calls as LLVM code.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "CGCXXABI.h"
15 #include "CGObjCRuntime.h"
16 #include "CGOpenCLRuntime.h"
17 #include "CGRecordLayout.h"
18 #include "CodeGenFunction.h"
19 #include "CodeGenModule.h"
20 #include "ConstantEmitter.h"
21 #include "TargetInfo.h"
22 #include "clang/AST/ASTContext.h"
23 #include "clang/AST/Decl.h"
24 #include "clang/Analysis/Analyses/OSLog.h"
25 #include "clang/Basic/TargetBuiltins.h"
26 #include "clang/Basic/TargetInfo.h"
27 #include "clang/CodeGen/CGFunctionInfo.h"
28 #include "llvm/ADT/StringExtras.h"
29 #include "llvm/IR/CallSite.h"
30 #include "llvm/IR/DataLayout.h"
31 #include "llvm/IR/InlineAsm.h"
32 #include "llvm/IR/Intrinsics.h"
33 #include "llvm/IR/MDBuilder.h"
34 #include "llvm/Support/ConvertUTF.h"
35 #include "llvm/Support/ScopedPrinter.h"
36 #include "llvm/Support/TargetParser.h"
37 #include <sstream>
38 
39 using namespace clang;
40 using namespace CodeGen;
41 using namespace llvm;
42 
/// Restrict \p Value to the range bounded by \p Low and \p High.
///
/// The lower bound is applied first and the upper bound last, so \p High is
/// returned whenever the lower-bounded value exceeds it (including the case
/// where the bounds are inverted).
static int64_t clamp(int64_t Value, int64_t Low, int64_t High) {
  const int64_t RaisedToLow = Low < Value ? Value : Low;
  return RaisedToLow < High ? RaisedToLow : High;
}
47 
48 /// getBuiltinLibFunction - Given a builtin id for a function like
49 /// "__builtin_fabsf", return a Function* for "fabsf".
50 llvm::Constant *CodeGenModule::getBuiltinLibFunction(const FunctionDecl *FD,
51                                                      unsigned BuiltinID) {
52   assert(Context.BuiltinInfo.isLibFunction(BuiltinID));
53 
54   // Get the name, skip over the __builtin_ prefix (if necessary).
55   StringRef Name;
56   GlobalDecl D(FD);
57 
58   // If the builtin has been declared explicitly with an assembler label,
59   // use the mangled name. This differs from the plain label on platforms
60   // that prefix labels.
61   if (FD->hasAttr<AsmLabelAttr>())
62     Name = getMangledName(D);
63   else
64     Name = Context.BuiltinInfo.getName(BuiltinID) + 10;
65 
66   llvm::FunctionType *Ty =
67     cast<llvm::FunctionType>(getTypes().ConvertType(FD->getType()));
68 
69   return GetOrCreateLLVMFunction(Name, Ty, D, /*ForVTable=*/false);
70 }
71 
72 /// Emit the conversions required to turn the given value into an
73 /// integer of the given size.
74 static Value *EmitToInt(CodeGenFunction &CGF, llvm::Value *V,
75                         QualType T, llvm::IntegerType *IntType) {
76   V = CGF.EmitToMemory(V, T);
77 
78   if (V->getType()->isPointerTy())
79     return CGF.Builder.CreatePtrToInt(V, IntType);
80 
81   assert(V->getType() == IntType);
82   return V;
83 }
84 
85 static Value *EmitFromInt(CodeGenFunction &CGF, llvm::Value *V,
86                           QualType T, llvm::Type *ResultType) {
87   V = CGF.EmitFromMemory(V, T);
88 
89   if (ResultType->isPointerTy())
90     return CGF.Builder.CreateIntToPtr(V, ResultType);
91 
92   assert(V->getType() == ResultType);
93   return V;
94 }
95 
96 /// Utility to insert an atomic instruction based on Instrinsic::ID
97 /// and the expression node.
98 static Value *MakeBinaryAtomicValue(CodeGenFunction &CGF,
99                                     llvm::AtomicRMWInst::BinOp Kind,
100                                     const CallExpr *E) {
101   QualType T = E->getType();
102   assert(E->getArg(0)->getType()->isPointerType());
103   assert(CGF.getContext().hasSameUnqualifiedType(T,
104                                   E->getArg(0)->getType()->getPointeeType()));
105   assert(CGF.getContext().hasSameUnqualifiedType(T, E->getArg(1)->getType()));
106 
107   llvm::Value *DestPtr = CGF.EmitScalarExpr(E->getArg(0));
108   unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();
109 
110   llvm::IntegerType *IntType =
111     llvm::IntegerType::get(CGF.getLLVMContext(),
112                            CGF.getContext().getTypeSize(T));
113   llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);
114 
115   llvm::Value *Args[2];
116   Args[0] = CGF.Builder.CreateBitCast(DestPtr, IntPtrType);
117   Args[1] = CGF.EmitScalarExpr(E->getArg(1));
118   llvm::Type *ValueType = Args[1]->getType();
119   Args[1] = EmitToInt(CGF, Args[1], T, IntType);
120 
121   llvm::Value *Result = CGF.Builder.CreateAtomicRMW(
122       Kind, Args[0], Args[1], llvm::AtomicOrdering::SequentiallyConsistent);
123   return EmitFromInt(CGF, Result, T, ValueType);
124 }
125 
126 static Value *EmitNontemporalStore(CodeGenFunction &CGF, const CallExpr *E) {
127   Value *Val = CGF.EmitScalarExpr(E->getArg(0));
128   Value *Address = CGF.EmitScalarExpr(E->getArg(1));
129 
130   // Convert the type of the pointer to a pointer to the stored type.
131   Val = CGF.EmitToMemory(Val, E->getArg(0)->getType());
132   Value *BC = CGF.Builder.CreateBitCast(
133       Address, llvm::PointerType::getUnqual(Val->getType()), "cast");
134   LValue LV = CGF.MakeNaturalAlignAddrLValue(BC, E->getArg(0)->getType());
135   LV.setNontemporal(true);
136   CGF.EmitStoreOfScalar(Val, LV, false);
137   return nullptr;
138 }
139 
140 static Value *EmitNontemporalLoad(CodeGenFunction &CGF, const CallExpr *E) {
141   Value *Address = CGF.EmitScalarExpr(E->getArg(0));
142 
143   LValue LV = CGF.MakeNaturalAlignAddrLValue(Address, E->getType());
144   LV.setNontemporal(true);
145   return CGF.EmitLoadOfScalar(LV, E->getExprLoc());
146 }
147 
148 static RValue EmitBinaryAtomic(CodeGenFunction &CGF,
149                                llvm::AtomicRMWInst::BinOp Kind,
150                                const CallExpr *E) {
151   return RValue::get(MakeBinaryAtomicValue(CGF, Kind, E));
152 }
153 
154 /// Utility to insert an atomic instruction based Instrinsic::ID and
155 /// the expression node, where the return value is the result of the
156 /// operation.
157 static RValue EmitBinaryAtomicPost(CodeGenFunction &CGF,
158                                    llvm::AtomicRMWInst::BinOp Kind,
159                                    const CallExpr *E,
160                                    Instruction::BinaryOps Op,
161                                    bool Invert = false) {
162   QualType T = E->getType();
163   assert(E->getArg(0)->getType()->isPointerType());
164   assert(CGF.getContext().hasSameUnqualifiedType(T,
165                                   E->getArg(0)->getType()->getPointeeType()));
166   assert(CGF.getContext().hasSameUnqualifiedType(T, E->getArg(1)->getType()));
167 
168   llvm::Value *DestPtr = CGF.EmitScalarExpr(E->getArg(0));
169   unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();
170 
171   llvm::IntegerType *IntType =
172     llvm::IntegerType::get(CGF.getLLVMContext(),
173                            CGF.getContext().getTypeSize(T));
174   llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);
175 
176   llvm::Value *Args[2];
177   Args[1] = CGF.EmitScalarExpr(E->getArg(1));
178   llvm::Type *ValueType = Args[1]->getType();
179   Args[1] = EmitToInt(CGF, Args[1], T, IntType);
180   Args[0] = CGF.Builder.CreateBitCast(DestPtr, IntPtrType);
181 
182   llvm::Value *Result = CGF.Builder.CreateAtomicRMW(
183       Kind, Args[0], Args[1], llvm::AtomicOrdering::SequentiallyConsistent);
184   Result = CGF.Builder.CreateBinOp(Op, Result, Args[1]);
185   if (Invert)
186     Result = CGF.Builder.CreateBinOp(llvm::Instruction::Xor, Result,
187                                      llvm::ConstantInt::get(IntType, -1));
188   Result = EmitFromInt(CGF, Result, T, ValueType);
189   return RValue::get(Result);
190 }
191 
192 /// @brief Utility to insert an atomic cmpxchg instruction.
193 ///
194 /// @param CGF The current codegen function.
195 /// @param E   Builtin call expression to convert to cmpxchg.
196 ///            arg0 - address to operate on
197 ///            arg1 - value to compare with
198 ///            arg2 - new value
199 /// @param ReturnBool Specifies whether to return success flag of
200 ///                   cmpxchg result or the old value.
201 ///
202 /// @returns result of cmpxchg, according to ReturnBool
203 static Value *MakeAtomicCmpXchgValue(CodeGenFunction &CGF, const CallExpr *E,
204                                      bool ReturnBool) {
205   QualType T = ReturnBool ? E->getArg(1)->getType() : E->getType();
206   llvm::Value *DestPtr = CGF.EmitScalarExpr(E->getArg(0));
207   unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();
208 
209   llvm::IntegerType *IntType = llvm::IntegerType::get(
210       CGF.getLLVMContext(), CGF.getContext().getTypeSize(T));
211   llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);
212 
213   Value *Args[3];
214   Args[0] = CGF.Builder.CreateBitCast(DestPtr, IntPtrType);
215   Args[1] = CGF.EmitScalarExpr(E->getArg(1));
216   llvm::Type *ValueType = Args[1]->getType();
217   Args[1] = EmitToInt(CGF, Args[1], T, IntType);
218   Args[2] = EmitToInt(CGF, CGF.EmitScalarExpr(E->getArg(2)), T, IntType);
219 
220   Value *Pair = CGF.Builder.CreateAtomicCmpXchg(
221       Args[0], Args[1], Args[2], llvm::AtomicOrdering::SequentiallyConsistent,
222       llvm::AtomicOrdering::SequentiallyConsistent);
223   if (ReturnBool)
224     // Extract boolean success flag and zext it to int.
225     return CGF.Builder.CreateZExt(CGF.Builder.CreateExtractValue(Pair, 1),
226                                   CGF.ConvertType(E->getType()));
227   else
228     // Extract old value and emit it using the same type as compare value.
229     return EmitFromInt(CGF, CGF.Builder.CreateExtractValue(Pair, 0), T,
230                        ValueType);
231 }
232 
233 // Emit a simple mangled intrinsic that has 1 argument and a return type
234 // matching the argument type.
235 static Value *emitUnaryBuiltin(CodeGenFunction &CGF,
236                                const CallExpr *E,
237                                unsigned IntrinsicID) {
238   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
239 
240   Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
241   return CGF.Builder.CreateCall(F, Src0);
242 }
243 
244 // Emit an intrinsic that has 2 operands of the same type as its result.
245 static Value *emitBinaryBuiltin(CodeGenFunction &CGF,
246                                 const CallExpr *E,
247                                 unsigned IntrinsicID) {
248   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
249   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
250 
251   Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
252   return CGF.Builder.CreateCall(F, { Src0, Src1 });
253 }
254 
255 // Emit an intrinsic that has 3 operands of the same type as its result.
256 static Value *emitTernaryBuiltin(CodeGenFunction &CGF,
257                                  const CallExpr *E,
258                                  unsigned IntrinsicID) {
259   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
260   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
261   llvm::Value *Src2 = CGF.EmitScalarExpr(E->getArg(2));
262 
263   Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
264   return CGF.Builder.CreateCall(F, { Src0, Src1, Src2 });
265 }
266 
267 // Emit an intrinsic that has 1 float or double operand, and 1 integer.
268 static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
269                                const CallExpr *E,
270                                unsigned IntrinsicID) {
271   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
272   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
273 
274   Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
275   return CGF.Builder.CreateCall(F, {Src0, Src1});
276 }
277 
278 /// EmitFAbs - Emit a call to @llvm.fabs().
279 static Value *EmitFAbs(CodeGenFunction &CGF, Value *V) {
280   Value *F = CGF.CGM.getIntrinsic(Intrinsic::fabs, V->getType());
281   llvm::CallInst *Call = CGF.Builder.CreateCall(F, V);
282   Call->setDoesNotAccessMemory();
283   return Call;
284 }
285 
286 /// Emit the computation of the sign bit for a floating point value. Returns
287 /// the i1 sign bit value.
288 static Value *EmitSignBit(CodeGenFunction &CGF, Value *V) {
289   LLVMContext &C = CGF.CGM.getLLVMContext();
290 
291   llvm::Type *Ty = V->getType();
292   int Width = Ty->getPrimitiveSizeInBits();
293   llvm::Type *IntTy = llvm::IntegerType::get(C, Width);
294   V = CGF.Builder.CreateBitCast(V, IntTy);
295   if (Ty->isPPC_FP128Ty()) {
296     // We want the sign bit of the higher-order double. The bitcast we just
297     // did works as if the double-double was stored to memory and then
298     // read as an i128. The "store" will put the higher-order double in the
299     // lower address in both little- and big-Endian modes, but the "load"
300     // will treat those bits as a different part of the i128: the low bits in
301     // little-Endian, the high bits in big-Endian. Therefore, on big-Endian
302     // we need to shift the high bits down to the low before truncating.
303     Width >>= 1;
304     if (CGF.getTarget().isBigEndian()) {
305       Value *ShiftCst = llvm::ConstantInt::get(IntTy, Width);
306       V = CGF.Builder.CreateLShr(V, ShiftCst);
307     }
308     // We are truncating value in order to extract the higher-order
309     // double, which we will be using to extract the sign from.
310     IntTy = llvm::IntegerType::get(C, Width);
311     V = CGF.Builder.CreateTrunc(V, IntTy);
312   }
313   Value *Zero = llvm::Constant::getNullValue(IntTy);
314   return CGF.Builder.CreateICmpSLT(V, Zero);
315 }
316 
317 static RValue emitLibraryCall(CodeGenFunction &CGF, const FunctionDecl *FD,
318                               const CallExpr *E, llvm::Constant *calleeValue) {
319   CGCallee callee = CGCallee::forDirect(calleeValue, FD);
320   return CGF.EmitCall(E->getCallee()->getType(), callee, E, ReturnValueSlot());
321 }
322 
323 /// \brief Emit a call to llvm.{sadd,uadd,ssub,usub,smul,umul}.with.overflow.*
324 /// depending on IntrinsicID.
325 ///
326 /// \arg CGF The current codegen function.
327 /// \arg IntrinsicID The ID for the Intrinsic we wish to generate.
328 /// \arg X The first argument to the llvm.*.with.overflow.*.
329 /// \arg Y The second argument to the llvm.*.with.overflow.*.
330 /// \arg Carry The carry returned by the llvm.*.with.overflow.*.
331 /// \returns The result (i.e. sum/product) returned by the intrinsic.
332 static llvm::Value *EmitOverflowIntrinsic(CodeGenFunction &CGF,
333                                           const llvm::Intrinsic::ID IntrinsicID,
334                                           llvm::Value *X, llvm::Value *Y,
335                                           llvm::Value *&Carry) {
336   // Make sure we have integers of the same width.
337   assert(X->getType() == Y->getType() &&
338          "Arguments must be the same type. (Did you forget to make sure both "
339          "arguments have the same integer width?)");
340 
341   llvm::Value *Callee = CGF.CGM.getIntrinsic(IntrinsicID, X->getType());
342   llvm::Value *Tmp = CGF.Builder.CreateCall(Callee, {X, Y});
343   Carry = CGF.Builder.CreateExtractValue(Tmp, 1);
344   return CGF.Builder.CreateExtractValue(Tmp, 0);
345 }
346 
347 static Value *emitRangedBuiltin(CodeGenFunction &CGF,
348                                 unsigned IntrinsicID,
349                                 int low, int high) {
350     llvm::MDBuilder MDHelper(CGF.getLLVMContext());
351     llvm::MDNode *RNode = MDHelper.createRange(APInt(32, low), APInt(32, high));
352     Value *F = CGF.CGM.getIntrinsic(IntrinsicID, {});
353     llvm::Instruction *Call = CGF.Builder.CreateCall(F);
354     Call->setMetadata(llvm::LLVMContext::MD_range, RNode);
355     return Call;
356 }
357 
namespace {
/// Width and signedness of an integer type. Kept as a plain aggregate so it
/// can be brace-initialised as {Width, Signed}.
struct WidthAndSignedness {
  unsigned Width; ///< Width of the type.
  bool Signed;    ///< Whether the type is signed.
};
} // end anonymous namespace
364 
365 static WidthAndSignedness
366 getIntegerWidthAndSignedness(const clang::ASTContext &context,
367                              const clang::QualType Type) {
368   assert(Type->isIntegerType() && "Given type is not an integer.");
369   unsigned Width = Type->isBooleanType() ? 1 : context.getTypeInfo(Type).Width;
370   bool Signed = Type->isSignedIntegerType();
371   return {Width, Signed};
372 }
373 
374 // Given one or more integer types, this function produces an integer type that
375 // encompasses them: any value in one of the given types could be expressed in
376 // the encompassing type.
377 static struct WidthAndSignedness
378 EncompassingIntegerType(ArrayRef<struct WidthAndSignedness> Types) {
379   assert(Types.size() > 0 && "Empty list of types.");
380 
381   // If any of the given types is signed, we must return a signed type.
382   bool Signed = false;
383   for (const auto &Type : Types) {
384     Signed |= Type.Signed;
385   }
386 
387   // The encompassing type must have a width greater than or equal to the width
388   // of the specified types.  Additionally, if the encompassing type is signed,
389   // its width must be strictly greater than the width of any unsigned types
390   // given.
391   unsigned Width = 0;
392   for (const auto &Type : Types) {
393     unsigned MinWidth = Type.Width + (Signed && !Type.Signed);
394     if (Width < MinWidth) {
395       Width = MinWidth;
396     }
397   }
398 
399   return {Width, Signed};
400 }
401 
402 Value *CodeGenFunction::EmitVAStartEnd(Value *ArgValue, bool IsStart) {
403   llvm::Type *DestType = Int8PtrTy;
404   if (ArgValue->getType() != DestType)
405     ArgValue =
406         Builder.CreateBitCast(ArgValue, DestType, ArgValue->getName().data());
407 
408   Intrinsic::ID inst = IsStart ? Intrinsic::vastart : Intrinsic::vaend;
409   return Builder.CreateCall(CGM.getIntrinsic(inst), ArgValue);
410 }
411 
/// Checks if using the result of __builtin_object_size(p, @p From) in place of
/// __builtin_object_size(p, @p To) is correct.
///
/// Identical types are always compatible; beyond that, only 0 -> 1 and
/// 3 -> 2 are accepted.
static bool areBOSTypesCompatible(int From, int To) {
  // Note: Our __builtin_object_size implementation currently treats Type=0 and
  // Type=2 identically. Encoding this implementation detail here may make
  // improving __builtin_object_size difficult in the future, so it's omitted.
  if (From == To)
    return true;

  switch (From) {
  case 0:
    return To == 1;
  case 3:
    return To == 2;
  default:
    return false;
  }
}
420 
421 static llvm::Value *
422 getDefaultBuiltinObjectSizeResult(unsigned Type, llvm::IntegerType *ResType) {
423   return ConstantInt::get(ResType, (Type & 2) ? 0 : -1, /*isSigned=*/true);
424 }
425 
426 llvm::Value *
427 CodeGenFunction::evaluateOrEmitBuiltinObjectSize(const Expr *E, unsigned Type,
428                                                  llvm::IntegerType *ResType,
429                                                  llvm::Value *EmittedE) {
430   uint64_t ObjectSize;
431   if (!E->tryEvaluateObjectSize(ObjectSize, getContext(), Type))
432     return emitBuiltinObjectSize(E, Type, ResType, EmittedE);
433   return ConstantInt::get(ResType, ObjectSize, /*isSigned=*/true);
434 }
435 
436 /// Returns a Value corresponding to the size of the given expression.
437 /// This Value may be either of the following:
438 ///   - A llvm::Argument (if E is a param with the pass_object_size attribute on
439 ///     it)
440 ///   - A call to the @llvm.objectsize intrinsic
441 ///
442 /// EmittedE is the result of emitting `E` as a scalar expr. If it's non-null
443 /// and we wouldn't otherwise try to reference a pass_object_size parameter,
444 /// we'll call @llvm.objectsize on EmittedE, rather than emitting E.
445 llvm::Value *
446 CodeGenFunction::emitBuiltinObjectSize(const Expr *E, unsigned Type,
447                                        llvm::IntegerType *ResType,
448                                        llvm::Value *EmittedE) {
449   // We need to reference an argument if the pointer is a parameter with the
450   // pass_object_size attribute.
451   if (auto *D = dyn_cast<DeclRefExpr>(E->IgnoreParenImpCasts())) {
452     auto *Param = dyn_cast<ParmVarDecl>(D->getDecl());
453     auto *PS = D->getDecl()->getAttr<PassObjectSizeAttr>();
454     if (Param != nullptr && PS != nullptr &&
455         areBOSTypesCompatible(PS->getType(), Type)) {
456       auto Iter = SizeArguments.find(Param);
457       assert(Iter != SizeArguments.end());
458 
459       const ImplicitParamDecl *D = Iter->second;
460       auto DIter = LocalDeclMap.find(D);
461       assert(DIter != LocalDeclMap.end());
462 
463       return EmitLoadOfScalar(DIter->second, /*volatile=*/false,
464                               getContext().getSizeType(), E->getLocStart());
465     }
466   }
467 
468   // LLVM can't handle Type=3 appropriately, and __builtin_object_size shouldn't
469   // evaluate E for side-effects. In either case, we shouldn't lower to
470   // @llvm.objectsize.
471   if (Type == 3 || (!EmittedE && E->HasSideEffects(getContext())))
472     return getDefaultBuiltinObjectSizeResult(Type, ResType);
473 
474   Value *Ptr = EmittedE ? EmittedE : EmitScalarExpr(E);
475   assert(Ptr->getType()->isPointerTy() &&
476          "Non-pointer passed to __builtin_object_size?");
477 
478   Value *F = CGM.getIntrinsic(Intrinsic::objectsize, {ResType, Ptr->getType()});
479 
480   // LLVM only supports 0 and 2, make sure that we pass along that as a boolean.
481   Value *Min = Builder.getInt1((Type & 2) != 0);
482   // For GCC compatibility, __builtin_object_size treat NULL as unknown size.
483   Value *NullIsUnknown = Builder.getTrue();
484   return Builder.CreateCall(F, {Ptr, Min, NullIsUnknown});
485 }
486 
487 // Many of MSVC builtins are on both x64 and ARM; to avoid repeating code, we
488 // handle them here.
489 enum class CodeGenFunction::MSVCIntrin {
490   _BitScanForward,
491   _BitScanReverse,
492   _InterlockedAnd,
493   _InterlockedDecrement,
494   _InterlockedExchange,
495   _InterlockedExchangeAdd,
496   _InterlockedExchangeSub,
497   _InterlockedIncrement,
498   _InterlockedOr,
499   _InterlockedXor,
500   _interlockedbittestandset,
501   __fastfail,
502 };
503 
504 Value *CodeGenFunction::EmitMSVCBuiltinExpr(MSVCIntrin BuiltinID,
505                                             const CallExpr *E) {
506   switch (BuiltinID) {
507   case MSVCIntrin::_BitScanForward:
508   case MSVCIntrin::_BitScanReverse: {
509     Value *ArgValue = EmitScalarExpr(E->getArg(1));
510 
511     llvm::Type *ArgType = ArgValue->getType();
512     llvm::Type *IndexType =
513       EmitScalarExpr(E->getArg(0))->getType()->getPointerElementType();
514     llvm::Type *ResultType = ConvertType(E->getType());
515 
516     Value *ArgZero = llvm::Constant::getNullValue(ArgType);
517     Value *ResZero = llvm::Constant::getNullValue(ResultType);
518     Value *ResOne = llvm::ConstantInt::get(ResultType, 1);
519 
520     BasicBlock *Begin = Builder.GetInsertBlock();
521     BasicBlock *End = createBasicBlock("bitscan_end", this->CurFn);
522     Builder.SetInsertPoint(End);
523     PHINode *Result = Builder.CreatePHI(ResultType, 2, "bitscan_result");
524 
525     Builder.SetInsertPoint(Begin);
526     Value *IsZero = Builder.CreateICmpEQ(ArgValue, ArgZero);
527     BasicBlock *NotZero = createBasicBlock("bitscan_not_zero", this->CurFn);
528     Builder.CreateCondBr(IsZero, End, NotZero);
529     Result->addIncoming(ResZero, Begin);
530 
531     Builder.SetInsertPoint(NotZero);
532     Address IndexAddress = EmitPointerWithAlignment(E->getArg(0));
533 
534     if (BuiltinID == MSVCIntrin::_BitScanForward) {
535       Value *F = CGM.getIntrinsic(Intrinsic::cttz, ArgType);
536       Value *ZeroCount = Builder.CreateCall(F, {ArgValue, Builder.getTrue()});
537       ZeroCount = Builder.CreateIntCast(ZeroCount, IndexType, false);
538       Builder.CreateStore(ZeroCount, IndexAddress, false);
539     } else {
540       unsigned ArgWidth = cast<llvm::IntegerType>(ArgType)->getBitWidth();
541       Value *ArgTypeLastIndex = llvm::ConstantInt::get(IndexType, ArgWidth - 1);
542 
543       Value *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
544       Value *ZeroCount = Builder.CreateCall(F, {ArgValue, Builder.getTrue()});
545       ZeroCount = Builder.CreateIntCast(ZeroCount, IndexType, false);
546       Value *Index = Builder.CreateNSWSub(ArgTypeLastIndex, ZeroCount);
547       Builder.CreateStore(Index, IndexAddress, false);
548     }
549     Builder.CreateBr(End);
550     Result->addIncoming(ResOne, NotZero);
551 
552     Builder.SetInsertPoint(End);
553     return Result;
554   }
555   case MSVCIntrin::_InterlockedAnd:
556     return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E);
557   case MSVCIntrin::_InterlockedExchange:
558     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E);
559   case MSVCIntrin::_InterlockedExchangeAdd:
560     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E);
561   case MSVCIntrin::_InterlockedExchangeSub:
562     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Sub, E);
563   case MSVCIntrin::_InterlockedOr:
564     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E);
565   case MSVCIntrin::_InterlockedXor:
566     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xor, E);
567 
568   case MSVCIntrin::_interlockedbittestandset: {
569     llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
570     llvm::Value *Bit = EmitScalarExpr(E->getArg(1));
571     AtomicRMWInst *RMWI = Builder.CreateAtomicRMW(
572         AtomicRMWInst::Or, Addr,
573         Builder.CreateShl(ConstantInt::get(Bit->getType(), 1), Bit),
574         llvm::AtomicOrdering::SequentiallyConsistent);
575     // Shift the relevant bit to the least significant position, truncate to
576     // the result type, and test the low bit.
577     llvm::Value *Shifted = Builder.CreateLShr(RMWI, Bit);
578     llvm::Value *Truncated =
579         Builder.CreateTrunc(Shifted, ConvertType(E->getType()));
580     return Builder.CreateAnd(Truncated,
581                              ConstantInt::get(Truncated->getType(), 1));
582   }
583 
584   case MSVCIntrin::_InterlockedDecrement: {
585     llvm::Type *IntTy = ConvertType(E->getType());
586     AtomicRMWInst *RMWI = Builder.CreateAtomicRMW(
587       AtomicRMWInst::Sub,
588       EmitScalarExpr(E->getArg(0)),
589       ConstantInt::get(IntTy, 1),
590       llvm::AtomicOrdering::SequentiallyConsistent);
591     return Builder.CreateSub(RMWI, ConstantInt::get(IntTy, 1));
592   }
593   case MSVCIntrin::_InterlockedIncrement: {
594     llvm::Type *IntTy = ConvertType(E->getType());
595     AtomicRMWInst *RMWI = Builder.CreateAtomicRMW(
596       AtomicRMWInst::Add,
597       EmitScalarExpr(E->getArg(0)),
598       ConstantInt::get(IntTy, 1),
599       llvm::AtomicOrdering::SequentiallyConsistent);
600     return Builder.CreateAdd(RMWI, ConstantInt::get(IntTy, 1));
601   }
602 
603   case MSVCIntrin::__fastfail: {
604     // Request immediate process termination from the kernel. The instruction
605     // sequences to do this are documented on MSDN:
606     // https://msdn.microsoft.com/en-us/library/dn774154.aspx
607     llvm::Triple::ArchType ISA = getTarget().getTriple().getArch();
608     StringRef Asm, Constraints;
609     switch (ISA) {
610     default:
611       ErrorUnsupported(E, "__fastfail call for this architecture");
612       break;
613     case llvm::Triple::x86:
614     case llvm::Triple::x86_64:
615       Asm = "int $$0x29";
616       Constraints = "{cx}";
617       break;
618     case llvm::Triple::thumb:
619       Asm = "udf #251";
620       Constraints = "{r0}";
621       break;
622     }
623     llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, {Int32Ty}, false);
624     llvm::InlineAsm *IA =
625         llvm::InlineAsm::get(FTy, Asm, Constraints, /*SideEffects=*/true);
626     llvm::AttributeList NoReturnAttr = llvm::AttributeList::get(
627         getLLVMContext(), llvm::AttributeList::FunctionIndex,
628         llvm::Attribute::NoReturn);
629     CallSite CS = Builder.CreateCall(IA, EmitScalarExpr(E->getArg(0)));
630     CS.setAttributes(NoReturnAttr);
631     return CS.getInstruction();
632   }
633   }
634   llvm_unreachable("Incorrect MSVC intrinsic!");
635 }
636 
637 namespace {
638 // ARC cleanup for __builtin_os_log_format
639 struct CallObjCArcUse final : EHScopeStack::Cleanup {
640   CallObjCArcUse(llvm::Value *object) : object(object) {}
641   llvm::Value *object;
642 
643   void Emit(CodeGenFunction &CGF, Flags flags) override {
644     CGF.EmitARCIntrinsicUse(object);
645   }
646 };
647 }
648 
649 Value *CodeGenFunction::EmitCheckedArgForBuiltin(const Expr *E,
650                                                  BuiltinCheckKind Kind) {
651   assert((Kind == BCK_CLZPassedZero || Kind == BCK_CTZPassedZero)
652           && "Unsupported builtin check kind");
653 
654   Value *ArgValue = EmitScalarExpr(E);
655   if (!SanOpts.has(SanitizerKind::Builtin) || !getTarget().isCLZForZeroUndef())
656     return ArgValue;
657 
658   SanitizerScope SanScope(this);
659   Value *Cond = Builder.CreateICmpNE(
660       ArgValue, llvm::Constant::getNullValue(ArgValue->getType()));
661   EmitCheck(std::make_pair(Cond, SanitizerKind::Builtin),
662             SanitizerHandler::InvalidBuiltin,
663             {EmitCheckSourceLocation(E->getExprLoc()),
664              llvm::ConstantInt::get(Builder.getInt8Ty(), Kind)},
665             None);
666   return ArgValue;
667 }
668 
669 /// Get the argument type for arguments to os_log_helper.
670 static CanQualType getOSLogArgType(ASTContext &C, int Size) {
671   QualType UnsignedTy = C.getIntTypeForBitwidth(Size * 8, /*Signed=*/false);
672   return C.getCanonicalType(UnsignedTy);
673 }
674 
675 llvm::Function *CodeGenFunction::generateBuiltinOSLogHelperFunction(
676     const analyze_os_log::OSLogBufferLayout &Layout,
677     CharUnits BufferAlignment) {
678   ASTContext &Ctx = getContext();
679 
680   llvm::SmallString<64> Name;
681   {
682     raw_svector_ostream OS(Name);
683     OS << "__os_log_helper";
684     OS << "_" << BufferAlignment.getQuantity();
685     OS << "_" << int(Layout.getSummaryByte());
686     OS << "_" << int(Layout.getNumArgsByte());
687     for (const auto &Item : Layout.Items)
688       OS << "_" << int(Item.getSizeByte()) << "_"
689          << int(Item.getDescriptorByte());
690   }
691 
692   if (llvm::Function *F = CGM.getModule().getFunction(Name))
693     return F;
694 
695   llvm::SmallVector<ImplicitParamDecl, 4> Params;
696   Params.emplace_back(Ctx, nullptr, SourceLocation(), &Ctx.Idents.get("buffer"),
697                       Ctx.VoidPtrTy, ImplicitParamDecl::Other);
698 
699   for (unsigned int I = 0, E = Layout.Items.size(); I < E; ++I) {
700     char Size = Layout.Items[I].getSizeByte();
701     if (!Size)
702       continue;
703 
704     Params.emplace_back(
705         Ctx, nullptr, SourceLocation(),
706         &Ctx.Idents.get(std::string("arg") + llvm::to_string(I)),
707         getOSLogArgType(Ctx, Size), ImplicitParamDecl::Other);
708   }
709 
710   FunctionArgList Args;
711   for (auto &P : Params)
712     Args.push_back(&P);
713 
714   // The helper function has linkonce_odr linkage to enable the linker to merge
715   // identical functions. To ensure the merging always happens, 'noinline' is
716   // attached to the function when compiling with -Oz.
717   const CGFunctionInfo &FI =
718       CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, Args);
719   llvm::FunctionType *FuncTy = CGM.getTypes().GetFunctionType(FI);
720   llvm::Function *Fn = llvm::Function::Create(
721       FuncTy, llvm::GlobalValue::LinkOnceODRLinkage, Name, &CGM.getModule());
722   Fn->setVisibility(llvm::GlobalValue::HiddenVisibility);
723   CGM.SetLLVMFunctionAttributes(nullptr, FI, Fn);
724   CGM.SetLLVMFunctionAttributesForDefinition(nullptr, Fn);
725 
726   // Attach 'noinline' at -Oz.
727   if (CGM.getCodeGenOpts().OptimizeSize == 2)
728     Fn->addFnAttr(llvm::Attribute::NoInline);
729 
730   auto NL = ApplyDebugLocation::CreateEmpty(*this);
731   IdentifierInfo *II = &Ctx.Idents.get(Name);
732   FunctionDecl *FD = FunctionDecl::Create(
733       Ctx, Ctx.getTranslationUnitDecl(), SourceLocation(), SourceLocation(), II,
734       Ctx.VoidTy, nullptr, SC_PrivateExtern, false, false);
735 
736   StartFunction(FD, Ctx.VoidTy, Fn, FI, Args);
737 
738   // Create a scope with an artificial location for the body of this function.
739   auto AL = ApplyDebugLocation::CreateArtificial(*this);
740 
741   CharUnits Offset;
742   Address BufAddr(Builder.CreateLoad(GetAddrOfLocalVar(&Params[0]), "buf"),
743                   BufferAlignment);
744   Builder.CreateStore(Builder.getInt8(Layout.getSummaryByte()),
745                       Builder.CreateConstByteGEP(BufAddr, Offset++, "summary"));
746   Builder.CreateStore(Builder.getInt8(Layout.getNumArgsByte()),
747                       Builder.CreateConstByteGEP(BufAddr, Offset++, "numArgs"));
748 
749   unsigned I = 1;
750   for (const auto &Item : Layout.Items) {
751     Builder.CreateStore(
752         Builder.getInt8(Item.getDescriptorByte()),
753         Builder.CreateConstByteGEP(BufAddr, Offset++, "argDescriptor"));
754     Builder.CreateStore(
755         Builder.getInt8(Item.getSizeByte()),
756         Builder.CreateConstByteGEP(BufAddr, Offset++, "argSize"));
757 
758     CharUnits Size = Item.size();
759     if (!Size.getQuantity())
760       continue;
761 
762     Address Arg = GetAddrOfLocalVar(&Params[I]);
763     Address Addr = Builder.CreateConstByteGEP(BufAddr, Offset, "argData");
764     Addr = Builder.CreateBitCast(Addr, Arg.getPointer()->getType(),
765                                  "argDataCast");
766     Builder.CreateStore(Builder.CreateLoad(Arg), Addr);
767     Offset += Size;
768     ++I;
769   }
770 
771   FinishFunction();
772 
773   return Fn;
774 }
775 
776 RValue CodeGenFunction::emitBuiltinOSLogFormat(const CallExpr &E) {
777   assert(E.getNumArgs() >= 2 &&
778          "__builtin_os_log_format takes at least 2 arguments");
779   ASTContext &Ctx = getContext();
780   analyze_os_log::OSLogBufferLayout Layout;
781   analyze_os_log::computeOSLogBufferLayout(Ctx, &E, Layout);
782   Address BufAddr = EmitPointerWithAlignment(E.getArg(0));
783   llvm::SmallVector<llvm::Value *, 4> RetainableOperands;
784 
785   // Ignore argument 1, the format string. It is not currently used.
786   CallArgList Args;
787   Args.add(RValue::get(BufAddr.getPointer()), Ctx.VoidPtrTy);
788 
789   for (const auto &Item : Layout.Items) {
790     int Size = Item.getSizeByte();
791     if (!Size)
792       continue;
793 
794     llvm::Value *ArgVal;
795 
796     if (const Expr *TheExpr = Item.getExpr()) {
797       ArgVal = EmitScalarExpr(TheExpr, /*Ignore*/ false);
798 
799       // Check if this is a retainable type.
800       if (TheExpr->getType()->isObjCRetainableType()) {
801         assert(getEvaluationKind(TheExpr->getType()) == TEK_Scalar &&
802                "Only scalar can be a ObjC retainable type");
803         // Check if the object is constant, if not, save it in
804         // RetainableOperands.
805         if (!isa<Constant>(ArgVal))
806           RetainableOperands.push_back(ArgVal);
807       }
808     } else {
809       ArgVal = Builder.getInt32(Item.getConstValue().getQuantity());
810     }
811 
812     unsigned ArgValSize =
813         CGM.getDataLayout().getTypeSizeInBits(ArgVal->getType());
814     llvm::IntegerType *IntTy = llvm::Type::getIntNTy(getLLVMContext(),
815                                                      ArgValSize);
816     ArgVal = Builder.CreateBitOrPointerCast(ArgVal, IntTy);
817     CanQualType ArgTy = getOSLogArgType(Ctx, Size);
818     // If ArgVal has type x86_fp80, zero-extend ArgVal.
819     ArgVal = Builder.CreateZExtOrBitCast(ArgVal, ConvertType(ArgTy));
820     Args.add(RValue::get(ArgVal), ArgTy);
821   }
822 
823   const CGFunctionInfo &FI =
824       CGM.getTypes().arrangeBuiltinFunctionCall(Ctx.VoidTy, Args);
825   llvm::Function *F = CodeGenFunction(CGM).generateBuiltinOSLogHelperFunction(
826       Layout, BufAddr.getAlignment());
827   EmitCall(FI, CGCallee::forDirect(F), ReturnValueSlot(), Args);
828 
829   // Push a clang.arc.use cleanup for each object in RetainableOperands. The
830   // cleanup will cause the use to appear after the final log call, keeping
831   // the object valid while it’s held in the log buffer.  Note that if there’s
832   // a release cleanup on the object, it will already be active; since
833   // cleanups are emitted in reverse order, the use will occur before the
834   // object is released.
835   if (!RetainableOperands.empty() && getLangOpts().ObjCAutoRefCount &&
836       CGM.getCodeGenOpts().OptimizationLevel != 0)
837     for (llvm::Value *Object : RetainableOperands)
838       pushFullExprCleanup<CallObjCArcUse>(getARCCleanupKind(), Object);
839 
840   return RValue::get(BufAddr.getPointer());
841 }
842 
843 /// Determine if a binop is a checked mixed-sign multiply we can specialize.
844 static bool isSpecialMixedSignMultiply(unsigned BuiltinID,
845                                        WidthAndSignedness Op1Info,
846                                        WidthAndSignedness Op2Info,
847                                        WidthAndSignedness ResultInfo) {
848   return BuiltinID == Builtin::BI__builtin_mul_overflow &&
849          Op1Info.Width == Op2Info.Width && Op1Info.Width >= ResultInfo.Width &&
850          Op1Info.Signed != Op2Info.Signed;
851 }
852 
853 /// Emit a checked mixed-sign multiply. This is a cheaper specialization of
854 /// the generic checked-binop irgen.
855 static RValue
856 EmitCheckedMixedSignMultiply(CodeGenFunction &CGF, const clang::Expr *Op1,
857                              WidthAndSignedness Op1Info, const clang::Expr *Op2,
858                              WidthAndSignedness Op2Info,
859                              const clang::Expr *ResultArg, QualType ResultQTy,
860                              WidthAndSignedness ResultInfo) {
861   assert(isSpecialMixedSignMultiply(Builtin::BI__builtin_mul_overflow, Op1Info,
862                                     Op2Info, ResultInfo) &&
863          "Not a mixed-sign multipliction we can specialize");
864 
865   // Emit the signed and unsigned operands.
866   const clang::Expr *SignedOp = Op1Info.Signed ? Op1 : Op2;
867   const clang::Expr *UnsignedOp = Op1Info.Signed ? Op2 : Op1;
868   llvm::Value *Signed = CGF.EmitScalarExpr(SignedOp);
869   llvm::Value *Unsigned = CGF.EmitScalarExpr(UnsignedOp);
870 
871   llvm::Type *OpTy = Signed->getType();
872   llvm::Value *Zero = llvm::Constant::getNullValue(OpTy);
873   Address ResultPtr = CGF.EmitPointerWithAlignment(ResultArg);
874   llvm::Type *ResTy = ResultPtr.getElementType();
875 
876   // Take the absolute value of the signed operand.
877   llvm::Value *IsNegative = CGF.Builder.CreateICmpSLT(Signed, Zero);
878   llvm::Value *AbsOfNegative = CGF.Builder.CreateSub(Zero, Signed);
879   llvm::Value *AbsSigned =
880       CGF.Builder.CreateSelect(IsNegative, AbsOfNegative, Signed);
881 
882   // Perform a checked unsigned multiplication.
883   llvm::Value *UnsignedOverflow;
884   llvm::Value *UnsignedResult =
885       EmitOverflowIntrinsic(CGF, llvm::Intrinsic::umul_with_overflow, AbsSigned,
886                             Unsigned, UnsignedOverflow);
887 
888   llvm::Value *Overflow, *Result;
889   if (ResultInfo.Signed) {
890     // Signed overflow occurs if the result is greater than INT_MAX or lesser
891     // than INT_MIN, i.e when |Result| > (INT_MAX + IsNegative).
892     auto IntMax = llvm::APInt::getSignedMaxValue(ResultInfo.Width)
893                       .zextOrSelf(Op1Info.Width);
894     llvm::Value *MaxResult =
895         CGF.Builder.CreateAdd(llvm::ConstantInt::get(OpTy, IntMax),
896                               CGF.Builder.CreateZExt(IsNegative, OpTy));
897     llvm::Value *SignedOverflow =
898         CGF.Builder.CreateICmpUGT(UnsignedResult, MaxResult);
899     Overflow = CGF.Builder.CreateOr(UnsignedOverflow, SignedOverflow);
900 
901     // Prepare the signed result (possibly by negating it).
902     llvm::Value *NegativeResult = CGF.Builder.CreateNeg(UnsignedResult);
903     llvm::Value *SignedResult =
904         CGF.Builder.CreateSelect(IsNegative, NegativeResult, UnsignedResult);
905     Result = CGF.Builder.CreateTrunc(SignedResult, ResTy);
906   } else {
907     // Unsigned overflow occurs if the result is < 0 or greater than UINT_MAX.
908     llvm::Value *Underflow = CGF.Builder.CreateAnd(
909         IsNegative, CGF.Builder.CreateIsNotNull(UnsignedResult));
910     Overflow = CGF.Builder.CreateOr(UnsignedOverflow, Underflow);
911     if (ResultInfo.Width < Op1Info.Width) {
912       auto IntMax =
913           llvm::APInt::getMaxValue(ResultInfo.Width).zext(Op1Info.Width);
914       llvm::Value *TruncOverflow = CGF.Builder.CreateICmpUGT(
915           UnsignedResult, llvm::ConstantInt::get(OpTy, IntMax));
916       Overflow = CGF.Builder.CreateOr(Overflow, TruncOverflow);
917     }
918 
919     // Negate the product if it would be negative in infinite precision.
920     Result = CGF.Builder.CreateSelect(
921         IsNegative, CGF.Builder.CreateNeg(UnsignedResult), UnsignedResult);
922 
923     Result = CGF.Builder.CreateTrunc(Result, ResTy);
924   }
925   assert(Overflow && Result && "Missing overflow or result");
926 
927   bool isVolatile =
928       ResultArg->getType()->getPointeeType().isVolatileQualified();
929   CGF.Builder.CreateStore(CGF.EmitToMemory(Result, ResultQTy), ResultPtr,
930                           isVolatile);
931   return RValue::get(Overflow);
932 }
933 
934 static llvm::Value *dumpRecord(CodeGenFunction &CGF, QualType RType,
935                                Value *&RecordPtr, CharUnits Align, Value *Func,
936                                int Lvl) {
937   const auto *RT = RType->getAs<RecordType>();
938   ASTContext &Context = CGF.getContext();
939   RecordDecl *RD = RT->getDecl()->getDefinition();
940   ASTContext &Ctx = RD->getASTContext();
941   const ASTRecordLayout &RL = Ctx.getASTRecordLayout(RD);
942   std::string Pad = std::string(Lvl * 4, ' ');
943 
944   Value *GString =
945       CGF.Builder.CreateGlobalStringPtr(RType.getAsString() + " {\n");
946   Value *Res = CGF.Builder.CreateCall(Func, {GString});
947 
948   static llvm::DenseMap<QualType, const char *> Types;
949   if (Types.empty()) {
950     Types[Context.CharTy] = "%c";
951     Types[Context.BoolTy] = "%d";
952     Types[Context.SignedCharTy] = "%hhd";
953     Types[Context.UnsignedCharTy] = "%hhu";
954     Types[Context.IntTy] = "%d";
955     Types[Context.UnsignedIntTy] = "%u";
956     Types[Context.LongTy] = "%ld";
957     Types[Context.UnsignedLongTy] = "%lu";
958     Types[Context.LongLongTy] = "%lld";
959     Types[Context.UnsignedLongLongTy] = "%llu";
960     Types[Context.ShortTy] = "%hd";
961     Types[Context.UnsignedShortTy] = "%hu";
962     Types[Context.VoidPtrTy] = "%p";
963     Types[Context.FloatTy] = "%f";
964     Types[Context.DoubleTy] = "%f";
965     Types[Context.LongDoubleTy] = "%Lf";
966     Types[Context.getPointerType(Context.CharTy)] = "%s";
967     Types[Context.getPointerType(Context.getConstType(Context.CharTy))] = "%s";
968   }
969 
970   for (const auto *FD : RD->fields()) {
971     uint64_t Off = RL.getFieldOffset(FD->getFieldIndex());
972     Off = Ctx.toCharUnitsFromBits(Off).getQuantity();
973 
974     Value *FieldPtr = RecordPtr;
975     if (RD->isUnion())
976       FieldPtr = CGF.Builder.CreatePointerCast(
977           FieldPtr, CGF.ConvertType(Context.getPointerType(FD->getType())));
978     else
979       FieldPtr = CGF.Builder.CreateStructGEP(CGF.ConvertType(RType), FieldPtr,
980                                              FD->getFieldIndex());
981 
982     GString = CGF.Builder.CreateGlobalStringPtr(
983         llvm::Twine(Pad)
984             .concat(FD->getType().getAsString())
985             .concat(llvm::Twine(' '))
986             .concat(FD->getNameAsString())
987             .concat(" : ")
988             .str());
989     Value *TmpRes = CGF.Builder.CreateCall(Func, {GString});
990     Res = CGF.Builder.CreateAdd(Res, TmpRes);
991 
992     QualType CanonicalType =
993         FD->getType().getUnqualifiedType().getCanonicalType();
994 
995     // We check whether we are in a recursive type
996     if (CanonicalType->isRecordType()) {
997       Value *TmpRes =
998           dumpRecord(CGF, CanonicalType, FieldPtr, Align, Func, Lvl + 1);
999       Res = CGF.Builder.CreateAdd(TmpRes, Res);
1000       continue;
1001     }
1002 
1003     // We try to determine the best format to print the current field
1004     llvm::Twine Format = Types.find(CanonicalType) == Types.end()
1005                              ? Types[Context.VoidPtrTy]
1006                              : Types[CanonicalType];
1007 
1008     Address FieldAddress = Address(FieldPtr, Align);
1009     FieldPtr = CGF.Builder.CreateLoad(FieldAddress);
1010 
1011     // FIXME Need to handle bitfield here
1012     GString = CGF.Builder.CreateGlobalStringPtr(
1013         Format.concat(llvm::Twine('\n')).str());
1014     TmpRes = CGF.Builder.CreateCall(Func, {GString, FieldPtr});
1015     Res = CGF.Builder.CreateAdd(Res, TmpRes);
1016   }
1017 
1018   GString = CGF.Builder.CreateGlobalStringPtr(Pad + "}\n");
1019   Value *TmpRes = CGF.Builder.CreateCall(Func, {GString});
1020   Res = CGF.Builder.CreateAdd(Res, TmpRes);
1021   return Res;
1022 }
1023 
1024 RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
1025                                         unsigned BuiltinID, const CallExpr *E,
1026                                         ReturnValueSlot ReturnValue) {
1027   // See if we can constant fold this builtin.  If so, don't emit it at all.
1028   Expr::EvalResult Result;
1029   if (E->EvaluateAsRValue(Result, CGM.getContext()) &&
1030       !Result.hasSideEffects()) {
1031     if (Result.Val.isInt())
1032       return RValue::get(llvm::ConstantInt::get(getLLVMContext(),
1033                                                 Result.Val.getInt()));
1034     if (Result.Val.isFloat())
1035       return RValue::get(llvm::ConstantFP::get(getLLVMContext(),
1036                                                Result.Val.getFloat()));
1037   }
1038 
1039   // There are LLVM math intrinsics/instructions corresponding to math library
1040   // functions except the LLVM op will never set errno while the math library
1041   // might. Also, math builtins have the same semantics as their math library
1042   // twins. Thus, we can transform math library and builtin calls to their
1043   // LLVM counterparts if the call is marked 'const' (known to never set errno).
1044   if (FD->hasAttr<ConstAttr>()) {
1045     switch (BuiltinID) {
1046     case Builtin::BIceil:
1047     case Builtin::BIceilf:
1048     case Builtin::BIceill:
1049     case Builtin::BI__builtin_ceil:
1050     case Builtin::BI__builtin_ceilf:
1051     case Builtin::BI__builtin_ceill:
1052       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::ceil));
1053 
1054     case Builtin::BIcopysign:
1055     case Builtin::BIcopysignf:
1056     case Builtin::BIcopysignl:
1057     case Builtin::BI__builtin_copysign:
1058     case Builtin::BI__builtin_copysignf:
1059     case Builtin::BI__builtin_copysignl:
1060     case Builtin::BI__builtin_copysignf128:
1061       return RValue::get(emitBinaryBuiltin(*this, E, Intrinsic::copysign));
1062 
1063     case Builtin::BIcos:
1064     case Builtin::BIcosf:
1065     case Builtin::BIcosl:
1066     case Builtin::BI__builtin_cos:
1067     case Builtin::BI__builtin_cosf:
1068     case Builtin::BI__builtin_cosl:
1069       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::cos));
1070 
1071     case Builtin::BIexp:
1072     case Builtin::BIexpf:
1073     case Builtin::BIexpl:
1074     case Builtin::BI__builtin_exp:
1075     case Builtin::BI__builtin_expf:
1076     case Builtin::BI__builtin_expl:
1077       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::exp));
1078 
1079     case Builtin::BIexp2:
1080     case Builtin::BIexp2f:
1081     case Builtin::BIexp2l:
1082     case Builtin::BI__builtin_exp2:
1083     case Builtin::BI__builtin_exp2f:
1084     case Builtin::BI__builtin_exp2l:
1085       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::exp2));
1086 
1087     case Builtin::BIfabs:
1088     case Builtin::BIfabsf:
1089     case Builtin::BIfabsl:
1090     case Builtin::BI__builtin_fabs:
1091     case Builtin::BI__builtin_fabsf:
1092     case Builtin::BI__builtin_fabsl:
1093     case Builtin::BI__builtin_fabsf128:
1094       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::fabs));
1095 
1096     case Builtin::BIfloor:
1097     case Builtin::BIfloorf:
1098     case Builtin::BIfloorl:
1099     case Builtin::BI__builtin_floor:
1100     case Builtin::BI__builtin_floorf:
1101     case Builtin::BI__builtin_floorl:
1102       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::floor));
1103 
1104     case Builtin::BIfma:
1105     case Builtin::BIfmaf:
1106     case Builtin::BIfmal:
1107     case Builtin::BI__builtin_fma:
1108     case Builtin::BI__builtin_fmaf:
1109     case Builtin::BI__builtin_fmal:
1110       return RValue::get(emitTernaryBuiltin(*this, E, Intrinsic::fma));
1111 
1112     case Builtin::BIfmax:
1113     case Builtin::BIfmaxf:
1114     case Builtin::BIfmaxl:
1115     case Builtin::BI__builtin_fmax:
1116     case Builtin::BI__builtin_fmaxf:
1117     case Builtin::BI__builtin_fmaxl:
1118       return RValue::get(emitBinaryBuiltin(*this, E, Intrinsic::maxnum));
1119 
1120     case Builtin::BIfmin:
1121     case Builtin::BIfminf:
1122     case Builtin::BIfminl:
1123     case Builtin::BI__builtin_fmin:
1124     case Builtin::BI__builtin_fminf:
1125     case Builtin::BI__builtin_fminl:
1126       return RValue::get(emitBinaryBuiltin(*this, E, Intrinsic::minnum));
1127 
1128     // fmod() is a special-case. It maps to the frem instruction rather than an
1129     // LLVM intrinsic.
1130     case Builtin::BIfmod:
1131     case Builtin::BIfmodf:
1132     case Builtin::BIfmodl:
1133     case Builtin::BI__builtin_fmod:
1134     case Builtin::BI__builtin_fmodf:
1135     case Builtin::BI__builtin_fmodl: {
1136       Value *Arg1 = EmitScalarExpr(E->getArg(0));
1137       Value *Arg2 = EmitScalarExpr(E->getArg(1));
1138       return RValue::get(Builder.CreateFRem(Arg1, Arg2, "fmod"));
1139     }
1140 
1141     case Builtin::BIlog:
1142     case Builtin::BIlogf:
1143     case Builtin::BIlogl:
1144     case Builtin::BI__builtin_log:
1145     case Builtin::BI__builtin_logf:
1146     case Builtin::BI__builtin_logl:
1147       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::log));
1148 
1149     case Builtin::BIlog10:
1150     case Builtin::BIlog10f:
1151     case Builtin::BIlog10l:
1152     case Builtin::BI__builtin_log10:
1153     case Builtin::BI__builtin_log10f:
1154     case Builtin::BI__builtin_log10l:
1155       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::log10));
1156 
1157     case Builtin::BIlog2:
1158     case Builtin::BIlog2f:
1159     case Builtin::BIlog2l:
1160     case Builtin::BI__builtin_log2:
1161     case Builtin::BI__builtin_log2f:
1162     case Builtin::BI__builtin_log2l:
1163       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::log2));
1164 
1165     case Builtin::BInearbyint:
1166     case Builtin::BInearbyintf:
1167     case Builtin::BInearbyintl:
1168     case Builtin::BI__builtin_nearbyint:
1169     case Builtin::BI__builtin_nearbyintf:
1170     case Builtin::BI__builtin_nearbyintl:
1171       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::nearbyint));
1172 
1173     case Builtin::BIpow:
1174     case Builtin::BIpowf:
1175     case Builtin::BIpowl:
1176     case Builtin::BI__builtin_pow:
1177     case Builtin::BI__builtin_powf:
1178     case Builtin::BI__builtin_powl:
1179       return RValue::get(emitBinaryBuiltin(*this, E, Intrinsic::pow));
1180 
1181     case Builtin::BIrint:
1182     case Builtin::BIrintf:
1183     case Builtin::BIrintl:
1184     case Builtin::BI__builtin_rint:
1185     case Builtin::BI__builtin_rintf:
1186     case Builtin::BI__builtin_rintl:
1187       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::rint));
1188 
1189     case Builtin::BIround:
1190     case Builtin::BIroundf:
1191     case Builtin::BIroundl:
1192     case Builtin::BI__builtin_round:
1193     case Builtin::BI__builtin_roundf:
1194     case Builtin::BI__builtin_roundl:
1195       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::round));
1196 
1197     case Builtin::BIsin:
1198     case Builtin::BIsinf:
1199     case Builtin::BIsinl:
1200     case Builtin::BI__builtin_sin:
1201     case Builtin::BI__builtin_sinf:
1202     case Builtin::BI__builtin_sinl:
1203       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::sin));
1204 
1205     case Builtin::BIsqrt:
1206     case Builtin::BIsqrtf:
1207     case Builtin::BIsqrtl:
1208     case Builtin::BI__builtin_sqrt:
1209     case Builtin::BI__builtin_sqrtf:
1210     case Builtin::BI__builtin_sqrtl:
1211       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::sqrt));
1212 
1213     case Builtin::BItrunc:
1214     case Builtin::BItruncf:
1215     case Builtin::BItruncl:
1216     case Builtin::BI__builtin_trunc:
1217     case Builtin::BI__builtin_truncf:
1218     case Builtin::BI__builtin_truncl:
1219       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::trunc));
1220 
1221     default:
1222       break;
1223     }
1224   }
1225 
1226   switch (BuiltinID) {
1227   default: break;
1228   case Builtin::BI__builtin___CFStringMakeConstantString:
1229   case Builtin::BI__builtin___NSStringMakeConstantString:
1230     return RValue::get(ConstantEmitter(*this).emitAbstract(E, E->getType()));
1231   case Builtin::BI__builtin_stdarg_start:
1232   case Builtin::BI__builtin_va_start:
1233   case Builtin::BI__va_start:
1234   case Builtin::BI__builtin_va_end:
1235     return RValue::get(
1236         EmitVAStartEnd(BuiltinID == Builtin::BI__va_start
1237                            ? EmitScalarExpr(E->getArg(0))
1238                            : EmitVAListRef(E->getArg(0)).getPointer(),
1239                        BuiltinID != Builtin::BI__builtin_va_end));
1240   case Builtin::BI__builtin_va_copy: {
1241     Value *DstPtr = EmitVAListRef(E->getArg(0)).getPointer();
1242     Value *SrcPtr = EmitVAListRef(E->getArg(1)).getPointer();
1243 
1244     llvm::Type *Type = Int8PtrTy;
1245 
1246     DstPtr = Builder.CreateBitCast(DstPtr, Type);
1247     SrcPtr = Builder.CreateBitCast(SrcPtr, Type);
1248     return RValue::get(Builder.CreateCall(CGM.getIntrinsic(Intrinsic::vacopy),
1249                                           {DstPtr, SrcPtr}));
1250   }
1251   case Builtin::BI__builtin_abs:
1252   case Builtin::BI__builtin_labs:
1253   case Builtin::BI__builtin_llabs: {
1254     Value *ArgValue = EmitScalarExpr(E->getArg(0));
1255 
1256     Value *NegOp = Builder.CreateNeg(ArgValue, "neg");
1257     Value *CmpResult =
1258     Builder.CreateICmpSGE(ArgValue,
1259                           llvm::Constant::getNullValue(ArgValue->getType()),
1260                                                             "abscond");
1261     Value *Result =
1262       Builder.CreateSelect(CmpResult, ArgValue, NegOp, "abs");
1263 
1264     return RValue::get(Result);
1265   }
1266   case Builtin::BI__builtin_conj:
1267   case Builtin::BI__builtin_conjf:
1268   case Builtin::BI__builtin_conjl: {
1269     ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
1270     Value *Real = ComplexVal.first;
1271     Value *Imag = ComplexVal.second;
1272     Value *Zero =
1273       Imag->getType()->isFPOrFPVectorTy()
1274         ? llvm::ConstantFP::getZeroValueForNegation(Imag->getType())
1275         : llvm::Constant::getNullValue(Imag->getType());
1276 
1277     Imag = Builder.CreateFSub(Zero, Imag, "sub");
1278     return RValue::getComplex(std::make_pair(Real, Imag));
1279   }
1280   case Builtin::BI__builtin_creal:
1281   case Builtin::BI__builtin_crealf:
1282   case Builtin::BI__builtin_creall:
1283   case Builtin::BIcreal:
1284   case Builtin::BIcrealf:
1285   case Builtin::BIcreall: {
1286     ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
1287     return RValue::get(ComplexVal.first);
1288   }
1289 
1290   case Builtin::BI__builtin_dump_struct: {
1291     Value *Func = EmitScalarExpr(E->getArg(1)->IgnoreImpCasts());
1292     CharUnits Arg0Align = EmitPointerWithAlignment(E->getArg(0)).getAlignment();
1293 
1294     const Expr *Arg0 = E->getArg(0)->IgnoreImpCasts();
1295     QualType Arg0Type = Arg0->getType()->getPointeeType();
1296 
1297     Value *RecordPtr = EmitScalarExpr(Arg0);
1298     Value *Res = dumpRecord(*this, Arg0Type, RecordPtr, Arg0Align, Func, 0);
1299     return RValue::get(Res);
1300   }
1301 
1302   case Builtin::BI__builtin_cimag:
1303   case Builtin::BI__builtin_cimagf:
1304   case Builtin::BI__builtin_cimagl:
1305   case Builtin::BIcimag:
1306   case Builtin::BIcimagf:
1307   case Builtin::BIcimagl: {
1308     ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
1309     return RValue::get(ComplexVal.second);
1310   }
1311 
1312   case Builtin::BI__builtin_ctzs:
1313   case Builtin::BI__builtin_ctz:
1314   case Builtin::BI__builtin_ctzl:
1315   case Builtin::BI__builtin_ctzll: {
1316     Value *ArgValue = EmitCheckedArgForBuiltin(E->getArg(0), BCK_CTZPassedZero);
1317 
1318     llvm::Type *ArgType = ArgValue->getType();
1319     Value *F = CGM.getIntrinsic(Intrinsic::cttz, ArgType);
1320 
1321     llvm::Type *ResultType = ConvertType(E->getType());
1322     Value *ZeroUndef = Builder.getInt1(getTarget().isCLZForZeroUndef());
1323     Value *Result = Builder.CreateCall(F, {ArgValue, ZeroUndef});
1324     if (Result->getType() != ResultType)
1325       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
1326                                      "cast");
1327     return RValue::get(Result);
1328   }
1329   case Builtin::BI__builtin_clzs:
1330   case Builtin::BI__builtin_clz:
1331   case Builtin::BI__builtin_clzl:
1332   case Builtin::BI__builtin_clzll: {
1333     Value *ArgValue = EmitCheckedArgForBuiltin(E->getArg(0), BCK_CLZPassedZero);
1334 
1335     llvm::Type *ArgType = ArgValue->getType();
1336     Value *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
1337 
1338     llvm::Type *ResultType = ConvertType(E->getType());
1339     Value *ZeroUndef = Builder.getInt1(getTarget().isCLZForZeroUndef());
1340     Value *Result = Builder.CreateCall(F, {ArgValue, ZeroUndef});
1341     if (Result->getType() != ResultType)
1342       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
1343                                      "cast");
1344     return RValue::get(Result);
1345   }
1346   case Builtin::BI__builtin_ffs:
1347   case Builtin::BI__builtin_ffsl:
1348   case Builtin::BI__builtin_ffsll: {
1349     // ffs(x) -> x ? cttz(x) + 1 : 0
1350     Value *ArgValue = EmitScalarExpr(E->getArg(0));
1351 
1352     llvm::Type *ArgType = ArgValue->getType();
1353     Value *F = CGM.getIntrinsic(Intrinsic::cttz, ArgType);
1354 
1355     llvm::Type *ResultType = ConvertType(E->getType());
1356     Value *Tmp =
1357         Builder.CreateAdd(Builder.CreateCall(F, {ArgValue, Builder.getTrue()}),
1358                           llvm::ConstantInt::get(ArgType, 1));
1359     Value *Zero = llvm::Constant::getNullValue(ArgType);
1360     Value *IsZero = Builder.CreateICmpEQ(ArgValue, Zero, "iszero");
1361     Value *Result = Builder.CreateSelect(IsZero, Zero, Tmp, "ffs");
1362     if (Result->getType() != ResultType)
1363       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
1364                                      "cast");
1365     return RValue::get(Result);
1366   }
1367   case Builtin::BI__builtin_parity:
1368   case Builtin::BI__builtin_parityl:
1369   case Builtin::BI__builtin_parityll: {
1370     // parity(x) -> ctpop(x) & 1
1371     Value *ArgValue = EmitScalarExpr(E->getArg(0));
1372 
1373     llvm::Type *ArgType = ArgValue->getType();
1374     Value *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType);
1375 
1376     llvm::Type *ResultType = ConvertType(E->getType());
1377     Value *Tmp = Builder.CreateCall(F, ArgValue);
1378     Value *Result = Builder.CreateAnd(Tmp, llvm::ConstantInt::get(ArgType, 1));
1379     if (Result->getType() != ResultType)
1380       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
1381                                      "cast");
1382     return RValue::get(Result);
1383   }
1384   case Builtin::BI__popcnt16:
1385   case Builtin::BI__popcnt:
1386   case Builtin::BI__popcnt64:
1387   case Builtin::BI__builtin_popcount:
1388   case Builtin::BI__builtin_popcountl:
1389   case Builtin::BI__builtin_popcountll: {
1390     Value *ArgValue = EmitScalarExpr(E->getArg(0));
1391 
1392     llvm::Type *ArgType = ArgValue->getType();
1393     Value *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType);
1394 
1395     llvm::Type *ResultType = ConvertType(E->getType());
1396     Value *Result = Builder.CreateCall(F, ArgValue);
1397     if (Result->getType() != ResultType)
1398       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
1399                                      "cast");
1400     return RValue::get(Result);
1401   }
1402   case Builtin::BI_rotr8:
1403   case Builtin::BI_rotr16:
1404   case Builtin::BI_rotr:
1405   case Builtin::BI_lrotr:
1406   case Builtin::BI_rotr64: {
1407     Value *Val = EmitScalarExpr(E->getArg(0));
1408     Value *Shift = EmitScalarExpr(E->getArg(1));
1409 
1410     llvm::Type *ArgType = Val->getType();
1411     Shift = Builder.CreateIntCast(Shift, ArgType, false);
1412     unsigned ArgWidth = cast<llvm::IntegerType>(ArgType)->getBitWidth();
1413     Value *ArgTypeSize = llvm::ConstantInt::get(ArgType, ArgWidth);
1414     Value *ArgZero = llvm::Constant::getNullValue(ArgType);
1415 
1416     Value *Mask = llvm::ConstantInt::get(ArgType, ArgWidth - 1);
1417     Shift = Builder.CreateAnd(Shift, Mask);
1418     Value *LeftShift = Builder.CreateSub(ArgTypeSize, Shift);
1419 
1420     Value *RightShifted = Builder.CreateLShr(Val, Shift);
1421     Value *LeftShifted = Builder.CreateShl(Val, LeftShift);
1422     Value *Rotated = Builder.CreateOr(LeftShifted, RightShifted);
1423 
1424     Value *ShiftIsZero = Builder.CreateICmpEQ(Shift, ArgZero);
1425     Value *Result = Builder.CreateSelect(ShiftIsZero, Val, Rotated);
1426     return RValue::get(Result);
1427   }
1428   case Builtin::BI_rotl8:
1429   case Builtin::BI_rotl16:
1430   case Builtin::BI_rotl:
1431   case Builtin::BI_lrotl:
1432   case Builtin::BI_rotl64: {
1433     Value *Val = EmitScalarExpr(E->getArg(0));
1434     Value *Shift = EmitScalarExpr(E->getArg(1));
1435 
1436     llvm::Type *ArgType = Val->getType();
1437     Shift = Builder.CreateIntCast(Shift, ArgType, false);
1438     unsigned ArgWidth = cast<llvm::IntegerType>(ArgType)->getBitWidth();
1439     Value *ArgTypeSize = llvm::ConstantInt::get(ArgType, ArgWidth);
1440     Value *ArgZero = llvm::Constant::getNullValue(ArgType);
1441 
1442     Value *Mask = llvm::ConstantInt::get(ArgType, ArgWidth - 1);
1443     Shift = Builder.CreateAnd(Shift, Mask);
1444     Value *RightShift = Builder.CreateSub(ArgTypeSize, Shift);
1445 
1446     Value *LeftShifted = Builder.CreateShl(Val, Shift);
1447     Value *RightShifted = Builder.CreateLShr(Val, RightShift);
1448     Value *Rotated = Builder.CreateOr(LeftShifted, RightShifted);
1449 
1450     Value *ShiftIsZero = Builder.CreateICmpEQ(Shift, ArgZero);
1451     Value *Result = Builder.CreateSelect(ShiftIsZero, Val, Rotated);
1452     return RValue::get(Result);
1453   }
1454   case Builtin::BI__builtin_unpredictable: {
1455     // Always return the argument of __builtin_unpredictable. LLVM does not
1456     // handle this builtin. Metadata for this builtin should be added directly
1457     // to instructions such as branches or switches that use it.
1458     return RValue::get(EmitScalarExpr(E->getArg(0)));
1459   }
1460   case Builtin::BI__builtin_expect: {
1461     Value *ArgValue = EmitScalarExpr(E->getArg(0));
1462     llvm::Type *ArgType = ArgValue->getType();
1463 
1464     Value *ExpectedValue = EmitScalarExpr(E->getArg(1));
1465     // Don't generate llvm.expect on -O0 as the backend won't use it for
1466     // anything.
1467     // Note, we still IRGen ExpectedValue because it could have side-effects.
1468     if (CGM.getCodeGenOpts().OptimizationLevel == 0)
1469       return RValue::get(ArgValue);
1470 
1471     Value *FnExpect = CGM.getIntrinsic(Intrinsic::expect, ArgType);
1472     Value *Result =
1473         Builder.CreateCall(FnExpect, {ArgValue, ExpectedValue}, "expval");
1474     return RValue::get(Result);
1475   }
1476   case Builtin::BI__builtin_assume_aligned: {
1477     Value *PtrValue = EmitScalarExpr(E->getArg(0));
1478     Value *OffsetValue =
1479       (E->getNumArgs() > 2) ? EmitScalarExpr(E->getArg(2)) : nullptr;
1480 
1481     Value *AlignmentValue = EmitScalarExpr(E->getArg(1));
1482     ConstantInt *AlignmentCI = cast<ConstantInt>(AlignmentValue);
1483     unsigned Alignment = (unsigned) AlignmentCI->getZExtValue();
1484 
1485     EmitAlignmentAssumption(PtrValue, Alignment, OffsetValue);
1486     return RValue::get(PtrValue);
1487   }
1488   case Builtin::BI__assume:
1489   case Builtin::BI__builtin_assume: {
1490     if (E->getArg(0)->HasSideEffects(getContext()))
1491       return RValue::get(nullptr);
1492 
1493     Value *ArgValue = EmitScalarExpr(E->getArg(0));
1494     Value *FnAssume = CGM.getIntrinsic(Intrinsic::assume);
1495     return RValue::get(Builder.CreateCall(FnAssume, ArgValue));
1496   }
1497   case Builtin::BI__builtin_bswap16:
1498   case Builtin::BI__builtin_bswap32:
1499   case Builtin::BI__builtin_bswap64: {
1500     return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::bswap));
1501   }
1502   case Builtin::BI__builtin_bitreverse8:
1503   case Builtin::BI__builtin_bitreverse16:
1504   case Builtin::BI__builtin_bitreverse32:
1505   case Builtin::BI__builtin_bitreverse64: {
1506     return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::bitreverse));
1507   }
1508   case Builtin::BI__builtin_object_size: {
1509     unsigned Type =
1510         E->getArg(1)->EvaluateKnownConstInt(getContext()).getZExtValue();
1511     auto *ResType = cast<llvm::IntegerType>(ConvertType(E->getType()));
1512 
1513     // We pass this builtin onto the optimizer so that it can figure out the
1514     // object size in more complex cases.
1515     return RValue::get(emitBuiltinObjectSize(E->getArg(0), Type, ResType,
1516                                              /*EmittedE=*/nullptr));
1517   }
1518   case Builtin::BI__builtin_prefetch: {
1519     Value *Locality, *RW, *Address = EmitScalarExpr(E->getArg(0));
1520     // FIXME: Technically these constants should of type 'int', yes?
1521     RW = (E->getNumArgs() > 1) ? EmitScalarExpr(E->getArg(1)) :
1522       llvm::ConstantInt::get(Int32Ty, 0);
1523     Locality = (E->getNumArgs() > 2) ? EmitScalarExpr(E->getArg(2)) :
1524       llvm::ConstantInt::get(Int32Ty, 3);
1525     Value *Data = llvm::ConstantInt::get(Int32Ty, 1);
1526     Value *F = CGM.getIntrinsic(Intrinsic::prefetch);
1527     return RValue::get(Builder.CreateCall(F, {Address, RW, Locality, Data}));
1528   }
1529   case Builtin::BI__builtin_readcyclecounter: {
1530     Value *F = CGM.getIntrinsic(Intrinsic::readcyclecounter);
1531     return RValue::get(Builder.CreateCall(F));
1532   }
1533   case Builtin::BI__builtin___clear_cache: {
1534     Value *Begin = EmitScalarExpr(E->getArg(0));
1535     Value *End = EmitScalarExpr(E->getArg(1));
1536     Value *F = CGM.getIntrinsic(Intrinsic::clear_cache);
1537     return RValue::get(Builder.CreateCall(F, {Begin, End}));
1538   }
1539   case Builtin::BI__builtin_trap:
1540     return RValue::get(EmitTrapCall(Intrinsic::trap));
1541   case Builtin::BI__debugbreak:
1542     return RValue::get(EmitTrapCall(Intrinsic::debugtrap));
1543   case Builtin::BI__builtin_unreachable: {
1544     EmitUnreachable(E->getExprLoc());
1545 
1546     // We do need to preserve an insertion point.
1547     EmitBlock(createBasicBlock("unreachable.cont"));
1548 
1549     return RValue::get(nullptr);
1550   }
1551 
1552   case Builtin::BI__builtin_powi:
1553   case Builtin::BI__builtin_powif:
1554   case Builtin::BI__builtin_powil: {
1555     Value *Base = EmitScalarExpr(E->getArg(0));
1556     Value *Exponent = EmitScalarExpr(E->getArg(1));
1557     llvm::Type *ArgType = Base->getType();
1558     Value *F = CGM.getIntrinsic(Intrinsic::powi, ArgType);
1559     return RValue::get(Builder.CreateCall(F, {Base, Exponent}));
1560   }
1561 
1562   case Builtin::BI__builtin_isgreater:
1563   case Builtin::BI__builtin_isgreaterequal:
1564   case Builtin::BI__builtin_isless:
1565   case Builtin::BI__builtin_islessequal:
1566   case Builtin::BI__builtin_islessgreater:
1567   case Builtin::BI__builtin_isunordered: {
1568     // Ordered comparisons: we know the arguments to these are matching scalar
1569     // floating point values.
1570     Value *LHS = EmitScalarExpr(E->getArg(0));
1571     Value *RHS = EmitScalarExpr(E->getArg(1));
1572 
1573     switch (BuiltinID) {
1574     default: llvm_unreachable("Unknown ordered comparison");
1575     case Builtin::BI__builtin_isgreater:
1576       LHS = Builder.CreateFCmpOGT(LHS, RHS, "cmp");
1577       break;
1578     case Builtin::BI__builtin_isgreaterequal:
1579       LHS = Builder.CreateFCmpOGE(LHS, RHS, "cmp");
1580       break;
1581     case Builtin::BI__builtin_isless:
1582       LHS = Builder.CreateFCmpOLT(LHS, RHS, "cmp");
1583       break;
1584     case Builtin::BI__builtin_islessequal:
1585       LHS = Builder.CreateFCmpOLE(LHS, RHS, "cmp");
1586       break;
1587     case Builtin::BI__builtin_islessgreater:
1588       LHS = Builder.CreateFCmpONE(LHS, RHS, "cmp");
1589       break;
1590     case Builtin::BI__builtin_isunordered:
1591       LHS = Builder.CreateFCmpUNO(LHS, RHS, "cmp");
1592       break;
1593     }
1594     // ZExt bool to int type.
1595     return RValue::get(Builder.CreateZExt(LHS, ConvertType(E->getType())));
1596   }
1597   case Builtin::BI__builtin_isnan: {
1598     Value *V = EmitScalarExpr(E->getArg(0));
1599     V = Builder.CreateFCmpUNO(V, V, "cmp");
1600     return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType())));
1601   }
1602 
1603   case Builtin::BIfinite:
1604   case Builtin::BI__finite:
1605   case Builtin::BIfinitef:
1606   case Builtin::BI__finitef:
1607   case Builtin::BIfinitel:
1608   case Builtin::BI__finitel:
1609   case Builtin::BI__builtin_isinf:
1610   case Builtin::BI__builtin_isfinite: {
1611     // isinf(x)    --> fabs(x) == infinity
1612     // isfinite(x) --> fabs(x) != infinity
1613     // x != NaN via the ordered compare in either case.
1614     Value *V = EmitScalarExpr(E->getArg(0));
1615     Value *Fabs = EmitFAbs(*this, V);
1616     Constant *Infinity = ConstantFP::getInfinity(V->getType());
1617     CmpInst::Predicate Pred = (BuiltinID == Builtin::BI__builtin_isinf)
1618                                   ? CmpInst::FCMP_OEQ
1619                                   : CmpInst::FCMP_ONE;
1620     Value *FCmp = Builder.CreateFCmp(Pred, Fabs, Infinity, "cmpinf");
1621     return RValue::get(Builder.CreateZExt(FCmp, ConvertType(E->getType())));
1622   }
1623 
1624   case Builtin::BI__builtin_isinf_sign: {
1625     // isinf_sign(x) -> fabs(x) == infinity ? (signbit(x) ? -1 : 1) : 0
1626     Value *Arg = EmitScalarExpr(E->getArg(0));
1627     Value *AbsArg = EmitFAbs(*this, Arg);
1628     Value *IsInf = Builder.CreateFCmpOEQ(
1629         AbsArg, ConstantFP::getInfinity(Arg->getType()), "isinf");
1630     Value *IsNeg = EmitSignBit(*this, Arg);
1631 
1632     llvm::Type *IntTy = ConvertType(E->getType());
1633     Value *Zero = Constant::getNullValue(IntTy);
1634     Value *One = ConstantInt::get(IntTy, 1);
1635     Value *NegativeOne = ConstantInt::get(IntTy, -1);
1636     Value *SignResult = Builder.CreateSelect(IsNeg, NegativeOne, One);
1637     Value *Result = Builder.CreateSelect(IsInf, SignResult, Zero);
1638     return RValue::get(Result);
1639   }
1640 
1641   case Builtin::BI__builtin_isnormal: {
1642     // isnormal(x) --> x == x && fabsf(x) < infinity && fabsf(x) >= float_min
1643     Value *V = EmitScalarExpr(E->getArg(0));
1644     Value *Eq = Builder.CreateFCmpOEQ(V, V, "iseq");
1645 
1646     Value *Abs = EmitFAbs(*this, V);
1647     Value *IsLessThanInf =
1648       Builder.CreateFCmpULT(Abs, ConstantFP::getInfinity(V->getType()),"isinf");
1649     APFloat Smallest = APFloat::getSmallestNormalized(
1650                    getContext().getFloatTypeSemantics(E->getArg(0)->getType()));
1651     Value *IsNormal =
1652       Builder.CreateFCmpUGE(Abs, ConstantFP::get(V->getContext(), Smallest),
1653                             "isnormal");
1654     V = Builder.CreateAnd(Eq, IsLessThanInf, "and");
1655     V = Builder.CreateAnd(V, IsNormal, "and");
1656     return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType())));
1657   }
1658 
1659   case Builtin::BI__builtin_fpclassify: {
1660     Value *V = EmitScalarExpr(E->getArg(5));
1661     llvm::Type *Ty = ConvertType(E->getArg(5)->getType());
1662 
1663     // Create Result
1664     BasicBlock *Begin = Builder.GetInsertBlock();
1665     BasicBlock *End = createBasicBlock("fpclassify_end", this->CurFn);
1666     Builder.SetInsertPoint(End);
1667     PHINode *Result =
1668       Builder.CreatePHI(ConvertType(E->getArg(0)->getType()), 4,
1669                         "fpclassify_result");
1670 
1671     // if (V==0) return FP_ZERO
1672     Builder.SetInsertPoint(Begin);
1673     Value *IsZero = Builder.CreateFCmpOEQ(V, Constant::getNullValue(Ty),
1674                                           "iszero");
1675     Value *ZeroLiteral = EmitScalarExpr(E->getArg(4));
1676     BasicBlock *NotZero = createBasicBlock("fpclassify_not_zero", this->CurFn);
1677     Builder.CreateCondBr(IsZero, End, NotZero);
1678     Result->addIncoming(ZeroLiteral, Begin);
1679 
1680     // if (V != V) return FP_NAN
1681     Builder.SetInsertPoint(NotZero);
1682     Value *IsNan = Builder.CreateFCmpUNO(V, V, "cmp");
1683     Value *NanLiteral = EmitScalarExpr(E->getArg(0));
1684     BasicBlock *NotNan = createBasicBlock("fpclassify_not_nan", this->CurFn);
1685     Builder.CreateCondBr(IsNan, End, NotNan);
1686     Result->addIncoming(NanLiteral, NotZero);
1687 
1688     // if (fabs(V) == infinity) return FP_INFINITY
1689     Builder.SetInsertPoint(NotNan);
1690     Value *VAbs = EmitFAbs(*this, V);
1691     Value *IsInf =
1692       Builder.CreateFCmpOEQ(VAbs, ConstantFP::getInfinity(V->getType()),
1693                             "isinf");
1694     Value *InfLiteral = EmitScalarExpr(E->getArg(1));
1695     BasicBlock *NotInf = createBasicBlock("fpclassify_not_inf", this->CurFn);
1696     Builder.CreateCondBr(IsInf, End, NotInf);
1697     Result->addIncoming(InfLiteral, NotNan);
1698 
1699     // if (fabs(V) >= MIN_NORMAL) return FP_NORMAL else FP_SUBNORMAL
1700     Builder.SetInsertPoint(NotInf);
1701     APFloat Smallest = APFloat::getSmallestNormalized(
1702         getContext().getFloatTypeSemantics(E->getArg(5)->getType()));
1703     Value *IsNormal =
1704       Builder.CreateFCmpUGE(VAbs, ConstantFP::get(V->getContext(), Smallest),
1705                             "isnormal");
1706     Value *NormalResult =
1707       Builder.CreateSelect(IsNormal, EmitScalarExpr(E->getArg(2)),
1708                            EmitScalarExpr(E->getArg(3)));
1709     Builder.CreateBr(End);
1710     Result->addIncoming(NormalResult, NotInf);
1711 
1712     // return Result
1713     Builder.SetInsertPoint(End);
1714     return RValue::get(Result);
1715   }
1716 
1717   case Builtin::BIalloca:
1718   case Builtin::BI_alloca:
1719   case Builtin::BI__builtin_alloca: {
1720     Value *Size = EmitScalarExpr(E->getArg(0));
1721     const TargetInfo &TI = getContext().getTargetInfo();
1722     // The alignment of the alloca should correspond to __BIGGEST_ALIGNMENT__.
1723     unsigned SuitableAlignmentInBytes =
1724         CGM.getContext()
1725             .toCharUnitsFromBits(TI.getSuitableAlign())
1726             .getQuantity();
1727     AllocaInst *AI = Builder.CreateAlloca(Builder.getInt8Ty(), Size);
1728     AI->setAlignment(SuitableAlignmentInBytes);
1729     return RValue::get(AI);
1730   }
1731 
1732   case Builtin::BI__builtin_alloca_with_align: {
1733     Value *Size = EmitScalarExpr(E->getArg(0));
1734     Value *AlignmentInBitsValue = EmitScalarExpr(E->getArg(1));
1735     auto *AlignmentInBitsCI = cast<ConstantInt>(AlignmentInBitsValue);
1736     unsigned AlignmentInBits = AlignmentInBitsCI->getZExtValue();
1737     unsigned AlignmentInBytes =
1738         CGM.getContext().toCharUnitsFromBits(AlignmentInBits).getQuantity();
1739     AllocaInst *AI = Builder.CreateAlloca(Builder.getInt8Ty(), Size);
1740     AI->setAlignment(AlignmentInBytes);
1741     return RValue::get(AI);
1742   }
1743 
1744   case Builtin::BIbzero:
1745   case Builtin::BI__builtin_bzero: {
1746     Address Dest = EmitPointerWithAlignment(E->getArg(0));
1747     Value *SizeVal = EmitScalarExpr(E->getArg(1));
1748     EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
1749                         E->getArg(0)->getExprLoc(), FD, 0);
1750     Builder.CreateMemSet(Dest, Builder.getInt8(0), SizeVal, false);
1751     return RValue::get(nullptr);
1752   }
1753   case Builtin::BImemcpy:
1754   case Builtin::BI__builtin_memcpy: {
1755     Address Dest = EmitPointerWithAlignment(E->getArg(0));
1756     Address Src = EmitPointerWithAlignment(E->getArg(1));
1757     Value *SizeVal = EmitScalarExpr(E->getArg(2));
1758     EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
1759                         E->getArg(0)->getExprLoc(), FD, 0);
1760     EmitNonNullArgCheck(RValue::get(Src.getPointer()), E->getArg(1)->getType(),
1761                         E->getArg(1)->getExprLoc(), FD, 1);
1762     Builder.CreateMemCpy(Dest, Src, SizeVal, false);
1763     return RValue::get(Dest.getPointer());
1764   }
1765 
1766   case Builtin::BI__builtin_char_memchr:
1767     BuiltinID = Builtin::BI__builtin_memchr;
1768     break;
1769 
1770   case Builtin::BI__builtin___memcpy_chk: {
1771     // fold __builtin_memcpy_chk(x, y, cst1, cst2) to memcpy iff cst1<=cst2.
1772     llvm::APSInt Size, DstSize;
1773     if (!E->getArg(2)->EvaluateAsInt(Size, CGM.getContext()) ||
1774         !E->getArg(3)->EvaluateAsInt(DstSize, CGM.getContext()))
1775       break;
1776     if (Size.ugt(DstSize))
1777       break;
1778     Address Dest = EmitPointerWithAlignment(E->getArg(0));
1779     Address Src = EmitPointerWithAlignment(E->getArg(1));
1780     Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size);
1781     Builder.CreateMemCpy(Dest, Src, SizeVal, false);
1782     return RValue::get(Dest.getPointer());
1783   }
1784 
1785   case Builtin::BI__builtin_objc_memmove_collectable: {
1786     Address DestAddr = EmitPointerWithAlignment(E->getArg(0));
1787     Address SrcAddr = EmitPointerWithAlignment(E->getArg(1));
1788     Value *SizeVal = EmitScalarExpr(E->getArg(2));
1789     CGM.getObjCRuntime().EmitGCMemmoveCollectable(*this,
1790                                                   DestAddr, SrcAddr, SizeVal);
1791     return RValue::get(DestAddr.getPointer());
1792   }
1793 
1794   case Builtin::BI__builtin___memmove_chk: {
1795     // fold __builtin_memmove_chk(x, y, cst1, cst2) to memmove iff cst1<=cst2.
1796     llvm::APSInt Size, DstSize;
1797     if (!E->getArg(2)->EvaluateAsInt(Size, CGM.getContext()) ||
1798         !E->getArg(3)->EvaluateAsInt(DstSize, CGM.getContext()))
1799       break;
1800     if (Size.ugt(DstSize))
1801       break;
1802     Address Dest = EmitPointerWithAlignment(E->getArg(0));
1803     Address Src = EmitPointerWithAlignment(E->getArg(1));
1804     Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size);
1805     Builder.CreateMemMove(Dest, Src, SizeVal, false);
1806     return RValue::get(Dest.getPointer());
1807   }
1808 
1809   case Builtin::BImemmove:
1810   case Builtin::BI__builtin_memmove: {
1811     Address Dest = EmitPointerWithAlignment(E->getArg(0));
1812     Address Src = EmitPointerWithAlignment(E->getArg(1));
1813     Value *SizeVal = EmitScalarExpr(E->getArg(2));
1814     EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
1815                         E->getArg(0)->getExprLoc(), FD, 0);
1816     EmitNonNullArgCheck(RValue::get(Src.getPointer()), E->getArg(1)->getType(),
1817                         E->getArg(1)->getExprLoc(), FD, 1);
1818     Builder.CreateMemMove(Dest, Src, SizeVal, false);
1819     return RValue::get(Dest.getPointer());
1820   }
1821   case Builtin::BImemset:
1822   case Builtin::BI__builtin_memset: {
1823     Address Dest = EmitPointerWithAlignment(E->getArg(0));
1824     Value *ByteVal = Builder.CreateTrunc(EmitScalarExpr(E->getArg(1)),
1825                                          Builder.getInt8Ty());
1826     Value *SizeVal = EmitScalarExpr(E->getArg(2));
1827     EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
1828                         E->getArg(0)->getExprLoc(), FD, 0);
1829     Builder.CreateMemSet(Dest, ByteVal, SizeVal, false);
1830     return RValue::get(Dest.getPointer());
1831   }
1832   case Builtin::BI__builtin___memset_chk: {
1833     // fold __builtin_memset_chk(x, y, cst1, cst2) to memset iff cst1<=cst2.
1834     llvm::APSInt Size, DstSize;
1835     if (!E->getArg(2)->EvaluateAsInt(Size, CGM.getContext()) ||
1836         !E->getArg(3)->EvaluateAsInt(DstSize, CGM.getContext()))
1837       break;
1838     if (Size.ugt(DstSize))
1839       break;
1840     Address Dest = EmitPointerWithAlignment(E->getArg(0));
1841     Value *ByteVal = Builder.CreateTrunc(EmitScalarExpr(E->getArg(1)),
1842                                          Builder.getInt8Ty());
1843     Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size);
1844     Builder.CreateMemSet(Dest, ByteVal, SizeVal, false);
1845     return RValue::get(Dest.getPointer());
1846   }
  case Builtin::BI__builtin_wmemcmp: {
    // The MSVC runtime library does not provide a definition of wmemcmp, so we
    // need an inline implementation. On every other target we leave the
    // switch without emitting anything here.
    if (!getTarget().getTriple().isOSMSVCRT())
      break;

    llvm::Type *WCharTy = ConvertType(getContext().WCharTy);

    Value *Dst = EmitScalarExpr(E->getArg(0));
    Value *Src = EmitScalarExpr(E->getArg(1));
    Value *Size = EmitScalarExpr(E->getArg(2));

    // Emitted control flow:
    //   entry:        size == 0 ? exit : wmemcmp.gt
    //   wmemcmp.gt:   load *dst, *src; dst > src (unsigned) ? exit : wmemcmp.lt
    //   wmemcmp.lt:   dst < src (unsigned) ? exit : wmemcmp.next
    //   wmemcmp.next: advance both pointers, decrement size;
    //                 size == 0 ? exit : wmemcmp.gt
    //   wmemcmp.exit: PHI over the four predecessors yields 0 / 1 / -1 / 0.
    BasicBlock *Entry = Builder.GetInsertBlock();
    BasicBlock *CmpGT = createBasicBlock("wmemcmp.gt");
    BasicBlock *CmpLT = createBasicBlock("wmemcmp.lt");
    BasicBlock *Next = createBasicBlock("wmemcmp.next");
    BasicBlock *Exit = createBasicBlock("wmemcmp.exit");
    Value *SizeEq0 = Builder.CreateICmpEQ(Size, ConstantInt::get(SizeTy, 0));
    Builder.CreateCondBr(SizeEq0, Exit, CmpGT);

    // Loop header: PHIs carry the current pointers and the remaining count.
    // Their back-edge operands are added once the "next" block is emitted.
    EmitBlock(CmpGT);
    PHINode *DstPhi = Builder.CreatePHI(Dst->getType(), 2);
    DstPhi->addIncoming(Dst, Entry);
    PHINode *SrcPhi = Builder.CreatePHI(Src->getType(), 2);
    SrcPhi->addIncoming(Src, Entry);
    PHINode *SizePhi = Builder.CreatePHI(SizeTy, 2);
    SizePhi->addIncoming(Size, Entry);
    CharUnits WCharAlign =
        getContext().getTypeAlignInChars(getContext().WCharTy);
    Value *DstCh = Builder.CreateAlignedLoad(WCharTy, DstPhi, WCharAlign);
    Value *SrcCh = Builder.CreateAlignedLoad(WCharTy, SrcPhi, WCharAlign);
    // Elements are compared as unsigned integers.
    Value *DstGtSrc = Builder.CreateICmpUGT(DstCh, SrcCh);
    Builder.CreateCondBr(DstGtSrc, Exit, CmpLT);

    EmitBlock(CmpLT);
    Value *DstLtSrc = Builder.CreateICmpULT(DstCh, SrcCh);
    Builder.CreateCondBr(DstLtSrc, Exit, Next);

    // Elements were equal: step to the next element, or finish if the count
    // has reached zero.
    EmitBlock(Next);
    Value *NextDst = Builder.CreateConstInBoundsGEP1_32(WCharTy, DstPhi, 1);
    Value *NextSrc = Builder.CreateConstInBoundsGEP1_32(WCharTy, SrcPhi, 1);
    Value *NextSize = Builder.CreateSub(SizePhi, ConstantInt::get(SizeTy, 1));
    Value *NextSizeEq0 =
        Builder.CreateICmpEQ(NextSize, ConstantInt::get(SizeTy, 0));
    Builder.CreateCondBr(NextSizeEq0, Exit, CmpGT);
    DstPhi->addIncoming(NextDst, Next);
    SrcPhi->addIncoming(NextSrc, Next);
    SizePhi->addIncoming(NextSize, Next);

    // Result: 0 when the size was zero or all elements matched, 1 when the
    // first differing element of dst is greater, -1 when it is smaller.
    EmitBlock(Exit);
    PHINode *Ret = Builder.CreatePHI(IntTy, 4);
    Ret->addIncoming(ConstantInt::get(IntTy, 0), Entry);
    Ret->addIncoming(ConstantInt::get(IntTy, 1), CmpGT);
    Ret->addIncoming(ConstantInt::get(IntTy, -1), CmpLT);
    Ret->addIncoming(ConstantInt::get(IntTy, 0), Next);
    return RValue::get(Ret);
  }
1904   case Builtin::BI__builtin_dwarf_cfa: {
1905     // The offset in bytes from the first argument to the CFA.
1906     //
1907     // Why on earth is this in the frontend?  Is there any reason at
1908     // all that the backend can't reasonably determine this while
1909     // lowering llvm.eh.dwarf.cfa()?
1910     //
1911     // TODO: If there's a satisfactory reason, add a target hook for
1912     // this instead of hard-coding 0, which is correct for most targets.
1913     int32_t Offset = 0;
1914 
1915     Value *F = CGM.getIntrinsic(Intrinsic::eh_dwarf_cfa);
1916     return RValue::get(Builder.CreateCall(F,
1917                                       llvm::ConstantInt::get(Int32Ty, Offset)));
1918   }
1919   case Builtin::BI__builtin_return_address: {
1920     Value *Depth = ConstantEmitter(*this).emitAbstract(E->getArg(0),
1921                                                    getContext().UnsignedIntTy);
1922     Value *F = CGM.getIntrinsic(Intrinsic::returnaddress);
1923     return RValue::get(Builder.CreateCall(F, Depth));
1924   }
1925   case Builtin::BI_ReturnAddress: {
1926     Value *F = CGM.getIntrinsic(Intrinsic::returnaddress);
1927     return RValue::get(Builder.CreateCall(F, Builder.getInt32(0)));
1928   }
1929   case Builtin::BI__builtin_frame_address: {
1930     Value *Depth = ConstantEmitter(*this).emitAbstract(E->getArg(0),
1931                                                    getContext().UnsignedIntTy);
1932     Value *F = CGM.getIntrinsic(Intrinsic::frameaddress);
1933     return RValue::get(Builder.CreateCall(F, Depth));
1934   }
1935   case Builtin::BI__builtin_extract_return_addr: {
1936     Value *Address = EmitScalarExpr(E->getArg(0));
1937     Value *Result = getTargetHooks().decodeReturnAddress(*this, Address);
1938     return RValue::get(Result);
1939   }
1940   case Builtin::BI__builtin_frob_return_addr: {
1941     Value *Address = EmitScalarExpr(E->getArg(0));
1942     Value *Result = getTargetHooks().encodeReturnAddress(*this, Address);
1943     return RValue::get(Result);
1944   }
1945   case Builtin::BI__builtin_dwarf_sp_column: {
1946     llvm::IntegerType *Ty
1947       = cast<llvm::IntegerType>(ConvertType(E->getType()));
1948     int Column = getTargetHooks().getDwarfEHStackPointer(CGM);
1949     if (Column == -1) {
1950       CGM.ErrorUnsupported(E, "__builtin_dwarf_sp_column");
1951       return RValue::get(llvm::UndefValue::get(Ty));
1952     }
1953     return RValue::get(llvm::ConstantInt::get(Ty, Column, true));
1954   }
1955   case Builtin::BI__builtin_init_dwarf_reg_size_table: {
1956     Value *Address = EmitScalarExpr(E->getArg(0));
1957     if (getTargetHooks().initDwarfEHRegSizeTable(*this, Address))
1958       CGM.ErrorUnsupported(E, "__builtin_init_dwarf_reg_size_table");
1959     return RValue::get(llvm::UndefValue::get(ConvertType(E->getType())));
1960   }
1961   case Builtin::BI__builtin_eh_return: {
1962     Value *Int = EmitScalarExpr(E->getArg(0));
1963     Value *Ptr = EmitScalarExpr(E->getArg(1));
1964 
1965     llvm::IntegerType *IntTy = cast<llvm::IntegerType>(Int->getType());
1966     assert((IntTy->getBitWidth() == 32 || IntTy->getBitWidth() == 64) &&
1967            "LLVM's __builtin_eh_return only supports 32- and 64-bit variants");
1968     Value *F = CGM.getIntrinsic(IntTy->getBitWidth() == 32
1969                                   ? Intrinsic::eh_return_i32
1970                                   : Intrinsic::eh_return_i64);
1971     Builder.CreateCall(F, {Int, Ptr});
1972     Builder.CreateUnreachable();
1973 
1974     // We do need to preserve an insertion point.
1975     EmitBlock(createBasicBlock("builtin_eh_return.cont"));
1976 
1977     return RValue::get(nullptr);
1978   }
1979   case Builtin::BI__builtin_unwind_init: {
1980     Value *F = CGM.getIntrinsic(Intrinsic::eh_unwind_init);
1981     return RValue::get(Builder.CreateCall(F));
1982   }
1983   case Builtin::BI__builtin_extend_pointer: {
1984     // Extends a pointer to the size of an _Unwind_Word, which is
1985     // uint64_t on all platforms.  Generally this gets poked into a
1986     // register and eventually used as an address, so if the
1987     // addressing registers are wider than pointers and the platform
1988     // doesn't implicitly ignore high-order bits when doing
1989     // addressing, we need to make sure we zext / sext based on
1990     // the platform's expectations.
1991     //
1992     // See: http://gcc.gnu.org/ml/gcc-bugs/2002-02/msg00237.html
1993 
1994     // Cast the pointer to intptr_t.
1995     Value *Ptr = EmitScalarExpr(E->getArg(0));
1996     Value *Result = Builder.CreatePtrToInt(Ptr, IntPtrTy, "extend.cast");
1997 
1998     // If that's 64 bits, we're done.
1999     if (IntPtrTy->getBitWidth() == 64)
2000       return RValue::get(Result);
2001 
2002     // Otherwise, ask the codegen data what to do.
2003     if (getTargetHooks().extendPointerWithSExt())
2004       return RValue::get(Builder.CreateSExt(Result, Int64Ty, "extend.sext"));
2005     else
2006       return RValue::get(Builder.CreateZExt(Result, Int64Ty, "extend.zext"));
2007   }
2008   case Builtin::BI__builtin_setjmp: {
2009     // Buffer is a void**.
2010     Address Buf = EmitPointerWithAlignment(E->getArg(0));
2011 
2012     // Store the frame pointer to the setjmp buffer.
2013     Value *FrameAddr =
2014       Builder.CreateCall(CGM.getIntrinsic(Intrinsic::frameaddress),
2015                          ConstantInt::get(Int32Ty, 0));
2016     Builder.CreateStore(FrameAddr, Buf);
2017 
2018     // Store the stack pointer to the setjmp buffer.
2019     Value *StackAddr =
2020         Builder.CreateCall(CGM.getIntrinsic(Intrinsic::stacksave));
2021     Address StackSaveSlot =
2022       Builder.CreateConstInBoundsGEP(Buf, 2, getPointerSize());
2023     Builder.CreateStore(StackAddr, StackSaveSlot);
2024 
2025     // Call LLVM's EH setjmp, which is lightweight.
2026     Value *F = CGM.getIntrinsic(Intrinsic::eh_sjlj_setjmp);
2027     Buf = Builder.CreateBitCast(Buf, Int8PtrTy);
2028     return RValue::get(Builder.CreateCall(F, Buf.getPointer()));
2029   }
2030   case Builtin::BI__builtin_longjmp: {
2031     Value *Buf = EmitScalarExpr(E->getArg(0));
2032     Buf = Builder.CreateBitCast(Buf, Int8PtrTy);
2033 
2034     // Call LLVM's EH longjmp, which is lightweight.
2035     Builder.CreateCall(CGM.getIntrinsic(Intrinsic::eh_sjlj_longjmp), Buf);
2036 
2037     // longjmp doesn't return; mark this as unreachable.
2038     Builder.CreateUnreachable();
2039 
2040     // We do need to preserve an insertion point.
2041     EmitBlock(createBasicBlock("longjmp.cont"));
2042 
2043     return RValue::get(nullptr);
2044   }
  // Size-agnostic __sync_* names (no _1/_2/_4/_8/_16 suffix). Per the
  // diagnostic string below, these are not expected to survive Sema, so
  // reaching any of them in codegen is treated as an internal error.
  case Builtin::BI__sync_fetch_and_add:
  case Builtin::BI__sync_fetch_and_sub:
  case Builtin::BI__sync_fetch_and_or:
  case Builtin::BI__sync_fetch_and_and:
  case Builtin::BI__sync_fetch_and_xor:
  case Builtin::BI__sync_fetch_and_nand:
  case Builtin::BI__sync_add_and_fetch:
  case Builtin::BI__sync_sub_and_fetch:
  case Builtin::BI__sync_and_and_fetch:
  case Builtin::BI__sync_or_and_fetch:
  case Builtin::BI__sync_xor_and_fetch:
  case Builtin::BI__sync_nand_and_fetch:
  case Builtin::BI__sync_val_compare_and_swap:
  case Builtin::BI__sync_bool_compare_and_swap:
  case Builtin::BI__sync_lock_test_and_set:
  case Builtin::BI__sync_lock_release:
  case Builtin::BI__sync_swap:
    llvm_unreachable("Shouldn't make it through sema");
  // __sync_fetch_and_<op>_N: each size-suffixed group is forwarded to
  // EmitBinaryAtomic with the matching AtomicRMWInst operation. The helper is
  // defined outside this excerpt; judging by the builtin names these yield
  // the value held before the operation -- confirm against the helper.
  case Builtin::BI__sync_fetch_and_add_1:
  case Builtin::BI__sync_fetch_and_add_2:
  case Builtin::BI__sync_fetch_and_add_4:
  case Builtin::BI__sync_fetch_and_add_8:
  case Builtin::BI__sync_fetch_and_add_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Add, E);
  case Builtin::BI__sync_fetch_and_sub_1:
  case Builtin::BI__sync_fetch_and_sub_2:
  case Builtin::BI__sync_fetch_and_sub_4:
  case Builtin::BI__sync_fetch_and_sub_8:
  case Builtin::BI__sync_fetch_and_sub_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Sub, E);
  case Builtin::BI__sync_fetch_and_or_1:
  case Builtin::BI__sync_fetch_and_or_2:
  case Builtin::BI__sync_fetch_and_or_4:
  case Builtin::BI__sync_fetch_and_or_8:
  case Builtin::BI__sync_fetch_and_or_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Or, E);
  case Builtin::BI__sync_fetch_and_and_1:
  case Builtin::BI__sync_fetch_and_and_2:
  case Builtin::BI__sync_fetch_and_and_4:
  case Builtin::BI__sync_fetch_and_and_8:
  case Builtin::BI__sync_fetch_and_and_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::And, E);
  case Builtin::BI__sync_fetch_and_xor_1:
  case Builtin::BI__sync_fetch_and_xor_2:
  case Builtin::BI__sync_fetch_and_xor_4:
  case Builtin::BI__sync_fetch_and_xor_8:
  case Builtin::BI__sync_fetch_and_xor_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xor, E);
  case Builtin::BI__sync_fetch_and_nand_1:
  case Builtin::BI__sync_fetch_and_nand_2:
  case Builtin::BI__sync_fetch_and_nand_4:
  case Builtin::BI__sync_fetch_and_nand_8:
  case Builtin::BI__sync_fetch_and_nand_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Nand, E);

  // Clang extensions: not overloaded yet, so each has a single case label
  // rather than a group of size-suffixed variants.
  case Builtin::BI__sync_fetch_and_min:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Min, E);
  case Builtin::BI__sync_fetch_and_max:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Max, E);
  case Builtin::BI__sync_fetch_and_umin:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::UMin, E);
  case Builtin::BI__sync_fetch_and_umax:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::UMax, E);
2109 
  // __sync_<op>_and_fetch_N: each size-suffixed group is forwarded to
  // EmitBinaryAtomicPost with an AtomicRMWInst operation plus an ordinary
  // Instruction opcode. The helper is defined outside this excerpt;
  // presumably the opcode is used to recompute the post-operation value from
  // the atomicrmw result -- confirm against the helper.
  case Builtin::BI__sync_add_and_fetch_1:
  case Builtin::BI__sync_add_and_fetch_2:
  case Builtin::BI__sync_add_and_fetch_4:
  case Builtin::BI__sync_add_and_fetch_8:
  case Builtin::BI__sync_add_and_fetch_16:
    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Add, E,
                                llvm::Instruction::Add);
  case Builtin::BI__sync_sub_and_fetch_1:
  case Builtin::BI__sync_sub_and_fetch_2:
  case Builtin::BI__sync_sub_and_fetch_4:
  case Builtin::BI__sync_sub_and_fetch_8:
  case Builtin::BI__sync_sub_and_fetch_16:
    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Sub, E,
                                llvm::Instruction::Sub);
  case Builtin::BI__sync_and_and_fetch_1:
  case Builtin::BI__sync_and_and_fetch_2:
  case Builtin::BI__sync_and_and_fetch_4:
  case Builtin::BI__sync_and_and_fetch_8:
  case Builtin::BI__sync_and_and_fetch_16:
    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::And, E,
                                llvm::Instruction::And);
  case Builtin::BI__sync_or_and_fetch_1:
  case Builtin::BI__sync_or_and_fetch_2:
  case Builtin::BI__sync_or_and_fetch_4:
  case Builtin::BI__sync_or_and_fetch_8:
  case Builtin::BI__sync_or_and_fetch_16:
    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Or, E,
                                llvm::Instruction::Or);
  case Builtin::BI__sync_xor_and_fetch_1:
  case Builtin::BI__sync_xor_and_fetch_2:
  case Builtin::BI__sync_xor_and_fetch_4:
  case Builtin::BI__sync_xor_and_fetch_8:
  case Builtin::BI__sync_xor_and_fetch_16:
    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Xor, E,
                                llvm::Instruction::Xor);
  // Nand is the only group that pairs a different opcode (And) with its
  // atomic operation and passes an extra 'true' argument.
  // NOTE(review): the flag presumably requests inversion of the recomputed
  // result -- confirm against EmitBinaryAtomicPost's signature.
  case Builtin::BI__sync_nand_and_fetch_1:
  case Builtin::BI__sync_nand_and_fetch_2:
  case Builtin::BI__sync_nand_and_fetch_4:
  case Builtin::BI__sync_nand_and_fetch_8:
  case Builtin::BI__sync_nand_and_fetch_16:
    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Nand, E,
                                llvm::Instruction::And, true);
2152 
  // __sync_val_compare_and_swap: every width goes through
  // MakeAtomicCmpXchgValue with its final flag set to false.
  // NOTE(review): the flag presumably selects between returning the previous
  // value (false) and returning a success boolean (true) -- confirm against
  // MakeAtomicCmpXchgValue.
  case Builtin::BI__sync_val_compare_and_swap_1:
  case Builtin::BI__sync_val_compare_and_swap_2:
  case Builtin::BI__sync_val_compare_and_swap_4:
  case Builtin::BI__sync_val_compare_and_swap_8:
  case Builtin::BI__sync_val_compare_and_swap_16:
    return RValue::get(MakeAtomicCmpXchgValue(*this, E, false));

  // __sync_bool_compare_and_swap: same helper, with the final flag set to
  // true.
  case Builtin::BI__sync_bool_compare_and_swap_1:
  case Builtin::BI__sync_bool_compare_and_swap_2:
  case Builtin::BI__sync_bool_compare_and_swap_4:
  case Builtin::BI__sync_bool_compare_and_swap_8:
  case Builtin::BI__sync_bool_compare_and_swap_16:
    return RValue::get(MakeAtomicCmpXchgValue(*this, E, true));
2166 
  // __sync_swap: every width forwards to EmitBinaryAtomic with the Xchg
  // (exchange) RMW operation.
  case Builtin::BI__sync_swap_1:
  case Builtin::BI__sync_swap_2:
  case Builtin::BI__sync_swap_4:
  case Builtin::BI__sync_swap_8:
  case Builtin::BI__sync_swap_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xchg, E);

  // __sync_lock_test_and_set: lowered exactly like __sync_swap above, i.e. as
  // an Xchg RMW operation through EmitBinaryAtomic.
  case Builtin::BI__sync_lock_test_and_set_1:
  case Builtin::BI__sync_lock_test_and_set_2:
  case Builtin::BI__sync_lock_test_and_set_4:
  case Builtin::BI__sync_lock_test_and_set_8:
  case Builtin::BI__sync_lock_test_and_set_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xchg, E);
2180 
2181   case Builtin::BI__sync_lock_release_1:
2182   case Builtin::BI__sync_lock_release_2:
2183   case Builtin::BI__sync_lock_release_4:
2184   case Builtin::BI__sync_lock_release_8:
2185   case Builtin::BI__sync_lock_release_16: {
2186     Value *Ptr = EmitScalarExpr(E->getArg(0));
2187     QualType ElTy = E->getArg(0)->getType()->getPointeeType();
2188     CharUnits StoreSize = getContext().getTypeSizeInChars(ElTy);
2189     llvm::Type *ITy = llvm::IntegerType::get(getLLVMContext(),
2190                                              StoreSize.getQuantity() * 8);
2191     Ptr = Builder.CreateBitCast(Ptr, ITy->getPointerTo());
2192     llvm::StoreInst *Store =
2193       Builder.CreateAlignedStore(llvm::Constant::getNullValue(ITy), Ptr,
2194                                  StoreSize);
2195     Store->setAtomic(llvm::AtomicOrdering::Release);
2196     return RValue::get(nullptr);
2197   }
2198 
  case Builtin::BI__sync_synchronize: {
    // We assume this is supposed to correspond to a C++11-style
    // sequentially-consistent fence (i.e. this is only usable for
    // synchronization, not device I/O or anything like that). This intrinsic
    // is really badly designed in the sense that, in theory, there isn't
    // any way to use it safely... but in practice, it mostly works
    // to use it with non-atomic loads and stores to get acquire/release
    // semantics.
    //
    // Lowered to a single sequentially-consistent fence; the builtin produces
    // no value.
    Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent);
    return RValue::get(nullptr);
  }
2210 
  // Non-temporal load/store builtins: lowering is delegated entirely to the
  // EmitNontemporalLoad / EmitNontemporalStore helpers.
  case Builtin::BI__builtin_nontemporal_load:
    return RValue::get(EmitNontemporalLoad(*this, E));
  case Builtin::BI__builtin_nontemporal_store:
    return RValue::get(EmitNontemporalStore(*this, E));
2215   case Builtin::BI__c11_atomic_is_lock_free:
2216   case Builtin::BI__atomic_is_lock_free: {
2217     // Call "bool __atomic_is_lock_free(size_t size, void *ptr)". For the
2218     // __c11 builtin, ptr is 0 (indicating a properly-aligned object), since
2219     // _Atomic(T) is always properly-aligned.
2220     const char *LibCallName = "__atomic_is_lock_free";
2221     CallArgList Args;
2222     Args.add(RValue::get(EmitScalarExpr(E->getArg(0))),
2223              getContext().getSizeType());
2224     if (BuiltinID == Builtin::BI__atomic_is_lock_free)
2225       Args.add(RValue::get(EmitScalarExpr(E->getArg(1))),
2226                getContext().VoidPtrTy);
2227     else
2228       Args.add(RValue::get(llvm::Constant::getNullValue(VoidPtrTy)),
2229                getContext().VoidPtrTy);
2230     const CGFunctionInfo &FuncInfo =
2231         CGM.getTypes().arrangeBuiltinFunctionCall(E->getType(), Args);
2232     llvm::FunctionType *FTy = CGM.getTypes().GetFunctionType(FuncInfo);
2233     llvm::Constant *Func = CGM.CreateRuntimeFunction(FTy, LibCallName);
2234     return EmitCall(FuncInfo, CGCallee::forDirect(Func),
2235                     ReturnValueSlot(), Args);
2236   }
2237 
2238   case Builtin::BI__atomic_test_and_set: {
2239     // Look at the argument type to determine whether this is a volatile
2240     // operation. The parameter type is always volatile.
2241     QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
2242     bool Volatile =
2243         PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
2244 
2245     Value *Ptr = EmitScalarExpr(E->getArg(0));
2246     unsigned AddrSpace = Ptr->getType()->getPointerAddressSpace();
2247     Ptr = Builder.CreateBitCast(Ptr, Int8Ty->getPointerTo(AddrSpace));
2248     Value *NewVal = Builder.getInt8(1);
2249     Value *Order = EmitScalarExpr(E->getArg(1));
2250     if (isa<llvm::ConstantInt>(Order)) {
2251       int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
2252       AtomicRMWInst *Result = nullptr;
2253       switch (ord) {
2254       case 0:  // memory_order_relaxed
2255       default: // invalid order
2256         Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
2257                                          llvm::AtomicOrdering::Monotonic);
2258         break;
2259       case 1: // memory_order_consume
2260       case 2: // memory_order_acquire
2261         Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
2262                                          llvm::AtomicOrdering::Acquire);
2263         break;
2264       case 3: // memory_order_release
2265         Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
2266                                          llvm::AtomicOrdering::Release);
2267         break;
2268       case 4: // memory_order_acq_rel
2269 
2270         Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
2271                                          llvm::AtomicOrdering::AcquireRelease);
2272         break;
2273       case 5: // memory_order_seq_cst
2274         Result = Builder.CreateAtomicRMW(
2275             llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
2276             llvm::AtomicOrdering::SequentiallyConsistent);
2277         break;
2278       }
2279       Result->setVolatile(Volatile);
2280       return RValue::get(Builder.CreateIsNotNull(Result, "tobool"));
2281     }
2282 
2283     llvm::BasicBlock *ContBB = createBasicBlock("atomic.continue", CurFn);
2284 
2285     llvm::BasicBlock *BBs[5] = {
2286       createBasicBlock("monotonic", CurFn),
2287       createBasicBlock("acquire", CurFn),
2288       createBasicBlock("release", CurFn),
2289       createBasicBlock("acqrel", CurFn),
2290       createBasicBlock("seqcst", CurFn)
2291     };
2292     llvm::AtomicOrdering Orders[5] = {
2293         llvm::AtomicOrdering::Monotonic, llvm::AtomicOrdering::Acquire,
2294         llvm::AtomicOrdering::Release, llvm::AtomicOrdering::AcquireRelease,
2295         llvm::AtomicOrdering::SequentiallyConsistent};
2296 
2297     Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
2298     llvm::SwitchInst *SI = Builder.CreateSwitch(Order, BBs[0]);
2299 
2300     Builder.SetInsertPoint(ContBB);
2301     PHINode *Result = Builder.CreatePHI(Int8Ty, 5, "was_set");
2302 
2303     for (unsigned i = 0; i < 5; ++i) {
2304       Builder.SetInsertPoint(BBs[i]);
2305       AtomicRMWInst *RMW = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg,
2306                                                    Ptr, NewVal, Orders[i]);
2307       RMW->setVolatile(Volatile);
2308       Result->addIncoming(RMW, BBs[i]);
2309       Builder.CreateBr(ContBB);
2310     }
2311 
2312     SI->addCase(Builder.getInt32(0), BBs[0]);
2313     SI->addCase(Builder.getInt32(1), BBs[1]);
2314     SI->addCase(Builder.getInt32(2), BBs[1]);
2315     SI->addCase(Builder.getInt32(3), BBs[2]);
2316     SI->addCase(Builder.getInt32(4), BBs[3]);
2317     SI->addCase(Builder.getInt32(5), BBs[4]);
2318 
2319     Builder.SetInsertPoint(ContBB);
2320     return RValue::get(Builder.CreateIsNotNull(Result, "tobool"));
2321   }
2322 
2323   case Builtin::BI__atomic_clear: {
2324     QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
2325     bool Volatile =
2326         PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
2327 
2328     Address Ptr = EmitPointerWithAlignment(E->getArg(0));
2329     unsigned AddrSpace = Ptr.getPointer()->getType()->getPointerAddressSpace();
2330     Ptr = Builder.CreateBitCast(Ptr, Int8Ty->getPointerTo(AddrSpace));
2331     Value *NewVal = Builder.getInt8(0);
2332     Value *Order = EmitScalarExpr(E->getArg(1));
2333     if (isa<llvm::ConstantInt>(Order)) {
2334       int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
2335       StoreInst *Store = Builder.CreateStore(NewVal, Ptr, Volatile);
2336       switch (ord) {
2337       case 0:  // memory_order_relaxed
2338       default: // invalid order
2339         Store->setOrdering(llvm::AtomicOrdering::Monotonic);
2340         break;
2341       case 3:  // memory_order_release
2342         Store->setOrdering(llvm::AtomicOrdering::Release);
2343         break;
2344       case 5:  // memory_order_seq_cst
2345         Store->setOrdering(llvm::AtomicOrdering::SequentiallyConsistent);
2346         break;
2347       }
2348       return RValue::get(nullptr);
2349     }
2350 
2351     llvm::BasicBlock *ContBB = createBasicBlock("atomic.continue", CurFn);
2352 
2353     llvm::BasicBlock *BBs[3] = {
2354       createBasicBlock("monotonic", CurFn),
2355       createBasicBlock("release", CurFn),
2356       createBasicBlock("seqcst", CurFn)
2357     };
2358     llvm::AtomicOrdering Orders[3] = {
2359         llvm::AtomicOrdering::Monotonic, llvm::AtomicOrdering::Release,
2360         llvm::AtomicOrdering::SequentiallyConsistent};
2361 
2362     Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
2363     llvm::SwitchInst *SI = Builder.CreateSwitch(Order, BBs[0]);
2364 
2365     for (unsigned i = 0; i < 3; ++i) {
2366       Builder.SetInsertPoint(BBs[i]);
2367       StoreInst *Store = Builder.CreateStore(NewVal, Ptr, Volatile);
2368       Store->setOrdering(Orders[i]);
2369       Builder.CreateBr(ContBB);
2370     }
2371 
2372     SI->addCase(Builder.getInt32(0), BBs[0]);
2373     SI->addCase(Builder.getInt32(3), BBs[1]);
2374     SI->addCase(Builder.getInt32(5), BBs[2]);
2375 
2376     Builder.SetInsertPoint(ContBB);
2377     return RValue::get(nullptr);
2378   }
2379 
2380   case Builtin::BI__atomic_thread_fence:
2381   case Builtin::BI__atomic_signal_fence:
2382   case Builtin::BI__c11_atomic_thread_fence:
2383   case Builtin::BI__c11_atomic_signal_fence: {
2384     llvm::SyncScope::ID SSID;
2385     if (BuiltinID == Builtin::BI__atomic_signal_fence ||
2386         BuiltinID == Builtin::BI__c11_atomic_signal_fence)
2387       SSID = llvm::SyncScope::SingleThread;
2388     else
2389       SSID = llvm::SyncScope::System;
2390     Value *Order = EmitScalarExpr(E->getArg(0));
2391     if (isa<llvm::ConstantInt>(Order)) {
2392       int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
2393       switch (ord) {
2394       case 0:  // memory_order_relaxed
2395       default: // invalid order
2396         break;
2397       case 1:  // memory_order_consume
2398       case 2:  // memory_order_acquire
2399         Builder.CreateFence(llvm::AtomicOrdering::Acquire, SSID);
2400         break;
2401       case 3:  // memory_order_release
2402         Builder.CreateFence(llvm::AtomicOrdering::Release, SSID);
2403         break;
2404       case 4:  // memory_order_acq_rel
2405         Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, SSID);
2406         break;
2407       case 5:  // memory_order_seq_cst
2408         Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, SSID);
2409         break;
2410       }
2411       return RValue::get(nullptr);
2412     }
2413 
2414     llvm::BasicBlock *AcquireBB, *ReleaseBB, *AcqRelBB, *SeqCstBB;
2415     AcquireBB = createBasicBlock("acquire", CurFn);
2416     ReleaseBB = createBasicBlock("release", CurFn);
2417     AcqRelBB = createBasicBlock("acqrel", CurFn);
2418     SeqCstBB = createBasicBlock("seqcst", CurFn);
2419     llvm::BasicBlock *ContBB = createBasicBlock("atomic.continue", CurFn);
2420 
2421     Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
2422     llvm::SwitchInst *SI = Builder.CreateSwitch(Order, ContBB);
2423 
2424     Builder.SetInsertPoint(AcquireBB);
2425     Builder.CreateFence(llvm::AtomicOrdering::Acquire, SSID);
2426     Builder.CreateBr(ContBB);
2427     SI->addCase(Builder.getInt32(1), AcquireBB);
2428     SI->addCase(Builder.getInt32(2), AcquireBB);
2429 
2430     Builder.SetInsertPoint(ReleaseBB);
2431     Builder.CreateFence(llvm::AtomicOrdering::Release, SSID);
2432     Builder.CreateBr(ContBB);
2433     SI->addCase(Builder.getInt32(3), ReleaseBB);
2434 
2435     Builder.SetInsertPoint(AcqRelBB);
2436     Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, SSID);
2437     Builder.CreateBr(ContBB);
2438     SI->addCase(Builder.getInt32(4), AcqRelBB);
2439 
2440     Builder.SetInsertPoint(SeqCstBB);
2441     Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, SSID);
2442     Builder.CreateBr(ContBB);
2443     SI->addCase(Builder.getInt32(5), SeqCstBB);
2444 
2445     Builder.SetInsertPoint(ContBB);
2446     return RValue::get(nullptr);
2447   }
2448 
2449   case Builtin::BI__builtin_signbit:
2450   case Builtin::BI__builtin_signbitf:
2451   case Builtin::BI__builtin_signbitl: {
2452     return RValue::get(
2453         Builder.CreateZExt(EmitSignBit(*this, EmitScalarExpr(E->getArg(0))),
2454                            ConvertType(E->getType())));
2455   }
  case Builtin::BI__annotation: {
    // Re-encode each wide string to UTF8 and make an MDString.
    SmallVector<Metadata *, 1> Strings;
    for (const Expr *Arg : E->arguments()) {
      const auto *Str = cast<StringLiteral>(Arg->IgnoreParenCasts());
      // The literal's bytes are handed to a UTF-16 converter below, so each
      // character must be two bytes wide.
      assert(Str->getCharByteWidth() == 2);
      StringRef WideBytes = Str->getBytes();
      std::string StrUtf8;
      if (!convertUTF16ToUTF8String(
              makeArrayRef(WideBytes.data(), WideBytes.size()), StrUtf8)) {
        // Report the failed conversion and leave this argument out of the
        // tuple; the remaining arguments are still processed.
        CGM.ErrorUnsupported(E, "non-UTF16 __annotation argument");
        continue;
      }
      Strings.push_back(llvm::MDString::get(getLLVMContext(), StrUtf8));
    }

    // Build an MDTuple of MDStrings and emit the intrinsic call.
    llvm::Value *F = CGM.getIntrinsic(llvm::Intrinsic::codeview_annotation, {});
    MDTuple *StrTuple = MDTuple::get(getLLVMContext(), Strings);
    Builder.CreateCall(F, MetadataAsValue::get(getLLVMContext(), StrTuple));
    return RValue::getIgnored();
  }
  case Builtin::BI__builtin_annotation: {
    // Evaluate the annotated value and fetch the llvm.annotation intrinsic
    // overloaded on that value's type.
    llvm::Value *AnnVal = EmitScalarExpr(E->getArg(0));
    llvm::Value *F = CGM.getIntrinsic(llvm::Intrinsic::annotation,
                                      AnnVal->getType());

    // Get the annotation string, go through casts. Sema requires this to be a
    // non-wide string literal, potentially casted, so the cast<> is safe.
    const Expr *AnnotationStrExpr = E->getArg(1)->IgnoreParenCasts();
    StringRef Str = cast<StringLiteral>(AnnotationStrExpr)->getString();
    // The value, the string and the call's source location are all passed on
    // to EmitAnnotationCall, whose result becomes the builtin's value.
    return RValue::get(EmitAnnotationCall(F, AnnVal, Str, E->getExprLoc()));
  }
  // Multiprecision add-with-carry / subtract-with-borrow builtins, for every
  // operand width suffix (b, s, none, l, ll).
  case Builtin::BI__builtin_addcb:
  case Builtin::BI__builtin_addcs:
  case Builtin::BI__builtin_addc:
  case Builtin::BI__builtin_addcl:
  case Builtin::BI__builtin_addcll:
  case Builtin::BI__builtin_subcb:
  case Builtin::BI__builtin_subcs:
  case Builtin::BI__builtin_subc:
  case Builtin::BI__builtin_subcl:
  case Builtin::BI__builtin_subcll: {

    // We translate all of these builtins from expressions of the form:
    //   int x = ..., y = ..., carryin = ..., carryout, result;
    //   result = __builtin_addc(x, y, carryin, &carryout);
    //
    // to LLVM IR of the form:
    //
    //   %tmp1 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)
    //   %tmpsum1 = extractvalue {i32, i1} %tmp1, 0
    //   %carry1 = extractvalue {i32, i1} %tmp1, 1
    //   %tmp2 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %tmpsum1,
    //                                                       i32 %carryin)
    //   %result = extractvalue {i32, i1} %tmp2, 0
    //   %carry2 = extractvalue {i32, i1} %tmp2, 1
    //   %tmp3 = or i1 %carry1, %carry2
    //   %tmp4 = zext i1 %tmp3 to i32
    //   store i32 %tmp4, i32* %carryout
    //
    // The subc variants have the same shape, using llvm.usub.with.overflow.

    // Scalarize our inputs.
    llvm::Value *X = EmitScalarExpr(E->getArg(0));
    llvm::Value *Y = EmitScalarExpr(E->getArg(1));
    llvm::Value *Carryin = EmitScalarExpr(E->getArg(2));
    Address CarryOutPtr = EmitPointerWithAlignment(E->getArg(3));

    // Decide if we are lowering to a uadd.with.overflow or usub.with.overflow.
    llvm::Intrinsic::ID IntrinsicId;
    switch (BuiltinID) {
    default: llvm_unreachable("Unknown multiprecision builtin id.");
    case Builtin::BI__builtin_addcb:
    case Builtin::BI__builtin_addcs:
    case Builtin::BI__builtin_addc:
    case Builtin::BI__builtin_addcl:
    case Builtin::BI__builtin_addcll:
      IntrinsicId = llvm::Intrinsic::uadd_with_overflow;
      break;
    case Builtin::BI__builtin_subcb:
    case Builtin::BI__builtin_subcs:
    case Builtin::BI__builtin_subc:
    case Builtin::BI__builtin_subcl:
    case Builtin::BI__builtin_subcll:
      IntrinsicId = llvm::Intrinsic::usub_with_overflow;
      break;
    }

    // Construct our resulting LLVM IR expression.
    llvm::Value *Carry1;
    llvm::Value *Sum1 = EmitOverflowIntrinsic(*this, IntrinsicId,
                                              X, Y, Carry1);
    llvm::Value *Carry2;
    llvm::Value *Sum2 = EmitOverflowIntrinsic(*this, IntrinsicId,
                                              Sum1, Carryin, Carry2);
    // The carry-out is set if either step overflowed; it is widened to the
    // operand type before being stored through the carry-out pointer.
    llvm::Value *CarryOut = Builder.CreateZExt(Builder.CreateOr(Carry1, Carry2),
                                               X->getType());
    Builder.CreateStore(CarryOut, CarryOutPtr);
    return RValue::get(Sum2);
  }
2555 
  // Type-generic checked arithmetic: __builtin_{add,sub,mul}_overflow(a, b,
  // &res). The operation is carried out in an integer type that encompasses
  // both operand types and the result type, the (possibly truncated) result is
  // stored through the third argument, and the builtin's value is the overflow
  // flag.
  case Builtin::BI__builtin_add_overflow:
  case Builtin::BI__builtin_sub_overflow:
  case Builtin::BI__builtin_mul_overflow: {
    const clang::Expr *LeftArg = E->getArg(0);
    const clang::Expr *RightArg = E->getArg(1);
    const clang::Expr *ResultArg = E->getArg(2);

    clang::QualType ResultQTy =
        ResultArg->getType()->castAs<PointerType>()->getPointeeType();

    // Width and signedness of both operands and of the result's pointee type.
    WidthAndSignedness LeftInfo =
        getIntegerWidthAndSignedness(CGM.getContext(), LeftArg->getType());
    WidthAndSignedness RightInfo =
        getIntegerWidthAndSignedness(CGM.getContext(), RightArg->getType());
    WidthAndSignedness ResultInfo =
        getIntegerWidthAndSignedness(CGM.getContext(), ResultQTy);

    // Handle mixed-sign multiplication as a special case, because adding
    // runtime or backend support for our generic irgen would be too expensive.
    if (isSpecialMixedSignMultiply(BuiltinID, LeftInfo, RightInfo, ResultInfo))
      return EmitCheckedMixedSignMultiply(*this, LeftArg, LeftInfo, RightArg,
                                          RightInfo, ResultArg, ResultQTy,
                                          ResultInfo);

    WidthAndSignedness EncompassingInfo =
        EncompassingIntegerType({LeftInfo, RightInfo, ResultInfo});

    llvm::Type *EncompassingLLVMTy =
        llvm::IntegerType::get(CGM.getLLVMContext(), EncompassingInfo.Width);

    llvm::Type *ResultLLVMTy = CGM.getTypes().ConvertType(ResultQTy);

    // Pick the signed or unsigned flavor of the intrinsic according to the
    // signedness of the encompassing type.
    llvm::Intrinsic::ID IntrinsicId;
    switch (BuiltinID) {
    default:
      llvm_unreachable("Unknown overflow builtin id.");
    case Builtin::BI__builtin_add_overflow:
      IntrinsicId = EncompassingInfo.Signed
                        ? llvm::Intrinsic::sadd_with_overflow
                        : llvm::Intrinsic::uadd_with_overflow;
      break;
    case Builtin::BI__builtin_sub_overflow:
      IntrinsicId = EncompassingInfo.Signed
                        ? llvm::Intrinsic::ssub_with_overflow
                        : llvm::Intrinsic::usub_with_overflow;
      break;
    case Builtin::BI__builtin_mul_overflow:
      IntrinsicId = EncompassingInfo.Signed
                        ? llvm::Intrinsic::smul_with_overflow
                        : llvm::Intrinsic::umul_with_overflow;
      break;
    }

    llvm::Value *Left = EmitScalarExpr(LeftArg);
    llvm::Value *Right = EmitScalarExpr(RightArg);
    Address ResultPtr = EmitPointerWithAlignment(ResultArg);

    // Extend each operand to the encompassing type.
    Left = Builder.CreateIntCast(Left, EncompassingLLVMTy, LeftInfo.Signed);
    Right = Builder.CreateIntCast(Right, EncompassingLLVMTy, RightInfo.Signed);

    // Perform the operation on the extended values.
    llvm::Value *Overflow, *Result;
    Result = EmitOverflowIntrinsic(*this, IntrinsicId, Left, Right, Overflow);

    if (EncompassingInfo.Width > ResultInfo.Width) {
      // The encompassing type is wider than the result type, so we need to
      // truncate it.
      llvm::Value *ResultTrunc = Builder.CreateTrunc(Result, ResultLLVMTy);

      // To see if the truncation caused an overflow, we will extend
      // the result and then compare it to the original result.
      llvm::Value *ResultTruncExt = Builder.CreateIntCast(
          ResultTrunc, EncompassingLLVMTy, ResultInfo.Signed);
      llvm::Value *TruncationOverflow =
          Builder.CreateICmpNE(Result, ResultTruncExt);

      // Overflow is reported if either the operation or the truncation lost
      // information.
      Overflow = Builder.CreateOr(Overflow, TruncationOverflow);
      Result = ResultTrunc;
    }

    // Finally, store the result using the pointer.
    bool isVolatile =
      ResultArg->getType()->getPointeeType().isVolatileQualified();
    Builder.CreateStore(EmitToMemory(Result, ResultQTy), ResultPtr, isVolatile);

    return RValue::get(Overflow);
  }
2644 
2645   case Builtin::BI__builtin_uadd_overflow:
2646   case Builtin::BI__builtin_uaddl_overflow:
2647   case Builtin::BI__builtin_uaddll_overflow:
2648   case Builtin::BI__builtin_usub_overflow:
2649   case Builtin::BI__builtin_usubl_overflow:
2650   case Builtin::BI__builtin_usubll_overflow:
2651   case Builtin::BI__builtin_umul_overflow:
2652   case Builtin::BI__builtin_umull_overflow:
2653   case Builtin::BI__builtin_umulll_overflow:
2654   case Builtin::BI__builtin_sadd_overflow:
2655   case Builtin::BI__builtin_saddl_overflow:
2656   case Builtin::BI__builtin_saddll_overflow:
2657   case Builtin::BI__builtin_ssub_overflow:
2658   case Builtin::BI__builtin_ssubl_overflow:
2659   case Builtin::BI__builtin_ssubll_overflow:
2660   case Builtin::BI__builtin_smul_overflow:
2661   case Builtin::BI__builtin_smull_overflow:
2662   case Builtin::BI__builtin_smulll_overflow: {
2663 
2664     // We translate all of these builtins directly to the relevant llvm IR node.
2665 
2666     // Scalarize our inputs.
2667     llvm::Value *X = EmitScalarExpr(E->getArg(0));
2668     llvm::Value *Y = EmitScalarExpr(E->getArg(1));
2669     Address SumOutPtr = EmitPointerWithAlignment(E->getArg(2));
2670 
2671     // Decide which of the overflow intrinsics we are lowering to:
2672     llvm::Intrinsic::ID IntrinsicId;
2673     switch (BuiltinID) {
2674     default: llvm_unreachable("Unknown overflow builtin id.");
2675     case Builtin::BI__builtin_uadd_overflow:
2676     case Builtin::BI__builtin_uaddl_overflow:
2677     case Builtin::BI__builtin_uaddll_overflow:
2678       IntrinsicId = llvm::Intrinsic::uadd_with_overflow;
2679       break;
2680     case Builtin::BI__builtin_usub_overflow:
2681     case Builtin::BI__builtin_usubl_overflow:
2682     case Builtin::BI__builtin_usubll_overflow:
2683       IntrinsicId = llvm::Intrinsic::usub_with_overflow;
2684       break;
2685     case Builtin::BI__builtin_umul_overflow:
2686     case Builtin::BI__builtin_umull_overflow:
2687     case Builtin::BI__builtin_umulll_overflow:
2688       IntrinsicId = llvm::Intrinsic::umul_with_overflow;
2689       break;
2690     case Builtin::BI__builtin_sadd_overflow:
2691     case Builtin::BI__builtin_saddl_overflow:
2692     case Builtin::BI__builtin_saddll_overflow:
2693       IntrinsicId = llvm::Intrinsic::sadd_with_overflow;
2694       break;
2695     case Builtin::BI__builtin_ssub_overflow:
2696     case Builtin::BI__builtin_ssubl_overflow:
2697     case Builtin::BI__builtin_ssubll_overflow:
2698       IntrinsicId = llvm::Intrinsic::ssub_with_overflow;
2699       break;
2700     case Builtin::BI__builtin_smul_overflow:
2701     case Builtin::BI__builtin_smull_overflow:
2702     case Builtin::BI__builtin_smulll_overflow:
2703       IntrinsicId = llvm::Intrinsic::smul_with_overflow;
2704       break;
2705     }
2706 
2707 
2708     llvm::Value *Carry;
2709     llvm::Value *Sum = EmitOverflowIntrinsic(*this, IntrinsicId, X, Y, Carry);
2710     Builder.CreateStore(Sum, SumOutPtr);
2711 
2712     return RValue::get(Carry);
2713   }
  // __builtin_addressof: emit the operand as an lvalue and yield its pointer.
  case Builtin::BI__builtin_addressof:
    return RValue::get(EmitLValue(E->getArg(0)).getPointer());
  // __builtin_operator_new / __builtin_operator_delete: both are handed to
  // EmitBuiltinNewDeleteCall together with the callee's function prototype;
  // the final argument is false for new and true for delete.
  case Builtin::BI__builtin_operator_new:
    return EmitBuiltinNewDeleteCall(
        E->getCallee()->getType()->castAs<FunctionProtoType>(), E, false);
  case Builtin::BI__builtin_operator_delete:
    return EmitBuiltinNewDeleteCall(
        E->getCallee()->getType()->castAs<FunctionProtoType>(), E, true);

  case Builtin::BI__noop:
    // __noop always evaluates to an integer literal zero.
    return RValue::get(ConstantInt::get(IntTy, 0));
2726   case Builtin::BI__builtin_call_with_static_chain: {
2727     const CallExpr *Call = cast<CallExpr>(E->getArg(0));
2728     const Expr *Chain = E->getArg(1);
2729     return EmitCall(Call->getCallee()->getType(),
2730                     EmitCallee(Call->getCallee()), Call, ReturnValue,
2731                     EmitScalarExpr(Chain));
2732   }
  // _InterlockedExchange (8-bit, 16-bit, default-width and pointer forms):
  // all are forwarded to EmitMSVCBuiltinExpr with the _InterlockedExchange
  // selector.
  case Builtin::BI_InterlockedExchange8:
  case Builtin::BI_InterlockedExchange16:
  case Builtin::BI_InterlockedExchange:
  case Builtin::BI_InterlockedExchangePointer:
    return RValue::get(
        EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchange, E));
2739   case Builtin::BI_InterlockedCompareExchangePointer: {
2740     llvm::Type *RTy;
2741     llvm::IntegerType *IntType =
2742       IntegerType::get(getLLVMContext(),
2743                        getContext().getTypeSize(E->getType()));
2744     llvm::Type *IntPtrType = IntType->getPointerTo();
2745 
2746     llvm::Value *Destination =
2747       Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), IntPtrType);
2748 
2749     llvm::Value *Exchange = EmitScalarExpr(E->getArg(1));
2750     RTy = Exchange->getType();
2751     Exchange = Builder.CreatePtrToInt(Exchange, IntType);
2752 
2753     llvm::Value *Comparand =
2754       Builder.CreatePtrToInt(EmitScalarExpr(E->getArg(2)), IntType);
2755 
2756     auto Result =
2757         Builder.CreateAtomicCmpXchg(Destination, Comparand, Exchange,
2758                                     AtomicOrdering::SequentiallyConsistent,
2759                                     AtomicOrdering::SequentiallyConsistent);
2760     Result->setVolatile(true);
2761 
2762     return RValue::get(Builder.CreateIntToPtr(Builder.CreateExtractValue(Result,
2763                                                                          0),
2764                                               RTy));
2765   }
2766   case Builtin::BI_InterlockedCompareExchange8:
2767   case Builtin::BI_InterlockedCompareExchange16:
2768   case Builtin::BI_InterlockedCompareExchange:
2769   case Builtin::BI_InterlockedCompareExchange64: {
2770     AtomicCmpXchgInst *CXI = Builder.CreateAtomicCmpXchg(
2771         EmitScalarExpr(E->getArg(0)),
2772         EmitScalarExpr(E->getArg(2)),
2773         EmitScalarExpr(E->getArg(1)),
2774         AtomicOrdering::SequentiallyConsistent,
2775         AtomicOrdering::SequentiallyConsistent);
2776       CXI->setVolatile(true);
2777       return RValue::get(Builder.CreateExtractValue(CXI, 0));
2778   }
  // Remaining MSVC _Interlocked* intrinsics. The width variants of each
  // operation share one lowering: EmitMSVCBuiltinExpr is called with the
  // MSVCIntrin selector that names the operation.
  case Builtin::BI_InterlockedIncrement16:
  case Builtin::BI_InterlockedIncrement:
    return RValue::get(
        EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedIncrement, E));
  case Builtin::BI_InterlockedDecrement16:
  case Builtin::BI_InterlockedDecrement:
    return RValue::get(
        EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedDecrement, E));
  case Builtin::BI_InterlockedAnd8:
  case Builtin::BI_InterlockedAnd16:
  case Builtin::BI_InterlockedAnd:
    return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedAnd, E));
  case Builtin::BI_InterlockedExchangeAdd8:
  case Builtin::BI_InterlockedExchangeAdd16:
  case Builtin::BI_InterlockedExchangeAdd:
    return RValue::get(
        EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeAdd, E));
  case Builtin::BI_InterlockedExchangeSub8:
  case Builtin::BI_InterlockedExchangeSub16:
  case Builtin::BI_InterlockedExchangeSub:
    return RValue::get(
        EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeSub, E));
  case Builtin::BI_InterlockedOr8:
  case Builtin::BI_InterlockedOr16:
  case Builtin::BI_InterlockedOr:
    return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedOr, E));
  case Builtin::BI_InterlockedXor8:
  case Builtin::BI_InterlockedXor16:
  case Builtin::BI_InterlockedXor:
    return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedXor, E));
  case Builtin::BI_interlockedbittestandset:
    return RValue::get(
        EmitMSVCBuiltinExpr(MSVCIntrin::_interlockedbittestandset, E));
2812 
  // SEH builtins. Each one exists under two builtin IDs (with one and with
  // two leading underscores); both IDs forward to the same EmitSEH* helper.
  case Builtin::BI__exception_code:
  case Builtin::BI_exception_code:
    return RValue::get(EmitSEHExceptionCode());
  case Builtin::BI__exception_info:
  case Builtin::BI_exception_info:
    return RValue::get(EmitSEHExceptionInfo());
  case Builtin::BI__abnormal_termination:
  case Builtin::BI_abnormal_termination:
    return RValue::get(EmitSEHAbnormalTermination());
2822   case Builtin::BI_setjmpex: {
2823     if (getTarget().getTriple().isOSMSVCRT()) {
2824       llvm::Type *ArgTypes[] = {Int8PtrTy, Int8PtrTy};
2825       llvm::AttributeList ReturnsTwiceAttr = llvm::AttributeList::get(
2826           getLLVMContext(), llvm::AttributeList::FunctionIndex,
2827           llvm::Attribute::ReturnsTwice);
2828       llvm::Constant *SetJmpEx = CGM.CreateRuntimeFunction(
2829           llvm::FunctionType::get(IntTy, ArgTypes, /*isVarArg=*/false),
2830           "_setjmpex", ReturnsTwiceAttr, /*Local=*/true);
2831       llvm::Value *Buf = Builder.CreateBitOrPointerCast(
2832           EmitScalarExpr(E->getArg(0)), Int8PtrTy);
2833       llvm::Value *FrameAddr =
2834           Builder.CreateCall(CGM.getIntrinsic(Intrinsic::frameaddress),
2835                              ConstantInt::get(Int32Ty, 0));
2836       llvm::Value *Args[] = {Buf, FrameAddr};
2837       llvm::CallSite CS = EmitRuntimeCallOrInvoke(SetJmpEx, Args);
2838       CS.setAttributes(ReturnsTwiceAttr);
2839       return RValue::get(CS.getInstruction());
2840     }
2841     break;
2842   }
2843   case Builtin::BI_setjmp: {
2844     if (getTarget().getTriple().isOSMSVCRT()) {
2845       llvm::AttributeList ReturnsTwiceAttr = llvm::AttributeList::get(
2846           getLLVMContext(), llvm::AttributeList::FunctionIndex,
2847           llvm::Attribute::ReturnsTwice);
2848       llvm::Value *Buf = Builder.CreateBitOrPointerCast(
2849           EmitScalarExpr(E->getArg(0)), Int8PtrTy);
2850       llvm::CallSite CS;
2851       if (getTarget().getTriple().getArch() == llvm::Triple::x86) {
2852         llvm::Type *ArgTypes[] = {Int8PtrTy, IntTy};
2853         llvm::Constant *SetJmp3 = CGM.CreateRuntimeFunction(
2854             llvm::FunctionType::get(IntTy, ArgTypes, /*isVarArg=*/true),
2855             "_setjmp3", ReturnsTwiceAttr, /*Local=*/true);
2856         llvm::Value *Count = ConstantInt::get(IntTy, 0);
2857         llvm::Value *Args[] = {Buf, Count};
2858         CS = EmitRuntimeCallOrInvoke(SetJmp3, Args);
2859       } else {
2860         llvm::Type *ArgTypes[] = {Int8PtrTy, Int8PtrTy};
2861         llvm::Constant *SetJmp = CGM.CreateRuntimeFunction(
2862             llvm::FunctionType::get(IntTy, ArgTypes, /*isVarArg=*/false),
2863             "_setjmp", ReturnsTwiceAttr, /*Local=*/true);
2864         llvm::Value *FrameAddr =
2865             Builder.CreateCall(CGM.getIntrinsic(Intrinsic::frameaddress),
2866                                ConstantInt::get(Int32Ty, 0));
2867         llvm::Value *Args[] = {Buf, FrameAddr};
2868         CS = EmitRuntimeCallOrInvoke(SetJmp, Args);
2869       }
2870       CS.setAttributes(ReturnsTwiceAttr);
2871       return RValue::get(CS.getInstruction());
2872     }
2873     break;
2874   }
2875 
2876   case Builtin::BI__GetExceptionInfo: {
2877     if (llvm::GlobalVariable *GV =
2878             CGM.getCXXABI().getThrowInfo(FD->getParamDecl(0)->getType()))
2879       return RValue::get(llvm::ConstantExpr::getBitCast(GV, CGM.Int8PtrTy));
2880     break;
2881   }
2882 
2883   case Builtin::BI__fastfail:
2884     return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::__fastfail, E));
2885 
2886   case Builtin::BI__builtin_coro_size: {
2887     auto & Context = getContext();
2888     auto SizeTy = Context.getSizeType();
2889     auto T = Builder.getIntNTy(Context.getTypeSize(SizeTy));
2890     Value *F = CGM.getIntrinsic(Intrinsic::coro_size, T);
2891     return RValue::get(Builder.CreateCall(F));
2892   }
2893 
2894   case Builtin::BI__builtin_coro_id:
2895     return EmitCoroutineIntrinsic(E, Intrinsic::coro_id);
2896   case Builtin::BI__builtin_coro_promise:
2897     return EmitCoroutineIntrinsic(E, Intrinsic::coro_promise);
2898   case Builtin::BI__builtin_coro_resume:
2899     return EmitCoroutineIntrinsic(E, Intrinsic::coro_resume);
2900   case Builtin::BI__builtin_coro_frame:
2901     return EmitCoroutineIntrinsic(E, Intrinsic::coro_frame);
2902   case Builtin::BI__builtin_coro_noop:
2903     return EmitCoroutineIntrinsic(E, Intrinsic::coro_noop);
2904   case Builtin::BI__builtin_coro_free:
2905     return EmitCoroutineIntrinsic(E, Intrinsic::coro_free);
2906   case Builtin::BI__builtin_coro_destroy:
2907     return EmitCoroutineIntrinsic(E, Intrinsic::coro_destroy);
2908   case Builtin::BI__builtin_coro_done:
2909     return EmitCoroutineIntrinsic(E, Intrinsic::coro_done);
2910   case Builtin::BI__builtin_coro_alloc:
2911     return EmitCoroutineIntrinsic(E, Intrinsic::coro_alloc);
2912   case Builtin::BI__builtin_coro_begin:
2913     return EmitCoroutineIntrinsic(E, Intrinsic::coro_begin);
2914   case Builtin::BI__builtin_coro_end:
2915     return EmitCoroutineIntrinsic(E, Intrinsic::coro_end);
2916   case Builtin::BI__builtin_coro_suspend:
2917     return EmitCoroutineIntrinsic(E, Intrinsic::coro_suspend);
2918   case Builtin::BI__builtin_coro_param:
2919     return EmitCoroutineIntrinsic(E, Intrinsic::coro_param);
2920 
2921   // OpenCL v2.0 s6.13.16.2, Built-in pipe read and write functions
2922   case Builtin::BIread_pipe:
2923   case Builtin::BIwrite_pipe: {
2924     Value *Arg0 = EmitScalarExpr(E->getArg(0)),
2925           *Arg1 = EmitScalarExpr(E->getArg(1));
2926     CGOpenCLRuntime OpenCLRT(CGM);
2927     Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
2928     Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));
2929 
2930     // Type of the generic packet parameter.
2931     unsigned GenericAS =
2932         getContext().getTargetAddressSpace(LangAS::opencl_generic);
2933     llvm::Type *I8PTy = llvm::PointerType::get(
2934         llvm::Type::getInt8Ty(getLLVMContext()), GenericAS);
2935 
2936     // Testing which overloaded version we should generate the call for.
2937     if (2U == E->getNumArgs()) {
2938       const char *Name = (BuiltinID == Builtin::BIread_pipe) ? "__read_pipe_2"
2939                                                              : "__write_pipe_2";
2940       // Creating a generic function type to be able to call with any builtin or
2941       // user defined type.
2942       llvm::Type *ArgTys[] = {Arg0->getType(), I8PTy, Int32Ty, Int32Ty};
2943       llvm::FunctionType *FTy = llvm::FunctionType::get(
2944           Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
2945       Value *BCast = Builder.CreatePointerCast(Arg1, I8PTy);
2946       return RValue::get(
2947           Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name),
2948                              {Arg0, BCast, PacketSize, PacketAlign}));
2949     } else {
2950       assert(4 == E->getNumArgs() &&
2951              "Illegal number of parameters to pipe function");
2952       const char *Name = (BuiltinID == Builtin::BIread_pipe) ? "__read_pipe_4"
2953                                                              : "__write_pipe_4";
2954 
2955       llvm::Type *ArgTys[] = {Arg0->getType(), Arg1->getType(), Int32Ty, I8PTy,
2956                               Int32Ty, Int32Ty};
2957       Value *Arg2 = EmitScalarExpr(E->getArg(2)),
2958             *Arg3 = EmitScalarExpr(E->getArg(3));
2959       llvm::FunctionType *FTy = llvm::FunctionType::get(
2960           Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
2961       Value *BCast = Builder.CreatePointerCast(Arg3, I8PTy);
2962       // We know the third argument is an integer type, but we may need to cast
2963       // it to i32.
2964       if (Arg2->getType() != Int32Ty)
2965         Arg2 = Builder.CreateZExtOrTrunc(Arg2, Int32Ty);
2966       return RValue::get(Builder.CreateCall(
2967           CGM.CreateRuntimeFunction(FTy, Name),
2968           {Arg0, Arg1, Arg2, BCast, PacketSize, PacketAlign}));
2969     }
2970   }
  // OpenCL v2.0 s6.13.16, s9.17.3.5 - Built-in pipe reserve read and write
  // functions
2973   case Builtin::BIreserve_read_pipe:
2974   case Builtin::BIreserve_write_pipe:
2975   case Builtin::BIwork_group_reserve_read_pipe:
2976   case Builtin::BIwork_group_reserve_write_pipe:
2977   case Builtin::BIsub_group_reserve_read_pipe:
2978   case Builtin::BIsub_group_reserve_write_pipe: {
2979     // Composing the mangled name for the function.
2980     const char *Name;
2981     if (BuiltinID == Builtin::BIreserve_read_pipe)
2982       Name = "__reserve_read_pipe";
2983     else if (BuiltinID == Builtin::BIreserve_write_pipe)
2984       Name = "__reserve_write_pipe";
2985     else if (BuiltinID == Builtin::BIwork_group_reserve_read_pipe)
2986       Name = "__work_group_reserve_read_pipe";
2987     else if (BuiltinID == Builtin::BIwork_group_reserve_write_pipe)
2988       Name = "__work_group_reserve_write_pipe";
2989     else if (BuiltinID == Builtin::BIsub_group_reserve_read_pipe)
2990       Name = "__sub_group_reserve_read_pipe";
2991     else
2992       Name = "__sub_group_reserve_write_pipe";
2993 
2994     Value *Arg0 = EmitScalarExpr(E->getArg(0)),
2995           *Arg1 = EmitScalarExpr(E->getArg(1));
2996     llvm::Type *ReservedIDTy = ConvertType(getContext().OCLReserveIDTy);
2997     CGOpenCLRuntime OpenCLRT(CGM);
2998     Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
2999     Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));
3000 
3001     // Building the generic function prototype.
3002     llvm::Type *ArgTys[] = {Arg0->getType(), Int32Ty, Int32Ty, Int32Ty};
3003     llvm::FunctionType *FTy = llvm::FunctionType::get(
3004         ReservedIDTy, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
3005     // We know the second argument is an integer type, but we may need to cast
3006     // it to i32.
3007     if (Arg1->getType() != Int32Ty)
3008       Arg1 = Builder.CreateZExtOrTrunc(Arg1, Int32Ty);
3009     return RValue::get(
3010         Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name),
3011                            {Arg0, Arg1, PacketSize, PacketAlign}));
3012   }
3013   // OpenCL v2.0 s6.13.16, s9.17.3.5 - Built-in pipe commit read and write
3014   // functions
3015   case Builtin::BIcommit_read_pipe:
3016   case Builtin::BIcommit_write_pipe:
3017   case Builtin::BIwork_group_commit_read_pipe:
3018   case Builtin::BIwork_group_commit_write_pipe:
3019   case Builtin::BIsub_group_commit_read_pipe:
3020   case Builtin::BIsub_group_commit_write_pipe: {
3021     const char *Name;
3022     if (BuiltinID == Builtin::BIcommit_read_pipe)
3023       Name = "__commit_read_pipe";
3024     else if (BuiltinID == Builtin::BIcommit_write_pipe)
3025       Name = "__commit_write_pipe";
3026     else if (BuiltinID == Builtin::BIwork_group_commit_read_pipe)
3027       Name = "__work_group_commit_read_pipe";
3028     else if (BuiltinID == Builtin::BIwork_group_commit_write_pipe)
3029       Name = "__work_group_commit_write_pipe";
3030     else if (BuiltinID == Builtin::BIsub_group_commit_read_pipe)
3031       Name = "__sub_group_commit_read_pipe";
3032     else
3033       Name = "__sub_group_commit_write_pipe";
3034 
3035     Value *Arg0 = EmitScalarExpr(E->getArg(0)),
3036           *Arg1 = EmitScalarExpr(E->getArg(1));
3037     CGOpenCLRuntime OpenCLRT(CGM);
3038     Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
3039     Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));
3040 
3041     // Building the generic function prototype.
3042     llvm::Type *ArgTys[] = {Arg0->getType(), Arg1->getType(), Int32Ty, Int32Ty};
3043     llvm::FunctionType *FTy =
3044         llvm::FunctionType::get(llvm::Type::getVoidTy(getLLVMContext()),
3045                                 llvm::ArrayRef<llvm::Type *>(ArgTys), false);
3046 
3047     return RValue::get(
3048         Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name),
3049                            {Arg0, Arg1, PacketSize, PacketAlign}));
3050   }
3051   // OpenCL v2.0 s6.13.16.4 Built-in pipe query functions
3052   case Builtin::BIget_pipe_num_packets:
3053   case Builtin::BIget_pipe_max_packets: {
3054     const char *BaseName;
3055     const PipeType *PipeTy = E->getArg(0)->getType()->getAs<PipeType>();
3056     if (BuiltinID == Builtin::BIget_pipe_num_packets)
3057       BaseName = "__get_pipe_num_packets";
3058     else
3059       BaseName = "__get_pipe_max_packets";
3060     auto Name = std::string(BaseName) +
3061                 std::string(PipeTy->isReadOnly() ? "_ro" : "_wo");
3062 
3063     // Building the generic function prototype.
3064     Value *Arg0 = EmitScalarExpr(E->getArg(0));
3065     CGOpenCLRuntime OpenCLRT(CGM);
3066     Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
3067     Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));
3068     llvm::Type *ArgTys[] = {Arg0->getType(), Int32Ty, Int32Ty};
3069     llvm::FunctionType *FTy = llvm::FunctionType::get(
3070         Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
3071 
3072     return RValue::get(Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name),
3073                                           {Arg0, PacketSize, PacketAlign}));
3074   }
3075 
3076   // OpenCL v2.0 s6.13.9 - Address space qualifier functions.
3077   case Builtin::BIto_global:
3078   case Builtin::BIto_local:
3079   case Builtin::BIto_private: {
3080     auto Arg0 = EmitScalarExpr(E->getArg(0));
3081     auto NewArgT = llvm::PointerType::get(Int8Ty,
3082       CGM.getContext().getTargetAddressSpace(LangAS::opencl_generic));
3083     auto NewRetT = llvm::PointerType::get(Int8Ty,
3084       CGM.getContext().getTargetAddressSpace(
3085         E->getType()->getPointeeType().getAddressSpace()));
3086     auto FTy = llvm::FunctionType::get(NewRetT, {NewArgT}, false);
3087     llvm::Value *NewArg;
3088     if (Arg0->getType()->getPointerAddressSpace() !=
3089         NewArgT->getPointerAddressSpace())
3090       NewArg = Builder.CreateAddrSpaceCast(Arg0, NewArgT);
3091     else
3092       NewArg = Builder.CreateBitOrPointerCast(Arg0, NewArgT);
3093     auto NewName = std::string("__") + E->getDirectCallee()->getName().str();
3094     auto NewCall =
3095         Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, NewName), {NewArg});
3096     return RValue::get(Builder.CreateBitOrPointerCast(NewCall,
3097       ConvertType(E->getType())));
3098   }
3099 
3100   // OpenCL v2.0, s6.13.17 - Enqueue kernel function.
3101   // It contains four different overload formats specified in Table 6.13.17.1.
3102   case Builtin::BIenqueue_kernel: {
3103     StringRef Name; // Generated function call name
3104     unsigned NumArgs = E->getNumArgs();
3105 
3106     llvm::Type *QueueTy = ConvertType(getContext().OCLQueueTy);
3107     llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy(
3108         getContext().getTargetAddressSpace(LangAS::opencl_generic));
3109 
3110     llvm::Value *Queue = EmitScalarExpr(E->getArg(0));
3111     llvm::Value *Flags = EmitScalarExpr(E->getArg(1));
3112     LValue NDRangeL = EmitAggExprToLValue(E->getArg(2));
3113     llvm::Value *Range = NDRangeL.getAddress().getPointer();
3114     llvm::Type *RangeTy = NDRangeL.getAddress().getType();
3115 
3116     if (NumArgs == 4) {
3117       // The most basic form of the call with parameters:
3118       // queue_t, kernel_enqueue_flags_t, ndrange_t, block(void)
3119       Name = "__enqueue_kernel_basic";
3120       llvm::Type *ArgTys[] = {QueueTy, Int32Ty, RangeTy, GenericVoidPtrTy,
3121                               GenericVoidPtrTy};
3122       llvm::FunctionType *FTy = llvm::FunctionType::get(
3123           Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
3124 
3125       auto Info =
3126           CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(3));
3127       llvm::Value *Kernel =
3128           Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy);
3129       llvm::Value *Block =
3130           Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
3131 
3132       AttrBuilder B;
3133       B.addAttribute(Attribute::ByVal);
3134       llvm::AttributeList ByValAttrSet =
3135           llvm::AttributeList::get(CGM.getModule().getContext(), 3U, B);
3136 
3137       auto RTCall =
3138           Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name, ByValAttrSet),
3139                              {Queue, Flags, Range, Kernel, Block});
3140       RTCall->setAttributes(ByValAttrSet);
3141       return RValue::get(RTCall);
3142     }
3143     assert(NumArgs >= 5 && "Invalid enqueue_kernel signature");
3144 
3145     // Create a temporary array to hold the sizes of local pointer arguments
3146     // for the block. \p First is the position of the first size argument.
3147     auto CreateArrayForSizeVar = [=](unsigned First) {
3148       auto *AT = llvm::ArrayType::get(SizeTy, NumArgs - First);
3149       auto *Arr = Builder.CreateAlloca(AT);
3150       llvm::Value *Ptr;
3151       // Each of the following arguments specifies the size of the corresponding
3152       // argument passed to the enqueued block.
3153       auto *Zero = llvm::ConstantInt::get(IntTy, 0);
3154       for (unsigned I = First; I < NumArgs; ++I) {
3155         auto *Index = llvm::ConstantInt::get(IntTy, I - First);
3156         auto *GEP = Builder.CreateGEP(Arr, {Zero, Index});
3157         if (I == First)
3158           Ptr = GEP;
3159         auto *V =
3160             Builder.CreateZExtOrTrunc(EmitScalarExpr(E->getArg(I)), SizeTy);
3161         Builder.CreateAlignedStore(
3162             V, GEP, CGM.getDataLayout().getPrefTypeAlignment(SizeTy));
3163       }
3164       return Ptr;
3165     };
3166 
3167     // Could have events and/or vaargs.
3168     if (E->getArg(3)->getType()->isBlockPointerType()) {
3169       // No events passed, but has variadic arguments.
3170       Name = "__enqueue_kernel_vaargs";
3171       auto Info =
3172           CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(3));
3173       llvm::Value *Kernel =
3174           Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy);
3175       auto *Block = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
3176       auto *PtrToSizeArray = CreateArrayForSizeVar(4);
3177 
3178       // Create a vector of the arguments, as well as a constant value to
3179       // express to the runtime the number of variadic arguments.
3180       std::vector<llvm::Value *> Args = {
3181           Queue,  Flags, Range,
3182           Kernel, Block, ConstantInt::get(IntTy, NumArgs - 4),
3183           PtrToSizeArray};
3184       std::vector<llvm::Type *> ArgTys = {
3185           QueueTy,          IntTy,            RangeTy,
3186           GenericVoidPtrTy, GenericVoidPtrTy, IntTy,
3187           PtrToSizeArray->getType()};
3188 
3189       llvm::FunctionType *FTy = llvm::FunctionType::get(
3190           Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
3191       return RValue::get(
3192           Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name),
3193                              llvm::ArrayRef<llvm::Value *>(Args)));
3194     }
3195     // Any calls now have event arguments passed.
3196     if (NumArgs >= 7) {
3197       llvm::Type *EventTy = ConvertType(getContext().OCLClkEventTy);
3198       llvm::Type *EventPtrTy = EventTy->getPointerTo(
3199           CGM.getContext().getTargetAddressSpace(LangAS::opencl_generic));
3200 
3201       llvm::Value *NumEvents =
3202           Builder.CreateZExtOrTrunc(EmitScalarExpr(E->getArg(3)), Int32Ty);
3203       llvm::Value *EventList =
3204           E->getArg(4)->getType()->isArrayType()
3205               ? EmitArrayToPointerDecay(E->getArg(4)).getPointer()
3206               : EmitScalarExpr(E->getArg(4));
3207       llvm::Value *ClkEvent = EmitScalarExpr(E->getArg(5));
3208       // Convert to generic address space.
3209       EventList = Builder.CreatePointerCast(EventList, EventPtrTy);
3210       ClkEvent = Builder.CreatePointerCast(ClkEvent, EventPtrTy);
3211       auto Info =
3212           CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(6));
3213       llvm::Value *Kernel =
3214           Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy);
3215       llvm::Value *Block =
3216           Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
3217 
3218       std::vector<llvm::Type *> ArgTys = {
3219           QueueTy,    Int32Ty,    RangeTy,          Int32Ty,
3220           EventPtrTy, EventPtrTy, GenericVoidPtrTy, GenericVoidPtrTy};
3221 
3222       std::vector<llvm::Value *> Args = {Queue,     Flags,    Range,  NumEvents,
3223                                          EventList, ClkEvent, Kernel, Block};
3224 
3225       if (NumArgs == 7) {
3226         // Has events but no variadics.
3227         Name = "__enqueue_kernel_basic_events";
3228         llvm::FunctionType *FTy = llvm::FunctionType::get(
3229             Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
3230         return RValue::get(
3231             Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name),
3232                                llvm::ArrayRef<llvm::Value *>(Args)));
3233       }
3234       // Has event info and variadics
3235       // Pass the number of variadics to the runtime function too.
3236       Args.push_back(ConstantInt::get(Int32Ty, NumArgs - 7));
3237       ArgTys.push_back(Int32Ty);
3238       Name = "__enqueue_kernel_events_vaargs";
3239 
3240       auto *PtrToSizeArray = CreateArrayForSizeVar(7);
3241       Args.push_back(PtrToSizeArray);
3242       ArgTys.push_back(PtrToSizeArray->getType());
3243 
3244       llvm::FunctionType *FTy = llvm::FunctionType::get(
3245           Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
3246       return RValue::get(
3247           Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name),
3248                              llvm::ArrayRef<llvm::Value *>(Args)));
3249     }
3250     LLVM_FALLTHROUGH;
3251   }
3252   // OpenCL v2.0 s6.13.17.6 - Kernel query functions need bitcast of block
3253   // parameter.
3254   case Builtin::BIget_kernel_work_group_size: {
3255     llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy(
3256         getContext().getTargetAddressSpace(LangAS::opencl_generic));
3257     auto Info =
3258         CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(0));
3259     Value *Kernel = Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy);
3260     Value *Arg = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
3261     return RValue::get(Builder.CreateCall(
3262         CGM.CreateRuntimeFunction(
3263             llvm::FunctionType::get(IntTy, {GenericVoidPtrTy, GenericVoidPtrTy},
3264                                     false),
3265             "__get_kernel_work_group_size_impl"),
3266         {Kernel, Arg}));
3267   }
3268   case Builtin::BIget_kernel_preferred_work_group_size_multiple: {
3269     llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy(
3270         getContext().getTargetAddressSpace(LangAS::opencl_generic));
3271     auto Info =
3272         CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(0));
3273     Value *Kernel = Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy);
3274     Value *Arg = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
3275     return RValue::get(Builder.CreateCall(
3276         CGM.CreateRuntimeFunction(
3277             llvm::FunctionType::get(IntTy, {GenericVoidPtrTy, GenericVoidPtrTy},
3278                                     false),
3279             "__get_kernel_preferred_work_group_multiple_impl"),
3280         {Kernel, Arg}));
3281   }
3282   case Builtin::BIget_kernel_max_sub_group_size_for_ndrange:
3283   case Builtin::BIget_kernel_sub_group_count_for_ndrange: {
3284     llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy(
3285         getContext().getTargetAddressSpace(LangAS::opencl_generic));
3286     LValue NDRangeL = EmitAggExprToLValue(E->getArg(0));
3287     llvm::Value *NDRange = NDRangeL.getAddress().getPointer();
3288     auto Info =
3289         CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(1));
3290     Value *Kernel = Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy);
3291     Value *Block = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
3292     const char *Name =
3293         BuiltinID == Builtin::BIget_kernel_max_sub_group_size_for_ndrange
3294             ? "__get_kernel_max_sub_group_size_for_ndrange_impl"
3295             : "__get_kernel_sub_group_count_for_ndrange_impl";
3296     return RValue::get(Builder.CreateCall(
3297         CGM.CreateRuntimeFunction(
3298             llvm::FunctionType::get(
3299                 IntTy, {NDRange->getType(), GenericVoidPtrTy, GenericVoidPtrTy},
3300                 false),
3301             Name),
3302         {NDRange, Kernel, Block}));
3303   }
3304 
3305   case Builtin::BI__builtin_store_half:
3306   case Builtin::BI__builtin_store_halff: {
3307     Value *Val = EmitScalarExpr(E->getArg(0));
3308     Address Address = EmitPointerWithAlignment(E->getArg(1));
3309     Value *HalfVal = Builder.CreateFPTrunc(Val, Builder.getHalfTy());
3310     return RValue::get(Builder.CreateStore(HalfVal, Address));
3311   }
3312   case Builtin::BI__builtin_load_half: {
3313     Address Address = EmitPointerWithAlignment(E->getArg(0));
3314     Value *HalfVal = Builder.CreateLoad(Address);
3315     return RValue::get(Builder.CreateFPExt(HalfVal, Builder.getDoubleTy()));
3316   }
3317   case Builtin::BI__builtin_load_halff: {
3318     Address Address = EmitPointerWithAlignment(E->getArg(0));
3319     Value *HalfVal = Builder.CreateLoad(Address);
3320     return RValue::get(Builder.CreateFPExt(HalfVal, Builder.getFloatTy()));
3321   }
3322   case Builtin::BIprintf:
3323     if (getTarget().getTriple().isNVPTX())
3324       return EmitNVPTXDevicePrintfCallExpr(E, ReturnValue);
3325     break;
3326   case Builtin::BI__builtin_canonicalize:
3327   case Builtin::BI__builtin_canonicalizef:
3328   case Builtin::BI__builtin_canonicalizel:
3329     return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::canonicalize));
3330 
3331   case Builtin::BI__builtin_thread_pointer: {
3332     if (!getContext().getTargetInfo().isTLSSupported())
3333       CGM.ErrorUnsupported(E, "__builtin_thread_pointer");
3334     // Fall through - it's already mapped to the intrinsic by GCCBuiltin.
3335     break;
3336   }
3337   case Builtin::BI__builtin_os_log_format:
3338     return emitBuiltinOSLogFormat(*E);
3339 
3340   case Builtin::BI__builtin_os_log_format_buffer_size: {
3341     analyze_os_log::OSLogBufferLayout Layout;
3342     analyze_os_log::computeOSLogBufferLayout(CGM.getContext(), E, Layout);
3343     return RValue::get(ConstantInt::get(ConvertType(E->getType()),
3344                                         Layout.size().getQuantity()));
3345   }
3346 
3347   case Builtin::BI__xray_customevent: {
3348     if (!ShouldXRayInstrumentFunction())
3349       return RValue::getIgnored();
3350 
3351     if (!CGM.getCodeGenOpts().XRayInstrumentationBundle.has(
3352             XRayInstrKind::Custom))
3353       return RValue::getIgnored();
3354 
3355     if (const auto *XRayAttr = CurFuncDecl->getAttr<XRayInstrumentAttr>())
3356       if (XRayAttr->neverXRayInstrument() && !AlwaysEmitXRayCustomEvents())
3357         return RValue::getIgnored();
3358 
3359     Function *F = CGM.getIntrinsic(Intrinsic::xray_customevent);
3360     auto FTy = F->getFunctionType();
3361     auto Arg0 = E->getArg(0);
3362     auto Arg0Val = EmitScalarExpr(Arg0);
3363     auto Arg0Ty = Arg0->getType();
3364     auto PTy0 = FTy->getParamType(0);
3365     if (PTy0 != Arg0Val->getType()) {
3366       if (Arg0Ty->isArrayType())
3367         Arg0Val = EmitArrayToPointerDecay(Arg0).getPointer();
3368       else
3369         Arg0Val = Builder.CreatePointerCast(Arg0Val, PTy0);
3370     }
3371     auto Arg1 = EmitScalarExpr(E->getArg(1));
3372     auto PTy1 = FTy->getParamType(1);
3373     if (PTy1 != Arg1->getType())
3374       Arg1 = Builder.CreateTruncOrBitCast(Arg1, PTy1);
3375     return RValue::get(Builder.CreateCall(F, {Arg0Val, Arg1}));
3376   }
3377 
3378   case Builtin::BI__xray_typedevent: {
3379     // TODO: There should be a way to always emit events even if the current
3380     // function is not instrumented. Losing events in a stream can cripple
3381     // a trace.
3382     if (!ShouldXRayInstrumentFunction())
3383       return RValue::getIgnored();
3384 
3385     if (!CGM.getCodeGenOpts().XRayInstrumentationBundle.has(
3386             XRayInstrKind::Typed))
3387       return RValue::getIgnored();
3388 
3389     if (const auto *XRayAttr = CurFuncDecl->getAttr<XRayInstrumentAttr>())
3390       if (XRayAttr->neverXRayInstrument() && !AlwaysEmitXRayTypedEvents())
3391         return RValue::getIgnored();
3392 
3393     Function *F = CGM.getIntrinsic(Intrinsic::xray_typedevent);
3394     auto FTy = F->getFunctionType();
3395     auto Arg0 = EmitScalarExpr(E->getArg(0));
3396     auto PTy0 = FTy->getParamType(0);
3397     if (PTy0 != Arg0->getType())
3398       Arg0 = Builder.CreateTruncOrBitCast(Arg0, PTy0);
3399     auto Arg1 = E->getArg(1);
3400     auto Arg1Val = EmitScalarExpr(Arg1);
3401     auto Arg1Ty = Arg1->getType();
3402     auto PTy1 = FTy->getParamType(1);
3403     if (PTy1 != Arg1Val->getType()) {
3404       if (Arg1Ty->isArrayType())
3405         Arg1Val = EmitArrayToPointerDecay(Arg1).getPointer();
3406       else
3407         Arg1Val = Builder.CreatePointerCast(Arg1Val, PTy1);
3408     }
3409     auto Arg2 = EmitScalarExpr(E->getArg(2));
3410     auto PTy2 = FTy->getParamType(2);
3411     if (PTy2 != Arg2->getType())
3412       Arg2 = Builder.CreateTruncOrBitCast(Arg2, PTy2);
3413     return RValue::get(Builder.CreateCall(F, {Arg0, Arg1Val, Arg2}));
3414   }
3415 
3416   case Builtin::BI__builtin_ms_va_start:
3417   case Builtin::BI__builtin_ms_va_end:
3418     return RValue::get(
3419         EmitVAStartEnd(EmitMSVAListRef(E->getArg(0)).getPointer(),
3420                        BuiltinID == Builtin::BI__builtin_ms_va_start));
3421 
3422   case Builtin::BI__builtin_ms_va_copy: {
3423     // Lower this manually. We can't reliably determine whether or not any
3424     // given va_copy() is for a Win64 va_list from the calling convention
3425     // alone, because it's legal to do this from a System V ABI function.
3426     // With opaque pointer types, we won't have enough information in LLVM
3427     // IR to determine this from the argument types, either. Best to do it
3428     // now, while we have enough information.
3429     Address DestAddr = EmitMSVAListRef(E->getArg(0));
3430     Address SrcAddr = EmitMSVAListRef(E->getArg(1));
3431 
3432     llvm::Type *BPP = Int8PtrPtrTy;
3433 
3434     DestAddr = Address(Builder.CreateBitCast(DestAddr.getPointer(), BPP, "cp"),
3435                        DestAddr.getAlignment());
3436     SrcAddr = Address(Builder.CreateBitCast(SrcAddr.getPointer(), BPP, "ap"),
3437                       SrcAddr.getAlignment());
3438 
3439     Value *ArgPtr = Builder.CreateLoad(SrcAddr, "ap.val");
3440     return RValue::get(Builder.CreateStore(ArgPtr, DestAddr));
3441   }
3442   }
3443 
3444   // If this is an alias for a lib function (e.g. __builtin_sin), emit
3445   // the call using the normal call path, but using the unmangled
3446   // version of the function name.
3447   if (getContext().BuiltinInfo.isLibFunction(BuiltinID))
3448     return emitLibraryCall(*this, FD, E,
3449                            CGM.getBuiltinLibFunction(FD, BuiltinID));
3450 
3451   // If this is a predefined lib function (e.g. malloc), emit the call
3452   // using exactly the normal call path.
3453   if (getContext().BuiltinInfo.isPredefinedLibFunction(BuiltinID))
3454     return emitLibraryCall(*this, FD, E,
3455                       cast<llvm::Constant>(EmitScalarExpr(E->getCallee())));
3456 
3457   // Check that a call to a target specific builtin has the correct target
3458   // features.
3459   // This is down here to avoid non-target specific builtins, however, if
3460   // generic builtins start to require generic target features then we
3461   // can move this up to the beginning of the function.
3462   checkTargetFeatures(E, FD);
3463 
3464   // See if we have a target specific intrinsic.
3465   const char *Name = getContext().BuiltinInfo.getName(BuiltinID);
3466   Intrinsic::ID IntrinsicID = Intrinsic::not_intrinsic;
3467   StringRef Prefix =
3468       llvm::Triple::getArchTypePrefix(getTarget().getTriple().getArch());
3469   if (!Prefix.empty()) {
3470     IntrinsicID = Intrinsic::getIntrinsicForGCCBuiltin(Prefix.data(), Name);
3471     // NOTE we don't need to perform a compatibility flag check here since the
3472     // intrinsics are declared in Builtins*.def via LANGBUILTIN which filter the
3473     // MS builtins via ALL_MS_LANGUAGES and are filtered earlier.
3474     if (IntrinsicID == Intrinsic::not_intrinsic)
3475       IntrinsicID = Intrinsic::getIntrinsicForMSBuiltin(Prefix.data(), Name);
3476   }
3477 
3478   if (IntrinsicID != Intrinsic::not_intrinsic) {
3479     SmallVector<Value*, 16> Args;
3480 
3481     // Find out if any arguments are required to be integer constant
3482     // expressions.
3483     unsigned ICEArguments = 0;
3484     ASTContext::GetBuiltinTypeError Error;
3485     getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
3486     assert(Error == ASTContext::GE_None && "Should not codegen an error");
3487 
3488     Function *F = CGM.getIntrinsic(IntrinsicID);
3489     llvm::FunctionType *FTy = F->getFunctionType();
3490 
3491     for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) {
3492       Value *ArgValue;
3493       // If this is a normal argument, just emit it as a scalar.
3494       if ((ICEArguments & (1 << i)) == 0) {
3495         ArgValue = EmitScalarExpr(E->getArg(i));
3496       } else {
3497         // If this is required to be a constant, constant fold it so that we
3498         // know that the generated intrinsic gets a ConstantInt.
3499         llvm::APSInt Result;
3500         bool IsConst = E->getArg(i)->isIntegerConstantExpr(Result,getContext());
3501         assert(IsConst && "Constant arg isn't actually constant?");
3502         (void)IsConst;
3503         ArgValue = llvm::ConstantInt::get(getLLVMContext(), Result);
3504       }
3505 
3506       // If the intrinsic arg type is different from the builtin arg type
3507       // we need to do a bit cast.
3508       llvm::Type *PTy = FTy->getParamType(i);
3509       if (PTy != ArgValue->getType()) {
3510         assert(PTy->canLosslesslyBitCastTo(FTy->getParamType(i)) &&
3511                "Must be able to losslessly bit cast to param");
3512         ArgValue = Builder.CreateBitCast(ArgValue, PTy);
3513       }
3514 
3515       Args.push_back(ArgValue);
3516     }
3517 
3518     Value *V = Builder.CreateCall(F, Args);
3519     QualType BuiltinRetType = E->getType();
3520 
3521     llvm::Type *RetTy = VoidTy;
3522     if (!BuiltinRetType->isVoidType())
3523       RetTy = ConvertType(BuiltinRetType);
3524 
3525     if (RetTy != V->getType()) {
3526       assert(V->getType()->canLosslesslyBitCastTo(RetTy) &&
3527              "Must be able to losslessly bit cast result type");
3528       V = Builder.CreateBitCast(V, RetTy);
3529     }
3530 
3531     return RValue::get(V);
3532   }
3533 
3534   // See if we have a target specific builtin that needs to be lowered.
3535   if (Value *V = EmitTargetBuiltinExpr(BuiltinID, E))
3536     return RValue::get(V);
3537 
3538   ErrorUnsupported(E, "builtin function");
3539 
3540   // Unknown builtin, for now just dump it out and return undef.
3541   return GetUndefRValue(E->getType());
3542 }
3543 
3544 static Value *EmitTargetArchBuiltinExpr(CodeGenFunction *CGF,
3545                                         unsigned BuiltinID, const CallExpr *E,
3546                                         llvm::Triple::ArchType Arch) {
3547   switch (Arch) {
3548   case llvm::Triple::arm:
3549   case llvm::Triple::armeb:
3550   case llvm::Triple::thumb:
3551   case llvm::Triple::thumbeb:
3552     return CGF->EmitARMBuiltinExpr(BuiltinID, E, Arch);
3553   case llvm::Triple::aarch64:
3554   case llvm::Triple::aarch64_be:
3555     return CGF->EmitAArch64BuiltinExpr(BuiltinID, E, Arch);
3556   case llvm::Triple::x86:
3557   case llvm::Triple::x86_64:
3558     return CGF->EmitX86BuiltinExpr(BuiltinID, E);
3559   case llvm::Triple::ppc:
3560   case llvm::Triple::ppc64:
3561   case llvm::Triple::ppc64le:
3562     return CGF->EmitPPCBuiltinExpr(BuiltinID, E);
3563   case llvm::Triple::r600:
3564   case llvm::Triple::amdgcn:
3565     return CGF->EmitAMDGPUBuiltinExpr(BuiltinID, E);
3566   case llvm::Triple::systemz:
3567     return CGF->EmitSystemZBuiltinExpr(BuiltinID, E);
3568   case llvm::Triple::nvptx:
3569   case llvm::Triple::nvptx64:
3570     return CGF->EmitNVPTXBuiltinExpr(BuiltinID, E);
3571   case llvm::Triple::wasm32:
3572   case llvm::Triple::wasm64:
3573     return CGF->EmitWebAssemblyBuiltinExpr(BuiltinID, E);
3574   case llvm::Triple::hexagon:
3575     return CGF->EmitHexagonBuiltinExpr(BuiltinID, E);
3576   default:
3577     return nullptr;
3578   }
3579 }
3580 
3581 Value *CodeGenFunction::EmitTargetBuiltinExpr(unsigned BuiltinID,
3582                                               const CallExpr *E) {
3583   if (getContext().BuiltinInfo.isAuxBuiltinID(BuiltinID)) {
3584     assert(getContext().getAuxTargetInfo() && "Missing aux target info");
3585     return EmitTargetArchBuiltinExpr(
3586         this, getContext().BuiltinInfo.getAuxBuiltinID(BuiltinID), E,
3587         getContext().getAuxTargetInfo()->getTriple().getArch());
3588   }
3589 
3590   return EmitTargetArchBuiltinExpr(this, BuiltinID, E,
3591                                    getTarget().getTriple().getArch());
3592 }
3593 
3594 static llvm::VectorType *GetNeonType(CodeGenFunction *CGF,
3595                                      NeonTypeFlags TypeFlags,
3596                                      bool HasLegalHalfType=true,
3597                                      bool V1Ty=false) {
3598   int IsQuad = TypeFlags.isQuad();
3599   switch (TypeFlags.getEltType()) {
3600   case NeonTypeFlags::Int8:
3601   case NeonTypeFlags::Poly8:
3602     return llvm::VectorType::get(CGF->Int8Ty, V1Ty ? 1 : (8 << IsQuad));
3603   case NeonTypeFlags::Int16:
3604   case NeonTypeFlags::Poly16:
3605     return llvm::VectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
3606   case NeonTypeFlags::Float16:
3607     if (HasLegalHalfType)
3608       return llvm::VectorType::get(CGF->HalfTy, V1Ty ? 1 : (4 << IsQuad));
3609     else
3610       return llvm::VectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
3611   case NeonTypeFlags::Int32:
3612     return llvm::VectorType::get(CGF->Int32Ty, V1Ty ? 1 : (2 << IsQuad));
3613   case NeonTypeFlags::Int64:
3614   case NeonTypeFlags::Poly64:
3615     return llvm::VectorType::get(CGF->Int64Ty, V1Ty ? 1 : (1 << IsQuad));
3616   case NeonTypeFlags::Poly128:
3617     // FIXME: i128 and f128 doesn't get fully support in Clang and llvm.
3618     // There is a lot of i128 and f128 API missing.
3619     // so we use v16i8 to represent poly128 and get pattern matched.
3620     return llvm::VectorType::get(CGF->Int8Ty, 16);
3621   case NeonTypeFlags::Float32:
3622     return llvm::VectorType::get(CGF->FloatTy, V1Ty ? 1 : (2 << IsQuad));
3623   case NeonTypeFlags::Float64:
3624     return llvm::VectorType::get(CGF->DoubleTy, V1Ty ? 1 : (1 << IsQuad));
3625   }
3626   llvm_unreachable("Unknown vector element type!");
3627 }
3628 
3629 static llvm::VectorType *GetFloatNeonType(CodeGenFunction *CGF,
3630                                           NeonTypeFlags IntTypeFlags) {
3631   int IsQuad = IntTypeFlags.isQuad();
3632   switch (IntTypeFlags.getEltType()) {
3633   case NeonTypeFlags::Int16:
3634     return llvm::VectorType::get(CGF->HalfTy, (4 << IsQuad));
3635   case NeonTypeFlags::Int32:
3636     return llvm::VectorType::get(CGF->FloatTy, (2 << IsQuad));
3637   case NeonTypeFlags::Int64:
3638     return llvm::VectorType::get(CGF->DoubleTy, (1 << IsQuad));
3639   default:
3640     llvm_unreachable("Type can't be converted to floating-point!");
3641   }
3642 }
3643 
3644 Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C) {
3645   unsigned nElts = V->getType()->getVectorNumElements();
3646   Value* SV = llvm::ConstantVector::getSplat(nElts, C);
3647   return Builder.CreateShuffleVector(V, V, SV, "lane");
3648 }
3649 
3650 Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value*> &Ops,
3651                                      const char *name,
3652                                      unsigned shift, bool rightshift) {
3653   unsigned j = 0;
3654   for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
3655        ai != ae; ++ai, ++j)
3656     if (shift > 0 && shift == j)
3657       Ops[j] = EmitNeonShiftVector(Ops[j], ai->getType(), rightshift);
3658     else
3659       Ops[j] = Builder.CreateBitCast(Ops[j], ai->getType(), name);
3660 
3661   return Builder.CreateCall(F, Ops, name);
3662 }
3663 
3664 Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
3665                                             bool neg) {
3666   int SV = cast<ConstantInt>(V)->getSExtValue();
3667   return ConstantInt::get(Ty, neg ? -SV : SV);
3668 }
3669 
3670 // \brief Right-shift a vector by a constant.
3671 Value *CodeGenFunction::EmitNeonRShiftImm(Value *Vec, Value *Shift,
3672                                           llvm::Type *Ty, bool usgn,
3673                                           const char *name) {
3674   llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);
3675 
3676   int ShiftAmt = cast<ConstantInt>(Shift)->getSExtValue();
3677   int EltSize = VTy->getScalarSizeInBits();
3678 
3679   Vec = Builder.CreateBitCast(Vec, Ty);
3680 
3681   // lshr/ashr are undefined when the shift amount is equal to the vector
3682   // element size.
3683   if (ShiftAmt == EltSize) {
3684     if (usgn) {
3685       // Right-shifting an unsigned value by its size yields 0.
3686       return llvm::ConstantAggregateZero::get(VTy);
3687     } else {
3688       // Right-shifting a signed value by its size is equivalent
3689       // to a shift of size-1.
3690       --ShiftAmt;
3691       Shift = ConstantInt::get(VTy->getElementType(), ShiftAmt);
3692     }
3693   }
3694 
3695   Shift = EmitNeonShiftVector(Shift, Ty, false);
3696   if (usgn)
3697     return Builder.CreateLShr(Vec, Shift, name);
3698   else
3699     return Builder.CreateAShr(Vec, Shift, name);
3700 }
3701 
// Flag bits that are OR-ed together to form the TypeModifier value of a
// NeonIntrinsicInfo table entry.
enum {
  // Individual flag bits.
  AddRetType = 0x001,
  Add1ArgType = 0x002,
  Add2ArgTypes = 0x004,

  VectorizeRetType = 0x008,
  VectorizeArgTypes = 0x010,

  InventFloatType = 0x020,
  UnsignedAlts = 0x040,

  Use64BitVectors = 0x080,
  Use128BitVectors = 0x100,

  // Named combinations of the bits above.
  Vectorize1ArgType = VectorizeArgTypes | Add1ArgType,
  VectorRet = VectorizeRetType | AddRetType,
  VectorRetGetArgs01 =
      VectorizeArgTypes | VectorizeRetType | Add2ArgTypes | AddRetType,
  FpCmpzModifiers =
      InventFloatType | Add1ArgType | VectorizeRetType | AddRetType
};
3723 
namespace {
/// One row of a NEON builtin lookup table (see the NEONMAP* macros, which
/// produce the initializers).
struct NeonIntrinsicInfo {
  /// Stringified base name of the builtin.
  const char *NameHint;
  /// The NEON builtin this row describes; the only field used for ordering.
  unsigned BuiltinID;
  /// Primary LLVM intrinsic ID, or 0 when the row records none.
  unsigned LLVMIntrinsic;
  /// Alternate LLVM intrinsic ID, or 0 when the row records none.
  unsigned AltLLVMIntrinsic;
  /// Bitwise OR of the type-modifier flags for this row.
  unsigned TypeModifier;

  /// Compare this row's builtin ID against a bare builtin ID.
  bool operator<(unsigned RHSBuiltinID) const {
    return RHSBuiltinID > BuiltinID;
  }

  /// Order two rows by builtin ID.
  bool operator<(const NeonIntrinsicInfo &TE) const {
    return *this < TE.BuiltinID;
  }
};
} // end anonymous namespace
3740 
// The NEONMAP* macros expand to NeonIntrinsicInfo initializers. In all of
// them the NameHint is the stringified NameBase and the BuiltinID is
// NEON::BI__builtin_neon_<NameBase>.

// NEONMAP0: a row with no LLVM intrinsic recorded; both intrinsic fields and
// the TypeModifier are 0.
#define NEONMAP0(NameBase) \
  { #NameBase, NEON::BI__builtin_neon_ ## NameBase, 0, 0, 0 }

// NEONMAP1: a row with a single LLVM intrinsic (Intrinsic::<LLVMIntrinsic>);
// the alternate intrinsic field is 0.
#define NEONMAP1(NameBase, LLVMIntrinsic, TypeModifier) \
  { #NameBase, NEON:: BI__builtin_neon_ ## NameBase, \
      Intrinsic::LLVMIntrinsic, 0, TypeModifier }

// NEONMAP2: a row with both a primary and an alternate LLVM intrinsic.
#define NEONMAP2(NameBase, LLVMIntrinsic, AltLLVMIntrinsic, TypeModifier) \
  { #NameBase, NEON:: BI__builtin_neon_ ## NameBase, \
      Intrinsic::LLVMIntrinsic, Intrinsic::AltLLVMIntrinsic, \
      TypeModifier }
3752 
3753 static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = {
3754   NEONMAP2(vabd_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
3755   NEONMAP2(vabdq_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
3756   NEONMAP1(vabs_v, arm_neon_vabs, 0),
3757   NEONMAP1(vabsq_v, arm_neon_vabs, 0),
3758   NEONMAP0(vaddhn_v),
3759   NEONMAP1(vaesdq_v, arm_neon_aesd, 0),
3760   NEONMAP1(vaeseq_v, arm_neon_aese, 0),
3761   NEONMAP1(vaesimcq_v, arm_neon_aesimc, 0),
3762   NEONMAP1(vaesmcq_v, arm_neon_aesmc, 0),
3763   NEONMAP1(vbsl_v, arm_neon_vbsl, AddRetType),
3764   NEONMAP1(vbslq_v, arm_neon_vbsl, AddRetType),
3765   NEONMAP1(vcage_v, arm_neon_vacge, 0),
3766   NEONMAP1(vcageq_v, arm_neon_vacge, 0),
3767   NEONMAP1(vcagt_v, arm_neon_vacgt, 0),
3768   NEONMAP1(vcagtq_v, arm_neon_vacgt, 0),
3769   NEONMAP1(vcale_v, arm_neon_vacge, 0),
3770   NEONMAP1(vcaleq_v, arm_neon_vacge, 0),
3771   NEONMAP1(vcalt_v, arm_neon_vacgt, 0),
3772   NEONMAP1(vcaltq_v, arm_neon_vacgt, 0),
3773   NEONMAP0(vceqz_v),
3774   NEONMAP0(vceqzq_v),
3775   NEONMAP0(vcgez_v),
3776   NEONMAP0(vcgezq_v),
3777   NEONMAP0(vcgtz_v),
3778   NEONMAP0(vcgtzq_v),
3779   NEONMAP0(vclez_v),
3780   NEONMAP0(vclezq_v),
3781   NEONMAP1(vcls_v, arm_neon_vcls, Add1ArgType),
3782   NEONMAP1(vclsq_v, arm_neon_vcls, Add1ArgType),
3783   NEONMAP0(vcltz_v),
3784   NEONMAP0(vcltzq_v),
3785   NEONMAP1(vclz_v, ctlz, Add1ArgType),
3786   NEONMAP1(vclzq_v, ctlz, Add1ArgType),
3787   NEONMAP1(vcnt_v, ctpop, Add1ArgType),
3788   NEONMAP1(vcntq_v, ctpop, Add1ArgType),
3789   NEONMAP1(vcvt_f16_f32, arm_neon_vcvtfp2hf, 0),
3790   NEONMAP0(vcvt_f16_v),
3791   NEONMAP1(vcvt_f32_f16, arm_neon_vcvthf2fp, 0),
3792   NEONMAP0(vcvt_f32_v),
3793   NEONMAP2(vcvt_n_f16_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
3794   NEONMAP2(vcvt_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
3795   NEONMAP1(vcvt_n_s16_v, arm_neon_vcvtfp2fxs, 0),
3796   NEONMAP1(vcvt_n_s32_v, arm_neon_vcvtfp2fxs, 0),
3797   NEONMAP1(vcvt_n_s64_v, arm_neon_vcvtfp2fxs, 0),
3798   NEONMAP1(vcvt_n_u16_v, arm_neon_vcvtfp2fxu, 0),
3799   NEONMAP1(vcvt_n_u32_v, arm_neon_vcvtfp2fxu, 0),
3800   NEONMAP1(vcvt_n_u64_v, arm_neon_vcvtfp2fxu, 0),
3801   NEONMAP0(vcvt_s16_v),
3802   NEONMAP0(vcvt_s32_v),
3803   NEONMAP0(vcvt_s64_v),
3804   NEONMAP0(vcvt_u16_v),
3805   NEONMAP0(vcvt_u32_v),
3806   NEONMAP0(vcvt_u64_v),
3807   NEONMAP1(vcvta_s16_v, arm_neon_vcvtas, 0),
3808   NEONMAP1(vcvta_s32_v, arm_neon_vcvtas, 0),
3809   NEONMAP1(vcvta_s64_v, arm_neon_vcvtas, 0),
3810   NEONMAP1(vcvta_u32_v, arm_neon_vcvtau, 0),
3811   NEONMAP1(vcvta_u64_v, arm_neon_vcvtau, 0),
3812   NEONMAP1(vcvtaq_s16_v, arm_neon_vcvtas, 0),
3813   NEONMAP1(vcvtaq_s32_v, arm_neon_vcvtas, 0),
3814   NEONMAP1(vcvtaq_s64_v, arm_neon_vcvtas, 0),
3815   NEONMAP1(vcvtaq_u16_v, arm_neon_vcvtau, 0),
3816   NEONMAP1(vcvtaq_u32_v, arm_neon_vcvtau, 0),
3817   NEONMAP1(vcvtaq_u64_v, arm_neon_vcvtau, 0),
3818   NEONMAP1(vcvtm_s16_v, arm_neon_vcvtms, 0),
3819   NEONMAP1(vcvtm_s32_v, arm_neon_vcvtms, 0),
3820   NEONMAP1(vcvtm_s64_v, arm_neon_vcvtms, 0),
3821   NEONMAP1(vcvtm_u16_v, arm_neon_vcvtmu, 0),
3822   NEONMAP1(vcvtm_u32_v, arm_neon_vcvtmu, 0),
3823   NEONMAP1(vcvtm_u64_v, arm_neon_vcvtmu, 0),
3824   NEONMAP1(vcvtmq_s16_v, arm_neon_vcvtms, 0),
3825   NEONMAP1(vcvtmq_s32_v, arm_neon_vcvtms, 0),
3826   NEONMAP1(vcvtmq_s64_v, arm_neon_vcvtms, 0),
3827   NEONMAP1(vcvtmq_u16_v, arm_neon_vcvtmu, 0),
3828   NEONMAP1(vcvtmq_u32_v, arm_neon_vcvtmu, 0),
3829   NEONMAP1(vcvtmq_u64_v, arm_neon_vcvtmu, 0),
3830   NEONMAP1(vcvtn_s16_v, arm_neon_vcvtns, 0),
3831   NEONMAP1(vcvtn_s32_v, arm_neon_vcvtns, 0),
3832   NEONMAP1(vcvtn_s64_v, arm_neon_vcvtns, 0),
3833   NEONMAP1(vcvtn_u16_v, arm_neon_vcvtnu, 0),
3834   NEONMAP1(vcvtn_u32_v, arm_neon_vcvtnu, 0),
3835   NEONMAP1(vcvtn_u64_v, arm_neon_vcvtnu, 0),
3836   NEONMAP1(vcvtnq_s16_v, arm_neon_vcvtns, 0),
3837   NEONMAP1(vcvtnq_s32_v, arm_neon_vcvtns, 0),
3838   NEONMAP1(vcvtnq_s64_v, arm_neon_vcvtns, 0),
3839   NEONMAP1(vcvtnq_u16_v, arm_neon_vcvtnu, 0),
3840   NEONMAP1(vcvtnq_u32_v, arm_neon_vcvtnu, 0),
3841   NEONMAP1(vcvtnq_u64_v, arm_neon_vcvtnu, 0),
3842   NEONMAP1(vcvtp_s16_v, arm_neon_vcvtps, 0),
3843   NEONMAP1(vcvtp_s32_v, arm_neon_vcvtps, 0),
3844   NEONMAP1(vcvtp_s64_v, arm_neon_vcvtps, 0),
3845   NEONMAP1(vcvtp_u16_v, arm_neon_vcvtpu, 0),
3846   NEONMAP1(vcvtp_u32_v, arm_neon_vcvtpu, 0),
3847   NEONMAP1(vcvtp_u64_v, arm_neon_vcvtpu, 0),
3848   NEONMAP1(vcvtpq_s16_v, arm_neon_vcvtps, 0),
3849   NEONMAP1(vcvtpq_s32_v, arm_neon_vcvtps, 0),
3850   NEONMAP1(vcvtpq_s64_v, arm_neon_vcvtps, 0),
3851   NEONMAP1(vcvtpq_u16_v, arm_neon_vcvtpu, 0),
3852   NEONMAP1(vcvtpq_u32_v, arm_neon_vcvtpu, 0),
3853   NEONMAP1(vcvtpq_u64_v, arm_neon_vcvtpu, 0),
3854   NEONMAP0(vcvtq_f16_v),
3855   NEONMAP0(vcvtq_f32_v),
3856   NEONMAP2(vcvtq_n_f16_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
3857   NEONMAP2(vcvtq_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
3858   NEONMAP1(vcvtq_n_s16_v, arm_neon_vcvtfp2fxs, 0),
3859   NEONMAP1(vcvtq_n_s32_v, arm_neon_vcvtfp2fxs, 0),
3860   NEONMAP1(vcvtq_n_s64_v, arm_neon_vcvtfp2fxs, 0),
3861   NEONMAP1(vcvtq_n_u16_v, arm_neon_vcvtfp2fxu, 0),
3862   NEONMAP1(vcvtq_n_u32_v, arm_neon_vcvtfp2fxu, 0),
3863   NEONMAP1(vcvtq_n_u64_v, arm_neon_vcvtfp2fxu, 0),
3864   NEONMAP0(vcvtq_s16_v),
3865   NEONMAP0(vcvtq_s32_v),
3866   NEONMAP0(vcvtq_s64_v),
3867   NEONMAP0(vcvtq_u16_v),
3868   NEONMAP0(vcvtq_u32_v),
3869   NEONMAP0(vcvtq_u64_v),
3870   NEONMAP2(vdot_v, arm_neon_udot, arm_neon_sdot, 0),
3871   NEONMAP2(vdotq_v, arm_neon_udot, arm_neon_sdot, 0),
3872   NEONMAP0(vext_v),
3873   NEONMAP0(vextq_v),
3874   NEONMAP0(vfma_v),
3875   NEONMAP0(vfmaq_v),
3876   NEONMAP2(vhadd_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
3877   NEONMAP2(vhaddq_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
3878   NEONMAP2(vhsub_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
3879   NEONMAP2(vhsubq_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
3880   NEONMAP0(vld1_dup_v),
3881   NEONMAP1(vld1_v, arm_neon_vld1, 0),
3882   NEONMAP0(vld1q_dup_v),
3883   NEONMAP1(vld1q_v, arm_neon_vld1, 0),
3884   NEONMAP1(vld2_lane_v, arm_neon_vld2lane, 0),
3885   NEONMAP1(vld2_v, arm_neon_vld2, 0),
3886   NEONMAP1(vld2q_lane_v, arm_neon_vld2lane, 0),
3887   NEONMAP1(vld2q_v, arm_neon_vld2, 0),
3888   NEONMAP1(vld3_lane_v, arm_neon_vld3lane, 0),
3889   NEONMAP1(vld3_v, arm_neon_vld3, 0),
3890   NEONMAP1(vld3q_lane_v, arm_neon_vld3lane, 0),
3891   NEONMAP1(vld3q_v, arm_neon_vld3, 0),
3892   NEONMAP1(vld4_lane_v, arm_neon_vld4lane, 0),
3893   NEONMAP1(vld4_v, arm_neon_vld4, 0),
3894   NEONMAP1(vld4q_lane_v, arm_neon_vld4lane, 0),
3895   NEONMAP1(vld4q_v, arm_neon_vld4, 0),
3896   NEONMAP2(vmax_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
3897   NEONMAP1(vmaxnm_v, arm_neon_vmaxnm, Add1ArgType),
3898   NEONMAP1(vmaxnmq_v, arm_neon_vmaxnm, Add1ArgType),
3899   NEONMAP2(vmaxq_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
3900   NEONMAP2(vmin_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
3901   NEONMAP1(vminnm_v, arm_neon_vminnm, Add1ArgType),
3902   NEONMAP1(vminnmq_v, arm_neon_vminnm, Add1ArgType),
3903   NEONMAP2(vminq_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
3904   NEONMAP0(vmovl_v),
3905   NEONMAP0(vmovn_v),
3906   NEONMAP1(vmul_v, arm_neon_vmulp, Add1ArgType),
3907   NEONMAP0(vmull_v),
3908   NEONMAP1(vmulq_v, arm_neon_vmulp, Add1ArgType),
3909   NEONMAP2(vpadal_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
3910   NEONMAP2(vpadalq_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
3911   NEONMAP1(vpadd_v, arm_neon_vpadd, Add1ArgType),
3912   NEONMAP2(vpaddl_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
3913   NEONMAP2(vpaddlq_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
3914   NEONMAP1(vpaddq_v, arm_neon_vpadd, Add1ArgType),
3915   NEONMAP2(vpmax_v, arm_neon_vpmaxu, arm_neon_vpmaxs, Add1ArgType | UnsignedAlts),
3916   NEONMAP2(vpmin_v, arm_neon_vpminu, arm_neon_vpmins, Add1ArgType | UnsignedAlts),
3917   NEONMAP1(vqabs_v, arm_neon_vqabs, Add1ArgType),
3918   NEONMAP1(vqabsq_v, arm_neon_vqabs, Add1ArgType),
3919   NEONMAP2(vqadd_v, arm_neon_vqaddu, arm_neon_vqadds, Add1ArgType | UnsignedAlts),
3920   NEONMAP2(vqaddq_v, arm_neon_vqaddu, arm_neon_vqadds, Add1ArgType | UnsignedAlts),
3921   NEONMAP2(vqdmlal_v, arm_neon_vqdmull, arm_neon_vqadds, 0),
3922   NEONMAP2(vqdmlsl_v, arm_neon_vqdmull, arm_neon_vqsubs, 0),
3923   NEONMAP1(vqdmulh_v, arm_neon_vqdmulh, Add1ArgType),
3924   NEONMAP1(vqdmulhq_v, arm_neon_vqdmulh, Add1ArgType),
3925   NEONMAP1(vqdmull_v, arm_neon_vqdmull, Add1ArgType),
3926   NEONMAP2(vqmovn_v, arm_neon_vqmovnu, arm_neon_vqmovns, Add1ArgType | UnsignedAlts),
3927   NEONMAP1(vqmovun_v, arm_neon_vqmovnsu, Add1ArgType),
3928   NEONMAP1(vqneg_v, arm_neon_vqneg, Add1ArgType),
3929   NEONMAP1(vqnegq_v, arm_neon_vqneg, Add1ArgType),
3930   NEONMAP1(vqrdmulh_v, arm_neon_vqrdmulh, Add1ArgType),
3931   NEONMAP1(vqrdmulhq_v, arm_neon_vqrdmulh, Add1ArgType),
3932   NEONMAP2(vqrshl_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
3933   NEONMAP2(vqrshlq_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
3934   NEONMAP2(vqshl_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
3935   NEONMAP2(vqshl_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
3936   NEONMAP2(vqshlq_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
3937   NEONMAP2(vqshlq_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
3938   NEONMAP1(vqshlu_n_v, arm_neon_vqshiftsu, 0),
3939   NEONMAP1(vqshluq_n_v, arm_neon_vqshiftsu, 0),
3940   NEONMAP2(vqsub_v, arm_neon_vqsubu, arm_neon_vqsubs, Add1ArgType | UnsignedAlts),
3941   NEONMAP2(vqsubq_v, arm_neon_vqsubu, arm_neon_vqsubs, Add1ArgType | UnsignedAlts),
3942   NEONMAP1(vraddhn_v, arm_neon_vraddhn, Add1ArgType),
3943   NEONMAP2(vrecpe_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
3944   NEONMAP2(vrecpeq_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
3945   NEONMAP1(vrecps_v, arm_neon_vrecps, Add1ArgType),
3946   NEONMAP1(vrecpsq_v, arm_neon_vrecps, Add1ArgType),
3947   NEONMAP2(vrhadd_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
3948   NEONMAP2(vrhaddq_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
3949   NEONMAP1(vrnd_v, arm_neon_vrintz, Add1ArgType),
3950   NEONMAP1(vrnda_v, arm_neon_vrinta, Add1ArgType),
3951   NEONMAP1(vrndaq_v, arm_neon_vrinta, Add1ArgType),
3952   NEONMAP1(vrndm_v, arm_neon_vrintm, Add1ArgType),
3953   NEONMAP1(vrndmq_v, arm_neon_vrintm, Add1ArgType),
3954   NEONMAP1(vrndn_v, arm_neon_vrintn, Add1ArgType),
3955   NEONMAP1(vrndnq_v, arm_neon_vrintn, Add1ArgType),
3956   NEONMAP1(vrndp_v, arm_neon_vrintp, Add1ArgType),
3957   NEONMAP1(vrndpq_v, arm_neon_vrintp, Add1ArgType),
3958   NEONMAP1(vrndq_v, arm_neon_vrintz, Add1ArgType),
3959   NEONMAP1(vrndx_v, arm_neon_vrintx, Add1ArgType),
3960   NEONMAP1(vrndxq_v, arm_neon_vrintx, Add1ArgType),
3961   NEONMAP2(vrshl_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
3962   NEONMAP2(vrshlq_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
3963   NEONMAP2(vrshr_n_v, arm_neon_vrshiftu, arm_neon_vrshifts, UnsignedAlts),
3964   NEONMAP2(vrshrq_n_v, arm_neon_vrshiftu, arm_neon_vrshifts, UnsignedAlts),
3965   NEONMAP2(vrsqrte_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
3966   NEONMAP2(vrsqrteq_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
3967   NEONMAP1(vrsqrts_v, arm_neon_vrsqrts, Add1ArgType),
3968   NEONMAP1(vrsqrtsq_v, arm_neon_vrsqrts, Add1ArgType),
3969   NEONMAP1(vrsubhn_v, arm_neon_vrsubhn, Add1ArgType),
3970   NEONMAP1(vsha1su0q_v, arm_neon_sha1su0, 0),
3971   NEONMAP1(vsha1su1q_v, arm_neon_sha1su1, 0),
3972   NEONMAP1(vsha256h2q_v, arm_neon_sha256h2, 0),
3973   NEONMAP1(vsha256hq_v, arm_neon_sha256h, 0),
3974   NEONMAP1(vsha256su0q_v, arm_neon_sha256su0, 0),
3975   NEONMAP1(vsha256su1q_v, arm_neon_sha256su1, 0),
3976   NEONMAP0(vshl_n_v),
3977   NEONMAP2(vshl_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
3978   NEONMAP0(vshll_n_v),
3979   NEONMAP0(vshlq_n_v),
3980   NEONMAP2(vshlq_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
3981   NEONMAP0(vshr_n_v),
3982   NEONMAP0(vshrn_n_v),
3983   NEONMAP0(vshrq_n_v),
3984   NEONMAP1(vst1_v, arm_neon_vst1, 0),
3985   NEONMAP1(vst1q_v, arm_neon_vst1, 0),
3986   NEONMAP1(vst2_lane_v, arm_neon_vst2lane, 0),
3987   NEONMAP1(vst2_v, arm_neon_vst2, 0),
3988   NEONMAP1(vst2q_lane_v, arm_neon_vst2lane, 0),
3989   NEONMAP1(vst2q_v, arm_neon_vst2, 0),
3990   NEONMAP1(vst3_lane_v, arm_neon_vst3lane, 0),
3991   NEONMAP1(vst3_v, arm_neon_vst3, 0),
3992   NEONMAP1(vst3q_lane_v, arm_neon_vst3lane, 0),
3993   NEONMAP1(vst3q_v, arm_neon_vst3, 0),
3994   NEONMAP1(vst4_lane_v, arm_neon_vst4lane, 0),
3995   NEONMAP1(vst4_v, arm_neon_vst4, 0),
3996   NEONMAP1(vst4q_lane_v, arm_neon_vst4lane, 0),
3997   NEONMAP1(vst4q_v, arm_neon_vst4, 0),
3998   NEONMAP0(vsubhn_v),
3999   NEONMAP0(vtrn_v),
4000   NEONMAP0(vtrnq_v),
4001   NEONMAP0(vtst_v),
4002   NEONMAP0(vtstq_v),
4003   NEONMAP0(vuzp_v),
4004   NEONMAP0(vuzpq_v),
4005   NEONMAP0(vzip_v),
4006   NEONMAP0(vzipq_v)
4007 };
4008 
// Table describing how NEON vector builtins are lowered for AArch64: each row
// pairs a builtin with the LLVM intrinsic(s) and type-modifier flags recorded
// for it. NEONMAP0 rows record no intrinsic.
//
// Rows are listed in ascending builtin-name order. NOTE(review): lookups
// presumably depend on the table being sorted by BuiltinID (see
// NeonIntrinsicInfo::operator<) -- keep new rows in order and confirm against
// the lookup code.
static const NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
  NEONMAP1(vabs_v, aarch64_neon_abs, 0),
  NEONMAP1(vabsq_v, aarch64_neon_abs, 0),
  NEONMAP0(vaddhn_v),
  NEONMAP1(vaesdq_v, aarch64_crypto_aesd, 0),
  NEONMAP1(vaeseq_v, aarch64_crypto_aese, 0),
  NEONMAP1(vaesimcq_v, aarch64_crypto_aesimc, 0),
  NEONMAP1(vaesmcq_v, aarch64_crypto_aesmc, 0),
  NEONMAP1(vcage_v, aarch64_neon_facge, 0),
  NEONMAP1(vcageq_v, aarch64_neon_facge, 0),
  NEONMAP1(vcagt_v, aarch64_neon_facgt, 0),
  NEONMAP1(vcagtq_v, aarch64_neon_facgt, 0),
  NEONMAP1(vcale_v, aarch64_neon_facge, 0),
  NEONMAP1(vcaleq_v, aarch64_neon_facge, 0),
  NEONMAP1(vcalt_v, aarch64_neon_facgt, 0),
  NEONMAP1(vcaltq_v, aarch64_neon_facgt, 0),
  NEONMAP0(vceqz_v),
  NEONMAP0(vceqzq_v),
  NEONMAP0(vcgez_v),
  NEONMAP0(vcgezq_v),
  NEONMAP0(vcgtz_v),
  NEONMAP0(vcgtzq_v),
  NEONMAP0(vclez_v),
  NEONMAP0(vclezq_v),
  NEONMAP1(vcls_v, aarch64_neon_cls, Add1ArgType),
  NEONMAP1(vclsq_v, aarch64_neon_cls, Add1ArgType),
  NEONMAP0(vcltz_v),
  NEONMAP0(vcltzq_v),
  NEONMAP1(vclz_v, ctlz, Add1ArgType),
  NEONMAP1(vclzq_v, ctlz, Add1ArgType),
  NEONMAP1(vcnt_v, ctpop, Add1ArgType),
  NEONMAP1(vcntq_v, ctpop, Add1ArgType),
  NEONMAP1(vcvt_f16_f32, aarch64_neon_vcvtfp2hf, 0),
  NEONMAP0(vcvt_f16_v),
  NEONMAP1(vcvt_f32_f16, aarch64_neon_vcvthf2fp, 0),
  NEONMAP0(vcvt_f32_v),
  NEONMAP2(vcvt_n_f16_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP2(vcvt_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP2(vcvt_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvt_n_s16_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_u16_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvt_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvt_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP0(vcvtq_f16_v),
  NEONMAP0(vcvtq_f32_v),
  NEONMAP2(vcvtq_n_f16_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP2(vcvtq_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvtq_n_s16_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_u16_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtq_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtq_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtx_f32_v, aarch64_neon_fcvtxn, AddRetType | Add1ArgType),
  NEONMAP2(vdot_v, aarch64_neon_udot, aarch64_neon_sdot, 0),
  NEONMAP2(vdotq_v, aarch64_neon_udot, aarch64_neon_sdot, 0),
  NEONMAP0(vext_v),
  NEONMAP0(vextq_v),
  NEONMAP0(vfma_v),
  NEONMAP0(vfmaq_v),
  NEONMAP2(vhadd_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhaddq_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhsub_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhsubq_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
  NEONMAP0(vmovl_v),
  NEONMAP0(vmovn_v),
  NEONMAP1(vmul_v, aarch64_neon_pmul, Add1ArgType),
  NEONMAP1(vmulq_v, aarch64_neon_pmul, Add1ArgType),
  NEONMAP1(vpadd_v, aarch64_neon_addp, Add1ArgType),
  NEONMAP2(vpaddl_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
  NEONMAP2(vpaddlq_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
  NEONMAP1(vpaddq_v, aarch64_neon_addp, Add1ArgType),
  NEONMAP1(vqabs_v, aarch64_neon_sqabs, Add1ArgType),
  NEONMAP1(vqabsq_v, aarch64_neon_sqabs, Add1ArgType),
  NEONMAP2(vqadd_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqaddq_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqdmlal_v, aarch64_neon_sqdmull, aarch64_neon_sqadd, 0),
  NEONMAP2(vqdmlsl_v, aarch64_neon_sqdmull, aarch64_neon_sqsub, 0),
  NEONMAP1(vqdmulh_v, aarch64_neon_sqdmulh, Add1ArgType),
  NEONMAP1(vqdmulhq_v, aarch64_neon_sqdmulh, Add1ArgType),
  NEONMAP1(vqdmull_v, aarch64_neon_sqdmull, Add1ArgType),
  NEONMAP2(vqmovn_v, aarch64_neon_uqxtn, aarch64_neon_sqxtn, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqmovun_v, aarch64_neon_sqxtun, Add1ArgType),
  NEONMAP1(vqneg_v, aarch64_neon_sqneg, Add1ArgType),
  NEONMAP1(vqnegq_v, aarch64_neon_sqneg, Add1ArgType),
  NEONMAP1(vqrdmulh_v, aarch64_neon_sqrdmulh, Add1ArgType),
  NEONMAP1(vqrdmulhq_v, aarch64_neon_sqrdmulh, Add1ArgType),
  NEONMAP2(vqrshl_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqrshlq_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqshl_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl, UnsignedAlts),
  NEONMAP2(vqshl_v, aarch64_neon_uqshl, aarch64_neon_sqshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqshlq_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl,UnsignedAlts),
  NEONMAP2(vqshlq_v, aarch64_neon_uqshl, aarch64_neon_sqshl, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqshlu_n_v, aarch64_neon_sqshlu, 0),
  NEONMAP1(vqshluq_n_v, aarch64_neon_sqshlu, 0),
  NEONMAP2(vqsub_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqsubq_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts),
  NEONMAP1(vraddhn_v, aarch64_neon_raddhn, Add1ArgType),
  NEONMAP2(vrecpe_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
  NEONMAP2(vrecpeq_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
  NEONMAP1(vrecps_v, aarch64_neon_frecps, Add1ArgType),
  NEONMAP1(vrecpsq_v, aarch64_neon_frecps, Add1ArgType),
  NEONMAP2(vrhadd_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrhaddq_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrshl_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrshlq_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrshr_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
  NEONMAP2(vrshrq_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
  NEONMAP2(vrsqrte_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
  NEONMAP2(vrsqrteq_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
  NEONMAP1(vrsqrts_v, aarch64_neon_frsqrts, Add1ArgType),
  NEONMAP1(vrsqrtsq_v, aarch64_neon_frsqrts, Add1ArgType),
  NEONMAP1(vrsubhn_v, aarch64_neon_rsubhn, Add1ArgType),
  NEONMAP1(vsha1su0q_v, aarch64_crypto_sha1su0, 0),
  NEONMAP1(vsha1su1q_v, aarch64_crypto_sha1su1, 0),
  NEONMAP1(vsha256h2q_v, aarch64_crypto_sha256h2, 0),
  NEONMAP1(vsha256hq_v, aarch64_crypto_sha256h, 0),
  NEONMAP1(vsha256su0q_v, aarch64_crypto_sha256su0, 0),
  NEONMAP1(vsha256su1q_v, aarch64_crypto_sha256su1, 0),
  NEONMAP0(vshl_n_v),
  NEONMAP2(vshl_v, aarch64_neon_ushl, aarch64_neon_sshl, Add1ArgType | UnsignedAlts),
  NEONMAP0(vshll_n_v),
  NEONMAP0(vshlq_n_v),
  NEONMAP2(vshlq_v, aarch64_neon_ushl, aarch64_neon_sshl, Add1ArgType | UnsignedAlts),
  NEONMAP0(vshr_n_v),
  NEONMAP0(vshrn_n_v),
  NEONMAP0(vshrq_n_v),
  NEONMAP0(vsubhn_v),
  NEONMAP0(vtst_v),
  NEONMAP0(vtstq_v),
};
4143 
// Table describing the AArch64 scalar (SISD) NEON builtins. Each NEONMAP1
// entry names a builtin, the LLVM intrinsic emitted for it, and the
// type-modifier flags (Add1ArgType, AddRetType, VectorRet, ...) that are later
// read back through NeonIntrinsicInfo::TypeModifier when the intrinsic's
// overloaded types are chosen.
// NOTE(review): findNeonIntrinsicInMap asserts that a map is sorted and then
// binary-searches it, so new entries presumably have to be inserted in
// builtin-ID order -- confirm against the NEONMAP1 definition and the builtin
// enumeration before adding or moving entries.
4144 static const NeonIntrinsicInfo AArch64SISDIntrinsicMap[] = {
4145   NEONMAP1(vabdd_f64, aarch64_sisd_fabd, Add1ArgType),
4146   NEONMAP1(vabds_f32, aarch64_sisd_fabd, Add1ArgType),
4147   NEONMAP1(vabsd_s64, aarch64_neon_abs, Add1ArgType),
4148   NEONMAP1(vaddlv_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType),
4149   NEONMAP1(vaddlv_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType),
4150   NEONMAP1(vaddlvq_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType),
4151   NEONMAP1(vaddlvq_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType),
4152   NEONMAP1(vaddv_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
4153   NEONMAP1(vaddv_s32, aarch64_neon_saddv, AddRetType | Add1ArgType),
4154   NEONMAP1(vaddv_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType),
4155   NEONMAP1(vaddvq_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
4156   NEONMAP1(vaddvq_f64, aarch64_neon_faddv, AddRetType | Add1ArgType),
4157   NEONMAP1(vaddvq_s32, aarch64_neon_saddv, AddRetType | Add1ArgType),
4158   NEONMAP1(vaddvq_s64, aarch64_neon_saddv, AddRetType | Add1ArgType),
4159   NEONMAP1(vaddvq_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType),
4160   NEONMAP1(vaddvq_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
4161   NEONMAP1(vcaged_f64, aarch64_neon_facge, AddRetType | Add1ArgType),
4162   NEONMAP1(vcages_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
4163   NEONMAP1(vcagtd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
4164   NEONMAP1(vcagts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType),
4165   NEONMAP1(vcaled_f64, aarch64_neon_facge, AddRetType | Add1ArgType),
4166   NEONMAP1(vcales_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
4167   NEONMAP1(vcaltd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
4168   NEONMAP1(vcalts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType),
4169   NEONMAP1(vcvtad_s64_f64, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
4170   NEONMAP1(vcvtad_u64_f64, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
4171   NEONMAP1(vcvtas_s32_f32, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
4172   NEONMAP1(vcvtas_u32_f32, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
4173   NEONMAP1(vcvtd_n_f64_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
4174   NEONMAP1(vcvtd_n_f64_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
4175   NEONMAP1(vcvtd_n_s64_f64, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
4176   NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
4177   NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
4178   NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
4179   NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
4180   NEONMAP1(vcvtms_u32_f32, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
4181   NEONMAP1(vcvtnd_s64_f64, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
4182   NEONMAP1(vcvtnd_u64_f64, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
4183   NEONMAP1(vcvtns_s32_f32, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
4184   NEONMAP1(vcvtns_u32_f32, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
4185   NEONMAP1(vcvtpd_s64_f64, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
4186   NEONMAP1(vcvtpd_u64_f64, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
4187   NEONMAP1(vcvtps_s32_f32, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
4188   NEONMAP1(vcvtps_u32_f32, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
4189   NEONMAP1(vcvts_n_f32_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
4190   NEONMAP1(vcvts_n_f32_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
4191   NEONMAP1(vcvts_n_s32_f32, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
4192   NEONMAP1(vcvts_n_u32_f32, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
4193   NEONMAP1(vcvtxd_f32_f64, aarch64_sisd_fcvtxn, 0),
4194   NEONMAP1(vmaxnmv_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
4195   NEONMAP1(vmaxnmvq_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
4196   NEONMAP1(vmaxnmvq_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
4197   NEONMAP1(vmaxv_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
4198   NEONMAP1(vmaxv_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType),
4199   NEONMAP1(vmaxv_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType),
4200   NEONMAP1(vmaxvq_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
4201   NEONMAP1(vmaxvq_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
4202   NEONMAP1(vmaxvq_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType),
4203   NEONMAP1(vmaxvq_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType),
4204   NEONMAP1(vminnmv_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
4205   NEONMAP1(vminnmvq_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
4206   NEONMAP1(vminnmvq_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
4207   NEONMAP1(vminv_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
4208   NEONMAP1(vminv_s32, aarch64_neon_sminv, AddRetType | Add1ArgType),
4209   NEONMAP1(vminv_u32, aarch64_neon_uminv, AddRetType | Add1ArgType),
4210   NEONMAP1(vminvq_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
4211   NEONMAP1(vminvq_f64, aarch64_neon_fminv, AddRetType | Add1ArgType),
4212   NEONMAP1(vminvq_s32, aarch64_neon_sminv, AddRetType | Add1ArgType),
4213   NEONMAP1(vminvq_u32, aarch64_neon_uminv, AddRetType | Add1ArgType),
4214   NEONMAP1(vmull_p64, aarch64_neon_pmull64, 0),
4215   NEONMAP1(vmulxd_f64, aarch64_neon_fmulx, Add1ArgType),
4216   NEONMAP1(vmulxs_f32, aarch64_neon_fmulx, Add1ArgType),
4217   NEONMAP1(vpaddd_s64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
4218   NEONMAP1(vpaddd_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
4219   NEONMAP1(vpmaxnmqd_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
4220   NEONMAP1(vpmaxnms_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
4221   NEONMAP1(vpmaxqd_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
4222   NEONMAP1(vpmaxs_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
4223   NEONMAP1(vpminnmqd_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
4224   NEONMAP1(vpminnms_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
4225   NEONMAP1(vpminqd_f64, aarch64_neon_fminv, AddRetType | Add1ArgType),
4226   NEONMAP1(vpmins_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
4227   NEONMAP1(vqabsb_s8, aarch64_neon_sqabs, Vectorize1ArgType | Use64BitVectors),
4228   NEONMAP1(vqabsd_s64, aarch64_neon_sqabs, Add1ArgType),
4229   NEONMAP1(vqabsh_s16, aarch64_neon_sqabs, Vectorize1ArgType | Use64BitVectors),
4230   NEONMAP1(vqabss_s32, aarch64_neon_sqabs, Add1ArgType),
4231   NEONMAP1(vqaddb_s8, aarch64_neon_sqadd, Vectorize1ArgType | Use64BitVectors),
4232   NEONMAP1(vqaddb_u8, aarch64_neon_uqadd, Vectorize1ArgType | Use64BitVectors),
4233   NEONMAP1(vqaddd_s64, aarch64_neon_sqadd, Add1ArgType),
4234   NEONMAP1(vqaddd_u64, aarch64_neon_uqadd, Add1ArgType),
4235   NEONMAP1(vqaddh_s16, aarch64_neon_sqadd, Vectorize1ArgType | Use64BitVectors),
4236   NEONMAP1(vqaddh_u16, aarch64_neon_uqadd, Vectorize1ArgType | Use64BitVectors),
4237   NEONMAP1(vqadds_s32, aarch64_neon_sqadd, Add1ArgType),
4238   NEONMAP1(vqadds_u32, aarch64_neon_uqadd, Add1ArgType),
4239   NEONMAP1(vqdmulhh_s16, aarch64_neon_sqdmulh, Vectorize1ArgType | Use64BitVectors),
4240   NEONMAP1(vqdmulhs_s32, aarch64_neon_sqdmulh, Add1ArgType),
4241   NEONMAP1(vqdmullh_s16, aarch64_neon_sqdmull, VectorRet | Use128BitVectors),
4242   NEONMAP1(vqdmulls_s32, aarch64_neon_sqdmulls_scalar, 0),
4243   NEONMAP1(vqmovnd_s64, aarch64_neon_scalar_sqxtn, AddRetType | Add1ArgType),
4244   NEONMAP1(vqmovnd_u64, aarch64_neon_scalar_uqxtn, AddRetType | Add1ArgType),
4245   NEONMAP1(vqmovnh_s16, aarch64_neon_sqxtn, VectorRet | Use64BitVectors),
4246   NEONMAP1(vqmovnh_u16, aarch64_neon_uqxtn, VectorRet | Use64BitVectors),
4247   NEONMAP1(vqmovns_s32, aarch64_neon_sqxtn, VectorRet | Use64BitVectors),
4248   NEONMAP1(vqmovns_u32, aarch64_neon_uqxtn, VectorRet | Use64BitVectors),
4249   NEONMAP1(vqmovund_s64, aarch64_neon_scalar_sqxtun, AddRetType | Add1ArgType),
4250   NEONMAP1(vqmovunh_s16, aarch64_neon_sqxtun, VectorRet | Use64BitVectors),
4251   NEONMAP1(vqmovuns_s32, aarch64_neon_sqxtun, VectorRet | Use64BitVectors),
4252   NEONMAP1(vqnegb_s8, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors),
4253   NEONMAP1(vqnegd_s64, aarch64_neon_sqneg, Add1ArgType),
4254   NEONMAP1(vqnegh_s16, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors),
4255   NEONMAP1(vqnegs_s32, aarch64_neon_sqneg, Add1ArgType),
4256   NEONMAP1(vqrdmulhh_s16, aarch64_neon_sqrdmulh, Vectorize1ArgType | Use64BitVectors),
4257   NEONMAP1(vqrdmulhs_s32, aarch64_neon_sqrdmulh, Add1ArgType),
4258   NEONMAP1(vqrshlb_s8, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors),
4259   NEONMAP1(vqrshlb_u8, aarch64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors),
4260   NEONMAP1(vqrshld_s64, aarch64_neon_sqrshl, Add1ArgType),
4261   NEONMAP1(vqrshld_u64, aarch64_neon_uqrshl, Add1ArgType),
4262   NEONMAP1(vqrshlh_s16, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors),
4263   NEONMAP1(vqrshlh_u16, aarch64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors),
4264   NEONMAP1(vqrshls_s32, aarch64_neon_sqrshl, Add1ArgType),
4265   NEONMAP1(vqrshls_u32, aarch64_neon_uqrshl, Add1ArgType),
4266   NEONMAP1(vqrshrnd_n_s64, aarch64_neon_sqrshrn, AddRetType),
4267   NEONMAP1(vqrshrnd_n_u64, aarch64_neon_uqrshrn, AddRetType),
4268   NEONMAP1(vqrshrnh_n_s16, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors),
4269   NEONMAP1(vqrshrnh_n_u16, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors),
4270   NEONMAP1(vqrshrns_n_s32, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors),
4271   NEONMAP1(vqrshrns_n_u32, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors),
4272   NEONMAP1(vqrshrund_n_s64, aarch64_neon_sqrshrun, AddRetType),
4273   NEONMAP1(vqrshrunh_n_s16, aarch64_neon_sqrshrun, VectorRet | Use64BitVectors),
4274   NEONMAP1(vqrshruns_n_s32, aarch64_neon_sqrshrun, VectorRet | Use64BitVectors),
4275   NEONMAP1(vqshlb_n_s8, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
4276   NEONMAP1(vqshlb_n_u8, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
4277   NEONMAP1(vqshlb_s8, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
4278   NEONMAP1(vqshlb_u8, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
4279   NEONMAP1(vqshld_s64, aarch64_neon_sqshl, Add1ArgType),
4280   NEONMAP1(vqshld_u64, aarch64_neon_uqshl, Add1ArgType),
4281   NEONMAP1(vqshlh_n_s16, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
4282   NEONMAP1(vqshlh_n_u16, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
4283   NEONMAP1(vqshlh_s16, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
4284   NEONMAP1(vqshlh_u16, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
4285   NEONMAP1(vqshls_n_s32, aarch64_neon_sqshl, Add1ArgType),
4286   NEONMAP1(vqshls_n_u32, aarch64_neon_uqshl, Add1ArgType),
4287   NEONMAP1(vqshls_s32, aarch64_neon_sqshl, Add1ArgType),
4288   NEONMAP1(vqshls_u32, aarch64_neon_uqshl, Add1ArgType),
4289   NEONMAP1(vqshlub_n_s8, aarch64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors),
4290   NEONMAP1(vqshluh_n_s16, aarch64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors),
4291   NEONMAP1(vqshlus_n_s32, aarch64_neon_sqshlu, Add1ArgType),
4292   NEONMAP1(vqshrnd_n_s64, aarch64_neon_sqshrn, AddRetType),
4293   NEONMAP1(vqshrnd_n_u64, aarch64_neon_uqshrn, AddRetType),
4294   NEONMAP1(vqshrnh_n_s16, aarch64_neon_sqshrn, VectorRet | Use64BitVectors),
4295   NEONMAP1(vqshrnh_n_u16, aarch64_neon_uqshrn, VectorRet | Use64BitVectors),
4296   NEONMAP1(vqshrns_n_s32, aarch64_neon_sqshrn, VectorRet | Use64BitVectors),
4297   NEONMAP1(vqshrns_n_u32, aarch64_neon_uqshrn, VectorRet | Use64BitVectors),
4298   NEONMAP1(vqshrund_n_s64, aarch64_neon_sqshrun, AddRetType),
4299   NEONMAP1(vqshrunh_n_s16, aarch64_neon_sqshrun, VectorRet | Use64BitVectors),
4300   NEONMAP1(vqshruns_n_s32, aarch64_neon_sqshrun, VectorRet | Use64BitVectors),
4301   NEONMAP1(vqsubb_s8, aarch64_neon_sqsub, Vectorize1ArgType | Use64BitVectors),
4302   NEONMAP1(vqsubb_u8, aarch64_neon_uqsub, Vectorize1ArgType | Use64BitVectors),
4303   NEONMAP1(vqsubd_s64, aarch64_neon_sqsub, Add1ArgType),
4304   NEONMAP1(vqsubd_u64, aarch64_neon_uqsub, Add1ArgType),
4305   NEONMAP1(vqsubh_s16, aarch64_neon_sqsub, Vectorize1ArgType | Use64BitVectors),
4306   NEONMAP1(vqsubh_u16, aarch64_neon_uqsub, Vectorize1ArgType | Use64BitVectors),
4307   NEONMAP1(vqsubs_s32, aarch64_neon_sqsub, Add1ArgType),
4308   NEONMAP1(vqsubs_u32, aarch64_neon_uqsub, Add1ArgType),
4309   NEONMAP1(vrecped_f64, aarch64_neon_frecpe, Add1ArgType),
4310   NEONMAP1(vrecpes_f32, aarch64_neon_frecpe, Add1ArgType),
4311   NEONMAP1(vrecpxd_f64, aarch64_neon_frecpx, Add1ArgType),
4312   NEONMAP1(vrecpxs_f32, aarch64_neon_frecpx, Add1ArgType),
4313   NEONMAP1(vrshld_s64, aarch64_neon_srshl, Add1ArgType),
4314   NEONMAP1(vrshld_u64, aarch64_neon_urshl, Add1ArgType),
4315   NEONMAP1(vrsqrted_f64, aarch64_neon_frsqrte, Add1ArgType),
4316   NEONMAP1(vrsqrtes_f32, aarch64_neon_frsqrte, Add1ArgType),
4317   NEONMAP1(vrsqrtsd_f64, aarch64_neon_frsqrts, Add1ArgType),
4318   NEONMAP1(vrsqrtss_f32, aarch64_neon_frsqrts, Add1ArgType),
4319   NEONMAP1(vsha1cq_u32, aarch64_crypto_sha1c, 0),
4320   NEONMAP1(vsha1h_u32, aarch64_crypto_sha1h, 0),
4321   NEONMAP1(vsha1mq_u32, aarch64_crypto_sha1m, 0),
4322   NEONMAP1(vsha1pq_u32, aarch64_crypto_sha1p, 0),
4323   NEONMAP1(vshld_s64, aarch64_neon_sshl, Add1ArgType),
4324   NEONMAP1(vshld_u64, aarch64_neon_ushl, Add1ArgType),
4325   NEONMAP1(vslid_n_s64, aarch64_neon_vsli, Vectorize1ArgType),
4326   NEONMAP1(vslid_n_u64, aarch64_neon_vsli, Vectorize1ArgType),
4327   NEONMAP1(vsqaddb_u8, aarch64_neon_usqadd, Vectorize1ArgType | Use64BitVectors),
4328   NEONMAP1(vsqaddd_u64, aarch64_neon_usqadd, Add1ArgType),
4329   NEONMAP1(vsqaddh_u16, aarch64_neon_usqadd, Vectorize1ArgType | Use64BitVectors),
4330   NEONMAP1(vsqadds_u32, aarch64_neon_usqadd, Add1ArgType),
4331   NEONMAP1(vsrid_n_s64, aarch64_neon_vsri, Vectorize1ArgType),
4332   NEONMAP1(vsrid_n_u64, aarch64_neon_vsri, Vectorize1ArgType),
4333   NEONMAP1(vuqaddb_s8, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
4334   NEONMAP1(vuqaddd_s64, aarch64_neon_suqadd, Add1ArgType),
4335   NEONMAP1(vuqaddh_s16, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
4336   NEONMAP1(vuqadds_s32, aarch64_neon_suqadd, Add1ArgType),
4337   // FP16 scalar intrinsics go here.
4338   NEONMAP1(vabdh_f16, aarch64_sisd_fabd, Add1ArgType),
4339   NEONMAP1(vcvtah_s32_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
4340   NEONMAP1(vcvtah_s64_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
4341   NEONMAP1(vcvtah_u32_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
4342   NEONMAP1(vcvtah_u64_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
4343   NEONMAP1(vcvth_n_f16_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
4344   NEONMAP1(vcvth_n_f16_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
4345   NEONMAP1(vcvth_n_f16_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
4346   NEONMAP1(vcvth_n_f16_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
4347   NEONMAP1(vcvth_n_s32_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
4348   NEONMAP1(vcvth_n_s64_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
4349   NEONMAP1(vcvth_n_u32_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
4350   NEONMAP1(vcvth_n_u64_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
4351   NEONMAP1(vcvtmh_s32_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
4352   NEONMAP1(vcvtmh_s64_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
4353   NEONMAP1(vcvtmh_u32_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
4354   NEONMAP1(vcvtmh_u64_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
4355   NEONMAP1(vcvtnh_s32_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
4356   NEONMAP1(vcvtnh_s64_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
4357   NEONMAP1(vcvtnh_u32_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
4358   NEONMAP1(vcvtnh_u64_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
4359   NEONMAP1(vcvtph_s32_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
4360   NEONMAP1(vcvtph_s64_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
4361   NEONMAP1(vcvtph_u32_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
4362   NEONMAP1(vcvtph_u64_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
4363   NEONMAP1(vmulxh_f16, aarch64_neon_fmulx, Add1ArgType),
4364   NEONMAP1(vrecpeh_f16, aarch64_neon_frecpe, Add1ArgType),
4365   NEONMAP1(vrecpxh_f16, aarch64_neon_frecpx, Add1ArgType),
4366   NEONMAP1(vrsqrteh_f16, aarch64_neon_frsqrte, Add1ArgType),
4367   NEONMAP1(vrsqrtsh_f16, aarch64_neon_frsqrts, Add1ArgType),
4368 };
4369 
4370 #undef NEONMAP0
4371 #undef NEONMAP1
4372 #undef NEONMAP2
4373 
// One flag per intrinsic table, all starting out false.
// NOTE(review): presumably each is passed as the MapProvenSorted argument of
// findNeonIntrinsicInMap, which sets it after its debug-only std::is_sorted
// check so a table is verified at most once -- the call sites are outside
// this chunk, confirm there.
4374 static bool NEONSIMDIntrinsicsProvenSorted = false;
4375 
4376 static bool AArch64SIMDIntrinsicsProvenSorted = false;
4377 static bool AArch64SISDIntrinsicsProvenSorted = false;
4378 
4379 
4380 static const NeonIntrinsicInfo *
4381 findNeonIntrinsicInMap(ArrayRef<NeonIntrinsicInfo> IntrinsicMap,
4382                        unsigned BuiltinID, bool &MapProvenSorted) {
4383 
4384 #ifndef NDEBUG
4385   if (!MapProvenSorted) {
4386     assert(std::is_sorted(std::begin(IntrinsicMap), std::end(IntrinsicMap)));
4387     MapProvenSorted = true;
4388   }
4389 #endif
4390 
4391   const NeonIntrinsicInfo *Builtin =
4392       std::lower_bound(IntrinsicMap.begin(), IntrinsicMap.end(), BuiltinID);
4393 
4394   if (Builtin != IntrinsicMap.end() && Builtin->BuiltinID == BuiltinID)
4395     return Builtin;
4396 
4397   return nullptr;
4398 }
4399 
4400 Function *CodeGenFunction::LookupNeonLLVMIntrinsic(unsigned IntrinsicID,
4401                                                    unsigned Modifier,
4402                                                    llvm::Type *ArgType,
4403                                                    const CallExpr *E) {
4404   int VectorSize = 0;
4405   if (Modifier & Use64BitVectors)
4406     VectorSize = 64;
4407   else if (Modifier & Use128BitVectors)
4408     VectorSize = 128;
4409 
4410   // Return type.
4411   SmallVector<llvm::Type *, 3> Tys;
4412   if (Modifier & AddRetType) {
4413     llvm::Type *Ty = ConvertType(E->getCallReturnType(getContext()));
4414     if (Modifier & VectorizeRetType)
4415       Ty = llvm::VectorType::get(
4416           Ty, VectorSize ? VectorSize / Ty->getPrimitiveSizeInBits() : 1);
4417 
4418     Tys.push_back(Ty);
4419   }
4420 
4421   // Arguments.
4422   if (Modifier & VectorizeArgTypes) {
4423     int Elts = VectorSize ? VectorSize / ArgType->getPrimitiveSizeInBits() : 1;
4424     ArgType = llvm::VectorType::get(ArgType, Elts);
4425   }
4426 
4427   if (Modifier & (Add1ArgType | Add2ArgTypes))
4428     Tys.push_back(ArgType);
4429 
4430   if (Modifier & Add2ArgTypes)
4431     Tys.push_back(ArgType);
4432 
4433   if (Modifier & InventFloatType)
4434     Tys.push_back(FloatTy);
4435 
4436   return CGM.getIntrinsic(IntrinsicID, Tys);
4437 }
4438 
4439 static Value *EmitCommonNeonSISDBuiltinExpr(CodeGenFunction &CGF,
4440                                             const NeonIntrinsicInfo &SISDInfo,
4441                                             SmallVectorImpl<Value *> &Ops,
4442                                             const CallExpr *E) {
4443   unsigned BuiltinID = SISDInfo.BuiltinID;
4444   unsigned int Int = SISDInfo.LLVMIntrinsic;
4445   unsigned Modifier = SISDInfo.TypeModifier;
4446   const char *s = SISDInfo.NameHint;
4447 
4448   switch (BuiltinID) {
4449   case NEON::BI__builtin_neon_vcled_s64:
4450   case NEON::BI__builtin_neon_vcled_u64:
4451   case NEON::BI__builtin_neon_vcles_f32:
4452   case NEON::BI__builtin_neon_vcled_f64:
4453   case NEON::BI__builtin_neon_vcltd_s64:
4454   case NEON::BI__builtin_neon_vcltd_u64:
4455   case NEON::BI__builtin_neon_vclts_f32:
4456   case NEON::BI__builtin_neon_vcltd_f64:
4457   case NEON::BI__builtin_neon_vcales_f32:
4458   case NEON::BI__builtin_neon_vcaled_f64:
4459   case NEON::BI__builtin_neon_vcalts_f32:
4460   case NEON::BI__builtin_neon_vcaltd_f64:
4461     // Only one direction of comparisons actually exist, cmle is actually a cmge
4462     // with swapped operands. The table gives us the right intrinsic but we
4463     // still need to do the swap.
4464     std::swap(Ops[0], Ops[1]);
4465     break;
4466   }
4467 
4468   assert(Int && "Generic code assumes a valid intrinsic");
4469 
4470   // Determine the type(s) of this overloaded AArch64 intrinsic.
4471   const Expr *Arg = E->getArg(0);
4472   llvm::Type *ArgTy = CGF.ConvertType(Arg->getType());
4473   Function *F = CGF.LookupNeonLLVMIntrinsic(Int, Modifier, ArgTy, E);
4474 
4475   int j = 0;
4476   ConstantInt *C0 = ConstantInt::get(CGF.SizeTy, 0);
4477   for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
4478        ai != ae; ++ai, ++j) {
4479     llvm::Type *ArgTy = ai->getType();
4480     if (Ops[j]->getType()->getPrimitiveSizeInBits() ==
4481              ArgTy->getPrimitiveSizeInBits())
4482       continue;
4483 
4484     assert(ArgTy->isVectorTy() && !Ops[j]->getType()->isVectorTy());
4485     // The constant argument to an _n_ intrinsic always has Int32Ty, so truncate
4486     // it before inserting.
4487     Ops[j] =
4488         CGF.Builder.CreateTruncOrBitCast(Ops[j], ArgTy->getVectorElementType());
4489     Ops[j] =
4490         CGF.Builder.CreateInsertElement(UndefValue::get(ArgTy), Ops[j], C0);
4491   }
4492 
4493   Value *Result = CGF.EmitNeonCall(F, Ops, s);
4494   llvm::Type *ResultType = CGF.ConvertType(E->getType());
4495   if (ResultType->getPrimitiveSizeInBits() <
4496       Result->getType()->getPrimitiveSizeInBits())
4497     return CGF.Builder.CreateExtractElement(Result, C0);
4498 
4499   return CGF.Builder.CreateBitCast(Result, ResultType, s);
4500 }
4501 
4502 Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
4503     unsigned BuiltinID, unsigned LLVMIntrinsic, unsigned AltLLVMIntrinsic,
4504     const char *NameHint, unsigned Modifier, const CallExpr *E,
4505     SmallVectorImpl<llvm::Value *> &Ops, Address PtrOp0, Address PtrOp1,
4506     llvm::Triple::ArchType Arch) {
4507   // Get the last argument, which specifies the vector type.
4508   llvm::APSInt NeonTypeConst;
4509   const Expr *Arg = E->getArg(E->getNumArgs() - 1);
4510   if (!Arg->isIntegerConstantExpr(NeonTypeConst, getContext()))
4511     return nullptr;
4512 
4513   // Determine the type of this overloaded NEON intrinsic.
4514   NeonTypeFlags Type(NeonTypeConst.getZExtValue());
4515   bool Usgn = Type.isUnsigned();
4516   bool Quad = Type.isQuad();
4517   const bool HasLegalHalfType = getTarget().hasLegalHalfType();
4518 
4519   llvm::VectorType *VTy = GetNeonType(this, Type, HasLegalHalfType);
4520   llvm::Type *Ty = VTy;
4521   if (!Ty)
4522     return nullptr;
4523 
4524   auto getAlignmentValue32 = [&](Address addr) -> Value* {
4525     return Builder.getInt32(addr.getAlignment().getQuantity());
4526   };
4527 
4528   unsigned Int = LLVMIntrinsic;
4529   if ((Modifier & UnsignedAlts) && !Usgn)
4530     Int = AltLLVMIntrinsic;
4531 
4532   switch (BuiltinID) {
4533   default: break;
4534   case NEON::BI__builtin_neon_vabs_v:
4535   case NEON::BI__builtin_neon_vabsq_v:
4536     if (VTy->getElementType()->isFloatingPointTy())
4537       return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, Ty), Ops, "vabs");
4538     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), Ops, "vabs");
4539   case NEON::BI__builtin_neon_vaddhn_v: {
4540     llvm::VectorType *SrcTy =
4541         llvm::VectorType::getExtendedElementVectorType(VTy);
4542 
4543     // %sum = add <4 x i32> %lhs, %rhs
4544     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
4545     Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy);
4546     Ops[0] = Builder.CreateAdd(Ops[0], Ops[1], "vaddhn");
4547 
4548     // %high = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
4549     Constant *ShiftAmt =
4550         ConstantInt::get(SrcTy, SrcTy->getScalarSizeInBits() / 2);
4551     Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vaddhn");
4552 
4553     // %res = trunc <4 x i32> %high to <4 x i16>
4554     return Builder.CreateTrunc(Ops[0], VTy, "vaddhn");
4555   }
4556   case NEON::BI__builtin_neon_vcale_v:
4557   case NEON::BI__builtin_neon_vcaleq_v:
4558   case NEON::BI__builtin_neon_vcalt_v:
4559   case NEON::BI__builtin_neon_vcaltq_v:
4560     std::swap(Ops[0], Ops[1]);
4561     LLVM_FALLTHROUGH;
4562   case NEON::BI__builtin_neon_vcage_v:
4563   case NEON::BI__builtin_neon_vcageq_v:
4564   case NEON::BI__builtin_neon_vcagt_v:
4565   case NEON::BI__builtin_neon_vcagtq_v: {
4566     llvm::Type *Ty;
4567     switch (VTy->getScalarSizeInBits()) {
4568     default: llvm_unreachable("unexpected type");
4569     case 32:
4570       Ty = FloatTy;
4571       break;
4572     case 64:
4573       Ty = DoubleTy;
4574       break;
4575     case 16:
4576       Ty = HalfTy;
4577       break;
4578     }
4579     llvm::Type *VecFlt = llvm::VectorType::get(Ty, VTy->getNumElements());
4580     llvm::Type *Tys[] = { VTy, VecFlt };
4581     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
4582     return EmitNeonCall(F, Ops, NameHint);
4583   }
4584   case NEON::BI__builtin_neon_vceqz_v:
4585   case NEON::BI__builtin_neon_vceqzq_v:
4586     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OEQ,
4587                                          ICmpInst::ICMP_EQ, "vceqz");
4588   case NEON::BI__builtin_neon_vcgez_v:
4589   case NEON::BI__builtin_neon_vcgezq_v:
4590     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGE,
4591                                          ICmpInst::ICMP_SGE, "vcgez");
4592   case NEON::BI__builtin_neon_vclez_v:
4593   case NEON::BI__builtin_neon_vclezq_v:
4594     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLE,
4595                                          ICmpInst::ICMP_SLE, "vclez");
4596   case NEON::BI__builtin_neon_vcgtz_v:
4597   case NEON::BI__builtin_neon_vcgtzq_v:
4598     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGT,
4599                                          ICmpInst::ICMP_SGT, "vcgtz");
4600   case NEON::BI__builtin_neon_vcltz_v:
4601   case NEON::BI__builtin_neon_vcltzq_v:
4602     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLT,
4603                                          ICmpInst::ICMP_SLT, "vcltz");
4604   case NEON::BI__builtin_neon_vclz_v:
4605   case NEON::BI__builtin_neon_vclzq_v:
4606     // We generate target-independent intrinsic, which needs a second argument
4607     // for whether or not clz of zero is undefined; on ARM it isn't.
4608     Ops.push_back(Builder.getInt1(getTarget().isCLZForZeroUndef()));
4609     break;
4610   case NEON::BI__builtin_neon_vcvt_f32_v:
4611   case NEON::BI__builtin_neon_vcvtq_f32_v:
4612     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
4613     Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, Quad),
4614                      HasLegalHalfType);
4615     return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
4616                 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
4617   case NEON::BI__builtin_neon_vcvt_f16_v:
4618   case NEON::BI__builtin_neon_vcvtq_f16_v:
4619     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
4620     Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float16, false, Quad),
4621                      HasLegalHalfType);
4622     return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
4623                 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
4624   case NEON::BI__builtin_neon_vcvt_n_f16_v:
4625   case NEON::BI__builtin_neon_vcvt_n_f32_v:
4626   case NEON::BI__builtin_neon_vcvt_n_f64_v:
4627   case NEON::BI__builtin_neon_vcvtq_n_f16_v:
4628   case NEON::BI__builtin_neon_vcvtq_n_f32_v:
4629   case NEON::BI__builtin_neon_vcvtq_n_f64_v: {
4630     llvm::Type *Tys[2] = { GetFloatNeonType(this, Type), Ty };
4631     Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
4632     Function *F = CGM.getIntrinsic(Int, Tys);
4633     return EmitNeonCall(F, Ops, "vcvt_n");
4634   }
4635   case NEON::BI__builtin_neon_vcvt_n_s16_v:
4636   case NEON::BI__builtin_neon_vcvt_n_s32_v:
4637   case NEON::BI__builtin_neon_vcvt_n_u16_v:
4638   case NEON::BI__builtin_neon_vcvt_n_u32_v:
4639   case NEON::BI__builtin_neon_vcvt_n_s64_v:
4640   case NEON::BI__builtin_neon_vcvt_n_u64_v:
4641   case NEON::BI__builtin_neon_vcvtq_n_s16_v:
4642   case NEON::BI__builtin_neon_vcvtq_n_s32_v:
4643   case NEON::BI__builtin_neon_vcvtq_n_u16_v:
4644   case NEON::BI__builtin_neon_vcvtq_n_u32_v:
4645   case NEON::BI__builtin_neon_vcvtq_n_s64_v:
4646   case NEON::BI__builtin_neon_vcvtq_n_u64_v: {
4647     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
4648     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
4649     return EmitNeonCall(F, Ops, "vcvt_n");
4650   }
4651   case NEON::BI__builtin_neon_vcvt_s32_v:
4652   case NEON::BI__builtin_neon_vcvt_u32_v:
4653   case NEON::BI__builtin_neon_vcvt_s64_v:
4654   case NEON::BI__builtin_neon_vcvt_u64_v:
4655   case NEON::BI__builtin_neon_vcvt_s16_v:
4656   case NEON::BI__builtin_neon_vcvt_u16_v:
4657   case NEON::BI__builtin_neon_vcvtq_s32_v:
4658   case NEON::BI__builtin_neon_vcvtq_u32_v:
4659   case NEON::BI__builtin_neon_vcvtq_s64_v:
4660   case NEON::BI__builtin_neon_vcvtq_u64_v:
4661   case NEON::BI__builtin_neon_vcvtq_s16_v:
4662   case NEON::BI__builtin_neon_vcvtq_u16_v: {
4663     Ops[0] = Builder.CreateBitCast(Ops[0], GetFloatNeonType(this, Type));
4664     return Usgn ? Builder.CreateFPToUI(Ops[0], Ty, "vcvt")
4665                 : Builder.CreateFPToSI(Ops[0], Ty, "vcvt");
4666   }
4667   case NEON::BI__builtin_neon_vcvta_s16_v:
4668   case NEON::BI__builtin_neon_vcvta_s32_v:
4669   case NEON::BI__builtin_neon_vcvta_s64_v:
4670   case NEON::BI__builtin_neon_vcvta_u32_v:
4671   case NEON::BI__builtin_neon_vcvta_u64_v:
4672   case NEON::BI__builtin_neon_vcvtaq_s16_v:
4673   case NEON::BI__builtin_neon_vcvtaq_s32_v:
4674   case NEON::BI__builtin_neon_vcvtaq_s64_v:
4675   case NEON::BI__builtin_neon_vcvtaq_u16_v:
4676   case NEON::BI__builtin_neon_vcvtaq_u32_v:
4677   case NEON::BI__builtin_neon_vcvtaq_u64_v:
4678   case NEON::BI__builtin_neon_vcvtn_s16_v:
4679   case NEON::BI__builtin_neon_vcvtn_s32_v:
4680   case NEON::BI__builtin_neon_vcvtn_s64_v:
4681   case NEON::BI__builtin_neon_vcvtn_u16_v:
4682   case NEON::BI__builtin_neon_vcvtn_u32_v:
4683   case NEON::BI__builtin_neon_vcvtn_u64_v:
4684   case NEON::BI__builtin_neon_vcvtnq_s16_v:
4685   case NEON::BI__builtin_neon_vcvtnq_s32_v:
4686   case NEON::BI__builtin_neon_vcvtnq_s64_v:
4687   case NEON::BI__builtin_neon_vcvtnq_u16_v:
4688   case NEON::BI__builtin_neon_vcvtnq_u32_v:
4689   case NEON::BI__builtin_neon_vcvtnq_u64_v:
4690   case NEON::BI__builtin_neon_vcvtp_s16_v:
4691   case NEON::BI__builtin_neon_vcvtp_s32_v:
4692   case NEON::BI__builtin_neon_vcvtp_s64_v:
4693   case NEON::BI__builtin_neon_vcvtp_u16_v:
4694   case NEON::BI__builtin_neon_vcvtp_u32_v:
4695   case NEON::BI__builtin_neon_vcvtp_u64_v:
4696   case NEON::BI__builtin_neon_vcvtpq_s16_v:
4697   case NEON::BI__builtin_neon_vcvtpq_s32_v:
4698   case NEON::BI__builtin_neon_vcvtpq_s64_v:
4699   case NEON::BI__builtin_neon_vcvtpq_u16_v:
4700   case NEON::BI__builtin_neon_vcvtpq_u32_v:
4701   case NEON::BI__builtin_neon_vcvtpq_u64_v:
4702   case NEON::BI__builtin_neon_vcvtm_s16_v:
4703   case NEON::BI__builtin_neon_vcvtm_s32_v:
4704   case NEON::BI__builtin_neon_vcvtm_s64_v:
4705   case NEON::BI__builtin_neon_vcvtm_u16_v:
4706   case NEON::BI__builtin_neon_vcvtm_u32_v:
4707   case NEON::BI__builtin_neon_vcvtm_u64_v:
4708   case NEON::BI__builtin_neon_vcvtmq_s16_v:
4709   case NEON::BI__builtin_neon_vcvtmq_s32_v:
4710   case NEON::BI__builtin_neon_vcvtmq_s64_v:
4711   case NEON::BI__builtin_neon_vcvtmq_u16_v:
4712   case NEON::BI__builtin_neon_vcvtmq_u32_v:
4713   case NEON::BI__builtin_neon_vcvtmq_u64_v: {
4714     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
4715     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, NameHint);
4716   }
4717   case NEON::BI__builtin_neon_vext_v:
4718   case NEON::BI__builtin_neon_vextq_v: {
4719     int CV = cast<ConstantInt>(Ops[2])->getSExtValue();
4720     SmallVector<uint32_t, 16> Indices;
4721     for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
4722       Indices.push_back(i+CV);
4723 
4724     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
4725     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
4726     return Builder.CreateShuffleVector(Ops[0], Ops[1], Indices, "vext");
4727   }
4728   case NEON::BI__builtin_neon_vfma_v:
4729   case NEON::BI__builtin_neon_vfmaq_v: {
4730     Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
4731     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
4732     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
4733     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
4734 
4735     // NEON intrinsic puts accumulator first, unlike the LLVM fma.
4736     return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0]});
4737   }
4738   case NEON::BI__builtin_neon_vld1_v:
4739   case NEON::BI__builtin_neon_vld1q_v: {
4740     llvm::Type *Tys[] = {Ty, Int8PtrTy};
4741     Ops.push_back(getAlignmentValue32(PtrOp0));
4742     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "vld1");
4743   }
4744   case NEON::BI__builtin_neon_vld2_v:
4745   case NEON::BI__builtin_neon_vld2q_v:
4746   case NEON::BI__builtin_neon_vld3_v:
4747   case NEON::BI__builtin_neon_vld3q_v:
4748   case NEON::BI__builtin_neon_vld4_v:
4749   case NEON::BI__builtin_neon_vld4q_v: {
4750     llvm::Type *Tys[] = {Ty, Int8PtrTy};
4751     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
4752     Value *Align = getAlignmentValue32(PtrOp1);
4753     Ops[1] = Builder.CreateCall(F, {Ops[1], Align}, NameHint);
4754     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
4755     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
4756     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
4757   }
4758   case NEON::BI__builtin_neon_vld1_dup_v:
4759   case NEON::BI__builtin_neon_vld1q_dup_v: {
4760     Value *V = UndefValue::get(Ty);
4761     Ty = llvm::PointerType::getUnqual(VTy->getElementType());
4762     PtrOp0 = Builder.CreateBitCast(PtrOp0, Ty);
4763     LoadInst *Ld = Builder.CreateLoad(PtrOp0);
4764     llvm::Constant *CI = ConstantInt::get(SizeTy, 0);
4765     Ops[0] = Builder.CreateInsertElement(V, Ld, CI);
4766     return EmitNeonSplat(Ops[0], CI);
4767   }
4768   case NEON::BI__builtin_neon_vld2_lane_v:
4769   case NEON::BI__builtin_neon_vld2q_lane_v:
4770   case NEON::BI__builtin_neon_vld3_lane_v:
4771   case NEON::BI__builtin_neon_vld3q_lane_v:
4772   case NEON::BI__builtin_neon_vld4_lane_v:
4773   case NEON::BI__builtin_neon_vld4q_lane_v: {
4774     llvm::Type *Tys[] = {Ty, Int8PtrTy};
4775     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
4776     for (unsigned I = 2; I < Ops.size() - 1; ++I)
4777       Ops[I] = Builder.CreateBitCast(Ops[I], Ty);
4778     Ops.push_back(getAlignmentValue32(PtrOp1));
4779     Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), NameHint);
4780     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
4781     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
4782     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
4783   }
4784   case NEON::BI__builtin_neon_vmovl_v: {
4785     llvm::Type *DTy =llvm::VectorType::getTruncatedElementVectorType(VTy);
4786     Ops[0] = Builder.CreateBitCast(Ops[0], DTy);
4787     if (Usgn)
4788       return Builder.CreateZExt(Ops[0], Ty, "vmovl");
4789     return Builder.CreateSExt(Ops[0], Ty, "vmovl");
4790   }
4791   case NEON::BI__builtin_neon_vmovn_v: {
4792     llvm::Type *QTy = llvm::VectorType::getExtendedElementVectorType(VTy);
4793     Ops[0] = Builder.CreateBitCast(Ops[0], QTy);
4794     return Builder.CreateTrunc(Ops[0], Ty, "vmovn");
4795   }
4796   case NEON::BI__builtin_neon_vmull_v:
4797     // FIXME: the integer vmull operations could be emitted in terms of pure
4798     // LLVM IR (2 exts followed by a mul). Unfortunately LLVM has a habit of
4799     // hoisting the exts outside loops. Until global ISel comes along that can
4800     // see through such movement this leads to bad CodeGen. So we need an
4801     // intrinsic for now.
4802     Int = Usgn ? Intrinsic::arm_neon_vmullu : Intrinsic::arm_neon_vmulls;
4803     Int = Type.isPoly() ? (unsigned)Intrinsic::arm_neon_vmullp : Int;
4804     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull");
4805   case NEON::BI__builtin_neon_vpadal_v:
4806   case NEON::BI__builtin_neon_vpadalq_v: {
4807     // The source operand type has twice as many elements of half the size.
4808     unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
4809     llvm::Type *EltTy =
4810       llvm::IntegerType::get(getLLVMContext(), EltBits / 2);
4811     llvm::Type *NarrowTy =
4812       llvm::VectorType::get(EltTy, VTy->getNumElements() * 2);
4813     llvm::Type *Tys[2] = { Ty, NarrowTy };
4814     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
4815   }
4816   case NEON::BI__builtin_neon_vpaddl_v:
4817   case NEON::BI__builtin_neon_vpaddlq_v: {
4818     // The source operand type has twice as many elements of half the size.
4819     unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
4820     llvm::Type *EltTy = llvm::IntegerType::get(getLLVMContext(), EltBits / 2);
4821     llvm::Type *NarrowTy =
4822       llvm::VectorType::get(EltTy, VTy->getNumElements() * 2);
4823     llvm::Type *Tys[2] = { Ty, NarrowTy };
4824     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vpaddl");
4825   }
4826   case NEON::BI__builtin_neon_vqdmlal_v:
4827   case NEON::BI__builtin_neon_vqdmlsl_v: {
4828     SmallVector<Value *, 2> MulOps(Ops.begin() + 1, Ops.end());
4829     Ops[1] =
4830         EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), MulOps, "vqdmlal");
4831     Ops.resize(2);
4832     return EmitNeonCall(CGM.getIntrinsic(AltLLVMIntrinsic, Ty), Ops, NameHint);
4833   }
4834   case NEON::BI__builtin_neon_vqshl_n_v:
4835   case NEON::BI__builtin_neon_vqshlq_n_v:
4836     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshl_n",
4837                         1, false);
4838   case NEON::BI__builtin_neon_vqshlu_n_v:
4839   case NEON::BI__builtin_neon_vqshluq_n_v:
4840     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshlu_n",
4841                         1, false);
4842   case NEON::BI__builtin_neon_vrecpe_v:
4843   case NEON::BI__builtin_neon_vrecpeq_v:
4844   case NEON::BI__builtin_neon_vrsqrte_v:
4845   case NEON::BI__builtin_neon_vrsqrteq_v:
4846     Int = Ty->isFPOrFPVectorTy() ? LLVMIntrinsic : AltLLVMIntrinsic;
4847     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint);
4848 
4849   case NEON::BI__builtin_neon_vrshr_n_v:
4850   case NEON::BI__builtin_neon_vrshrq_n_v:
4851     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshr_n",
4852                         1, true);
4853   case NEON::BI__builtin_neon_vshl_n_v:
4854   case NEON::BI__builtin_neon_vshlq_n_v:
4855     Ops[1] = EmitNeonShiftVector(Ops[1], Ty, false);
4856     return Builder.CreateShl(Builder.CreateBitCast(Ops[0],Ty), Ops[1],
4857                              "vshl_n");
4858   case NEON::BI__builtin_neon_vshll_n_v: {
4859     llvm::Type *SrcTy = llvm::VectorType::getTruncatedElementVectorType(VTy);
4860     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
4861     if (Usgn)
4862       Ops[0] = Builder.CreateZExt(Ops[0], VTy);
4863     else
4864       Ops[0] = Builder.CreateSExt(Ops[0], VTy);
4865     Ops[1] = EmitNeonShiftVector(Ops[1], VTy, false);
4866     return Builder.CreateShl(Ops[0], Ops[1], "vshll_n");
4867   }
4868   case NEON::BI__builtin_neon_vshrn_n_v: {
4869     llvm::Type *SrcTy = llvm::VectorType::getExtendedElementVectorType(VTy);
4870     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
4871     Ops[1] = EmitNeonShiftVector(Ops[1], SrcTy, false);
4872     if (Usgn)
4873       Ops[0] = Builder.CreateLShr(Ops[0], Ops[1]);
4874     else
4875       Ops[0] = Builder.CreateAShr(Ops[0], Ops[1]);
4876     return Builder.CreateTrunc(Ops[0], Ty, "vshrn_n");
4877   }
4878   case NEON::BI__builtin_neon_vshr_n_v:
4879   case NEON::BI__builtin_neon_vshrq_n_v:
4880     return EmitNeonRShiftImm(Ops[0], Ops[1], Ty, Usgn, "vshr_n");
4881   case NEON::BI__builtin_neon_vst1_v:
4882   case NEON::BI__builtin_neon_vst1q_v:
4883   case NEON::BI__builtin_neon_vst2_v:
4884   case NEON::BI__builtin_neon_vst2q_v:
4885   case NEON::BI__builtin_neon_vst3_v:
4886   case NEON::BI__builtin_neon_vst3q_v:
4887   case NEON::BI__builtin_neon_vst4_v:
4888   case NEON::BI__builtin_neon_vst4q_v:
4889   case NEON::BI__builtin_neon_vst2_lane_v:
4890   case NEON::BI__builtin_neon_vst2q_lane_v:
4891   case NEON::BI__builtin_neon_vst3_lane_v:
4892   case NEON::BI__builtin_neon_vst3q_lane_v:
4893   case NEON::BI__builtin_neon_vst4_lane_v:
4894   case NEON::BI__builtin_neon_vst4q_lane_v: {
4895     llvm::Type *Tys[] = {Int8PtrTy, Ty};
4896     Ops.push_back(getAlignmentValue32(PtrOp0));
4897     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "");
4898   }
4899   case NEON::BI__builtin_neon_vsubhn_v: {
4900     llvm::VectorType *SrcTy =
4901         llvm::VectorType::getExtendedElementVectorType(VTy);
4902 
4903     // %sum = add <4 x i32> %lhs, %rhs
4904     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
4905     Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy);
4906     Ops[0] = Builder.CreateSub(Ops[0], Ops[1], "vsubhn");
4907 
4908     // %high = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
4909     Constant *ShiftAmt =
4910         ConstantInt::get(SrcTy, SrcTy->getScalarSizeInBits() / 2);
4911     Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vsubhn");
4912 
4913     // %res = trunc <4 x i32> %high to <4 x i16>
4914     return Builder.CreateTrunc(Ops[0], VTy, "vsubhn");
4915   }
4916   case NEON::BI__builtin_neon_vtrn_v:
4917   case NEON::BI__builtin_neon_vtrnq_v: {
4918     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
4919     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
4920     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
4921     Value *SV = nullptr;
4922 
4923     for (unsigned vi = 0; vi != 2; ++vi) {
4924       SmallVector<uint32_t, 16> Indices;
4925       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
4926         Indices.push_back(i+vi);
4927         Indices.push_back(i+e+vi);
4928       }
4929       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
4930       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vtrn");
4931       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
4932     }
4933     return SV;
4934   }
4935   case NEON::BI__builtin_neon_vtst_v:
4936   case NEON::BI__builtin_neon_vtstq_v: {
4937     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
4938     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
4939     Ops[0] = Builder.CreateAnd(Ops[0], Ops[1]);
4940     Ops[0] = Builder.CreateICmp(ICmpInst::ICMP_NE, Ops[0],
4941                                 ConstantAggregateZero::get(Ty));
4942     return Builder.CreateSExt(Ops[0], Ty, "vtst");
4943   }
4944   case NEON::BI__builtin_neon_vuzp_v:
4945   case NEON::BI__builtin_neon_vuzpq_v: {
4946     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
4947     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
4948     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
4949     Value *SV = nullptr;
4950 
4951     for (unsigned vi = 0; vi != 2; ++vi) {
4952       SmallVector<uint32_t, 16> Indices;
4953       for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
4954         Indices.push_back(2*i+vi);
4955 
4956       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
4957       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vuzp");
4958       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
4959     }
4960     return SV;
4961   }
4962   case NEON::BI__builtin_neon_vzip_v:
4963   case NEON::BI__builtin_neon_vzipq_v: {
4964     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
4965     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
4966     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
4967     Value *SV = nullptr;
4968 
4969     for (unsigned vi = 0; vi != 2; ++vi) {
4970       SmallVector<uint32_t, 16> Indices;
4971       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
4972         Indices.push_back((i + vi*e) >> 1);
4973         Indices.push_back(((i + vi*e) >> 1)+e);
4974       }
4975       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
4976       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vzip");
4977       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
4978     }
4979     return SV;
4980   }
4981   case NEON::BI__builtin_neon_vdot_v:
4982   case NEON::BI__builtin_neon_vdotq_v: {
4983     llvm::Type *InputTy =
4984         llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
4985     llvm::Type *Tys[2] = { Ty, InputTy };
4986     Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
4987     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vdot");
4988   }
4989   }
4990 
4991   assert(Int && "Expected valid intrinsic number");
4992 
4993   // Determine the type(s) of this overloaded AArch64 intrinsic.
4994   Function *F = LookupNeonLLVMIntrinsic(Int, Modifier, Ty, E);
4995 
4996   Value *Result = EmitNeonCall(F, Ops, NameHint);
4997   llvm::Type *ResultType = ConvertType(E->getType());
4998   // AArch64 intrinsic one-element vector type cast to
4999   // scalar type expected by the builtin
5000   return Builder.CreateBitCast(Result, ResultType, NameHint);
5001 }
5002 
5003 Value *CodeGenFunction::EmitAArch64CompareBuiltinExpr(
5004     Value *Op, llvm::Type *Ty, const CmpInst::Predicate Fp,
5005     const CmpInst::Predicate Ip, const Twine &Name) {
5006   llvm::Type *OTy = Op->getType();
5007 
5008   // FIXME: this is utterly horrific. We should not be looking at previous
5009   // codegen context to find out what needs doing. Unfortunately TableGen
5010   // currently gives us exactly the same calls for vceqz_f32 and vceqz_s32
5011   // (etc).
5012   if (BitCastInst *BI = dyn_cast<BitCastInst>(Op))
5013     OTy = BI->getOperand(0)->getType();
5014 
5015   Op = Builder.CreateBitCast(Op, OTy);
5016   if (OTy->getScalarType()->isFloatingPointTy()) {
5017     Op = Builder.CreateFCmp(Fp, Op, Constant::getNullValue(OTy));
5018   } else {
5019     Op = Builder.CreateICmp(Ip, Op, Constant::getNullValue(OTy));
5020   }
5021   return Builder.CreateSExt(Op, Ty, Name);
5022 }
5023 
5024 static Value *packTBLDVectorList(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
5025                                  Value *ExtOp, Value *IndexOp,
5026                                  llvm::Type *ResTy, unsigned IntID,
5027                                  const char *Name) {
5028   SmallVector<Value *, 2> TblOps;
5029   if (ExtOp)
5030     TblOps.push_back(ExtOp);
5031 
5032   // Build a vector containing sequential number like (0, 1, 2, ..., 15)
5033   SmallVector<uint32_t, 16> Indices;
5034   llvm::VectorType *TblTy = cast<llvm::VectorType>(Ops[0]->getType());
5035   for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) {
5036     Indices.push_back(2*i);
5037     Indices.push_back(2*i+1);
5038   }
5039 
5040   int PairPos = 0, End = Ops.size() - 1;
5041   while (PairPos < End) {
5042     TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos],
5043                                                      Ops[PairPos+1], Indices,
5044                                                      Name));
5045     PairPos += 2;
5046   }
5047 
5048   // If there's an odd number of 64-bit lookup table, fill the high 64-bit
5049   // of the 128-bit lookup table with zero.
5050   if (PairPos == End) {
5051     Value *ZeroTbl = ConstantAggregateZero::get(TblTy);
5052     TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos],
5053                                                      ZeroTbl, Indices, Name));
5054   }
5055 
5056   Function *TblF;
5057   TblOps.push_back(IndexOp);
5058   TblF = CGF.CGM.getIntrinsic(IntID, ResTy);
5059 
5060   return CGF.EmitNeonCall(TblF, TblOps, Name);
5061 }
5062 
5063 Value *CodeGenFunction::GetValueForARMHint(unsigned BuiltinID) {
5064   unsigned Value;
5065   switch (BuiltinID) {
5066   default:
5067     return nullptr;
5068   case ARM::BI__builtin_arm_nop:
5069     Value = 0;
5070     break;
5071   case ARM::BI__builtin_arm_yield:
5072   case ARM::BI__yield:
5073     Value = 1;
5074     break;
5075   case ARM::BI__builtin_arm_wfe:
5076   case ARM::BI__wfe:
5077     Value = 2;
5078     break;
5079   case ARM::BI__builtin_arm_wfi:
5080   case ARM::BI__wfi:
5081     Value = 3;
5082     break;
5083   case ARM::BI__builtin_arm_sev:
5084   case ARM::BI__sev:
5085     Value = 4;
5086     break;
5087   case ARM::BI__builtin_arm_sevl:
5088   case ARM::BI__sevl:
5089     Value = 5;
5090     break;
5091   }
5092 
5093   return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_hint),
5094                             llvm::ConstantInt::get(Int32Ty, Value));
5095 }
5096 
5097 // Generates the IR for the read/write special register builtin,
5098 // ValueType is the type of the value that is to be written or read,
5099 // RegisterType is the type of the register being written to or read from.
5100 static Value *EmitSpecialRegisterBuiltin(CodeGenFunction &CGF,
5101                                          const CallExpr *E,
5102                                          llvm::Type *RegisterType,
5103                                          llvm::Type *ValueType,
5104                                          bool IsRead,
5105                                          StringRef SysReg = "") {
5106   // write and register intrinsics only support 32 and 64 bit operations.
5107   assert((RegisterType->isIntegerTy(32) || RegisterType->isIntegerTy(64))
5108           && "Unsupported size for register.");
5109 
5110   CodeGen::CGBuilderTy &Builder = CGF.Builder;
5111   CodeGen::CodeGenModule &CGM = CGF.CGM;
5112   LLVMContext &Context = CGM.getLLVMContext();
5113 
5114   if (SysReg.empty()) {
5115     const Expr *SysRegStrExpr = E->getArg(0)->IgnoreParenCasts();
5116     SysReg = cast<clang::StringLiteral>(SysRegStrExpr)->getString();
5117   }
5118 
5119   llvm::Metadata *Ops[] = { llvm::MDString::get(Context, SysReg) };
5120   llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
5121   llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
5122 
5123   llvm::Type *Types[] = { RegisterType };
5124 
5125   bool MixedTypes = RegisterType->isIntegerTy(64) && ValueType->isIntegerTy(32);
5126   assert(!(RegisterType->isIntegerTy(32) && ValueType->isIntegerTy(64))
5127             && "Can't fit 64-bit value in 32-bit register");
5128 
5129   if (IsRead) {
5130     llvm::Value *F = CGM.getIntrinsic(llvm::Intrinsic::read_register, Types);
5131     llvm::Value *Call = Builder.CreateCall(F, Metadata);
5132 
5133     if (MixedTypes)
5134       // Read into 64 bit register and then truncate result to 32 bit.
5135       return Builder.CreateTrunc(Call, ValueType);
5136 
5137     if (ValueType->isPointerTy())
5138       // Have i32/i64 result (Call) but want to return a VoidPtrTy (i8*).
5139       return Builder.CreateIntToPtr(Call, ValueType);
5140 
5141     return Call;
5142   }
5143 
5144   llvm::Value *F = CGM.getIntrinsic(llvm::Intrinsic::write_register, Types);
5145   llvm::Value *ArgValue = CGF.EmitScalarExpr(E->getArg(1));
5146   if (MixedTypes) {
5147     // Extend 32 bit write value to 64 bit to pass to write.
5148     ArgValue = Builder.CreateZExt(ArgValue, RegisterType);
5149     return Builder.CreateCall(F, { Metadata, ArgValue });
5150   }
5151 
5152   if (ValueType->isPointerTy()) {
5153     // Have VoidPtrTy ArgValue but want to return an i32/i64.
5154     ArgValue = Builder.CreatePtrToInt(ArgValue, RegisterType);
5155     return Builder.CreateCall(F, { Metadata, ArgValue });
5156   }
5157 
5158   return Builder.CreateCall(F, { Metadata, ArgValue });
5159 }
5160 
5161 /// Return true if BuiltinID is an overloaded Neon intrinsic with an extra
5162 /// argument that specifies the vector type.
5163 static bool HasExtraNeonArgument(unsigned BuiltinID) {
5164   switch (BuiltinID) {
5165   default: break;
5166   case NEON::BI__builtin_neon_vget_lane_i8:
5167   case NEON::BI__builtin_neon_vget_lane_i16:
5168   case NEON::BI__builtin_neon_vget_lane_i32:
5169   case NEON::BI__builtin_neon_vget_lane_i64:
5170   case NEON::BI__builtin_neon_vget_lane_f32:
5171   case NEON::BI__builtin_neon_vgetq_lane_i8:
5172   case NEON::BI__builtin_neon_vgetq_lane_i16:
5173   case NEON::BI__builtin_neon_vgetq_lane_i32:
5174   case NEON::BI__builtin_neon_vgetq_lane_i64:
5175   case NEON::BI__builtin_neon_vgetq_lane_f32:
5176   case NEON::BI__builtin_neon_vset_lane_i8:
5177   case NEON::BI__builtin_neon_vset_lane_i16:
5178   case NEON::BI__builtin_neon_vset_lane_i32:
5179   case NEON::BI__builtin_neon_vset_lane_i64:
5180   case NEON::BI__builtin_neon_vset_lane_f32:
5181   case NEON::BI__builtin_neon_vsetq_lane_i8:
5182   case NEON::BI__builtin_neon_vsetq_lane_i16:
5183   case NEON::BI__builtin_neon_vsetq_lane_i32:
5184   case NEON::BI__builtin_neon_vsetq_lane_i64:
5185   case NEON::BI__builtin_neon_vsetq_lane_f32:
5186   case NEON::BI__builtin_neon_vsha1h_u32:
5187   case NEON::BI__builtin_neon_vsha1cq_u32:
5188   case NEON::BI__builtin_neon_vsha1pq_u32:
5189   case NEON::BI__builtin_neon_vsha1mq_u32:
5190   case clang::ARM::BI_MoveToCoprocessor:
5191   case clang::ARM::BI_MoveToCoprocessor2:
5192     return false;
5193   }
5194   return true;
5195 }
5196 
5197 Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
5198                                            const CallExpr *E,
5199                                            llvm::Triple::ArchType Arch) {
5200   if (auto Hint = GetValueForARMHint(BuiltinID))
5201     return Hint;
5202 
5203   if (BuiltinID == ARM::BI__emit) {
5204     bool IsThumb = getTarget().getTriple().getArch() == llvm::Triple::thumb;
5205     llvm::FunctionType *FTy =
5206         llvm::FunctionType::get(VoidTy, /*Variadic=*/false);
5207 
5208     APSInt Value;
5209     if (!E->getArg(0)->EvaluateAsInt(Value, CGM.getContext()))
5210       llvm_unreachable("Sema will ensure that the parameter is constant");
5211 
5212     uint64_t ZExtValue = Value.zextOrTrunc(IsThumb ? 16 : 32).getZExtValue();
5213 
5214     llvm::InlineAsm *Emit =
5215         IsThumb ? InlineAsm::get(FTy, ".inst.n 0x" + utohexstr(ZExtValue), "",
5216                                  /*SideEffects=*/true)
5217                 : InlineAsm::get(FTy, ".inst 0x" + utohexstr(ZExtValue), "",
5218                                  /*SideEffects=*/true);
5219 
5220     return Builder.CreateCall(Emit);
5221   }
5222 
5223   if (BuiltinID == ARM::BI__builtin_arm_dbg) {
5224     Value *Option = EmitScalarExpr(E->getArg(0));
5225     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_dbg), Option);
5226   }
5227 
5228   if (BuiltinID == ARM::BI__builtin_arm_prefetch) {
5229     Value *Address = EmitScalarExpr(E->getArg(0));
5230     Value *RW      = EmitScalarExpr(E->getArg(1));
5231     Value *IsData  = EmitScalarExpr(E->getArg(2));
5232 
5233     // Locality is not supported on ARM target
5234     Value *Locality = llvm::ConstantInt::get(Int32Ty, 3);
5235 
5236     Value *F = CGM.getIntrinsic(Intrinsic::prefetch);
5237     return Builder.CreateCall(F, {Address, RW, Locality, IsData});
5238   }
5239 
5240   if (BuiltinID == ARM::BI__builtin_arm_rbit) {
5241     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
5242     return Builder.CreateCall(
5243         CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
5244   }
5245 
5246   if (BuiltinID == ARM::BI__clear_cache) {
5247     assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
5248     const FunctionDecl *FD = E->getDirectCallee();
5249     Value *Ops[2];
5250     for (unsigned i = 0; i < 2; i++)
5251       Ops[i] = EmitScalarExpr(E->getArg(i));
5252     llvm::Type *Ty = CGM.getTypes().ConvertType(FD->getType());
5253     llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
5254     StringRef Name = FD->getName();
5255     return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops);
5256   }
5257 
5258   if (BuiltinID == ARM::BI__builtin_arm_mcrr ||
5259       BuiltinID == ARM::BI__builtin_arm_mcrr2) {
5260     Function *F;
5261 
5262     switch (BuiltinID) {
5263     default: llvm_unreachable("unexpected builtin");
5264     case ARM::BI__builtin_arm_mcrr:
5265       F = CGM.getIntrinsic(Intrinsic::arm_mcrr);
5266       break;
5267     case ARM::BI__builtin_arm_mcrr2:
5268       F = CGM.getIntrinsic(Intrinsic::arm_mcrr2);
5269       break;
5270     }
5271 
5272     // MCRR{2} instruction has 5 operands but
5273     // the intrinsic has 4 because Rt and Rt2
5274     // are represented as a single unsigned 64
5275     // bit integer in the intrinsic definition
5276     // but internally it's represented as 2 32
5277     // bit integers.
5278 
5279     Value *Coproc = EmitScalarExpr(E->getArg(0));
5280     Value *Opc1 = EmitScalarExpr(E->getArg(1));
5281     Value *RtAndRt2 = EmitScalarExpr(E->getArg(2));
5282     Value *CRm = EmitScalarExpr(E->getArg(3));
5283 
5284     Value *C1 = llvm::ConstantInt::get(Int64Ty, 32);
5285     Value *Rt = Builder.CreateTruncOrBitCast(RtAndRt2, Int32Ty);
5286     Value *Rt2 = Builder.CreateLShr(RtAndRt2, C1);
5287     Rt2 = Builder.CreateTruncOrBitCast(Rt2, Int32Ty);
5288 
5289     return Builder.CreateCall(F, {Coproc, Opc1, Rt, Rt2, CRm});
5290   }
5291 
5292   if (BuiltinID == ARM::BI__builtin_arm_mrrc ||
5293       BuiltinID == ARM::BI__builtin_arm_mrrc2) {
5294     Function *F;
5295 
5296     switch (BuiltinID) {
5297     default: llvm_unreachable("unexpected builtin");
5298     case ARM::BI__builtin_arm_mrrc:
5299       F = CGM.getIntrinsic(Intrinsic::arm_mrrc);
5300       break;
5301     case ARM::BI__builtin_arm_mrrc2:
5302       F = CGM.getIntrinsic(Intrinsic::arm_mrrc2);
5303       break;
5304     }
5305 
5306     Value *Coproc = EmitScalarExpr(E->getArg(0));
5307     Value *Opc1 = EmitScalarExpr(E->getArg(1));
5308     Value *CRm  = EmitScalarExpr(E->getArg(2));
5309     Value *RtAndRt2 = Builder.CreateCall(F, {Coproc, Opc1, CRm});
5310 
5311     // Returns an unsigned 64 bit integer, represented
5312     // as two 32 bit integers.
5313 
5314     Value *Rt = Builder.CreateExtractValue(RtAndRt2, 1);
5315     Value *Rt1 = Builder.CreateExtractValue(RtAndRt2, 0);
5316     Rt = Builder.CreateZExt(Rt, Int64Ty);
5317     Rt1 = Builder.CreateZExt(Rt1, Int64Ty);
5318 
5319     Value *ShiftCast = llvm::ConstantInt::get(Int64Ty, 32);
5320     RtAndRt2 = Builder.CreateShl(Rt, ShiftCast, "shl", true);
5321     RtAndRt2 = Builder.CreateOr(RtAndRt2, Rt1);
5322 
5323     return Builder.CreateBitCast(RtAndRt2, ConvertType(E->getType()));
5324   }
5325 
5326   if (BuiltinID == ARM::BI__builtin_arm_ldrexd ||
5327       ((BuiltinID == ARM::BI__builtin_arm_ldrex ||
5328         BuiltinID == ARM::BI__builtin_arm_ldaex) &&
5329        getContext().getTypeSize(E->getType()) == 64) ||
5330       BuiltinID == ARM::BI__ldrexd) {
5331     Function *F;
5332 
5333     switch (BuiltinID) {
5334     default: llvm_unreachable("unexpected builtin");
5335     case ARM::BI__builtin_arm_ldaex:
5336       F = CGM.getIntrinsic(Intrinsic::arm_ldaexd);
5337       break;
5338     case ARM::BI__builtin_arm_ldrexd:
5339     case ARM::BI__builtin_arm_ldrex:
5340     case ARM::BI__ldrexd:
5341       F = CGM.getIntrinsic(Intrinsic::arm_ldrexd);
5342       break;
5343     }
5344 
5345     Value *LdPtr = EmitScalarExpr(E->getArg(0));
5346     Value *Val = Builder.CreateCall(F, Builder.CreateBitCast(LdPtr, Int8PtrTy),
5347                                     "ldrexd");
5348 
5349     Value *Val0 = Builder.CreateExtractValue(Val, 1);
5350     Value *Val1 = Builder.CreateExtractValue(Val, 0);
5351     Val0 = Builder.CreateZExt(Val0, Int64Ty);
5352     Val1 = Builder.CreateZExt(Val1, Int64Ty);
5353 
5354     Value *ShiftCst = llvm::ConstantInt::get(Int64Ty, 32);
5355     Val = Builder.CreateShl(Val0, ShiftCst, "shl", true /* nuw */);
5356     Val = Builder.CreateOr(Val, Val1);
5357     return Builder.CreateBitCast(Val, ConvertType(E->getType()));
5358   }
5359 
5360   if (BuiltinID == ARM::BI__builtin_arm_ldrex ||
5361       BuiltinID == ARM::BI__builtin_arm_ldaex) {
5362     Value *LoadAddr = EmitScalarExpr(E->getArg(0));
5363 
5364     QualType Ty = E->getType();
5365     llvm::Type *RealResTy = ConvertType(Ty);
5366     llvm::Type *PtrTy = llvm::IntegerType::get(
5367         getLLVMContext(), getContext().getTypeSize(Ty))->getPointerTo();
5368     LoadAddr = Builder.CreateBitCast(LoadAddr, PtrTy);
5369 
5370     Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI__builtin_arm_ldaex
5371                                        ? Intrinsic::arm_ldaex
5372                                        : Intrinsic::arm_ldrex,
5373                                    PtrTy);
5374     Value *Val = Builder.CreateCall(F, LoadAddr, "ldrex");
5375 
5376     if (RealResTy->isPointerTy())
5377       return Builder.CreateIntToPtr(Val, RealResTy);
5378     else {
5379       llvm::Type *IntResTy = llvm::IntegerType::get(
5380           getLLVMContext(), CGM.getDataLayout().getTypeSizeInBits(RealResTy));
5381       Val = Builder.CreateTruncOrBitCast(Val, IntResTy);
5382       return Builder.CreateBitCast(Val, RealResTy);
5383     }
5384   }
5385 
5386   if (BuiltinID == ARM::BI__builtin_arm_strexd ||
5387       ((BuiltinID == ARM::BI__builtin_arm_stlex ||
5388         BuiltinID == ARM::BI__builtin_arm_strex) &&
5389        getContext().getTypeSize(E->getArg(0)->getType()) == 64)) {
5390     Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI__builtin_arm_stlex
5391                                        ? Intrinsic::arm_stlexd
5392                                        : Intrinsic::arm_strexd);
5393     llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty);
5394 
5395     Address Tmp = CreateMemTemp(E->getArg(0)->getType());
5396     Value *Val = EmitScalarExpr(E->getArg(0));
5397     Builder.CreateStore(Val, Tmp);
5398 
5399     Address LdPtr = Builder.CreateBitCast(Tmp,llvm::PointerType::getUnqual(STy));
5400     Val = Builder.CreateLoad(LdPtr);
5401 
5402     Value *Arg0 = Builder.CreateExtractValue(Val, 0);
5403     Value *Arg1 = Builder.CreateExtractValue(Val, 1);
5404     Value *StPtr = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)), Int8PtrTy);
5405     return Builder.CreateCall(F, {Arg0, Arg1, StPtr}, "strexd");
5406   }
5407 
5408   if (BuiltinID == ARM::BI__builtin_arm_strex ||
5409       BuiltinID == ARM::BI__builtin_arm_stlex) {
5410     Value *StoreVal = EmitScalarExpr(E->getArg(0));
5411     Value *StoreAddr = EmitScalarExpr(E->getArg(1));
5412 
5413     QualType Ty = E->getArg(0)->getType();
5414     llvm::Type *StoreTy = llvm::IntegerType::get(getLLVMContext(),
5415                                                  getContext().getTypeSize(Ty));
5416     StoreAddr = Builder.CreateBitCast(StoreAddr, StoreTy->getPointerTo());
5417 
5418     if (StoreVal->getType()->isPointerTy())
5419       StoreVal = Builder.CreatePtrToInt(StoreVal, Int32Ty);
5420     else {
5421       llvm::Type *IntTy = llvm::IntegerType::get(
5422           getLLVMContext(),
5423           CGM.getDataLayout().getTypeSizeInBits(StoreVal->getType()));
5424       StoreVal = Builder.CreateBitCast(StoreVal, IntTy);
5425       StoreVal = Builder.CreateZExtOrBitCast(StoreVal, Int32Ty);
5426     }
5427 
5428     Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI__builtin_arm_stlex
5429                                        ? Intrinsic::arm_stlex
5430                                        : Intrinsic::arm_strex,
5431                                    StoreAddr->getType());
5432     return Builder.CreateCall(F, {StoreVal, StoreAddr}, "strex");
5433   }
5434 
5435   switch (BuiltinID) {
5436   case ARM::BI__iso_volatile_load8:
5437   case ARM::BI__iso_volatile_load16:
5438   case ARM::BI__iso_volatile_load32:
5439   case ARM::BI__iso_volatile_load64: {
5440     Value *Ptr = EmitScalarExpr(E->getArg(0));
5441     QualType ElTy = E->getArg(0)->getType()->getPointeeType();
5442     CharUnits LoadSize = getContext().getTypeSizeInChars(ElTy);
5443     llvm::Type *ITy = llvm::IntegerType::get(getLLVMContext(),
5444                                              LoadSize.getQuantity() * 8);
5445     Ptr = Builder.CreateBitCast(Ptr, ITy->getPointerTo());
5446     llvm::LoadInst *Load =
5447       Builder.CreateAlignedLoad(Ptr, LoadSize);
5448     Load->setVolatile(true);
5449     return Load;
5450   }
5451   case ARM::BI__iso_volatile_store8:
5452   case ARM::BI__iso_volatile_store16:
5453   case ARM::BI__iso_volatile_store32:
5454   case ARM::BI__iso_volatile_store64: {
5455     Value *Ptr = EmitScalarExpr(E->getArg(0));
5456     Value *Value = EmitScalarExpr(E->getArg(1));
5457     QualType ElTy = E->getArg(0)->getType()->getPointeeType();
5458     CharUnits StoreSize = getContext().getTypeSizeInChars(ElTy);
5459     llvm::Type *ITy = llvm::IntegerType::get(getLLVMContext(),
5460                                              StoreSize.getQuantity() * 8);
5461     Ptr = Builder.CreateBitCast(Ptr, ITy->getPointerTo());
5462     llvm::StoreInst *Store =
5463       Builder.CreateAlignedStore(Value, Ptr,
5464                                  StoreSize);
5465     Store->setVolatile(true);
5466     return Store;
5467   }
5468   }
5469 
5470   if (BuiltinID == ARM::BI__builtin_arm_clrex) {
5471     Function *F = CGM.getIntrinsic(Intrinsic::arm_clrex);
5472     return Builder.CreateCall(F);
5473   }
5474 
5475   // CRC32
5476   Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic;
5477   switch (BuiltinID) {
5478   case ARM::BI__builtin_arm_crc32b:
5479     CRCIntrinsicID = Intrinsic::arm_crc32b; break;
5480   case ARM::BI__builtin_arm_crc32cb:
5481     CRCIntrinsicID = Intrinsic::arm_crc32cb; break;
5482   case ARM::BI__builtin_arm_crc32h:
5483     CRCIntrinsicID = Intrinsic::arm_crc32h; break;
5484   case ARM::BI__builtin_arm_crc32ch:
5485     CRCIntrinsicID = Intrinsic::arm_crc32ch; break;
5486   case ARM::BI__builtin_arm_crc32w:
5487   case ARM::BI__builtin_arm_crc32d:
5488     CRCIntrinsicID = Intrinsic::arm_crc32w; break;
5489   case ARM::BI__builtin_arm_crc32cw:
5490   case ARM::BI__builtin_arm_crc32cd:
5491     CRCIntrinsicID = Intrinsic::arm_crc32cw; break;
5492   }
5493 
5494   if (CRCIntrinsicID != Intrinsic::not_intrinsic) {
5495     Value *Arg0 = EmitScalarExpr(E->getArg(0));
5496     Value *Arg1 = EmitScalarExpr(E->getArg(1));
5497 
    // crc32{c,}d intrinsics are implemented as two calls to crc32{c,}w
5499     // intrinsics, hence we need different codegen for these cases.
5500     if (BuiltinID == ARM::BI__builtin_arm_crc32d ||
5501         BuiltinID == ARM::BI__builtin_arm_crc32cd) {
5502       Value *C1 = llvm::ConstantInt::get(Int64Ty, 32);
5503       Value *Arg1a = Builder.CreateTruncOrBitCast(Arg1, Int32Ty);
5504       Value *Arg1b = Builder.CreateLShr(Arg1, C1);
5505       Arg1b = Builder.CreateTruncOrBitCast(Arg1b, Int32Ty);
5506 
5507       Function *F = CGM.getIntrinsic(CRCIntrinsicID);
5508       Value *Res = Builder.CreateCall(F, {Arg0, Arg1a});
5509       return Builder.CreateCall(F, {Res, Arg1b});
5510     } else {
5511       Arg1 = Builder.CreateZExtOrBitCast(Arg1, Int32Ty);
5512 
5513       Function *F = CGM.getIntrinsic(CRCIntrinsicID);
5514       return Builder.CreateCall(F, {Arg0, Arg1});
5515     }
5516   }
5517 
5518   if (BuiltinID == ARM::BI__builtin_arm_rsr ||
5519       BuiltinID == ARM::BI__builtin_arm_rsr64 ||
5520       BuiltinID == ARM::BI__builtin_arm_rsrp ||
5521       BuiltinID == ARM::BI__builtin_arm_wsr ||
5522       BuiltinID == ARM::BI__builtin_arm_wsr64 ||
5523       BuiltinID == ARM::BI__builtin_arm_wsrp) {
5524 
5525     bool IsRead = BuiltinID == ARM::BI__builtin_arm_rsr ||
5526                   BuiltinID == ARM::BI__builtin_arm_rsr64 ||
5527                   BuiltinID == ARM::BI__builtin_arm_rsrp;
5528 
5529     bool IsPointerBuiltin = BuiltinID == ARM::BI__builtin_arm_rsrp ||
5530                             BuiltinID == ARM::BI__builtin_arm_wsrp;
5531 
5532     bool Is64Bit = BuiltinID == ARM::BI__builtin_arm_rsr64 ||
5533                    BuiltinID == ARM::BI__builtin_arm_wsr64;
5534 
5535     llvm::Type *ValueType;
5536     llvm::Type *RegisterType;
5537     if (IsPointerBuiltin) {
5538       ValueType = VoidPtrTy;
5539       RegisterType = Int32Ty;
5540     } else if (Is64Bit) {
5541       ValueType = RegisterType = Int64Ty;
5542     } else {
5543       ValueType = RegisterType = Int32Ty;
5544     }
5545 
5546     return EmitSpecialRegisterBuiltin(*this, E, RegisterType, ValueType, IsRead);
5547   }
5548 
5549   // Find out if any arguments are required to be integer constant
5550   // expressions.
5551   unsigned ICEArguments = 0;
5552   ASTContext::GetBuiltinTypeError Error;
5553   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
5554   assert(Error == ASTContext::GE_None && "Should not codegen an error");
5555 
5556   auto getAlignmentValue32 = [&](Address addr) -> Value* {
5557     return Builder.getInt32(addr.getAlignment().getQuantity());
5558   };
5559 
5560   Address PtrOp0 = Address::invalid();
5561   Address PtrOp1 = Address::invalid();
5562   SmallVector<Value*, 4> Ops;
5563   bool HasExtraArg = HasExtraNeonArgument(BuiltinID);
5564   unsigned NumArgs = E->getNumArgs() - (HasExtraArg ? 1 : 0);
5565   for (unsigned i = 0, e = NumArgs; i != e; i++) {
5566     if (i == 0) {
5567       switch (BuiltinID) {
5568       case NEON::BI__builtin_neon_vld1_v:
5569       case NEON::BI__builtin_neon_vld1q_v:
5570       case NEON::BI__builtin_neon_vld1q_lane_v:
5571       case NEON::BI__builtin_neon_vld1_lane_v:
5572       case NEON::BI__builtin_neon_vld1_dup_v:
5573       case NEON::BI__builtin_neon_vld1q_dup_v:
5574       case NEON::BI__builtin_neon_vst1_v:
5575       case NEON::BI__builtin_neon_vst1q_v:
5576       case NEON::BI__builtin_neon_vst1q_lane_v:
5577       case NEON::BI__builtin_neon_vst1_lane_v:
5578       case NEON::BI__builtin_neon_vst2_v:
5579       case NEON::BI__builtin_neon_vst2q_v:
5580       case NEON::BI__builtin_neon_vst2_lane_v:
5581       case NEON::BI__builtin_neon_vst2q_lane_v:
5582       case NEON::BI__builtin_neon_vst3_v:
5583       case NEON::BI__builtin_neon_vst3q_v:
5584       case NEON::BI__builtin_neon_vst3_lane_v:
5585       case NEON::BI__builtin_neon_vst3q_lane_v:
5586       case NEON::BI__builtin_neon_vst4_v:
5587       case NEON::BI__builtin_neon_vst4q_v:
5588       case NEON::BI__builtin_neon_vst4_lane_v:
5589       case NEON::BI__builtin_neon_vst4q_lane_v:
5590         // Get the alignment for the argument in addition to the value;
5591         // we'll use it later.
5592         PtrOp0 = EmitPointerWithAlignment(E->getArg(0));
5593         Ops.push_back(PtrOp0.getPointer());
5594         continue;
5595       }
5596     }
5597     if (i == 1) {
5598       switch (BuiltinID) {
5599       case NEON::BI__builtin_neon_vld2_v:
5600       case NEON::BI__builtin_neon_vld2q_v:
5601       case NEON::BI__builtin_neon_vld3_v:
5602       case NEON::BI__builtin_neon_vld3q_v:
5603       case NEON::BI__builtin_neon_vld4_v:
5604       case NEON::BI__builtin_neon_vld4q_v:
5605       case NEON::BI__builtin_neon_vld2_lane_v:
5606       case NEON::BI__builtin_neon_vld2q_lane_v:
5607       case NEON::BI__builtin_neon_vld3_lane_v:
5608       case NEON::BI__builtin_neon_vld3q_lane_v:
5609       case NEON::BI__builtin_neon_vld4_lane_v:
5610       case NEON::BI__builtin_neon_vld4q_lane_v:
5611       case NEON::BI__builtin_neon_vld2_dup_v:
5612       case NEON::BI__builtin_neon_vld3_dup_v:
5613       case NEON::BI__builtin_neon_vld4_dup_v:
5614         // Get the alignment for the argument in addition to the value;
5615         // we'll use it later.
5616         PtrOp1 = EmitPointerWithAlignment(E->getArg(1));
5617         Ops.push_back(PtrOp1.getPointer());
5618         continue;
5619       }
5620     }
5621 
5622     if ((ICEArguments & (1 << i)) == 0) {
5623       Ops.push_back(EmitScalarExpr(E->getArg(i)));
5624     } else {
5625       // If this is required to be a constant, constant fold it so that we know
5626       // that the generated intrinsic gets a ConstantInt.
5627       llvm::APSInt Result;
5628       bool IsConst = E->getArg(i)->isIntegerConstantExpr(Result, getContext());
5629       assert(IsConst && "Constant arg isn't actually constant?"); (void)IsConst;
5630       Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), Result));
5631     }
5632   }
5633 
5634   switch (BuiltinID) {
5635   default: break;
5636 
5637   case NEON::BI__builtin_neon_vget_lane_i8:
5638   case NEON::BI__builtin_neon_vget_lane_i16:
5639   case NEON::BI__builtin_neon_vget_lane_i32:
5640   case NEON::BI__builtin_neon_vget_lane_i64:
5641   case NEON::BI__builtin_neon_vget_lane_f32:
5642   case NEON::BI__builtin_neon_vgetq_lane_i8:
5643   case NEON::BI__builtin_neon_vgetq_lane_i16:
5644   case NEON::BI__builtin_neon_vgetq_lane_i32:
5645   case NEON::BI__builtin_neon_vgetq_lane_i64:
5646   case NEON::BI__builtin_neon_vgetq_lane_f32:
5647     return Builder.CreateExtractElement(Ops[0], Ops[1], "vget_lane");
5648 
5649   case NEON::BI__builtin_neon_vrndns_f32: {
5650     Value *Arg = EmitScalarExpr(E->getArg(0));
5651     llvm::Type *Tys[] = {Arg->getType()};
5652     Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vrintn, Tys);
5653     return Builder.CreateCall(F, {Arg}, "vrndn"); }
5654 
5655   case NEON::BI__builtin_neon_vset_lane_i8:
5656   case NEON::BI__builtin_neon_vset_lane_i16:
5657   case NEON::BI__builtin_neon_vset_lane_i32:
5658   case NEON::BI__builtin_neon_vset_lane_i64:
5659   case NEON::BI__builtin_neon_vset_lane_f32:
5660   case NEON::BI__builtin_neon_vsetq_lane_i8:
5661   case NEON::BI__builtin_neon_vsetq_lane_i16:
5662   case NEON::BI__builtin_neon_vsetq_lane_i32:
5663   case NEON::BI__builtin_neon_vsetq_lane_i64:
5664   case NEON::BI__builtin_neon_vsetq_lane_f32:
5665     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
5666 
5667   case NEON::BI__builtin_neon_vsha1h_u32:
5668     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1h), Ops,
5669                         "vsha1h");
5670   case NEON::BI__builtin_neon_vsha1cq_u32:
5671     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1c), Ops,
5672                         "vsha1h");
5673   case NEON::BI__builtin_neon_vsha1pq_u32:
5674     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1p), Ops,
5675                         "vsha1h");
5676   case NEON::BI__builtin_neon_vsha1mq_u32:
5677     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1m), Ops,
5678                         "vsha1h");
5679 
5680   // The ARM _MoveToCoprocessor builtins put the input register value as
5681   // the first argument, but the LLVM intrinsic expects it as the third one.
5682   case ARM::BI_MoveToCoprocessor:
5683   case ARM::BI_MoveToCoprocessor2: {
5684     Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI_MoveToCoprocessor ?
5685                                    Intrinsic::arm_mcr : Intrinsic::arm_mcr2);
5686     return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0],
5687                                   Ops[3], Ops[4], Ops[5]});
5688   }
5689   case ARM::BI_BitScanForward:
5690   case ARM::BI_BitScanForward64:
5691     return EmitMSVCBuiltinExpr(MSVCIntrin::_BitScanForward, E);
5692   case ARM::BI_BitScanReverse:
5693   case ARM::BI_BitScanReverse64:
5694     return EmitMSVCBuiltinExpr(MSVCIntrin::_BitScanReverse, E);
5695 
5696   case ARM::BI_InterlockedAnd64:
5697     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedAnd, E);
5698   case ARM::BI_InterlockedExchange64:
5699     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchange, E);
5700   case ARM::BI_InterlockedExchangeAdd64:
5701     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeAdd, E);
5702   case ARM::BI_InterlockedExchangeSub64:
5703     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeSub, E);
5704   case ARM::BI_InterlockedOr64:
5705     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedOr, E);
5706   case ARM::BI_InterlockedXor64:
5707     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedXor, E);
5708   case ARM::BI_InterlockedDecrement64:
5709     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedDecrement, E);
5710   case ARM::BI_InterlockedIncrement64:
5711     return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedIncrement, E);
5712   }
5713 
5714   // Get the last argument, which specifies the vector type.
5715   assert(HasExtraArg);
5716   llvm::APSInt Result;
5717   const Expr *Arg = E->getArg(E->getNumArgs()-1);
5718   if (!Arg->isIntegerConstantExpr(Result, getContext()))
5719     return nullptr;
5720 
5721   if (BuiltinID == ARM::BI__builtin_arm_vcvtr_f ||
5722       BuiltinID == ARM::BI__builtin_arm_vcvtr_d) {
5723     // Determine the overloaded type of this builtin.
5724     llvm::Type *Ty;
5725     if (BuiltinID == ARM::BI__builtin_arm_vcvtr_f)
5726       Ty = FloatTy;
5727     else
5728       Ty = DoubleTy;
5729 
5730     // Determine whether this is an unsigned conversion or not.
5731     bool usgn = Result.getZExtValue() == 1;
5732     unsigned Int = usgn ? Intrinsic::arm_vcvtru : Intrinsic::arm_vcvtr;
5733 
5734     // Call the appropriate intrinsic.
5735     Function *F = CGM.getIntrinsic(Int, Ty);
5736     return Builder.CreateCall(F, Ops, "vcvtr");
5737   }
5738 
5739   // Determine the type of this overloaded NEON intrinsic.
5740   NeonTypeFlags Type(Result.getZExtValue());
5741   bool usgn = Type.isUnsigned();
5742   bool rightShift = false;
5743 
5744   llvm::VectorType *VTy = GetNeonType(this, Type,
5745                                       getTarget().hasLegalHalfType());
5746   llvm::Type *Ty = VTy;
5747   if (!Ty)
5748     return nullptr;
5749 
5750   // Many NEON builtins have identical semantics and uses in ARM and
5751   // AArch64. Emit these in a single function.
5752   auto IntrinsicMap = makeArrayRef(ARMSIMDIntrinsicMap);
5753   const NeonIntrinsicInfo *Builtin = findNeonIntrinsicInMap(
5754       IntrinsicMap, BuiltinID, NEONSIMDIntrinsicsProvenSorted);
5755   if (Builtin)
5756     return EmitCommonNeonBuiltinExpr(
5757         Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
5758         Builtin->NameHint, Builtin->TypeModifier, E, Ops, PtrOp0, PtrOp1, Arch);
5759 
5760   unsigned Int;
5761   switch (BuiltinID) {
5762   default: return nullptr;
5763   case NEON::BI__builtin_neon_vld1q_lane_v:
5764     // Handle 64-bit integer elements as a special case.  Use shuffles of
5765     // one-element vectors to avoid poor code for i64 in the backend.
5766     if (VTy->getElementType()->isIntegerTy(64)) {
5767       // Extract the other lane.
5768       Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
5769       uint32_t Lane = cast<ConstantInt>(Ops[2])->getZExtValue();
5770       Value *SV = llvm::ConstantVector::get(ConstantInt::get(Int32Ty, 1-Lane));
5771       Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV);
5772       // Load the value as a one-element vector.
5773       Ty = llvm::VectorType::get(VTy->getElementType(), 1);
5774       llvm::Type *Tys[] = {Ty, Int8PtrTy};
5775       Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld1, Tys);
5776       Value *Align = getAlignmentValue32(PtrOp0);
5777       Value *Ld = Builder.CreateCall(F, {Ops[0], Align});
5778       // Combine them.
5779       uint32_t Indices[] = {1 - Lane, Lane};
5780       SV = llvm::ConstantDataVector::get(getLLVMContext(), Indices);
5781       return Builder.CreateShuffleVector(Ops[1], Ld, SV, "vld1q_lane");
5782     }
5783     LLVM_FALLTHROUGH;
5784   case NEON::BI__builtin_neon_vld1_lane_v: {
5785     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
5786     PtrOp0 = Builder.CreateElementBitCast(PtrOp0, VTy->getElementType());
5787     Value *Ld = Builder.CreateLoad(PtrOp0);
5788     return Builder.CreateInsertElement(Ops[1], Ld, Ops[2], "vld1_lane");
5789   }
5790   case NEON::BI__builtin_neon_vld2_dup_v:
5791   case NEON::BI__builtin_neon_vld3_dup_v:
5792   case NEON::BI__builtin_neon_vld4_dup_v: {
5793     // Handle 64-bit elements as a special-case.  There is no "dup" needed.
5794     if (VTy->getElementType()->getPrimitiveSizeInBits() == 64) {
5795       switch (BuiltinID) {
5796       case NEON::BI__builtin_neon_vld2_dup_v:
5797         Int = Intrinsic::arm_neon_vld2;
5798         break;
5799       case NEON::BI__builtin_neon_vld3_dup_v:
5800         Int = Intrinsic::arm_neon_vld3;
5801         break;
5802       case NEON::BI__builtin_neon_vld4_dup_v:
5803         Int = Intrinsic::arm_neon_vld4;
5804         break;
5805       default: llvm_unreachable("unknown vld_dup intrinsic?");
5806       }
5807       llvm::Type *Tys[] = {Ty, Int8PtrTy};
5808       Function *F = CGM.getIntrinsic(Int, Tys);
5809       llvm::Value *Align = getAlignmentValue32(PtrOp1);
5810       Ops[1] = Builder.CreateCall(F, {Ops[1], Align}, "vld_dup");
5811       Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
5812       Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
5813       return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
5814     }
5815     switch (BuiltinID) {
5816     case NEON::BI__builtin_neon_vld2_dup_v:
5817       Int = Intrinsic::arm_neon_vld2lane;
5818       break;
5819     case NEON::BI__builtin_neon_vld3_dup_v:
5820       Int = Intrinsic::arm_neon_vld3lane;
5821       break;
5822     case NEON::BI__builtin_neon_vld4_dup_v:
5823       Int = Intrinsic::arm_neon_vld4lane;
5824       break;
5825     default: llvm_unreachable("unknown vld_dup intrinsic?");
5826     }
5827     llvm::Type *Tys[] = {Ty, Int8PtrTy};
5828     Function *F = CGM.getIntrinsic(Int, Tys);
5829     llvm::StructType *STy = cast<llvm::StructType>(F->getReturnType());
5830 
5831     SmallVector<Value*, 6> Args;
5832     Args.push_back(Ops[1]);
5833     Args.append(STy->getNumElements(), UndefValue::get(Ty));
5834 
5835     llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
5836     Args.push_back(CI);
5837     Args.push_back(getAlignmentValue32(PtrOp1));
5838 
5839     Ops[1] = Builder.CreateCall(F, Args, "vld_dup");
5840     // splat lane 0 to all elts in each vector of the result.
5841     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
5842       Value *Val = Builder.CreateExtractValue(Ops[1], i);
5843       Value *Elt = Builder.CreateBitCast(Val, Ty);
5844       Elt = EmitNeonSplat(Elt, CI);
5845       Elt = Builder.CreateBitCast(Elt, Val->getType());
5846       Ops[1] = Builder.CreateInsertValue(Ops[1], Elt, i);
5847     }
5848     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
5849     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
5850     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
5851   }
5852   case NEON::BI__builtin_neon_vqrshrn_n_v:
5853     Int =
5854       usgn ? Intrinsic::arm_neon_vqrshiftnu : Intrinsic::arm_neon_vqrshiftns;
5855     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n",
5856                         1, true);
5857   case NEON::BI__builtin_neon_vqrshrun_n_v:
5858     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqrshiftnsu, Ty),
5859                         Ops, "vqrshrun_n", 1, true);
5860   case NEON::BI__builtin_neon_vqshrn_n_v:
5861     Int = usgn ? Intrinsic::arm_neon_vqshiftnu : Intrinsic::arm_neon_vqshiftns;
5862     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n",
5863                         1, true);
5864   case NEON::BI__builtin_neon_vqshrun_n_v:
5865     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqshiftnsu, Ty),
5866                         Ops, "vqshrun_n", 1, true);
5867   case NEON::BI__builtin_neon_vrecpe_v:
5868   case NEON::BI__builtin_neon_vrecpeq_v:
5869     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrecpe, Ty),
5870                         Ops, "vrecpe");
5871   case NEON::BI__builtin_neon_vrshrn_n_v:
5872     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrshiftn, Ty),
5873                         Ops, "vrshrn_n", 1, true);
5874   case NEON::BI__builtin_neon_vrsra_n_v:
5875   case NEON::BI__builtin_neon_vrsraq_n_v:
5876     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
5877     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
5878     Ops[2] = EmitNeonShiftVector(Ops[2], Ty, true);
5879     Int = usgn ? Intrinsic::arm_neon_vrshiftu : Intrinsic::arm_neon_vrshifts;
5880     Ops[1] = Builder.CreateCall(CGM.getIntrinsic(Int, Ty), {Ops[1], Ops[2]});
5881     return Builder.CreateAdd(Ops[0], Ops[1], "vrsra_n");
5882   case NEON::BI__builtin_neon_vsri_n_v:
5883   case NEON::BI__builtin_neon_vsriq_n_v:
5884     rightShift = true;
5885     LLVM_FALLTHROUGH;
5886   case NEON::BI__builtin_neon_vsli_n_v:
5887   case NEON::BI__builtin_neon_vsliq_n_v:
5888     Ops[2] = EmitNeonShiftVector(Ops[2], Ty, rightShift);
5889     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vshiftins, Ty),
5890                         Ops, "vsli_n");
5891   case NEON::BI__builtin_neon_vsra_n_v:
5892   case NEON::BI__builtin_neon_vsraq_n_v:
5893     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
5894     Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n");
5895     return Builder.CreateAdd(Ops[0], Ops[1]);
5896   case NEON::BI__builtin_neon_vst1q_lane_v:
5897     // Handle 64-bit integer elements as a special case.  Use a shuffle to get
5898     // a one-element vector and avoid poor code for i64 in the backend.
5899     if (VTy->getElementType()->isIntegerTy(64)) {
5900       Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
5901       Value *SV = llvm::ConstantVector::get(cast<llvm::Constant>(Ops[2]));
5902       Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV);
5903       Ops[2] = getAlignmentValue32(PtrOp0);
5904       llvm::Type *Tys[] = {Int8PtrTy, Ops[1]->getType()};
5905       return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst1,
5906                                                  Tys), Ops);
5907     }
5908     LLVM_FALLTHROUGH;
5909   case NEON::BI__builtin_neon_vst1_lane_v: {
5910     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
5911     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
5912     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
5913     auto St = Builder.CreateStore(Ops[1], Builder.CreateBitCast(PtrOp0, Ty));
5914     return St;
5915   }
5916   case NEON::BI__builtin_neon_vtbl1_v:
5917     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl1),
5918                         Ops, "vtbl1");
5919   case NEON::BI__builtin_neon_vtbl2_v:
5920     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl2),
5921                         Ops, "vtbl2");
5922   case NEON::BI__builtin_neon_vtbl3_v:
5923     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl3),
5924                         Ops, "vtbl3");
5925   case NEON::BI__builtin_neon_vtbl4_v:
5926     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl4),
5927                         Ops, "vtbl4");
5928   case NEON::BI__builtin_neon_vtbx1_v:
5929     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx1),
5930                         Ops, "vtbx1");
5931   case NEON::BI__builtin_neon_vtbx2_v:
5932     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx2),
5933                         Ops, "vtbx2");
5934   case NEON::BI__builtin_neon_vtbx3_v:
5935     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx3),
5936                         Ops, "vtbx3");
5937   case NEON::BI__builtin_neon_vtbx4_v:
5938     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx4),
5939                         Ops, "vtbx4");
5940   }
5941 }
5942 
5943 static Value *EmitAArch64TblBuiltinExpr(CodeGenFunction &CGF, unsigned BuiltinID,
5944                                       const CallExpr *E,
5945                                       SmallVectorImpl<Value *> &Ops,
5946                                       llvm::Triple::ArchType Arch) {
5947   unsigned int Int = 0;
5948   const char *s = nullptr;
5949 
5950   switch (BuiltinID) {
5951   default:
5952     return nullptr;
5953   case NEON::BI__builtin_neon_vtbl1_v:
5954   case NEON::BI__builtin_neon_vqtbl1_v:
5955   case NEON::BI__builtin_neon_vqtbl1q_v:
5956   case NEON::BI__builtin_neon_vtbl2_v:
5957   case NEON::BI__builtin_neon_vqtbl2_v:
5958   case NEON::BI__builtin_neon_vqtbl2q_v:
5959   case NEON::BI__builtin_neon_vtbl3_v:
5960   case NEON::BI__builtin_neon_vqtbl3_v:
5961   case NEON::BI__builtin_neon_vqtbl3q_v:
5962   case NEON::BI__builtin_neon_vtbl4_v:
5963   case NEON::BI__builtin_neon_vqtbl4_v:
5964   case NEON::BI__builtin_neon_vqtbl4q_v:
5965     break;
5966   case NEON::BI__builtin_neon_vtbx1_v:
5967   case NEON::BI__builtin_neon_vqtbx1_v:
5968   case NEON::BI__builtin_neon_vqtbx1q_v:
5969   case NEON::BI__builtin_neon_vtbx2_v:
5970   case NEON::BI__builtin_neon_vqtbx2_v:
5971   case NEON::BI__builtin_neon_vqtbx2q_v:
5972   case NEON::BI__builtin_neon_vtbx3_v:
5973   case NEON::BI__builtin_neon_vqtbx3_v:
5974   case NEON::BI__builtin_neon_vqtbx3q_v:
5975   case NEON::BI__builtin_neon_vtbx4_v:
5976   case NEON::BI__builtin_neon_vqtbx4_v:
5977   case NEON::BI__builtin_neon_vqtbx4q_v:
5978     break;
5979   }
5980 
5981   assert(E->getNumArgs() >= 3);
5982 
5983   // Get the last argument, which specifies the vector type.
5984   llvm::APSInt Result;
5985   const Expr *Arg = E->getArg(E->getNumArgs() - 1);
5986   if (!Arg->isIntegerConstantExpr(Result, CGF.getContext()))
5987     return nullptr;
5988 
5989   // Determine the type of this overloaded NEON intrinsic.
5990   NeonTypeFlags Type(Result.getZExtValue());
5991   llvm::VectorType *Ty = GetNeonType(&CGF, Type);
5992   if (!Ty)
5993     return nullptr;
5994 
5995   CodeGen::CGBuilderTy &Builder = CGF.Builder;
5996 
5997   // AArch64 scalar builtins are not overloaded, they do not have an extra
5998   // argument that specifies the vector type, need to handle each case.
5999   switch (BuiltinID) {
6000   case NEON::BI__builtin_neon_vtbl1_v: {
6001     return packTBLDVectorList(CGF, makeArrayRef(Ops).slice(0, 1), nullptr,
6002                               Ops[1], Ty, Intrinsic::aarch64_neon_tbl1,
6003                               "vtbl1");
6004   }
6005   case NEON::BI__builtin_neon_vtbl2_v: {
6006     return packTBLDVectorList(CGF, makeArrayRef(Ops).slice(0, 2), nullptr,
6007                               Ops[2], Ty, Intrinsic::aarch64_neon_tbl1,
6008                               "vtbl1");
6009   }
6010   case NEON::BI__builtin_neon_vtbl3_v: {
6011     return packTBLDVectorList(CGF, makeArrayRef(Ops).slice(0, 3), nullptr,
6012                               Ops[3], Ty, Intrinsic::aarch64_neon_tbl2,
6013                               "vtbl2");
6014   }
6015   case NEON::BI__builtin_neon_vtbl4_v: {
6016     return packTBLDVectorList(CGF, makeArrayRef(Ops).slice(0, 4), nullptr,
6017                               Ops[4], Ty, Intrinsic::aarch64_neon_tbl2,
6018                               "vtbl2");
6019   }
6020   case NEON::BI__builtin_neon_vtbx1_v: {
6021     Value *TblRes =
6022         packTBLDVectorList(CGF, makeArrayRef(Ops).slice(1, 1), nullptr, Ops[2],
6023                            Ty, Intrinsic::aarch64_neon_tbl1, "vtbl1");
6024 
6025     llvm::Constant *EightV = ConstantInt::get(Ty, 8);
6026     Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[2], EightV);
6027     CmpRes = Builder.CreateSExt(CmpRes, Ty);
6028 
6029     Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]);
6030     Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes);
6031     return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx");
6032   }
6033   case NEON::BI__builtin_neon_vtbx2_v: {
6034     return packTBLDVectorList(CGF, makeArrayRef(Ops).slice(1, 2), Ops[0],
6035                               Ops[3], Ty, Intrinsic::aarch64_neon_tbx1,
6036                               "vtbx1");
6037   }
6038   case NEON::BI__builtin_neon_vtbx3_v: {
6039     Value *TblRes =
6040         packTBLDVectorList(CGF, makeArrayRef(Ops).slice(1, 3), nullptr, Ops[4],
6041                            Ty, Intrinsic::aarch64_neon_tbl2, "vtbl2");
6042 
6043     llvm::Constant *TwentyFourV = ConstantInt::get(Ty, 24);
6044     Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[4],
6045                                            TwentyFourV);
6046     CmpRes = Builder.CreateSExt(CmpRes, Ty);
6047 
6048     Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]);
6049     Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes);
6050     return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx");
6051   }
6052   case NEON::BI__builtin_neon_vtbx4_v: {
6053     return packTBLDVectorList(CGF, makeArrayRef(Ops).slice(1, 4), Ops[0],
6054                               Ops[5], Ty, Intrinsic::aarch64_neon_tbx2,
6055                               "vtbx2");
6056   }
6057   case NEON::BI__builtin_neon_vqtbl1_v:
6058   case NEON::BI__builtin_neon_vqtbl1q_v:
6059     Int = Intrinsic::aarch64_neon_tbl1; s = "vtbl1"; break;
6060   case NEON::BI__builtin_neon_vqtbl2_v:
6061   case NEON::BI__builtin_neon_vqtbl2q_v: {
6062     Int = Intrinsic::aarch64_neon_tbl2; s = "vtbl2"; break;
6063   case NEON::BI__builtin_neon_vqtbl3_v:
6064   case NEON::BI__builtin_neon_vqtbl3q_v:
6065     Int = Intrinsic::aarch64_neon_tbl3; s = "vtbl3"; break;
6066   case NEON::BI__builtin_neon_vqtbl4_v:
6067   case NEON::BI__builtin_neon_vqtbl4q_v:
6068     Int = Intrinsic::aarch64_neon_tbl4; s = "vtbl4"; break;
6069   case NEON::BI__builtin_neon_vqtbx1_v:
6070   case NEON::BI__builtin_neon_vqtbx1q_v:
6071     Int = Intrinsic::aarch64_neon_tbx1; s = "vtbx1"; break;
6072   case NEON::BI__builtin_neon_vqtbx2_v:
6073   case NEON::BI__builtin_neon_vqtbx2q_v:
6074     Int = Intrinsic::aarch64_neon_tbx2; s = "vtbx2"; break;
6075   case NEON::BI__builtin_neon_vqtbx3_v:
6076   case NEON::BI__builtin_neon_vqtbx3q_v:
6077     Int = Intrinsic::aarch64_neon_tbx3; s = "vtbx3"; break;
6078   case NEON::BI__builtin_neon_vqtbx4_v:
6079   case NEON::BI__builtin_neon_vqtbx4q_v:
6080     Int = Intrinsic::aarch64_neon_tbx4; s = "vtbx4"; break;
6081   }
6082   }
6083 
6084   if (!Int)
6085     return nullptr;
6086 
6087   Function *F = CGF.CGM.getIntrinsic(Int, Ty);
6088   return CGF.EmitNeonCall(F, Ops, s);
6089 }
6090 
6091 Value *CodeGenFunction::vectorWrapScalar16(Value *Op) {
6092   llvm::Type *VTy = llvm::VectorType::get(Int16Ty, 4);
6093   Op = Builder.CreateBitCast(Op, Int16Ty);
6094   Value *V = UndefValue::get(VTy);
6095   llvm::Constant *CI = ConstantInt::get(SizeTy, 0);
6096   Op = Builder.CreateInsertElement(V, Op, CI);
6097   return Op;
6098 }
6099 
6100 Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
6101                                                const CallExpr *E,
6102                                                llvm::Triple::ArchType Arch) {
6103   unsigned HintID = static_cast<unsigned>(-1);
6104   switch (BuiltinID) {
6105   default: break;
6106   case AArch64::BI__builtin_arm_nop:
6107     HintID = 0;
6108     break;
6109   case AArch64::BI__builtin_arm_yield:
6110     HintID = 1;
6111     break;
6112   case AArch64::BI__builtin_arm_wfe:
6113     HintID = 2;
6114     break;
6115   case AArch64::BI__builtin_arm_wfi:
6116     HintID = 3;
6117     break;
6118   case AArch64::BI__builtin_arm_sev:
6119     HintID = 4;
6120     break;
6121   case AArch64::BI__builtin_arm_sevl:
6122     HintID = 5;
6123     break;
6124   }
6125 
6126   if (HintID != static_cast<unsigned>(-1)) {
6127     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_hint);
6128     return Builder.CreateCall(F, llvm::ConstantInt::get(Int32Ty, HintID));
6129   }
6130 
6131   if (BuiltinID == AArch64::BI__builtin_arm_prefetch) {
6132     Value *Address         = EmitScalarExpr(E->getArg(0));
6133     Value *RW              = EmitScalarExpr(E->getArg(1));
6134     Value *CacheLevel      = EmitScalarExpr(E->getArg(2));
6135     Value *RetentionPolicy = EmitScalarExpr(E->getArg(3));
6136     Value *IsData          = EmitScalarExpr(E->getArg(4));
6137 
6138     Value *Locality = nullptr;
6139     if (cast<llvm::ConstantInt>(RetentionPolicy)->isZero()) {
6140       // Temporal fetch, needs to convert cache level to locality.
6141       Locality = llvm::ConstantInt::get(Int32Ty,
6142         -cast<llvm::ConstantInt>(CacheLevel)->getValue() + 3);
6143     } else {
6144       // Streaming fetch.
6145       Locality = llvm::ConstantInt::get(Int32Ty, 0);
6146     }
6147 
6148     // FIXME: We need AArch64 specific LLVM intrinsic if we want to specify
6149     // PLDL3STRM or PLDL2STRM.
6150     Value *F = CGM.getIntrinsic(Intrinsic::prefetch);
6151     return Builder.CreateCall(F, {Address, RW, Locality, IsData});
6152   }
6153 
6154   if (BuiltinID == AArch64::BI__builtin_arm_rbit) {
6155     assert((getContext().getTypeSize(E->getType()) == 32) &&
6156            "rbit of unusual size!");
6157     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
6158     return Builder.CreateCall(
6159         CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
6160   }
6161   if (BuiltinID == AArch64::BI__builtin_arm_rbit64) {
6162     assert((getContext().getTypeSize(E->getType()) == 64) &&
6163            "rbit of unusual size!");
6164     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
6165     return Builder.CreateCall(
6166         CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
6167   }
6168 
6169   if (BuiltinID == AArch64::BI__clear_cache) {
6170     assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
6171     const FunctionDecl *FD = E->getDirectCallee();
6172     Value *Ops[2];
6173     for (unsigned i = 0; i < 2; i++)
6174       Ops[i] = EmitScalarExpr(E->getArg(i));
6175     llvm::Type *Ty = CGM.getTypes().ConvertType(FD->getType());
6176     llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
6177     StringRef Name = FD->getName();
6178     return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops);
6179   }
6180 
6181   if ((BuiltinID == AArch64::BI__builtin_arm_ldrex ||
6182       BuiltinID == AArch64::BI__builtin_arm_ldaex) &&
6183       getContext().getTypeSize(E->getType()) == 128) {
6184     Function *F = CGM.getIntrinsic(BuiltinID == AArch64::BI__builtin_arm_ldaex
6185                                        ? Intrinsic::aarch64_ldaxp
6186                                        : Intrinsic::aarch64_ldxp);
6187 
6188     Value *LdPtr = EmitScalarExpr(E->getArg(0));
6189     Value *Val = Builder.CreateCall(F, Builder.CreateBitCast(LdPtr, Int8PtrTy),
6190                                     "ldxp");
6191 
6192     Value *Val0 = Builder.CreateExtractValue(Val, 1);
6193     Value *Val1 = Builder.CreateExtractValue(Val, 0);
6194     llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
6195     Val0 = Builder.CreateZExt(Val0, Int128Ty);
6196     Val1 = Builder.CreateZExt(Val1, Int128Ty);
6197 
6198     Value *ShiftCst = llvm::ConstantInt::get(Int128Ty, 64);
6199     Val = Builder.CreateShl(Val0, ShiftCst, "shl", true /* nuw */);
6200     Val = Builder.CreateOr(Val, Val1);
6201     return Builder.CreateBitCast(Val, ConvertType(E->getType()));
6202   } else if (BuiltinID == AArch64::BI__builtin_arm_ldrex ||
6203              BuiltinID == AArch64::BI__builtin_arm_ldaex) {
6204     Value *LoadAddr = EmitScalarExpr(E->getArg(0));
6205 
6206     QualType Ty = E->getType();
6207     llvm::Type *RealResTy = ConvertType(Ty);
6208     llvm::Type *PtrTy = llvm::IntegerType::get(
6209         getLLVMContext(), getContext().getTypeSize(Ty))->getPointerTo();
6210     LoadAddr = Builder.CreateBitCast(LoadAddr, PtrTy);
6211 
6212     Function *F = CGM.getIntrinsic(BuiltinID == AArch64::BI__builtin_arm_ldaex
6213                                        ? Intrinsic::aarch64_ldaxr
6214                                        : Intrinsic::aarch64_ldxr,
6215                                    PtrTy);
6216     Value *Val = Builder.CreateCall(F, LoadAddr, "ldxr");
6217 
6218     if (RealResTy->isPointerTy())
6219       return Builder.CreateIntToPtr(Val, RealResTy);
6220 
6221     llvm::Type *IntResTy = llvm::IntegerType::get(
6222         getLLVMContext(), CGM.getDataLayout().getTypeSizeInBits(RealResTy));
6223     Val = Builder.CreateTruncOrBitCast(Val, IntResTy);
6224     return Builder.CreateBitCast(Val, RealResTy);
6225   }
6226 
6227   if ((BuiltinID == AArch64::BI__builtin_arm_strex ||
6228        BuiltinID == AArch64::BI__builtin_arm_stlex) &&
6229       getContext().getTypeSize(E->getArg(0)->getType()) == 128) {
6230     Function *F = CGM.getIntrinsic(BuiltinID == AArch64::BI__builtin_arm_stlex
6231                                        ? Intrinsic::aarch64_stlxp
6232                                        : Intrinsic::aarch64_stxp);
6233     llvm::Type *STy = llvm::StructType::get(Int64Ty, Int64Ty);
6234 
6235     Address Tmp = CreateMemTemp(E->getArg(0)->getType());
6236     EmitAnyExprToMem(E->getArg(0), Tmp, Qualifiers(), /*init*/ true);
6237 
6238     Tmp = Builder.CreateBitCast(Tmp, llvm::PointerType::getUnqual(STy));
6239     llvm::Value *Val = Builder.CreateLoad(Tmp);
6240 
6241     Value *Arg0 = Builder.CreateExtractValue(Val, 0);
6242     Value *Arg1 = Builder.CreateExtractValue(Val, 1);
6243     Value *StPtr = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)),
6244                                          Int8PtrTy);
6245     return Builder.CreateCall(F, {Arg0, Arg1, StPtr}, "stxp");
6246   }
6247 
6248   if (BuiltinID == AArch64::BI__builtin_arm_strex ||
6249       BuiltinID == AArch64::BI__builtin_arm_stlex) {
6250     Value *StoreVal = EmitScalarExpr(E->getArg(0));
6251     Value *StoreAddr = EmitScalarExpr(E->getArg(1));
6252 
6253     QualType Ty = E->getArg(0)->getType();
6254     llvm::Type *StoreTy = llvm::IntegerType::get(getLLVMContext(),
6255                                                  getContext().getTypeSize(Ty));
6256     StoreAddr = Builder.CreateBitCast(StoreAddr, StoreTy->getPointerTo());
6257 
6258     if (StoreVal->getType()->isPointerTy())
6259       StoreVal = Builder.CreatePtrToInt(StoreVal, Int64Ty);
6260     else {
6261       llvm::Type *IntTy = llvm::IntegerType::get(
6262           getLLVMContext(),
6263           CGM.getDataLayout().getTypeSizeInBits(StoreVal->getType()));
6264       StoreVal = Builder.CreateBitCast(StoreVal, IntTy);
6265       StoreVal = Builder.CreateZExtOrBitCast(StoreVal, Int64Ty);
6266     }
6267 
6268     Function *F = CGM.getIntrinsic(BuiltinID == AArch64::BI__builtin_arm_stlex
6269                                        ? Intrinsic::aarch64_stlxr
6270                                        : Intrinsic::aarch64_stxr,
6271                                    StoreAddr->getType());
6272     return Builder.CreateCall(F, {StoreVal, StoreAddr}, "stxr");
6273   }
6274 
6275   if (BuiltinID == AArch64::BI__builtin_arm_clrex) {
6276     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_clrex);
6277     return Builder.CreateCall(F);
6278   }
6279 
6280   // CRC32
6281   Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic;
6282   switch (BuiltinID) {
6283   case AArch64::BI__builtin_arm_crc32b:
6284     CRCIntrinsicID = Intrinsic::aarch64_crc32b; break;
6285   case AArch64::BI__builtin_arm_crc32cb:
6286     CRCIntrinsicID = Intrinsic::aarch64_crc32cb; break;
6287   case AArch64::BI__builtin_arm_crc32h:
6288     CRCIntrinsicID = Intrinsic::aarch64_crc32h; break;
6289   case AArch64::BI__builtin_arm_crc32ch:
6290     CRCIntrinsicID = Intrinsic::aarch64_crc32ch; break;
6291   case AArch64::BI__builtin_arm_crc32w:
6292     CRCIntrinsicID = Intrinsic::aarch64_crc32w; break;
6293   case AArch64::BI__builtin_arm_crc32cw:
6294     CRCIntrinsicID = Intrinsic::aarch64_crc32cw; break;
6295   case AArch64::BI__builtin_arm_crc32d:
6296     CRCIntrinsicID = Intrinsic::aarch64_crc32x; break;
6297   case AArch64::BI__builtin_arm_crc32cd:
6298     CRCIntrinsicID = Intrinsic::aarch64_crc32cx; break;
6299   }
6300 
6301   if (CRCIntrinsicID != Intrinsic::not_intrinsic) {
6302     Value *Arg0 = EmitScalarExpr(E->getArg(0));
6303     Value *Arg1 = EmitScalarExpr(E->getArg(1));
6304     Function *F = CGM.getIntrinsic(CRCIntrinsicID);
6305 
6306     llvm::Type *DataTy = F->getFunctionType()->getParamType(1);
6307     Arg1 = Builder.CreateZExtOrBitCast(Arg1, DataTy);
6308 
6309     return Builder.CreateCall(F, {Arg0, Arg1});
6310   }
6311 
6312   if (BuiltinID == AArch64::BI__builtin_arm_rsr ||
6313       BuiltinID == AArch64::BI__builtin_arm_rsr64 ||
6314       BuiltinID == AArch64::BI__builtin_arm_rsrp ||
6315       BuiltinID == AArch64::BI__builtin_arm_wsr ||
6316       BuiltinID == AArch64::BI__builtin_arm_wsr64 ||
6317       BuiltinID == AArch64::BI__builtin_arm_wsrp) {
6318 
6319     bool IsRead = BuiltinID == AArch64::BI__builtin_arm_rsr ||
6320                   BuiltinID == AArch64::BI__builtin_arm_rsr64 ||
6321                   BuiltinID == AArch64::BI__builtin_arm_rsrp;
6322 
6323     bool IsPointerBuiltin = BuiltinID == AArch64::BI__builtin_arm_rsrp ||
6324                             BuiltinID == AArch64::BI__builtin_arm_wsrp;
6325 
6326     bool Is64Bit = BuiltinID != AArch64::BI__builtin_arm_rsr &&
6327                    BuiltinID != AArch64::BI__builtin_arm_wsr;
6328 
6329     llvm::Type *ValueType;
6330     llvm::Type *RegisterType = Int64Ty;
6331     if (IsPointerBuiltin) {
6332       ValueType = VoidPtrTy;
6333     } else if (Is64Bit) {
6334       ValueType = Int64Ty;
6335     } else {
6336       ValueType = Int32Ty;
6337     }
6338 
6339     return EmitSpecialRegisterBuiltin(*this, E, RegisterType, ValueType, IsRead);
6340   }
6341 
6342   // Find out if any arguments are required to be integer constant
6343   // expressions.
6344   unsigned ICEArguments = 0;
6345   ASTContext::GetBuiltinTypeError Error;
6346   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
6347   assert(Error == ASTContext::GE_None && "Should not codegen an error");
6348 
6349   llvm::SmallVector<Value*, 4> Ops;
6350   for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++) {
6351     if ((ICEArguments & (1 << i)) == 0) {
6352       Ops.push_back(EmitScalarExpr(E->getArg(i)));
6353     } else {
6354       // If this is required to be a constant, constant fold it so that we know
6355       // that the generated intrinsic gets a ConstantInt.
6356       llvm::APSInt Result;
6357       bool IsConst = E->getArg(i)->isIntegerConstantExpr(Result, getContext());
6358       assert(IsConst && "Constant arg isn't actually constant?");
6359       (void)IsConst;
6360       Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), Result));
6361     }
6362   }
6363 
6364   auto SISDMap = makeArrayRef(AArch64SISDIntrinsicMap);
6365   const NeonIntrinsicInfo *Builtin = findNeonIntrinsicInMap(
6366       SISDMap, BuiltinID, AArch64SISDIntrinsicsProvenSorted);
6367 
6368   if (Builtin) {
6369     Ops.push_back(EmitScalarExpr(E->getArg(E->getNumArgs() - 1)));
6370     Value *Result = EmitCommonNeonSISDBuiltinExpr(*this, *Builtin, Ops, E);
6371     assert(Result && "SISD intrinsic should have been handled");
6372     return Result;
6373   }
6374 
6375   llvm::APSInt Result;
6376   const Expr *Arg = E->getArg(E->getNumArgs()-1);
6377   NeonTypeFlags Type(0);
6378   if (Arg->isIntegerConstantExpr(Result, getContext()))
6379     // Determine the type of this overloaded NEON intrinsic.
6380     Type = NeonTypeFlags(Result.getZExtValue());
6381 
6382   bool usgn = Type.isUnsigned();
6383   bool quad = Type.isQuad();
6384 
6385   // Handle non-overloaded intrinsics first.
6386   switch (BuiltinID) {
6387   default: break;
6388   case NEON::BI__builtin_neon_vabsh_f16:
6389     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6390     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, HalfTy), Ops, "vabs");
6391   case NEON::BI__builtin_neon_vldrq_p128: {
6392     llvm::Type *Int128Ty = llvm::Type::getIntNTy(getLLVMContext(), 128);
6393     llvm::Type *Int128PTy = llvm::PointerType::get(Int128Ty, 0);
6394     Value *Ptr = Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), Int128PTy);
6395     return Builder.CreateAlignedLoad(Int128Ty, Ptr,
6396                                      CharUnits::fromQuantity(16));
6397   }
6398   case NEON::BI__builtin_neon_vstrq_p128: {
6399     llvm::Type *Int128PTy = llvm::Type::getIntNPtrTy(getLLVMContext(), 128);
6400     Value *Ptr = Builder.CreateBitCast(Ops[0], Int128PTy);
6401     return Builder.CreateDefaultAlignedStore(EmitScalarExpr(E->getArg(1)), Ptr);
6402   }
6403   case NEON::BI__builtin_neon_vcvts_u32_f32:
6404   case NEON::BI__builtin_neon_vcvtd_u64_f64:
6405     usgn = true;
6406     LLVM_FALLTHROUGH;
6407   case NEON::BI__builtin_neon_vcvts_s32_f32:
6408   case NEON::BI__builtin_neon_vcvtd_s64_f64: {
6409     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6410     bool Is64 = Ops[0]->getType()->getPrimitiveSizeInBits() == 64;
6411     llvm::Type *InTy = Is64 ? Int64Ty : Int32Ty;
6412     llvm::Type *FTy = Is64 ? DoubleTy : FloatTy;
6413     Ops[0] = Builder.CreateBitCast(Ops[0], FTy);
6414     if (usgn)
6415       return Builder.CreateFPToUI(Ops[0], InTy);
6416     return Builder.CreateFPToSI(Ops[0], InTy);
6417   }
6418   case NEON::BI__builtin_neon_vcvts_f32_u32:
6419   case NEON::BI__builtin_neon_vcvtd_f64_u64:
6420     usgn = true;
6421     LLVM_FALLTHROUGH;
6422   case NEON::BI__builtin_neon_vcvts_f32_s32:
6423   case NEON::BI__builtin_neon_vcvtd_f64_s64: {
6424     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6425     bool Is64 = Ops[0]->getType()->getPrimitiveSizeInBits() == 64;
6426     llvm::Type *InTy = Is64 ? Int64Ty : Int32Ty;
6427     llvm::Type *FTy = Is64 ? DoubleTy : FloatTy;
6428     Ops[0] = Builder.CreateBitCast(Ops[0], InTy);
6429     if (usgn)
6430       return Builder.CreateUIToFP(Ops[0], FTy);
6431     return Builder.CreateSIToFP(Ops[0], FTy);
6432   }
6433   case NEON::BI__builtin_neon_vcvth_f16_u16:
6434   case NEON::BI__builtin_neon_vcvth_f16_u32:
6435   case NEON::BI__builtin_neon_vcvth_f16_u64:
6436     usgn = true;
6437     // FALL THROUGH
6438   case NEON::BI__builtin_neon_vcvth_f16_s16:
6439   case NEON::BI__builtin_neon_vcvth_f16_s32:
6440   case NEON::BI__builtin_neon_vcvth_f16_s64: {
6441     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6442     llvm::Type *FTy = HalfTy;
6443     llvm::Type *InTy;
6444     if (Ops[0]->getType()->getPrimitiveSizeInBits() == 64)
6445       InTy = Int64Ty;
6446     else if (Ops[0]->getType()->getPrimitiveSizeInBits() == 32)
6447       InTy = Int32Ty;
6448     else
6449       InTy = Int16Ty;
6450     Ops[0] = Builder.CreateBitCast(Ops[0], InTy);
6451     if (usgn)
6452       return Builder.CreateUIToFP(Ops[0], FTy);
6453     return Builder.CreateSIToFP(Ops[0], FTy);
6454   }
6455   case NEON::BI__builtin_neon_vcvth_u16_f16:
6456     usgn = true;
6457     // FALL THROUGH
6458   case NEON::BI__builtin_neon_vcvth_s16_f16: {
6459     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6460     Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
6461     if (usgn)
6462       return Builder.CreateFPToUI(Ops[0], Int16Ty);
6463     return Builder.CreateFPToSI(Ops[0], Int16Ty);
6464   }
6465   case NEON::BI__builtin_neon_vcvth_u32_f16:
6466     usgn = true;
6467     // FALL THROUGH
6468   case NEON::BI__builtin_neon_vcvth_s32_f16: {
6469     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6470     Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
6471     if (usgn)
6472       return Builder.CreateFPToUI(Ops[0], Int32Ty);
6473     return Builder.CreateFPToSI(Ops[0], Int32Ty);
6474   }
6475   case NEON::BI__builtin_neon_vcvth_u64_f16:
6476     usgn = true;
6477     // FALL THROUGH
6478   case NEON::BI__builtin_neon_vcvth_s64_f16: {
6479     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6480     Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
6481     if (usgn)
6482       return Builder.CreateFPToUI(Ops[0], Int64Ty);
6483     return Builder.CreateFPToSI(Ops[0], Int64Ty);
6484   }
6485   case NEON::BI__builtin_neon_vcvtah_u16_f16:
6486   case NEON::BI__builtin_neon_vcvtmh_u16_f16:
6487   case NEON::BI__builtin_neon_vcvtnh_u16_f16:
6488   case NEON::BI__builtin_neon_vcvtph_u16_f16:
6489   case NEON::BI__builtin_neon_vcvtah_s16_f16:
6490   case NEON::BI__builtin_neon_vcvtmh_s16_f16:
6491   case NEON::BI__builtin_neon_vcvtnh_s16_f16:
6492   case NEON::BI__builtin_neon_vcvtph_s16_f16: {
6493     unsigned Int;
6494     llvm::Type* InTy = Int32Ty;
6495     llvm::Type* FTy  = HalfTy;
6496     llvm::Type *Tys[2] = {InTy, FTy};
6497     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6498     switch (BuiltinID) {
6499     default: llvm_unreachable("missing builtin ID in switch!");
6500     case NEON::BI__builtin_neon_vcvtah_u16_f16:
6501       Int = Intrinsic::aarch64_neon_fcvtau; break;
6502     case NEON::BI__builtin_neon_vcvtmh_u16_f16:
6503       Int = Intrinsic::aarch64_neon_fcvtmu; break;
6504     case NEON::BI__builtin_neon_vcvtnh_u16_f16:
6505       Int = Intrinsic::aarch64_neon_fcvtnu; break;
6506     case NEON::BI__builtin_neon_vcvtph_u16_f16:
6507       Int = Intrinsic::aarch64_neon_fcvtpu; break;
6508     case NEON::BI__builtin_neon_vcvtah_s16_f16:
6509       Int = Intrinsic::aarch64_neon_fcvtas; break;
6510     case NEON::BI__builtin_neon_vcvtmh_s16_f16:
6511       Int = Intrinsic::aarch64_neon_fcvtms; break;
6512     case NEON::BI__builtin_neon_vcvtnh_s16_f16:
6513       Int = Intrinsic::aarch64_neon_fcvtns; break;
6514     case NEON::BI__builtin_neon_vcvtph_s16_f16:
6515       Int = Intrinsic::aarch64_neon_fcvtps; break;
6516     }
6517     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvt");
6518     return Builder.CreateTrunc(Ops[0], Int16Ty);
6519   }
6520   case NEON::BI__builtin_neon_vcaleh_f16:
6521   case NEON::BI__builtin_neon_vcalth_f16:
6522   case NEON::BI__builtin_neon_vcageh_f16:
6523   case NEON::BI__builtin_neon_vcagth_f16: {
6524     unsigned Int;
6525     llvm::Type* InTy = Int32Ty;
6526     llvm::Type* FTy  = HalfTy;
6527     llvm::Type *Tys[2] = {InTy, FTy};
6528     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6529     switch (BuiltinID) {
6530     default: llvm_unreachable("missing builtin ID in switch!");
6531     case NEON::BI__builtin_neon_vcageh_f16:
6532       Int = Intrinsic::aarch64_neon_facge; break;
6533     case NEON::BI__builtin_neon_vcagth_f16:
6534       Int = Intrinsic::aarch64_neon_facgt; break;
6535     case NEON::BI__builtin_neon_vcaleh_f16:
6536       Int = Intrinsic::aarch64_neon_facge; std::swap(Ops[0], Ops[1]); break;
6537     case NEON::BI__builtin_neon_vcalth_f16:
6538       Int = Intrinsic::aarch64_neon_facgt; std::swap(Ops[0], Ops[1]); break;
6539     }
6540     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "facg");
6541     return Builder.CreateTrunc(Ops[0], Int16Ty);
6542   }
6543   case NEON::BI__builtin_neon_vcvth_n_s16_f16:
6544   case NEON::BI__builtin_neon_vcvth_n_u16_f16: {
6545     unsigned Int;
6546     llvm::Type* InTy = Int32Ty;
6547     llvm::Type* FTy  = HalfTy;
6548     llvm::Type *Tys[2] = {InTy, FTy};
6549     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6550     switch (BuiltinID) {
6551     default: llvm_unreachable("missing builtin ID in switch!");
6552     case NEON::BI__builtin_neon_vcvth_n_s16_f16:
6553       Int = Intrinsic::aarch64_neon_vcvtfp2fxs; break;
6554     case NEON::BI__builtin_neon_vcvth_n_u16_f16:
6555       Int = Intrinsic::aarch64_neon_vcvtfp2fxu; break;
6556     }
6557     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvth_n");
6558     return Builder.CreateTrunc(Ops[0], Int16Ty);
6559   }
6560   case NEON::BI__builtin_neon_vcvth_n_f16_s16:
6561   case NEON::BI__builtin_neon_vcvth_n_f16_u16: {
6562     unsigned Int;
6563     llvm::Type* FTy  = HalfTy;
6564     llvm::Type* InTy = Int32Ty;
6565     llvm::Type *Tys[2] = {FTy, InTy};
6566     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6567     switch (BuiltinID) {
6568     default: llvm_unreachable("missing builtin ID in switch!");
6569     case NEON::BI__builtin_neon_vcvth_n_f16_s16:
6570       Int = Intrinsic::aarch64_neon_vcvtfxs2fp;
6571       Ops[0] = Builder.CreateSExt(Ops[0], InTy, "sext");
6572       break;
6573     case NEON::BI__builtin_neon_vcvth_n_f16_u16:
6574       Int = Intrinsic::aarch64_neon_vcvtfxu2fp;
6575       Ops[0] = Builder.CreateZExt(Ops[0], InTy);
6576       break;
6577     }
6578     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvth_n");
6579   }
6580   case NEON::BI__builtin_neon_vpaddd_s64: {
6581     llvm::Type *Ty = llvm::VectorType::get(Int64Ty, 2);
6582     Value *Vec = EmitScalarExpr(E->getArg(0));
6583     // The vector is v2f64, so make sure it's bitcast to that.
6584     Vec = Builder.CreateBitCast(Vec, Ty, "v2i64");
6585     llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
6586     llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
6587     Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
6588     Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
6589     // Pairwise addition of a v2f64 into a scalar f64.
6590     return Builder.CreateAdd(Op0, Op1, "vpaddd");
6591   }
6592   case NEON::BI__builtin_neon_vpaddd_f64: {
6593     llvm::Type *Ty =
6594       llvm::VectorType::get(DoubleTy, 2);
6595     Value *Vec = EmitScalarExpr(E->getArg(0));
6596     // The vector is v2f64, so make sure it's bitcast to that.
6597     Vec = Builder.CreateBitCast(Vec, Ty, "v2f64");
6598     llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
6599     llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
6600     Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
6601     Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
6602     // Pairwise addition of a v2f64 into a scalar f64.
6603     return Builder.CreateFAdd(Op0, Op1, "vpaddd");
6604   }
6605   case NEON::BI__builtin_neon_vpadds_f32: {
6606     llvm::Type *Ty =
6607       llvm::VectorType::get(FloatTy, 2);
6608     Value *Vec = EmitScalarExpr(E->getArg(0));
6609     // The vector is v2f32, so make sure it's bitcast to that.
6610     Vec = Builder.CreateBitCast(Vec, Ty, "v2f32");
6611     llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
6612     llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
6613     Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
6614     Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
6615     // Pairwise addition of a v2f32 into a scalar f32.
6616     return Builder.CreateFAdd(Op0, Op1, "vpaddd");
6617   }
6618   case NEON::BI__builtin_neon_vceqzd_s64:
6619   case NEON::BI__builtin_neon_vceqzd_f64:
6620   case NEON::BI__builtin_neon_vceqzs_f32:
6621   case NEON::BI__builtin_neon_vceqzh_f16:
6622     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6623     return EmitAArch64CompareBuiltinExpr(
6624         Ops[0], ConvertType(E->getCallReturnType(getContext())),
6625         ICmpInst::FCMP_OEQ, ICmpInst::ICMP_EQ, "vceqz");
6626   case NEON::BI__builtin_neon_vcgezd_s64:
6627   case NEON::BI__builtin_neon_vcgezd_f64:
6628   case NEON::BI__builtin_neon_vcgezs_f32:
6629   case NEON::BI__builtin_neon_vcgezh_f16:
6630     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6631     return EmitAArch64CompareBuiltinExpr(
6632         Ops[0], ConvertType(E->getCallReturnType(getContext())),
6633         ICmpInst::FCMP_OGE, ICmpInst::ICMP_SGE, "vcgez");
6634   case NEON::BI__builtin_neon_vclezd_s64:
6635   case NEON::BI__builtin_neon_vclezd_f64:
6636   case NEON::BI__builtin_neon_vclezs_f32:
6637   case NEON::BI__builtin_neon_vclezh_f16:
6638     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6639     return EmitAArch64CompareBuiltinExpr(
6640         Ops[0], ConvertType(E->getCallReturnType(getContext())),
6641         ICmpInst::FCMP_OLE, ICmpInst::ICMP_SLE, "vclez");
6642   case NEON::BI__builtin_neon_vcgtzd_s64:
6643   case NEON::BI__builtin_neon_vcgtzd_f64:
6644   case NEON::BI__builtin_neon_vcgtzs_f32:
6645   case NEON::BI__builtin_neon_vcgtzh_f16:
6646     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6647     return EmitAArch64CompareBuiltinExpr(
6648         Ops[0], ConvertType(E->getCallReturnType(getContext())),
6649         ICmpInst::FCMP_OGT, ICmpInst::ICMP_SGT, "vcgtz");
6650   case NEON::BI__builtin_neon_vcltzd_s64:
6651   case NEON::BI__builtin_neon_vcltzd_f64:
6652   case NEON::BI__builtin_neon_vcltzs_f32:
6653   case NEON::BI__builtin_neon_vcltzh_f16:
6654     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6655     return EmitAArch64CompareBuiltinExpr(
6656         Ops[0], ConvertType(E->getCallReturnType(getContext())),
6657         ICmpInst::FCMP_OLT, ICmpInst::ICMP_SLT, "vcltz");
6658 
6659   case NEON::BI__builtin_neon_vceqzd_u64: {
6660     Ops.push_back(EmitScalarExpr(E->getArg(0)));
6661     Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
6662     Ops[0] =
6663         Builder.CreateICmpEQ(Ops[0], llvm::Constant::getNullValue(Int64Ty));
6664     return Builder.CreateSExt(Ops[0], Int64Ty, "vceqzd");
6665   }
6666   case NEON::BI__builtin_neon_vceqd_f64:
6667   case NEON::BI__builtin_neon_vcled_f64:
6668   case NEON::BI__builtin_neon_vcltd_f64:
6669   case NEON::BI__builtin_neon_vcged_f64:
6670   case NEON::BI__builtin_neon_vcgtd_f64: {
6671     llvm::CmpInst::Predicate P;
6672     switch (BuiltinID) {
6673     default: llvm_unreachable("missing builtin ID in switch!");
6674     case NEON::BI__builtin_neon_vceqd_f64: P = llvm::FCmpInst::FCMP_OEQ; break;
6675     case NEON::BI__builtin_neon_vcled_f64: P = llvm::FCmpInst::FCMP_OLE; break;
6676     case NEON::BI__builtin_neon_vcltd_f64: P = llvm::FCmpInst::FCMP_OLT; break;
6677     case NEON::BI__builtin_neon_vcged_f64: P = llvm::FCmpInst::FCMP_OGE; break;
6678     case NEON::BI__builtin_neon_vcgtd_f64: P = llvm::FCmpInst::FCMP_OGT; break;
6679     }
6680     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6681     Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
6682     Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy);
6683     Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
6684     return Builder.CreateSExt(Ops[0], Int64Ty, "vcmpd");
6685   }
6686   case NEON::BI__builtin_neon_vceqs_f32:
6687   case NEON::BI__builtin_neon_vcles_f32:
6688   case NEON::BI__builtin_neon_vclts_f32:
6689   case NEON::BI__builtin_neon_vcges_f32:
6690   case NEON::BI__builtin_neon_vcgts_f32: {
6691     llvm::CmpInst::Predicate P;
6692     switch (BuiltinID) {
6693     default: llvm_unreachable("missing builtin ID in switch!");
6694     case NEON::BI__builtin_neon_vceqs_f32: P = llvm::FCmpInst::FCMP_OEQ; break;
6695     case NEON::BI__builtin_neon_vcles_f32: P = llvm::FCmpInst::FCMP_OLE; break;
6696     case NEON::BI__builtin_neon_vclts_f32: P = llvm::FCmpInst::FCMP_OLT; break;
6697     case NEON::BI__builtin_neon_vcges_f32: P = llvm::FCmpInst::FCMP_OGE; break;
6698     case NEON::BI__builtin_neon_vcgts_f32: P = llvm::FCmpInst::FCMP_OGT; break;
6699     }
6700     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6701     Ops[0] = Builder.CreateBitCast(Ops[0], FloatTy);
6702     Ops[1] = Builder.CreateBitCast(Ops[1], FloatTy);
6703     Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
6704     return Builder.CreateSExt(Ops[0], Int32Ty, "vcmpd");
6705   }
6706   case NEON::BI__builtin_neon_vceqh_f16:
6707   case NEON::BI__builtin_neon_vcleh_f16:
6708   case NEON::BI__builtin_neon_vclth_f16:
6709   case NEON::BI__builtin_neon_vcgeh_f16:
6710   case NEON::BI__builtin_neon_vcgth_f16: {
6711     llvm::CmpInst::Predicate P;
6712     switch (BuiltinID) {
6713     default: llvm_unreachable("missing builtin ID in switch!");
6714     case NEON::BI__builtin_neon_vceqh_f16: P = llvm::FCmpInst::FCMP_OEQ; break;
6715     case NEON::BI__builtin_neon_vcleh_f16: P = llvm::FCmpInst::FCMP_OLE; break;
6716     case NEON::BI__builtin_neon_vclth_f16: P = llvm::FCmpInst::FCMP_OLT; break;
6717     case NEON::BI__builtin_neon_vcgeh_f16: P = llvm::FCmpInst::FCMP_OGE; break;
6718     case NEON::BI__builtin_neon_vcgth_f16: P = llvm::FCmpInst::FCMP_OGT; break;
6719     }
6720     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6721     Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
6722     Ops[1] = Builder.CreateBitCast(Ops[1], HalfTy);
6723     Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
6724     return Builder.CreateSExt(Ops[0], Int16Ty, "vcmpd");
6725   }
6726   case NEON::BI__builtin_neon_vceqd_s64:
6727   case NEON::BI__builtin_neon_vceqd_u64:
6728   case NEON::BI__builtin_neon_vcgtd_s64:
6729   case NEON::BI__builtin_neon_vcgtd_u64:
6730   case NEON::BI__builtin_neon_vcltd_s64:
6731   case NEON::BI__builtin_neon_vcltd_u64:
6732   case NEON::BI__builtin_neon_vcged_u64:
6733   case NEON::BI__builtin_neon_vcged_s64:
6734   case NEON::BI__builtin_neon_vcled_u64:
6735   case NEON::BI__builtin_neon_vcled_s64: {
6736     llvm::CmpInst::Predicate P;
6737     switch (BuiltinID) {
6738     default: llvm_unreachable("missing builtin ID in switch!");
6739     case NEON::BI__builtin_neon_vceqd_s64:
6740     case NEON::BI__builtin_neon_vceqd_u64:P = llvm::ICmpInst::ICMP_EQ;break;
6741     case NEON::BI__builtin_neon_vcgtd_s64:P = llvm::ICmpInst::ICMP_SGT;break;
6742     case NEON::BI__builtin_neon_vcgtd_u64:P = llvm::ICmpInst::ICMP_UGT;break;
6743     case NEON::BI__builtin_neon_vcltd_s64:P = llvm::ICmpInst::ICMP_SLT;break;
6744     case NEON::BI__builtin_neon_vcltd_u64:P = llvm::ICmpInst::ICMP_ULT;break;
6745     case NEON::BI__builtin_neon_vcged_u64:P = llvm::ICmpInst::ICMP_UGE;break;
6746     case NEON::BI__builtin_neon_vcged_s64:P = llvm::ICmpInst::ICMP_SGE;break;
6747     case NEON::BI__builtin_neon_vcled_u64:P = llvm::ICmpInst::ICMP_ULE;break;
6748     case NEON::BI__builtin_neon_vcled_s64:P = llvm::ICmpInst::ICMP_SLE;break;
6749     }
6750     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6751     Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
6752     Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
6753     Ops[0] = Builder.CreateICmp(P, Ops[0], Ops[1]);
6754     return Builder.CreateSExt(Ops[0], Int64Ty, "vceqd");
6755   }
6756   case NEON::BI__builtin_neon_vtstd_s64:
6757   case NEON::BI__builtin_neon_vtstd_u64: {
         // Scalar 64-bit bit test: AND the two operands as i64, compare the
         // result against zero with ICMP_NE, and sign-extend the i1 to i64.
6758     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6759     Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
6760     Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
6761     Ops[0] = Builder.CreateAnd(Ops[0], Ops[1]);
6762     Ops[0] = Builder.CreateICmp(ICmpInst::ICMP_NE, Ops[0],
6763                                 llvm::Constant::getNullValue(Int64Ty));
6764     return Builder.CreateSExt(Ops[0], Int64Ty, "vtstd");
6765   }
6766   case NEON::BI__builtin_neon_vset_lane_i8:
6767   case NEON::BI__builtin_neon_vset_lane_i16:
6768   case NEON::BI__builtin_neon_vset_lane_i32:
6769   case NEON::BI__builtin_neon_vset_lane_i64:
6770   case NEON::BI__builtin_neon_vset_lane_f32:
6771   case NEON::BI__builtin_neon_vsetq_lane_i8:
6772   case NEON::BI__builtin_neon_vsetq_lane_i16:
6773   case NEON::BI__builtin_neon_vsetq_lane_i32:
6774   case NEON::BI__builtin_neon_vsetq_lane_i64:
6775   case NEON::BI__builtin_neon_vsetq_lane_f32:
         // Lane insert: argument 2 (the lane index) is emitted here, then the
         // scalar Ops[0] is inserted into the vector Ops[1] at that index.
6776     Ops.push_back(EmitScalarExpr(E->getArg(2)));
6777     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
6778   case NEON::BI__builtin_neon_vset_lane_f64:
6779     // The vector type needs a cast for the v1f64 variant.
6780     Ops[1] = Builder.CreateBitCast(Ops[1],
6781                                    llvm::VectorType::get(DoubleTy, 1));
6782     Ops.push_back(EmitScalarExpr(E->getArg(2)));
6783     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
6784   case NEON::BI__builtin_neon_vsetq_lane_f64:
6785     // The vector type needs a cast for the v2f64 variant.
6786     Ops[1] = Builder.CreateBitCast(Ops[1],
6787         llvm::VectorType::get(DoubleTy, 2));
6788     Ops.push_back(EmitScalarExpr(E->getArg(2)));
6789     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
6790 
6791   case NEON::BI__builtin_neon_vget_lane_i8:
6792   case NEON::BI__builtin_neon_vdupb_lane_i8:
         // Lane extract family (vget[q]_lane_* and the scalar vdup*_lane[q]_*
         // forms). Every case below follows the same pattern: bitcast Ops[0] to
         // the concrete vector type for that builtin (element type x lane
         // count), then extract the element selected by argument 1.
6793     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int8Ty, 8));
6794     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6795                                         "vget_lane");
6796   case NEON::BI__builtin_neon_vgetq_lane_i8:
6797   case NEON::BI__builtin_neon_vdupb_laneq_i8:
6798     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int8Ty, 16));
6799     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6800                                         "vgetq_lane");
6801   case NEON::BI__builtin_neon_vget_lane_i16:
6802   case NEON::BI__builtin_neon_vduph_lane_i16:
6803     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int16Ty, 4));
6804     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6805                                         "vget_lane");
6806   case NEON::BI__builtin_neon_vgetq_lane_i16:
6807   case NEON::BI__builtin_neon_vduph_laneq_i16:
6808     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int16Ty, 8));
6809     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6810                                         "vgetq_lane");
6811   case NEON::BI__builtin_neon_vget_lane_i32:
6812   case NEON::BI__builtin_neon_vdups_lane_i32:
6813     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int32Ty, 2));
6814     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6815                                         "vget_lane");
6816   case NEON::BI__builtin_neon_vdups_lane_f32:
6817     Ops[0] = Builder.CreateBitCast(Ops[0],
6818         llvm::VectorType::get(FloatTy, 2));
6819     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6820                                         "vdups_lane");
6821   case NEON::BI__builtin_neon_vgetq_lane_i32:
6822   case NEON::BI__builtin_neon_vdups_laneq_i32:
6823     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int32Ty, 4));
6824     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6825                                         "vgetq_lane");
6826   case NEON::BI__builtin_neon_vget_lane_i64:
6827   case NEON::BI__builtin_neon_vdupd_lane_i64:
6828     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int64Ty, 1));
6829     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6830                                         "vget_lane");
6831   case NEON::BI__builtin_neon_vdupd_lane_f64:
6832     Ops[0] = Builder.CreateBitCast(Ops[0],
6833         llvm::VectorType::get(DoubleTy, 1));
6834     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6835                                         "vdupd_lane");
6836   case NEON::BI__builtin_neon_vgetq_lane_i64:
6837   case NEON::BI__builtin_neon_vdupd_laneq_i64:
6838     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int64Ty, 2));
6839     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6840                                         "vgetq_lane");
6841   case NEON::BI__builtin_neon_vget_lane_f32:
6842     Ops[0] = Builder.CreateBitCast(Ops[0],
6843         llvm::VectorType::get(FloatTy, 2));
6844     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6845                                         "vget_lane");
6846   case NEON::BI__builtin_neon_vget_lane_f64:
6847     Ops[0] = Builder.CreateBitCast(Ops[0],
6848         llvm::VectorType::get(DoubleTy, 1));
6849     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6850                                         "vget_lane");
6851   case NEON::BI__builtin_neon_vgetq_lane_f32:
6852   case NEON::BI__builtin_neon_vdups_laneq_f32:
6853     Ops[0] = Builder.CreateBitCast(Ops[0],
6854         llvm::VectorType::get(FloatTy, 4));
6855     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6856                                         "vgetq_lane");
6857   case NEON::BI__builtin_neon_vgetq_lane_f64:
6858   case NEON::BI__builtin_neon_vdupd_laneq_f64:
6859     Ops[0] = Builder.CreateBitCast(Ops[0],
6860         llvm::VectorType::get(DoubleTy, 2));
6861     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
6862                                         "vgetq_lane");
6863   case NEON::BI__builtin_neon_vaddh_f16:
         // Scalar f16 arithmetic: each case emits its second argument, appends
         // it to Ops, and lowers directly to the matching IR floating-point
         // instruction (fadd / fsub / fmul / fdiv).
6864     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6865     return Builder.CreateFAdd(Ops[0], Ops[1], "vaddh");
6866   case NEON::BI__builtin_neon_vsubh_f16:
6867     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6868     return Builder.CreateFSub(Ops[0], Ops[1], "vsubh");
6869   case NEON::BI__builtin_neon_vmulh_f16:
6870     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6871     return Builder.CreateFMul(Ops[0], Ops[1], "vmulh");
6872   case NEON::BI__builtin_neon_vdivh_f16:
6873     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6874     return Builder.CreateFDiv(Ops[0], Ops[1], "vdivh");
6875   case NEON::BI__builtin_neon_vfmah_f16: {
         // Scalar f16 fused multiply-add, lowered to llvm.fma on half:
         // fma(arg1, arg2, Ops[0]).
6876     Value *F = CGM.getIntrinsic(Intrinsic::fma, HalfTy);
6877     // NEON intrinsic puts accumulator first, unlike the LLVM fma.
6878     return Builder.CreateCall(F,
6879       {EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)), Ops[0]});
6880   }
6881   case NEON::BI__builtin_neon_vfmsh_f16: {
         // Scalar f16 fused multiply-subtract: same as vfmah_f16 above, except
         // that argument 1 is first negated by subtracting it from the
         // zero-for-negation constant.
6882     Value *F = CGM.getIntrinsic(Intrinsic::fma, HalfTy);
6883     Value *Zero = llvm::ConstantFP::getZeroValueForNegation(HalfTy);
6884     Value* Sub = Builder.CreateFSub(Zero, EmitScalarExpr(E->getArg(1)), "vsubh");
6885     // NEON intrinsic puts accumulator first, unlike the LLVM fma.
6886     return Builder.CreateCall(F, {Sub, EmitScalarExpr(E->getArg(2)), Ops[0]});
6887   }
6888   case NEON::BI__builtin_neon_vaddd_s64:
6889   case NEON::BI__builtin_neon_vaddd_u64:
         // Scalar 64-bit add: a plain IR add serves both signed and unsigned.
6890     return Builder.CreateAdd(Ops[0], EmitScalarExpr(E->getArg(1)), "vaddd");
6891   case NEON::BI__builtin_neon_vsubd_s64:
6892   case NEON::BI__builtin_neon_vsubd_u64:
         // Scalar 64-bit subtract: a plain IR sub serves both signed and unsigned.
6893     return Builder.CreateSub(Ops[0], EmitScalarExpr(E->getArg(1)), "vsubd");
6894   case NEON::BI__builtin_neon_vqdmlalh_s16:
6895   case NEON::BI__builtin_neon_vqdmlslh_s16: {
         // Scalar saturating doubling multiply-accumulate / -subtract on 16-bit
         // inputs. The product is formed with the vector sqdmull intrinsic
         // (result type <4 x i32>): both 16-bit scalars are wrapped with
         // vectorWrapScalar16, and lane 0 of the result is taken as the product.
6896     SmallVector<Value *, 2> ProductOps;
6897     ProductOps.push_back(vectorWrapScalar16(Ops[1]));
6898     ProductOps.push_back(vectorWrapScalar16(EmitScalarExpr(E->getArg(2))));
6899     llvm::Type *VTy = llvm::VectorType::get(Int32Ty, 4);
6900     Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmull, VTy),
6901                           ProductOps, "vqdmlXl");
6902     Constant *CI = ConstantInt::get(SizeTy, 0);
6903     Ops[1] = Builder.CreateExtractElement(Ops[1], CI, "lane0");
6904 
         // Ops[1] now holds the i32 product; combine it with the other entries
         // of Ops using the saturating add (vqdmlalh) or subtract (vqdmlslh).
6905     unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlalh_s16
6906                                         ? Intrinsic::aarch64_neon_sqadd
6907                                         : Intrinsic::aarch64_neon_sqsub;
6908     return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int32Ty), Ops, "vqdmlXl");
6909   }
6910   case NEON::BI__builtin_neon_vqshlud_n_s64: {
         // Append the shift amount (argument 1), zero-extend it to i64, and
         // call the sqshlu intrinsic on i64.
6911     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6912     Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty);
6913     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqshlu, Int64Ty),
6914                         Ops, "vqshlu_n");
6915   }
6916   case NEON::BI__builtin_neon_vqshld_n_u64:
6917   case NEON::BI__builtin_neon_vqshld_n_s64: {
         // uqshl for the unsigned builtin, sqshl for the signed one; the shift
         // amount is zero-extended to i64 as in the case above.
6918     unsigned Int = BuiltinID == NEON::BI__builtin_neon_vqshld_n_u64
6919                                    ? Intrinsic::aarch64_neon_uqshl
6920                                    : Intrinsic::aarch64_neon_sqshl;
6921     Ops.push_back(EmitScalarExpr(E->getArg(1)));
6922     Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty);
6923     return EmitNeonCall(CGM.getIntrinsic(Int, Int64Ty), Ops, "vqshl_n");
6924   }
6925   case NEON::BI__builtin_neon_vrshrd_n_u64:
6926   case NEON::BI__builtin_neon_vrshrd_n_s64: {
         // The right shift is emitted as a call to the urshl/srshl intrinsic
         // with the negated shift amount.
6927     unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrshrd_n_u64
6928                                    ? Intrinsic::aarch64_neon_urshl
6929                                    : Intrinsic::aarch64_neon_srshl;
6930     Ops.push_back(EmitScalarExpr(E->getArg(1)));
         // cast<> requires the shift amount to have folded to a ConstantInt.
6931     int SV = cast<ConstantInt>(Ops[1])->getSExtValue();
6932     Ops[1] = ConstantInt::get(Int64Ty, -SV);
6933     return EmitNeonCall(CGM.getIntrinsic(Int, Int64Ty), Ops, "vrshr_n");
6934   }
6935   case NEON::BI__builtin_neon_vrsrad_n_u64:
6936   case NEON::BI__builtin_neon_vrsrad_n_s64: {
         // Shift-right-and-accumulate: Ops[1] is shifted via urshl/srshl with
         // the negated, sign-extended amount from argument 2, and the result is
         // added to Ops[0].
6937     unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrsrad_n_u64
6938                                    ? Intrinsic::aarch64_neon_urshl
6939                                    : Intrinsic::aarch64_neon_srshl;
6940     Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
6941     Ops.push_back(Builder.CreateNeg(EmitScalarExpr(E->getArg(2))));
6942     Ops[1] = Builder.CreateCall(CGM.getIntrinsic(Int, Int64Ty),
6943                                 {Ops[1], Builder.CreateSExt(Ops[2], Int64Ty)});
6944     return Builder.CreateAdd(Ops[0], Builder.CreateBitCast(Ops[1], Int64Ty));
6945   }
6946   case NEON::BI__builtin_neon_vshld_n_s64:
6947   case NEON::BI__builtin_neon_vshld_n_u64: {
         // Left shift by an immediate: the amount must be a ConstantInt (cast<>
         // asserts otherwise) and is re-materialised as an i64 constant.
6948     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
6949     return Builder.CreateShl(
6950         Ops[0], ConstantInt::get(Int64Ty, Amt->getZExtValue()), "shld_n");
6951   }
6952   case NEON::BI__builtin_neon_vshrd_n_s64: {
         // Signed right shift by an immediate. The amount is clamped to at most
         // 63 before emitting the ashr.
         // NOTE(review): presumably so that a requested shift of 64 does not
         // produce an IR shift by the full bit width - confirm.
6953     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
6954     return Builder.CreateAShr(
6955         Ops[0], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
6956                                                    Amt->getZExtValue())),
6957         "shrd_n");
6958   }
6959   case NEON::BI__builtin_neon_vshrd_n_u64: {
         // Unsigned right shift by an immediate; a shift of exactly 64 is folded
         // to the constant 0 instead of emitting an lshr.
6960     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
6961     uint64_t ShiftAmt = Amt->getZExtValue();
6962     // Right-shifting an unsigned value by its size yields 0.
6963     if (ShiftAmt == 64)
6964       return ConstantInt::get(Int64Ty, 0);
6965     return Builder.CreateLShr(Ops[0], ConstantInt::get(Int64Ty, ShiftAmt),
6966                               "shrd_n");
6967   }
6968   case NEON::BI__builtin_neon_vsrad_n_s64: {
         // Signed shift-right-and-accumulate: Ops[1] is shifted (amount from
         // argument 2, clamped to 63 as in vshrd_n_s64) and added to Ops[0].
6969     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(2)));
6970     Ops[1] = Builder.CreateAShr(
6971         Ops[1], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
6972                                                    Amt->getZExtValue())),
6973         "shrd_n");
6974     return Builder.CreateAdd(Ops[0], Ops[1]);
6975   }
6976   case NEON::BI__builtin_neon_vsrad_n_u64: {
         // Unsigned shift-right-and-accumulate: Ops[1] is shifted by the
         // constant in argument 2 and added to Ops[0].
6977     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(2)));
6978     uint64_t ShiftAmt = Amt->getZExtValue();
6979     // Right-shifting an unsigned value by its size yields 0.
6980     // As Op + 0 = Op, return Ops[0] directly.
6981     if (ShiftAmt == 64)
6982       return Ops[0];
6983     Ops[1] = Builder.CreateLShr(Ops[1], ConstantInt::get(Int64Ty, ShiftAmt),
6984                                 "shrd_n");
6985     return Builder.CreateAdd(Ops[0], Ops[1]);
6986   }
6987   case NEON::BI__builtin_neon_vqdmlalh_lane_s16:
6988   case NEON::BI__builtin_neon_vqdmlalh_laneq_s16:
6989   case NEON::BI__builtin_neon_vqdmlslh_lane_s16:
6990   case NEON::BI__builtin_neon_vqdmlslh_laneq_s16: {
         // Lane forms of the 16-bit saturating doubling multiply-accumulate /
         // -subtract. The second multiplicand is first extracted from the vector
         // in Ops[2] at the lane given by argument 3; the product is then formed
         // with the vector sqdmull intrinsic and lane 0 of its result is used.
6991     Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)),
6992                                           "lane");
6993     SmallVector<Value *, 2> ProductOps;
6994     ProductOps.push_back(vectorWrapScalar16(Ops[1]));
6995     ProductOps.push_back(vectorWrapScalar16(Ops[2]));
6996     llvm::Type *VTy = llvm::VectorType::get(Int32Ty, 4);
6997     Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmull, VTy),
6998                           ProductOps, "vqdmlXl");
6999     Constant *CI = ConstantInt::get(SizeTy, 0);
7000     Ops[1] = Builder.CreateExtractElement(Ops[1], CI, "lane0");
         // Drop the last operand so that it is not passed to the accumulate
         // intrinsic below.
7001     Ops.pop_back();
7002 
         // Saturating add for the vqdmlal* builtins, saturating subtract for
         // the vqdmlsl* ones.
7003     unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlalh_lane_s16 ||
7004                        BuiltinID == NEON::BI__builtin_neon_vqdmlalh_laneq_s16)
7005                           ? Intrinsic::aarch64_neon_sqadd
7006                           : Intrinsic::aarch64_neon_sqsub;
7007     return EmitNeonCall(CGM.getIntrinsic(AccInt, Int32Ty), Ops, "vqdmlXl");
7008   }
7009   case NEON::BI__builtin_neon_vqdmlals_s32:
7010   case NEON::BI__builtin_neon_vqdmlsls_s32: {
         // 32-bit scalar saturating doubling multiply-accumulate / -subtract.
         // The product comes from the scalar sqdmulls intrinsic applied to
         // Ops[1] and argument 2; it replaces Ops[1] and is then combined with
         // the other entries of Ops using sqadd or sqsub on i64.
7011     SmallVector<Value *, 2> ProductOps;
7012     ProductOps.push_back(Ops[1]);
7013     ProductOps.push_back(EmitScalarExpr(E->getArg(2)));
7014     Ops[1] =
7015         EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmulls_scalar),
7016                      ProductOps, "vqdmlXl");
7017 
7018     unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlals_s32
7019                                         ? Intrinsic::aarch64_neon_sqadd
7020                                         : Intrinsic::aarch64_neon_sqsub;
7021     return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int64Ty), Ops, "vqdmlXl");
7022   }
7023   case NEON::BI__builtin_neon_vqdmlals_lane_s32:
7024   case NEON::BI__builtin_neon_vqdmlals_laneq_s32:
7025   case NEON::BI__builtin_neon_vqdmlsls_lane_s32:
7026   case NEON::BI__builtin_neon_vqdmlsls_laneq_s32: {
         // Lane forms of the case above: the second multiplicand is extracted
         // from the vector in Ops[2] at the lane given by argument 3.
7027     Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)),
7028                                           "lane");
7029     SmallVector<Value *, 2> ProductOps;
7030     ProductOps.push_back(Ops[1]);
7031     ProductOps.push_back(Ops[2]);
7032     Ops[1] =
7033         EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmulls_scalar),
7034                      ProductOps, "vqdmlXl");
         // Drop the last operand so that it is not passed to the accumulate
         // intrinsic below.
7035     Ops.pop_back();
7036 
7037     unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlals_lane_s32 ||
7038                        BuiltinID == NEON::BI__builtin_neon_vqdmlals_laneq_s32)
7039                           ? Intrinsic::aarch64_neon_sqadd
7040                           : Intrinsic::aarch64_neon_sqsub;
7041     return EmitNeonCall(CGM.getIntrinsic(AccInt, Int64Ty), Ops, "vqdmlXl");
7042   }
7043   }
7044 
7045   llvm::VectorType *VTy = GetNeonType(this, Type);
       // Everything from here on works on the NEON vector type derived from
       // Type; return nullptr if GetNeonType did not produce one.
7046   llvm::Type *Ty = VTy;
7047   if (!Ty)
7048     return nullptr;
7049 
7050   // Not all intrinsics handled by the common case work for AArch64 yet, so only
7051   // defer to common code if it's been added to our special map.
7052   Builtin = findNeonIntrinsicInMap(AArch64SIMDIntrinsicMap, BuiltinID,
7053                                    AArch64SIMDIntrinsicsProvenSorted);
7054 
       // A map hit is emitted through the shared NEON code path, with both
       // Address arguments passed as invalid.
7055   if (Builtin)
7056     return EmitCommonNeonBuiltinExpr(
7057         Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
7058         Builtin->NameHint, Builtin->TypeModifier, E, Ops,
7059         /*never use addresses*/ Address::invalid(), Address::invalid(), Arch);
7060 
       // Otherwise give the table-lookup helper a chance; if it returns null,
       // fall through to the switch below.
7061   if (Value *V = EmitAArch64TblBuiltinExpr(*this, BuiltinID, E, Ops, Arch))
7062     return V;
7063 
7064   unsigned Int;
7065   switch (BuiltinID) {
7066   default: return nullptr;
7067   case NEON::BI__builtin_neon_vbsl_v:
7068   case NEON::BI__builtin_neon_vbslq_v: {
         // Bitwise select, expanded inline. All three operands are bitcast to
         // the integer vector type corresponding to VTy, the result is computed
         // as (Ops[0] & Ops[1]) | (~Ops[0] & Ops[2]), and then cast back to Ty.
7069     llvm::Type *BitTy = llvm::VectorType::getInteger(VTy);
7070     Ops[0] = Builder.CreateBitCast(Ops[0], BitTy, "vbsl");
7071     Ops[1] = Builder.CreateBitCast(Ops[1], BitTy, "vbsl");
7072     Ops[2] = Builder.CreateBitCast(Ops[2], BitTy, "vbsl");
7073 
7074     Ops[1] = Builder.CreateAnd(Ops[0], Ops[1], "vbsl");
7075     Ops[2] = Builder.CreateAnd(Builder.CreateNot(Ops[0]), Ops[2], "vbsl");
7076     Ops[0] = Builder.CreateOr(Ops[1], Ops[2], "vbsl");
7077     return Builder.CreateBitCast(Ops[0], Ty);
7078   }
7079   case NEON::BI__builtin_neon_vfma_lane_v:
7080   case NEON::BI__builtin_neon_vfmaq_lane_v: { // Only used for FP types
7081     // The ARM builtins (and instructions) have the addend as the first
7082     // operand, but the 'fma' intrinsics have it last. Swap it around here.
7083     Value *Addend = Ops[0];
7084     Value *Multiplicand = Ops[1];
7085     Value *LaneSource = Ops[2];
7086     Ops[0] = Multiplicand;
7087     Ops[1] = LaneSource;
7088     Ops[2] = Addend;
7089 
7090     // Now adjust things to handle the lane access.
         // For the q variant the lane source has half as many elements as the
         // result type VTy; otherwise it has the same type as the result.
7091     llvm::Type *SourceTy = BuiltinID == NEON::BI__builtin_neon_vfmaq_lane_v ?
7092       llvm::VectorType::get(VTy->getElementType(), VTy->getNumElements() / 2) :
7093       VTy;
         // Ops[3] is the lane index and must be a Constant. It is splatted into
         // a shuffle mask with as many entries as the result has elements, so
         // the shuffle broadcasts the selected lane.
7094     llvm::Constant *cst = cast<Constant>(Ops[3]);
7095     Value *SV = llvm::ConstantVector::getSplat(VTy->getNumElements(), cst);
7096     Ops[1] = Builder.CreateBitCast(Ops[1], SourceTy);
7097     Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV, "lane");
7098 
         // Remove the lane index; Ops now holds only the three fma operands.
7099     Ops.pop_back();
7100     Int = Intrinsic::fma;
7101     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fmla");
7102   }
7103   case NEON::BI__builtin_neon_vfma_laneq_v: {
         // NOTE(review): this VTy shadows the function-level VTy, and cast<>
         // asserts rather than returning null, so the `VTy &&` test below looks
         // redundant - confirm before simplifying.
7104     llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);
7105     // v1f64 fma should be mapped to Neon scalar f64 fma
7106     if (VTy && VTy->getElementType() == DoubleTy) {
           // Treat the addend and multiplicand as scalar doubles, extract the
           // selected lane (Ops[3]) from the lane source viewed as a quad
           // Float64 vector, and emit a scalar llvm.fma whose result is cast
           // back to Ty.
7107       Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
7108       Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy);
7109       llvm::Type *VTy = GetNeonType(this,
7110         NeonTypeFlags(NeonTypeFlags::Float64, false, true));
7111       Ops[2] = Builder.CreateBitCast(Ops[2], VTy);
7112       Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
7113       Value *F = CGM.getIntrinsic(Intrinsic::fma, DoubleTy);
7114       Value *Result = Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0]});
7115       return Builder.CreateBitCast(Result, Ty);
7116     }
         // General case: vector llvm.fma on Ty.
7117     Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
7118     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7119     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7120 
         // The lane source has twice as many elements as the result type. The
         // constant lane index Ops[3] is splatted into a shuffle mask with the
         // result's element count to broadcast the selected lane.
7121     llvm::Type *STy = llvm::VectorType::get(VTy->getElementType(),
7122                                             VTy->getNumElements() * 2);
7123     Ops[2] = Builder.CreateBitCast(Ops[2], STy);
7124     Value* SV = llvm::ConstantVector::getSplat(VTy->getNumElements(),
7125                                                cast<ConstantInt>(Ops[3]));
7126     Ops[2] = Builder.CreateShuffleVector(Ops[2], Ops[2], SV, "lane");
7127 
         // The addend Ops[0] goes last, matching llvm.fma's operand order.
7128     return Builder.CreateCall(F, {Ops[2], Ops[1], Ops[0]});
7129   }
7130   case NEON::BI__builtin_neon_vfmaq_laneq_v: {
         // Vector llvm.fma on Ty: all three operands are bitcast to Ty, the lane
         // source Ops[2] is splatted with EmitNeonSplat using the constant lane
         // index Ops[3], and the addend Ops[0] is passed last.
7131     Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
7132     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7133     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7134 
7135     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
7136     Ops[2] = EmitNeonSplat(Ops[2], cast<ConstantInt>(Ops[3]));
7137     return Builder.CreateCall(F, {Ops[2], Ops[1], Ops[0]});
7138   }
7139   case NEON::BI__builtin_neon_vfmah_lane_f16:
7140   case NEON::BI__builtin_neon_vfmas_lane_f32:
7141   case NEON::BI__builtin_neon_vfmah_laneq_f16:
7142   case NEON::BI__builtin_neon_vfmas_laneq_f32:
7143   case NEON::BI__builtin_neon_vfmad_lane_f64:
7144   case NEON::BI__builtin_neon_vfmad_laneq_f64: {
         // Scalar fma with one multiplicand taken from a vector lane. The lane
         // index (argument 3) is emitted here; the fma type is the call's
         // return type (this Ty shadows the function-level Ty). The selected
         // lane of Ops[2] is extracted and the addend Ops[0] is passed last.
7145     Ops.push_back(EmitScalarExpr(E->getArg(3)));
7146     llvm::Type *Ty = ConvertType(E->getCallReturnType(getContext()));
7147     Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
7148     Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
7149     return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0]});
7150   }
7151   case NEON::BI__builtin_neon_vmull_v:
         // Widening multiply: umull/smull chosen by signedness, overridden by
         // pmull when the type is polynomial.
7152     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
7153     Int = usgn ? Intrinsic::aarch64_neon_umull : Intrinsic::aarch64_neon_smull;
7154     if (Type.isPoly()) Int = Intrinsic::aarch64_neon_pmull;
7155     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull");
7156   case NEON::BI__builtin_neon_vmax_v:
7157   case NEON::BI__builtin_neon_vmaxq_v:
         // Maximum: umax/smax chosen by signedness, overridden by fmax when Ty
         // is a floating-point (vector) type.
7158     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
7159     Int = usgn ? Intrinsic::aarch64_neon_umax : Intrinsic::aarch64_neon_smax;
7160     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmax;
7161     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmax");
7162   case NEON::BI__builtin_neon_vmaxh_f16: {
         // Scalar f16 maximum: emit the second argument and call fmax on half.
7163     Ops.push_back(EmitScalarExpr(E->getArg(1)));
7164     Int = Intrinsic::aarch64_neon_fmax;
7165     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmax");
7166   }
7167   case NEON::BI__builtin_neon_vmin_v:
7168   case NEON::BI__builtin_neon_vminq_v:
         // Minimum: umin/smin chosen by signedness, overridden by fmin when Ty
         // is a floating-point (vector) type.
7169     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
7170     Int = usgn ? Intrinsic::aarch64_neon_umin : Intrinsic::aarch64_neon_smin;
7171     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmin;
7172     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmin");
7173   case NEON::BI__builtin_neon_vminh_f16: {
         // Scalar f16 minimum: emit the second argument and call fmin on half.
7174     Ops.push_back(EmitScalarExpr(E->getArg(1)));
7175     Int = Intrinsic::aarch64_neon_fmin;
7176     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmin");
7177   }
7178   case NEON::BI__builtin_neon_vabd_v:
7179   case NEON::BI__builtin_neon_vabdq_v:
         // Absolute difference: uabd/sabd chosen by signedness, overridden by
         // fabd when Ty is a floating-point (vector) type.
7180     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
7181     Int = usgn ? Intrinsic::aarch64_neon_uabd : Intrinsic::aarch64_neon_sabd;
7182     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fabd;
7183     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vabd");
7184   case NEON::BI__builtin_neon_vpadal_v:
7185   case NEON::BI__builtin_neon_vpadalq_v: {
         // Pairwise add-and-accumulate-long, expanded as uaddlp/saddlp on
         // Ops[1] followed by an ordinary add with Ops[0].
         // The intrinsic's argument type has twice as many elements as the
         // result type VTy, each half as wide.
7186     unsigned ArgElts = VTy->getNumElements();
7187     llvm::IntegerType *EltTy = cast<IntegerType>(VTy->getElementType());
7188     unsigned BitWidth = EltTy->getBitWidth();
7189     llvm::Type *ArgTy = llvm::VectorType::get(
7190         llvm::IntegerType::get(getLLVMContext(), BitWidth/2), 2*ArgElts);
         // The intrinsic is overloaded on both the result and argument types.
7191     llvm::Type* Tys[2] = { VTy, ArgTy };
7192     Int = usgn ? Intrinsic::aarch64_neon_uaddlp : Intrinsic::aarch64_neon_saddlp;
7193     SmallVector<llvm::Value*, 1> TmpOps;
7194     TmpOps.push_back(Ops[1]);
7195     Function *F = CGM.getIntrinsic(Int, Tys);
7196     llvm::Value *tmp = EmitNeonCall(F, TmpOps, "vpadal");
         // Bring the accumulator to the type of the pairwise sum before adding.
7197     llvm::Value *addend = Builder.CreateBitCast(Ops[0], tmp->getType());
7198     return Builder.CreateAdd(tmp, addend);
7199   }
7200   case NEON::BI__builtin_neon_vpmin_v:
7201   case NEON::BI__builtin_neon_vpminq_v:
         // Pairwise minimum: uminp/sminp chosen by signedness, overridden by
         // fminp when Ty is a floating-point (vector) type.
7202     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
7203     Int = usgn ? Intrinsic::aarch64_neon_uminp : Intrinsic::aarch64_neon_sminp;
7204     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fminp;
7205     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmin");
7206   case NEON::BI__builtin_neon_vpmax_v:
7207   case NEON::BI__builtin_neon_vpmaxq_v:
         // Pairwise maximum: umaxp/smaxp chosen by signedness, overridden by
         // fmaxp when Ty is a floating-point (vector) type.
7208     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
7209     Int = usgn ? Intrinsic::aarch64_neon_umaxp : Intrinsic::aarch64_neon_smaxp;
7210     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmaxp;
7211     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmax");
7212   case NEON::BI__builtin_neon_vminnm_v:
7213   case NEON::BI__builtin_neon_vminnmq_v:
         // Vector forms map straight to the fminnm intrinsic on Ty.
7214     Int = Intrinsic::aarch64_neon_fminnm;
7215     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vminnm");
7216   case NEON::BI__builtin_neon_vminnmh_f16:
         // Scalar f16 form: emit the second argument and call fminnm on half.
7217     Ops.push_back(EmitScalarExpr(E->getArg(1)));
7218     Int = Intrinsic::aarch64_neon_fminnm;
7219     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vminnm");
7220   case NEON::BI__builtin_neon_vmaxnm_v:
7221   case NEON::BI__builtin_neon_vmaxnmq_v:
         // Vector forms map straight to the fmaxnm intrinsic on Ty.
7222     Int = Intrinsic::aarch64_neon_fmaxnm;
7223     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmaxnm");
7224   case NEON::BI__builtin_neon_vmaxnmh_f16:
         // Scalar f16 form: emit the second argument and call fmaxnm on half.
7225     Ops.push_back(EmitScalarExpr(E->getArg(1)));
7226     Int = Intrinsic::aarch64_neon_fmaxnm;
7227     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmaxnm");
7228   case NEON::BI__builtin_neon_vrecpss_f32: {
         // Scalar frecps forms. Each case emits its second argument and calls
         // the frecps intrinsic overloaded on the scalar type: float here,
         // double and half in the two cases below.
7229     Ops.push_back(EmitScalarExpr(E->getArg(1)));
7230     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, FloatTy),
7231                         Ops, "vrecps");
7232   }
7233   case NEON::BI__builtin_neon_vrecpsd_f64:
7234     Ops.push_back(EmitScalarExpr(E->getArg(1)));
7235     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, DoubleTy),
7236                         Ops, "vrecps");
7237   case NEON::BI__builtin_neon_vrecpsh_f16:
7238     Ops.push_back(EmitScalarExpr(E->getArg(1)));
7239     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, HalfTy),
7240                         Ops, "vrecps");
7241   case NEON::BI__builtin_neon_vqshrun_n_v:
         // Narrowing right shifts by an immediate. Each builtin maps directly to
         // one AArch64 intrinsic overloaded on Ty; where both signed and
         // unsigned intrinsics exist, usgn selects between them.
7242     Int = Intrinsic::aarch64_neon_sqshrun;
7243     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrun_n");
7244   case NEON::BI__builtin_neon_vqrshrun_n_v:
7245     Int = Intrinsic::aarch64_neon_sqrshrun;
7246     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrun_n");
7247   case NEON::BI__builtin_neon_vqshrn_n_v:
7248     Int = usgn ? Intrinsic::aarch64_neon_uqshrn : Intrinsic::aarch64_neon_sqshrn;
7249     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n");
7250   case NEON::BI__builtin_neon_vrshrn_n_v:
7251     Int = Intrinsic::aarch64_neon_rshrn;
7252     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshrn_n");
7253   case NEON::BI__builtin_neon_vqrshrn_n_v:
7254     Int = usgn ? Intrinsic::aarch64_neon_uqrshrn : Intrinsic::aarch64_neon_sqrshrn;
7255     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n");
7256   case NEON::BI__builtin_neon_vrndah_f16: {
         // Rounding family. Each rounding mode comes as a pair of cases: the
         // scalar f16 form emits its argument here and uses HalfTy, while the
         // vector forms use the already-collected Ops and the vector type Ty.
         // Mapping used below:
         //   vrnda -> llvm.round       vrndi -> llvm.nearbyint
         //   vrndm -> llvm.floor       vrndn -> aarch64.neon.frintn
         //   vrndp -> llvm.ceil        vrndx -> llvm.rint
         //   vrnd  -> llvm.trunc
7257     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7258     Int = Intrinsic::round;
7259     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrnda");
7260   }
7261   case NEON::BI__builtin_neon_vrnda_v:
7262   case NEON::BI__builtin_neon_vrndaq_v: {
7263     Int = Intrinsic::round;
7264     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnda");
7265   }
7266   case NEON::BI__builtin_neon_vrndih_f16: {
7267     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7268     Int = Intrinsic::nearbyint;
7269     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndi");
7270   }
7271   case NEON::BI__builtin_neon_vrndi_v:
7272   case NEON::BI__builtin_neon_vrndiq_v: {
7273     Int = Intrinsic::nearbyint;
7274     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndi");
7275   }
7276   case NEON::BI__builtin_neon_vrndmh_f16: {
7277     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7278     Int = Intrinsic::floor;
7279     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndm");
7280   }
7281   case NEON::BI__builtin_neon_vrndm_v:
7282   case NEON::BI__builtin_neon_vrndmq_v: {
7283     Int = Intrinsic::floor;
7284     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndm");
7285   }
7286   case NEON::BI__builtin_neon_vrndnh_f16: {
7287     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7288     Int = Intrinsic::aarch64_neon_frintn;
7289     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndn");
7290   }
7291   case NEON::BI__builtin_neon_vrndn_v:
7292   case NEON::BI__builtin_neon_vrndnq_v: {
7293     Int = Intrinsic::aarch64_neon_frintn;
7294     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndn");
7295   }
7296   case NEON::BI__builtin_neon_vrndph_f16: {
7297     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7298     Int = Intrinsic::ceil;
7299     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndp");
7300   }
7301   case NEON::BI__builtin_neon_vrndp_v:
7302   case NEON::BI__builtin_neon_vrndpq_v: {
7303     Int = Intrinsic::ceil;
7304     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndp");
7305   }
7306   case NEON::BI__builtin_neon_vrndxh_f16: {
7307     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7308     Int = Intrinsic::rint;
7309     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndx");
7310   }
7311   case NEON::BI__builtin_neon_vrndx_v:
7312   case NEON::BI__builtin_neon_vrndxq_v: {
7313     Int = Intrinsic::rint;
7314     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndx");
7315   }
7316   case NEON::BI__builtin_neon_vrndh_f16: {
7317     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7318     Int = Intrinsic::trunc;
7319     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndz");
7320   }
7321   case NEON::BI__builtin_neon_vrnd_v:
7322   case NEON::BI__builtin_neon_vrndq_v: {
7323     Int = Intrinsic::trunc;
7324     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndz");
7325   }
7326   case NEON::BI__builtin_neon_vcvt_f64_v:
7327   case NEON::BI__builtin_neon_vcvtq_f64_v:
         // Conversion to f64 elements: the operand is first bitcast to the
         // current Ty, then Ty is replaced by the Float64 NEON vector type (same
         // quad setting) and a uitofp/sitofp is emitted depending on usgn.
7328     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7329     Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad));
7330     return usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
7331                 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
7332   case NEON::BI__builtin_neon_vcvt_f64_f32: {
         // f32 -> f64 widening: the result type must be the quad Float64 vector;
         // the source is viewed as the non-quad Float32 vector and extended
         // with fpext.
7333     assert(Type.getEltType() == NeonTypeFlags::Float64 && quad &&
7334            "unexpected vcvt_f64_f32 builtin");
7335     NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float32, false, false);
7336     Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));
7337 
7338     return Builder.CreateFPExt(Ops[0], Ty, "vcvt");
7339   }
7340   case NEON::BI__builtin_neon_vcvt_f32_f64: {
         // f64 -> f32 narrowing: the source is viewed as the quad Float64 vector
         // and truncated to Ty with fptrunc.
7341     assert(Type.getEltType() == NeonTypeFlags::Float32 &&
7342            "unexpected vcvt_f32_f64 builtin");
7343     NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float64, false, true);
7344     Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));
7345 
7346     return Builder.CreateFPTrunc(Ops[0], Ty, "vcvt");
7347   }
7348   case NEON::BI__builtin_neon_vcvt_s32_v:
7349   case NEON::BI__builtin_neon_vcvt_u32_v:
7350   case NEON::BI__builtin_neon_vcvt_s64_v:
7351   case NEON::BI__builtin_neon_vcvt_u64_v:
7352   case NEON::BI__builtin_neon_vcvt_s16_v:
7353   case NEON::BI__builtin_neon_vcvt_u16_v:
7354   case NEON::BI__builtin_neon_vcvtq_s32_v:
7355   case NEON::BI__builtin_neon_vcvtq_u32_v:
7356   case NEON::BI__builtin_neon_vcvtq_s64_v:
7357   case NEON::BI__builtin_neon_vcvtq_u64_v:
7358   case NEON::BI__builtin_neon_vcvtq_s16_v:
7359   case NEON::BI__builtin_neon_vcvtq_u16_v: {
         // Floating-point to integer vector conversion. The operand is viewed as
         // the floating-point NEON type that corresponds to Type, then converted
         // to the integer vector type Ty with fptoui (unsigned builtins) or
         // fptosi (signed builtins).
7360     Ops[0] = Builder.CreateBitCast(Ops[0], GetFloatNeonType(this, Type));
7361     if (usgn)
7362       return Builder.CreateFPToUI(Ops[0], Ty);
7363     return Builder.CreateFPToSI(Ops[0], Ty);
7364   }
7365   case NEON::BI__builtin_neon_vcvta_s16_v:
7366   case NEON::BI__builtin_neon_vcvta_s32_v:
7367   case NEON::BI__builtin_neon_vcvtaq_s16_v:
7368   case NEON::BI__builtin_neon_vcvtaq_s32_v:
7369   case NEON::BI__builtin_neon_vcvta_u32_v:
7370   case NEON::BI__builtin_neon_vcvtaq_u16_v:
7371   case NEON::BI__builtin_neon_vcvtaq_u32_v:
7372   case NEON::BI__builtin_neon_vcvta_s64_v:
7373   case NEON::BI__builtin_neon_vcvtaq_s64_v:
7374   case NEON::BI__builtin_neon_vcvta_u64_v:
7375   case NEON::BI__builtin_neon_vcvtaq_u64_v: {
7376     Int = usgn ? Intrinsic::aarch64_neon_fcvtau : Intrinsic::aarch64_neon_fcvtas;
7377     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
7378     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvta");
7379   }
  // vcvtm family: floating-point to integer conversion lowered to
  // aarch64.neon.fcvtmu / fcvtms (usgn selects the unsigned one). The
  // intrinsic is overloaded on { integer result type Ty, floating-point source
  // type derived from Type }.
  case NEON::BI__builtin_neon_vcvtm_s16_v:
  case NEON::BI__builtin_neon_vcvtm_s32_v:
  case NEON::BI__builtin_neon_vcvtmq_s16_v:
  case NEON::BI__builtin_neon_vcvtmq_s32_v:
  case NEON::BI__builtin_neon_vcvtm_u16_v:
  case NEON::BI__builtin_neon_vcvtm_u32_v:
  case NEON::BI__builtin_neon_vcvtmq_u16_v:
  case NEON::BI__builtin_neon_vcvtmq_u32_v:
  case NEON::BI__builtin_neon_vcvtm_s64_v:
  case NEON::BI__builtin_neon_vcvtmq_s64_v:
  case NEON::BI__builtin_neon_vcvtm_u64_v:
  case NEON::BI__builtin_neon_vcvtmq_u64_v: {
    Int = usgn ? Intrinsic::aarch64_neon_fcvtmu : Intrinsic::aarch64_neon_fcvtms;
    llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtm");
  }
  // vcvtn family: same shape as above, using aarch64.neon.fcvtnu / fcvtns.
  case NEON::BI__builtin_neon_vcvtn_s16_v:
  case NEON::BI__builtin_neon_vcvtn_s32_v:
  case NEON::BI__builtin_neon_vcvtnq_s16_v:
  case NEON::BI__builtin_neon_vcvtnq_s32_v:
  case NEON::BI__builtin_neon_vcvtn_u16_v:
  case NEON::BI__builtin_neon_vcvtn_u32_v:
  case NEON::BI__builtin_neon_vcvtnq_u16_v:
  case NEON::BI__builtin_neon_vcvtnq_u32_v:
  case NEON::BI__builtin_neon_vcvtn_s64_v:
  case NEON::BI__builtin_neon_vcvtnq_s64_v:
  case NEON::BI__builtin_neon_vcvtn_u64_v:
  case NEON::BI__builtin_neon_vcvtnq_u64_v: {
    Int = usgn ? Intrinsic::aarch64_neon_fcvtnu : Intrinsic::aarch64_neon_fcvtns;
    llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtn");
  }
  // vcvtp family: same shape as above, using aarch64.neon.fcvtpu / fcvtps.
  case NEON::BI__builtin_neon_vcvtp_s16_v:
  case NEON::BI__builtin_neon_vcvtp_s32_v:
  case NEON::BI__builtin_neon_vcvtpq_s16_v:
  case NEON::BI__builtin_neon_vcvtpq_s32_v:
  case NEON::BI__builtin_neon_vcvtp_u16_v:
  case NEON::BI__builtin_neon_vcvtp_u32_v:
  case NEON::BI__builtin_neon_vcvtpq_u16_v:
  case NEON::BI__builtin_neon_vcvtpq_u32_v:
  case NEON::BI__builtin_neon_vcvtp_s64_v:
  case NEON::BI__builtin_neon_vcvtpq_s64_v:
  case NEON::BI__builtin_neon_vcvtp_u64_v:
  case NEON::BI__builtin_neon_vcvtpq_u64_v: {
    Int = usgn ? Intrinsic::aarch64_neon_fcvtpu : Intrinsic::aarch64_neon_fcvtps;
    llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtp");
  }
  // Vector vmulx: direct call to aarch64.neon.fmulx overloaded on Ty.
  case NEON::BI__builtin_neon_vmulx_v:
  case NEON::BI__builtin_neon_vmulxq_v: {
    Int = Intrinsic::aarch64_neon_fmulx;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx");
  }
  case NEON::BI__builtin_neon_vmulxh_lane_f16:
  case NEON::BI__builtin_neon_vmulxh_laneq_f16: {
    // vmulx_lane should be mapped to Neon scalar mulx after
    // extracting the scalar element
    // The lane index (argument 2) is pushed only so it can be used as the
    // extract index; it is popped again so the intrinsic call sees exactly
    // two operands.
    Ops.push_back(EmitScalarExpr(E->getArg(2)));
    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
    Ops.pop_back();
    Int = Intrinsic::aarch64_neon_fmulx;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmulx");
  }
  case NEON::BI__builtin_neon_vmul_lane_v:
  case NEON::BI__builtin_neon_vmul_laneq_v: {
    // v1f64 vmul_lane should be mapped to Neon scalar mul lane
    // Quad only affects the type the second operand is viewed as (a Float64
    // vector, quad for the laneq builtin) before the lane is extracted.
    bool Quad = false;
    if (BuiltinID == NEON::BI__builtin_neon_vmul_laneq_v)
      Quad = true;
    Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
    llvm::Type *VTy = GetNeonType(this,
      NeonTypeFlags(NeonTypeFlags::Float64, false, Quad));
    Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
    // Scalar double multiply, then reinterpret the result as the builtin's
    // result type Ty.
    Value *Result = Builder.CreateFMul(Ops[0], Ops[1]);
    return Builder.CreateBitCast(Result, Ty);
  }
  // Scalar negations are emitted as plain IR: integer neg for the s64 form,
  // floating-point fneg for the f16 form. The argument is emitted inline.
  case NEON::BI__builtin_neon_vnegd_s64:
    return Builder.CreateNeg(EmitScalarExpr(E->getArg(0)), "vnegd");
  case NEON::BI__builtin_neon_vnegh_f16:
    return Builder.CreateFNeg(EmitScalarExpr(E->getArg(0)), "vnegh");
  // vpmaxnm / vpminnm: direct calls to aarch64.neon.fmaxnmp / fminnmp
  // overloaded on Ty.
  case NEON::BI__builtin_neon_vpmaxnm_v:
  case NEON::BI__builtin_neon_vpmaxnmq_v: {
    Int = Intrinsic::aarch64_neon_fmaxnmp;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmaxnm");
  }
  case NEON::BI__builtin_neon_vpminnm_v:
  case NEON::BI__builtin_neon_vpminnmq_v: {
    Int = Intrinsic::aarch64_neon_fminnmp;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpminnm");
  }
  // Scalar half-precision square root: the argument is emitted here and
  // passed to the generic llvm.sqrt intrinsic overloaded on HalfTy.
  case NEON::BI__builtin_neon_vsqrth_f16: {
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Int = Intrinsic::sqrt;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vsqrt");
  }
  // Vector square root: operand is reinterpreted as Ty, then passed to
  // llvm.sqrt overloaded on Ty.
  case NEON::BI__builtin_neon_vsqrt_v:
  case NEON::BI__builtin_neon_vsqrtq_v: {
    Int = Intrinsic::sqrt;
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqrt");
  }
  // vrbit: direct call to aarch64.neon.rbit overloaded on Ty.
  case NEON::BI__builtin_neon_vrbit_v:
  case NEON::BI__builtin_neon_vrbitq_v: {
    Int = Intrinsic::aarch64_neon_rbit;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrbit");
  }
  // vaddv for 8- and 16-bit elements. Each unsigned label sets usgn and falls
  // through to its signed twin, which picks aarch64.neon.uaddv or saddv
  // accordingly. The intrinsic is instantiated as { i32 result, source vector
  // type }; the i32 result is then truncated to the element width. The vector
  // argument is emitted here (appended to Ops) rather than read from
  // pre-populated operands.
  case NEON::BI__builtin_neon_vaddv_u8:
    // FIXME: These are handled by the AArch64 scalar code.
    usgn = true;
    LLVM_FALLTHROUGH;
  case NEON::BI__builtin_neon_vaddv_s8: {
    Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  // 4 x i16 source, result truncated to i16.
  case NEON::BI__builtin_neon_vaddv_u16:
    usgn = true;
    LLVM_FALLTHROUGH;
  case NEON::BI__builtin_neon_vaddv_s16: {
    Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  // 16 x i8 source, result truncated to i8.
  case NEON::BI__builtin_neon_vaddvq_u8:
    usgn = true;
    LLVM_FALLTHROUGH;
  case NEON::BI__builtin_neon_vaddvq_s8: {
    Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 16);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  // 8 x i16 source, result truncated to i16.
  case NEON::BI__builtin_neon_vaddvq_u16:
    usgn = true;
    LLVM_FALLTHROUGH;
  case NEON::BI__builtin_neon_vaddvq_s16: {
    Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  // Integer vmaxv for 8- and 16-bit elements. Every case follows the same
  // recipe: pick aarch64.neon.umaxv (unsigned) or smaxv (signed), instantiate
  // it as { i32 result, source vector type }, emit the vector argument, call
  // the intrinsic, and truncate the i32 result to the element width. Only the
  // intrinsic, the element type and the lane count differ between cases.
  case NEON::BI__builtin_neon_vmaxv_u8: {
    Int = Intrinsic::aarch64_neon_umaxv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vmaxv_u16: {
    Int = Intrinsic::aarch64_neon_umaxv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vmaxvq_u8: {
    Int = Intrinsic::aarch64_neon_umaxv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 16);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vmaxvq_u16: {
    Int = Intrinsic::aarch64_neon_umaxv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  // Signed variants: identical shape, using smaxv.
  case NEON::BI__builtin_neon_vmaxv_s8: {
    Int = Intrinsic::aarch64_neon_smaxv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vmaxv_s16: {
    Int = Intrinsic::aarch64_neon_smaxv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vmaxvq_s8: {
    Int = Intrinsic::aarch64_neon_smaxv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 16);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vmaxvq_s16: {
    Int = Intrinsic::aarch64_neon_smaxv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
7608   case NEON::BI__builtin_neon_vmaxv_f16: {
7609     Int = Intrinsic::aarch64_neon_fmaxv;
7610     Ty = HalfTy;
7611     VTy = llvm::VectorType::get(HalfTy, 4);
7612     llvm::Type *Tys[2] = { Ty, VTy };
7613     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7614     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
7615     return Builder.CreateTrunc(Ops[0], HalfTy);
7616   }
7617   case NEON::BI__builtin_neon_vmaxvq_f16: {
7618     Int = Intrinsic::aarch64_neon_fmaxv;
7619     Ty = HalfTy;
7620     VTy = llvm::VectorType::get(HalfTy, 8);
7621     llvm::Type *Tys[2] = { Ty, VTy };
7622     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7623     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
7624     return Builder.CreateTrunc(Ops[0], HalfTy);
7625   }
  // Integer vminv for 8- and 16-bit elements. Every case follows the same
  // recipe: pick aarch64.neon.uminv (unsigned) or sminv (signed), instantiate
  // it as { i32 result, source vector type }, emit the vector argument, call
  // the intrinsic, and truncate the i32 result to the element width. Only the
  // intrinsic, the element type and the lane count differ between cases.
  case NEON::BI__builtin_neon_vminv_u8: {
    Int = Intrinsic::aarch64_neon_uminv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vminv_u16: {
    Int = Intrinsic::aarch64_neon_uminv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vminvq_u8: {
    Int = Intrinsic::aarch64_neon_uminv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 16);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vminvq_u16: {
    Int = Intrinsic::aarch64_neon_uminv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  // Signed variants: identical shape, using sminv.
  case NEON::BI__builtin_neon_vminv_s8: {
    Int = Intrinsic::aarch64_neon_sminv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vminv_s16: {
    Int = Intrinsic::aarch64_neon_sminv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vminvq_s8: {
    Int = Intrinsic::aarch64_neon_sminv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 16);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int8Ty);
  }
  case NEON::BI__builtin_neon_vminvq_s16: {
    Int = Intrinsic::aarch64_neon_sminv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
7698   case NEON::BI__builtin_neon_vminv_f16: {
7699     Int = Intrinsic::aarch64_neon_fminv;
7700     Ty = HalfTy;
7701     VTy = llvm::VectorType::get(HalfTy, 4);
7702     llvm::Type *Tys[2] = { Ty, VTy };
7703     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7704     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
7705     return Builder.CreateTrunc(Ops[0], HalfTy);
7706   }
7707   case NEON::BI__builtin_neon_vminvq_f16: {
7708     Int = Intrinsic::aarch64_neon_fminv;
7709     Ty = HalfTy;
7710     VTy = llvm::VectorType::get(HalfTy, 8);
7711     llvm::Type *Tys[2] = { Ty, VTy };
7712     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7713     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
7714     return Builder.CreateTrunc(Ops[0], HalfTy);
7715   }
7716   case NEON::BI__builtin_neon_vmaxnmv_f16: {
7717     Int = Intrinsic::aarch64_neon_fmaxnmv;
7718     Ty = HalfTy;
7719     VTy = llvm::VectorType::get(HalfTy, 4);
7720     llvm::Type *Tys[2] = { Ty, VTy };
7721     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7722     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv");
7723     return Builder.CreateTrunc(Ops[0], HalfTy);
7724   }
7725   case NEON::BI__builtin_neon_vmaxnmvq_f16: {
7726     Int = Intrinsic::aarch64_neon_fmaxnmv;
7727     Ty = HalfTy;
7728     VTy = llvm::VectorType::get(HalfTy, 8);
7729     llvm::Type *Tys[2] = { Ty, VTy };
7730     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7731     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv");
7732     return Builder.CreateTrunc(Ops[0], HalfTy);
7733   }
7734   case NEON::BI__builtin_neon_vminnmv_f16: {
7735     Int = Intrinsic::aarch64_neon_fminnmv;
7736     Ty = HalfTy;
7737     VTy = llvm::VectorType::get(HalfTy, 4);
7738     llvm::Type *Tys[2] = { Ty, VTy };
7739     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7740     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv");
7741     return Builder.CreateTrunc(Ops[0], HalfTy);
7742   }
7743   case NEON::BI__builtin_neon_vminnmvq_f16: {
7744     Int = Intrinsic::aarch64_neon_fminnmv;
7745     Ty = HalfTy;
7746     VTy = llvm::VectorType::get(HalfTy, 8);
7747     llvm::Type *Tys[2] = { Ty, VTy };
7748     Ops.push_back(EmitScalarExpr(E->getArg(0)));
7749     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv");
7750     return Builder.CreateTrunc(Ops[0], HalfTy);
7751   }
  // vmul_n_f64: both operands are reinterpreted as scalar double and
  // multiplied with a plain fmul. The second operand is emitted here from
  // argument 1 rather than taken from Ops.
  case NEON::BI__builtin_neon_vmul_n_f64: {
    Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
    Value *RHS = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)), DoubleTy);
    return Builder.CreateFMul(Ops[0], RHS);
  }
  // vaddlv for 8- and 16-bit elements, via aarch64.neon.uaddlv (unsigned) or
  // saddlv (signed) instantiated as { i32 result, source vector type }.
  // For 8-bit sources the i32 result is truncated to i16; for 16-bit sources
  // the i32 intrinsic result is returned unchanged. The vector argument is
  // emitted here and appended to Ops.
  case NEON::BI__builtin_neon_vaddlv_u8: {
    Int = Intrinsic::aarch64_neon_uaddlv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vaddlv_u16: {
    Int = Intrinsic::aarch64_neon_uaddlv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
  }
  case NEON::BI__builtin_neon_vaddlvq_u8: {
    Int = Intrinsic::aarch64_neon_uaddlv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 16);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vaddlvq_u16: {
    Int = Intrinsic::aarch64_neon_uaddlv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
  }
  // Signed variants: identical shape, using saddlv.
  case NEON::BI__builtin_neon_vaddlv_s8: {
    Int = Intrinsic::aarch64_neon_saddlv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vaddlv_s16: {
    Int = Intrinsic::aarch64_neon_saddlv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 4);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
  }
  case NEON::BI__builtin_neon_vaddlvq_s8: {
    Int = Intrinsic::aarch64_neon_saddlv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int8Ty, 16);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vaddlvq_s16: {
    Int = Intrinsic::aarch64_neon_saddlv;
    Ty = Int32Ty;
    VTy = llvm::VectorType::get(Int16Ty, 8);
    llvm::Type *Tys[2] = { Ty, VTy };
    Ops.push_back(EmitScalarExpr(E->getArg(0)));
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
  }
  // vsri_n / vsli_n: direct calls to aarch64.neon.vsri / vsli overloaded on
  // Ty, with the operands already collected in Ops.
  case NEON::BI__builtin_neon_vsri_n_v:
  case NEON::BI__builtin_neon_vsriq_n_v: {
    Int = Intrinsic::aarch64_neon_vsri;
    llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty);
    return EmitNeonCall(Intrin, Ops, "vsri_n");
  }
  case NEON::BI__builtin_neon_vsli_n_v:
  case NEON::BI__builtin_neon_vsliq_n_v: {
    Int = Intrinsic::aarch64_neon_vsli;
    llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty);
    return EmitNeonCall(Intrin, Ops, "vsli_n");
  }
  // vsra_n: Ops[1] is shifted right by the immediate Ops[2] (signedness from
  // usgn) and the result is added to Ops[0]. No target intrinsic is called
  // directly here.
  case NEON::BI__builtin_neon_vsra_n_v:
  case NEON::BI__builtin_neon_vsraq_n_v:
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n");
    return Builder.CreateAdd(Ops[0], Ops[1]);
  // vrsra_n: the shift is emitted as a call to aarch64.neon.urshl / srshl on
  // (Ops[1], Ops[2]) and its result is added to Ops[0].
  case NEON::BI__builtin_neon_vrsra_n_v:
  case NEON::BI__builtin_neon_vrsraq_n_v: {
    Int = usgn ? Intrinsic::aarch64_neon_urshl : Intrinsic::aarch64_neon_srshl;
    SmallVector<llvm::Value*,2> TmpOps;
    TmpOps.push_back(Ops[1]);
    TmpOps.push_back(Ops[2]);
    Function* F = CGM.getIntrinsic(Int, Ty);
    // NOTE(review): the trailing (1, true) arguments presumably tell
    // EmitNeonCall that operand 1 is a right-shift amount -- confirm against
    // EmitNeonCall's declaration.
    llvm::Value *tmp = EmitNeonCall(F, TmpOps, "vrshr_n", 1, true);
    Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
    return Builder.CreateAdd(Ops[0], tmp);
  }
    // FIXME: Sharing loads & stores with 32-bit is complicated by the absence
    // of an Align parameter here.
  // vld1_x2 / _x3 / _x4: Ops[1] is cast to a pointer to the vector element
  // type and passed to aarch64.neon.ld1x{2,3,4}, overloaded on { vector type,
  // element pointer type }. The value returned by the intrinsic is then
  // stored, with default alignment, through Ops[0] cast to a pointer to that
  // result type.
  case NEON::BI__builtin_neon_vld1_x2_v:
  case NEON::BI__builtin_neon_vld1q_x2_v:
  case NEON::BI__builtin_neon_vld1_x3_v:
  case NEON::BI__builtin_neon_vld1q_x3_v:
  case NEON::BI__builtin_neon_vld1_x4_v:
  case NEON::BI__builtin_neon_vld1q_x4_v: {
    llvm::Type *PTy = llvm::PointerType::getUnqual(VTy->getVectorElementType());
    Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
    llvm::Type *Tys[2] = { VTy, PTy };
    // Block-local Int: the inner switch covers exactly the labels of the
    // enclosing case, so it is always assigned before use.
    unsigned Int;
    switch (BuiltinID) {
    case NEON::BI__builtin_neon_vld1_x2_v:
    case NEON::BI__builtin_neon_vld1q_x2_v:
      Int = Intrinsic::aarch64_neon_ld1x2;
      break;
    case NEON::BI__builtin_neon_vld1_x3_v:
    case NEON::BI__builtin_neon_vld1q_x3_v:
      Int = Intrinsic::aarch64_neon_ld1x3;
      break;
    case NEON::BI__builtin_neon_vld1_x4_v:
    case NEON::BI__builtin_neon_vld1q_x4_v:
      Int = Intrinsic::aarch64_neon_ld1x4;
      break;
    }
    Function *F = CGM.getIntrinsic(Int, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld1xN");
    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  // vst1_x2 / _x3 / _x4: calls aarch64.neon.st1x{2,3,4}, overloaded on
  // { vector type, element pointer type }.
  case NEON::BI__builtin_neon_vst1_x2_v:
  case NEON::BI__builtin_neon_vst1q_x2_v:
  case NEON::BI__builtin_neon_vst1_x3_v:
  case NEON::BI__builtin_neon_vst1q_x3_v:
  case NEON::BI__builtin_neon_vst1_x4_v:
  case NEON::BI__builtin_neon_vst1q_x4_v: {
    llvm::Type *PTy = llvm::PointerType::getUnqual(VTy->getVectorElementType());
    llvm::Type *Tys[2] = { VTy, PTy };
    // Block-local Int: the inner switch covers exactly the labels of the
    // enclosing case, so it is always assigned before use.
    unsigned Int;
    switch (BuiltinID) {
    case NEON::BI__builtin_neon_vst1_x2_v:
    case NEON::BI__builtin_neon_vst1q_x2_v:
      Int = Intrinsic::aarch64_neon_st1x2;
      break;
    case NEON::BI__builtin_neon_vst1_x3_v:
    case NEON::BI__builtin_neon_vst1q_x3_v:
      Int = Intrinsic::aarch64_neon_st1x3;
      break;
    case NEON::BI__builtin_neon_vst1_x4_v:
    case NEON::BI__builtin_neon_vst1q_x4_v:
      Int = Intrinsic::aarch64_neon_st1x4;
      break;
    }
    // Rotate left by one so the first operand becomes the last.
    // NOTE(review): presumably this moves the destination address behind the
    // vector operands, as the intrinsic expects -- confirm operand order.
    std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "");
  }
  // vld1 / vld1q: plain vector load through Ops[0] cast to a pointer to VTy.
  // The load is tagged with a fixed alignment of 8 (non-quad) or 16 (quad).
  // NOTE(review): that alignment is hard-coded from the builtin ID rather
  // than derived from the pointer expression -- confirm the source pointer is
  // guaranteed to be that aligned.
  case NEON::BI__builtin_neon_vld1_v:
  case NEON::BI__builtin_neon_vld1q_v: {
    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(VTy));
    auto Alignment = CharUnits::fromQuantity(
        BuiltinID == NEON::BI__builtin_neon_vld1_v ? 8 : 16);
    return Builder.CreateAlignedLoad(VTy, Ops[0], Alignment);
  }
  // vst1 / vst1q: plain vector store of Ops[1] (viewed as VTy) through Ops[0],
  // using the default alignment for the stored type.
  case NEON::BI__builtin_neon_vst1_v:
  case NEON::BI__builtin_neon_vst1q_v:
    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(VTy));
    Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  // vld1_lane: load one element through Ops[0] and insert it into the vector
  // Ops[1] at lane Ops[2].
  case NEON::BI__builtin_neon_vld1_lane_v:
  case NEON::BI__builtin_neon_vld1q_lane_v: {
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ty = llvm::PointerType::getUnqual(VTy->getElementType());
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    // NOTE(review): this is a single-element load, yet it is tagged with the
    // whole-vector alignment (8 / 16) -- confirm this is intended.
    auto Alignment = CharUnits::fromQuantity(
        BuiltinID == NEON::BI__builtin_neon_vld1_lane_v ? 8 : 16);
    Ops[0] =
        Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0], Alignment);
    return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vld1_lane");
  }
  // vld1_dup: load one element through Ops[0], insert it into lane 0 of an
  // undef vector of type Ty, then hand it to EmitNeonSplat with the same
  // zero constant.
  case NEON::BI__builtin_neon_vld1_dup_v:
  case NEON::BI__builtin_neon_vld1q_dup_v: {
    Value *V = UndefValue::get(Ty);
    Ty = llvm::PointerType::getUnqual(VTy->getElementType());
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    // NOTE(review): single-element load tagged with the whole-vector
    // alignment (8 / 16), as in vld1_lane above -- confirm this is intended.
    auto Alignment = CharUnits::fromQuantity(
        BuiltinID == NEON::BI__builtin_neon_vld1_dup_v ? 8 : 16);
    Ops[0] =
        Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0], Alignment);
    llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
    Ops[0] = Builder.CreateInsertElement(V, Ops[0], CI);
    return EmitNeonSplat(Ops[0], CI);
  }
  // vst1_lane: extract lane Ops[2] from the vector Ops[1] and store that
  // element through Ops[0] (cast to a pointer to the element's type) with
  // default alignment.
  case NEON::BI__builtin_neon_vst1_lane_v:
  case NEON::BI__builtin_neon_vst1q_lane_v:
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
    return Builder.CreateDefaultAlignedStore(Ops[1],
                                             Builder.CreateBitCast(Ops[0], Ty));
  // vld2 / vld3 / vld4: Ops[1] is cast to a pointer to VTy and passed to
  // aarch64.neon.ld{2,3,4}, overloaded on { vector type, pointer type }. The
  // value returned by the intrinsic is stored, with default alignment,
  // through Ops[0] cast to a pointer to that result type.
  case NEON::BI__builtin_neon_vld2_v:
  case NEON::BI__builtin_neon_vld2q_v: {
    llvm::Type *PTy = llvm::PointerType::getUnqual(VTy);
    Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
    llvm::Type *Tys[2] = { VTy, PTy };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
    Ops[0] = Builder.CreateBitCast(Ops[0],
                llvm::PointerType::getUnqual(Ops[1]->getType()));
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld3_v:
  case NEON::BI__builtin_neon_vld3q_v: {
    llvm::Type *PTy = llvm::PointerType::getUnqual(VTy);
    Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
    llvm::Type *Tys[2] = { VTy, PTy };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
    Ops[0] = Builder.CreateBitCast(Ops[0],
                llvm::PointerType::getUnqual(Ops[1]->getType()));
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld4_v:
  case NEON::BI__builtin_neon_vld4q_v: {
    llvm::Type *PTy = llvm::PointerType::getUnqual(VTy);
    Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
    llvm::Type *Tys[2] = { VTy, PTy };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
    Ops[0] = Builder.CreateBitCast(Ops[0],
                llvm::PointerType::getUnqual(Ops[1]->getType()));
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  // vld2_dup / vld3_dup / vld4_dup: same shape as above, but the source
  // pointer is cast to a pointer to the vector *element* type and the
  // aarch64.neon.ld{2,3,4}r intrinsics are used. Note the IR value names stay
  // "vld2" / "vld3" / "vld4".
  case NEON::BI__builtin_neon_vld2_dup_v:
  case NEON::BI__builtin_neon_vld2q_dup_v: {
    llvm::Type *PTy =
      llvm::PointerType::getUnqual(VTy->getElementType());
    Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
    llvm::Type *Tys[2] = { VTy, PTy };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2r, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
    Ops[0] = Builder.CreateBitCast(Ops[0],
                llvm::PointerType::getUnqual(Ops[1]->getType()));
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld3_dup_v:
  case NEON::BI__builtin_neon_vld3q_dup_v: {
    llvm::Type *PTy =
      llvm::PointerType::getUnqual(VTy->getElementType());
    Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
    llvm::Type *Tys[2] = { VTy, PTy };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3r, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
    Ops[0] = Builder.CreateBitCast(Ops[0],
                llvm::PointerType::getUnqual(Ops[1]->getType()));
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld4_dup_v:
  case NEON::BI__builtin_neon_vld4q_dup_v: {
    llvm::Type *PTy =
      llvm::PointerType::getUnqual(VTy->getElementType());
    Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
    llvm::Type *Tys[2] = { VTy, PTy };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4r, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
    Ops[0] = Builder.CreateBitCast(Ops[0],
                llvm::PointerType::getUnqual(Ops[1]->getType()));
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  // vld2_lane / vld3_lane / vld4_lane. Common recipe:
  //  1. Instantiate aarch64.neon.ld{2,3,4}lane on { VTy, type of Ops[1] }.
  //  2. Move Ops[1] to the back of Ops (push a copy, erase the original), so
  //     the operands after Ops[0] become: the N vectors, the lane index, then
  //     the former Ops[1].
  //  3. Bitcast the N vectors to Ty and zero-extend the lane index to i64.
  //  4. Call the intrinsic with every operand except Ops[0], and store the
  //     returned value through Ops[0] cast to a pointer to its type.
  case NEON::BI__builtin_neon_vld2_lane_v:
  case NEON::BI__builtin_neon_vld2q_lane_v: {
    llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2lane, Tys);
    Ops.push_back(Ops[1]);
    Ops.erase(Ops.begin()+1);
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
    Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty);
    Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), "vld2_lane");
    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  // Three vectors (Ops[1..3]); lane index in Ops[4].
  case NEON::BI__builtin_neon_vld3_lane_v:
  case NEON::BI__builtin_neon_vld3q_lane_v: {
    llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3lane, Tys);
    Ops.push_back(Ops[1]);
    Ops.erase(Ops.begin()+1);
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
    Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
    Ops[4] = Builder.CreateZExt(Ops[4], Int64Ty);
    Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), "vld3_lane");
    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  // Four vectors (Ops[1..4]); lane index in Ops[5].
  case NEON::BI__builtin_neon_vld4_lane_v:
  case NEON::BI__builtin_neon_vld4q_lane_v: {
    llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4lane, Tys);
    Ops.push_back(Ops[1]);
    Ops.erase(Ops.begin()+1);
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
    Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
    Ops[4] = Builder.CreateBitCast(Ops[4], Ty);
    Ops[5] = Builder.CreateZExt(Ops[5], Int64Ty);
    Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), "vld4_lane");
    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
8068   case NEON::BI__builtin_neon_vst2_v:
8069   case NEON::BI__builtin_neon_vst2q_v: {
8070     Ops.push_back(Ops[0]);
8071     Ops.erase(Ops.begin());
8072     llvm::Type *Tys[2] = { VTy, Ops[2]->getType() };
8073     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st2, Tys),
8074                         Ops, "");
8075   }
8076   case NEON::BI__builtin_neon_vst2_lane_v:
8077   case NEON::BI__builtin_neon_vst2q_lane_v: {
8078     Ops.push_back(Ops[0]);
8079     Ops.erase(Ops.begin());
8080     Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
8081     llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
8082     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st2lane, Tys),
8083                         Ops, "");
8084   }
8085   case NEON::BI__builtin_neon_vst3_v:
8086   case NEON::BI__builtin_neon_vst3q_v: {
8087     Ops.push_back(Ops[0]);
8088     Ops.erase(Ops.begin());
8089     llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
8090     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st3, Tys),
8091                         Ops, "");
8092   }
8093   case NEON::BI__builtin_neon_vst3_lane_v:
8094   case NEON::BI__builtin_neon_vst3q_lane_v: {
8095     Ops.push_back(Ops[0]);
8096     Ops.erase(Ops.begin());
8097     Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty);
8098     llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
8099     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st3lane, Tys),
8100                         Ops, "");
8101   }
8102   case NEON::BI__builtin_neon_vst4_v:
8103   case NEON::BI__builtin_neon_vst4q_v: {
8104     Ops.push_back(Ops[0]);
8105     Ops.erase(Ops.begin());
8106     llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
8107     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st4, Tys),
8108                         Ops, "");
8109   }
8110   case NEON::BI__builtin_neon_vst4_lane_v:
8111   case NEON::BI__builtin_neon_vst4q_lane_v: {
8112     Ops.push_back(Ops[0]);
8113     Ops.erase(Ops.begin());
8114     Ops[4] = Builder.CreateZExt(Ops[4], Int64Ty);
8115     llvm::Type *Tys[2] = { VTy, Ops[5]->getType() };
8116     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st4lane, Tys),
8117                         Ops, "");
8118   }
8119   case NEON::BI__builtin_neon_vtrn_v:
8120   case NEON::BI__builtin_neon_vtrnq_v: {
8121     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
8122     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8123     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
8124     Value *SV = nullptr;
8125 
8126     for (unsigned vi = 0; vi != 2; ++vi) {
8127       SmallVector<uint32_t, 16> Indices;
8128       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
8129         Indices.push_back(i+vi);
8130         Indices.push_back(i+e+vi);
8131       }
8132       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
8133       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vtrn");
8134       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
8135     }
8136     return SV;
8137   }
8138   case NEON::BI__builtin_neon_vuzp_v:
8139   case NEON::BI__builtin_neon_vuzpq_v: {
8140     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
8141     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8142     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
8143     Value *SV = nullptr;
8144 
8145     for (unsigned vi = 0; vi != 2; ++vi) {
8146       SmallVector<uint32_t, 16> Indices;
8147       for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
8148         Indices.push_back(2*i+vi);
8149 
8150       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
8151       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vuzp");
8152       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
8153     }
8154     return SV;
8155   }
8156   case NEON::BI__builtin_neon_vzip_v:
8157   case NEON::BI__builtin_neon_vzipq_v: {
8158     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
8159     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8160     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
8161     Value *SV = nullptr;
8162 
8163     for (unsigned vi = 0; vi != 2; ++vi) {
8164       SmallVector<uint32_t, 16> Indices;
8165       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
8166         Indices.push_back((i + vi*e) >> 1);
8167         Indices.push_back(((i + vi*e) >> 1)+e);
8168       }
8169       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
8170       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vzip");
8171       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
8172     }
8173     return SV;
8174   }
8175   case NEON::BI__builtin_neon_vqtbl1q_v: {
8176     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl1, Ty),
8177                         Ops, "vtbl1");
8178   }
8179   case NEON::BI__builtin_neon_vqtbl2q_v: {
8180     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl2, Ty),
8181                         Ops, "vtbl2");
8182   }
8183   case NEON::BI__builtin_neon_vqtbl3q_v: {
8184     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl3, Ty),
8185                         Ops, "vtbl3");
8186   }
8187   case NEON::BI__builtin_neon_vqtbl4q_v: {
8188     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl4, Ty),
8189                         Ops, "vtbl4");
8190   }
8191   case NEON::BI__builtin_neon_vqtbx1q_v: {
8192     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx1, Ty),
8193                         Ops, "vtbx1");
8194   }
8195   case NEON::BI__builtin_neon_vqtbx2q_v: {
8196     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx2, Ty),
8197                         Ops, "vtbx2");
8198   }
8199   case NEON::BI__builtin_neon_vqtbx3q_v: {
8200     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx3, Ty),
8201                         Ops, "vtbx3");
8202   }
8203   case NEON::BI__builtin_neon_vqtbx4q_v: {
8204     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx4, Ty),
8205                         Ops, "vtbx4");
8206   }
8207   case NEON::BI__builtin_neon_vsqadd_v:
8208   case NEON::BI__builtin_neon_vsqaddq_v: {
8209     Int = Intrinsic::aarch64_neon_usqadd;
8210     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqadd");
8211   }
8212   case NEON::BI__builtin_neon_vuqadd_v:
8213   case NEON::BI__builtin_neon_vuqaddq_v: {
8214     Int = Intrinsic::aarch64_neon_suqadd;
8215     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd");
8216   }
8217   }
8218 }
8219 
8220 llvm::Value *CodeGenFunction::
8221 BuildVector(ArrayRef<llvm::Value*> Ops) {
8222   assert((Ops.size() & (Ops.size() - 1)) == 0 &&
8223          "Not a power-of-two sized vector!");
8224   bool AllConstants = true;
8225   for (unsigned i = 0, e = Ops.size(); i != e && AllConstants; ++i)
8226     AllConstants &= isa<Constant>(Ops[i]);
8227 
8228   // If this is a constant vector, create a ConstantVector.
8229   if (AllConstants) {
8230     SmallVector<llvm::Constant*, 16> CstOps;
8231     for (unsigned i = 0, e = Ops.size(); i != e; ++i)
8232       CstOps.push_back(cast<Constant>(Ops[i]));
8233     return llvm::ConstantVector::get(CstOps);
8234   }
8235 
8236   // Otherwise, insertelement the values to build the vector.
8237   Value *Result =
8238     llvm::UndefValue::get(llvm::VectorType::get(Ops[0]->getType(), Ops.size()));
8239 
8240   for (unsigned i = 0, e = Ops.size(); i != e; ++i)
8241     Result = Builder.CreateInsertElement(Result, Ops[i], Builder.getInt32(i));
8242 
8243   return Result;
8244 }
8245 
8246 // Convert the mask from an integer type to a vector of i1.
8247 static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask,
8248                               unsigned NumElts) {
8249 
8250   llvm::VectorType *MaskTy = llvm::VectorType::get(CGF.Builder.getInt1Ty(),
8251                          cast<IntegerType>(Mask->getType())->getBitWidth());
8252   Value *MaskVec = CGF.Builder.CreateBitCast(Mask, MaskTy);
8253 
8254   // If we have less than 8 elements, then the starting mask was an i8 and
8255   // we need to extract down to the right number of elements.
8256   if (NumElts < 8) {
8257     uint32_t Indices[4];
8258     for (unsigned i = 0; i != NumElts; ++i)
8259       Indices[i] = i;
8260     MaskVec = CGF.Builder.CreateShuffleVector(MaskVec, MaskVec,
8261                                              makeArrayRef(Indices, NumElts),
8262                                              "extract");
8263   }
8264   return MaskVec;
8265 }
8266 
8267 static Value *EmitX86MaskedStore(CodeGenFunction &CGF,
8268                                  SmallVectorImpl<Value *> &Ops,
8269                                  unsigned Align) {
8270   // Cast the pointer to right type.
8271   Ops[0] = CGF.Builder.CreateBitCast(Ops[0],
8272                                llvm::PointerType::getUnqual(Ops[1]->getType()));
8273 
8274   // If the mask is all ones just emit a regular store.
8275   if (const auto *C = dyn_cast<Constant>(Ops[2]))
8276     if (C->isAllOnesValue())
8277       return CGF.Builder.CreateAlignedStore(Ops[1], Ops[0], Align);
8278 
8279   Value *MaskVec = getMaskVecValue(CGF, Ops[2],
8280                                    Ops[1]->getType()->getVectorNumElements());
8281 
8282   return CGF.Builder.CreateMaskedStore(Ops[1], Ops[0], Align, MaskVec);
8283 }
8284 
8285 static Value *EmitX86MaskedLoad(CodeGenFunction &CGF,
8286                                 SmallVectorImpl<Value *> &Ops, unsigned Align) {
8287   // Cast the pointer to right type.
8288   Ops[0] = CGF.Builder.CreateBitCast(Ops[0],
8289                                llvm::PointerType::getUnqual(Ops[1]->getType()));
8290 
8291   // If the mask is all ones just emit a regular store.
8292   if (const auto *C = dyn_cast<Constant>(Ops[2]))
8293     if (C->isAllOnesValue())
8294       return CGF.Builder.CreateAlignedLoad(Ops[0], Align);
8295 
8296   Value *MaskVec = getMaskVecValue(CGF, Ops[2],
8297                                    Ops[1]->getType()->getVectorNumElements());
8298 
8299   return CGF.Builder.CreateMaskedLoad(Ops[0], Align, MaskVec, Ops[1]);
8300 }
8301 
8302 static Value *EmitX86MaskLogic(CodeGenFunction &CGF, Instruction::BinaryOps Opc,
8303                               unsigned NumElts, SmallVectorImpl<Value *> &Ops,
8304                               bool InvertLHS = false) {
8305   Value *LHS = getMaskVecValue(CGF, Ops[0], NumElts);
8306   Value *RHS = getMaskVecValue(CGF, Ops[1], NumElts);
8307 
8308   if (InvertLHS)
8309     LHS = CGF.Builder.CreateNot(LHS);
8310 
8311   return CGF.Builder.CreateBitCast(CGF.Builder.CreateBinOp(Opc, LHS, RHS),
8312                                   CGF.Builder.getIntNTy(std::max(NumElts, 8U)));
8313 }
8314 
8315 static Value *EmitX86SubVectorBroadcast(CodeGenFunction &CGF,
8316                                         SmallVectorImpl<Value *> &Ops,
8317                                         llvm::Type *DstTy,
8318                                         unsigned SrcSizeInBits,
8319                                         unsigned Align) {
8320   // Load the subvector.
8321   Ops[0] = CGF.Builder.CreateAlignedLoad(Ops[0], Align);
8322 
8323   // Create broadcast mask.
8324   unsigned NumDstElts = DstTy->getVectorNumElements();
8325   unsigned NumSrcElts = SrcSizeInBits / DstTy->getScalarSizeInBits();
8326 
8327   SmallVector<uint32_t, 8> Mask;
8328   for (unsigned i = 0; i != NumDstElts; i += NumSrcElts)
8329     for (unsigned j = 0; j != NumSrcElts; ++j)
8330       Mask.push_back(j);
8331 
8332   return CGF.Builder.CreateShuffleVector(Ops[0], Ops[0], Mask, "subvecbcst");
8333 }
8334 
8335 static Value *EmitX86Select(CodeGenFunction &CGF,
8336                             Value *Mask, Value *Op0, Value *Op1) {
8337 
8338   // If the mask is all ones just return first argument.
8339   if (const auto *C = dyn_cast<Constant>(Mask))
8340     if (C->isAllOnesValue())
8341       return Op0;
8342 
8343   Mask = getMaskVecValue(CGF, Mask, Op0->getType()->getVectorNumElements());
8344 
8345   return CGF.Builder.CreateSelect(Mask, Op0, Op1);
8346 }
8347 
8348 static Value *EmitX86MaskedCompareResult(CodeGenFunction &CGF, Value *Cmp,
8349                                          unsigned NumElts, Value *MaskIn) {
8350   if (MaskIn) {
8351     const auto *C = dyn_cast<Constant>(MaskIn);
8352     if (!C || !C->isAllOnesValue())
8353       Cmp = CGF.Builder.CreateAnd(Cmp, getMaskVecValue(CGF, MaskIn, NumElts));
8354   }
8355 
8356   if (NumElts < 8) {
8357     uint32_t Indices[8];
8358     for (unsigned i = 0; i != NumElts; ++i)
8359       Indices[i] = i;
8360     for (unsigned i = NumElts; i != 8; ++i)
8361       Indices[i] = i % NumElts + NumElts;
8362     Cmp = CGF.Builder.CreateShuffleVector(
8363         Cmp, llvm::Constant::getNullValue(Cmp->getType()), Indices);
8364   }
8365 
8366   return CGF.Builder.CreateBitCast(Cmp,
8367                                    IntegerType::get(CGF.getLLVMContext(),
8368                                                     std::max(NumElts, 8U)));
8369 }
8370 
8371 static Value *EmitX86MaskedCompare(CodeGenFunction &CGF, unsigned CC,
8372                                    bool Signed, ArrayRef<Value *> Ops) {
8373   assert((Ops.size() == 2 || Ops.size() == 4) &&
8374          "Unexpected number of arguments");
8375   unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
8376   Value *Cmp;
8377 
8378   if (CC == 3) {
8379     Cmp = Constant::getNullValue(
8380                        llvm::VectorType::get(CGF.Builder.getInt1Ty(), NumElts));
8381   } else if (CC == 7) {
8382     Cmp = Constant::getAllOnesValue(
8383                        llvm::VectorType::get(CGF.Builder.getInt1Ty(), NumElts));
8384   } else {
8385     ICmpInst::Predicate Pred;
8386     switch (CC) {
8387     default: llvm_unreachable("Unknown condition code");
8388     case 0: Pred = ICmpInst::ICMP_EQ;  break;
8389     case 1: Pred = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break;
8390     case 2: Pred = Signed ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break;
8391     case 4: Pred = ICmpInst::ICMP_NE;  break;
8392     case 5: Pred = Signed ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break;
8393     case 6: Pred = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break;
8394     }
8395     Cmp = CGF.Builder.CreateICmp(Pred, Ops[0], Ops[1]);
8396   }
8397 
8398   Value *MaskIn = nullptr;
8399   if (Ops.size() == 4)
8400     MaskIn = Ops[3];
8401 
8402   return EmitX86MaskedCompareResult(CGF, Cmp, NumElts, MaskIn);
8403 }
8404 
8405 static Value *EmitX86ConvertToMask(CodeGenFunction &CGF, Value *In) {
8406   Value *Zero = Constant::getNullValue(In->getType());
8407   return EmitX86MaskedCompare(CGF, 1, true, { In, Zero });
8408 }
8409 
8410 static Value *EmitX86Abs(CodeGenFunction &CGF, ArrayRef<Value *> Ops) {
8411 
8412   llvm::Type *Ty = Ops[0]->getType();
8413   Value *Zero = llvm::Constant::getNullValue(Ty);
8414   Value *Sub = CGF.Builder.CreateSub(Zero, Ops[0]);
8415   Value *Cmp = CGF.Builder.CreateICmp(ICmpInst::ICMP_SGT, Ops[0], Zero);
8416   Value *Res = CGF.Builder.CreateSelect(Cmp, Ops[0], Sub);
8417   if (Ops.size() == 1)
8418     return Res;
8419   return EmitX86Select(CGF, Ops[2], Res, Ops[1]);
8420 }
8421 
8422 static Value *EmitX86MinMax(CodeGenFunction &CGF, ICmpInst::Predicate Pred,
8423                             ArrayRef<Value *> Ops) {
8424   Value *Cmp = CGF.Builder.CreateICmp(Pred, Ops[0], Ops[1]);
8425   Value *Res = CGF.Builder.CreateSelect(Cmp, Ops[0], Ops[1]);
8426 
8427   if (Ops.size() == 2)
8428     return Res;
8429 
8430   assert(Ops.size() == 4);
8431   return EmitX86Select(CGF, Ops[3], Res, Ops[2]);
8432 }
8433 
8434 static Value *EmitX86Muldq(CodeGenFunction &CGF, bool IsSigned,
8435                            ArrayRef<Value *> Ops) {
8436   llvm::Type *Ty = Ops[0]->getType();
8437   // Arguments have a vXi32 type so cast to vXi64.
8438   Ty = llvm::VectorType::get(CGF.Int64Ty,
8439                              Ty->getPrimitiveSizeInBits() / 64);
8440   Value *LHS = CGF.Builder.CreateBitCast(Ops[0], Ty);
8441   Value *RHS = CGF.Builder.CreateBitCast(Ops[1], Ty);
8442 
8443   if (IsSigned) {
8444     // Shift left then arithmetic shift right.
8445     Constant *ShiftAmt = ConstantInt::get(Ty, 32);
8446     LHS = CGF.Builder.CreateShl(LHS, ShiftAmt);
8447     LHS = CGF.Builder.CreateAShr(LHS, ShiftAmt);
8448     RHS = CGF.Builder.CreateShl(RHS, ShiftAmt);
8449     RHS = CGF.Builder.CreateAShr(RHS, ShiftAmt);
8450   } else {
8451     // Clear the upper bits.
8452     Constant *Mask = ConstantInt::get(Ty, 0xffffffff);
8453     LHS = CGF.Builder.CreateAnd(LHS, Mask);
8454     RHS = CGF.Builder.CreateAnd(RHS, Mask);
8455   }
8456 
8457   return CGF.Builder.CreateMul(LHS, RHS);
8458 }
8459 
8460 static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op,
8461                               llvm::Type *DstTy) {
8462   unsigned NumberOfElements = DstTy->getVectorNumElements();
8463   Value *Mask = getMaskVecValue(CGF, Op, NumberOfElements);
8464   return CGF.Builder.CreateSExt(Mask, DstTy, "vpmovm2");
8465 }
8466 
8467 Value *CodeGenFunction::EmitX86CpuIs(const CallExpr *E) {
8468   const Expr *CPUExpr = E->getArg(0)->IgnoreParenCasts();
8469   StringRef CPUStr = cast<clang::StringLiteral>(CPUExpr)->getString();
8470   return EmitX86CpuIs(CPUStr);
8471 }
8472 
8473 Value *CodeGenFunction::EmitX86CpuIs(StringRef CPUStr) {
8474 
8475   llvm::Type *Int32Ty = Builder.getInt32Ty();
8476 
8477   // Matching the struct layout from the compiler-rt/libgcc structure that is
8478   // filled in:
8479   // unsigned int __cpu_vendor;
8480   // unsigned int __cpu_type;
8481   // unsigned int __cpu_subtype;
8482   // unsigned int __cpu_features[1];
8483   llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty,
8484                                           llvm::ArrayType::get(Int32Ty, 1));
8485 
8486   // Grab the global __cpu_model.
8487   llvm::Constant *CpuModel = CGM.CreateRuntimeVariable(STy, "__cpu_model");
8488 
8489   // Calculate the index needed to access the correct field based on the
8490   // range. Also adjust the expected value.
8491   unsigned Index;
8492   unsigned Value;
8493   std::tie(Index, Value) = StringSwitch<std::pair<unsigned, unsigned>>(CPUStr)
8494 #define X86_VENDOR(ENUM, STRING)                                               \
8495   .Case(STRING, {0u, static_cast<unsigned>(llvm::X86::ENUM)})
8496 #define X86_CPU_TYPE_COMPAT_WITH_ALIAS(ARCHNAME, ENUM, STR, ALIAS)             \
8497   .Cases(STR, ALIAS, {1u, static_cast<unsigned>(llvm::X86::ENUM)})
8498 #define X86_CPU_TYPE_COMPAT(ARCHNAME, ENUM, STR)                               \
8499   .Case(STR, {1u, static_cast<unsigned>(llvm::X86::ENUM)})
8500 #define X86_CPU_SUBTYPE_COMPAT(ARCHNAME, ENUM, STR)                            \
8501   .Case(STR, {2u, static_cast<unsigned>(llvm::X86::ENUM)})
8502 #include "llvm/Support/X86TargetParser.def"
8503                                .Default({0, 0});
8504   assert(Value != 0 && "Invalid CPUStr passed to CpuIs");
8505 
8506   // Grab the appropriate field from __cpu_model.
8507   llvm::Value *Idxs[] = {ConstantInt::get(Int32Ty, 0),
8508                          ConstantInt::get(Int32Ty, Index)};
8509   llvm::Value *CpuValue = Builder.CreateGEP(STy, CpuModel, Idxs);
8510   CpuValue = Builder.CreateAlignedLoad(CpuValue, CharUnits::fromQuantity(4));
8511 
8512   // Check the value of the field against the requested value.
8513   return Builder.CreateICmpEQ(CpuValue,
8514                                   llvm::ConstantInt::get(Int32Ty, Value));
8515 }
8516 
8517 Value *CodeGenFunction::EmitX86CpuSupports(const CallExpr *E) {
8518   const Expr *FeatureExpr = E->getArg(0)->IgnoreParenCasts();
8519   StringRef FeatureStr = cast<StringLiteral>(FeatureExpr)->getString();
8520   return EmitX86CpuSupports(FeatureStr);
8521 }
8522 
8523 Value *CodeGenFunction::EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs) {
8524   // Processor features and mapping to processor feature value.
8525 
8526   uint32_t FeaturesMask = 0;
8527 
8528   for (const StringRef &FeatureStr : FeatureStrs) {
8529     unsigned Feature =
8530         StringSwitch<unsigned>(FeatureStr)
8531 #define X86_FEATURE_COMPAT(VAL, ENUM, STR) .Case(STR, VAL)
8532 #include "llvm/Support/X86TargetParser.def"
8533         ;
8534     FeaturesMask |= (1U << Feature);
8535   }
8536 
8537   // Matching the struct layout from the compiler-rt/libgcc structure that is
8538   // filled in:
8539   // unsigned int __cpu_vendor;
8540   // unsigned int __cpu_type;
8541   // unsigned int __cpu_subtype;
8542   // unsigned int __cpu_features[1];
8543   llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty,
8544                                           llvm::ArrayType::get(Int32Ty, 1));
8545 
8546   // Grab the global __cpu_model.
8547   llvm::Constant *CpuModel = CGM.CreateRuntimeVariable(STy, "__cpu_model");
8548 
8549   // Grab the first (0th) element from the field __cpu_features off of the
8550   // global in the struct STy.
8551   Value *Idxs[] = {ConstantInt::get(Int32Ty, 0), ConstantInt::get(Int32Ty, 3),
8552                    ConstantInt::get(Int32Ty, 0)};
8553   Value *CpuFeatures = Builder.CreateGEP(STy, CpuModel, Idxs);
8554   Value *Features =
8555       Builder.CreateAlignedLoad(CpuFeatures, CharUnits::fromQuantity(4));
8556 
8557   // Check the value of the bit corresponding to the feature requested.
8558   Value *Bitset = Builder.CreateAnd(
8559       Features, llvm::ConstantInt::get(Int32Ty, FeaturesMask));
8560   return Builder.CreateICmpNE(Bitset, llvm::ConstantInt::get(Int32Ty, 0));
8561 }
8562 
8563 Value *CodeGenFunction::EmitX86CpuInit() {
8564   llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy,
8565                                                     /*Variadic*/ false);
8566   llvm::Constant *Func = CGM.CreateRuntimeFunction(FTy, "__cpu_indicator_init");
8567   return Builder.CreateCall(Func);
8568 }
8569 
8570 Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
8571                                            const CallExpr *E) {
8572   if (BuiltinID == X86::BI__builtin_cpu_is)
8573     return EmitX86CpuIs(E);
8574   if (BuiltinID == X86::BI__builtin_cpu_supports)
8575     return EmitX86CpuSupports(E);
8576   if (BuiltinID == X86::BI__builtin_cpu_init)
8577     return EmitX86CpuInit();
8578 
8579   SmallVector<Value*, 4> Ops;
8580 
8581   // Find out if any arguments are required to be integer constant expressions.
8582   unsigned ICEArguments = 0;
8583   ASTContext::GetBuiltinTypeError Error;
8584   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
8585   assert(Error == ASTContext::GE_None && "Should not codegen an error");
8586 
8587   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
8588     // If this is a normal argument, just emit it as a scalar.
8589     if ((ICEArguments & (1 << i)) == 0) {
8590       Ops.push_back(EmitScalarExpr(E->getArg(i)));
8591       continue;
8592     }
8593 
8594     // If this is required to be a constant, constant fold it so that we know
8595     // that the generated intrinsic gets a ConstantInt.
8596     llvm::APSInt Result;
8597     bool IsConst = E->getArg(i)->isIntegerConstantExpr(Result, getContext());
8598     assert(IsConst && "Constant arg isn't actually constant?"); (void)IsConst;
8599     Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), Result));
8600   }
8601 
8602   // These exist so that the builtin that takes an immediate can be bounds
8603   // checked by clang to avoid passing bad immediates to the backend. Since
8604   // AVX has a larger immediate than SSE we would need separate builtins to
8605   // do the different bounds checking. Rather than create a clang specific
8606   // SSE only builtin, this implements eight separate builtins to match gcc
8607   // implementation.
8608   auto getCmpIntrinsicCall = [this, &Ops](Intrinsic::ID ID, unsigned Imm) {
8609     Ops.push_back(llvm::ConstantInt::get(Int8Ty, Imm));
8610     llvm::Function *F = CGM.getIntrinsic(ID);
8611     return Builder.CreateCall(F, Ops);
8612   };
8613 
8614   // For the vector forms of FP comparisons, translate the builtins directly to
8615   // IR.
8616   // TODO: The builtins could be removed if the SSE header files used vector
8617   // extension comparisons directly (vector ordered/unordered may need
8618   // additional support via __builtin_isnan()).
8619   auto getVectorFCmpIR = [this, &Ops](CmpInst::Predicate Pred) {
8620     Value *Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]);
8621     llvm::VectorType *FPVecTy = cast<llvm::VectorType>(Ops[0]->getType());
8622     llvm::VectorType *IntVecTy = llvm::VectorType::getInteger(FPVecTy);
8623     Value *Sext = Builder.CreateSExt(Cmp, IntVecTy);
8624     return Builder.CreateBitCast(Sext, FPVecTy);
8625   };
8626 
8627   switch (BuiltinID) {
8628   default: return nullptr;
8629   case X86::BI_mm_prefetch: {
8630     Value *Address = Ops[0];
8631     ConstantInt *C = cast<ConstantInt>(Ops[1]);
8632     Value *RW = ConstantInt::get(Int32Ty, (C->getZExtValue() >> 2) & 0x1);
8633     Value *Locality = ConstantInt::get(Int32Ty, C->getZExtValue() & 0x3);
8634     Value *Data = ConstantInt::get(Int32Ty, 1);
8635     Value *F = CGM.getIntrinsic(Intrinsic::prefetch);
8636     return Builder.CreateCall(F, {Address, RW, Locality, Data});
8637   }
8638   case X86::BI_mm_clflush: {
8639     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_clflush),
8640                               Ops[0]);
8641   }
8642   case X86::BI_mm_lfence: {
8643     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_lfence));
8644   }
8645   case X86::BI_mm_mfence: {
8646     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_mfence));
8647   }
8648   case X86::BI_mm_sfence: {
8649     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_sfence));
8650   }
8651   case X86::BI_mm_pause: {
8652     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_pause));
8653   }
8654   case X86::BI__rdtsc: {
8655     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_rdtsc));
8656   }
8657   case X86::BI__builtin_ia32_undef128:
8658   case X86::BI__builtin_ia32_undef256:
8659   case X86::BI__builtin_ia32_undef512:
8660     // The x86 definition of "undef" is not the same as the LLVM definition
8661     // (PR32176). We leave optimizing away an unnecessary zero constant to the
8662     // IR optimizer and backend.
8663     // TODO: If we had a "freeze" IR instruction to generate a fixed undef
8664     // value, we should use that here instead of a zero.
8665     return llvm::Constant::getNullValue(ConvertType(E->getType()));
8666   case X86::BI__builtin_ia32_vec_init_v8qi:
8667   case X86::BI__builtin_ia32_vec_init_v4hi:
8668   case X86::BI__builtin_ia32_vec_init_v2si:
8669     return Builder.CreateBitCast(BuildVector(Ops),
8670                                  llvm::Type::getX86_MMXTy(getLLVMContext()));
8671   case X86::BI__builtin_ia32_vec_ext_v2si:
8672     return Builder.CreateExtractElement(Ops[0],
8673                                   llvm::ConstantInt::get(Ops[1]->getType(), 0));
8674   case X86::BI_mm_setcsr:
8675   case X86::BI__builtin_ia32_ldmxcsr: {
8676     Address Tmp = CreateMemTemp(E->getArg(0)->getType());
8677     Builder.CreateStore(Ops[0], Tmp);
8678     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_ldmxcsr),
8679                           Builder.CreateBitCast(Tmp.getPointer(), Int8PtrTy));
8680   }
8681   case X86::BI_mm_getcsr:
8682   case X86::BI__builtin_ia32_stmxcsr: {
8683     Address Tmp = CreateMemTemp(E->getType());
8684     Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_stmxcsr),
8685                        Builder.CreateBitCast(Tmp.getPointer(), Int8PtrTy));
8686     return Builder.CreateLoad(Tmp, "stmxcsr");
8687   }
8688   case X86::BI__builtin_ia32_xsave:
8689   case X86::BI__builtin_ia32_xsave64:
8690   case X86::BI__builtin_ia32_xrstor:
8691   case X86::BI__builtin_ia32_xrstor64:
8692   case X86::BI__builtin_ia32_xsaveopt:
8693   case X86::BI__builtin_ia32_xsaveopt64:
8694   case X86::BI__builtin_ia32_xrstors:
8695   case X86::BI__builtin_ia32_xrstors64:
8696   case X86::BI__builtin_ia32_xsavec:
8697   case X86::BI__builtin_ia32_xsavec64:
8698   case X86::BI__builtin_ia32_xsaves:
8699   case X86::BI__builtin_ia32_xsaves64: {
8700     Intrinsic::ID ID;
8701 #define INTRINSIC_X86_XSAVE_ID(NAME) \
8702     case X86::BI__builtin_ia32_##NAME: \
8703       ID = Intrinsic::x86_##NAME; \
8704       break
8705     switch (BuiltinID) {
8706     default: llvm_unreachable("Unsupported intrinsic!");
8707     INTRINSIC_X86_XSAVE_ID(xsave);
8708     INTRINSIC_X86_XSAVE_ID(xsave64);
8709     INTRINSIC_X86_XSAVE_ID(xrstor);
8710     INTRINSIC_X86_XSAVE_ID(xrstor64);
8711     INTRINSIC_X86_XSAVE_ID(xsaveopt);
8712     INTRINSIC_X86_XSAVE_ID(xsaveopt64);
8713     INTRINSIC_X86_XSAVE_ID(xrstors);
8714     INTRINSIC_X86_XSAVE_ID(xrstors64);
8715     INTRINSIC_X86_XSAVE_ID(xsavec);
8716     INTRINSIC_X86_XSAVE_ID(xsavec64);
8717     INTRINSIC_X86_XSAVE_ID(xsaves);
8718     INTRINSIC_X86_XSAVE_ID(xsaves64);
8719     }
8720 #undef INTRINSIC_X86_XSAVE_ID
8721     Value *Mhi = Builder.CreateTrunc(
8722       Builder.CreateLShr(Ops[1], ConstantInt::get(Int64Ty, 32)), Int32Ty);
8723     Value *Mlo = Builder.CreateTrunc(Ops[1], Int32Ty);
8724     Ops[1] = Mhi;
8725     Ops.push_back(Mlo);
8726     return Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
8727   }
  case X86::BI__builtin_ia32_storedqudi128_mask:
  case X86::BI__builtin_ia32_storedqusi128_mask:
  case X86::BI__builtin_ia32_storedquhi128_mask:
  case X86::BI__builtin_ia32_storedquqi128_mask:
  case X86::BI__builtin_ia32_storeupd128_mask:
  case X86::BI__builtin_ia32_storeups128_mask:
  case X86::BI__builtin_ia32_storedqudi256_mask:
  case X86::BI__builtin_ia32_storedqusi256_mask:
  case X86::BI__builtin_ia32_storedquhi256_mask:
  case X86::BI__builtin_ia32_storedquqi256_mask:
  case X86::BI__builtin_ia32_storeupd256_mask:
  case X86::BI__builtin_ia32_storeups256_mask:
  case X86::BI__builtin_ia32_storedqudi512_mask:
  case X86::BI__builtin_ia32_storedqusi512_mask:
  case X86::BI__builtin_ia32_storedquhi512_mask:
  case X86::BI__builtin_ia32_storedquqi512_mask:
  case X86::BI__builtin_ia32_storeupd512_mask:
  case X86::BI__builtin_ia32_storeups512_mask:
    // All of these share one lowering. The trailing argument is the alignment
    // handed to the masked-store helper; 1 means nothing beyond byte alignment
    // is assumed for the destination.
    return EmitX86MaskedStore(*this, Ops, 1);
8747 
8748   case X86::BI__builtin_ia32_storess128_mask:
8749   case X86::BI__builtin_ia32_storesd128_mask: {
8750     return EmitX86MaskedStore(*this, Ops, 16);
8751   }
8752   case X86::BI__builtin_ia32_vpopcntb_128:
8753   case X86::BI__builtin_ia32_vpopcntd_128:
8754   case X86::BI__builtin_ia32_vpopcntq_128:
8755   case X86::BI__builtin_ia32_vpopcntw_128:
8756   case X86::BI__builtin_ia32_vpopcntb_256:
8757   case X86::BI__builtin_ia32_vpopcntd_256:
8758   case X86::BI__builtin_ia32_vpopcntq_256:
8759   case X86::BI__builtin_ia32_vpopcntw_256:
8760   case X86::BI__builtin_ia32_vpopcntb_512:
8761   case X86::BI__builtin_ia32_vpopcntd_512:
8762   case X86::BI__builtin_ia32_vpopcntq_512:
8763   case X86::BI__builtin_ia32_vpopcntw_512: {
8764     llvm::Type *ResultType = ConvertType(E->getType());
8765     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ResultType);
8766     return Builder.CreateCall(F, Ops);
8767   }
  case X86::BI__builtin_ia32_cvtmask2b128:
  case X86::BI__builtin_ia32_cvtmask2b256:
  case X86::BI__builtin_ia32_cvtmask2b512:
  case X86::BI__builtin_ia32_cvtmask2w128:
  case X86::BI__builtin_ia32_cvtmask2w256:
  case X86::BI__builtin_ia32_cvtmask2w512:
  case X86::BI__builtin_ia32_cvtmask2d128:
  case X86::BI__builtin_ia32_cvtmask2d256:
  case X86::BI__builtin_ia32_cvtmask2d512:
  case X86::BI__builtin_ia32_cvtmask2q128:
  case X86::BI__builtin_ia32_cvtmask2q256:
  case X86::BI__builtin_ia32_cvtmask2q512:
    // Mask -> vector: expand the mask operand to the builtin's result type.
    // NOTE(review): going by the helper's name each mask bit is sign-extended
    // into a full element -- confirm against EmitX86SExtMask.
    return EmitX86SExtMask(*this, Ops[0], ConvertType(E->getType()));

  case X86::BI__builtin_ia32_cvtb2mask128:
  case X86::BI__builtin_ia32_cvtb2mask256:
  case X86::BI__builtin_ia32_cvtb2mask512:
  case X86::BI__builtin_ia32_cvtw2mask128:
  case X86::BI__builtin_ia32_cvtw2mask256:
  case X86::BI__builtin_ia32_cvtw2mask512:
  case X86::BI__builtin_ia32_cvtd2mask128:
  case X86::BI__builtin_ia32_cvtd2mask256:
  case X86::BI__builtin_ia32_cvtd2mask512:
  case X86::BI__builtin_ia32_cvtq2mask128:
  case X86::BI__builtin_ia32_cvtq2mask256:
  case X86::BI__builtin_ia32_cvtq2mask512:
    // Vector -> mask: the opposite direction, handled entirely by the helper.
    return EmitX86ConvertToMask(*this, Ops[0]);
8795 
8796   case X86::BI__builtin_ia32_movdqa32store128_mask:
8797   case X86::BI__builtin_ia32_movdqa64store128_mask:
8798   case X86::BI__builtin_ia32_storeaps128_mask:
8799   case X86::BI__builtin_ia32_storeapd128_mask:
8800   case X86::BI__builtin_ia32_movdqa32store256_mask:
8801   case X86::BI__builtin_ia32_movdqa64store256_mask:
8802   case X86::BI__builtin_ia32_storeaps256_mask:
8803   case X86::BI__builtin_ia32_storeapd256_mask:
8804   case X86::BI__builtin_ia32_movdqa32store512_mask:
8805   case X86::BI__builtin_ia32_movdqa64store512_mask:
8806   case X86::BI__builtin_ia32_storeaps512_mask:
8807   case X86::BI__builtin_ia32_storeapd512_mask: {
8808     unsigned Align =
8809       getContext().getTypeAlignInChars(E->getArg(1)->getType()).getQuantity();
8810     return EmitX86MaskedStore(*this, Ops, Align);
8811   }
  case X86::BI__builtin_ia32_loadups128_mask:
  case X86::BI__builtin_ia32_loadups256_mask:
  case X86::BI__builtin_ia32_loadups512_mask:
  case X86::BI__builtin_ia32_loadupd128_mask:
  case X86::BI__builtin_ia32_loadupd256_mask:
  case X86::BI__builtin_ia32_loadupd512_mask:
  case X86::BI__builtin_ia32_loaddquqi128_mask:
  case X86::BI__builtin_ia32_loaddquqi256_mask:
  case X86::BI__builtin_ia32_loaddquqi512_mask:
  case X86::BI__builtin_ia32_loaddquhi128_mask:
  case X86::BI__builtin_ia32_loaddquhi256_mask:
  case X86::BI__builtin_ia32_loaddquhi512_mask:
  case X86::BI__builtin_ia32_loaddqusi128_mask:
  case X86::BI__builtin_ia32_loaddqusi256_mask:
  case X86::BI__builtin_ia32_loaddqusi512_mask:
  case X86::BI__builtin_ia32_loaddqudi128_mask:
  case X86::BI__builtin_ia32_loaddqudi256_mask:
  case X86::BI__builtin_ia32_loaddqudi512_mask:
    // All of these share one lowering. The trailing argument is the alignment
    // handed to the masked-load helper; 1 means nothing beyond byte alignment
    // is assumed for the source.
    return EmitX86MaskedLoad(*this, Ops, 1);
8831 
8832   case X86::BI__builtin_ia32_loadss128_mask:
8833   case X86::BI__builtin_ia32_loadsd128_mask:
8834     return EmitX86MaskedLoad(*this, Ops, 16);
8835 
8836   case X86::BI__builtin_ia32_loadaps128_mask:
8837   case X86::BI__builtin_ia32_loadaps256_mask:
8838   case X86::BI__builtin_ia32_loadaps512_mask:
8839   case X86::BI__builtin_ia32_loadapd128_mask:
8840   case X86::BI__builtin_ia32_loadapd256_mask:
8841   case X86::BI__builtin_ia32_loadapd512_mask:
8842   case X86::BI__builtin_ia32_movdqa32load128_mask:
8843   case X86::BI__builtin_ia32_movdqa32load256_mask:
8844   case X86::BI__builtin_ia32_movdqa32load512_mask:
8845   case X86::BI__builtin_ia32_movdqa64load128_mask:
8846   case X86::BI__builtin_ia32_movdqa64load256_mask:
8847   case X86::BI__builtin_ia32_movdqa64load512_mask: {
8848     unsigned Align =
8849       getContext().getTypeAlignInChars(E->getArg(1)->getType()).getQuantity();
8850     return EmitX86MaskedLoad(*this, Ops, Align);
8851   }
8852 
  case X86::BI__builtin_ia32_vbroadcastf128_pd256:
  case X86::BI__builtin_ia32_vbroadcastf128_ps256: {
    // Broadcast a sub-vector into the builtin's result type.
    // NOTE(review): 128 and 1 look like the source width in bits and the
    // assumed alignment -- confirm against EmitX86SubVectorBroadcast.
    llvm::Type *DstTy = ConvertType(E->getType());
    return EmitX86SubVectorBroadcast(*this, Ops, DstTy, 128, 1);
  }
8858 
  case X86::BI__builtin_ia32_storehps:
  case X86::BI__builtin_ia32_storelps: {
    // Store one 64-bit half of the vector operand (Ops[1]) through the
    // pointer operand (Ops[0]).
    llvm::Type *PtrTy = llvm::PointerType::getUnqual(Int64Ty);
    llvm::Type *VecTy = llvm::VectorType::get(Int64Ty, 2);

    // Reinterpret the value as <2 x i64> so that each half is one element.
    Ops[1] = Builder.CreateBitCast(Ops[1], VecTy, "cast");

    // storelps takes element 0, storehps takes element 1.
    unsigned Index = BuiltinID == X86::BI__builtin_ia32_storelps ? 0 : 1;
    llvm::Value *Idx = llvm::ConstantInt::get(SizeTy, Index);
    Ops[1] = Builder.CreateExtractElement(Ops[1], Idx, "extract");

    // Cast the pointer to i64* and store the extracted half through it.
    Ops[0] = Builder.CreateBitCast(Ops[0], PtrTy);
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case X86::BI__builtin_ia32_palignr128:
  case X86::BI__builtin_ia32_palignr256:
  case X86::BI__builtin_ia32_palignr512_mask: {
    // Lower palignr to a shufflevector. Ops[2] is required to be a
    // ConstantInt holding the shift amount in elements.
    unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();

    // The index computation below works on whole 16-element lanes.
    unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
    assert(NumElts % 16 == 0);

    // If palignr is shifting the pair of vectors more than the size of two
    // lanes, emit zero.
    if (ShiftVal >= 32)
      return llvm::Constant::getNullValue(ConvertType(E->getType()));

    // If palignr is shifting the pair of input vectors more than one lane,
    // but less than two lanes, convert to shifting in zeroes.
    if (ShiftVal > 16) {
      ShiftVal -= 16;
      Ops[1] = Ops[0];
      Ops[0] = llvm::Constant::getNullValue(Ops[0]->getType());
    }

    // One shuffle index per result element.
    // NOTE(review): 64 entries presumably covers the widest (512-bit) form.
    uint32_t Indices[64];
    // 256-bit palignr operates on 128-bit lanes so we need to handle that
    for (unsigned l = 0; l != NumElts; l += 16) {
      for (unsigned i = 0; i != 16; ++i) {
        // Ops[1] is the first shuffle input and Ops[0] the second, so an index
        // that runs off the end of the lane is bumped by NumElts - 16 to land
        // in the same lane of Ops[0].
        unsigned Idx = ShiftVal + i;
        if (Idx >= 16)
          Idx += NumElts - 16; // End of lane, switch operand.
        Indices[l + i] = Idx + l;
      }
    }

    Value *Align = Builder.CreateShuffleVector(Ops[1], Ops[0],
                                               makeArrayRef(Indices, NumElts),
                                               "palignr");

    // If this isn't a masked builtin, just return the align operation.
    if (Ops.size() == 3)
      return Align;

    // Masked form: blend the shuffle result with Ops[3] under the control of
    // Ops[4].
    return EmitX86Select(*this, Ops[4], Align, Ops[3]);
  }
8918 
  case X86::BI__builtin_ia32_vperm2f128_pd256:
  case X86::BI__builtin_ia32_vperm2f128_ps256:
  case X86::BI__builtin_ia32_vperm2f128_si256:
  case X86::BI__builtin_ia32_permti256: {
    // Lower the two-lane permute to a single shufflevector. Ops[2] is
    // required to be a ConstantInt; its low nibble controls result lane 0 and
    // its high nibble controls result lane 1.
    unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
    unsigned NumElts = Ops[0]->getType()->getVectorNumElements();

    // This takes a very simple approach since there are two lanes and a
    // shuffle can have 2 inputs. So we reserve the first input for the first
    // lane and the second input for the second lane. This may result in
    // duplicate sources, but this can be dealt with in the backend.

    Value *OutOps[2];
    uint32_t Indices[8];
    for (unsigned l = 0; l != 2; ++l) {
      // Determine the source for this lane.
      // Bit 3 of the nibble zeroes the lane; otherwise bit 1 picks Ops[1]
      // over Ops[0].
      if (Imm & (1 << ((l * 4) + 3)))
        OutOps[l] = llvm::ConstantAggregateZero::get(Ops[0]->getType());
      else if (Imm & (1 << ((l * 4) + 1)))
        OutOps[l] = Ops[1];
      else
        OutOps[l] = Ops[0];

      for (unsigned i = 0; i != NumElts/2; ++i) {
        // Start with ith element of the source for this lane.
        // (l * NumElts offsets into shuffle input l, i.e. OutOps[l].)
        unsigned Idx = (l * NumElts) + i;
        // If bit 0 of the immediate half is set, switch to the high half of
        // the source.
        if (Imm & (1 << (l * 4)))
          Idx += NumElts/2;
        Indices[(l * (NumElts/2)) + i] = Idx;
      }
    }

    return Builder.CreateShuffleVector(OutOps[0], OutOps[1],
                                       makeArrayRef(Indices, NumElts),
                                       "vperm");
  }
8957 
  case X86::BI__builtin_ia32_movnti:
  case X86::BI__builtin_ia32_movnti64:
  case X86::BI__builtin_ia32_movntsd:
  case X86::BI__builtin_ia32_movntss: {
    // Emit a scalar store tagged with !nontemporal metadata. The metadata
    // node carries the i32 constant 1.
    llvm::MDNode *Node = llvm::MDNode::get(
        getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));

    Value *Ptr = Ops[0];
    Value *Src = Ops[1];

    // Extract the 0'th element of the source vector.
    // (Only movntsd/movntss take a vector; movnti/movnti64 store Src as is.)
    if (BuiltinID == X86::BI__builtin_ia32_movntsd ||
        BuiltinID == X86::BI__builtin_ia32_movntss)
      Src = Builder.CreateExtractElement(Src, (uint64_t)0, "extract");

    // Convert the type of the pointer to a pointer to the stored type.
    Value *BC = Builder.CreateBitCast(
        Ptr, llvm::PointerType::getUnqual(Src->getType()), "cast");

    // Unaligned nontemporal store of the scalar value.
    // The store is created with the default alignment and then explicitly
    // downgraded to 1.
    StoreInst *SI = Builder.CreateDefaultAlignedStore(Src, BC);
    SI->setMetadata(CGM.getModule().getMDKindID("nontemporal"), Node);
    SI->setAlignment(1);
    return SI;
  }
8983 
  case X86::BI__builtin_ia32_selectb_128:
  case X86::BI__builtin_ia32_selectb_256:
  case X86::BI__builtin_ia32_selectb_512:
  case X86::BI__builtin_ia32_selectw_128:
  case X86::BI__builtin_ia32_selectw_256:
  case X86::BI__builtin_ia32_selectw_512:
  case X86::BI__builtin_ia32_selectd_128:
  case X86::BI__builtin_ia32_selectd_256:
  case X86::BI__builtin_ia32_selectd_512:
  case X86::BI__builtin_ia32_selectq_128:
  case X86::BI__builtin_ia32_selectq_256:
  case X86::BI__builtin_ia32_selectq_512:
  case X86::BI__builtin_ia32_selectps_128:
  case X86::BI__builtin_ia32_selectps_256:
  case X86::BI__builtin_ia32_selectps_512:
  case X86::BI__builtin_ia32_selectpd_128:
  case X86::BI__builtin_ia32_selectpd_256:
  case X86::BI__builtin_ia32_selectpd_512:
    // The three builtin operands are forwarded to the select helper in order.
    // NOTE(review): presumably (mask, value-if-set, value-if-clear) -- confirm
    // against EmitX86Select.
    return EmitX86Select(*this, Ops[0], Ops[1], Ops[2]);
  case X86::BI__builtin_ia32_cmpb128_mask:
  case X86::BI__builtin_ia32_cmpb256_mask:
  case X86::BI__builtin_ia32_cmpb512_mask:
  case X86::BI__builtin_ia32_cmpw128_mask:
  case X86::BI__builtin_ia32_cmpw256_mask:
  case X86::BI__builtin_ia32_cmpw512_mask:
  case X86::BI__builtin_ia32_cmpd128_mask:
  case X86::BI__builtin_ia32_cmpd256_mask:
  case X86::BI__builtin_ia32_cmpd512_mask:
  case X86::BI__builtin_ia32_cmpq128_mask:
  case X86::BI__builtin_ia32_cmpq256_mask:
  case X86::BI__builtin_ia32_cmpq512_mask: {
    // Only the low three bits of the constant immediate (Ops[2]) are used as
    // the condition code.
    // NOTE(review): the `true` flag presumably selects a signed comparison --
    // confirm against EmitX86MaskedCompare.
    unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
    return EmitX86MaskedCompare(*this, CC, true, Ops);
  }
  case X86::BI__builtin_ia32_ucmpb128_mask:
  case X86::BI__builtin_ia32_ucmpb256_mask:
  case X86::BI__builtin_ia32_ucmpb512_mask:
  case X86::BI__builtin_ia32_ucmpw128_mask:
  case X86::BI__builtin_ia32_ucmpw256_mask:
  case X86::BI__builtin_ia32_ucmpw512_mask:
  case X86::BI__builtin_ia32_ucmpd128_mask:
  case X86::BI__builtin_ia32_ucmpd256_mask:
  case X86::BI__builtin_ia32_ucmpd512_mask:
  case X86::BI__builtin_ia32_ucmpq128_mask:
  case X86::BI__builtin_ia32_ucmpq256_mask:
  case X86::BI__builtin_ia32_ucmpq512_mask: {
    // Same as the group above, but with the flag set to `false` for the
    // ucmp* builtins.
    unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
    return EmitX86MaskedCompare(*this, CC, false, Ops);
  }
9033 
9034   case X86::BI__builtin_ia32_kortestchi:
9035   case X86::BI__builtin_ia32_kortestzhi: {
9036     Value *Or = EmitX86MaskLogic(*this, Instruction::Or, 16, Ops);
9037     Value *C;
9038     if (BuiltinID == X86::BI__builtin_ia32_kortestchi)
9039       C = llvm::Constant::getAllOnesValue(Builder.getInt16Ty());
9040     else
9041       C = llvm::Constant::getNullValue(Builder.getInt16Ty());
9042     Value *Cmp = Builder.CreateICmpEQ(Or, C);
9043     return Builder.CreateZExt(Cmp, ConvertType(E->getType()));
9044   }
9045 
  // 16-bit mask-register logic. Each builtin forwards to the shared helper
  // with the matching binary opcode.
  // NOTE(review): the trailing `true` on kandn/kxnor is presumably an
  // "invert" flag -- confirm against EmitX86MaskLogic.
  case X86::BI__builtin_ia32_kandhi:
    return EmitX86MaskLogic(*this, Instruction::And, 16, Ops);
  case X86::BI__builtin_ia32_kandnhi:
    return EmitX86MaskLogic(*this, Instruction::And, 16, Ops, true);
  case X86::BI__builtin_ia32_korhi:
    return EmitX86MaskLogic(*this, Instruction::Or, 16, Ops);
  case X86::BI__builtin_ia32_kxnorhi:
    return EmitX86MaskLogic(*this, Instruction::Xor, 16, Ops, true);
  case X86::BI__builtin_ia32_kxorhi:
    return EmitX86MaskLogic(*this, Instruction::Xor, 16, Ops);
  case X86::BI__builtin_ia32_knothi: {
    // Convert the mask to its vector form, invert it, and bitcast the result
    // back to i16.
    Ops[0] = getMaskVecValue(*this, Ops[0], 16);
    return Builder.CreateBitCast(Builder.CreateNot(Ops[0]),
                                 Builder.getInt16Ty());
  }
9061 
  case X86::BI__builtin_ia32_kunpckdi:
  case X86::BI__builtin_ia32_kunpcksi:
  case X86::BI__builtin_ia32_kunpckhi: {
    // The number of mask elements equals the bit width of the operand type.
    unsigned NumElts = Ops[0]->getType()->getScalarSizeInBits();
    Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
    Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
    // Identity permutation 0..NumElts-1, reused by all three shuffles below.
    uint32_t Indices[64];
    for (unsigned i = 0; i != NumElts; ++i)
      Indices[i] = i;

    // First extract half of each vector. This gives better codegen than
    // doing it in a single shuffle.
    LHS = Builder.CreateShuffleVector(LHS, LHS,
                                      makeArrayRef(Indices, NumElts / 2));
    RHS = Builder.CreateShuffleVector(RHS, RHS,
                                      makeArrayRef(Indices, NumElts / 2));
    // Concat the vectors.
    // NOTE: Operands are swapped to match the intrinsic definition.
    // (The low half of the result comes from RHS, the high half from LHS.)
    Value *Res = Builder.CreateShuffleVector(RHS, LHS,
                                             makeArrayRef(Indices, NumElts));
    // Return the result in the same type as the first operand.
    return Builder.CreateBitCast(Res, Ops[0]->getType());
  }
9084 
  case X86::BI__builtin_ia32_vplzcntd_128_mask:
  case X86::BI__builtin_ia32_vplzcntd_256_mask:
  case X86::BI__builtin_ia32_vplzcntd_512_mask:
  case X86::BI__builtin_ia32_vplzcntq_128_mask:
  case X86::BI__builtin_ia32_vplzcntq_256_mask:
  case X86::BI__builtin_ia32_vplzcntq_512_mask: {
    // Leading-zero count via the generic llvm.ctlz intrinsic. The i1 `false`
    // second argument requests a defined result for a zero input. The count
    // is then blended with Ops[1] under the control of Ops[2].
    Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType());
    return EmitX86Select(*this, Ops[2],
                         Builder.CreateCall(F, {Ops[0],Builder.getInt1(false)}),
                         Ops[1]);
  }
9096 
  case X86::BI__builtin_ia32_pabsb128:
  case X86::BI__builtin_ia32_pabsw128:
  case X86::BI__builtin_ia32_pabsd128:
  case X86::BI__builtin_ia32_pabsb256:
  case X86::BI__builtin_ia32_pabsw256:
  case X86::BI__builtin_ia32_pabsd256:
  case X86::BI__builtin_ia32_pabsq128_mask:
  case X86::BI__builtin_ia32_pabsq256_mask:
  case X86::BI__builtin_ia32_pabsb512_mask:
  case X86::BI__builtin_ia32_pabsw512_mask:
  case X86::BI__builtin_ia32_pabsd512_mask:
  case X86::BI__builtin_ia32_pabsq512_mask:
    // Masked and unmasked absolute-value builtins share one helper, which
    // receives the full operand list.
    return EmitX86Abs(*this, Ops);
9110 
  case X86::BI__builtin_ia32_pmaxsb128:
  case X86::BI__builtin_ia32_pmaxsw128:
  case X86::BI__builtin_ia32_pmaxsd128:
  case X86::BI__builtin_ia32_pmaxsq128_mask:
  case X86::BI__builtin_ia32_pmaxsb256:
  case X86::BI__builtin_ia32_pmaxsw256:
  case X86::BI__builtin_ia32_pmaxsd256:
  case X86::BI__builtin_ia32_pmaxsq256_mask:
  case X86::BI__builtin_ia32_pmaxsb512_mask:
  case X86::BI__builtin_ia32_pmaxsw512_mask:
  case X86::BI__builtin_ia32_pmaxsd512_mask:
  case X86::BI__builtin_ia32_pmaxsq512_mask:
    // Signed max: the helper is given the signed greater-than predicate.
    return EmitX86MinMax(*this, ICmpInst::ICMP_SGT, Ops);
  case X86::BI__builtin_ia32_pmaxub128:
  case X86::BI__builtin_ia32_pmaxuw128:
  case X86::BI__builtin_ia32_pmaxud128:
  case X86::BI__builtin_ia32_pmaxuq128_mask:
  case X86::BI__builtin_ia32_pmaxub256:
  case X86::BI__builtin_ia32_pmaxuw256:
  case X86::BI__builtin_ia32_pmaxud256:
  case X86::BI__builtin_ia32_pmaxuq256_mask:
  case X86::BI__builtin_ia32_pmaxub512_mask:
  case X86::BI__builtin_ia32_pmaxuw512_mask:
  case X86::BI__builtin_ia32_pmaxud512_mask:
  case X86::BI__builtin_ia32_pmaxuq512_mask:
    // Unsigned max: unsigned greater-than predicate.
    return EmitX86MinMax(*this, ICmpInst::ICMP_UGT, Ops);
  case X86::BI__builtin_ia32_pminsb128:
  case X86::BI__builtin_ia32_pminsw128:
  case X86::BI__builtin_ia32_pminsd128:
  case X86::BI__builtin_ia32_pminsq128_mask:
  case X86::BI__builtin_ia32_pminsb256:
  case X86::BI__builtin_ia32_pminsw256:
  case X86::BI__builtin_ia32_pminsd256:
  case X86::BI__builtin_ia32_pminsq256_mask:
  case X86::BI__builtin_ia32_pminsb512_mask:
  case X86::BI__builtin_ia32_pminsw512_mask:
  case X86::BI__builtin_ia32_pminsd512_mask:
  case X86::BI__builtin_ia32_pminsq512_mask:
    // Signed min: signed less-than predicate.
    return EmitX86MinMax(*this, ICmpInst::ICMP_SLT, Ops);
  case X86::BI__builtin_ia32_pminub128:
  case X86::BI__builtin_ia32_pminuw128:
  case X86::BI__builtin_ia32_pminud128:
  case X86::BI__builtin_ia32_pminuq128_mask:
  case X86::BI__builtin_ia32_pminub256:
  case X86::BI__builtin_ia32_pminuw256:
  case X86::BI__builtin_ia32_pminud256:
  case X86::BI__builtin_ia32_pminuq256_mask:
  case X86::BI__builtin_ia32_pminub512_mask:
  case X86::BI__builtin_ia32_pminuw512_mask:
  case X86::BI__builtin_ia32_pminud512_mask:
  case X86::BI__builtin_ia32_pminuq512_mask:
    // Unsigned min: unsigned less-than predicate.
    return EmitX86MinMax(*this, ICmpInst::ICMP_ULT, Ops);
9163 
  // pmuludq and pmuldq share one helper; the only difference is the
  // signedness flag passed to it.
  case X86::BI__builtin_ia32_pmuludq128:
  case X86::BI__builtin_ia32_pmuludq256:
  case X86::BI__builtin_ia32_pmuludq512:
    return EmitX86Muldq(*this, /*IsSigned*/false, Ops);

  case X86::BI__builtin_ia32_pmuldq128:
  case X86::BI__builtin_ia32_pmuldq256:
  case X86::BI__builtin_ia32_pmuldq512:
    return EmitX86Muldq(*this, /*IsSigned*/true, Ops);
9173 
  // 3DNow!
  case X86::BI__builtin_ia32_pswapdsf:
  case X86::BI__builtin_ia32_pswapdsi: {
    // Both the float and the integer flavour bitcast their operand to the
    // x86_mmx type and call the same pswapd intrinsic.
    llvm::Type *MMXTy = llvm::Type::getX86_MMXTy(getLLVMContext());
    Ops[0] = Builder.CreateBitCast(Ops[0], MMXTy, "cast");
    llvm::Function *F = CGM.getIntrinsic(Intrinsic::x86_3dnowa_pswapd);
    return Builder.CreateCall(F, Ops, "pswapd");
  }
  case X86::BI__builtin_ia32_rdrand16_step:
  case X86::BI__builtin_ia32_rdrand32_step:
  case X86::BI__builtin_ia32_rdrand64_step:
  case X86::BI__builtin_ia32_rdseed16_step:
  case X86::BI__builtin_ia32_rdseed32_step:
  case X86::BI__builtin_ia32_rdseed64_step: {
    // Select the rdrand/rdseed intrinsic of the matching width.
    Intrinsic::ID ID;
    switch (BuiltinID) {
    default: llvm_unreachable("Unsupported intrinsic!");
    case X86::BI__builtin_ia32_rdrand16_step:
      ID = Intrinsic::x86_rdrand_16;
      break;
    case X86::BI__builtin_ia32_rdrand32_step:
      ID = Intrinsic::x86_rdrand_32;
      break;
    case X86::BI__builtin_ia32_rdrand64_step:
      ID = Intrinsic::x86_rdrand_64;
      break;
    case X86::BI__builtin_ia32_rdseed16_step:
      ID = Intrinsic::x86_rdseed_16;
      break;
    case X86::BI__builtin_ia32_rdseed32_step:
      ID = Intrinsic::x86_rdseed_32;
      break;
    case X86::BI__builtin_ia32_rdseed64_step:
      ID = Intrinsic::x86_rdseed_64;
      break;
    }

    // The intrinsic takes no arguments and returns an aggregate: element 0 is
    // stored through the pointer operand (Ops[0]) and element 1 becomes the
    // builtin's result.
    // NOTE(review): element 0 is presumably the random value and element 1
    // the success flag -- confirm against the intrinsic definition.
    Value *Call = Builder.CreateCall(CGM.getIntrinsic(ID));
    Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 0),
                                      Ops[0]);
    return Builder.CreateExtractValue(Call, 1);
  }
9216 
  case X86::BI__builtin_ia32_cmpps128_mask:
  case X86::BI__builtin_ia32_cmpps256_mask:
  case X86::BI__builtin_ia32_cmpps512_mask:
  case X86::BI__builtin_ia32_cmppd128_mask:
  case X86::BI__builtin_ia32_cmppd256_mask:
  case X86::BI__builtin_ia32_cmppd512_mask: {
    // The intrinsic is called without the input mask: remember Ops[3], drop
    // it from the operand list, and apply it to the comparison result
    // afterwards.
    unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
    Value *MaskIn = Ops[3];
    Ops.erase(&Ops[3]);

    Intrinsic::ID ID;
    switch (BuiltinID) {
    default: llvm_unreachable("Unsupported intrinsic!");
    case X86::BI__builtin_ia32_cmpps128_mask:
      ID = Intrinsic::x86_avx512_mask_cmp_ps_128;
      break;
    case X86::BI__builtin_ia32_cmpps256_mask:
      ID = Intrinsic::x86_avx512_mask_cmp_ps_256;
      break;
    case X86::BI__builtin_ia32_cmpps512_mask:
      ID = Intrinsic::x86_avx512_mask_cmp_ps_512;
      break;
    case X86::BI__builtin_ia32_cmppd128_mask:
      ID = Intrinsic::x86_avx512_mask_cmp_pd_128;
      break;
    case X86::BI__builtin_ia32_cmppd256_mask:
      ID = Intrinsic::x86_avx512_mask_cmp_pd_256;
      break;
    case X86::BI__builtin_ia32_cmppd512_mask:
      ID = Intrinsic::x86_avx512_mask_cmp_pd_512;
      break;
    }

    // Call the intrinsic with the remaining operands, then let the helper
    // combine the result with the saved mask.
    Value *Cmp = Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
    return EmitX86MaskedCompareResult(*this, Cmp, NumElts, MaskIn);
  }
9253 
  // SSE packed comparison intrinsics
  // Each named comparison maps to one fcmp predicate. The "not less than" and
  // "not less or equal" forms use the unordered predicates UGE and UGT.
  case X86::BI__builtin_ia32_cmpeqps:
  case X86::BI__builtin_ia32_cmpeqpd:
    return getVectorFCmpIR(CmpInst::FCMP_OEQ);
  case X86::BI__builtin_ia32_cmpltps:
  case X86::BI__builtin_ia32_cmpltpd:
    return getVectorFCmpIR(CmpInst::FCMP_OLT);
  case X86::BI__builtin_ia32_cmpleps:
  case X86::BI__builtin_ia32_cmplepd:
    return getVectorFCmpIR(CmpInst::FCMP_OLE);
  case X86::BI__builtin_ia32_cmpunordps:
  case X86::BI__builtin_ia32_cmpunordpd:
    return getVectorFCmpIR(CmpInst::FCMP_UNO);
  case X86::BI__builtin_ia32_cmpneqps:
  case X86::BI__builtin_ia32_cmpneqpd:
    return getVectorFCmpIR(CmpInst::FCMP_UNE);
  case X86::BI__builtin_ia32_cmpnltps:
  case X86::BI__builtin_ia32_cmpnltpd:
    return getVectorFCmpIR(CmpInst::FCMP_UGE);
  case X86::BI__builtin_ia32_cmpnleps:
  case X86::BI__builtin_ia32_cmpnlepd:
    return getVectorFCmpIR(CmpInst::FCMP_UGT);
  case X86::BI__builtin_ia32_cmpordps:
  case X86::BI__builtin_ia32_cmpordpd:
    return getVectorFCmpIR(CmpInst::FCMP_ORD);
  case X86::BI__builtin_ia32_cmpps:
  case X86::BI__builtin_ia32_cmpps256:
  case X86::BI__builtin_ia32_cmppd:
  case X86::BI__builtin_ia32_cmppd256: {
    // Ops[2] is required to be a ConstantInt holding the comparison
    // immediate.
    unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
    // If this is one of the SSE immediates, we can use native IR.
    if (CC < 8) {
      // Same immediate-to-predicate mapping as the named SSE packed
      // comparisons. Every value 0-7 has a case, so Pred is always assigned.
      FCmpInst::Predicate Pred;
      switch (CC) {
      case 0: Pred = FCmpInst::FCMP_OEQ; break;
      case 1: Pred = FCmpInst::FCMP_OLT; break;
      case 2: Pred = FCmpInst::FCMP_OLE; break;
      case 3: Pred = FCmpInst::FCMP_UNO; break;
      case 4: Pred = FCmpInst::FCMP_UNE; break;
      case 5: Pred = FCmpInst::FCMP_UGE; break;
      case 6: Pred = FCmpInst::FCMP_UGT; break;
      case 7: Pred = FCmpInst::FCMP_ORD; break;
      }
      return getVectorFCmpIR(Pred);
    }

    // We can't handle 8-31 immediates with native IR, use the intrinsic.
    // Except for predicates that create constants.
    Intrinsic::ID ID;
    switch (BuiltinID) {
    default: llvm_unreachable("Unsupported intrinsic!");
    case X86::BI__builtin_ia32_cmpps:
      ID = Intrinsic::x86_sse_cmp_ps;
      break;
    case X86::BI__builtin_ia32_cmpps256:
      // _CMP_TRUE_UQ, _CMP_TRUE_US produce -1,-1... vector
      // on any input and _CMP_FALSE_OQ, _CMP_FALSE_OS produce 0, 0...
      // The constant is splatted as i32 elements and then bitcast to the
      // operand's vector type.
      if (CC == 0xf || CC == 0xb || CC == 0x1b || CC == 0x1f) {
         Value *Constant = (CC == 0xf || CC == 0x1f) ?
                llvm::Constant::getAllOnesValue(Builder.getInt32Ty()) :
                llvm::Constant::getNullValue(Builder.getInt32Ty());
         Value *Vec = Builder.CreateVectorSplat(
                        Ops[0]->getType()->getVectorNumElements(), Constant);
         return Builder.CreateBitCast(Vec, Ops[0]->getType());
      }
      ID = Intrinsic::x86_avx_cmp_ps_256;
      break;
    case X86::BI__builtin_ia32_cmppd:
      ID = Intrinsic::x86_sse2_cmp_pd;
      break;
    case X86::BI__builtin_ia32_cmppd256:
      // _CMP_TRUE_UQ, _CMP_TRUE_US produce -1,-1... vector
      // on any input and _CMP_FALSE_OQ, _CMP_FALSE_OS produce 0, 0...
      // Same folding as the ps256 case, but with i64 elements.
      if (CC == 0xf || CC == 0xb || CC == 0x1b || CC == 0x1f) {
         Value *Constant = (CC == 0xf || CC == 0x1f) ?
                llvm::Constant::getAllOnesValue(Builder.getInt64Ty()) :
                llvm::Constant::getNullValue(Builder.getInt64Ty());
         Value *Vec = Builder.CreateVectorSplat(
                        Ops[0]->getType()->getVectorNumElements(), Constant);
         return Builder.CreateBitCast(Vec, Ops[0]->getType());
      }
      ID = Intrinsic::x86_avx_cmp_pd_256;
      break;
    }

    // Anything not folded above is forwarded unchanged to the target
    // intrinsic.
    return Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
  }
9341 
  // SSE scalar comparison intrinsics
  // Each named comparison calls the cmp.ss / cmp.sd intrinsic with a fixed
  // immediate: 0=eq, 1=lt, 2=le, 3=unord, 4=neq, 5=nlt, 6=nle, 7=ord.
  case X86::BI__builtin_ia32_cmpeqss:
    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 0);
  case X86::BI__builtin_ia32_cmpltss:
    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 1);
  case X86::BI__builtin_ia32_cmpless:
    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 2);
  case X86::BI__builtin_ia32_cmpunordss:
    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 3);
  case X86::BI__builtin_ia32_cmpneqss:
    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 4);
  case X86::BI__builtin_ia32_cmpnltss:
    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 5);
  case X86::BI__builtin_ia32_cmpnless:
    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 6);
  case X86::BI__builtin_ia32_cmpordss:
    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 7);
  case X86::BI__builtin_ia32_cmpeqsd:
    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 0);
  case X86::BI__builtin_ia32_cmpltsd:
    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 1);
  case X86::BI__builtin_ia32_cmplesd:
    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 2);
  case X86::BI__builtin_ia32_cmpunordsd:
    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 3);
  case X86::BI__builtin_ia32_cmpneqsd:
    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 4);
  case X86::BI__builtin_ia32_cmpnltsd:
    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 5);
  case X86::BI__builtin_ia32_cmpnlesd:
    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 6);
  case X86::BI__builtin_ia32_cmpordsd:
    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 7);
9375 
9376   case X86::BI__emul:
9377   case X86::BI__emulu: {
9378     llvm::Type *Int64Ty = llvm::IntegerType::get(getLLVMContext(), 64);
9379     bool isSigned = (BuiltinID == X86::BI__emul);
9380     Value *LHS = Builder.CreateIntCast(Ops[0], Int64Ty, isSigned);
9381     Value *RHS = Builder.CreateIntCast(Ops[1], Int64Ty, isSigned);
9382     return Builder.CreateMul(LHS, RHS, "", !isSigned, isSigned);
9383   }
  case X86::BI__mulh:
  case X86::BI__umulh:
  case X86::BI_mul128:
  case X86::BI_umul128: {
    // Compute the full product in i128, then split it: __mulh/__umulh return
    // only the bits above bit 63, while _mul128/_umul128 store those bits
    // through the third argument and return the product cast down to the
    // result type.
    llvm::Type *ResType = ConvertType(E->getType());
    llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);

    bool IsSigned = (BuiltinID == X86::BI__mulh || BuiltinID == X86::BI_mul128);
    Value *LHS = Builder.CreateIntCast(Ops[0], Int128Ty, IsSigned);
    Value *RHS = Builder.CreateIntCast(Ops[1], Int128Ty, IsSigned);

    // The shift matches the signedness: arithmetic for the signed forms,
    // logical for the unsigned ones.
    // NOTE(review): the nsw/nuw flags assume the operands are at most 64 bits
    // wide, so the 128-bit product cannot overflow -- confirm.
    Value *MulResult, *HigherBits;
    if (IsSigned) {
      MulResult = Builder.CreateNSWMul(LHS, RHS);
      HigherBits = Builder.CreateAShr(MulResult, 64);
    } else {
      MulResult = Builder.CreateNUWMul(LHS, RHS);
      HigherBits = Builder.CreateLShr(MulResult, 64);
    }
    HigherBits = Builder.CreateIntCast(HigherBits, ResType, IsSigned);

    if (BuiltinID == X86::BI__mulh || BuiltinID == X86::BI__umulh)
      return HigherBits;

    // _mul128/_umul128: write the high part through the pointer argument.
    Address HighBitsAddress = EmitPointerWithAlignment(E->getArg(2));
    Builder.CreateStore(HigherBits, HighBitsAddress);
    return Builder.CreateIntCast(MulResult, ResType, IsSigned);
  }
9412 
  case X86::BI__faststorefence: {
    // Sequentially consistent fence with system-wide scope.
    return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
                               llvm::SyncScope::System);
  }
  case X86::BI_ReadWriteBarrier:
  case X86::BI_ReadBarrier:
  case X86::BI_WriteBarrier: {
    // All three barriers lower to the same sequentially consistent fence,
    // restricted to single-thread scope.
    return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
                               llvm::SyncScope::SingleThread);
  }
  // The 32- and 64-bit bit-scan builtins share one MSVC intrinsic lowering
  // per direction.
  case X86::BI_BitScanForward:
  case X86::BI_BitScanForward64:
    return EmitMSVCBuiltinExpr(MSVCIntrin::_BitScanForward, E);
  case X86::BI_BitScanReverse:
  case X86::BI_BitScanReverse64:
    return EmitMSVCBuiltinExpr(MSVCIntrin::_BitScanReverse, E);

  // Each 64-bit Interlocked builtin is routed to the MSVC intrinsic lowering
  // of the same operation; the MSVCIntrin enumerator carries no width suffix.
  case X86::BI_InterlockedAnd64:
    return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedAnd, E);
  case X86::BI_InterlockedExchange64:
    return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchange, E);
  case X86::BI_InterlockedExchangeAdd64:
    return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeAdd, E);
  case X86::BI_InterlockedExchangeSub64:
    return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeSub, E);
  case X86::BI_InterlockedOr64:
    return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedOr, E);
  case X86::BI_InterlockedXor64:
    return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedXor, E);
  case X86::BI_InterlockedDecrement64:
    return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedDecrement, E);
  case X86::BI_InterlockedIncrement64:
    return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedIncrement, E);
9446   case X86::BI_InterlockedCompareExchange128: {
9447     // InterlockedCompareExchange128 doesn't directly refer to 128bit ints,
9448     // instead it takes pointers to 64bit ints for Destination and
9449     // ComparandResult, and exchange is taken as two 64bit ints (high & low).
9450     // The previous value is written to ComparandResult, and success is
9451     // returned.
9452 
9453     llvm::Type *Int128Ty = Builder.getInt128Ty();
9454     llvm::Type *Int128PtrTy = Int128Ty->getPointerTo();
9455 
9456     Value *Destination =
9457         Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), Int128PtrTy);
9458     Value *ExchangeHigh128 =
9459         Builder.CreateZExt(EmitScalarExpr(E->getArg(1)), Int128Ty);
9460     Value *ExchangeLow128 =
9461         Builder.CreateZExt(EmitScalarExpr(E->getArg(2)), Int128Ty);
9462     Address ComparandResult(
9463         Builder.CreateBitCast(EmitScalarExpr(E->getArg(3)), Int128PtrTy),
9464         getContext().toCharUnitsFromBits(128));
9465 
9466     Value *Exchange = Builder.CreateOr(
9467         Builder.CreateShl(ExchangeHigh128, 64, "", false, false),
9468         ExchangeLow128);
9469 
9470     Value *Comparand = Builder.CreateLoad(ComparandResult);
9471 
9472     AtomicCmpXchgInst *CXI =
9473         Builder.CreateAtomicCmpXchg(Destination, Comparand, Exchange,
9474                                     AtomicOrdering::SequentiallyConsistent,
9475                                     AtomicOrdering::SequentiallyConsistent);
9476     CXI->setVolatile(true);
9477 
9478     // Write the result back to the inout pointer.
9479     Builder.CreateStore(Builder.CreateExtractValue(CXI, 0), ComparandResult);
9480 
9481     // Get the success boolean and zero extend it to i8.
9482     Value *Success = Builder.CreateExtractValue(CXI, 1);
9483     return Builder.CreateZExt(Success, ConvertType(E->getType()));
9484   }
9485 
9486   case X86::BI_AddressOfReturnAddress: {
9487     Value *F = CGM.getIntrinsic(Intrinsic::addressofreturnaddress);
9488     return Builder.CreateCall(F);
9489   }
9490   case X86::BI__stosb: {
9491     // We treat __stosb as a volatile memset - it may not generate "rep stosb"
9492     // instruction, but it will create a memset that won't be optimized away.
9493     return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], 1, true);
9494   }
9495   case X86::BI__ud2:
9496     // llvm.trap makes a ud2a instruction on x86.
9497     return EmitTrapCall(Intrinsic::trap);
9498   case X86::BI__int2c: {
9499     // This syscall signals a driver assertion failure in x86 NT kernels.
9500     llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, false);
9501     llvm::InlineAsm *IA =
9502         llvm::InlineAsm::get(FTy, "int $$0x2c", "", /*SideEffects=*/true);
9503     llvm::AttributeList NoReturnAttr = llvm::AttributeList::get(
9504         getLLVMContext(), llvm::AttributeList::FunctionIndex,
9505         llvm::Attribute::NoReturn);
9506     CallSite CS = Builder.CreateCall(IA);
9507     CS.setAttributes(NoReturnAttr);
9508     return CS.getInstruction();
9509   }
9510   case X86::BI__readfsbyte:
9511   case X86::BI__readfsword:
9512   case X86::BI__readfsdword:
9513   case X86::BI__readfsqword: {
9514     llvm::Type *IntTy = ConvertType(E->getType());
9515     Value *Ptr = Builder.CreateIntToPtr(EmitScalarExpr(E->getArg(0)),
9516                                         llvm::PointerType::get(IntTy, 257));
9517     LoadInst *Load = Builder.CreateAlignedLoad(
9518         IntTy, Ptr, getContext().getTypeAlignInChars(E->getType()));
9519     Load->setVolatile(true);
9520     return Load;
9521   }
9522   case X86::BI__readgsbyte:
9523   case X86::BI__readgsword:
9524   case X86::BI__readgsdword:
9525   case X86::BI__readgsqword: {
9526     llvm::Type *IntTy = ConvertType(E->getType());
9527     Value *Ptr = Builder.CreateIntToPtr(EmitScalarExpr(E->getArg(0)),
9528                                         llvm::PointerType::get(IntTy, 256));
9529     LoadInst *Load = Builder.CreateAlignedLoad(
9530         IntTy, Ptr, getContext().getTypeAlignInChars(E->getType()));
9531     Load->setVolatile(true);
9532     return Load;
9533   }
9534   }
9535 }
9536 
9537 
9538 Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
9539                                            const CallExpr *E) {
9540   SmallVector<Value*, 4> Ops;
9541 
9542   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++)
9543     Ops.push_back(EmitScalarExpr(E->getArg(i)));
9544 
9545   Intrinsic::ID ID = Intrinsic::not_intrinsic;
9546 
9547   switch (BuiltinID) {
9548   default: return nullptr;
9549 
9550   // __builtin_ppc_get_timebase is GCC 4.8+'s PowerPC-specific name for what we
9551   // call __builtin_readcyclecounter.
9552   case PPC::BI__builtin_ppc_get_timebase:
9553     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::readcyclecounter));
9554 
9555   // vec_ld, vec_xl_be, vec_lvsl, vec_lvsr
9556   case PPC::BI__builtin_altivec_lvx:
9557   case PPC::BI__builtin_altivec_lvxl:
9558   case PPC::BI__builtin_altivec_lvebx:
9559   case PPC::BI__builtin_altivec_lvehx:
9560   case PPC::BI__builtin_altivec_lvewx:
9561   case PPC::BI__builtin_altivec_lvsl:
9562   case PPC::BI__builtin_altivec_lvsr:
9563   case PPC::BI__builtin_vsx_lxvd2x:
9564   case PPC::BI__builtin_vsx_lxvw4x:
9565   case PPC::BI__builtin_vsx_lxvd2x_be:
9566   case PPC::BI__builtin_vsx_lxvw4x_be:
9567   case PPC::BI__builtin_vsx_lxvl:
9568   case PPC::BI__builtin_vsx_lxvll:
9569   {
9570     if(BuiltinID == PPC::BI__builtin_vsx_lxvl ||
9571        BuiltinID == PPC::BI__builtin_vsx_lxvll){
9572       Ops[0] = Builder.CreateBitCast(Ops[0], Int8PtrTy);
9573     }else {
9574       Ops[1] = Builder.CreateBitCast(Ops[1], Int8PtrTy);
9575       Ops[0] = Builder.CreateGEP(Ops[1], Ops[0]);
9576       Ops.pop_back();
9577     }
9578 
9579     switch (BuiltinID) {
9580     default: llvm_unreachable("Unsupported ld/lvsl/lvsr intrinsic!");
9581     case PPC::BI__builtin_altivec_lvx:
9582       ID = Intrinsic::ppc_altivec_lvx;
9583       break;
9584     case PPC::BI__builtin_altivec_lvxl:
9585       ID = Intrinsic::ppc_altivec_lvxl;
9586       break;
9587     case PPC::BI__builtin_altivec_lvebx:
9588       ID = Intrinsic::ppc_altivec_lvebx;
9589       break;
9590     case PPC::BI__builtin_altivec_lvehx:
9591       ID = Intrinsic::ppc_altivec_lvehx;
9592       break;
9593     case PPC::BI__builtin_altivec_lvewx:
9594       ID = Intrinsic::ppc_altivec_lvewx;
9595       break;
9596     case PPC::BI__builtin_altivec_lvsl:
9597       ID = Intrinsic::ppc_altivec_lvsl;
9598       break;
9599     case PPC::BI__builtin_altivec_lvsr:
9600       ID = Intrinsic::ppc_altivec_lvsr;
9601       break;
9602     case PPC::BI__builtin_vsx_lxvd2x:
9603       ID = Intrinsic::ppc_vsx_lxvd2x;
9604       break;
9605     case PPC::BI__builtin_vsx_lxvw4x:
9606       ID = Intrinsic::ppc_vsx_lxvw4x;
9607       break;
9608     case PPC::BI__builtin_vsx_lxvd2x_be:
9609       ID = Intrinsic::ppc_vsx_lxvd2x_be;
9610       break;
9611     case PPC::BI__builtin_vsx_lxvw4x_be:
9612       ID = Intrinsic::ppc_vsx_lxvw4x_be;
9613       break;
9614     case PPC::BI__builtin_vsx_lxvl:
9615       ID = Intrinsic::ppc_vsx_lxvl;
9616       break;
9617     case PPC::BI__builtin_vsx_lxvll:
9618       ID = Intrinsic::ppc_vsx_lxvll;
9619       break;
9620     }
9621     llvm::Function *F = CGM.getIntrinsic(ID);
9622     return Builder.CreateCall(F, Ops, "");
9623   }
9624 
9625   // vec_st, vec_xst_be
9626   case PPC::BI__builtin_altivec_stvx:
9627   case PPC::BI__builtin_altivec_stvxl:
9628   case PPC::BI__builtin_altivec_stvebx:
9629   case PPC::BI__builtin_altivec_stvehx:
9630   case PPC::BI__builtin_altivec_stvewx:
9631   case PPC::BI__builtin_vsx_stxvd2x:
9632   case PPC::BI__builtin_vsx_stxvw4x:
9633   case PPC::BI__builtin_vsx_stxvd2x_be:
9634   case PPC::BI__builtin_vsx_stxvw4x_be:
9635   case PPC::BI__builtin_vsx_stxvl:
9636   case PPC::BI__builtin_vsx_stxvll:
9637   {
9638     if(BuiltinID == PPC::BI__builtin_vsx_stxvl ||
9639       BuiltinID == PPC::BI__builtin_vsx_stxvll ){
9640       Ops[1] = Builder.CreateBitCast(Ops[1], Int8PtrTy);
9641     }else {
9642       Ops[2] = Builder.CreateBitCast(Ops[2], Int8PtrTy);
9643       Ops[1] = Builder.CreateGEP(Ops[2], Ops[1]);
9644       Ops.pop_back();
9645     }
9646 
9647     switch (BuiltinID) {
9648     default: llvm_unreachable("Unsupported st intrinsic!");
9649     case PPC::BI__builtin_altivec_stvx:
9650       ID = Intrinsic::ppc_altivec_stvx;
9651       break;
9652     case PPC::BI__builtin_altivec_stvxl:
9653       ID = Intrinsic::ppc_altivec_stvxl;
9654       break;
9655     case PPC::BI__builtin_altivec_stvebx:
9656       ID = Intrinsic::ppc_altivec_stvebx;
9657       break;
9658     case PPC::BI__builtin_altivec_stvehx:
9659       ID = Intrinsic::ppc_altivec_stvehx;
9660       break;
9661     case PPC::BI__builtin_altivec_stvewx:
9662       ID = Intrinsic::ppc_altivec_stvewx;
9663       break;
9664     case PPC::BI__builtin_vsx_stxvd2x:
9665       ID = Intrinsic::ppc_vsx_stxvd2x;
9666       break;
9667     case PPC::BI__builtin_vsx_stxvw4x:
9668       ID = Intrinsic::ppc_vsx_stxvw4x;
9669       break;
9670     case PPC::BI__builtin_vsx_stxvd2x_be:
9671       ID = Intrinsic::ppc_vsx_stxvd2x_be;
9672       break;
9673     case PPC::BI__builtin_vsx_stxvw4x_be:
9674       ID = Intrinsic::ppc_vsx_stxvw4x_be;
9675       break;
9676     case PPC::BI__builtin_vsx_stxvl:
9677       ID = Intrinsic::ppc_vsx_stxvl;
9678       break;
9679     case PPC::BI__builtin_vsx_stxvll:
9680       ID = Intrinsic::ppc_vsx_stxvll;
9681       break;
9682     }
9683     llvm::Function *F = CGM.getIntrinsic(ID);
9684     return Builder.CreateCall(F, Ops, "");
9685   }
9686   // Square root
9687   case PPC::BI__builtin_vsx_xvsqrtsp:
9688   case PPC::BI__builtin_vsx_xvsqrtdp: {
9689     llvm::Type *ResultType = ConvertType(E->getType());
9690     Value *X = EmitScalarExpr(E->getArg(0));
9691     ID = Intrinsic::sqrt;
9692     llvm::Function *F = CGM.getIntrinsic(ID, ResultType);
9693     return Builder.CreateCall(F, X);
9694   }
9695   // Count leading zeros
9696   case PPC::BI__builtin_altivec_vclzb:
9697   case PPC::BI__builtin_altivec_vclzh:
9698   case PPC::BI__builtin_altivec_vclzw:
9699   case PPC::BI__builtin_altivec_vclzd: {
9700     llvm::Type *ResultType = ConvertType(E->getType());
9701     Value *X = EmitScalarExpr(E->getArg(0));
9702     Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
9703     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ResultType);
9704     return Builder.CreateCall(F, {X, Undef});
9705   }
9706   case PPC::BI__builtin_altivec_vctzb:
9707   case PPC::BI__builtin_altivec_vctzh:
9708   case PPC::BI__builtin_altivec_vctzw:
9709   case PPC::BI__builtin_altivec_vctzd: {
9710     llvm::Type *ResultType = ConvertType(E->getType());
9711     Value *X = EmitScalarExpr(E->getArg(0));
9712     Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
9713     Function *F = CGM.getIntrinsic(Intrinsic::cttz, ResultType);
9714     return Builder.CreateCall(F, {X, Undef});
9715   }
9716   case PPC::BI__builtin_altivec_vpopcntb:
9717   case PPC::BI__builtin_altivec_vpopcnth:
9718   case PPC::BI__builtin_altivec_vpopcntw:
9719   case PPC::BI__builtin_altivec_vpopcntd: {
9720     llvm::Type *ResultType = ConvertType(E->getType());
9721     Value *X = EmitScalarExpr(E->getArg(0));
9722     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ResultType);
9723     return Builder.CreateCall(F, X);
9724   }
9725   // Copy sign
9726   case PPC::BI__builtin_vsx_xvcpsgnsp:
9727   case PPC::BI__builtin_vsx_xvcpsgndp: {
9728     llvm::Type *ResultType = ConvertType(E->getType());
9729     Value *X = EmitScalarExpr(E->getArg(0));
9730     Value *Y = EmitScalarExpr(E->getArg(1));
9731     ID = Intrinsic::copysign;
9732     llvm::Function *F = CGM.getIntrinsic(ID, ResultType);
9733     return Builder.CreateCall(F, {X, Y});
9734   }
9735   // Rounding/truncation
9736   case PPC::BI__builtin_vsx_xvrspip:
9737   case PPC::BI__builtin_vsx_xvrdpip:
9738   case PPC::BI__builtin_vsx_xvrdpim:
9739   case PPC::BI__builtin_vsx_xvrspim:
9740   case PPC::BI__builtin_vsx_xvrdpi:
9741   case PPC::BI__builtin_vsx_xvrspi:
9742   case PPC::BI__builtin_vsx_xvrdpic:
9743   case PPC::BI__builtin_vsx_xvrspic:
9744   case PPC::BI__builtin_vsx_xvrdpiz:
9745   case PPC::BI__builtin_vsx_xvrspiz: {
9746     llvm::Type *ResultType = ConvertType(E->getType());
9747     Value *X = EmitScalarExpr(E->getArg(0));
9748     if (BuiltinID == PPC::BI__builtin_vsx_xvrdpim ||
9749         BuiltinID == PPC::BI__builtin_vsx_xvrspim)
9750       ID = Intrinsic::floor;
9751     else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpi ||
9752              BuiltinID == PPC::BI__builtin_vsx_xvrspi)
9753       ID = Intrinsic::round;
9754     else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpic ||
9755              BuiltinID == PPC::BI__builtin_vsx_xvrspic)
9756       ID = Intrinsic::nearbyint;
9757     else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpip ||
9758              BuiltinID == PPC::BI__builtin_vsx_xvrspip)
9759       ID = Intrinsic::ceil;
9760     else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpiz ||
9761              BuiltinID == PPC::BI__builtin_vsx_xvrspiz)
9762       ID = Intrinsic::trunc;
9763     llvm::Function *F = CGM.getIntrinsic(ID, ResultType);
9764     return Builder.CreateCall(F, X);
9765   }
9766 
9767   // Absolute value
9768   case PPC::BI__builtin_vsx_xvabsdp:
9769   case PPC::BI__builtin_vsx_xvabssp: {
9770     llvm::Type *ResultType = ConvertType(E->getType());
9771     Value *X = EmitScalarExpr(E->getArg(0));
9772     llvm::Function *F = CGM.getIntrinsic(Intrinsic::fabs, ResultType);
9773     return Builder.CreateCall(F, X);
9774   }
9775 
9776   // FMA variations
9777   case PPC::BI__builtin_vsx_xvmaddadp:
9778   case PPC::BI__builtin_vsx_xvmaddasp:
9779   case PPC::BI__builtin_vsx_xvnmaddadp:
9780   case PPC::BI__builtin_vsx_xvnmaddasp:
9781   case PPC::BI__builtin_vsx_xvmsubadp:
9782   case PPC::BI__builtin_vsx_xvmsubasp:
9783   case PPC::BI__builtin_vsx_xvnmsubadp:
9784   case PPC::BI__builtin_vsx_xvnmsubasp: {
9785     llvm::Type *ResultType = ConvertType(E->getType());
9786     Value *X = EmitScalarExpr(E->getArg(0));
9787     Value *Y = EmitScalarExpr(E->getArg(1));
9788     Value *Z = EmitScalarExpr(E->getArg(2));
9789     Value *Zero = llvm::ConstantFP::getZeroValueForNegation(ResultType);
9790     llvm::Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
9791     switch (BuiltinID) {
9792       case PPC::BI__builtin_vsx_xvmaddadp:
9793       case PPC::BI__builtin_vsx_xvmaddasp:
9794         return Builder.CreateCall(F, {X, Y, Z});
9795       case PPC::BI__builtin_vsx_xvnmaddadp:
9796       case PPC::BI__builtin_vsx_xvnmaddasp:
9797         return Builder.CreateFSub(Zero,
9798                                   Builder.CreateCall(F, {X, Y, Z}), "sub");
9799       case PPC::BI__builtin_vsx_xvmsubadp:
9800       case PPC::BI__builtin_vsx_xvmsubasp:
9801         return Builder.CreateCall(F,
9802                                   {X, Y, Builder.CreateFSub(Zero, Z, "sub")});
9803       case PPC::BI__builtin_vsx_xvnmsubadp:
9804       case PPC::BI__builtin_vsx_xvnmsubasp:
9805         Value *FsubRes =
9806           Builder.CreateCall(F, {X, Y, Builder.CreateFSub(Zero, Z, "sub")});
9807         return Builder.CreateFSub(Zero, FsubRes, "sub");
9808     }
9809     llvm_unreachable("Unknown FMA operation");
9810     return nullptr; // Suppress no-return warning
9811   }
9812 
9813   case PPC::BI__builtin_vsx_insertword: {
9814     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_vsx_xxinsertw);
9815 
9816     // Third argument is a compile time constant int. It must be clamped to
9817     // to the range [0, 12].
9818     ConstantInt *ArgCI = dyn_cast<ConstantInt>(Ops[2]);
9819     assert(ArgCI &&
9820            "Third arg to xxinsertw intrinsic must be constant integer");
9821     const int64_t MaxIndex = 12;
9822     int64_t Index = clamp(ArgCI->getSExtValue(), 0, MaxIndex);
9823 
9824     // The builtin semantics don't exactly match the xxinsertw instructions
9825     // semantics (which ppc_vsx_xxinsertw follows). The builtin extracts the
9826     // word from the first argument, and inserts it in the second argument. The
9827     // instruction extracts the word from its second input register and inserts
9828     // it into its first input register, so swap the first and second arguments.
9829     std::swap(Ops[0], Ops[1]);
9830 
9831     // Need to cast the second argument from a vector of unsigned int to a
9832     // vector of long long.
9833     Ops[1] = Builder.CreateBitCast(Ops[1], llvm::VectorType::get(Int64Ty, 2));
9834 
9835     if (getTarget().isLittleEndian()) {
9836       // Create a shuffle mask of (1, 0)
9837       Constant *ShuffleElts[2] = { ConstantInt::get(Int32Ty, 1),
9838                                    ConstantInt::get(Int32Ty, 0)
9839                                  };
9840       Constant *ShuffleMask = llvm::ConstantVector::get(ShuffleElts);
9841 
9842       // Reverse the double words in the vector we will extract from.
9843       Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int64Ty, 2));
9844       Ops[0] = Builder.CreateShuffleVector(Ops[0], Ops[0], ShuffleMask);
9845 
9846       // Reverse the index.
9847       Index = MaxIndex - Index;
9848     }
9849 
9850     // Intrinsic expects the first arg to be a vector of int.
9851     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int32Ty, 4));
9852     Ops[2] = ConstantInt::getSigned(Int32Ty, Index);
9853     return Builder.CreateCall(F, Ops);
9854   }
9855 
9856   case PPC::BI__builtin_vsx_extractuword: {
9857     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_vsx_xxextractuw);
9858 
9859     // Intrinsic expects the first argument to be a vector of doublewords.
9860     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int64Ty, 2));
9861 
9862     // The second argument is a compile time constant int that needs to
9863     // be clamped to the range [0, 12].
9864     ConstantInt *ArgCI = dyn_cast<ConstantInt>(Ops[1]);
9865     assert(ArgCI &&
9866            "Second Arg to xxextractuw intrinsic must be a constant integer!");
9867     const int64_t MaxIndex = 12;
9868     int64_t Index = clamp(ArgCI->getSExtValue(), 0, MaxIndex);
9869 
9870     if (getTarget().isLittleEndian()) {
9871       // Reverse the index.
9872       Index = MaxIndex - Index;
9873       Ops[1] = ConstantInt::getSigned(Int32Ty, Index);
9874 
9875       // Emit the call, then reverse the double words of the results vector.
9876       Value *Call = Builder.CreateCall(F, Ops);
9877 
9878       // Create a shuffle mask of (1, 0)
9879       Constant *ShuffleElts[2] = { ConstantInt::get(Int32Ty, 1),
9880                                    ConstantInt::get(Int32Ty, 0)
9881                                  };
9882       Constant *ShuffleMask = llvm::ConstantVector::get(ShuffleElts);
9883 
9884       Value *ShuffleCall = Builder.CreateShuffleVector(Call, Call, ShuffleMask);
9885       return ShuffleCall;
9886     } else {
9887       Ops[1] = ConstantInt::getSigned(Int32Ty, Index);
9888       return Builder.CreateCall(F, Ops);
9889     }
9890   }
9891 
9892   case PPC::BI__builtin_vsx_xxpermdi: {
9893     ConstantInt *ArgCI = dyn_cast<ConstantInt>(Ops[2]);
9894     assert(ArgCI && "Third arg must be constant integer!");
9895 
9896     unsigned Index = ArgCI->getZExtValue();
9897     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int64Ty, 2));
9898     Ops[1] = Builder.CreateBitCast(Ops[1], llvm::VectorType::get(Int64Ty, 2));
9899 
9900     // Element zero comes from the first input vector and element one comes from
9901     // the second. The element indices within each vector are numbered in big
9902     // endian order so the shuffle mask must be adjusted for this on little
9903     // endian platforms (i.e. index is complemented and source vector reversed).
9904     unsigned ElemIdx0;
9905     unsigned ElemIdx1;
9906     if (getTarget().isLittleEndian()) {
9907       ElemIdx0 = (~Index & 1) + 2;
9908       ElemIdx1 = (~Index & 2) >> 1;
9909     } else { // BigEndian
9910       ElemIdx0 = (Index & 2) >> 1;
9911       ElemIdx1 = 2 + (Index & 1);
9912     }
9913 
9914     Constant *ShuffleElts[2] = {ConstantInt::get(Int32Ty, ElemIdx0),
9915                                 ConstantInt::get(Int32Ty, ElemIdx1)};
9916     Constant *ShuffleMask = llvm::ConstantVector::get(ShuffleElts);
9917 
9918     Value *ShuffleCall =
9919         Builder.CreateShuffleVector(Ops[0], Ops[1], ShuffleMask);
9920     QualType BIRetType = E->getType();
9921     auto RetTy = ConvertType(BIRetType);
9922     return Builder.CreateBitCast(ShuffleCall, RetTy);
9923   }
9924 
9925   case PPC::BI__builtin_vsx_xxsldwi: {
9926     ConstantInt *ArgCI = dyn_cast<ConstantInt>(Ops[2]);
9927     assert(ArgCI && "Third argument must be a compile time constant");
9928     unsigned Index = ArgCI->getZExtValue() & 0x3;
9929     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int32Ty, 4));
9930     Ops[1] = Builder.CreateBitCast(Ops[1], llvm::VectorType::get(Int32Ty, 4));
9931 
9932     // Create a shuffle mask
9933     unsigned ElemIdx0;
9934     unsigned ElemIdx1;
9935     unsigned ElemIdx2;
9936     unsigned ElemIdx3;
9937     if (getTarget().isLittleEndian()) {
9938       // Little endian element N comes from element 8+N-Index of the
9939       // concatenated wide vector (of course, using modulo arithmetic on
9940       // the total number of elements).
9941       ElemIdx0 = (8 - Index) % 8;
9942       ElemIdx1 = (9 - Index) % 8;
9943       ElemIdx2 = (10 - Index) % 8;
9944       ElemIdx3 = (11 - Index) % 8;
9945     } else {
9946       // Big endian ElemIdx<N> = Index + N
9947       ElemIdx0 = Index;
9948       ElemIdx1 = Index + 1;
9949       ElemIdx2 = Index + 2;
9950       ElemIdx3 = Index + 3;
9951     }
9952 
9953     Constant *ShuffleElts[4] = {ConstantInt::get(Int32Ty, ElemIdx0),
9954                                 ConstantInt::get(Int32Ty, ElemIdx1),
9955                                 ConstantInt::get(Int32Ty, ElemIdx2),
9956                                 ConstantInt::get(Int32Ty, ElemIdx3)};
9957 
9958     Constant *ShuffleMask = llvm::ConstantVector::get(ShuffleElts);
9959     Value *ShuffleCall =
9960         Builder.CreateShuffleVector(Ops[0], Ops[1], ShuffleMask);
9961     QualType BIRetType = E->getType();
9962     auto RetTy = ConvertType(BIRetType);
9963     return Builder.CreateBitCast(ShuffleCall, RetTy);
9964   }
9965   }
9966 }
9967 
9968 Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
9969                                               const CallExpr *E) {
9970   switch (BuiltinID) {
9971   case AMDGPU::BI__builtin_amdgcn_div_scale:
9972   case AMDGPU::BI__builtin_amdgcn_div_scalef: {
9973     // Translate from the intrinsics's struct return to the builtin's out
9974     // argument.
9975 
9976     Address FlagOutPtr = EmitPointerWithAlignment(E->getArg(3));
9977 
9978     llvm::Value *X = EmitScalarExpr(E->getArg(0));
9979     llvm::Value *Y = EmitScalarExpr(E->getArg(1));
9980     llvm::Value *Z = EmitScalarExpr(E->getArg(2));
9981 
9982     llvm::Value *Callee = CGM.getIntrinsic(Intrinsic::amdgcn_div_scale,
9983                                            X->getType());
9984 
9985     llvm::Value *Tmp = Builder.CreateCall(Callee, {X, Y, Z});
9986 
9987     llvm::Value *Result = Builder.CreateExtractValue(Tmp, 0);
9988     llvm::Value *Flag = Builder.CreateExtractValue(Tmp, 1);
9989 
9990     llvm::Type *RealFlagType
9991       = FlagOutPtr.getPointer()->getType()->getPointerElementType();
9992 
9993     llvm::Value *FlagExt = Builder.CreateZExt(Flag, RealFlagType);
9994     Builder.CreateStore(FlagExt, FlagOutPtr);
9995     return Result;
9996   }
9997   case AMDGPU::BI__builtin_amdgcn_div_fmas:
9998   case AMDGPU::BI__builtin_amdgcn_div_fmasf: {
9999     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
10000     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
10001     llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
10002     llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
10003 
10004     llvm::Value *F = CGM.getIntrinsic(Intrinsic::amdgcn_div_fmas,
10005                                       Src0->getType());
10006     llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Src3);
10007     return Builder.CreateCall(F, {Src0, Src1, Src2, Src3ToBool});
10008   }
10009 
10010   case AMDGPU::BI__builtin_amdgcn_ds_swizzle:
10011     return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_ds_swizzle);
10012   case AMDGPU::BI__builtin_amdgcn_mov_dpp: {
10013     llvm::SmallVector<llvm::Value *, 5> Args;
10014     for (unsigned I = 0; I != 5; ++I)
10015       Args.push_back(EmitScalarExpr(E->getArg(I)));
10016     Value *F = CGM.getIntrinsic(Intrinsic::amdgcn_mov_dpp,
10017                                     Args[0]->getType());
10018     return Builder.CreateCall(F, Args);
10019   }
10020   case AMDGPU::BI__builtin_amdgcn_div_fixup:
10021   case AMDGPU::BI__builtin_amdgcn_div_fixupf:
10022   case AMDGPU::BI__builtin_amdgcn_div_fixuph:
10023     return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_div_fixup);
10024   case AMDGPU::BI__builtin_amdgcn_trig_preop:
10025   case AMDGPU::BI__builtin_amdgcn_trig_preopf:
10026     return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_trig_preop);
10027   case AMDGPU::BI__builtin_amdgcn_rcp:
10028   case AMDGPU::BI__builtin_amdgcn_rcpf:
10029   case AMDGPU::BI__builtin_amdgcn_rcph:
10030     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rcp);
10031   case AMDGPU::BI__builtin_amdgcn_rsq:
10032   case AMDGPU::BI__builtin_amdgcn_rsqf:
10033   case AMDGPU::BI__builtin_amdgcn_rsqh:
10034     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rsq);
10035   case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
10036   case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
10037     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rsq_clamp);
10038   case AMDGPU::BI__builtin_amdgcn_sinf:
10039   case AMDGPU::BI__builtin_amdgcn_sinh:
10040     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_sin);
10041   case AMDGPU::BI__builtin_amdgcn_cosf:
10042   case AMDGPU::BI__builtin_amdgcn_cosh:
10043     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_cos);
10044   case AMDGPU::BI__builtin_amdgcn_log_clampf:
10045     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log_clamp);
10046   case AMDGPU::BI__builtin_amdgcn_ldexp:
10047   case AMDGPU::BI__builtin_amdgcn_ldexpf:
10048   case AMDGPU::BI__builtin_amdgcn_ldexph:
10049     return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_ldexp);
10050   case AMDGPU::BI__builtin_amdgcn_frexp_mant:
10051   case AMDGPU::BI__builtin_amdgcn_frexp_mantf:
10052   case AMDGPU::BI__builtin_amdgcn_frexp_manth:
10053     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_frexp_mant);
10054   case AMDGPU::BI__builtin_amdgcn_frexp_exp:
10055   case AMDGPU::BI__builtin_amdgcn_frexp_expf: {
10056     Value *Src0 = EmitScalarExpr(E->getArg(0));
10057     Value *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
10058                                 { Builder.getInt32Ty(), Src0->getType() });
10059     return Builder.CreateCall(F, Src0);
10060   }
10061   case AMDGPU::BI__builtin_amdgcn_frexp_exph: {
10062     Value *Src0 = EmitScalarExpr(E->getArg(0));
10063     Value *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
10064                                 { Builder.getInt16Ty(), Src0->getType() });
10065     return Builder.CreateCall(F, Src0);
10066   }
10067   case AMDGPU::BI__builtin_amdgcn_fract:
10068   case AMDGPU::BI__builtin_amdgcn_fractf:
10069   case AMDGPU::BI__builtin_amdgcn_fracth:
10070     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_fract);
10071   case AMDGPU::BI__builtin_amdgcn_lerp:
10072     return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_lerp);
10073   case AMDGPU::BI__builtin_amdgcn_uicmp:
10074   case AMDGPU::BI__builtin_amdgcn_uicmpl:
10075   case AMDGPU::BI__builtin_amdgcn_sicmp:
10076   case AMDGPU::BI__builtin_amdgcn_sicmpl:
10077     return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_icmp);
10078   case AMDGPU::BI__builtin_amdgcn_fcmp:
10079   case AMDGPU::BI__builtin_amdgcn_fcmpf:
10080     return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_fcmp);
10081   case AMDGPU::BI__builtin_amdgcn_class:
10082   case AMDGPU::BI__builtin_amdgcn_classf:
10083   case AMDGPU::BI__builtin_amdgcn_classh:
10084     return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_class);
10085   case AMDGPU::BI__builtin_amdgcn_fmed3f:
10086   case AMDGPU::BI__builtin_amdgcn_fmed3h:
10087     return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_fmed3);
10088   case AMDGPU::BI__builtin_amdgcn_read_exec: {
10089     CallInst *CI = cast<CallInst>(
10090       EmitSpecialRegisterBuiltin(*this, E, Int64Ty, Int64Ty, true, "exec"));
10091     CI->setConvergent();
10092     return CI;
10093   }
10094   case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
10095   case AMDGPU::BI__builtin_amdgcn_read_exec_hi: {
10096     StringRef RegName = BuiltinID == AMDGPU::BI__builtin_amdgcn_read_exec_lo ?
10097       "exec_lo" : "exec_hi";
10098     CallInst *CI = cast<CallInst>(
10099       EmitSpecialRegisterBuiltin(*this, E, Int32Ty, Int32Ty, true, RegName));
10100     CI->setConvergent();
10101     return CI;
10102   }
10103 
10104   // amdgcn workitem
10105   case AMDGPU::BI__builtin_amdgcn_workitem_id_x:
10106     return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_x, 0, 1024);
10107   case AMDGPU::BI__builtin_amdgcn_workitem_id_y:
10108     return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_y, 0, 1024);
10109   case AMDGPU::BI__builtin_amdgcn_workitem_id_z:
10110     return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_z, 0, 1024);
10111 
10112   // r600 intrinsics
10113   case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
10114   case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
10115     return emitUnaryBuiltin(*this, E, Intrinsic::r600_recipsqrt_ieee);
10116   case AMDGPU::BI__builtin_r600_read_tidig_x:
10117     return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_x, 0, 1024);
10118   case AMDGPU::BI__builtin_r600_read_tidig_y:
10119     return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_y, 0, 1024);
10120   case AMDGPU::BI__builtin_r600_read_tidig_z:
10121     return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_z, 0, 1024);
10122   default:
10123     return nullptr;
10124   }
10125 }
10126 
10127 /// Handle a SystemZ function in which the final argument is a pointer
10128 /// to an int that receives the post-instruction CC value.  At the LLVM level
10129 /// this is represented as a function that returns a {result, cc} pair.
10130 static Value *EmitSystemZIntrinsicWithCC(CodeGenFunction &CGF,
10131                                          unsigned IntrinsicID,
10132                                          const CallExpr *E) {
10133   unsigned NumArgs = E->getNumArgs() - 1;
10134   SmallVector<Value *, 8> Args(NumArgs);
10135   for (unsigned I = 0; I < NumArgs; ++I)
10136     Args[I] = CGF.EmitScalarExpr(E->getArg(I));
10137   Address CCPtr = CGF.EmitPointerWithAlignment(E->getArg(NumArgs));
10138   Value *F = CGF.CGM.getIntrinsic(IntrinsicID);
10139   Value *Call = CGF.Builder.CreateCall(F, Args);
10140   Value *CC = CGF.Builder.CreateExtractValue(Call, 1);
10141   CGF.Builder.CreateStore(CC, CCPtr);
10142   return CGF.Builder.CreateExtractValue(Call, 0);
10143 }
10144 
10145 Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID,
10146                                                const CallExpr *E) {
10147   switch (BuiltinID) {
10148   case SystemZ::BI__builtin_tbegin: {
10149     Value *TDB = EmitScalarExpr(E->getArg(0));
10150     Value *Control = llvm::ConstantInt::get(Int32Ty, 0xff0c);
10151     Value *F = CGM.getIntrinsic(Intrinsic::s390_tbegin);
10152     return Builder.CreateCall(F, {TDB, Control});
10153   }
10154   case SystemZ::BI__builtin_tbegin_nofloat: {
10155     Value *TDB = EmitScalarExpr(E->getArg(0));
10156     Value *Control = llvm::ConstantInt::get(Int32Ty, 0xff0c);
10157     Value *F = CGM.getIntrinsic(Intrinsic::s390_tbegin_nofloat);
10158     return Builder.CreateCall(F, {TDB, Control});
10159   }
10160   case SystemZ::BI__builtin_tbeginc: {
10161     Value *TDB = llvm::ConstantPointerNull::get(Int8PtrTy);
10162     Value *Control = llvm::ConstantInt::get(Int32Ty, 0xff08);
10163     Value *F = CGM.getIntrinsic(Intrinsic::s390_tbeginc);
10164     return Builder.CreateCall(F, {TDB, Control});
10165   }
10166   case SystemZ::BI__builtin_tabort: {
10167     Value *Data = EmitScalarExpr(E->getArg(0));
10168     Value *F = CGM.getIntrinsic(Intrinsic::s390_tabort);
10169     return Builder.CreateCall(F, Builder.CreateSExt(Data, Int64Ty, "tabort"));
10170   }
10171   case SystemZ::BI__builtin_non_tx_store: {
10172     Value *Address = EmitScalarExpr(E->getArg(0));
10173     Value *Data = EmitScalarExpr(E->getArg(1));
10174     Value *F = CGM.getIntrinsic(Intrinsic::s390_ntstg);
10175     return Builder.CreateCall(F, {Data, Address});
10176   }
10177 
10178   // Vector builtins.  Note that most vector builtins are mapped automatically
10179   // to target-specific LLVM intrinsics.  The ones handled specially here can
10180   // be represented via standard LLVM IR, which is preferable to enable common
10181   // LLVM optimizations.
10182 
10183   case SystemZ::BI__builtin_s390_vpopctb:
10184   case SystemZ::BI__builtin_s390_vpopcth:
10185   case SystemZ::BI__builtin_s390_vpopctf:
10186   case SystemZ::BI__builtin_s390_vpopctg: {
10187     llvm::Type *ResultType = ConvertType(E->getType());
10188     Value *X = EmitScalarExpr(E->getArg(0));
10189     Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ResultType);
10190     return Builder.CreateCall(F, X);
10191   }
10192 
10193   case SystemZ::BI__builtin_s390_vclzb:
10194   case SystemZ::BI__builtin_s390_vclzh:
10195   case SystemZ::BI__builtin_s390_vclzf:
10196   case SystemZ::BI__builtin_s390_vclzg: {
10197     llvm::Type *ResultType = ConvertType(E->getType());
10198     Value *X = EmitScalarExpr(E->getArg(0));
10199     Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
10200     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ResultType);
10201     return Builder.CreateCall(F, {X, Undef});
10202   }
10203 
10204   case SystemZ::BI__builtin_s390_vctzb:
10205   case SystemZ::BI__builtin_s390_vctzh:
10206   case SystemZ::BI__builtin_s390_vctzf:
10207   case SystemZ::BI__builtin_s390_vctzg: {
10208     llvm::Type *ResultType = ConvertType(E->getType());
10209     Value *X = EmitScalarExpr(E->getArg(0));
10210     Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
10211     Function *F = CGM.getIntrinsic(Intrinsic::cttz, ResultType);
10212     return Builder.CreateCall(F, {X, Undef});
10213   }
10214 
10215   case SystemZ::BI__builtin_s390_vfsqsb:
10216   case SystemZ::BI__builtin_s390_vfsqdb: {
10217     llvm::Type *ResultType = ConvertType(E->getType());
10218     Value *X = EmitScalarExpr(E->getArg(0));
10219     Function *F = CGM.getIntrinsic(Intrinsic::sqrt, ResultType);
10220     return Builder.CreateCall(F, X);
10221   }
10222   case SystemZ::BI__builtin_s390_vfmasb:
10223   case SystemZ::BI__builtin_s390_vfmadb: {
10224     llvm::Type *ResultType = ConvertType(E->getType());
10225     Value *X = EmitScalarExpr(E->getArg(0));
10226     Value *Y = EmitScalarExpr(E->getArg(1));
10227     Value *Z = EmitScalarExpr(E->getArg(2));
10228     Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
10229     return Builder.CreateCall(F, {X, Y, Z});
10230   }
10231   case SystemZ::BI__builtin_s390_vfmssb:
10232   case SystemZ::BI__builtin_s390_vfmsdb: {
10233     llvm::Type *ResultType = ConvertType(E->getType());
10234     Value *X = EmitScalarExpr(E->getArg(0));
10235     Value *Y = EmitScalarExpr(E->getArg(1));
10236     Value *Z = EmitScalarExpr(E->getArg(2));
10237     Value *Zero = llvm::ConstantFP::getZeroValueForNegation(ResultType);
10238     Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
10239     return Builder.CreateCall(F, {X, Y, Builder.CreateFSub(Zero, Z, "sub")});
10240   }
10241   case SystemZ::BI__builtin_s390_vfnmasb:
10242   case SystemZ::BI__builtin_s390_vfnmadb: {
10243     llvm::Type *ResultType = ConvertType(E->getType());
10244     Value *X = EmitScalarExpr(E->getArg(0));
10245     Value *Y = EmitScalarExpr(E->getArg(1));
10246     Value *Z = EmitScalarExpr(E->getArg(2));
10247     Value *Zero = llvm::ConstantFP::getZeroValueForNegation(ResultType);
10248     Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
10249     return Builder.CreateFSub(Zero, Builder.CreateCall(F, {X, Y, Z}), "sub");
10250   }
10251   case SystemZ::BI__builtin_s390_vfnmssb:
10252   case SystemZ::BI__builtin_s390_vfnmsdb: {
10253     llvm::Type *ResultType = ConvertType(E->getType());
10254     Value *X = EmitScalarExpr(E->getArg(0));
10255     Value *Y = EmitScalarExpr(E->getArg(1));
10256     Value *Z = EmitScalarExpr(E->getArg(2));
10257     Value *Zero = llvm::ConstantFP::getZeroValueForNegation(ResultType);
10258     Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
10259     Value *NegZ = Builder.CreateFSub(Zero, Z, "sub");
10260     return Builder.CreateFSub(Zero, Builder.CreateCall(F, {X, Y, NegZ}));
10261   }
10262   case SystemZ::BI__builtin_s390_vflpsb:
10263   case SystemZ::BI__builtin_s390_vflpdb: {
10264     llvm::Type *ResultType = ConvertType(E->getType());
10265     Value *X = EmitScalarExpr(E->getArg(0));
10266     Function *F = CGM.getIntrinsic(Intrinsic::fabs, ResultType);
10267     return Builder.CreateCall(F, X);
10268   }
10269   case SystemZ::BI__builtin_s390_vflnsb:
10270   case SystemZ::BI__builtin_s390_vflndb: {
10271     llvm::Type *ResultType = ConvertType(E->getType());
10272     Value *X = EmitScalarExpr(E->getArg(0));
10273     Value *Zero = llvm::ConstantFP::getZeroValueForNegation(ResultType);
10274     Function *F = CGM.getIntrinsic(Intrinsic::fabs, ResultType);
10275     return Builder.CreateFSub(Zero, Builder.CreateCall(F, X), "sub");
10276   }
10277   case SystemZ::BI__builtin_s390_vfisb:
10278   case SystemZ::BI__builtin_s390_vfidb: {
10279     llvm::Type *ResultType = ConvertType(E->getType());
10280     Value *X = EmitScalarExpr(E->getArg(0));
10281     // Constant-fold the M4 and M5 mask arguments.
10282     llvm::APSInt M4, M5;
10283     bool IsConstM4 = E->getArg(1)->isIntegerConstantExpr(M4, getContext());
10284     bool IsConstM5 = E->getArg(2)->isIntegerConstantExpr(M5, getContext());
10285     assert(IsConstM4 && IsConstM5 && "Constant arg isn't actually constant?");
10286     (void)IsConstM4; (void)IsConstM5;
10287     // Check whether this instance can be represented via a LLVM standard
10288     // intrinsic.  We only support some combinations of M4 and M5.
10289     Intrinsic::ID ID = Intrinsic::not_intrinsic;
10290     switch (M4.getZExtValue()) {
10291     default: break;
10292     case 0:  // IEEE-inexact exception allowed
10293       switch (M5.getZExtValue()) {
10294       default: break;
10295       case 0: ID = Intrinsic::rint; break;
10296       }
10297       break;
10298     case 4:  // IEEE-inexact exception suppressed
10299       switch (M5.getZExtValue()) {
10300       default: break;
10301       case 0: ID = Intrinsic::nearbyint; break;
10302       case 1: ID = Intrinsic::round; break;
10303       case 5: ID = Intrinsic::trunc; break;
10304       case 6: ID = Intrinsic::ceil; break;
10305       case 7: ID = Intrinsic::floor; break;
10306       }
10307       break;
10308     }
10309     if (ID != Intrinsic::not_intrinsic) {
10310       Function *F = CGM.getIntrinsic(ID, ResultType);
10311       return Builder.CreateCall(F, X);
10312     }
10313     switch (BuiltinID) {
10314       case SystemZ::BI__builtin_s390_vfisb: ID = Intrinsic::s390_vfisb; break;
10315       case SystemZ::BI__builtin_s390_vfidb: ID = Intrinsic::s390_vfidb; break;
10316       default: llvm_unreachable("Unknown BuiltinID");
10317     }
10318     Function *F = CGM.getIntrinsic(ID);
10319     Value *M4Value = llvm::ConstantInt::get(getLLVMContext(), M4);
10320     Value *M5Value = llvm::ConstantInt::get(getLLVMContext(), M5);
10321     return Builder.CreateCall(F, {X, M4Value, M5Value});
10322   }
10323   case SystemZ::BI__builtin_s390_vfmaxsb:
10324   case SystemZ::BI__builtin_s390_vfmaxdb: {
10325     llvm::Type *ResultType = ConvertType(E->getType());
10326     Value *X = EmitScalarExpr(E->getArg(0));
10327     Value *Y = EmitScalarExpr(E->getArg(1));
10328     // Constant-fold the M4 mask argument.
10329     llvm::APSInt M4;
10330     bool IsConstM4 = E->getArg(2)->isIntegerConstantExpr(M4, getContext());
10331     assert(IsConstM4 && "Constant arg isn't actually constant?");
10332     (void)IsConstM4;
10333     // Check whether this instance can be represented via a LLVM standard
10334     // intrinsic.  We only support some values of M4.
10335     Intrinsic::ID ID = Intrinsic::not_intrinsic;
10336     switch (M4.getZExtValue()) {
10337     default: break;
10338     case 4: ID = Intrinsic::maxnum; break;
10339     }
10340     if (ID != Intrinsic::not_intrinsic) {
10341       Function *F = CGM.getIntrinsic(ID, ResultType);
10342       return Builder.CreateCall(F, {X, Y});
10343     }
10344     switch (BuiltinID) {
10345       case SystemZ::BI__builtin_s390_vfmaxsb: ID = Intrinsic::s390_vfmaxsb; break;
10346       case SystemZ::BI__builtin_s390_vfmaxdb: ID = Intrinsic::s390_vfmaxdb; break;
10347       default: llvm_unreachable("Unknown BuiltinID");
10348     }
10349     Function *F = CGM.getIntrinsic(ID);
10350     Value *M4Value = llvm::ConstantInt::get(getLLVMContext(), M4);
10351     return Builder.CreateCall(F, {X, Y, M4Value});
10352   }
10353   case SystemZ::BI__builtin_s390_vfminsb:
10354   case SystemZ::BI__builtin_s390_vfmindb: {
10355     llvm::Type *ResultType = ConvertType(E->getType());
10356     Value *X = EmitScalarExpr(E->getArg(0));
10357     Value *Y = EmitScalarExpr(E->getArg(1));
10358     // Constant-fold the M4 mask argument.
10359     llvm::APSInt M4;
10360     bool IsConstM4 = E->getArg(2)->isIntegerConstantExpr(M4, getContext());
10361     assert(IsConstM4 && "Constant arg isn't actually constant?");
10362     (void)IsConstM4;
10363     // Check whether this instance can be represented via a LLVM standard
10364     // intrinsic.  We only support some values of M4.
10365     Intrinsic::ID ID = Intrinsic::not_intrinsic;
10366     switch (M4.getZExtValue()) {
10367     default: break;
10368     case 4: ID = Intrinsic::minnum; break;
10369     }
10370     if (ID != Intrinsic::not_intrinsic) {
10371       Function *F = CGM.getIntrinsic(ID, ResultType);
10372       return Builder.CreateCall(F, {X, Y});
10373     }
10374     switch (BuiltinID) {
10375       case SystemZ::BI__builtin_s390_vfminsb: ID = Intrinsic::s390_vfminsb; break;
10376       case SystemZ::BI__builtin_s390_vfmindb: ID = Intrinsic::s390_vfmindb; break;
10377       default: llvm_unreachable("Unknown BuiltinID");
10378     }
10379     Function *F = CGM.getIntrinsic(ID);
10380     Value *M4Value = llvm::ConstantInt::get(getLLVMContext(), M4);
10381     return Builder.CreateCall(F, {X, Y, M4Value});
10382   }
10383 
10384   // Vector intrisincs that output the post-instruction CC value.
10385 
10386 #define INTRINSIC_WITH_CC(NAME) \
10387     case SystemZ::BI__builtin_##NAME: \
10388       return EmitSystemZIntrinsicWithCC(*this, Intrinsic::NAME, E)
10389 
10390   INTRINSIC_WITH_CC(s390_vpkshs);
10391   INTRINSIC_WITH_CC(s390_vpksfs);
10392   INTRINSIC_WITH_CC(s390_vpksgs);
10393 
10394   INTRINSIC_WITH_CC(s390_vpklshs);
10395   INTRINSIC_WITH_CC(s390_vpklsfs);
10396   INTRINSIC_WITH_CC(s390_vpklsgs);
10397 
10398   INTRINSIC_WITH_CC(s390_vceqbs);
10399   INTRINSIC_WITH_CC(s390_vceqhs);
10400   INTRINSIC_WITH_CC(s390_vceqfs);
10401   INTRINSIC_WITH_CC(s390_vceqgs);
10402 
10403   INTRINSIC_WITH_CC(s390_vchbs);
10404   INTRINSIC_WITH_CC(s390_vchhs);
10405   INTRINSIC_WITH_CC(s390_vchfs);
10406   INTRINSIC_WITH_CC(s390_vchgs);
10407 
10408   INTRINSIC_WITH_CC(s390_vchlbs);
10409   INTRINSIC_WITH_CC(s390_vchlhs);
10410   INTRINSIC_WITH_CC(s390_vchlfs);
10411   INTRINSIC_WITH_CC(s390_vchlgs);
10412 
10413   INTRINSIC_WITH_CC(s390_vfaebs);
10414   INTRINSIC_WITH_CC(s390_vfaehs);
10415   INTRINSIC_WITH_CC(s390_vfaefs);
10416 
10417   INTRINSIC_WITH_CC(s390_vfaezbs);
10418   INTRINSIC_WITH_CC(s390_vfaezhs);
10419   INTRINSIC_WITH_CC(s390_vfaezfs);
10420 
10421   INTRINSIC_WITH_CC(s390_vfeebs);
10422   INTRINSIC_WITH_CC(s390_vfeehs);
10423   INTRINSIC_WITH_CC(s390_vfeefs);
10424 
10425   INTRINSIC_WITH_CC(s390_vfeezbs);
10426   INTRINSIC_WITH_CC(s390_vfeezhs);
10427   INTRINSIC_WITH_CC(s390_vfeezfs);
10428 
10429   INTRINSIC_WITH_CC(s390_vfenebs);
10430   INTRINSIC_WITH_CC(s390_vfenehs);
10431   INTRINSIC_WITH_CC(s390_vfenefs);
10432 
10433   INTRINSIC_WITH_CC(s390_vfenezbs);
10434   INTRINSIC_WITH_CC(s390_vfenezhs);
10435   INTRINSIC_WITH_CC(s390_vfenezfs);
10436 
10437   INTRINSIC_WITH_CC(s390_vistrbs);
10438   INTRINSIC_WITH_CC(s390_vistrhs);
10439   INTRINSIC_WITH_CC(s390_vistrfs);
10440 
10441   INTRINSIC_WITH_CC(s390_vstrcbs);
10442   INTRINSIC_WITH_CC(s390_vstrchs);
10443   INTRINSIC_WITH_CC(s390_vstrcfs);
10444 
10445   INTRINSIC_WITH_CC(s390_vstrczbs);
10446   INTRINSIC_WITH_CC(s390_vstrczhs);
10447   INTRINSIC_WITH_CC(s390_vstrczfs);
10448 
10449   INTRINSIC_WITH_CC(s390_vfcesbs);
10450   INTRINSIC_WITH_CC(s390_vfcedbs);
10451   INTRINSIC_WITH_CC(s390_vfchsbs);
10452   INTRINSIC_WITH_CC(s390_vfchdbs);
10453   INTRINSIC_WITH_CC(s390_vfchesbs);
10454   INTRINSIC_WITH_CC(s390_vfchedbs);
10455 
10456   INTRINSIC_WITH_CC(s390_vftcisb);
10457   INTRINSIC_WITH_CC(s390_vftcidb);
10458 
10459 #undef INTRINSIC_WITH_CC
10460 
10461   default:
10462     return nullptr;
10463   }
10464 }
10465 
10466 Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
10467                                              const CallExpr *E) {
10468   auto MakeLdg = [&](unsigned IntrinsicID) {
10469     Value *Ptr = EmitScalarExpr(E->getArg(0));
10470     clang::CharUnits Align =
10471         getNaturalPointeeTypeAlignment(E->getArg(0)->getType());
10472     return Builder.CreateCall(
10473         CGM.getIntrinsic(IntrinsicID, {Ptr->getType()->getPointerElementType(),
10474                                        Ptr->getType()}),
10475         {Ptr, ConstantInt::get(Builder.getInt32Ty(), Align.getQuantity())});
10476   };
10477   auto MakeScopedAtomic = [&](unsigned IntrinsicID) {
10478     Value *Ptr = EmitScalarExpr(E->getArg(0));
10479     return Builder.CreateCall(
10480         CGM.getIntrinsic(IntrinsicID, {Ptr->getType()->getPointerElementType(),
10481                                        Ptr->getType()}),
10482         {Ptr, EmitScalarExpr(E->getArg(1))});
10483   };
10484   switch (BuiltinID) {
10485   case NVPTX::BI__nvvm_atom_add_gen_i:
10486   case NVPTX::BI__nvvm_atom_add_gen_l:
10487   case NVPTX::BI__nvvm_atom_add_gen_ll:
10488     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Add, E);
10489 
10490   case NVPTX::BI__nvvm_atom_sub_gen_i:
10491   case NVPTX::BI__nvvm_atom_sub_gen_l:
10492   case NVPTX::BI__nvvm_atom_sub_gen_ll:
10493     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Sub, E);
10494 
10495   case NVPTX::BI__nvvm_atom_and_gen_i:
10496   case NVPTX::BI__nvvm_atom_and_gen_l:
10497   case NVPTX::BI__nvvm_atom_and_gen_ll:
10498     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::And, E);
10499 
10500   case NVPTX::BI__nvvm_atom_or_gen_i:
10501   case NVPTX::BI__nvvm_atom_or_gen_l:
10502   case NVPTX::BI__nvvm_atom_or_gen_ll:
10503     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Or, E);
10504 
10505   case NVPTX::BI__nvvm_atom_xor_gen_i:
10506   case NVPTX::BI__nvvm_atom_xor_gen_l:
10507   case NVPTX::BI__nvvm_atom_xor_gen_ll:
10508     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xor, E);
10509 
10510   case NVPTX::BI__nvvm_atom_xchg_gen_i:
10511   case NVPTX::BI__nvvm_atom_xchg_gen_l:
10512   case NVPTX::BI__nvvm_atom_xchg_gen_ll:
10513     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xchg, E);
10514 
10515   case NVPTX::BI__nvvm_atom_max_gen_i:
10516   case NVPTX::BI__nvvm_atom_max_gen_l:
10517   case NVPTX::BI__nvvm_atom_max_gen_ll:
10518     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Max, E);
10519 
10520   case NVPTX::BI__nvvm_atom_max_gen_ui:
10521   case NVPTX::BI__nvvm_atom_max_gen_ul:
10522   case NVPTX::BI__nvvm_atom_max_gen_ull:
10523     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMax, E);
10524 
10525   case NVPTX::BI__nvvm_atom_min_gen_i:
10526   case NVPTX::BI__nvvm_atom_min_gen_l:
10527   case NVPTX::BI__nvvm_atom_min_gen_ll:
10528     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Min, E);
10529 
10530   case NVPTX::BI__nvvm_atom_min_gen_ui:
10531   case NVPTX::BI__nvvm_atom_min_gen_ul:
10532   case NVPTX::BI__nvvm_atom_min_gen_ull:
10533     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMin, E);
10534 
10535   case NVPTX::BI__nvvm_atom_cas_gen_i:
10536   case NVPTX::BI__nvvm_atom_cas_gen_l:
10537   case NVPTX::BI__nvvm_atom_cas_gen_ll:
10538     // __nvvm_atom_cas_gen_* should return the old value rather than the
10539     // success flag.
10540     return MakeAtomicCmpXchgValue(*this, E, /*ReturnBool=*/false);
10541 
10542   case NVPTX::BI__nvvm_atom_add_gen_f: {
10543     Value *Ptr = EmitScalarExpr(E->getArg(0));
10544     Value *Val = EmitScalarExpr(E->getArg(1));
10545     // atomicrmw only deals with integer arguments so we need to use
10546     // LLVM's nvvm_atomic_load_add_f32 intrinsic for that.
10547     Value *FnALAF32 =
10548         CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_add_f32, Ptr->getType());
10549     return Builder.CreateCall(FnALAF32, {Ptr, Val});
10550   }
10551 
10552   case NVPTX::BI__nvvm_atom_add_gen_d: {
10553     Value *Ptr = EmitScalarExpr(E->getArg(0));
10554     Value *Val = EmitScalarExpr(E->getArg(1));
10555     // atomicrmw only deals with integer arguments, so we need to use
10556     // LLVM's nvvm_atomic_load_add_f64 intrinsic.
10557     Value *FnALAF64 =
10558         CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_add_f64, Ptr->getType());
10559     return Builder.CreateCall(FnALAF64, {Ptr, Val});
10560   }
10561 
10562   case NVPTX::BI__nvvm_atom_inc_gen_ui: {
10563     Value *Ptr = EmitScalarExpr(E->getArg(0));
10564     Value *Val = EmitScalarExpr(E->getArg(1));
10565     Value *FnALI32 =
10566         CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_inc_32, Ptr->getType());
10567     return Builder.CreateCall(FnALI32, {Ptr, Val});
10568   }
10569 
10570   case NVPTX::BI__nvvm_atom_dec_gen_ui: {
10571     Value *Ptr = EmitScalarExpr(E->getArg(0));
10572     Value *Val = EmitScalarExpr(E->getArg(1));
10573     Value *FnALD32 =
10574         CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_dec_32, Ptr->getType());
10575     return Builder.CreateCall(FnALD32, {Ptr, Val});
10576   }
10577 
10578   case NVPTX::BI__nvvm_ldg_c:
10579   case NVPTX::BI__nvvm_ldg_c2:
10580   case NVPTX::BI__nvvm_ldg_c4:
10581   case NVPTX::BI__nvvm_ldg_s:
10582   case NVPTX::BI__nvvm_ldg_s2:
10583   case NVPTX::BI__nvvm_ldg_s4:
10584   case NVPTX::BI__nvvm_ldg_i:
10585   case NVPTX::BI__nvvm_ldg_i2:
10586   case NVPTX::BI__nvvm_ldg_i4:
10587   case NVPTX::BI__nvvm_ldg_l:
10588   case NVPTX::BI__nvvm_ldg_ll:
10589   case NVPTX::BI__nvvm_ldg_ll2:
10590   case NVPTX::BI__nvvm_ldg_uc:
10591   case NVPTX::BI__nvvm_ldg_uc2:
10592   case NVPTX::BI__nvvm_ldg_uc4:
10593   case NVPTX::BI__nvvm_ldg_us:
10594   case NVPTX::BI__nvvm_ldg_us2:
10595   case NVPTX::BI__nvvm_ldg_us4:
10596   case NVPTX::BI__nvvm_ldg_ui:
10597   case NVPTX::BI__nvvm_ldg_ui2:
10598   case NVPTX::BI__nvvm_ldg_ui4:
10599   case NVPTX::BI__nvvm_ldg_ul:
10600   case NVPTX::BI__nvvm_ldg_ull:
10601   case NVPTX::BI__nvvm_ldg_ull2:
10602     // PTX Interoperability section 2.2: "For a vector with an even number of
10603     // elements, its alignment is set to number of elements times the alignment
10604     // of its member: n*alignof(t)."
10605     return MakeLdg(Intrinsic::nvvm_ldg_global_i);
10606   case NVPTX::BI__nvvm_ldg_f:
10607   case NVPTX::BI__nvvm_ldg_f2:
10608   case NVPTX::BI__nvvm_ldg_f4:
10609   case NVPTX::BI__nvvm_ldg_d:
10610   case NVPTX::BI__nvvm_ldg_d2:
10611     return MakeLdg(Intrinsic::nvvm_ldg_global_f);
10612 
10613   case NVPTX::BI__nvvm_atom_cta_add_gen_i:
10614   case NVPTX::BI__nvvm_atom_cta_add_gen_l:
10615   case NVPTX::BI__nvvm_atom_cta_add_gen_ll:
10616     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_cta);
10617   case NVPTX::BI__nvvm_atom_sys_add_gen_i:
10618   case NVPTX::BI__nvvm_atom_sys_add_gen_l:
10619   case NVPTX::BI__nvvm_atom_sys_add_gen_ll:
10620     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_sys);
10621   case NVPTX::BI__nvvm_atom_cta_add_gen_f:
10622   case NVPTX::BI__nvvm_atom_cta_add_gen_d:
10623     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_cta);
10624   case NVPTX::BI__nvvm_atom_sys_add_gen_f:
10625   case NVPTX::BI__nvvm_atom_sys_add_gen_d:
10626     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_sys);
10627   case NVPTX::BI__nvvm_atom_cta_xchg_gen_i:
10628   case NVPTX::BI__nvvm_atom_cta_xchg_gen_l:
10629   case NVPTX::BI__nvvm_atom_cta_xchg_gen_ll:
10630     return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_cta);
10631   case NVPTX::BI__nvvm_atom_sys_xchg_gen_i:
10632   case NVPTX::BI__nvvm_atom_sys_xchg_gen_l:
10633   case NVPTX::BI__nvvm_atom_sys_xchg_gen_ll:
10634     return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_sys);
10635   case NVPTX::BI__nvvm_atom_cta_max_gen_i:
10636   case NVPTX::BI__nvvm_atom_cta_max_gen_ui:
10637   case NVPTX::BI__nvvm_atom_cta_max_gen_l:
10638   case NVPTX::BI__nvvm_atom_cta_max_gen_ul:
10639   case NVPTX::BI__nvvm_atom_cta_max_gen_ll:
10640   case NVPTX::BI__nvvm_atom_cta_max_gen_ull:
10641     return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_cta);
10642   case NVPTX::BI__nvvm_atom_sys_max_gen_i:
10643   case NVPTX::BI__nvvm_atom_sys_max_gen_ui:
10644   case NVPTX::BI__nvvm_atom_sys_max_gen_l:
10645   case NVPTX::BI__nvvm_atom_sys_max_gen_ul:
10646   case NVPTX::BI__nvvm_atom_sys_max_gen_ll:
10647   case NVPTX::BI__nvvm_atom_sys_max_gen_ull:
10648     return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_sys);
10649   case NVPTX::BI__nvvm_atom_cta_min_gen_i:
10650   case NVPTX::BI__nvvm_atom_cta_min_gen_ui:
10651   case NVPTX::BI__nvvm_atom_cta_min_gen_l:
10652   case NVPTX::BI__nvvm_atom_cta_min_gen_ul:
10653   case NVPTX::BI__nvvm_atom_cta_min_gen_ll:
10654   case NVPTX::BI__nvvm_atom_cta_min_gen_ull:
10655     return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_cta);
10656   case NVPTX::BI__nvvm_atom_sys_min_gen_i:
10657   case NVPTX::BI__nvvm_atom_sys_min_gen_ui:
10658   case NVPTX::BI__nvvm_atom_sys_min_gen_l:
10659   case NVPTX::BI__nvvm_atom_sys_min_gen_ul:
10660   case NVPTX::BI__nvvm_atom_sys_min_gen_ll:
10661   case NVPTX::BI__nvvm_atom_sys_min_gen_ull:
10662     return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_sys);
10663   case NVPTX::BI__nvvm_atom_cta_inc_gen_ui:
10664     return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_cta);
10665   case NVPTX::BI__nvvm_atom_cta_dec_gen_ui:
10666     return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_cta);
10667   case NVPTX::BI__nvvm_atom_sys_inc_gen_ui:
10668     return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_sys);
10669   case NVPTX::BI__nvvm_atom_sys_dec_gen_ui:
10670     return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_sys);
10671   case NVPTX::BI__nvvm_atom_cta_and_gen_i:
10672   case NVPTX::BI__nvvm_atom_cta_and_gen_l:
10673   case NVPTX::BI__nvvm_atom_cta_and_gen_ll:
10674     return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_cta);
10675   case NVPTX::BI__nvvm_atom_sys_and_gen_i:
10676   case NVPTX::BI__nvvm_atom_sys_and_gen_l:
10677   case NVPTX::BI__nvvm_atom_sys_and_gen_ll:
10678     return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_sys);
10679   case NVPTX::BI__nvvm_atom_cta_or_gen_i:
10680   case NVPTX::BI__nvvm_atom_cta_or_gen_l:
10681   case NVPTX::BI__nvvm_atom_cta_or_gen_ll:
10682     return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_cta);
10683   case NVPTX::BI__nvvm_atom_sys_or_gen_i:
10684   case NVPTX::BI__nvvm_atom_sys_or_gen_l:
10685   case NVPTX::BI__nvvm_atom_sys_or_gen_ll:
10686     return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_sys);
10687   case NVPTX::BI__nvvm_atom_cta_xor_gen_i:
10688   case NVPTX::BI__nvvm_atom_cta_xor_gen_l:
10689   case NVPTX::BI__nvvm_atom_cta_xor_gen_ll:
10690     return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_cta);
10691   case NVPTX::BI__nvvm_atom_sys_xor_gen_i:
10692   case NVPTX::BI__nvvm_atom_sys_xor_gen_l:
10693   case NVPTX::BI__nvvm_atom_sys_xor_gen_ll:
10694     return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_sys);
10695   case NVPTX::BI__nvvm_atom_cta_cas_gen_i:
10696   case NVPTX::BI__nvvm_atom_cta_cas_gen_l:
10697   case NVPTX::BI__nvvm_atom_cta_cas_gen_ll: {
10698     Value *Ptr = EmitScalarExpr(E->getArg(0));
10699     return Builder.CreateCall(
10700         CGM.getIntrinsic(
10701             Intrinsic::nvvm_atomic_cas_gen_i_cta,
10702             {Ptr->getType()->getPointerElementType(), Ptr->getType()}),
10703         {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
10704   }
10705   case NVPTX::BI__nvvm_atom_sys_cas_gen_i:
10706   case NVPTX::BI__nvvm_atom_sys_cas_gen_l:
10707   case NVPTX::BI__nvvm_atom_sys_cas_gen_ll: {
10708     Value *Ptr = EmitScalarExpr(E->getArg(0));
10709     return Builder.CreateCall(
10710         CGM.getIntrinsic(
10711             Intrinsic::nvvm_atomic_cas_gen_i_sys,
10712             {Ptr->getType()->getPointerElementType(), Ptr->getType()}),
10713         {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
10714   }
10715   case NVPTX::BI__nvvm_match_all_sync_i32p:
10716   case NVPTX::BI__nvvm_match_all_sync_i64p: {
10717     Value *Mask = EmitScalarExpr(E->getArg(0));
10718     Value *Val = EmitScalarExpr(E->getArg(1));
10719     Address PredOutPtr = EmitPointerWithAlignment(E->getArg(2));
10720     Value *ResultPair = Builder.CreateCall(
10721         CGM.getIntrinsic(BuiltinID == NVPTX::BI__nvvm_match_all_sync_i32p
10722                              ? Intrinsic::nvvm_match_all_sync_i32p
10723                              : Intrinsic::nvvm_match_all_sync_i64p),
10724         {Mask, Val});
10725     Value *Pred = Builder.CreateZExt(Builder.CreateExtractValue(ResultPair, 1),
10726                                      PredOutPtr.getElementType());
10727     Builder.CreateStore(Pred, PredOutPtr);
10728     return Builder.CreateExtractValue(ResultPair, 0);
10729   }
10730   case NVPTX::BI__hmma_m16n16k16_ld_a:
10731   case NVPTX::BI__hmma_m16n16k16_ld_b:
10732   case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
10733   case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
10734   case NVPTX::BI__hmma_m32n8k16_ld_a:
10735   case NVPTX::BI__hmma_m32n8k16_ld_b:
10736   case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
10737   case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
10738   case NVPTX::BI__hmma_m8n32k16_ld_a:
10739   case NVPTX::BI__hmma_m8n32k16_ld_b:
10740   case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
10741   case NVPTX::BI__hmma_m8n32k16_ld_c_f32: {
10742     Address Dst = EmitPointerWithAlignment(E->getArg(0));
10743     Value *Src = EmitScalarExpr(E->getArg(1));
10744     Value *Ldm = EmitScalarExpr(E->getArg(2));
10745     llvm::APSInt isColMajorArg;
10746     if (!E->getArg(3)->isIntegerConstantExpr(isColMajorArg, getContext()))
10747       return nullptr;
10748     bool isColMajor = isColMajorArg.getSExtValue();
10749     unsigned IID;
10750     unsigned NumResults;
10751     switch (BuiltinID) {
10752     case NVPTX::BI__hmma_m16n16k16_ld_a:
10753       IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride
10754                        : Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride;
10755       NumResults = 8;
10756       break;
10757     case NVPTX::BI__hmma_m16n16k16_ld_b:
10758       IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride
10759                        : Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride;
10760       NumResults = 8;
10761       break;
10762     case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
10763       IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride
10764                        : Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride;
10765       NumResults = 4;
10766       break;
10767     case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
10768       IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride
10769                        : Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride;
10770       NumResults = 8;
10771       break;
10772     case NVPTX::BI__hmma_m32n8k16_ld_a:
10773       IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride
10774                        : Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride;
10775       NumResults = 8;
10776       break;
10777     case NVPTX::BI__hmma_m32n8k16_ld_b:
10778       IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride
10779                        : Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride;
10780       NumResults = 8;
10781       break;
10782     case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
10783       IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride
10784                        : Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride;
10785       NumResults = 4;
10786       break;
10787     case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
10788       IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride
10789                        : Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride;
10790       NumResults = 8;
10791       break;
10792     case NVPTX::BI__hmma_m8n32k16_ld_a:
10793       IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride
10794                        : Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride;
10795       NumResults = 8;
10796       break;
10797     case NVPTX::BI__hmma_m8n32k16_ld_b:
10798       IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride
10799                        : Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride;
10800       NumResults = 8;
10801       break;
10802     case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
10803       IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride
10804                        : Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride;
10805       NumResults = 4;
10806       break;
10807     case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
10808       IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride
10809                        : Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride;
10810       NumResults = 8;
10811       break;
10812     default:
10813       llvm_unreachable("Unexpected builtin ID.");
10814     }
10815     Value *Result =
10816         Builder.CreateCall(CGM.getIntrinsic(IID, Src->getType()), {Src, Ldm});
10817 
10818     // Save returned values.
10819     for (unsigned i = 0; i < NumResults; ++i) {
10820       Builder.CreateAlignedStore(
10821           Builder.CreateBitCast(Builder.CreateExtractValue(Result, i),
10822                                 Dst.getElementType()),
10823           Builder.CreateGEP(Dst.getPointer(), llvm::ConstantInt::get(IntTy, i)),
10824           CharUnits::fromQuantity(4));
10825     }
10826     return Result;
10827   }
10828 
10829   case NVPTX::BI__hmma_m16n16k16_st_c_f16:
10830   case NVPTX::BI__hmma_m16n16k16_st_c_f32:
10831   case NVPTX::BI__hmma_m32n8k16_st_c_f16:
10832   case NVPTX::BI__hmma_m32n8k16_st_c_f32:
10833   case NVPTX::BI__hmma_m8n32k16_st_c_f16:
10834   case NVPTX::BI__hmma_m8n32k16_st_c_f32: {
10835     Value *Dst = EmitScalarExpr(E->getArg(0));
10836     Address Src = EmitPointerWithAlignment(E->getArg(1));
10837     Value *Ldm = EmitScalarExpr(E->getArg(2));
10838     llvm::APSInt isColMajorArg;
10839     if (!E->getArg(3)->isIntegerConstantExpr(isColMajorArg, getContext()))
10840       return nullptr;
10841     bool isColMajor = isColMajorArg.getSExtValue();
10842     unsigned IID;
10843     unsigned NumResults = 8;
    // PTX Instructions (and LLVM intrinsics) are defined for slice _d_, yet
    // for some reason nvcc builtins use _c_.
10846     switch (BuiltinID) {
10847     case NVPTX::BI__hmma_m16n16k16_st_c_f16:
10848       IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride
10849                        : Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride;
10850       NumResults = 4;
10851       break;
10852     case NVPTX::BI__hmma_m16n16k16_st_c_f32:
10853       IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride
10854                        : Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride;
10855       break;
10856     case NVPTX::BI__hmma_m32n8k16_st_c_f16:
10857       IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride
10858                        : Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride;
10859       NumResults = 4;
10860       break;
10861     case NVPTX::BI__hmma_m32n8k16_st_c_f32:
10862       IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride
10863                        : Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride;
10864       break;
10865     case NVPTX::BI__hmma_m8n32k16_st_c_f16:
10866       IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride
10867                        : Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride;
10868       NumResults = 4;
10869       break;
10870     case NVPTX::BI__hmma_m8n32k16_st_c_f32:
10871       IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride
10872                        : Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride;
10873       break;
10874     default:
10875       llvm_unreachable("Unexpected builtin ID.");
10876     }
10877     Function *Intrinsic = CGM.getIntrinsic(IID, Dst->getType());
10878     llvm::Type *ParamType = Intrinsic->getFunctionType()->getParamType(1);
10879     SmallVector<Value *, 10> Values = {Dst};
10880     for (unsigned i = 0; i < NumResults; ++i) {
10881       Value *V = Builder.CreateAlignedLoad(
10882           Builder.CreateGEP(Src.getPointer(), llvm::ConstantInt::get(IntTy, i)),
10883           CharUnits::fromQuantity(4));
10884       Values.push_back(Builder.CreateBitCast(V, ParamType));
10885     }
10886     Values.push_back(Ldm);
10887     Value *Result = Builder.CreateCall(Intrinsic, Values);
10888     return Result;
10889   }
10890 
10891   // BI__hmma_m16n16k16_mma_<Dtype><CType>(d, a, b, c, layout, satf) -->
10892   // Intrinsic::nvvm_wmma_m16n16k16_mma_sync<layout A,B><DType><CType><Satf>
10893   case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
10894   case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
10895   case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
10896   case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
10897   case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
10898   case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
10899   case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
10900   case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
10901   case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
10902   case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
10903   case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
10904   case NVPTX::BI__hmma_m8n32k16_mma_f16f32: {
10905     Address Dst = EmitPointerWithAlignment(E->getArg(0));
10906     Address SrcA = EmitPointerWithAlignment(E->getArg(1));
10907     Address SrcB = EmitPointerWithAlignment(E->getArg(2));
10908     Address SrcC = EmitPointerWithAlignment(E->getArg(3));
10909     llvm::APSInt LayoutArg;
10910     if (!E->getArg(4)->isIntegerConstantExpr(LayoutArg, getContext()))
10911       return nullptr;
10912     int Layout = LayoutArg.getSExtValue();
10913     if (Layout < 0 || Layout > 3)
10914       return nullptr;
10915     llvm::APSInt SatfArg;
10916     if (!E->getArg(5)->isIntegerConstantExpr(SatfArg, getContext()))
10917       return nullptr;
10918     bool Satf = SatfArg.getSExtValue();
10919 
10920     // clang-format off
10921 #define MMA_VARIANTS(geom, type) {{                                 \
10922       Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type,             \
10923       Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type##_satfinite, \
10924       Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type,             \
10925       Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
10926       Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type,             \
10927       Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type##_satfinite, \
10928       Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type,             \
10929       Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type##_satfinite  \
10930     }}
10931     // clang-format on
10932 
10933     auto getMMAIntrinsic = [Layout, Satf](std::array<unsigned, 8> Variants) {
10934       unsigned Index = Layout * 2 + Satf;
10935       assert(Index < 8);
10936       return Variants[Index];
10937     };
10938     unsigned IID;
10939     unsigned NumEltsC;
10940     unsigned NumEltsD;
10941     switch (BuiltinID) {
10942     case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
10943       IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f16_f16));
10944       NumEltsC = 4;
10945       NumEltsD = 4;
10946       break;
10947     case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
10948       IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f32_f16));
10949       NumEltsC = 4;
10950       NumEltsD = 8;
10951       break;
10952     case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
10953       IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f16_f32));
10954       NumEltsC = 8;
10955       NumEltsD = 4;
10956       break;
10957     case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
10958       IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f32_f32));
10959       NumEltsC = 8;
10960       NumEltsD = 8;
10961       break;
10962     case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
10963       IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f16_f16));
10964       NumEltsC = 4;
10965       NumEltsD = 4;
10966       break;
10967     case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
10968       IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f32_f16));
10969       NumEltsC = 4;
10970       NumEltsD = 8;
10971       break;
10972     case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
10973       IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f16_f32));
10974       NumEltsC = 8;
10975       NumEltsD = 4;
10976       break;
10977     case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
10978       IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f32_f32));
10979       NumEltsC = 8;
10980       NumEltsD = 8;
10981       break;
10982     case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
10983       IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f16_f16));
10984       NumEltsC = 4;
10985       NumEltsD = 4;
10986       break;
10987     case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
10988       IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f32_f16));
10989       NumEltsC = 4;
10990       NumEltsD = 8;
10991       break;
10992     case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
10993       IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f16_f32));
10994       NumEltsC = 8;
10995       NumEltsD = 4;
10996       break;
10997     case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
10998       IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f32_f32));
10999       NumEltsC = 8;
11000       NumEltsD = 8;
11001       break;
11002     default:
11003       llvm_unreachable("Unexpected builtin ID.");
11004     }
11005 #undef MMA_VARIANTS
11006 
11007     SmallVector<Value *, 24> Values;
11008     Function *Intrinsic = CGM.getIntrinsic(IID);
11009     llvm::Type *ABType = Intrinsic->getFunctionType()->getParamType(0);
11010     // Load A
11011     for (unsigned i = 0; i < 8; ++i) {
11012       Value *V = Builder.CreateAlignedLoad(
11013           Builder.CreateGEP(SrcA.getPointer(),
11014                             llvm::ConstantInt::get(IntTy, i)),
11015           CharUnits::fromQuantity(4));
11016       Values.push_back(Builder.CreateBitCast(V, ABType));
11017     }
11018     // Load B
11019     for (unsigned i = 0; i < 8; ++i) {
11020       Value *V = Builder.CreateAlignedLoad(
11021           Builder.CreateGEP(SrcB.getPointer(),
11022                             llvm::ConstantInt::get(IntTy, i)),
11023           CharUnits::fromQuantity(4));
11024       Values.push_back(Builder.CreateBitCast(V, ABType));
11025     }
11026     // Load C
11027     llvm::Type *CType = Intrinsic->getFunctionType()->getParamType(16);
11028     for (unsigned i = 0; i < NumEltsC; ++i) {
11029       Value *V = Builder.CreateAlignedLoad(
11030           Builder.CreateGEP(SrcC.getPointer(),
11031                             llvm::ConstantInt::get(IntTy, i)),
11032           CharUnits::fromQuantity(4));
11033       Values.push_back(Builder.CreateBitCast(V, CType));
11034     }
11035     Value *Result = Builder.CreateCall(Intrinsic, Values);
11036     llvm::Type *DType = Dst.getElementType();
11037     for (unsigned i = 0; i < NumEltsD; ++i)
11038       Builder.CreateAlignedStore(
11039           Builder.CreateBitCast(Builder.CreateExtractValue(Result, i), DType),
11040           Builder.CreateGEP(Dst.getPointer(), llvm::ConstantInt::get(IntTy, i)),
11041           CharUnits::fromQuantity(4));
11042     return Result;
11043   }
11044   default:
11045     return nullptr;
11046   }
11047 }
11048 
11049 Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
11050                                                    const CallExpr *E) {
11051   switch (BuiltinID) {
11052   case WebAssembly::BI__builtin_wasm_mem_size: {
11053     llvm::Type *ResultType = ConvertType(E->getType());
11054     Value *I = EmitScalarExpr(E->getArg(0));
11055     Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_mem_size, ResultType);
11056     return Builder.CreateCall(Callee, I);
11057   }
11058   case WebAssembly::BI__builtin_wasm_mem_grow: {
11059     llvm::Type *ResultType = ConvertType(E->getType());
11060     Value *Args[] = {
11061       EmitScalarExpr(E->getArg(0)),
11062       EmitScalarExpr(E->getArg(1))
11063     };
11064     Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_mem_grow, ResultType);
11065     return Builder.CreateCall(Callee, Args);
11066   }
11067   case WebAssembly::BI__builtin_wasm_current_memory: {
11068     llvm::Type *ResultType = ConvertType(E->getType());
11069     Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_current_memory, ResultType);
11070     return Builder.CreateCall(Callee);
11071   }
11072   case WebAssembly::BI__builtin_wasm_grow_memory: {
11073     Value *X = EmitScalarExpr(E->getArg(0));
11074     Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_grow_memory, X->getType());
11075     return Builder.CreateCall(Callee, X);
11076   }
11077   case WebAssembly::BI__builtin_wasm_throw: {
11078     Value *Tag = EmitScalarExpr(E->getArg(0));
11079     Value *Obj = EmitScalarExpr(E->getArg(1));
11080     Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_throw);
11081     return Builder.CreateCall(Callee, {Tag, Obj});
11082   }
11083   case WebAssembly::BI__builtin_wasm_rethrow: {
11084     Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_rethrow);
11085     return Builder.CreateCall(Callee);
11086   }
11087 
11088   default:
11089     return nullptr;
11090   }
11091 }
11092 
11093 Value *CodeGenFunction::EmitHexagonBuiltinExpr(unsigned BuiltinID,
11094                                                const CallExpr *E) {
11095   SmallVector<llvm::Value *, 4> Ops;
11096   Intrinsic::ID ID = Intrinsic::not_intrinsic;
11097 
11098   auto MakeCircLd = [&](unsigned IntID, bool HasImm) {
11099     // The base pointer is passed by address, so it needs to be loaded.
11100     Address BP = EmitPointerWithAlignment(E->getArg(0));
11101     BP = Address(Builder.CreateBitCast(BP.getPointer(), Int8PtrPtrTy),
11102                  BP.getAlignment());
11103     llvm::Value *Base = Builder.CreateLoad(BP);
11104     // Operands are Base, Increment, Modifier, Start.
11105     if (HasImm)
11106       Ops = { Base, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)),
11107               EmitScalarExpr(E->getArg(3)) };
11108     else
11109       Ops = { Base, EmitScalarExpr(E->getArg(1)),
11110               EmitScalarExpr(E->getArg(2)) };
11111 
11112     llvm::Value *Result = Builder.CreateCall(CGM.getIntrinsic(IntID), Ops);
11113     llvm::Value *NewBase = Builder.CreateExtractValue(Result, 1);
11114     llvm::Value *LV = Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)),
11115                                             NewBase->getType()->getPointerTo());
11116     Address Dest = EmitPointerWithAlignment(E->getArg(0));
11117     // The intrinsic generates two results. The new value for the base pointer
11118     // needs to be stored.
11119     Builder.CreateAlignedStore(NewBase, LV, Dest.getAlignment());
11120     return Builder.CreateExtractValue(Result, 0);
11121   };
11122 
11123   auto MakeCircSt = [&](unsigned IntID, bool HasImm) {
11124     // The base pointer is passed by address, so it needs to be loaded.
11125     Address BP = EmitPointerWithAlignment(E->getArg(0));
11126     BP = Address(Builder.CreateBitCast(BP.getPointer(), Int8PtrPtrTy),
11127                  BP.getAlignment());
11128     llvm::Value *Base = Builder.CreateLoad(BP);
11129     // Operands are Base, Increment, Modifier, Value, Start.
11130     if (HasImm)
11131       Ops = { Base, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)),
11132               EmitScalarExpr(E->getArg(3)), EmitScalarExpr(E->getArg(4)) };
11133     else
11134       Ops = { Base, EmitScalarExpr(E->getArg(1)),
11135               EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3)) };
11136 
11137     llvm::Value *NewBase = Builder.CreateCall(CGM.getIntrinsic(IntID), Ops);
11138     llvm::Value *LV = Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)),
11139                                             NewBase->getType()->getPointerTo());
11140     Address Dest = EmitPointerWithAlignment(E->getArg(0));
11141     // The intrinsic generates one result, which is the new value for the base
11142     // pointer. It needs to be stored.
11143     return Builder.CreateAlignedStore(NewBase, LV, Dest.getAlignment());
11144   };
11145 
11146   // Handle the conversion of bit-reverse load intrinsics to bit code.
11147   // The intrinsic call after this function only reads from memory and the
11148   // write to memory is dealt by the store instruction.
11149   auto MakeBrevLd = [&](unsigned IntID, llvm::Type *DestTy) {
11150     // The intrinsic generates one result, which is the new value for the base
11151     // pointer. It needs to be returned. The result of the load instruction is
11152     // passed to intrinsic by address, so the value needs to be stored.
11153     llvm::Value *BaseAddress =
11154         Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), Int8PtrTy);
11155 
11156     // Expressions like &(*pt++) will be incremented per evaluation.
11157     // EmitPointerWithAlignment and EmitScalarExpr evaluates the expression
11158     // per call.
11159     Address DestAddr = EmitPointerWithAlignment(E->getArg(1));
11160     DestAddr = Address(Builder.CreateBitCast(DestAddr.getPointer(), Int8PtrTy),
11161                        DestAddr.getAlignment());
11162     llvm::Value *DestAddress = DestAddr.getPointer();
11163 
11164     // Operands are Base, Dest, Modifier.
11165     // The intrinsic format in LLVM IR is defined as
11166     // { ValueType, i8* } (i8*, i32).
11167     Ops = {BaseAddress, EmitScalarExpr(E->getArg(2))};
11168 
11169     llvm::Value *Result = Builder.CreateCall(CGM.getIntrinsic(IntID), Ops);
11170     // The value needs to be stored as the variable is passed by reference.
11171     llvm::Value *DestVal = Builder.CreateExtractValue(Result, 0);
11172 
11173     // The store needs to be truncated to fit the destination type.
11174     // While i32 and i64 are natively supported on Hexagon, i8 and i16 needs
11175     // to be handled with stores of respective destination type.
11176     DestVal = Builder.CreateTrunc(DestVal, DestTy);
11177 
11178     llvm::Value *DestForStore =
11179         Builder.CreateBitCast(DestAddress, DestVal->getType()->getPointerTo());
11180     Builder.CreateAlignedStore(DestVal, DestForStore, DestAddr.getAlignment());
11181     // The updated value of the base pointer is returned.
11182     return Builder.CreateExtractValue(Result, 1);
11183   };
11184 
11185   switch (BuiltinID) {
11186   case Hexagon::BI__builtin_HEXAGON_V6_vaddcarry:
11187   case Hexagon::BI__builtin_HEXAGON_V6_vaddcarry_128B: {
11188     Address Dest = EmitPointerWithAlignment(E->getArg(2));
11189     unsigned Size;
11190     if (BuiltinID == Hexagon::BI__builtin_HEXAGON_V6_vaddcarry) {
11191       Size = 512;
11192       ID = Intrinsic::hexagon_V6_vaddcarry;
11193     } else {
11194       Size = 1024;
11195       ID = Intrinsic::hexagon_V6_vaddcarry_128B;
11196     }
11197     Dest = Builder.CreateBitCast(Dest,
11198         llvm::VectorType::get(Builder.getInt1Ty(), Size)->getPointerTo(0));
11199     LoadInst *QLd = Builder.CreateLoad(Dest);
11200     Ops = { EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)), QLd };
11201     llvm::Value *Result = Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
11202     llvm::Value *Vprd = Builder.CreateExtractValue(Result, 1);
11203     llvm::Value *Base = Builder.CreateBitCast(EmitScalarExpr(E->getArg(2)),
11204                                               Vprd->getType()->getPointerTo(0));
11205     Builder.CreateAlignedStore(Vprd, Base, Dest.getAlignment());
11206     return Builder.CreateExtractValue(Result, 0);
11207   }
11208   case Hexagon::BI__builtin_HEXAGON_V6_vsubcarry:
11209   case Hexagon::BI__builtin_HEXAGON_V6_vsubcarry_128B: {
11210     Address Dest = EmitPointerWithAlignment(E->getArg(2));
11211     unsigned Size;
11212     if (BuiltinID == Hexagon::BI__builtin_HEXAGON_V6_vsubcarry) {
11213       Size = 512;
11214       ID = Intrinsic::hexagon_V6_vsubcarry;
11215     } else {
11216       Size = 1024;
11217       ID = Intrinsic::hexagon_V6_vsubcarry_128B;
11218     }
11219     Dest = Builder.CreateBitCast(Dest,
11220         llvm::VectorType::get(Builder.getInt1Ty(), Size)->getPointerTo(0));
11221     LoadInst *QLd = Builder.CreateLoad(Dest);
11222     Ops = { EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)), QLd };
11223     llvm::Value *Result = Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
11224     llvm::Value *Vprd = Builder.CreateExtractValue(Result, 1);
11225     llvm::Value *Base = Builder.CreateBitCast(EmitScalarExpr(E->getArg(2)),
11226                                               Vprd->getType()->getPointerTo(0));
11227     Builder.CreateAlignedStore(Vprd, Base, Dest.getAlignment());
11228     return Builder.CreateExtractValue(Result, 0);
11229   }
11230   case Hexagon::BI__builtin_HEXAGON_L2_loadrub_pci:
11231     return MakeCircLd(Intrinsic::hexagon_L2_loadrub_pci, /*HasImm*/true);
11232   case Hexagon::BI__builtin_HEXAGON_L2_loadrb_pci:
11233     return MakeCircLd(Intrinsic::hexagon_L2_loadrb_pci,  /*HasImm*/true);
11234   case Hexagon::BI__builtin_HEXAGON_L2_loadruh_pci:
11235     return MakeCircLd(Intrinsic::hexagon_L2_loadruh_pci, /*HasImm*/true);
11236   case Hexagon::BI__builtin_HEXAGON_L2_loadrh_pci:
11237     return MakeCircLd(Intrinsic::hexagon_L2_loadrh_pci,  /*HasImm*/true);
11238   case Hexagon::BI__builtin_HEXAGON_L2_loadri_pci:
11239     return MakeCircLd(Intrinsic::hexagon_L2_loadri_pci,  /*HasImm*/true);
11240   case Hexagon::BI__builtin_HEXAGON_L2_loadrd_pci:
11241     return MakeCircLd(Intrinsic::hexagon_L2_loadrd_pci,  /*HasImm*/true);
11242   case Hexagon::BI__builtin_HEXAGON_L2_loadrub_pcr:
11243     return MakeCircLd(Intrinsic::hexagon_L2_loadrub_pcr, /*HasImm*/false);
11244   case Hexagon::BI__builtin_HEXAGON_L2_loadrb_pcr:
11245     return MakeCircLd(Intrinsic::hexagon_L2_loadrb_pcr,  /*HasImm*/false);
11246   case Hexagon::BI__builtin_HEXAGON_L2_loadruh_pcr:
11247     return MakeCircLd(Intrinsic::hexagon_L2_loadruh_pcr, /*HasImm*/false);
11248   case Hexagon::BI__builtin_HEXAGON_L2_loadrh_pcr:
11249     return MakeCircLd(Intrinsic::hexagon_L2_loadrh_pcr,  /*HasImm*/false);
11250   case Hexagon::BI__builtin_HEXAGON_L2_loadri_pcr:
11251     return MakeCircLd(Intrinsic::hexagon_L2_loadri_pcr,  /*HasImm*/false);
11252   case Hexagon::BI__builtin_HEXAGON_L2_loadrd_pcr:
11253     return MakeCircLd(Intrinsic::hexagon_L2_loadrd_pcr,  /*HasImm*/false);
11254   case Hexagon::BI__builtin_HEXAGON_S2_storerb_pci:
11255     return MakeCircSt(Intrinsic::hexagon_S2_storerb_pci, /*HasImm*/true);
11256   case Hexagon::BI__builtin_HEXAGON_S2_storerh_pci:
11257     return MakeCircSt(Intrinsic::hexagon_S2_storerh_pci, /*HasImm*/true);
11258   case Hexagon::BI__builtin_HEXAGON_S2_storerf_pci:
11259     return MakeCircSt(Intrinsic::hexagon_S2_storerf_pci, /*HasImm*/true);
11260   case Hexagon::BI__builtin_HEXAGON_S2_storeri_pci:
11261     return MakeCircSt(Intrinsic::hexagon_S2_storeri_pci, /*HasImm*/true);
11262   case Hexagon::BI__builtin_HEXAGON_S2_storerd_pci:
11263     return MakeCircSt(Intrinsic::hexagon_S2_storerd_pci, /*HasImm*/true);
11264   case Hexagon::BI__builtin_HEXAGON_S2_storerb_pcr:
11265     return MakeCircSt(Intrinsic::hexagon_S2_storerb_pcr, /*HasImm*/false);
11266   case Hexagon::BI__builtin_HEXAGON_S2_storerh_pcr:
11267     return MakeCircSt(Intrinsic::hexagon_S2_storerh_pcr, /*HasImm*/false);
11268   case Hexagon::BI__builtin_HEXAGON_S2_storerf_pcr:
11269     return MakeCircSt(Intrinsic::hexagon_S2_storerf_pcr, /*HasImm*/false);
11270   case Hexagon::BI__builtin_HEXAGON_S2_storeri_pcr:
11271     return MakeCircSt(Intrinsic::hexagon_S2_storeri_pcr, /*HasImm*/false);
11272   case Hexagon::BI__builtin_HEXAGON_S2_storerd_pcr:
11273     return MakeCircSt(Intrinsic::hexagon_S2_storerd_pcr, /*HasImm*/false);
11274   case Hexagon::BI__builtin_brev_ldub:
11275     return MakeBrevLd(Intrinsic::hexagon_L2_loadrub_pbr, Int8Ty);
11276   case Hexagon::BI__builtin_brev_ldb:
11277     return MakeBrevLd(Intrinsic::hexagon_L2_loadrb_pbr, Int8Ty);
11278   case Hexagon::BI__builtin_brev_lduh:
11279     return MakeBrevLd(Intrinsic::hexagon_L2_loadruh_pbr, Int16Ty);
11280   case Hexagon::BI__builtin_brev_ldh:
11281     return MakeBrevLd(Intrinsic::hexagon_L2_loadrh_pbr, Int16Ty);
11282   case Hexagon::BI__builtin_brev_ldw:
11283     return MakeBrevLd(Intrinsic::hexagon_L2_loadri_pbr, Int32Ty);
11284   case Hexagon::BI__builtin_brev_ldd:
11285     return MakeBrevLd(Intrinsic::hexagon_L2_loadrd_pbr, Int64Ty);
11286   default:
11287     break;
11288   } // switch
11289 
11290   return nullptr;
11291 }
11292