1 //===---- CGBuiltin.cpp - Emit LLVM Code for builtins ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This contains code to emit Builtin calls as LLVM code.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "CGCUDARuntime.h"
14 #include "CGCXXABI.h"
15 #include "CGObjCRuntime.h"
16 #include "CGOpenCLRuntime.h"
17 #include "CGRecordLayout.h"
18 #include "CodeGenFunction.h"
19 #include "CodeGenModule.h"
20 #include "ConstantEmitter.h"
21 #include "PatternInit.h"
22 #include "TargetInfo.h"
23 #include "clang/AST/ASTContext.h"
24 #include "clang/AST/Attr.h"
25 #include "clang/AST/Decl.h"
26 #include "clang/AST/OSLog.h"
27 #include "clang/Basic/TargetBuiltins.h"
28 #include "clang/Basic/TargetInfo.h"
29 #include "clang/CodeGen/CGFunctionInfo.h"
30 #include "llvm/ADT/APFloat.h"
31 #include "llvm/ADT/APInt.h"
32 #include "llvm/ADT/SmallPtrSet.h"
33 #include "llvm/ADT/StringExtras.h"
34 #include "llvm/Analysis/ValueTracking.h"
35 #include "llvm/IR/DataLayout.h"
36 #include "llvm/IR/InlineAsm.h"
37 #include "llvm/IR/Intrinsics.h"
38 #include "llvm/IR/IntrinsicsAArch64.h"
39 #include "llvm/IR/IntrinsicsAMDGPU.h"
40 #include "llvm/IR/IntrinsicsARM.h"
41 #include "llvm/IR/IntrinsicsBPF.h"
42 #include "llvm/IR/IntrinsicsHexagon.h"
43 #include "llvm/IR/IntrinsicsNVPTX.h"
44 #include "llvm/IR/IntrinsicsPowerPC.h"
45 #include "llvm/IR/IntrinsicsR600.h"
46 #include "llvm/IR/IntrinsicsRISCV.h"
47 #include "llvm/IR/IntrinsicsS390.h"
48 #include "llvm/IR/IntrinsicsWebAssembly.h"
49 #include "llvm/IR/IntrinsicsX86.h"
50 #include "llvm/IR/MDBuilder.h"
51 #include "llvm/IR/MatrixBuilder.h"
52 #include "llvm/Support/ConvertUTF.h"
53 #include "llvm/Support/ScopedPrinter.h"
54 #include "llvm/Support/X86TargetParser.h"
55 #include <sstream>
56 
57 using namespace clang;
58 using namespace CodeGen;
59 using namespace llvm;
60 
/// Restrict \p Value to the range bounded by \p Low and \p High. The lower
/// bound is applied first and the upper bound second, so \p High is returned
/// whenever the lower-bounded value is not below it.
static int64_t clamp(int64_t Value, int64_t Low, int64_t High) {
  const int64_t AtLeastLow = Value < Low ? Low : Value;
  return AtLeastLow < High ? AtLeastLow : High;
}
65 
66 static void initializeAlloca(CodeGenFunction &CGF, AllocaInst *AI, Value *Size,
67                              Align AlignmentInBytes) {
68   ConstantInt *Byte;
69   switch (CGF.getLangOpts().getTrivialAutoVarInit()) {
70   case LangOptions::TrivialAutoVarInitKind::Uninitialized:
71     // Nothing to initialize.
72     return;
73   case LangOptions::TrivialAutoVarInitKind::Zero:
74     Byte = CGF.Builder.getInt8(0x00);
75     break;
76   case LangOptions::TrivialAutoVarInitKind::Pattern: {
77     llvm::Type *Int8 = llvm::IntegerType::getInt8Ty(CGF.CGM.getLLVMContext());
78     Byte = llvm::dyn_cast<llvm::ConstantInt>(
79         initializationPatternFor(CGF.CGM, Int8));
80     break;
81   }
82   }
83   if (CGF.CGM.stopAutoInit())
84     return;
85   auto *I = CGF.Builder.CreateMemSet(AI, Byte, Size, AlignmentInBytes);
86   I->addAnnotationMetadata("auto-init");
87 }
88 
89 /// getBuiltinLibFunction - Given a builtin id for a function like
90 /// "__builtin_fabsf", return a Function* for "fabsf".
91 llvm::Constant *CodeGenModule::getBuiltinLibFunction(const FunctionDecl *FD,
92                                                      unsigned BuiltinID) {
93   assert(Context.BuiltinInfo.isLibFunction(BuiltinID));
94 
95   // Get the name, skip over the __builtin_ prefix (if necessary).
96   StringRef Name;
97   GlobalDecl D(FD);
98 
99   // TODO: This list should be expanded or refactored after all GCC-compatible
100   // std libcall builtins are implemented.
101   static SmallDenseMap<unsigned, StringRef, 8> F128Builtins{
102       {Builtin::BI__builtin_printf, "__printfieee128"},
103       {Builtin::BI__builtin_vsnprintf, "__vsnprintfieee128"},
104       {Builtin::BI__builtin_vsprintf, "__vsprintfieee128"},
105       {Builtin::BI__builtin_sprintf, "__sprintfieee128"},
106       {Builtin::BI__builtin_snprintf, "__snprintfieee128"},
107       {Builtin::BI__builtin_fprintf, "__fprintfieee128"},
108       {Builtin::BI__builtin_nexttowardf128, "__nexttowardieee128"},
109   };
110 
111   // If the builtin has been declared explicitly with an assembler label,
112   // use the mangled name. This differs from the plain label on platforms
113   // that prefix labels.
114   if (FD->hasAttr<AsmLabelAttr>())
115     Name = getMangledName(D);
116   else {
117     // TODO: This mutation should also be applied to other targets other than
118     // PPC, after backend supports IEEE 128-bit style libcalls.
119     if (getTriple().isPPC64() &&
120         &getTarget().getLongDoubleFormat() == &llvm::APFloat::IEEEquad() &&
121         F128Builtins.find(BuiltinID) != F128Builtins.end())
122       Name = F128Builtins[BuiltinID];
123     else
124       Name = Context.BuiltinInfo.getName(BuiltinID) + 10;
125   }
126 
127   llvm::FunctionType *Ty =
128     cast<llvm::FunctionType>(getTypes().ConvertType(FD->getType()));
129 
130   return GetOrCreateLLVMFunction(Name, Ty, D, /*ForVTable=*/false);
131 }
132 
133 /// Emit the conversions required to turn the given value into an
134 /// integer of the given size.
135 static Value *EmitToInt(CodeGenFunction &CGF, llvm::Value *V,
136                         QualType T, llvm::IntegerType *IntType) {
137   V = CGF.EmitToMemory(V, T);
138 
139   if (V->getType()->isPointerTy())
140     return CGF.Builder.CreatePtrToInt(V, IntType);
141 
142   assert(V->getType() == IntType);
143   return V;
144 }
145 
146 static Value *EmitFromInt(CodeGenFunction &CGF, llvm::Value *V,
147                           QualType T, llvm::Type *ResultType) {
148   V = CGF.EmitFromMemory(V, T);
149 
150   if (ResultType->isPointerTy())
151     return CGF.Builder.CreateIntToPtr(V, ResultType);
152 
153   assert(V->getType() == ResultType);
154   return V;
155 }
156 
157 /// Utility to insert an atomic instruction based on Intrinsic::ID
158 /// and the expression node.
159 static Value *MakeBinaryAtomicValue(
160     CodeGenFunction &CGF, llvm::AtomicRMWInst::BinOp Kind, const CallExpr *E,
161     AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent) {
162 
163   QualType T = E->getType();
164   assert(E->getArg(0)->getType()->isPointerType());
165   assert(CGF.getContext().hasSameUnqualifiedType(T,
166                                   E->getArg(0)->getType()->getPointeeType()));
167   assert(CGF.getContext().hasSameUnqualifiedType(T, E->getArg(1)->getType()));
168 
169   llvm::Value *DestPtr = CGF.EmitScalarExpr(E->getArg(0));
170   unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();
171 
172   llvm::IntegerType *IntType =
173     llvm::IntegerType::get(CGF.getLLVMContext(),
174                            CGF.getContext().getTypeSize(T));
175   llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);
176 
177   llvm::Value *Args[2];
178   Args[0] = CGF.Builder.CreateBitCast(DestPtr, IntPtrType);
179   Args[1] = CGF.EmitScalarExpr(E->getArg(1));
180   llvm::Type *ValueType = Args[1]->getType();
181   Args[1] = EmitToInt(CGF, Args[1], T, IntType);
182 
183   llvm::Value *Result = CGF.Builder.CreateAtomicRMW(
184       Kind, Args[0], Args[1], Ordering);
185   return EmitFromInt(CGF, Result, T, ValueType);
186 }
187 
188 static Value *EmitNontemporalStore(CodeGenFunction &CGF, const CallExpr *E) {
189   Value *Val = CGF.EmitScalarExpr(E->getArg(0));
190   Value *Address = CGF.EmitScalarExpr(E->getArg(1));
191 
192   // Convert the type of the pointer to a pointer to the stored type.
193   Val = CGF.EmitToMemory(Val, E->getArg(0)->getType());
194   unsigned SrcAddrSpace = Address->getType()->getPointerAddressSpace();
195   Value *BC = CGF.Builder.CreateBitCast(
196       Address, llvm::PointerType::get(Val->getType(), SrcAddrSpace), "cast");
197   LValue LV = CGF.MakeNaturalAlignAddrLValue(BC, E->getArg(0)->getType());
198   LV.setNontemporal(true);
199   CGF.EmitStoreOfScalar(Val, LV, false);
200   return nullptr;
201 }
202 
203 static Value *EmitNontemporalLoad(CodeGenFunction &CGF, const CallExpr *E) {
204   Value *Address = CGF.EmitScalarExpr(E->getArg(0));
205 
206   LValue LV = CGF.MakeNaturalAlignAddrLValue(Address, E->getType());
207   LV.setNontemporal(true);
208   return CGF.EmitLoadOfScalar(LV, E->getExprLoc());
209 }
210 
211 static RValue EmitBinaryAtomic(CodeGenFunction &CGF,
212                                llvm::AtomicRMWInst::BinOp Kind,
213                                const CallExpr *E) {
214   return RValue::get(MakeBinaryAtomicValue(CGF, Kind, E));
215 }
216 
217 /// Utility to insert an atomic instruction based Intrinsic::ID and
218 /// the expression node, where the return value is the result of the
219 /// operation.
220 static RValue EmitBinaryAtomicPost(CodeGenFunction &CGF,
221                                    llvm::AtomicRMWInst::BinOp Kind,
222                                    const CallExpr *E,
223                                    Instruction::BinaryOps Op,
224                                    bool Invert = false) {
225   QualType T = E->getType();
226   assert(E->getArg(0)->getType()->isPointerType());
227   assert(CGF.getContext().hasSameUnqualifiedType(T,
228                                   E->getArg(0)->getType()->getPointeeType()));
229   assert(CGF.getContext().hasSameUnqualifiedType(T, E->getArg(1)->getType()));
230 
231   llvm::Value *DestPtr = CGF.EmitScalarExpr(E->getArg(0));
232   unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();
233 
234   llvm::IntegerType *IntType =
235     llvm::IntegerType::get(CGF.getLLVMContext(),
236                            CGF.getContext().getTypeSize(T));
237   llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);
238 
239   llvm::Value *Args[2];
240   Args[1] = CGF.EmitScalarExpr(E->getArg(1));
241   llvm::Type *ValueType = Args[1]->getType();
242   Args[1] = EmitToInt(CGF, Args[1], T, IntType);
243   Args[0] = CGF.Builder.CreateBitCast(DestPtr, IntPtrType);
244 
245   llvm::Value *Result = CGF.Builder.CreateAtomicRMW(
246       Kind, Args[0], Args[1], llvm::AtomicOrdering::SequentiallyConsistent);
247   Result = CGF.Builder.CreateBinOp(Op, Result, Args[1]);
248   if (Invert)
249     Result =
250         CGF.Builder.CreateBinOp(llvm::Instruction::Xor, Result,
251                                 llvm::ConstantInt::getAllOnesValue(IntType));
252   Result = EmitFromInt(CGF, Result, T, ValueType);
253   return RValue::get(Result);
254 }
255 
256 /// Utility to insert an atomic cmpxchg instruction.
257 ///
258 /// @param CGF The current codegen function.
259 /// @param E   Builtin call expression to convert to cmpxchg.
260 ///            arg0 - address to operate on
261 ///            arg1 - value to compare with
262 ///            arg2 - new value
263 /// @param ReturnBool Specifies whether to return success flag of
264 ///                   cmpxchg result or the old value.
265 ///
266 /// @returns result of cmpxchg, according to ReturnBool
267 ///
268 /// Note: In order to lower Microsoft's _InterlockedCompareExchange* intrinsics
269 /// invoke the function EmitAtomicCmpXchgForMSIntrin.
270 static Value *MakeAtomicCmpXchgValue(CodeGenFunction &CGF, const CallExpr *E,
271                                      bool ReturnBool) {
272   QualType T = ReturnBool ? E->getArg(1)->getType() : E->getType();
273   llvm::Value *DestPtr = CGF.EmitScalarExpr(E->getArg(0));
274   unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();
275 
276   llvm::IntegerType *IntType = llvm::IntegerType::get(
277       CGF.getLLVMContext(), CGF.getContext().getTypeSize(T));
278   llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);
279 
280   Value *Args[3];
281   Args[0] = CGF.Builder.CreateBitCast(DestPtr, IntPtrType);
282   Args[1] = CGF.EmitScalarExpr(E->getArg(1));
283   llvm::Type *ValueType = Args[1]->getType();
284   Args[1] = EmitToInt(CGF, Args[1], T, IntType);
285   Args[2] = EmitToInt(CGF, CGF.EmitScalarExpr(E->getArg(2)), T, IntType);
286 
287   Value *Pair = CGF.Builder.CreateAtomicCmpXchg(
288       Args[0], Args[1], Args[2], llvm::AtomicOrdering::SequentiallyConsistent,
289       llvm::AtomicOrdering::SequentiallyConsistent);
290   if (ReturnBool)
291     // Extract boolean success flag and zext it to int.
292     return CGF.Builder.CreateZExt(CGF.Builder.CreateExtractValue(Pair, 1),
293                                   CGF.ConvertType(E->getType()));
294   else
295     // Extract old value and emit it using the same type as compare value.
296     return EmitFromInt(CGF, CGF.Builder.CreateExtractValue(Pair, 0), T,
297                        ValueType);
298 }
299 
300 /// This function should be invoked to emit atomic cmpxchg for Microsoft's
301 /// _InterlockedCompareExchange* intrinsics which have the following signature:
302 /// T _InterlockedCompareExchange(T volatile *Destination,
303 ///                               T Exchange,
304 ///                               T Comparand);
305 ///
306 /// Whereas the llvm 'cmpxchg' instruction has the following syntax:
307 /// cmpxchg *Destination, Comparand, Exchange.
308 /// So we need to swap Comparand and Exchange when invoking
309 /// CreateAtomicCmpXchg. That is the reason we could not use the above utility
310 /// function MakeAtomicCmpXchgValue since it expects the arguments to be
311 /// already swapped.
312 
313 static
314 Value *EmitAtomicCmpXchgForMSIntrin(CodeGenFunction &CGF, const CallExpr *E,
315     AtomicOrdering SuccessOrdering = AtomicOrdering::SequentiallyConsistent) {
316   assert(E->getArg(0)->getType()->isPointerType());
317   assert(CGF.getContext().hasSameUnqualifiedType(
318       E->getType(), E->getArg(0)->getType()->getPointeeType()));
319   assert(CGF.getContext().hasSameUnqualifiedType(E->getType(),
320                                                  E->getArg(1)->getType()));
321   assert(CGF.getContext().hasSameUnqualifiedType(E->getType(),
322                                                  E->getArg(2)->getType()));
323 
324   auto *Destination = CGF.EmitScalarExpr(E->getArg(0));
325   auto *Comparand = CGF.EmitScalarExpr(E->getArg(2));
326   auto *Exchange = CGF.EmitScalarExpr(E->getArg(1));
327 
328   // For Release ordering, the failure ordering should be Monotonic.
329   auto FailureOrdering = SuccessOrdering == AtomicOrdering::Release ?
330                          AtomicOrdering::Monotonic :
331                          SuccessOrdering;
332 
333   // The atomic instruction is marked volatile for consistency with MSVC. This
334   // blocks the few atomics optimizations that LLVM has. If we want to optimize
335   // _Interlocked* operations in the future, we will have to remove the volatile
336   // marker.
337   auto *Result = CGF.Builder.CreateAtomicCmpXchg(
338                    Destination, Comparand, Exchange,
339                    SuccessOrdering, FailureOrdering);
340   Result->setVolatile(true);
341   return CGF.Builder.CreateExtractValue(Result, 0);
342 }
343 
344 // 64-bit Microsoft platforms support 128 bit cmpxchg operations. They are
345 // prototyped like this:
346 //
347 // unsigned char _InterlockedCompareExchange128...(
348 //     __int64 volatile * _Destination,
349 //     __int64 _ExchangeHigh,
350 //     __int64 _ExchangeLow,
351 //     __int64 * _ComparandResult);
352 static Value *EmitAtomicCmpXchg128ForMSIntrin(CodeGenFunction &CGF,
353                                               const CallExpr *E,
354                                               AtomicOrdering SuccessOrdering) {
355   assert(E->getNumArgs() == 4);
356   llvm::Value *Destination = CGF.EmitScalarExpr(E->getArg(0));
357   llvm::Value *ExchangeHigh = CGF.EmitScalarExpr(E->getArg(1));
358   llvm::Value *ExchangeLow = CGF.EmitScalarExpr(E->getArg(2));
359   llvm::Value *ComparandPtr = CGF.EmitScalarExpr(E->getArg(3));
360 
361   assert(Destination->getType()->isPointerTy());
362   assert(!ExchangeHigh->getType()->isPointerTy());
363   assert(!ExchangeLow->getType()->isPointerTy());
364   assert(ComparandPtr->getType()->isPointerTy());
365 
366   // For Release ordering, the failure ordering should be Monotonic.
367   auto FailureOrdering = SuccessOrdering == AtomicOrdering::Release
368                              ? AtomicOrdering::Monotonic
369                              : SuccessOrdering;
370 
371   // Convert to i128 pointers and values.
372   llvm::Type *Int128Ty = llvm::IntegerType::get(CGF.getLLVMContext(), 128);
373   llvm::Type *Int128PtrTy = Int128Ty->getPointerTo();
374   Destination = CGF.Builder.CreateBitCast(Destination, Int128PtrTy);
375   Address ComparandResult(CGF.Builder.CreateBitCast(ComparandPtr, Int128PtrTy),
376                           CGF.getContext().toCharUnitsFromBits(128));
377 
378   // (((i128)hi) << 64) | ((i128)lo)
379   ExchangeHigh = CGF.Builder.CreateZExt(ExchangeHigh, Int128Ty);
380   ExchangeLow = CGF.Builder.CreateZExt(ExchangeLow, Int128Ty);
381   ExchangeHigh =
382       CGF.Builder.CreateShl(ExchangeHigh, llvm::ConstantInt::get(Int128Ty, 64));
383   llvm::Value *Exchange = CGF.Builder.CreateOr(ExchangeHigh, ExchangeLow);
384 
385   // Load the comparand for the instruction.
386   llvm::Value *Comparand = CGF.Builder.CreateLoad(ComparandResult);
387 
388   auto *CXI = CGF.Builder.CreateAtomicCmpXchg(Destination, Comparand, Exchange,
389                                               SuccessOrdering, FailureOrdering);
390 
391   // The atomic instruction is marked volatile for consistency with MSVC. This
392   // blocks the few atomics optimizations that LLVM has. If we want to optimize
393   // _Interlocked* operations in the future, we will have to remove the volatile
394   // marker.
395   CXI->setVolatile(true);
396 
397   // Store the result as an outparameter.
398   CGF.Builder.CreateStore(CGF.Builder.CreateExtractValue(CXI, 0),
399                           ComparandResult);
400 
401   // Get the success boolean and zero extend it to i8.
402   Value *Success = CGF.Builder.CreateExtractValue(CXI, 1);
403   return CGF.Builder.CreateZExt(Success, CGF.Int8Ty);
404 }
405 
406 static Value *EmitAtomicIncrementValue(CodeGenFunction &CGF, const CallExpr *E,
407     AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent) {
408   assert(E->getArg(0)->getType()->isPointerType());
409 
410   auto *IntTy = CGF.ConvertType(E->getType());
411   auto *Result = CGF.Builder.CreateAtomicRMW(
412                    AtomicRMWInst::Add,
413                    CGF.EmitScalarExpr(E->getArg(0)),
414                    ConstantInt::get(IntTy, 1),
415                    Ordering);
416   return CGF.Builder.CreateAdd(Result, ConstantInt::get(IntTy, 1));
417 }
418 
419 static Value *EmitAtomicDecrementValue(CodeGenFunction &CGF, const CallExpr *E,
420     AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent) {
421   assert(E->getArg(0)->getType()->isPointerType());
422 
423   auto *IntTy = CGF.ConvertType(E->getType());
424   auto *Result = CGF.Builder.CreateAtomicRMW(
425                    AtomicRMWInst::Sub,
426                    CGF.EmitScalarExpr(E->getArg(0)),
427                    ConstantInt::get(IntTy, 1),
428                    Ordering);
429   return CGF.Builder.CreateSub(Result, ConstantInt::get(IntTy, 1));
430 }
431 
432 // Build a plain volatile load.
433 static Value *EmitISOVolatileLoad(CodeGenFunction &CGF, const CallExpr *E) {
434   Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
435   QualType ElTy = E->getArg(0)->getType()->getPointeeType();
436   CharUnits LoadSize = CGF.getContext().getTypeSizeInChars(ElTy);
437   llvm::Type *ITy =
438       llvm::IntegerType::get(CGF.getLLVMContext(), LoadSize.getQuantity() * 8);
439   Ptr = CGF.Builder.CreateBitCast(Ptr, ITy->getPointerTo());
440   llvm::LoadInst *Load = CGF.Builder.CreateAlignedLoad(ITy, Ptr, LoadSize);
441   Load->setVolatile(true);
442   return Load;
443 }
444 
445 // Build a plain volatile store.
446 static Value *EmitISOVolatileStore(CodeGenFunction &CGF, const CallExpr *E) {
447   Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
448   Value *Value = CGF.EmitScalarExpr(E->getArg(1));
449   QualType ElTy = E->getArg(0)->getType()->getPointeeType();
450   CharUnits StoreSize = CGF.getContext().getTypeSizeInChars(ElTy);
451   llvm::Type *ITy =
452       llvm::IntegerType::get(CGF.getLLVMContext(), StoreSize.getQuantity() * 8);
453   Ptr = CGF.Builder.CreateBitCast(Ptr, ITy->getPointerTo());
454   llvm::StoreInst *Store =
455       CGF.Builder.CreateAlignedStore(Value, Ptr, StoreSize);
456   Store->setVolatile(true);
457   return Store;
458 }
459 
460 // Emit a simple mangled intrinsic that has 1 argument and a return type
461 // matching the argument type. Depending on mode, this may be a constrained
462 // floating-point intrinsic.
463 static Value *emitUnaryMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
464                                 const CallExpr *E, unsigned IntrinsicID,
465                                 unsigned ConstrainedIntrinsicID) {
466   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
467 
468   if (CGF.Builder.getIsFPConstrained()) {
469     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
470     Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Src0->getType());
471     return CGF.Builder.CreateConstrainedFPCall(F, { Src0 });
472   } else {
473     Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
474     return CGF.Builder.CreateCall(F, Src0);
475   }
476 }
477 
478 // Emit an intrinsic that has 2 operands of the same type as its result.
479 // Depending on mode, this may be a constrained floating-point intrinsic.
480 static Value *emitBinaryMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
481                                 const CallExpr *E, unsigned IntrinsicID,
482                                 unsigned ConstrainedIntrinsicID) {
483   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
484   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
485 
486   if (CGF.Builder.getIsFPConstrained()) {
487     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
488     Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Src0->getType());
489     return CGF.Builder.CreateConstrainedFPCall(F, { Src0, Src1 });
490   } else {
491     Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
492     return CGF.Builder.CreateCall(F, { Src0, Src1 });
493   }
494 }
495 
496 // Emit an intrinsic that has 3 operands of the same type as its result.
497 // Depending on mode, this may be a constrained floating-point intrinsic.
498 static Value *emitTernaryMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
499                                  const CallExpr *E, unsigned IntrinsicID,
500                                  unsigned ConstrainedIntrinsicID) {
501   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
502   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
503   llvm::Value *Src2 = CGF.EmitScalarExpr(E->getArg(2));
504 
505   if (CGF.Builder.getIsFPConstrained()) {
506     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
507     Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Src0->getType());
508     return CGF.Builder.CreateConstrainedFPCall(F, { Src0, Src1, Src2 });
509   } else {
510     Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
511     return CGF.Builder.CreateCall(F, { Src0, Src1, Src2 });
512   }
513 }
514 
515 // Emit an intrinsic where all operands are of the same type as the result.
516 // Depending on mode, this may be a constrained floating-point intrinsic.
517 static Value *emitCallMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
518                                                 unsigned IntrinsicID,
519                                                 unsigned ConstrainedIntrinsicID,
520                                                 llvm::Type *Ty,
521                                                 ArrayRef<Value *> Args) {
522   Function *F;
523   if (CGF.Builder.getIsFPConstrained())
524     F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Ty);
525   else
526     F = CGF.CGM.getIntrinsic(IntrinsicID, Ty);
527 
528   if (CGF.Builder.getIsFPConstrained())
529     return CGF.Builder.CreateConstrainedFPCall(F, Args);
530   else
531     return CGF.Builder.CreateCall(F, Args);
532 }
533 
534 // Emit a simple mangled intrinsic that has 1 argument and a return type
535 // matching the argument type.
536 static Value *emitUnaryBuiltin(CodeGenFunction &CGF, const CallExpr *E,
537                                unsigned IntrinsicID,
538                                llvm::StringRef Name = "") {
539   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
540 
541   Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
542   return CGF.Builder.CreateCall(F, Src0, Name);
543 }
544 
545 // Emit an intrinsic that has 2 operands of the same type as its result.
546 static Value *emitBinaryBuiltin(CodeGenFunction &CGF,
547                                 const CallExpr *E,
548                                 unsigned IntrinsicID) {
549   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
550   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
551 
552   Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
553   return CGF.Builder.CreateCall(F, { Src0, Src1 });
554 }
555 
556 // Emit an intrinsic that has 3 operands of the same type as its result.
557 static Value *emitTernaryBuiltin(CodeGenFunction &CGF,
558                                  const CallExpr *E,
559                                  unsigned IntrinsicID) {
560   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
561   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
562   llvm::Value *Src2 = CGF.EmitScalarExpr(E->getArg(2));
563 
564   Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
565   return CGF.Builder.CreateCall(F, { Src0, Src1, Src2 });
566 }
567 
568 // Emit an intrinsic that has 1 float or double operand, and 1 integer.
569 static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
570                                const CallExpr *E,
571                                unsigned IntrinsicID) {
572   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
573   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
574 
575   Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
576   return CGF.Builder.CreateCall(F, {Src0, Src1});
577 }
578 
579 // Emit an intrinsic that has overloaded integer result and fp operand.
580 static Value *
581 emitMaybeConstrainedFPToIntRoundBuiltin(CodeGenFunction &CGF, const CallExpr *E,
582                                         unsigned IntrinsicID,
583                                         unsigned ConstrainedIntrinsicID) {
584   llvm::Type *ResultType = CGF.ConvertType(E->getType());
585   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
586 
587   if (CGF.Builder.getIsFPConstrained()) {
588     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
589     Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID,
590                                        {ResultType, Src0->getType()});
591     return CGF.Builder.CreateConstrainedFPCall(F, {Src0});
592   } else {
593     Function *F =
594         CGF.CGM.getIntrinsic(IntrinsicID, {ResultType, Src0->getType()});
595     return CGF.Builder.CreateCall(F, Src0);
596   }
597 }
598 
599 /// EmitFAbs - Emit a call to @llvm.fabs().
600 static Value *EmitFAbs(CodeGenFunction &CGF, Value *V) {
601   Function *F = CGF.CGM.getIntrinsic(Intrinsic::fabs, V->getType());
602   llvm::CallInst *Call = CGF.Builder.CreateCall(F, V);
603   Call->setDoesNotAccessMemory();
604   return Call;
605 }
606 
607 /// Emit the computation of the sign bit for a floating point value. Returns
608 /// the i1 sign bit value.
609 static Value *EmitSignBit(CodeGenFunction &CGF, Value *V) {
610   LLVMContext &C = CGF.CGM.getLLVMContext();
611 
612   llvm::Type *Ty = V->getType();
613   int Width = Ty->getPrimitiveSizeInBits();
614   llvm::Type *IntTy = llvm::IntegerType::get(C, Width);
615   V = CGF.Builder.CreateBitCast(V, IntTy);
616   if (Ty->isPPC_FP128Ty()) {
617     // We want the sign bit of the higher-order double. The bitcast we just
618     // did works as if the double-double was stored to memory and then
619     // read as an i128. The "store" will put the higher-order double in the
620     // lower address in both little- and big-Endian modes, but the "load"
621     // will treat those bits as a different part of the i128: the low bits in
622     // little-Endian, the high bits in big-Endian. Therefore, on big-Endian
623     // we need to shift the high bits down to the low before truncating.
624     Width >>= 1;
625     if (CGF.getTarget().isBigEndian()) {
626       Value *ShiftCst = llvm::ConstantInt::get(IntTy, Width);
627       V = CGF.Builder.CreateLShr(V, ShiftCst);
628     }
629     // We are truncating value in order to extract the higher-order
630     // double, which we will be using to extract the sign from.
631     IntTy = llvm::IntegerType::get(C, Width);
632     V = CGF.Builder.CreateTrunc(V, IntTy);
633   }
634   Value *Zero = llvm::Constant::getNullValue(IntTy);
635   return CGF.Builder.CreateICmpSLT(V, Zero);
636 }
637 
638 static RValue emitLibraryCall(CodeGenFunction &CGF, const FunctionDecl *FD,
639                               const CallExpr *E, llvm::Constant *calleeValue) {
640   CGCallee callee = CGCallee::forDirect(calleeValue, GlobalDecl(FD));
641   return CGF.EmitCall(E->getCallee()->getType(), callee, E, ReturnValueSlot());
642 }
643 
644 /// Emit a call to llvm.{sadd,uadd,ssub,usub,smul,umul}.with.overflow.*
645 /// depending on IntrinsicID.
646 ///
647 /// \arg CGF The current codegen function.
648 /// \arg IntrinsicID The ID for the Intrinsic we wish to generate.
649 /// \arg X The first argument to the llvm.*.with.overflow.*.
650 /// \arg Y The second argument to the llvm.*.with.overflow.*.
651 /// \arg Carry The carry returned by the llvm.*.with.overflow.*.
652 /// \returns The result (i.e. sum/product) returned by the intrinsic.
653 static llvm::Value *EmitOverflowIntrinsic(CodeGenFunction &CGF,
654                                           const llvm::Intrinsic::ID IntrinsicID,
655                                           llvm::Value *X, llvm::Value *Y,
656                                           llvm::Value *&Carry) {
657   // Make sure we have integers of the same width.
658   assert(X->getType() == Y->getType() &&
659          "Arguments must be the same type. (Did you forget to make sure both "
660          "arguments have the same integer width?)");
661 
662   Function *Callee = CGF.CGM.getIntrinsic(IntrinsicID, X->getType());
663   llvm::Value *Tmp = CGF.Builder.CreateCall(Callee, {X, Y});
664   Carry = CGF.Builder.CreateExtractValue(Tmp, 1);
665   return CGF.Builder.CreateExtractValue(Tmp, 0);
666 }
667 
668 static Value *emitRangedBuiltin(CodeGenFunction &CGF,
669                                 unsigned IntrinsicID,
670                                 int low, int high) {
671     llvm::MDBuilder MDHelper(CGF.getLLVMContext());
672     llvm::MDNode *RNode = MDHelper.createRange(APInt(32, low), APInt(32, high));
673     Function *F = CGF.CGM.getIntrinsic(IntrinsicID, {});
674     llvm::Instruction *Call = CGF.Builder.CreateCall(F);
675     Call->setMetadata(llvm::LLVMContext::MD_range, RNode);
676     return Call;
677 }
678 
namespace {
/// The bit width and signedness of an integer type. Kept as a plain
/// aggregate so it can be brace-initialized as {Width, Signed}.
struct WidthAndSignedness {
  /// Width of the type in bits.
  unsigned Width;
  /// True if the type is signed.
  bool Signed;
};
} // namespace
685 
686 static WidthAndSignedness
687 getIntegerWidthAndSignedness(const clang::ASTContext &context,
688                              const clang::QualType Type) {
689   assert(Type->isIntegerType() && "Given type is not an integer.");
690   unsigned Width = Type->isBooleanType()  ? 1
691                    : Type->isBitIntType() ? context.getIntWidth(Type)
692                                           : context.getTypeInfo(Type).Width;
693   bool Signed = Type->isSignedIntegerType();
694   return {Width, Signed};
695 }
696 
697 // Given one or more integer types, this function produces an integer type that
698 // encompasses them: any value in one of the given types could be expressed in
699 // the encompassing type.
700 static struct WidthAndSignedness
701 EncompassingIntegerType(ArrayRef<struct WidthAndSignedness> Types) {
702   assert(Types.size() > 0 && "Empty list of types.");
703 
704   // If any of the given types is signed, we must return a signed type.
705   bool Signed = false;
706   for (const auto &Type : Types) {
707     Signed |= Type.Signed;
708   }
709 
710   // The encompassing type must have a width greater than or equal to the width
711   // of the specified types.  Additionally, if the encompassing type is signed,
712   // its width must be strictly greater than the width of any unsigned types
713   // given.
714   unsigned Width = 0;
715   for (const auto &Type : Types) {
716     unsigned MinWidth = Type.Width + (Signed && !Type.Signed);
717     if (Width < MinWidth) {
718       Width = MinWidth;
719     }
720   }
721 
722   return {Width, Signed};
723 }
724 
725 Value *CodeGenFunction::EmitVAStartEnd(Value *ArgValue, bool IsStart) {
726   llvm::Type *DestType = Int8PtrTy;
727   if (ArgValue->getType() != DestType)
728     ArgValue =
729         Builder.CreateBitCast(ArgValue, DestType, ArgValue->getName().data());
730 
731   Intrinsic::ID inst = IsStart ? Intrinsic::vastart : Intrinsic::vaend;
732   return Builder.CreateCall(CGM.getIntrinsic(inst), ArgValue);
733 }
734 
/// Returns true when the result of __builtin_object_size(p, @p From) may be
/// used wherever __builtin_object_size(p, @p To) was asked for.
static bool areBOSTypesCompatible(int From, int To) {
  // Note: Our __builtin_object_size implementation currently treats Type=0 and
  // Type=2 identically. Encoding this implementation detail here may make
  // improving __builtin_object_size difficult in the future, so it's omitted.
  if (From == To)
    return true;

  // Besides the identity case, only two substitutions are accepted:
  // 0 standing in for 1, and 3 standing in for 2.
  if (From == 0)
    return To == 1;
  if (From == 3)
    return To == 2;
  return false;
}
743 
744 static llvm::Value *
745 getDefaultBuiltinObjectSizeResult(unsigned Type, llvm::IntegerType *ResType) {
746   return ConstantInt::get(ResType, (Type & 2) ? 0 : -1, /*isSigned=*/true);
747 }
748 
749 llvm::Value *
750 CodeGenFunction::evaluateOrEmitBuiltinObjectSize(const Expr *E, unsigned Type,
751                                                  llvm::IntegerType *ResType,
752                                                  llvm::Value *EmittedE,
753                                                  bool IsDynamic) {
754   uint64_t ObjectSize;
755   if (!E->tryEvaluateObjectSize(ObjectSize, getContext(), Type))
756     return emitBuiltinObjectSize(E, Type, ResType, EmittedE, IsDynamic);
757   return ConstantInt::get(ResType, ObjectSize, /*isSigned=*/true);
758 }
759 
760 /// Returns a Value corresponding to the size of the given expression.
761 /// This Value may be either of the following:
762 ///   - A llvm::Argument (if E is a param with the pass_object_size attribute on
763 ///     it)
764 ///   - A call to the @llvm.objectsize intrinsic
765 ///
766 /// EmittedE is the result of emitting `E` as a scalar expr. If it's non-null
767 /// and we wouldn't otherwise try to reference a pass_object_size parameter,
768 /// we'll call @llvm.objectsize on EmittedE, rather than emitting E.
769 llvm::Value *
770 CodeGenFunction::emitBuiltinObjectSize(const Expr *E, unsigned Type,
771                                        llvm::IntegerType *ResType,
772                                        llvm::Value *EmittedE, bool IsDynamic) {
773   // We need to reference an argument if the pointer is a parameter with the
774   // pass_object_size attribute.
775   if (auto *D = dyn_cast<DeclRefExpr>(E->IgnoreParenImpCasts())) {
776     auto *Param = dyn_cast<ParmVarDecl>(D->getDecl());
777     auto *PS = D->getDecl()->getAttr<PassObjectSizeAttr>();
778     if (Param != nullptr && PS != nullptr &&
779         areBOSTypesCompatible(PS->getType(), Type)) {
780       auto Iter = SizeArguments.find(Param);
781       assert(Iter != SizeArguments.end());
782 
783       const ImplicitParamDecl *D = Iter->second;
784       auto DIter = LocalDeclMap.find(D);
785       assert(DIter != LocalDeclMap.end());
786 
787       return EmitLoadOfScalar(DIter->second, /*Volatile=*/false,
788                               getContext().getSizeType(), E->getBeginLoc());
789     }
790   }
791 
792   // LLVM can't handle Type=3 appropriately, and __builtin_object_size shouldn't
793   // evaluate E for side-effects. In either case, we shouldn't lower to
794   // @llvm.objectsize.
795   if (Type == 3 || (!EmittedE && E->HasSideEffects(getContext())))
796     return getDefaultBuiltinObjectSizeResult(Type, ResType);
797 
798   Value *Ptr = EmittedE ? EmittedE : EmitScalarExpr(E);
799   assert(Ptr->getType()->isPointerTy() &&
800          "Non-pointer passed to __builtin_object_size?");
801 
802   Function *F =
803       CGM.getIntrinsic(Intrinsic::objectsize, {ResType, Ptr->getType()});
804 
805   // LLVM only supports 0 and 2, make sure that we pass along that as a boolean.
806   Value *Min = Builder.getInt1((Type & 2) != 0);
807   // For GCC compatibility, __builtin_object_size treat NULL as unknown size.
808   Value *NullIsUnknown = Builder.getTrue();
809   Value *Dynamic = Builder.getInt1(IsDynamic);
810   return Builder.CreateCall(F, {Ptr, Min, NullIsUnknown, Dynamic});
811 }
812 
namespace {
/// A struct to generically describe a bit test intrinsic.
///
/// A description is obtained from a builtin ID via decodeBitTestBuiltin and
/// records three independent properties of that builtin.
struct BitTest {
  /// What is done to the addressed bit in addition to reading it; TestOnly
  /// leaves the bit unmodified.
  enum ActionKind : uint8_t { TestOnly, Complement, Reset, Set };
  /// Interlocking flavour of the builtin. getBitTestAtomicOrdering translates
  /// each value into an llvm::AtomicOrdering (Unlocked becomes NotAtomic).
  enum InterlockingKind : uint8_t {
    Unlocked,
    Sequential,
    Acquire,
    Release,
    NoFence
  };

  ActionKind Action;
  InterlockingKind Interlocking;
  /// Set for the builtins whose names end in "64".
  bool Is64Bit;

  /// Decode \p BuiltinID into its description; only bit-test builtins are
  /// expected.
  static BitTest decodeBitTestBuiltin(unsigned BuiltinID);
};
} // namespace
832 
/// Translate a bit-test builtin ID into its {Action, Interlocking, Is64Bit}
/// description. Any ID that is not listed in the switch reaches
/// llvm_unreachable.
BitTest BitTest::decodeBitTestBuiltin(unsigned BuiltinID) {
  switch (BuiltinID) {
    // Main portable variants.
  case Builtin::BI_bittest:
    return {TestOnly, Unlocked, false};
  case Builtin::BI_bittestandcomplement:
    return {Complement, Unlocked, false};
  case Builtin::BI_bittestandreset:
    return {Reset, Unlocked, false};
  case Builtin::BI_bittestandset:
    return {Set, Unlocked, false};
  case Builtin::BI_interlockedbittestandreset:
    return {Reset, Sequential, false};
  case Builtin::BI_interlockedbittestandset:
    return {Set, Sequential, false};

    // X86-specific 64-bit variants.
  case Builtin::BI_bittest64:
    return {TestOnly, Unlocked, true};
  case Builtin::BI_bittestandcomplement64:
    return {Complement, Unlocked, true};
  case Builtin::BI_bittestandreset64:
    return {Reset, Unlocked, true};
  case Builtin::BI_bittestandset64:
    return {Set, Unlocked, true};
  case Builtin::BI_interlockedbittestandreset64:
    return {Reset, Sequential, true};
  case Builtin::BI_interlockedbittestandset64:
    return {Set, Sequential, true};

    // ARM/AArch64-specific ordering variants.
    // The _acq, _rel and _nf suffixes select Acquire, Release and NoFence.
  case Builtin::BI_interlockedbittestandset_acq:
    return {Set, Acquire, false};
  case Builtin::BI_interlockedbittestandset_rel:
    return {Set, Release, false};
  case Builtin::BI_interlockedbittestandset_nf:
    return {Set, NoFence, false};
  case Builtin::BI_interlockedbittestandreset_acq:
    return {Reset, Acquire, false};
  case Builtin::BI_interlockedbittestandreset_rel:
    return {Reset, Release, false};
  case Builtin::BI_interlockedbittestandreset_nf:
    return {Reset, NoFence, false};
  }
  // Reached only for a builtin ID that has no case above.
  llvm_unreachable("expected only bittest intrinsics");
}
879 
880 static char bitActionToX86BTCode(BitTest::ActionKind A) {
881   switch (A) {
882   case BitTest::TestOnly:   return '\0';
883   case BitTest::Complement: return 'c';
884   case BitTest::Reset:      return 'r';
885   case BitTest::Set:        return 's';
886   }
887   llvm_unreachable("invalid action");
888 }
889 
890 static llvm::Value *EmitX86BitTestIntrinsic(CodeGenFunction &CGF,
891                                             BitTest BT,
892                                             const CallExpr *E, Value *BitBase,
893                                             Value *BitPos) {
894   char Action = bitActionToX86BTCode(BT.Action);
895   char SizeSuffix = BT.Is64Bit ? 'q' : 'l';
896 
897   // Build the assembly.
898   SmallString<64> Asm;
899   raw_svector_ostream AsmOS(Asm);
900   if (BT.Interlocking != BitTest::Unlocked)
901     AsmOS << "lock ";
902   AsmOS << "bt";
903   if (Action)
904     AsmOS << Action;
905   AsmOS << SizeSuffix << " $2, ($1)";
906 
907   // Build the constraints. FIXME: We should support immediates when possible.
908   std::string Constraints = "={@ccc},r,r,~{cc},~{memory}";
909   std::string MachineClobbers = CGF.getTarget().getClobbers();
910   if (!MachineClobbers.empty()) {
911     Constraints += ',';
912     Constraints += MachineClobbers;
913   }
914   llvm::IntegerType *IntType = llvm::IntegerType::get(
915       CGF.getLLVMContext(),
916       CGF.getContext().getTypeSize(E->getArg(1)->getType()));
917   llvm::Type *IntPtrType = IntType->getPointerTo();
918   llvm::FunctionType *FTy =
919       llvm::FunctionType::get(CGF.Int8Ty, {IntPtrType, IntType}, false);
920 
921   llvm::InlineAsm *IA =
922       llvm::InlineAsm::get(FTy, Asm, Constraints, /*hasSideEffects=*/true);
923   return CGF.Builder.CreateCall(IA, {BitBase, BitPos});
924 }
925 
926 static llvm::AtomicOrdering
927 getBitTestAtomicOrdering(BitTest::InterlockingKind I) {
928   switch (I) {
929   case BitTest::Unlocked:   return llvm::AtomicOrdering::NotAtomic;
930   case BitTest::Sequential: return llvm::AtomicOrdering::SequentiallyConsistent;
931   case BitTest::Acquire:    return llvm::AtomicOrdering::Acquire;
932   case BitTest::Release:    return llvm::AtomicOrdering::Release;
933   case BitTest::NoFence:    return llvm::AtomicOrdering::Monotonic;
934   }
935   llvm_unreachable("invalid interlocking");
936 }
937 
938 /// Emit a _bittest* intrinsic. These intrinsics take a pointer to an array of
939 /// bits and a bit position and read and optionally modify the bit at that
940 /// position. The position index can be arbitrarily large, i.e. it can be larger
941 /// than 31 or 63, so we need an indexed load in the general case.
942 static llvm::Value *EmitBitTestIntrinsic(CodeGenFunction &CGF,
943                                          unsigned BuiltinID,
944                                          const CallExpr *E) {
945   Value *BitBase = CGF.EmitScalarExpr(E->getArg(0));
946   Value *BitPos = CGF.EmitScalarExpr(E->getArg(1));
947 
948   BitTest BT = BitTest::decodeBitTestBuiltin(BuiltinID);
949 
950   // X86 has special BT, BTC, BTR, and BTS instructions that handle the array
951   // indexing operation internally. Use them if possible.
952   if (CGF.getTarget().getTriple().isX86())
953     return EmitX86BitTestIntrinsic(CGF, BT, E, BitBase, BitPos);
954 
955   // Otherwise, use generic code to load one byte and test the bit. Use all but
956   // the bottom three bits as the array index, and the bottom three bits to form
957   // a mask.
958   // Bit = BitBaseI8[BitPos >> 3] & (1 << (BitPos & 0x7)) != 0;
959   Value *ByteIndex = CGF.Builder.CreateAShr(
960       BitPos, llvm::ConstantInt::get(BitPos->getType(), 3), "bittest.byteidx");
961   Value *BitBaseI8 = CGF.Builder.CreatePointerCast(BitBase, CGF.Int8PtrTy);
962   Address ByteAddr(CGF.Builder.CreateInBoundsGEP(CGF.Int8Ty, BitBaseI8,
963                                                  ByteIndex, "bittest.byteaddr"),
964                    CharUnits::One());
965   Value *PosLow =
966       CGF.Builder.CreateAnd(CGF.Builder.CreateTrunc(BitPos, CGF.Int8Ty),
967                             llvm::ConstantInt::get(CGF.Int8Ty, 0x7));
968 
969   // The updating instructions will need a mask.
970   Value *Mask = nullptr;
971   if (BT.Action != BitTest::TestOnly) {
972     Mask = CGF.Builder.CreateShl(llvm::ConstantInt::get(CGF.Int8Ty, 1), PosLow,
973                                  "bittest.mask");
974   }
975 
976   // Check the action and ordering of the interlocked intrinsics.
977   llvm::AtomicOrdering Ordering = getBitTestAtomicOrdering(BT.Interlocking);
978 
979   Value *OldByte = nullptr;
980   if (Ordering != llvm::AtomicOrdering::NotAtomic) {
981     // Emit a combined atomicrmw load/store operation for the interlocked
982     // intrinsics.
983     llvm::AtomicRMWInst::BinOp RMWOp = llvm::AtomicRMWInst::Or;
984     if (BT.Action == BitTest::Reset) {
985       Mask = CGF.Builder.CreateNot(Mask);
986       RMWOp = llvm::AtomicRMWInst::And;
987     }
988     OldByte = CGF.Builder.CreateAtomicRMW(RMWOp, ByteAddr.getPointer(), Mask,
989                                           Ordering);
990   } else {
991     // Emit a plain load for the non-interlocked intrinsics.
992     OldByte = CGF.Builder.CreateLoad(ByteAddr, "bittest.byte");
993     Value *NewByte = nullptr;
994     switch (BT.Action) {
995     case BitTest::TestOnly:
996       // Don't store anything.
997       break;
998     case BitTest::Complement:
999       NewByte = CGF.Builder.CreateXor(OldByte, Mask);
1000       break;
1001     case BitTest::Reset:
1002       NewByte = CGF.Builder.CreateAnd(OldByte, CGF.Builder.CreateNot(Mask));
1003       break;
1004     case BitTest::Set:
1005       NewByte = CGF.Builder.CreateOr(OldByte, Mask);
1006       break;
1007     }
1008     if (NewByte)
1009       CGF.Builder.CreateStore(NewByte, ByteAddr);
1010   }
1011 
1012   // However we loaded the old byte, either by plain load or atomicrmw, shift
1013   // the bit into the low position and mask it to 0 or 1.
1014   Value *ShiftedByte = CGF.Builder.CreateLShr(OldByte, PosLow, "bittest.shr");
1015   return CGF.Builder.CreateAnd(
1016       ShiftedByte, llvm::ConstantInt::get(CGF.Int8Ty, 1), "bittest.res");
1017 }
1018 
1019 static llvm::Value *emitPPCLoadReserveIntrinsic(CodeGenFunction &CGF,
1020                                                 unsigned BuiltinID,
1021                                                 const CallExpr *E) {
1022   Value *Addr = CGF.EmitScalarExpr(E->getArg(0));
1023 
1024   SmallString<64> Asm;
1025   raw_svector_ostream AsmOS(Asm);
1026   llvm::IntegerType *RetType = CGF.Int32Ty;
1027 
1028   switch (BuiltinID) {
1029   case clang::PPC::BI__builtin_ppc_ldarx:
1030     AsmOS << "ldarx ";
1031     RetType = CGF.Int64Ty;
1032     break;
1033   case clang::PPC::BI__builtin_ppc_lwarx:
1034     AsmOS << "lwarx ";
1035     RetType = CGF.Int32Ty;
1036     break;
1037   case clang::PPC::BI__builtin_ppc_lharx:
1038     AsmOS << "lharx ";
1039     RetType = CGF.Int16Ty;
1040     break;
1041   case clang::PPC::BI__builtin_ppc_lbarx:
1042     AsmOS << "lbarx ";
1043     RetType = CGF.Int8Ty;
1044     break;
1045   default:
1046     llvm_unreachable("Expected only PowerPC load reserve intrinsics");
1047   }
1048 
1049   AsmOS << "$0, ${1:y}";
1050 
1051   std::string Constraints = "=r,*Z,~{memory}";
1052   std::string MachineClobbers = CGF.getTarget().getClobbers();
1053   if (!MachineClobbers.empty()) {
1054     Constraints += ',';
1055     Constraints += MachineClobbers;
1056   }
1057 
1058   llvm::Type *IntPtrType = RetType->getPointerTo();
1059   llvm::FunctionType *FTy =
1060       llvm::FunctionType::get(RetType, {IntPtrType}, false);
1061 
1062   llvm::InlineAsm *IA =
1063       llvm::InlineAsm::get(FTy, Asm, Constraints, /*hasSideEffects=*/true);
1064   llvm::CallInst *CI = CGF.Builder.CreateCall(IA, {Addr});
1065   CI->addParamAttr(
1066       0, Attribute::get(CGF.getLLVMContext(), Attribute::ElementType, RetType));
1067   return CI;
1068 }
1069 
namespace {
/// Selects which MSVC runtime setjmp entry point EmitMSVCRTSetJmp calls; each
/// enumerator is named after the runtime function it stands for.
enum class MSVCSetJmpKind {
  _setjmpex,
  _setjmp3,
  _setjmp
};
}
1077 
1078 /// MSVC handles setjmp a bit differently on different platforms. On every
1079 /// architecture except 32-bit x86, the frame address is passed. On x86, extra
1080 /// parameters can be passed as variadic arguments, but we always pass none.
1081 static RValue EmitMSVCRTSetJmp(CodeGenFunction &CGF, MSVCSetJmpKind SJKind,
1082                                const CallExpr *E) {
1083   llvm::Value *Arg1 = nullptr;
1084   llvm::Type *Arg1Ty = nullptr;
1085   StringRef Name;
1086   bool IsVarArg = false;
1087   if (SJKind == MSVCSetJmpKind::_setjmp3) {
1088     Name = "_setjmp3";
1089     Arg1Ty = CGF.Int32Ty;
1090     Arg1 = llvm::ConstantInt::get(CGF.IntTy, 0);
1091     IsVarArg = true;
1092   } else {
1093     Name = SJKind == MSVCSetJmpKind::_setjmp ? "_setjmp" : "_setjmpex";
1094     Arg1Ty = CGF.Int8PtrTy;
1095     if (CGF.getTarget().getTriple().getArch() == llvm::Triple::aarch64) {
1096       Arg1 = CGF.Builder.CreateCall(
1097           CGF.CGM.getIntrinsic(Intrinsic::sponentry, CGF.AllocaInt8PtrTy));
1098     } else
1099       Arg1 = CGF.Builder.CreateCall(
1100           CGF.CGM.getIntrinsic(Intrinsic::frameaddress, CGF.AllocaInt8PtrTy),
1101           llvm::ConstantInt::get(CGF.Int32Ty, 0));
1102   }
1103 
1104   // Mark the call site and declaration with ReturnsTwice.
1105   llvm::Type *ArgTypes[2] = {CGF.Int8PtrTy, Arg1Ty};
1106   llvm::AttributeList ReturnsTwiceAttr = llvm::AttributeList::get(
1107       CGF.getLLVMContext(), llvm::AttributeList::FunctionIndex,
1108       llvm::Attribute::ReturnsTwice);
1109   llvm::FunctionCallee SetJmpFn = CGF.CGM.CreateRuntimeFunction(
1110       llvm::FunctionType::get(CGF.IntTy, ArgTypes, IsVarArg), Name,
1111       ReturnsTwiceAttr, /*Local=*/true);
1112 
1113   llvm::Value *Buf = CGF.Builder.CreateBitOrPointerCast(
1114       CGF.EmitScalarExpr(E->getArg(0)), CGF.Int8PtrTy);
1115   llvm::Value *Args[] = {Buf, Arg1};
1116   llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(SetJmpFn, Args);
1117   CB->setAttributes(ReturnsTwiceAttr);
1118   return RValue::get(CB);
1119 }
1120 
// Many MSVC builtins are available on x64, ARM and AArch64; to avoid repeating
// code, they are handled through this shared, target-independent enumeration.
// The per-target translate*ToMsvcIntrin functions map target builtin IDs onto
// these values; a single value covers all operand widths of an operation.
// NOTE(review): the _acq / _rel / _nf suffixes presumably denote acquire,
// release and no-fence orderings, as they do for the bit-test builtins --
// confirm against the code that consumes this enum.
enum class CodeGenFunction::MSVCIntrin {
  _BitScanForward,
  _BitScanReverse,
  _InterlockedAnd,
  _InterlockedDecrement,
  _InterlockedExchange,
  _InterlockedExchangeAdd,
  _InterlockedExchangeSub,
  _InterlockedIncrement,
  _InterlockedOr,
  _InterlockedXor,
  _InterlockedExchangeAdd_acq,
  _InterlockedExchangeAdd_rel,
  _InterlockedExchangeAdd_nf,
  _InterlockedExchange_acq,
  _InterlockedExchange_rel,
  _InterlockedExchange_nf,
  _InterlockedCompareExchange_acq,
  _InterlockedCompareExchange_rel,
  _InterlockedCompareExchange_nf,
  _InterlockedCompareExchange128,
  _InterlockedCompareExchange128_acq,
  _InterlockedCompareExchange128_rel,
  _InterlockedCompareExchange128_nf,
  _InterlockedOr_acq,
  _InterlockedOr_rel,
  _InterlockedOr_nf,
  _InterlockedXor_acq,
  _InterlockedXor_rel,
  _InterlockedXor_nf,
  _InterlockedAnd_acq,
  _InterlockedAnd_rel,
  _InterlockedAnd_nf,
  _InterlockedIncrement_acq,
  _InterlockedIncrement_rel,
  _InterlockedIncrement_nf,
  _InterlockedDecrement_acq,
  _InterlockedDecrement_rel,
  _InterlockedDecrement_nf,
  __fastfail,
};
1164 
/// Map an ARM builtin ID onto the shared MSVCIntrin enumeration.
///
/// \returns The matching MSVCIntrin value, or None when \p BuiltinID is not
/// one of the builtins listed in the switch.
static Optional<CodeGenFunction::MSVCIntrin>
translateArmToMsvcIntrin(unsigned BuiltinID) {
  using MSVCIntrin = CodeGenFunction::MSVCIntrin;
  switch (BuiltinID) {
  default:
    return None;
  // Bit scans: both listed forms share one value.
  case ARM::BI_BitScanForward:
  case ARM::BI_BitScanForward64:
    return MSVCIntrin::_BitScanForward;
  case ARM::BI_BitScanReverse:
  case ARM::BI_BitScanReverse64:
    return MSVCIntrin::_BitScanReverse;
  // Unsuffixed interlocked operations: only the ...64 builtins are listed
  // here.
  case ARM::BI_InterlockedAnd64:
    return MSVCIntrin::_InterlockedAnd;
  case ARM::BI_InterlockedExchange64:
    return MSVCIntrin::_InterlockedExchange;
  case ARM::BI_InterlockedExchangeAdd64:
    return MSVCIntrin::_InterlockedExchangeAdd;
  case ARM::BI_InterlockedExchangeSub64:
    return MSVCIntrin::_InterlockedExchangeSub;
  case ARM::BI_InterlockedOr64:
    return MSVCIntrin::_InterlockedOr;
  case ARM::BI_InterlockedXor64:
    return MSVCIntrin::_InterlockedXor;
  case ARM::BI_InterlockedDecrement64:
    return MSVCIntrin::_InterlockedDecrement;
  case ARM::BI_InterlockedIncrement64:
    return MSVCIntrin::_InterlockedIncrement;
  // _acq / _rel / _nf variants: every listed width of an operation collapses
  // onto a single MSVCIntrin value.
  case ARM::BI_InterlockedExchangeAdd8_acq:
  case ARM::BI_InterlockedExchangeAdd16_acq:
  case ARM::BI_InterlockedExchangeAdd_acq:
  case ARM::BI_InterlockedExchangeAdd64_acq:
    return MSVCIntrin::_InterlockedExchangeAdd_acq;
  case ARM::BI_InterlockedExchangeAdd8_rel:
  case ARM::BI_InterlockedExchangeAdd16_rel:
  case ARM::BI_InterlockedExchangeAdd_rel:
  case ARM::BI_InterlockedExchangeAdd64_rel:
    return MSVCIntrin::_InterlockedExchangeAdd_rel;
  case ARM::BI_InterlockedExchangeAdd8_nf:
  case ARM::BI_InterlockedExchangeAdd16_nf:
  case ARM::BI_InterlockedExchangeAdd_nf:
  case ARM::BI_InterlockedExchangeAdd64_nf:
    return MSVCIntrin::_InterlockedExchangeAdd_nf;
  case ARM::BI_InterlockedExchange8_acq:
  case ARM::BI_InterlockedExchange16_acq:
  case ARM::BI_InterlockedExchange_acq:
  case ARM::BI_InterlockedExchange64_acq:
    return MSVCIntrin::_InterlockedExchange_acq;
  case ARM::BI_InterlockedExchange8_rel:
  case ARM::BI_InterlockedExchange16_rel:
  case ARM::BI_InterlockedExchange_rel:
  case ARM::BI_InterlockedExchange64_rel:
    return MSVCIntrin::_InterlockedExchange_rel;
  case ARM::BI_InterlockedExchange8_nf:
  case ARM::BI_InterlockedExchange16_nf:
  case ARM::BI_InterlockedExchange_nf:
  case ARM::BI_InterlockedExchange64_nf:
    return MSVCIntrin::_InterlockedExchange_nf;
  case ARM::BI_InterlockedCompareExchange8_acq:
  case ARM::BI_InterlockedCompareExchange16_acq:
  case ARM::BI_InterlockedCompareExchange_acq:
  case ARM::BI_InterlockedCompareExchange64_acq:
    return MSVCIntrin::_InterlockedCompareExchange_acq;
  case ARM::BI_InterlockedCompareExchange8_rel:
  case ARM::BI_InterlockedCompareExchange16_rel:
  case ARM::BI_InterlockedCompareExchange_rel:
  case ARM::BI_InterlockedCompareExchange64_rel:
    return MSVCIntrin::_InterlockedCompareExchange_rel;
  case ARM::BI_InterlockedCompareExchange8_nf:
  case ARM::BI_InterlockedCompareExchange16_nf:
  case ARM::BI_InterlockedCompareExchange_nf:
  case ARM::BI_InterlockedCompareExchange64_nf:
    return MSVCIntrin::_InterlockedCompareExchange_nf;
  case ARM::BI_InterlockedOr8_acq:
  case ARM::BI_InterlockedOr16_acq:
  case ARM::BI_InterlockedOr_acq:
  case ARM::BI_InterlockedOr64_acq:
    return MSVCIntrin::_InterlockedOr_acq;
  case ARM::BI_InterlockedOr8_rel:
  case ARM::BI_InterlockedOr16_rel:
  case ARM::BI_InterlockedOr_rel:
  case ARM::BI_InterlockedOr64_rel:
    return MSVCIntrin::_InterlockedOr_rel;
  case ARM::BI_InterlockedOr8_nf:
  case ARM::BI_InterlockedOr16_nf:
  case ARM::BI_InterlockedOr_nf:
  case ARM::BI_InterlockedOr64_nf:
    return MSVCIntrin::_InterlockedOr_nf;
  case ARM::BI_InterlockedXor8_acq:
  case ARM::BI_InterlockedXor16_acq:
  case ARM::BI_InterlockedXor_acq:
  case ARM::BI_InterlockedXor64_acq:
    return MSVCIntrin::_InterlockedXor_acq;
  case ARM::BI_InterlockedXor8_rel:
  case ARM::BI_InterlockedXor16_rel:
  case ARM::BI_InterlockedXor_rel:
  case ARM::BI_InterlockedXor64_rel:
    return MSVCIntrin::_InterlockedXor_rel;
  case ARM::BI_InterlockedXor8_nf:
  case ARM::BI_InterlockedXor16_nf:
  case ARM::BI_InterlockedXor_nf:
  case ARM::BI_InterlockedXor64_nf:
    return MSVCIntrin::_InterlockedXor_nf;
  case ARM::BI_InterlockedAnd8_acq:
  case ARM::BI_InterlockedAnd16_acq:
  case ARM::BI_InterlockedAnd_acq:
  case ARM::BI_InterlockedAnd64_acq:
    return MSVCIntrin::_InterlockedAnd_acq;
  case ARM::BI_InterlockedAnd8_rel:
  case ARM::BI_InterlockedAnd16_rel:
  case ARM::BI_InterlockedAnd_rel:
  case ARM::BI_InterlockedAnd64_rel:
    return MSVCIntrin::_InterlockedAnd_rel;
  case ARM::BI_InterlockedAnd8_nf:
  case ARM::BI_InterlockedAnd16_nf:
  case ARM::BI_InterlockedAnd_nf:
  case ARM::BI_InterlockedAnd64_nf:
    return MSVCIntrin::_InterlockedAnd_nf;
  // Increment / decrement: no 8-bit builtins are listed for these.
  case ARM::BI_InterlockedIncrement16_acq:
  case ARM::BI_InterlockedIncrement_acq:
  case ARM::BI_InterlockedIncrement64_acq:
    return MSVCIntrin::_InterlockedIncrement_acq;
  case ARM::BI_InterlockedIncrement16_rel:
  case ARM::BI_InterlockedIncrement_rel:
  case ARM::BI_InterlockedIncrement64_rel:
    return MSVCIntrin::_InterlockedIncrement_rel;
  case ARM::BI_InterlockedIncrement16_nf:
  case ARM::BI_InterlockedIncrement_nf:
  case ARM::BI_InterlockedIncrement64_nf:
    return MSVCIntrin::_InterlockedIncrement_nf;
  case ARM::BI_InterlockedDecrement16_acq:
  case ARM::BI_InterlockedDecrement_acq:
  case ARM::BI_InterlockedDecrement64_acq:
    return MSVCIntrin::_InterlockedDecrement_acq;
  case ARM::BI_InterlockedDecrement16_rel:
  case ARM::BI_InterlockedDecrement_rel:
  case ARM::BI_InterlockedDecrement64_rel:
    return MSVCIntrin::_InterlockedDecrement_rel;
  case ARM::BI_InterlockedDecrement16_nf:
  case ARM::BI_InterlockedDecrement_nf:
  case ARM::BI_InterlockedDecrement64_nf:
    return MSVCIntrin::_InterlockedDecrement_nf;
  }
  // Not reached: every path through the switch above returns.
  llvm_unreachable("must return from switch");
}
1310 
/// Map an AArch64 builtin ID onto the shared MSVCIntrin enumeration.
///
/// \returns The matching MSVCIntrin value, or None when \p BuiltinID is not
/// one of the builtins listed in the switch.
static Optional<CodeGenFunction::MSVCIntrin>
translateAarch64ToMsvcIntrin(unsigned BuiltinID) {
  using MSVCIntrin = CodeGenFunction::MSVCIntrin;
  switch (BuiltinID) {
  default:
    return None;
  // Bit scans: both listed forms share one value.
  case AArch64::BI_BitScanForward:
  case AArch64::BI_BitScanForward64:
    return MSVCIntrin::_BitScanForward;
  case AArch64::BI_BitScanReverse:
  case AArch64::BI_BitScanReverse64:
    return MSVCIntrin::_BitScanReverse;
  // Unsuffixed interlocked operations: only the ...64 builtins are listed
  // here.
  case AArch64::BI_InterlockedAnd64:
    return MSVCIntrin::_InterlockedAnd;
  case AArch64::BI_InterlockedExchange64:
    return MSVCIntrin::_InterlockedExchange;
  case AArch64::BI_InterlockedExchangeAdd64:
    return MSVCIntrin::_InterlockedExchangeAdd;
  case AArch64::BI_InterlockedExchangeSub64:
    return MSVCIntrin::_InterlockedExchangeSub;
  case AArch64::BI_InterlockedOr64:
    return MSVCIntrin::_InterlockedOr;
  case AArch64::BI_InterlockedXor64:
    return MSVCIntrin::_InterlockedXor;
  case AArch64::BI_InterlockedDecrement64:
    return MSVCIntrin::_InterlockedDecrement;
  case AArch64::BI_InterlockedIncrement64:
    return MSVCIntrin::_InterlockedIncrement;
  // _acq / _rel / _nf variants: every listed width of an operation collapses
  // onto a single MSVCIntrin value.
  case AArch64::BI_InterlockedExchangeAdd8_acq:
  case AArch64::BI_InterlockedExchangeAdd16_acq:
  case AArch64::BI_InterlockedExchangeAdd_acq:
  case AArch64::BI_InterlockedExchangeAdd64_acq:
    return MSVCIntrin::_InterlockedExchangeAdd_acq;
  case AArch64::BI_InterlockedExchangeAdd8_rel:
  case AArch64::BI_InterlockedExchangeAdd16_rel:
  case AArch64::BI_InterlockedExchangeAdd_rel:
  case AArch64::BI_InterlockedExchangeAdd64_rel:
    return MSVCIntrin::_InterlockedExchangeAdd_rel;
  case AArch64::BI_InterlockedExchangeAdd8_nf:
  case AArch64::BI_InterlockedExchangeAdd16_nf:
  case AArch64::BI_InterlockedExchangeAdd_nf:
  case AArch64::BI_InterlockedExchangeAdd64_nf:
    return MSVCIntrin::_InterlockedExchangeAdd_nf;
  case AArch64::BI_InterlockedExchange8_acq:
  case AArch64::BI_InterlockedExchange16_acq:
  case AArch64::BI_InterlockedExchange_acq:
  case AArch64::BI_InterlockedExchange64_acq:
    return MSVCIntrin::_InterlockedExchange_acq;
  case AArch64::BI_InterlockedExchange8_rel:
  case AArch64::BI_InterlockedExchange16_rel:
  case AArch64::BI_InterlockedExchange_rel:
  case AArch64::BI_InterlockedExchange64_rel:
    return MSVCIntrin::_InterlockedExchange_rel;
  case AArch64::BI_InterlockedExchange8_nf:
  case AArch64::BI_InterlockedExchange16_nf:
  case AArch64::BI_InterlockedExchange_nf:
  case AArch64::BI_InterlockedExchange64_nf:
    return MSVCIntrin::_InterlockedExchange_nf;
  case AArch64::BI_InterlockedCompareExchange8_acq:
  case AArch64::BI_InterlockedCompareExchange16_acq:
  case AArch64::BI_InterlockedCompareExchange_acq:
  case AArch64::BI_InterlockedCompareExchange64_acq:
    return MSVCIntrin::_InterlockedCompareExchange_acq;
  case AArch64::BI_InterlockedCompareExchange8_rel:
  case AArch64::BI_InterlockedCompareExchange16_rel:
  case AArch64::BI_InterlockedCompareExchange_rel:
  case AArch64::BI_InterlockedCompareExchange64_rel:
    return MSVCIntrin::_InterlockedCompareExchange_rel;
  case AArch64::BI_InterlockedCompareExchange8_nf:
  case AArch64::BI_InterlockedCompareExchange16_nf:
  case AArch64::BI_InterlockedCompareExchange_nf:
  case AArch64::BI_InterlockedCompareExchange64_nf:
    return MSVCIntrin::_InterlockedCompareExchange_nf;
  // The 128-bit compare-exchange builtins each have a dedicated value.
  case AArch64::BI_InterlockedCompareExchange128:
    return MSVCIntrin::_InterlockedCompareExchange128;
  case AArch64::BI_InterlockedCompareExchange128_acq:
    return MSVCIntrin::_InterlockedCompareExchange128_acq;
  case AArch64::BI_InterlockedCompareExchange128_nf:
    return MSVCIntrin::_InterlockedCompareExchange128_nf;
  case AArch64::BI_InterlockedCompareExchange128_rel:
    return MSVCIntrin::_InterlockedCompareExchange128_rel;
  case AArch64::BI_InterlockedOr8_acq:
  case AArch64::BI_InterlockedOr16_acq:
  case AArch64::BI_InterlockedOr_acq:
  case AArch64::BI_InterlockedOr64_acq:
    return MSVCIntrin::_InterlockedOr_acq;
  case AArch64::BI_InterlockedOr8_rel:
  case AArch64::BI_InterlockedOr16_rel:
  case AArch64::BI_InterlockedOr_rel:
  case AArch64::BI_InterlockedOr64_rel:
    return MSVCIntrin::_InterlockedOr_rel;
  case AArch64::BI_InterlockedOr8_nf:
  case AArch64::BI_InterlockedOr16_nf:
  case AArch64::BI_InterlockedOr_nf:
  case AArch64::BI_InterlockedOr64_nf:
    return MSVCIntrin::_InterlockedOr_nf;
  case AArch64::BI_InterlockedXor8_acq:
  case AArch64::BI_InterlockedXor16_acq:
  case AArch64::BI_InterlockedXor_acq:
  case AArch64::BI_InterlockedXor64_acq:
    return MSVCIntrin::_InterlockedXor_acq;
  case AArch64::BI_InterlockedXor8_rel:
  case AArch64::BI_InterlockedXor16_rel:
  case AArch64::BI_InterlockedXor_rel:
  case AArch64::BI_InterlockedXor64_rel:
    return MSVCIntrin::_InterlockedXor_rel;
  case AArch64::BI_InterlockedXor8_nf:
  case AArch64::BI_InterlockedXor16_nf:
  case AArch64::BI_InterlockedXor_nf:
  case AArch64::BI_InterlockedXor64_nf:
    return MSVCIntrin::_InterlockedXor_nf;
  case AArch64::BI_InterlockedAnd8_acq:
  case AArch64::BI_InterlockedAnd16_acq:
  case AArch64::BI_InterlockedAnd_acq:
  case AArch64::BI_InterlockedAnd64_acq:
    return MSVCIntrin::_InterlockedAnd_acq;
  case AArch64::BI_InterlockedAnd8_rel:
  case AArch64::BI_InterlockedAnd16_rel:
  case AArch64::BI_InterlockedAnd_rel:
  case AArch64::BI_InterlockedAnd64_rel:
    return MSVCIntrin::_InterlockedAnd_rel;
  case AArch64::BI_InterlockedAnd8_nf:
  case AArch64::BI_InterlockedAnd16_nf:
  case AArch64::BI_InterlockedAnd_nf:
  case AArch64::BI_InterlockedAnd64_nf:
    return MSVCIntrin::_InterlockedAnd_nf;
  // Increment / decrement: no 8-bit builtins are listed for these.
  case AArch64::BI_InterlockedIncrement16_acq:
  case AArch64::BI_InterlockedIncrement_acq:
  case AArch64::BI_InterlockedIncrement64_acq:
    return MSVCIntrin::_InterlockedIncrement_acq;
  case AArch64::BI_InterlockedIncrement16_rel:
  case AArch64::BI_InterlockedIncrement_rel:
  case AArch64::BI_InterlockedIncrement64_rel:
    return MSVCIntrin::_InterlockedIncrement_rel;
  case AArch64::BI_InterlockedIncrement16_nf:
  case AArch64::BI_InterlockedIncrement_nf:
  case AArch64::BI_InterlockedIncrement64_nf:
    return MSVCIntrin::_InterlockedIncrement_nf;
  case AArch64::BI_InterlockedDecrement16_acq:
  case AArch64::BI_InterlockedDecrement_acq:
  case AArch64::BI_InterlockedDecrement64_acq:
    return MSVCIntrin::_InterlockedDecrement_acq;
  case AArch64::BI_InterlockedDecrement16_rel:
  case AArch64::BI_InterlockedDecrement_rel:
  case AArch64::BI_InterlockedDecrement64_rel:
    return MSVCIntrin::_InterlockedDecrement_rel;
  case AArch64::BI_InterlockedDecrement16_nf:
  case AArch64::BI_InterlockedDecrement_nf:
  case AArch64::BI_InterlockedDecrement64_nf:
    return MSVCIntrin::_InterlockedDecrement_nf;
  }
  // Not reached: every path through the switch above returns.
  llvm_unreachable("must return from switch");
}
1464 
1465 static Optional<CodeGenFunction::MSVCIntrin>
1466 translateX86ToMsvcIntrin(unsigned BuiltinID) {
1467   using MSVCIntrin = CodeGenFunction::MSVCIntrin;
1468   switch (BuiltinID) {
1469   default:
1470     return None;
1471   case clang::X86::BI_BitScanForward:
1472   case clang::X86::BI_BitScanForward64:
1473     return MSVCIntrin::_BitScanForward;
1474   case clang::X86::BI_BitScanReverse:
1475   case clang::X86::BI_BitScanReverse64:
1476     return MSVCIntrin::_BitScanReverse;
1477   case clang::X86::BI_InterlockedAnd64:
1478     return MSVCIntrin::_InterlockedAnd;
1479   case clang::X86::BI_InterlockedCompareExchange128:
1480     return MSVCIntrin::_InterlockedCompareExchange128;
1481   case clang::X86::BI_InterlockedExchange64:
1482     return MSVCIntrin::_InterlockedExchange;
1483   case clang::X86::BI_InterlockedExchangeAdd64:
1484     return MSVCIntrin::_InterlockedExchangeAdd;
1485   case clang::X86::BI_InterlockedExchangeSub64:
1486     return MSVCIntrin::_InterlockedExchangeSub;
1487   case clang::X86::BI_InterlockedOr64:
1488     return MSVCIntrin::_InterlockedOr;
1489   case clang::X86::BI_InterlockedXor64:
1490     return MSVCIntrin::_InterlockedXor;
1491   case clang::X86::BI_InterlockedDecrement64:
1492     return MSVCIntrin::_InterlockedDecrement;
1493   case clang::X86::BI_InterlockedIncrement64:
1494     return MSVCIntrin::_InterlockedIncrement;
1495   }
1496   llvm_unreachable("must return from switch");
1497 }
1498 
// Emit an MSVC intrinsic. Assumes that arguments have *not* been evaluated.
//
// BuiltinID selects which intrinsic to emit and E is the call expression whose
// arguments are emitted here (or by the helper that is dispatched to). The
// returned value is the result of the intrinsic:
//   - _BitScanForward/_BitScanReverse: a PHI that is 0 when the scanned value
//     is zero and 1 otherwise; the bit index is stored through argument 0 only
//     on the non-zero path.
//   - _Interlocked*: whatever the corresponding atomic helper returns. The
//     _acq, _rel and _nf variants pass Acquire, Release and Monotonic ordering
//     explicitly; the unsuffixed variants use the helper's default ordering
//     (except _InterlockedCompareExchange128, which passes
//     SequentiallyConsistent explicitly).
//   - __fastfail: the call instruction for the emitted inline assembly.
Value *CodeGenFunction::EmitMSVCBuiltinExpr(MSVCIntrin BuiltinID,
                                            const CallExpr *E) {
  switch (BuiltinID) {
  case MSVCIntrin::_BitScanForward:
  case MSVCIntrin::_BitScanReverse: {
    // Argument 0 is the pointer that receives the index, argument 1 is the
    // value to scan.
    Address IndexAddress(EmitPointerWithAlignment(E->getArg(0)));
    Value *ArgValue = EmitScalarExpr(E->getArg(1));

    llvm::Type *ArgType = ArgValue->getType();
    llvm::Type *IndexType = IndexAddress.getElementType();
    llvm::Type *ResultType = ConvertType(E->getType());

    Value *ArgZero = llvm::Constant::getNullValue(ArgType);
    Value *ResZero = llvm::Constant::getNullValue(ResultType);
    Value *ResOne = llvm::ConstantInt::get(ResultType, 1);

    // Create the join block up front so the result PHI exists before either
    // incoming edge is added.
    BasicBlock *Begin = Builder.GetInsertBlock();
    BasicBlock *End = createBasicBlock("bitscan_end", this->CurFn);
    Builder.SetInsertPoint(End);
    PHINode *Result = Builder.CreatePHI(ResultType, 2, "bitscan_result");

    // Zero input: branch straight to the end with a result of 0 and without
    // storing an index.
    Builder.SetInsertPoint(Begin);
    Value *IsZero = Builder.CreateICmpEQ(ArgValue, ArgZero);
    BasicBlock *NotZero = createBasicBlock("bitscan_not_zero", this->CurFn);
    Builder.CreateCondBr(IsZero, End, NotZero);
    Result->addIncoming(ResZero, Begin);

    Builder.SetInsertPoint(NotZero);

    if (BuiltinID == MSVCIntrin::_BitScanForward) {
      // Forward scan: the index is the count of trailing zeros. The intrinsic
      // is called with its second operand set to true; the zero input was
      // already branched away above.
      Function *F = CGM.getIntrinsic(Intrinsic::cttz, ArgType);
      Value *ZeroCount = Builder.CreateCall(F, {ArgValue, Builder.getTrue()});
      ZeroCount = Builder.CreateIntCast(ZeroCount, IndexType, false);
      Builder.CreateStore(ZeroCount, IndexAddress, false);
    } else {
      // Reverse scan: the index is (bit width - 1) minus the count of leading
      // zeros.
      unsigned ArgWidth = cast<llvm::IntegerType>(ArgType)->getBitWidth();
      Value *ArgTypeLastIndex = llvm::ConstantInt::get(IndexType, ArgWidth - 1);

      Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
      Value *ZeroCount = Builder.CreateCall(F, {ArgValue, Builder.getTrue()});
      ZeroCount = Builder.CreateIntCast(ZeroCount, IndexType, false);
      Value *Index = Builder.CreateNSWSub(ArgTypeLastIndex, ZeroCount);
      Builder.CreateStore(Index, IndexAddress, false);
    }
    Builder.CreateBr(End);
    Result->addIncoming(ResOne, NotZero);

    // Leave the builder positioned in the join block for the caller.
    Builder.SetInsertPoint(End);
    return Result;
  }
  // Unsuffixed atomic read-modify-write operations (no explicit ordering is
  // passed to the helper).
  case MSVCIntrin::_InterlockedAnd:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E);
  case MSVCIntrin::_InterlockedExchange:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E);
  case MSVCIntrin::_InterlockedExchangeAdd:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E);
  case MSVCIntrin::_InterlockedExchangeSub:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Sub, E);
  case MSVCIntrin::_InterlockedOr:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E);
  case MSVCIntrin::_InterlockedXor:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xor, E);
  // Ordering-suffixed variants: _acq -> Acquire, _rel -> Release,
  // _nf -> Monotonic.
  case MSVCIntrin::_InterlockedExchangeAdd_acq:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E,
                                 AtomicOrdering::Acquire);
  case MSVCIntrin::_InterlockedExchangeAdd_rel:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E,
                                 AtomicOrdering::Release);
  case MSVCIntrin::_InterlockedExchangeAdd_nf:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E,
                                 AtomicOrdering::Monotonic);
  case MSVCIntrin::_InterlockedExchange_acq:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E,
                                 AtomicOrdering::Acquire);
  case MSVCIntrin::_InterlockedExchange_rel:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E,
                                 AtomicOrdering::Release);
  case MSVCIntrin::_InterlockedExchange_nf:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E,
                                 AtomicOrdering::Monotonic);
  case MSVCIntrin::_InterlockedCompareExchange_acq:
    return EmitAtomicCmpXchgForMSIntrin(*this, E, AtomicOrdering::Acquire);
  case MSVCIntrin::_InterlockedCompareExchange_rel:
    return EmitAtomicCmpXchgForMSIntrin(*this, E, AtomicOrdering::Release);
  case MSVCIntrin::_InterlockedCompareExchange_nf:
    return EmitAtomicCmpXchgForMSIntrin(*this, E, AtomicOrdering::Monotonic);
  // 128-bit compare-and-exchange; the unsuffixed form is sequentially
  // consistent.
  case MSVCIntrin::_InterlockedCompareExchange128:
    return EmitAtomicCmpXchg128ForMSIntrin(
        *this, E, AtomicOrdering::SequentiallyConsistent);
  case MSVCIntrin::_InterlockedCompareExchange128_acq:
    return EmitAtomicCmpXchg128ForMSIntrin(*this, E, AtomicOrdering::Acquire);
  case MSVCIntrin::_InterlockedCompareExchange128_rel:
    return EmitAtomicCmpXchg128ForMSIntrin(*this, E, AtomicOrdering::Release);
  case MSVCIntrin::_InterlockedCompareExchange128_nf:
    return EmitAtomicCmpXchg128ForMSIntrin(*this, E, AtomicOrdering::Monotonic);
  case MSVCIntrin::_InterlockedOr_acq:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E,
                                 AtomicOrdering::Acquire);
  case MSVCIntrin::_InterlockedOr_rel:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E,
                                 AtomicOrdering::Release);
  case MSVCIntrin::_InterlockedOr_nf:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E,
                                 AtomicOrdering::Monotonic);
  case MSVCIntrin::_InterlockedXor_acq:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xor, E,
                                 AtomicOrdering::Acquire);
  case MSVCIntrin::_InterlockedXor_rel:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xor, E,
                                 AtomicOrdering::Release);
  case MSVCIntrin::_InterlockedXor_nf:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xor, E,
                                 AtomicOrdering::Monotonic);
  case MSVCIntrin::_InterlockedAnd_acq:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E,
                                 AtomicOrdering::Acquire);
  case MSVCIntrin::_InterlockedAnd_rel:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E,
                                 AtomicOrdering::Release);
  case MSVCIntrin::_InterlockedAnd_nf:
    return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E,
                                 AtomicOrdering::Monotonic);
  case MSVCIntrin::_InterlockedIncrement_acq:
    return EmitAtomicIncrementValue(*this, E, AtomicOrdering::Acquire);
  case MSVCIntrin::_InterlockedIncrement_rel:
    return EmitAtomicIncrementValue(*this, E, AtomicOrdering::Release);
  case MSVCIntrin::_InterlockedIncrement_nf:
    return EmitAtomicIncrementValue(*this, E, AtomicOrdering::Monotonic);
  case MSVCIntrin::_InterlockedDecrement_acq:
    return EmitAtomicDecrementValue(*this, E, AtomicOrdering::Acquire);
  case MSVCIntrin::_InterlockedDecrement_rel:
    return EmitAtomicDecrementValue(*this, E, AtomicOrdering::Release);
  case MSVCIntrin::_InterlockedDecrement_nf:
    return EmitAtomicDecrementValue(*this, E, AtomicOrdering::Monotonic);

  case MSVCIntrin::_InterlockedDecrement:
    return EmitAtomicDecrementValue(*this, E);
  case MSVCIntrin::_InterlockedIncrement:
    return EmitAtomicIncrementValue(*this, E);

  case MSVCIntrin::__fastfail: {
    // Request immediate process termination from the kernel. The instruction
    // sequences to do this are documented on MSDN:
    // https://msdn.microsoft.com/en-us/library/dn774154.aspx
    llvm::Triple::ArchType ISA = getTarget().getTriple().getArch();
    StringRef Asm, Constraints;
    // Pick the trap instruction and the register constraint that carries the
    // single i32 argument for the current architecture.
    switch (ISA) {
    default:
      // An error is reported, but emission continues below with empty Asm and
      // Constraints strings.
      ErrorUnsupported(E, "__fastfail call for this architecture");
      break;
    case llvm::Triple::x86:
    case llvm::Triple::x86_64:
      Asm = "int $$0x29";
      Constraints = "{cx}";
      break;
    case llvm::Triple::thumb:
      Asm = "udf #251";
      Constraints = "{r0}";
      break;
    case llvm::Triple::aarch64:
      Asm = "brk #0xF003";
      Constraints = "{w0}";
    }
    // void(i32) inline assembly with side effects; the call is marked
    // noreturn.
    llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, {Int32Ty}, false);
    llvm::InlineAsm *IA =
        llvm::InlineAsm::get(FTy, Asm, Constraints, /*hasSideEffects=*/true);
    llvm::AttributeList NoReturnAttr = llvm::AttributeList::get(
        getLLVMContext(), llvm::AttributeList::FunctionIndex,
        llvm::Attribute::NoReturn);
    llvm::CallInst *CI = Builder.CreateCall(IA, EmitScalarExpr(E->getArg(0)));
    CI->setAttributes(NoReturnAttr);
    return CI;
  }
  }
  llvm_unreachable("Incorrect MSVC intrinsic!");
}
1676 
1677 namespace {
1678 // ARC cleanup for __builtin_os_log_format
1679 struct CallObjCArcUse final : EHScopeStack::Cleanup {
1680   CallObjCArcUse(llvm::Value *object) : object(object) {}
1681   llvm::Value *object;
1682 
1683   void Emit(CodeGenFunction &CGF, Flags flags) override {
1684     CGF.EmitARCIntrinsicUse(object);
1685   }
1686 };
1687 }
1688 
1689 Value *CodeGenFunction::EmitCheckedArgForBuiltin(const Expr *E,
1690                                                  BuiltinCheckKind Kind) {
1691   assert((Kind == BCK_CLZPassedZero || Kind == BCK_CTZPassedZero)
1692           && "Unsupported builtin check kind");
1693 
1694   Value *ArgValue = EmitScalarExpr(E);
1695   if (!SanOpts.has(SanitizerKind::Builtin) || !getTarget().isCLZForZeroUndef())
1696     return ArgValue;
1697 
1698   SanitizerScope SanScope(this);
1699   Value *Cond = Builder.CreateICmpNE(
1700       ArgValue, llvm::Constant::getNullValue(ArgValue->getType()));
1701   EmitCheck(std::make_pair(Cond, SanitizerKind::Builtin),
1702             SanitizerHandler::InvalidBuiltin,
1703             {EmitCheckSourceLocation(E->getExprLoc()),
1704              llvm::ConstantInt::get(Builder.getInt8Ty(), Kind)},
1705             None);
1706   return ArgValue;
1707 }
1708 
1709 /// Get the argument type for arguments to os_log_helper.
1710 static CanQualType getOSLogArgType(ASTContext &C, int Size) {
1711   QualType UnsignedTy = C.getIntTypeForBitwidth(Size * 8, /*Signed=*/false);
1712   return C.getCanonicalType(UnsignedTy);
1713 }
1714 
/// Return the helper function that serializes an os_log buffer with the given
/// \p Layout into a buffer aligned to \p BufferAlignment, creating it in the
/// module if it does not exist yet.
///
/// The helper's name encodes the buffer alignment, the summary byte, the
/// argument-count byte and every item's size and descriptor bytes, so calls
/// with the same layout and alignment reuse one function. Its parameters are
/// the buffer pointer followed by one unsigned integer per item whose size
/// byte is non-zero. Its body writes the summary and argument-count bytes,
/// then for each item its descriptor byte, its size byte and (when the size is
/// non-zero) the argument data.
llvm::Function *CodeGenFunction::generateBuiltinOSLogHelperFunction(
    const analyze_os_log::OSLogBufferLayout &Layout,
    CharUnits BufferAlignment) {
  ASTContext &Ctx = getContext();

  // Build the helper name from everything that determines its body.
  llvm::SmallString<64> Name;
  {
    raw_svector_ostream OS(Name);
    OS << "__os_log_helper";
    OS << "_" << BufferAlignment.getQuantity();
    OS << "_" << int(Layout.getSummaryByte());
    OS << "_" << int(Layout.getNumArgsByte());
    for (const auto &Item : Layout.Items)
      OS << "_" << int(Item.getSizeByte()) << "_"
         << int(Item.getDescriptorByte());
  }

  // Reuse a helper of the same name if the module already has one.
  if (llvm::Function *F = CGM.getModule().getFunction(Name))
    return F;

  // Parameter 0 is the destination buffer.
  llvm::SmallVector<QualType, 4> ArgTys;
  FunctionArgList Args;
  Args.push_back(ImplicitParamDecl::Create(
      Ctx, nullptr, SourceLocation(), &Ctx.Idents.get("buffer"), Ctx.VoidPtrTy,
      ImplicitParamDecl::Other));
  ArgTys.emplace_back(Ctx.VoidPtrTy);

  // One "argN" parameter per item that carries data; N is the item's index in
  // the layout, so the names can skip numbers when items have size zero.
  for (unsigned int I = 0, E = Layout.Items.size(); I < E; ++I) {
    char Size = Layout.Items[I].getSizeByte();
    if (!Size)
      continue;

    QualType ArgTy = getOSLogArgType(Ctx, Size);
    Args.push_back(ImplicitParamDecl::Create(
        Ctx, nullptr, SourceLocation(),
        &Ctx.Idents.get(std::string("arg") + llvm::to_string(I)), ArgTy,
        ImplicitParamDecl::Other));
    ArgTys.emplace_back(ArgTy);
  }

  QualType ReturnTy = Ctx.VoidTy;

  // The helper function has linkonce_odr linkage to enable the linker to merge
  // identical functions. To ensure the merging always happens, 'noinline' is
  // attached to the function when compiling with -Oz.
  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(ReturnTy, Args);
  llvm::FunctionType *FuncTy = CGM.getTypes().GetFunctionType(FI);
  llvm::Function *Fn = llvm::Function::Create(
      FuncTy, llvm::GlobalValue::LinkOnceODRLinkage, Name, &CGM.getModule());
  Fn->setVisibility(llvm::GlobalValue::HiddenVisibility);
  CGM.SetLLVMFunctionAttributes(GlobalDecl(), FI, Fn, /*IsThunk=*/false);
  CGM.SetLLVMFunctionAttributesForDefinition(nullptr, Fn);
  Fn->setDoesNotThrow();

  // Attach 'noinline' at -Oz.
  if (CGM.getCodeGenOpts().OptimizeSize == 2)
    Fn->addFnAttr(llvm::Attribute::NoInline);

  auto NL = ApplyDebugLocation::CreateEmpty(*this);
  StartFunction(GlobalDecl(), ReturnTy, Fn, FI, Args);

  // Create a scope with an artificial location for the body of this function.
  auto AL = ApplyDebugLocation::CreateArtificial(*this);

  // Offset tracks the byte position in the buffer of the next store.
  CharUnits Offset;
  Address BufAddr(Builder.CreateLoad(GetAddrOfLocalVar(Args[0]), "buf"),
                  BufferAlignment);
  Builder.CreateStore(Builder.getInt8(Layout.getSummaryByte()),
                      Builder.CreateConstByteGEP(BufAddr, Offset++, "summary"));
  Builder.CreateStore(Builder.getInt8(Layout.getNumArgsByte()),
                      Builder.CreateConstByteGEP(BufAddr, Offset++, "numArgs"));

  // I indexes Args; it starts at 1 because Args[0] is the buffer pointer and
  // only advances for items that have a data parameter.
  unsigned I = 1;
  for (const auto &Item : Layout.Items) {
    Builder.CreateStore(
        Builder.getInt8(Item.getDescriptorByte()),
        Builder.CreateConstByteGEP(BufAddr, Offset++, "argDescriptor"));
    Builder.CreateStore(
        Builder.getInt8(Item.getSizeByte()),
        Builder.CreateConstByteGEP(BufAddr, Offset++, "argSize"));

    // Items without data contribute only their two header bytes.
    CharUnits Size = Item.size();
    if (!Size.getQuantity())
      continue;

    // Copy the parameter's value into the buffer at the current offset,
    // viewing the destination through the parameter's pointer type.
    Address Arg = GetAddrOfLocalVar(Args[I]);
    Address Addr = Builder.CreateConstByteGEP(BufAddr, Offset, "argData");
    Addr = Builder.CreateBitCast(Addr, Arg.getPointer()->getType(),
                                 "argDataCast");
    Builder.CreateStore(Builder.CreateLoad(Arg), Addr);
    Offset += Size;
    ++I;
  }

  FinishFunction();

  return Fn;
}
1814 
/// Emit a call to __builtin_os_log_format.
///
/// Computes the os_log buffer layout for the call \p E, emits one integer
/// argument per layout item that has a non-zero size, and calls the matching
/// os_log helper function with the destination buffer (argument 0 of \p E)
/// followed by those arguments.
///
/// \returns the buffer pointer as an RValue.
RValue CodeGenFunction::emitBuiltinOSLogFormat(const CallExpr &E) {
  assert(E.getNumArgs() >= 2 &&
         "__builtin_os_log_format takes at least 2 arguments");
  ASTContext &Ctx = getContext();
  analyze_os_log::OSLogBufferLayout Layout;
  analyze_os_log::computeOSLogBufferLayout(Ctx, &E, Layout);
  Address BufAddr = EmitPointerWithAlignment(E.getArg(0));
  llvm::SmallVector<llvm::Value *, 4> RetainableOperands;

  // Ignore argument 1, the format string. It is not currently used.
  CallArgList Args;
  Args.add(RValue::get(BufAddr.getPointer()), Ctx.VoidPtrTy);

  for (const auto &Item : Layout.Items) {
    // Items with a zero size byte have no corresponding helper argument.
    int Size = Item.getSizeByte();
    if (!Size)
      continue;

    llvm::Value *ArgVal;

    if (Item.getKind() == analyze_os_log::OSLogBufferItem::MaskKind) {
      // Pack the characters of the mask type into a 64-bit constant, with
      // character I occupying bits [I*8, I*8+8).
      uint64_t Val = 0;
      for (unsigned I = 0, E = Item.getMaskType().size(); I < E; ++I)
        Val |= ((uint64_t)Item.getMaskType()[I]) << I * 8;
      ArgVal = llvm::Constant::getIntegerValue(Int64Ty, llvm::APInt(64, Val));
    } else if (const Expr *TheExpr = Item.getExpr()) {
      ArgVal = EmitScalarExpr(TheExpr, /*Ignore*/ false);

      // If a temporary object that requires destruction after the full
      // expression is passed, push a lifetime-extended cleanup to extend its
      // lifetime to the end of the enclosing block scope.
      auto LifetimeExtendObject = [&](const Expr *E) {
        E = E->IgnoreParenCasts();
        // Extend lifetimes of objects returned by function calls and message
        // sends.

        // FIXME: We should do this in other cases in which temporaries are
        //        created including arguments of non-ARC types (e.g., C++
        //        temporaries).
        if (isa<CallExpr>(E) || isa<ObjCMessageExpr>(E))
          return true;
        return false;
      };

      if (TheExpr->getType()->isObjCRetainableType() &&
          getLangOpts().ObjCAutoRefCount && LifetimeExtendObject(TheExpr)) {
        assert(getEvaluationKind(TheExpr->getType()) == TEK_Scalar &&
               "Only scalar can be a ObjC retainable type");
        // Constants are left alone: no retain, temporary or cleanup is
        // emitted for them.
        if (!isa<Constant>(ArgVal)) {
          // Retain the value, keep it in a temporary, and register a
          // lifetime-extended destroy for that temporary.
          CleanupKind Cleanup = getARCCleanupKind();
          QualType Ty = TheExpr->getType();
          Address Alloca = Address::invalid();
          Address Addr = CreateMemTemp(Ty, "os.log.arg", &Alloca);
          ArgVal = EmitARCRetain(Ty, ArgVal);
          Builder.CreateStore(ArgVal, Addr);
          pushLifetimeExtendedDestroy(Cleanup, Alloca, Ty,
                                      CodeGenFunction::destroyARCStrongPrecise,
                                      Cleanup & EHCleanup);

          // Push a clang.arc.use call to ensure ARC optimizer knows that the
          // argument has to be alive.
          if (CGM.getCodeGenOpts().OptimizationLevel != 0)
            pushCleanupAfterFullExpr<CallObjCArcUse>(Cleanup, ArgVal);
        }
      }
    } else {
      // Neither a mask nor an expression: the item carries a constant value,
      // emitted as an i32.
      ArgVal = Builder.getInt32(Item.getConstValue().getQuantity());
    }

    // Reinterpret the value as an integer of its own bit size, then convert
    // it to the helper's argument type for this item's size.
    unsigned ArgValSize =
        CGM.getDataLayout().getTypeSizeInBits(ArgVal->getType());
    llvm::IntegerType *IntTy = llvm::Type::getIntNTy(getLLVMContext(),
                                                     ArgValSize);
    ArgVal = Builder.CreateBitOrPointerCast(ArgVal, IntTy);
    CanQualType ArgTy = getOSLogArgType(Ctx, Size);
    // If ArgVal has type x86_fp80, zero-extend ArgVal.
    ArgVal = Builder.CreateZExtOrBitCast(ArgVal, ConvertType(ArgTy));
    Args.add(RValue::get(ArgVal), ArgTy);
  }

  // The helper is generated (or looked up) through a fresh CodeGenFunction,
  // then called from the current function.
  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeBuiltinFunctionCall(Ctx.VoidTy, Args);
  llvm::Function *F = CodeGenFunction(CGM).generateBuiltinOSLogHelperFunction(
      Layout, BufAddr.getAlignment());
  EmitCall(FI, CGCallee::forDirect(F), ReturnValueSlot(), Args);
  return RValue::get(BufAddr.getPointer());
}
1902 
1903 static bool isSpecialUnsignedMultiplySignedResult(
1904     unsigned BuiltinID, WidthAndSignedness Op1Info, WidthAndSignedness Op2Info,
1905     WidthAndSignedness ResultInfo) {
1906   return BuiltinID == Builtin::BI__builtin_mul_overflow &&
1907          Op1Info.Width == Op2Info.Width && Op2Info.Width == ResultInfo.Width &&
1908          !Op1Info.Signed && !Op2Info.Signed && ResultInfo.Signed;
1909 }
1910 
1911 static RValue EmitCheckedUnsignedMultiplySignedResult(
1912     CodeGenFunction &CGF, const clang::Expr *Op1, WidthAndSignedness Op1Info,
1913     const clang::Expr *Op2, WidthAndSignedness Op2Info,
1914     const clang::Expr *ResultArg, QualType ResultQTy,
1915     WidthAndSignedness ResultInfo) {
1916   assert(isSpecialUnsignedMultiplySignedResult(
1917              Builtin::BI__builtin_mul_overflow, Op1Info, Op2Info, ResultInfo) &&
1918          "Cannot specialize this multiply");
1919 
1920   llvm::Value *V1 = CGF.EmitScalarExpr(Op1);
1921   llvm::Value *V2 = CGF.EmitScalarExpr(Op2);
1922 
1923   llvm::Value *HasOverflow;
1924   llvm::Value *Result = EmitOverflowIntrinsic(
1925       CGF, llvm::Intrinsic::umul_with_overflow, V1, V2, HasOverflow);
1926 
1927   // The intrinsic call will detect overflow when the value is > UINT_MAX,
1928   // however, since the original builtin had a signed result, we need to report
1929   // an overflow when the result is greater than INT_MAX.
1930   auto IntMax = llvm::APInt::getSignedMaxValue(ResultInfo.Width);
1931   llvm::Value *IntMaxValue = llvm::ConstantInt::get(Result->getType(), IntMax);
1932 
1933   llvm::Value *IntMaxOverflow = CGF.Builder.CreateICmpUGT(Result, IntMaxValue);
1934   HasOverflow = CGF.Builder.CreateOr(HasOverflow, IntMaxOverflow);
1935 
1936   bool isVolatile =
1937       ResultArg->getType()->getPointeeType().isVolatileQualified();
1938   Address ResultPtr = CGF.EmitPointerWithAlignment(ResultArg);
1939   CGF.Builder.CreateStore(CGF.EmitToMemory(Result, ResultQTy), ResultPtr,
1940                           isVolatile);
1941   return RValue::get(HasOverflow);
1942 }
1943 
1944 /// Determine if a binop is a checked mixed-sign multiply we can specialize.
1945 static bool isSpecialMixedSignMultiply(unsigned BuiltinID,
1946                                        WidthAndSignedness Op1Info,
1947                                        WidthAndSignedness Op2Info,
1948                                        WidthAndSignedness ResultInfo) {
1949   return BuiltinID == Builtin::BI__builtin_mul_overflow &&
1950          std::max(Op1Info.Width, Op2Info.Width) >= ResultInfo.Width &&
1951          Op1Info.Signed != Op2Info.Signed;
1952 }
1953 
/// Emit a checked mixed-sign multiply. This is a cheaper specialization of
/// the generic checked-binop irgen.
///
/// Exactly one of \p Op1 / \p Op2 is signed (asserted through
/// isSpecialMixedSignMultiply). The signed operand's absolute value is
/// multiplied with the unsigned operand using the unsigned
/// multiply-with-overflow intrinsic; the sign is then re-applied and the
/// overflow condition is adjusted for the signedness and width of the result.
/// The (truncated) result is stored through \p ResultArg.
///
/// \returns the overflow flag as an RValue.
static RValue
EmitCheckedMixedSignMultiply(CodeGenFunction &CGF, const clang::Expr *Op1,
                             WidthAndSignedness Op1Info, const clang::Expr *Op2,
                             WidthAndSignedness Op2Info,
                             const clang::Expr *ResultArg, QualType ResultQTy,
                             WidthAndSignedness ResultInfo) {
  assert(isSpecialMixedSignMultiply(Builtin::BI__builtin_mul_overflow, Op1Info,
                                    Op2Info, ResultInfo) &&
         "Not a mixed-sign multipliction we can specialize");

  // Emit the signed and unsigned operands.
  const clang::Expr *SignedOp = Op1Info.Signed ? Op1 : Op2;
  const clang::Expr *UnsignedOp = Op1Info.Signed ? Op2 : Op1;
  llvm::Value *Signed = CGF.EmitScalarExpr(SignedOp);
  llvm::Value *Unsigned = CGF.EmitScalarExpr(UnsignedOp);
  unsigned SignedOpWidth = Op1Info.Signed ? Op1Info.Width : Op2Info.Width;
  unsigned UnsignedOpWidth = Op1Info.Signed ? Op2Info.Width : Op1Info.Width;

  // One of the operands may be smaller than the other. If so, [s|z]ext it.
  if (SignedOpWidth < UnsignedOpWidth)
    Signed = CGF.Builder.CreateSExt(Signed, Unsigned->getType(), "op.sext");
  if (UnsignedOpWidth < SignedOpWidth)
    Unsigned = CGF.Builder.CreateZExt(Unsigned, Signed->getType(), "op.zext");

  // After the extension above both operands have the same type; OpWidth is
  // the wider of the two original operand widths.
  llvm::Type *OpTy = Signed->getType();
  llvm::Value *Zero = llvm::Constant::getNullValue(OpTy);
  Address ResultPtr = CGF.EmitPointerWithAlignment(ResultArg);
  llvm::Type *ResTy = ResultPtr.getElementType();
  unsigned OpWidth = std::max(Op1Info.Width, Op2Info.Width);

  // Take the absolute value of the signed operand.
  llvm::Value *IsNegative = CGF.Builder.CreateICmpSLT(Signed, Zero);
  llvm::Value *AbsOfNegative = CGF.Builder.CreateSub(Zero, Signed);
  llvm::Value *AbsSigned =
      CGF.Builder.CreateSelect(IsNegative, AbsOfNegative, Signed);

  // Perform a checked unsigned multiplication.
  llvm::Value *UnsignedOverflow;
  llvm::Value *UnsignedResult =
      EmitOverflowIntrinsic(CGF, llvm::Intrinsic::umul_with_overflow, AbsSigned,
                            Unsigned, UnsignedOverflow);

  llvm::Value *Overflow, *Result;
  if (ResultInfo.Signed) {
    // Signed overflow occurs if the result is greater than INT_MAX or lesser
    // than INT_MIN, i.e when |Result| > (INT_MAX + IsNegative).
    auto IntMax =
        llvm::APInt::getSignedMaxValue(ResultInfo.Width).zextOrSelf(OpWidth);
    llvm::Value *MaxResult =
        CGF.Builder.CreateAdd(llvm::ConstantInt::get(OpTy, IntMax),
                              CGF.Builder.CreateZExt(IsNegative, OpTy));
    llvm::Value *SignedOverflow =
        CGF.Builder.CreateICmpUGT(UnsignedResult, MaxResult);
    Overflow = CGF.Builder.CreateOr(UnsignedOverflow, SignedOverflow);

    // Prepare the signed result (possibly by negating it).
    llvm::Value *NegativeResult = CGF.Builder.CreateNeg(UnsignedResult);
    llvm::Value *SignedResult =
        CGF.Builder.CreateSelect(IsNegative, NegativeResult, UnsignedResult);
    Result = CGF.Builder.CreateTrunc(SignedResult, ResTy);
  } else {
    // Unsigned overflow occurs if the result is < 0 or greater than UINT_MAX.
    // "< 0" here means the signed operand was negative and the product is
    // non-zero.
    llvm::Value *Underflow = CGF.Builder.CreateAnd(
        IsNegative, CGF.Builder.CreateIsNotNull(UnsignedResult));
    Overflow = CGF.Builder.CreateOr(UnsignedOverflow, Underflow);
    // When the result type is narrower than the operation type, a product
    // that does not fit in the result width is also an overflow.
    if (ResultInfo.Width < OpWidth) {
      auto IntMax =
          llvm::APInt::getMaxValue(ResultInfo.Width).zext(OpWidth);
      llvm::Value *TruncOverflow = CGF.Builder.CreateICmpUGT(
          UnsignedResult, llvm::ConstantInt::get(OpTy, IntMax));
      Overflow = CGF.Builder.CreateOr(Overflow, TruncOverflow);
    }

    // Negate the product if it would be negative in infinite precision.
    Result = CGF.Builder.CreateSelect(
        IsNegative, CGF.Builder.CreateNeg(UnsignedResult), UnsignedResult);

    Result = CGF.Builder.CreateTrunc(Result, ResTy);
  }
  assert(Overflow && Result && "Missing overflow or result");

  // Store the result, honouring a volatile-qualified destination.
  bool isVolatile =
      ResultArg->getType()->getPointeeType().isVolatileQualified();
  CGF.Builder.CreateStore(CGF.EmitToMemory(Result, ResultQTy), ResultPtr,
                          isVolatile);
  return RValue::get(Overflow);
}
2043 
2044 static llvm::Value *dumpRecord(CodeGenFunction &CGF, QualType RType,
2045                                Value *&RecordPtr, CharUnits Align,
2046                                llvm::FunctionCallee Func, int Lvl) {
2047   ASTContext &Context = CGF.getContext();
2048   RecordDecl *RD = RType->castAs<RecordType>()->getDecl()->getDefinition();
2049   std::string Pad = std::string(Lvl * 4, ' ');
2050 
2051   Value *GString =
2052       CGF.Builder.CreateGlobalStringPtr(RType.getAsString() + " {\n");
2053   Value *Res = CGF.Builder.CreateCall(Func, {GString});
2054 
2055   static llvm::DenseMap<QualType, const char *> Types;
2056   if (Types.empty()) {
2057     Types[Context.CharTy] = "%c";
2058     Types[Context.BoolTy] = "%d";
2059     Types[Context.SignedCharTy] = "%hhd";
2060     Types[Context.UnsignedCharTy] = "%hhu";
2061     Types[Context.IntTy] = "%d";
2062     Types[Context.UnsignedIntTy] = "%u";
2063     Types[Context.LongTy] = "%ld";
2064     Types[Context.UnsignedLongTy] = "%lu";
2065     Types[Context.LongLongTy] = "%lld";
2066     Types[Context.UnsignedLongLongTy] = "%llu";
2067     Types[Context.ShortTy] = "%hd";
2068     Types[Context.UnsignedShortTy] = "%hu";
2069     Types[Context.VoidPtrTy] = "%p";
2070     Types[Context.FloatTy] = "%f";
2071     Types[Context.DoubleTy] = "%f";
2072     Types[Context.LongDoubleTy] = "%Lf";
2073     Types[Context.getPointerType(Context.CharTy)] = "%s";
2074     Types[Context.getPointerType(Context.getConstType(Context.CharTy))] = "%s";
2075   }
2076 
2077   for (const auto *FD : RD->fields()) {
2078     Value *FieldPtr = RecordPtr;
2079     if (RD->isUnion())
2080       FieldPtr = CGF.Builder.CreatePointerCast(
2081           FieldPtr, CGF.ConvertType(Context.getPointerType(FD->getType())));
2082     else
2083       FieldPtr = CGF.Builder.CreateStructGEP(CGF.ConvertType(RType), FieldPtr,
2084                                              FD->getFieldIndex());
2085 
2086     GString = CGF.Builder.CreateGlobalStringPtr(
2087         llvm::Twine(Pad)
2088             .concat(FD->getType().getAsString())
2089             .concat(llvm::Twine(' '))
2090             .concat(FD->getNameAsString())
2091             .concat(" : ")
2092             .str());
2093     Value *TmpRes = CGF.Builder.CreateCall(Func, {GString});
2094     Res = CGF.Builder.CreateAdd(Res, TmpRes);
2095 
2096     QualType CanonicalType =
2097         FD->getType().getUnqualifiedType().getCanonicalType();
2098 
2099     // We check whether we are in a recursive type
2100     if (CanonicalType->isRecordType()) {
2101       TmpRes = dumpRecord(CGF, CanonicalType, FieldPtr, Align, Func, Lvl + 1);
2102       Res = CGF.Builder.CreateAdd(TmpRes, Res);
2103       continue;
2104     }
2105 
2106     // We try to determine the best format to print the current field
2107     llvm::Twine Format = Types.find(CanonicalType) == Types.end()
2108                              ? Types[Context.VoidPtrTy]
2109                              : Types[CanonicalType];
2110 
2111     Address FieldAddress = Address(FieldPtr, Align);
2112     FieldPtr = CGF.Builder.CreateLoad(FieldAddress);
2113 
2114     // FIXME Need to handle bitfield here
2115     GString = CGF.Builder.CreateGlobalStringPtr(
2116         Format.concat(llvm::Twine('\n')).str());
2117     TmpRes = CGF.Builder.CreateCall(Func, {GString, FieldPtr});
2118     Res = CGF.Builder.CreateAdd(Res, TmpRes);
2119   }
2120 
2121   GString = CGF.Builder.CreateGlobalStringPtr(Pad + "}\n");
2122   Value *TmpRes = CGF.Builder.CreateCall(Func, {GString});
2123   Res = CGF.Builder.CreateAdd(Res, TmpRes);
2124   return Res;
2125 }
2126 
2127 static bool
2128 TypeRequiresBuiltinLaunderImp(const ASTContext &Ctx, QualType Ty,
2129                               llvm::SmallPtrSetImpl<const Decl *> &Seen) {
2130   if (const auto *Arr = Ctx.getAsArrayType(Ty))
2131     Ty = Ctx.getBaseElementType(Arr);
2132 
2133   const auto *Record = Ty->getAsCXXRecordDecl();
2134   if (!Record)
2135     return false;
2136 
2137   // We've already checked this type, or are in the process of checking it.
2138   if (!Seen.insert(Record).second)
2139     return false;
2140 
2141   assert(Record->hasDefinition() &&
2142          "Incomplete types should already be diagnosed");
2143 
2144   if (Record->isDynamicClass())
2145     return true;
2146 
2147   for (FieldDecl *F : Record->fields()) {
2148     if (TypeRequiresBuiltinLaunderImp(Ctx, F->getType(), Seen))
2149       return true;
2150   }
2151   return false;
2152 }
2153 
2154 /// Determine if the specified type requires laundering by checking if it is a
2155 /// dynamic class type or contains a subobject which is a dynamic class type.
2156 static bool TypeRequiresBuiltinLaunder(CodeGenModule &CGM, QualType Ty) {
2157   if (!CGM.getCodeGenOpts().StrictVTablePointers)
2158     return false;
2159   llvm::SmallPtrSet<const Decl *, 16> Seen;
2160   return TypeRequiresBuiltinLaunderImp(CGM.getContext(), Ty, Seen);
2161 }
2162 
2163 RValue CodeGenFunction::emitRotate(const CallExpr *E, bool IsRotateRight) {
2164   llvm::Value *Src = EmitScalarExpr(E->getArg(0));
2165   llvm::Value *ShiftAmt = EmitScalarExpr(E->getArg(1));
2166 
2167   // The builtin's shift arg may have a different type than the source arg and
2168   // result, but the LLVM intrinsic uses the same type for all values.
2169   llvm::Type *Ty = Src->getType();
2170   ShiftAmt = Builder.CreateIntCast(ShiftAmt, Ty, false);
2171 
2172   // Rotate is a special case of LLVM funnel shift - 1st 2 args are the same.
2173   unsigned IID = IsRotateRight ? Intrinsic::fshr : Intrinsic::fshl;
2174   Function *F = CGM.getIntrinsic(IID, Ty);
2175   return RValue::get(Builder.CreateCall(F, { Src, Src, ShiftAmt }));
2176 }
2177 
2178 // Map math builtins for long-double to f128 version.
2179 static unsigned mutateLongDoubleBuiltin(unsigned BuiltinID) {
2180   switch (BuiltinID) {
2181 #define MUTATE_LDBL(func) \
2182   case Builtin::BI__builtin_##func##l: \
2183     return Builtin::BI__builtin_##func##f128;
2184   MUTATE_LDBL(sqrt)
2185   MUTATE_LDBL(cbrt)
2186   MUTATE_LDBL(fabs)
2187   MUTATE_LDBL(log)
2188   MUTATE_LDBL(log2)
2189   MUTATE_LDBL(log10)
2190   MUTATE_LDBL(log1p)
2191   MUTATE_LDBL(logb)
2192   MUTATE_LDBL(exp)
2193   MUTATE_LDBL(exp2)
2194   MUTATE_LDBL(expm1)
2195   MUTATE_LDBL(fdim)
2196   MUTATE_LDBL(hypot)
2197   MUTATE_LDBL(ilogb)
2198   MUTATE_LDBL(pow)
2199   MUTATE_LDBL(fmin)
2200   MUTATE_LDBL(fmax)
2201   MUTATE_LDBL(ceil)
2202   MUTATE_LDBL(trunc)
2203   MUTATE_LDBL(rint)
2204   MUTATE_LDBL(nearbyint)
2205   MUTATE_LDBL(round)
2206   MUTATE_LDBL(floor)
2207   MUTATE_LDBL(lround)
2208   MUTATE_LDBL(llround)
2209   MUTATE_LDBL(lrint)
2210   MUTATE_LDBL(llrint)
2211   MUTATE_LDBL(fmod)
2212   MUTATE_LDBL(modf)
2213   MUTATE_LDBL(nan)
2214   MUTATE_LDBL(nans)
2215   MUTATE_LDBL(inf)
2216   MUTATE_LDBL(fma)
2217   MUTATE_LDBL(sin)
2218   MUTATE_LDBL(cos)
2219   MUTATE_LDBL(tan)
2220   MUTATE_LDBL(sinh)
2221   MUTATE_LDBL(cosh)
2222   MUTATE_LDBL(tanh)
2223   MUTATE_LDBL(asin)
2224   MUTATE_LDBL(acos)
2225   MUTATE_LDBL(atan)
2226   MUTATE_LDBL(asinh)
2227   MUTATE_LDBL(acosh)
2228   MUTATE_LDBL(atanh)
2229   MUTATE_LDBL(atan2)
2230   MUTATE_LDBL(erf)
2231   MUTATE_LDBL(erfc)
2232   MUTATE_LDBL(ldexp)
2233   MUTATE_LDBL(frexp)
2234   MUTATE_LDBL(huge_val)
2235   MUTATE_LDBL(copysign)
2236   MUTATE_LDBL(nextafter)
2237   MUTATE_LDBL(nexttoward)
2238   MUTATE_LDBL(remainder)
2239   MUTATE_LDBL(remquo)
2240   MUTATE_LDBL(scalbln)
2241   MUTATE_LDBL(scalbn)
2242   MUTATE_LDBL(tgamma)
2243   MUTATE_LDBL(lgamma)
2244 #undef MUTATE_LDBL
2245   default:
2246     return BuiltinID;
2247   }
2248 }
2249 
2250 RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
2251                                         const CallExpr *E,
2252                                         ReturnValueSlot ReturnValue) {
2253   const FunctionDecl *FD = GD.getDecl()->getAsFunction();
2254   // See if we can constant fold this builtin.  If so, don't emit it at all.
2255   Expr::EvalResult Result;
2256   if (E->EvaluateAsRValue(Result, CGM.getContext()) &&
2257       !Result.hasSideEffects()) {
2258     if (Result.Val.isInt())
2259       return RValue::get(llvm::ConstantInt::get(getLLVMContext(),
2260                                                 Result.Val.getInt()));
2261     if (Result.Val.isFloat())
2262       return RValue::get(llvm::ConstantFP::get(getLLVMContext(),
2263                                                Result.Val.getFloat()));
2264   }
2265 
2266   // If current long-double semantics is IEEE 128-bit, replace math builtins
2267   // of long-double with f128 equivalent.
2268   // TODO: This mutation should also be applied to other targets other than PPC,
2269   // after backend supports IEEE 128-bit style libcalls.
2270   if (getTarget().getTriple().isPPC64() &&
2271       &getTarget().getLongDoubleFormat() == &llvm::APFloat::IEEEquad())
2272     BuiltinID = mutateLongDoubleBuiltin(BuiltinID);
2273 
2274   // If the builtin has been declared explicitly with an assembler label,
2275   // disable the specialized emitting below. Ideally we should communicate the
2276   // rename in IR, or at least avoid generating the intrinsic calls that are
2277   // likely to get lowered to the renamed library functions.
2278   const unsigned BuiltinIDIfNoAsmLabel =
2279       FD->hasAttr<AsmLabelAttr>() ? 0 : BuiltinID;
2280 
2281   // There are LLVM math intrinsics/instructions corresponding to math library
2282   // functions except the LLVM op will never set errno while the math library
2283   // might. Also, math builtins have the same semantics as their math library
2284   // twins. Thus, we can transform math library and builtin calls to their
2285   // LLVM counterparts if the call is marked 'const' (known to never set errno).
2286   if (FD->hasAttr<ConstAttr>()) {
2287     switch (BuiltinIDIfNoAsmLabel) {
2288     case Builtin::BIceil:
2289     case Builtin::BIceilf:
2290     case Builtin::BIceill:
2291     case Builtin::BI__builtin_ceil:
2292     case Builtin::BI__builtin_ceilf:
2293     case Builtin::BI__builtin_ceilf16:
2294     case Builtin::BI__builtin_ceill:
2295     case Builtin::BI__builtin_ceilf128:
2296       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2297                                    Intrinsic::ceil,
2298                                    Intrinsic::experimental_constrained_ceil));
2299 
2300     case Builtin::BIcopysign:
2301     case Builtin::BIcopysignf:
2302     case Builtin::BIcopysignl:
2303     case Builtin::BI__builtin_copysign:
2304     case Builtin::BI__builtin_copysignf:
2305     case Builtin::BI__builtin_copysignf16:
2306     case Builtin::BI__builtin_copysignl:
2307     case Builtin::BI__builtin_copysignf128:
2308       return RValue::get(emitBinaryBuiltin(*this, E, Intrinsic::copysign));
2309 
2310     case Builtin::BIcos:
2311     case Builtin::BIcosf:
2312     case Builtin::BIcosl:
2313     case Builtin::BI__builtin_cos:
2314     case Builtin::BI__builtin_cosf:
2315     case Builtin::BI__builtin_cosf16:
2316     case Builtin::BI__builtin_cosl:
2317     case Builtin::BI__builtin_cosf128:
2318       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2319                                    Intrinsic::cos,
2320                                    Intrinsic::experimental_constrained_cos));
2321 
2322     case Builtin::BIexp:
2323     case Builtin::BIexpf:
2324     case Builtin::BIexpl:
2325     case Builtin::BI__builtin_exp:
2326     case Builtin::BI__builtin_expf:
2327     case Builtin::BI__builtin_expf16:
2328     case Builtin::BI__builtin_expl:
2329     case Builtin::BI__builtin_expf128:
2330       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2331                                    Intrinsic::exp,
2332                                    Intrinsic::experimental_constrained_exp));
2333 
2334     case Builtin::BIexp2:
2335     case Builtin::BIexp2f:
2336     case Builtin::BIexp2l:
2337     case Builtin::BI__builtin_exp2:
2338     case Builtin::BI__builtin_exp2f:
2339     case Builtin::BI__builtin_exp2f16:
2340     case Builtin::BI__builtin_exp2l:
2341     case Builtin::BI__builtin_exp2f128:
2342       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2343                                    Intrinsic::exp2,
2344                                    Intrinsic::experimental_constrained_exp2));
2345 
2346     case Builtin::BIfabs:
2347     case Builtin::BIfabsf:
2348     case Builtin::BIfabsl:
2349     case Builtin::BI__builtin_fabs:
2350     case Builtin::BI__builtin_fabsf:
2351     case Builtin::BI__builtin_fabsf16:
2352     case Builtin::BI__builtin_fabsl:
2353     case Builtin::BI__builtin_fabsf128:
2354       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::fabs));
2355 
2356     case Builtin::BIfloor:
2357     case Builtin::BIfloorf:
2358     case Builtin::BIfloorl:
2359     case Builtin::BI__builtin_floor:
2360     case Builtin::BI__builtin_floorf:
2361     case Builtin::BI__builtin_floorf16:
2362     case Builtin::BI__builtin_floorl:
2363     case Builtin::BI__builtin_floorf128:
2364       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2365                                    Intrinsic::floor,
2366                                    Intrinsic::experimental_constrained_floor));
2367 
2368     case Builtin::BIfma:
2369     case Builtin::BIfmaf:
2370     case Builtin::BIfmal:
2371     case Builtin::BI__builtin_fma:
2372     case Builtin::BI__builtin_fmaf:
2373     case Builtin::BI__builtin_fmaf16:
2374     case Builtin::BI__builtin_fmal:
2375     case Builtin::BI__builtin_fmaf128:
2376       return RValue::get(emitTernaryMaybeConstrainedFPBuiltin(*this, E,
2377                                    Intrinsic::fma,
2378                                    Intrinsic::experimental_constrained_fma));
2379 
2380     case Builtin::BIfmax:
2381     case Builtin::BIfmaxf:
2382     case Builtin::BIfmaxl:
2383     case Builtin::BI__builtin_fmax:
2384     case Builtin::BI__builtin_fmaxf:
2385     case Builtin::BI__builtin_fmaxf16:
2386     case Builtin::BI__builtin_fmaxl:
2387     case Builtin::BI__builtin_fmaxf128:
2388       return RValue::get(emitBinaryMaybeConstrainedFPBuiltin(*this, E,
2389                                    Intrinsic::maxnum,
2390                                    Intrinsic::experimental_constrained_maxnum));
2391 
2392     case Builtin::BIfmin:
2393     case Builtin::BIfminf:
2394     case Builtin::BIfminl:
2395     case Builtin::BI__builtin_fmin:
2396     case Builtin::BI__builtin_fminf:
2397     case Builtin::BI__builtin_fminf16:
2398     case Builtin::BI__builtin_fminl:
2399     case Builtin::BI__builtin_fminf128:
2400       return RValue::get(emitBinaryMaybeConstrainedFPBuiltin(*this, E,
2401                                    Intrinsic::minnum,
2402                                    Intrinsic::experimental_constrained_minnum));
2403 
2404     // fmod() is a special-case. It maps to the frem instruction rather than an
2405     // LLVM intrinsic.
2406     case Builtin::BIfmod:
2407     case Builtin::BIfmodf:
2408     case Builtin::BIfmodl:
2409     case Builtin::BI__builtin_fmod:
2410     case Builtin::BI__builtin_fmodf:
2411     case Builtin::BI__builtin_fmodf16:
2412     case Builtin::BI__builtin_fmodl:
2413     case Builtin::BI__builtin_fmodf128: {
2414       CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
2415       Value *Arg1 = EmitScalarExpr(E->getArg(0));
2416       Value *Arg2 = EmitScalarExpr(E->getArg(1));
2417       return RValue::get(Builder.CreateFRem(Arg1, Arg2, "fmod"));
2418     }
2419 
2420     case Builtin::BIlog:
2421     case Builtin::BIlogf:
2422     case Builtin::BIlogl:
2423     case Builtin::BI__builtin_log:
2424     case Builtin::BI__builtin_logf:
2425     case Builtin::BI__builtin_logf16:
2426     case Builtin::BI__builtin_logl:
2427     case Builtin::BI__builtin_logf128:
2428       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2429                                    Intrinsic::log,
2430                                    Intrinsic::experimental_constrained_log));
2431 
2432     case Builtin::BIlog10:
2433     case Builtin::BIlog10f:
2434     case Builtin::BIlog10l:
2435     case Builtin::BI__builtin_log10:
2436     case Builtin::BI__builtin_log10f:
2437     case Builtin::BI__builtin_log10f16:
2438     case Builtin::BI__builtin_log10l:
2439     case Builtin::BI__builtin_log10f128:
2440       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2441                                    Intrinsic::log10,
2442                                    Intrinsic::experimental_constrained_log10));
2443 
2444     case Builtin::BIlog2:
2445     case Builtin::BIlog2f:
2446     case Builtin::BIlog2l:
2447     case Builtin::BI__builtin_log2:
2448     case Builtin::BI__builtin_log2f:
2449     case Builtin::BI__builtin_log2f16:
2450     case Builtin::BI__builtin_log2l:
2451     case Builtin::BI__builtin_log2f128:
2452       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2453                                    Intrinsic::log2,
2454                                    Intrinsic::experimental_constrained_log2));
2455 
2456     case Builtin::BInearbyint:
2457     case Builtin::BInearbyintf:
2458     case Builtin::BInearbyintl:
2459     case Builtin::BI__builtin_nearbyint:
2460     case Builtin::BI__builtin_nearbyintf:
2461     case Builtin::BI__builtin_nearbyintl:
2462     case Builtin::BI__builtin_nearbyintf128:
2463       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2464                                 Intrinsic::nearbyint,
2465                                 Intrinsic::experimental_constrained_nearbyint));
2466 
2467     case Builtin::BIpow:
2468     case Builtin::BIpowf:
2469     case Builtin::BIpowl:
2470     case Builtin::BI__builtin_pow:
2471     case Builtin::BI__builtin_powf:
2472     case Builtin::BI__builtin_powf16:
2473     case Builtin::BI__builtin_powl:
2474     case Builtin::BI__builtin_powf128:
2475       return RValue::get(emitBinaryMaybeConstrainedFPBuiltin(*this, E,
2476                                    Intrinsic::pow,
2477                                    Intrinsic::experimental_constrained_pow));
2478 
2479     case Builtin::BIrint:
2480     case Builtin::BIrintf:
2481     case Builtin::BIrintl:
2482     case Builtin::BI__builtin_rint:
2483     case Builtin::BI__builtin_rintf:
2484     case Builtin::BI__builtin_rintf16:
2485     case Builtin::BI__builtin_rintl:
2486     case Builtin::BI__builtin_rintf128:
2487       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2488                                    Intrinsic::rint,
2489                                    Intrinsic::experimental_constrained_rint));
2490 
2491     case Builtin::BIround:
2492     case Builtin::BIroundf:
2493     case Builtin::BIroundl:
2494     case Builtin::BI__builtin_round:
2495     case Builtin::BI__builtin_roundf:
2496     case Builtin::BI__builtin_roundf16:
2497     case Builtin::BI__builtin_roundl:
2498     case Builtin::BI__builtin_roundf128:
2499       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2500                                    Intrinsic::round,
2501                                    Intrinsic::experimental_constrained_round));
2502 
2503     case Builtin::BIsin:
2504     case Builtin::BIsinf:
2505     case Builtin::BIsinl:
2506     case Builtin::BI__builtin_sin:
2507     case Builtin::BI__builtin_sinf:
2508     case Builtin::BI__builtin_sinf16:
2509     case Builtin::BI__builtin_sinl:
2510     case Builtin::BI__builtin_sinf128:
2511       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2512                                    Intrinsic::sin,
2513                                    Intrinsic::experimental_constrained_sin));
2514 
2515     case Builtin::BIsqrt:
2516     case Builtin::BIsqrtf:
2517     case Builtin::BIsqrtl:
2518     case Builtin::BI__builtin_sqrt:
2519     case Builtin::BI__builtin_sqrtf:
2520     case Builtin::BI__builtin_sqrtf16:
2521     case Builtin::BI__builtin_sqrtl:
2522     case Builtin::BI__builtin_sqrtf128:
2523       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2524                                    Intrinsic::sqrt,
2525                                    Intrinsic::experimental_constrained_sqrt));
2526 
2527     case Builtin::BItrunc:
2528     case Builtin::BItruncf:
2529     case Builtin::BItruncl:
2530     case Builtin::BI__builtin_trunc:
2531     case Builtin::BI__builtin_truncf:
2532     case Builtin::BI__builtin_truncf16:
2533     case Builtin::BI__builtin_truncl:
2534     case Builtin::BI__builtin_truncf128:
2535       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2536                                    Intrinsic::trunc,
2537                                    Intrinsic::experimental_constrained_trunc));
2538 
2539     case Builtin::BIlround:
2540     case Builtin::BIlroundf:
2541     case Builtin::BIlroundl:
2542     case Builtin::BI__builtin_lround:
2543     case Builtin::BI__builtin_lroundf:
2544     case Builtin::BI__builtin_lroundl:
2545     case Builtin::BI__builtin_lroundf128:
2546       return RValue::get(emitMaybeConstrainedFPToIntRoundBuiltin(
2547           *this, E, Intrinsic::lround,
2548           Intrinsic::experimental_constrained_lround));
2549 
2550     case Builtin::BIllround:
2551     case Builtin::BIllroundf:
2552     case Builtin::BIllroundl:
2553     case Builtin::BI__builtin_llround:
2554     case Builtin::BI__builtin_llroundf:
2555     case Builtin::BI__builtin_llroundl:
2556     case Builtin::BI__builtin_llroundf128:
2557       return RValue::get(emitMaybeConstrainedFPToIntRoundBuiltin(
2558           *this, E, Intrinsic::llround,
2559           Intrinsic::experimental_constrained_llround));
2560 
2561     case Builtin::BIlrint:
2562     case Builtin::BIlrintf:
2563     case Builtin::BIlrintl:
2564     case Builtin::BI__builtin_lrint:
2565     case Builtin::BI__builtin_lrintf:
2566     case Builtin::BI__builtin_lrintl:
2567     case Builtin::BI__builtin_lrintf128:
2568       return RValue::get(emitMaybeConstrainedFPToIntRoundBuiltin(
2569           *this, E, Intrinsic::lrint,
2570           Intrinsic::experimental_constrained_lrint));
2571 
2572     case Builtin::BIllrint:
2573     case Builtin::BIllrintf:
2574     case Builtin::BIllrintl:
2575     case Builtin::BI__builtin_llrint:
2576     case Builtin::BI__builtin_llrintf:
2577     case Builtin::BI__builtin_llrintl:
2578     case Builtin::BI__builtin_llrintf128:
2579       return RValue::get(emitMaybeConstrainedFPToIntRoundBuiltin(
2580           *this, E, Intrinsic::llrint,
2581           Intrinsic::experimental_constrained_llrint));
2582 
2583     default:
2584       break;
2585     }
2586   }
2587 
2588   switch (BuiltinIDIfNoAsmLabel) {
2589   default: break;
2590   case Builtin::BI__builtin___CFStringMakeConstantString:
2591   case Builtin::BI__builtin___NSStringMakeConstantString:
2592     return RValue::get(ConstantEmitter(*this).emitAbstract(E, E->getType()));
2593   case Builtin::BI__builtin_stdarg_start:
2594   case Builtin::BI__builtin_va_start:
2595   case Builtin::BI__va_start:
2596   case Builtin::BI__builtin_va_end:
2597     return RValue::get(
2598         EmitVAStartEnd(BuiltinID == Builtin::BI__va_start
2599                            ? EmitScalarExpr(E->getArg(0))
2600                            : EmitVAListRef(E->getArg(0)).getPointer(),
2601                        BuiltinID != Builtin::BI__builtin_va_end));
2602   case Builtin::BI__builtin_va_copy: {
2603     Value *DstPtr = EmitVAListRef(E->getArg(0)).getPointer();
2604     Value *SrcPtr = EmitVAListRef(E->getArg(1)).getPointer();
2605 
2606     llvm::Type *Type = Int8PtrTy;
2607 
2608     DstPtr = Builder.CreateBitCast(DstPtr, Type);
2609     SrcPtr = Builder.CreateBitCast(SrcPtr, Type);
2610     return RValue::get(Builder.CreateCall(CGM.getIntrinsic(Intrinsic::vacopy),
2611                                           {DstPtr, SrcPtr}));
2612   }
2613   case Builtin::BI__builtin_abs:
2614   case Builtin::BI__builtin_labs:
2615   case Builtin::BI__builtin_llabs: {
2616     // X < 0 ? -X : X
2617     // The negation has 'nsw' because abs of INT_MIN is undefined.
2618     Value *ArgValue = EmitScalarExpr(E->getArg(0));
2619     Value *NegOp = Builder.CreateNSWNeg(ArgValue, "neg");
2620     Constant *Zero = llvm::Constant::getNullValue(ArgValue->getType());
2621     Value *CmpResult = Builder.CreateICmpSLT(ArgValue, Zero, "abscond");
2622     Value *Result = Builder.CreateSelect(CmpResult, NegOp, ArgValue, "abs");
2623     return RValue::get(Result);
2624   }
2625   case Builtin::BI__builtin_complex: {
2626     Value *Real = EmitScalarExpr(E->getArg(0));
2627     Value *Imag = EmitScalarExpr(E->getArg(1));
2628     return RValue::getComplex({Real, Imag});
2629   }
2630   case Builtin::BI__builtin_conj:
2631   case Builtin::BI__builtin_conjf:
2632   case Builtin::BI__builtin_conjl:
2633   case Builtin::BIconj:
2634   case Builtin::BIconjf:
2635   case Builtin::BIconjl: {
2636     ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
2637     Value *Real = ComplexVal.first;
2638     Value *Imag = ComplexVal.second;
2639     Imag = Builder.CreateFNeg(Imag, "neg");
2640     return RValue::getComplex(std::make_pair(Real, Imag));
2641   }
2642   case Builtin::BI__builtin_creal:
2643   case Builtin::BI__builtin_crealf:
2644   case Builtin::BI__builtin_creall:
2645   case Builtin::BIcreal:
2646   case Builtin::BIcrealf:
2647   case Builtin::BIcreall: {
2648     ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
2649     return RValue::get(ComplexVal.first);
2650   }
2651 
2652   case Builtin::BI__builtin_dump_struct: {
2653     llvm::Type *LLVMIntTy = getTypes().ConvertType(getContext().IntTy);
2654     llvm::FunctionType *LLVMFuncType = llvm::FunctionType::get(
2655         LLVMIntTy, {llvm::Type::getInt8PtrTy(getLLVMContext())}, true);
2656 
2657     Value *Func = EmitScalarExpr(E->getArg(1)->IgnoreImpCasts());
2658     CharUnits Arg0Align = EmitPointerWithAlignment(E->getArg(0)).getAlignment();
2659 
2660     const Expr *Arg0 = E->getArg(0)->IgnoreImpCasts();
2661     QualType Arg0Type = Arg0->getType()->getPointeeType();
2662 
2663     Value *RecordPtr = EmitScalarExpr(Arg0);
2664     Value *Res = dumpRecord(*this, Arg0Type, RecordPtr, Arg0Align,
2665                             {LLVMFuncType, Func}, 0);
2666     return RValue::get(Res);
2667   }
2668 
2669   case Builtin::BI__builtin_preserve_access_index: {
    // Only enable the preserved access index region when debuginfo
    // is available, as debuginfo is needed to preserve the user-level
    // access pattern.
2673     if (!getDebugInfo()) {
2674       CGM.Error(E->getExprLoc(), "using builtin_preserve_access_index() without -g");
2675       return RValue::get(EmitScalarExpr(E->getArg(0)));
2676     }
2677 
2678     // Nested builtin_preserve_access_index() not supported
2679     if (IsInPreservedAIRegion) {
2680       CGM.Error(E->getExprLoc(), "nested builtin_preserve_access_index() not supported");
2681       return RValue::get(EmitScalarExpr(E->getArg(0)));
2682     }
2683 
2684     IsInPreservedAIRegion = true;
2685     Value *Res = EmitScalarExpr(E->getArg(0));
2686     IsInPreservedAIRegion = false;
2687     return RValue::get(Res);
2688   }
2689 
2690   case Builtin::BI__builtin_cimag:
2691   case Builtin::BI__builtin_cimagf:
2692   case Builtin::BI__builtin_cimagl:
2693   case Builtin::BIcimag:
2694   case Builtin::BIcimagf:
2695   case Builtin::BIcimagl: {
2696     ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
2697     return RValue::get(ComplexVal.second);
2698   }
2699 
2700   case Builtin::BI__builtin_clrsb:
2701   case Builtin::BI__builtin_clrsbl:
2702   case Builtin::BI__builtin_clrsbll: {
    // clrsb(x) -> clz(x < 0 ? ~x : x) - 1
2704     Value *ArgValue = EmitScalarExpr(E->getArg(0));
2705 
2706     llvm::Type *ArgType = ArgValue->getType();
2707     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
2708 
2709     llvm::Type *ResultType = ConvertType(E->getType());
2710     Value *Zero = llvm::Constant::getNullValue(ArgType);
2711     Value *IsNeg = Builder.CreateICmpSLT(ArgValue, Zero, "isneg");
2712     Value *Inverse = Builder.CreateNot(ArgValue, "not");
2713     Value *Tmp = Builder.CreateSelect(IsNeg, Inverse, ArgValue);
2714     Value *Ctlz = Builder.CreateCall(F, {Tmp, Builder.getFalse()});
2715     Value *Result = Builder.CreateSub(Ctlz, llvm::ConstantInt::get(ArgType, 1));
2716     Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
2717                                    "cast");
2718     return RValue::get(Result);
2719   }
2720   case Builtin::BI__builtin_ctzs:
2721   case Builtin::BI__builtin_ctz:
2722   case Builtin::BI__builtin_ctzl:
2723   case Builtin::BI__builtin_ctzll: {
2724     Value *ArgValue = EmitCheckedArgForBuiltin(E->getArg(0), BCK_CTZPassedZero);
2725 
2726     llvm::Type *ArgType = ArgValue->getType();
2727     Function *F = CGM.getIntrinsic(Intrinsic::cttz, ArgType);
2728 
2729     llvm::Type *ResultType = ConvertType(E->getType());
2730     Value *ZeroUndef = Builder.getInt1(getTarget().isCLZForZeroUndef());
2731     Value *Result = Builder.CreateCall(F, {ArgValue, ZeroUndef});
2732     if (Result->getType() != ResultType)
2733       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
2734                                      "cast");
2735     return RValue::get(Result);
2736   }
2737   case Builtin::BI__builtin_clzs:
2738   case Builtin::BI__builtin_clz:
2739   case Builtin::BI__builtin_clzl:
2740   case Builtin::BI__builtin_clzll: {
2741     Value *ArgValue = EmitCheckedArgForBuiltin(E->getArg(0), BCK_CLZPassedZero);
2742 
2743     llvm::Type *ArgType = ArgValue->getType();
2744     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
2745 
2746     llvm::Type *ResultType = ConvertType(E->getType());
2747     Value *ZeroUndef = Builder.getInt1(getTarget().isCLZForZeroUndef());
2748     Value *Result = Builder.CreateCall(F, {ArgValue, ZeroUndef});
2749     if (Result->getType() != ResultType)
2750       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
2751                                      "cast");
2752     return RValue::get(Result);
2753   }
2754   case Builtin::BI__builtin_ffs:
2755   case Builtin::BI__builtin_ffsl:
2756   case Builtin::BI__builtin_ffsll: {
2757     // ffs(x) -> x ? cttz(x) + 1 : 0
2758     Value *ArgValue = EmitScalarExpr(E->getArg(0));
2759 
2760     llvm::Type *ArgType = ArgValue->getType();
2761     Function *F = CGM.getIntrinsic(Intrinsic::cttz, ArgType);
2762 
2763     llvm::Type *ResultType = ConvertType(E->getType());
2764     Value *Tmp =
2765         Builder.CreateAdd(Builder.CreateCall(F, {ArgValue, Builder.getTrue()}),
2766                           llvm::ConstantInt::get(ArgType, 1));
2767     Value *Zero = llvm::Constant::getNullValue(ArgType);
2768     Value *IsZero = Builder.CreateICmpEQ(ArgValue, Zero, "iszero");
2769     Value *Result = Builder.CreateSelect(IsZero, Zero, Tmp, "ffs");
2770     if (Result->getType() != ResultType)
2771       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
2772                                      "cast");
2773     return RValue::get(Result);
2774   }
2775   case Builtin::BI__builtin_parity:
2776   case Builtin::BI__builtin_parityl:
2777   case Builtin::BI__builtin_parityll: {
2778     // parity(x) -> ctpop(x) & 1
2779     Value *ArgValue = EmitScalarExpr(E->getArg(0));
2780 
2781     llvm::Type *ArgType = ArgValue->getType();
2782     Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType);
2783 
2784     llvm::Type *ResultType = ConvertType(E->getType());
2785     Value *Tmp = Builder.CreateCall(F, ArgValue);
2786     Value *Result = Builder.CreateAnd(Tmp, llvm::ConstantInt::get(ArgType, 1));
2787     if (Result->getType() != ResultType)
2788       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
2789                                      "cast");
2790     return RValue::get(Result);
2791   }
2792   case Builtin::BI__lzcnt16:
2793   case Builtin::BI__lzcnt:
2794   case Builtin::BI__lzcnt64: {
2795     Value *ArgValue = EmitScalarExpr(E->getArg(0));
2796 
2797     llvm::Type *ArgType = ArgValue->getType();
2798     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
2799 
2800     llvm::Type *ResultType = ConvertType(E->getType());
2801     Value *Result = Builder.CreateCall(F, {ArgValue, Builder.getFalse()});
2802     if (Result->getType() != ResultType)
2803       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
2804                                      "cast");
2805     return RValue::get(Result);
2806   }
2807   case Builtin::BI__popcnt16:
2808   case Builtin::BI__popcnt:
2809   case Builtin::BI__popcnt64:
2810   case Builtin::BI__builtin_popcount:
2811   case Builtin::BI__builtin_popcountl:
2812   case Builtin::BI__builtin_popcountll: {
2813     Value *ArgValue = EmitScalarExpr(E->getArg(0));
2814 
2815     llvm::Type *ArgType = ArgValue->getType();
2816     Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType);
2817 
2818     llvm::Type *ResultType = ConvertType(E->getType());
2819     Value *Result = Builder.CreateCall(F, ArgValue);
2820     if (Result->getType() != ResultType)
2821       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
2822                                      "cast");
2823     return RValue::get(Result);
2824   }
2825   case Builtin::BI__builtin_unpredictable: {
2826     // Always return the argument of __builtin_unpredictable. LLVM does not
2827     // handle this builtin. Metadata for this builtin should be added directly
2828     // to instructions such as branches or switches that use it.
2829     return RValue::get(EmitScalarExpr(E->getArg(0)));
2830   }
2831   case Builtin::BI__builtin_expect: {
2832     Value *ArgValue = EmitScalarExpr(E->getArg(0));
2833     llvm::Type *ArgType = ArgValue->getType();
2834 
2835     Value *ExpectedValue = EmitScalarExpr(E->getArg(1));
2836     // Don't generate llvm.expect on -O0 as the backend won't use it for
2837     // anything.
2838     // Note, we still IRGen ExpectedValue because it could have side-effects.
2839     if (CGM.getCodeGenOpts().OptimizationLevel == 0)
2840       return RValue::get(ArgValue);
2841 
2842     Function *FnExpect = CGM.getIntrinsic(Intrinsic::expect, ArgType);
2843     Value *Result =
2844         Builder.CreateCall(FnExpect, {ArgValue, ExpectedValue}, "expval");
2845     return RValue::get(Result);
2846   }
2847   case Builtin::BI__builtin_expect_with_probability: {
2848     Value *ArgValue = EmitScalarExpr(E->getArg(0));
2849     llvm::Type *ArgType = ArgValue->getType();
2850 
2851     Value *ExpectedValue = EmitScalarExpr(E->getArg(1));
2852     llvm::APFloat Probability(0.0);
2853     const Expr *ProbArg = E->getArg(2);
2854     bool EvalSucceed = ProbArg->EvaluateAsFloat(Probability, CGM.getContext());
2855     assert(EvalSucceed && "probability should be able to evaluate as float");
2856     (void)EvalSucceed;
2857     bool LoseInfo = false;
2858     Probability.convert(llvm::APFloat::IEEEdouble(),
2859                         llvm::RoundingMode::Dynamic, &LoseInfo);
2860     llvm::Type *Ty = ConvertType(ProbArg->getType());
2861     Constant *Confidence = ConstantFP::get(Ty, Probability);
2862     // Don't generate llvm.expect.with.probability on -O0 as the backend
2863     // won't use it for anything.
2864     // Note, we still IRGen ExpectedValue because it could have side-effects.
2865     if (CGM.getCodeGenOpts().OptimizationLevel == 0)
2866       return RValue::get(ArgValue);
2867 
2868     Function *FnExpect =
2869         CGM.getIntrinsic(Intrinsic::expect_with_probability, ArgType);
2870     Value *Result = Builder.CreateCall(
2871         FnExpect, {ArgValue, ExpectedValue, Confidence}, "expval");
2872     return RValue::get(Result);
2873   }
2874   case Builtin::BI__builtin_assume_aligned: {
2875     const Expr *Ptr = E->getArg(0);
2876     Value *PtrValue = EmitScalarExpr(Ptr);
2877     Value *OffsetValue =
2878       (E->getNumArgs() > 2) ? EmitScalarExpr(E->getArg(2)) : nullptr;
2879 
2880     Value *AlignmentValue = EmitScalarExpr(E->getArg(1));
2881     ConstantInt *AlignmentCI = cast<ConstantInt>(AlignmentValue);
2882     if (AlignmentCI->getValue().ugt(llvm::Value::MaximumAlignment))
2883       AlignmentCI = ConstantInt::get(AlignmentCI->getType(),
2884                                      llvm::Value::MaximumAlignment);
2885 
2886     emitAlignmentAssumption(PtrValue, Ptr,
2887                             /*The expr loc is sufficient.*/ SourceLocation(),
2888                             AlignmentCI, OffsetValue);
2889     return RValue::get(PtrValue);
2890   }
2891   case Builtin::BI__assume:
2892   case Builtin::BI__builtin_assume: {
2893     if (E->getArg(0)->HasSideEffects(getContext()))
2894       return RValue::get(nullptr);
2895 
2896     Value *ArgValue = EmitScalarExpr(E->getArg(0));
2897     Function *FnAssume = CGM.getIntrinsic(Intrinsic::assume);
2898     return RValue::get(Builder.CreateCall(FnAssume, ArgValue));
2899   }
2900   case Builtin::BI__arithmetic_fence: {
2901     // Create the builtin call if FastMath is selected, and the target
2902     // supports the builtin, otherwise just return the argument.
2903     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
2904     llvm::FastMathFlags FMF = Builder.getFastMathFlags();
2905     bool isArithmeticFenceEnabled =
2906         FMF.allowReassoc() &&
2907         getContext().getTargetInfo().checkArithmeticFenceSupported();
2908     QualType ArgType = E->getArg(0)->getType();
2909     if (ArgType->isComplexType()) {
2910       if (isArithmeticFenceEnabled) {
2911         QualType ElementType = ArgType->castAs<ComplexType>()->getElementType();
2912         ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
2913         Value *Real = Builder.CreateArithmeticFence(ComplexVal.first,
2914                                                     ConvertType(ElementType));
2915         Value *Imag = Builder.CreateArithmeticFence(ComplexVal.second,
2916                                                     ConvertType(ElementType));
2917         return RValue::getComplex(std::make_pair(Real, Imag));
2918       }
2919       ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
2920       Value *Real = ComplexVal.first;
2921       Value *Imag = ComplexVal.second;
2922       return RValue::getComplex(std::make_pair(Real, Imag));
2923     }
2924     Value *ArgValue = EmitScalarExpr(E->getArg(0));
2925     if (isArithmeticFenceEnabled)
2926       return RValue::get(
2927           Builder.CreateArithmeticFence(ArgValue, ConvertType(ArgType)));
2928     return RValue::get(ArgValue);
2929   }
2930   case Builtin::BI__builtin_bswap16:
2931   case Builtin::BI__builtin_bswap32:
2932   case Builtin::BI__builtin_bswap64: {
2933     return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::bswap));
2934   }
2935   case Builtin::BI__builtin_bitreverse8:
2936   case Builtin::BI__builtin_bitreverse16:
2937   case Builtin::BI__builtin_bitreverse32:
2938   case Builtin::BI__builtin_bitreverse64: {
2939     return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::bitreverse));
2940   }
2941   case Builtin::BI__builtin_rotateleft8:
2942   case Builtin::BI__builtin_rotateleft16:
2943   case Builtin::BI__builtin_rotateleft32:
2944   case Builtin::BI__builtin_rotateleft64:
2945   case Builtin::BI_rotl8: // Microsoft variants of rotate left
2946   case Builtin::BI_rotl16:
2947   case Builtin::BI_rotl:
2948   case Builtin::BI_lrotl:
2949   case Builtin::BI_rotl64:
2950     return emitRotate(E, false);
2951 
2952   case Builtin::BI__builtin_rotateright8:
2953   case Builtin::BI__builtin_rotateright16:
2954   case Builtin::BI__builtin_rotateright32:
2955   case Builtin::BI__builtin_rotateright64:
2956   case Builtin::BI_rotr8: // Microsoft variants of rotate right
2957   case Builtin::BI_rotr16:
2958   case Builtin::BI_rotr:
2959   case Builtin::BI_lrotr:
2960   case Builtin::BI_rotr64:
2961     return emitRotate(E, true);
2962 
2963   case Builtin::BI__builtin_constant_p: {
2964     llvm::Type *ResultType = ConvertType(E->getType());
2965 
2966     const Expr *Arg = E->getArg(0);
2967     QualType ArgType = Arg->getType();
2968     // FIXME: The allowance for Obj-C pointers and block pointers is historical
2969     // and likely a mistake.
2970     if (!ArgType->isIntegralOrEnumerationType() && !ArgType->isFloatingType() &&
2971         !ArgType->isObjCObjectPointerType() && !ArgType->isBlockPointerType())
2972       // Per the GCC documentation, only numeric constants are recognized after
2973       // inlining.
2974       return RValue::get(ConstantInt::get(ResultType, 0));
2975 
2976     if (Arg->HasSideEffects(getContext()))
2977       // The argument is unevaluated, so be conservative if it might have
2978       // side-effects.
2979       return RValue::get(ConstantInt::get(ResultType, 0));
2980 
2981     Value *ArgValue = EmitScalarExpr(Arg);
2982     if (ArgType->isObjCObjectPointerType()) {
2983       // Convert Objective-C objects to id because we cannot distinguish between
2984       // LLVM types for Obj-C classes as they are opaque.
2985       ArgType = CGM.getContext().getObjCIdType();
2986       ArgValue = Builder.CreateBitCast(ArgValue, ConvertType(ArgType));
2987     }
2988     Function *F =
2989         CGM.getIntrinsic(Intrinsic::is_constant, ConvertType(ArgType));
2990     Value *Result = Builder.CreateCall(F, ArgValue);
2991     if (Result->getType() != ResultType)
2992       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/false);
2993     return RValue::get(Result);
2994   }
2995   case Builtin::BI__builtin_dynamic_object_size:
2996   case Builtin::BI__builtin_object_size: {
2997     unsigned Type =
2998         E->getArg(1)->EvaluateKnownConstInt(getContext()).getZExtValue();
2999     auto *ResType = cast<llvm::IntegerType>(ConvertType(E->getType()));
3000 
3001     // We pass this builtin onto the optimizer so that it can figure out the
3002     // object size in more complex cases.
3003     bool IsDynamic = BuiltinID == Builtin::BI__builtin_dynamic_object_size;
3004     return RValue::get(emitBuiltinObjectSize(E->getArg(0), Type, ResType,
3005                                              /*EmittedE=*/nullptr, IsDynamic));
3006   }
3007   case Builtin::BI__builtin_prefetch: {
3008     Value *Locality, *RW, *Address = EmitScalarExpr(E->getArg(0));
3009     // FIXME: Technically these constants should of type 'int', yes?
3010     RW = (E->getNumArgs() > 1) ? EmitScalarExpr(E->getArg(1)) :
3011       llvm::ConstantInt::get(Int32Ty, 0);
3012     Locality = (E->getNumArgs() > 2) ? EmitScalarExpr(E->getArg(2)) :
3013       llvm::ConstantInt::get(Int32Ty, 3);
3014     Value *Data = llvm::ConstantInt::get(Int32Ty, 1);
3015     Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType());
3016     return RValue::get(Builder.CreateCall(F, {Address, RW, Locality, Data}));
3017   }
3018   case Builtin::BI__builtin_readcyclecounter: {
3019     Function *F = CGM.getIntrinsic(Intrinsic::readcyclecounter);
3020     return RValue::get(Builder.CreateCall(F));
3021   }
3022   case Builtin::BI__builtin___clear_cache: {
3023     Value *Begin = EmitScalarExpr(E->getArg(0));
3024     Value *End = EmitScalarExpr(E->getArg(1));
3025     Function *F = CGM.getIntrinsic(Intrinsic::clear_cache);
3026     return RValue::get(Builder.CreateCall(F, {Begin, End}));
3027   }
3028   case Builtin::BI__builtin_trap:
3029     return RValue::get(EmitTrapCall(Intrinsic::trap));
3030   case Builtin::BI__debugbreak:
3031     return RValue::get(EmitTrapCall(Intrinsic::debugtrap));
3032   case Builtin::BI__builtin_unreachable: {
3033     EmitUnreachable(E->getExprLoc());
3034 
3035     // We do need to preserve an insertion point.
3036     EmitBlock(createBasicBlock("unreachable.cont"));
3037 
3038     return RValue::get(nullptr);
3039   }
3040 
3041   case Builtin::BI__builtin_powi:
3042   case Builtin::BI__builtin_powif:
3043   case Builtin::BI__builtin_powil: {
3044     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
3045     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
3046 
3047     if (Builder.getIsFPConstrained()) {
3048       CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3049       Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_powi,
3050                                      Src0->getType());
3051       return RValue::get(Builder.CreateConstrainedFPCall(F, { Src0, Src1 }));
3052     }
3053 
3054     Function *F = CGM.getIntrinsic(Intrinsic::powi,
3055                                    { Src0->getType(), Src1->getType() });
3056     return RValue::get(Builder.CreateCall(F, { Src0, Src1 }));
3057   }
3058   case Builtin::BI__builtin_isgreater:
3059   case Builtin::BI__builtin_isgreaterequal:
3060   case Builtin::BI__builtin_isless:
3061   case Builtin::BI__builtin_islessequal:
3062   case Builtin::BI__builtin_islessgreater:
3063   case Builtin::BI__builtin_isunordered: {
3064     // Ordered comparisons: we know the arguments to these are matching scalar
3065     // floating point values.
3066     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3067     // FIXME: for strictfp/IEEE-754 we need to not trap on SNaN here.
3068     Value *LHS = EmitScalarExpr(E->getArg(0));
3069     Value *RHS = EmitScalarExpr(E->getArg(1));
3070 
3071     switch (BuiltinID) {
3072     default: llvm_unreachable("Unknown ordered comparison");
3073     case Builtin::BI__builtin_isgreater:
3074       LHS = Builder.CreateFCmpOGT(LHS, RHS, "cmp");
3075       break;
3076     case Builtin::BI__builtin_isgreaterequal:
3077       LHS = Builder.CreateFCmpOGE(LHS, RHS, "cmp");
3078       break;
3079     case Builtin::BI__builtin_isless:
3080       LHS = Builder.CreateFCmpOLT(LHS, RHS, "cmp");
3081       break;
3082     case Builtin::BI__builtin_islessequal:
3083       LHS = Builder.CreateFCmpOLE(LHS, RHS, "cmp");
3084       break;
3085     case Builtin::BI__builtin_islessgreater:
3086       LHS = Builder.CreateFCmpONE(LHS, RHS, "cmp");
3087       break;
3088     case Builtin::BI__builtin_isunordered:
3089       LHS = Builder.CreateFCmpUNO(LHS, RHS, "cmp");
3090       break;
3091     }
3092     // ZExt bool to int type.
3093     return RValue::get(Builder.CreateZExt(LHS, ConvertType(E->getType())));
3094   }
3095   case Builtin::BI__builtin_isnan: {
3096     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3097     Value *V = EmitScalarExpr(E->getArg(0));
3098     llvm::Type *Ty = V->getType();
3099     const llvm::fltSemantics &Semantics = Ty->getFltSemantics();
3100     if (!Builder.getIsFPConstrained() ||
3101         Builder.getDefaultConstrainedExcept() == fp::ebIgnore ||
3102         !Ty->isIEEE()) {
3103       V = Builder.CreateFCmpUNO(V, V, "cmp");
3104       return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType())));
3105     }
3106 
3107     if (Value *Result = getTargetHooks().testFPKind(V, BuiltinID, Builder, CGM))
3108       return RValue::get(Result);
3109 
3110     // NaN has all exp bits set and a non zero significand. Therefore:
3111     // isnan(V) == ((exp mask - (abs(V) & exp mask)) < 0)
3112     unsigned bitsize = Ty->getScalarSizeInBits();
3113     llvm::IntegerType *IntTy = Builder.getIntNTy(bitsize);
3114     Value *IntV = Builder.CreateBitCast(V, IntTy);
3115     APInt AndMask = APInt::getSignedMaxValue(bitsize);
3116     Value *AbsV =
3117         Builder.CreateAnd(IntV, llvm::ConstantInt::get(IntTy, AndMask));
3118     APInt ExpMask = APFloat::getInf(Semantics).bitcastToAPInt();
3119     Value *Sub =
3120         Builder.CreateSub(llvm::ConstantInt::get(IntTy, ExpMask), AbsV);
3121     // V = sign bit (Sub) <=> V = (Sub < 0)
3122     V = Builder.CreateLShr(Sub, llvm::ConstantInt::get(IntTy, bitsize - 1));
3123     if (bitsize > 32)
3124       V = Builder.CreateTrunc(V, ConvertType(E->getType()));
3125     return RValue::get(V);
3126   }
3127 
3128   case Builtin::BI__builtin_elementwise_abs: {
3129     Value *Result;
3130     QualType QT = E->getArg(0)->getType();
3131 
3132     if (auto *VecTy = QT->getAs<VectorType>())
3133       QT = VecTy->getElementType();
3134     if (QT->isIntegerType())
3135       Result = Builder.CreateBinaryIntrinsic(
3136           llvm::Intrinsic::abs, EmitScalarExpr(E->getArg(0)),
3137           Builder.getFalse(), nullptr, "elt.abs");
3138     else
3139       Result = emitUnaryBuiltin(*this, E, llvm::Intrinsic::fabs, "elt.abs");
3140 
3141     return RValue::get(Result);
3142   }
3143 
3144   case Builtin::BI__builtin_elementwise_ceil:
3145     return RValue::get(
3146         emitUnaryBuiltin(*this, E, llvm::Intrinsic::ceil, "elt.ceil"));
3147   case Builtin::BI__builtin_elementwise_floor:
3148     return RValue::get(
3149         emitUnaryBuiltin(*this, E, llvm::Intrinsic::floor, "elt.floor"));
3150   case Builtin::BI__builtin_elementwise_roundeven:
3151     return RValue::get(emitUnaryBuiltin(*this, E, llvm::Intrinsic::roundeven,
3152                                         "elt.roundeven"));
3153   case Builtin::BI__builtin_elementwise_trunc:
3154     return RValue::get(
3155         emitUnaryBuiltin(*this, E, llvm::Intrinsic::trunc, "elt.trunc"));
3156 
3157   case Builtin::BI__builtin_elementwise_add_sat:
3158   case Builtin::BI__builtin_elementwise_sub_sat: {
3159     Value *Op0 = EmitScalarExpr(E->getArg(0));
3160     Value *Op1 = EmitScalarExpr(E->getArg(1));
3161     Value *Result;
3162     assert(Op0->getType()->isIntOrIntVectorTy() && "integer type expected");
3163     QualType Ty = E->getArg(0)->getType();
3164     if (auto *VecTy = Ty->getAs<VectorType>())
3165       Ty = VecTy->getElementType();
3166     bool IsSigned = Ty->isSignedIntegerType();
3167     unsigned Opc;
3168     if (BuiltinIDIfNoAsmLabel == Builtin::BI__builtin_elementwise_add_sat)
3169       Opc = IsSigned ? llvm::Intrinsic::sadd_sat : llvm::Intrinsic::uadd_sat;
3170     else
3171       Opc = IsSigned ? llvm::Intrinsic::ssub_sat : llvm::Intrinsic::usub_sat;
3172     Result = Builder.CreateBinaryIntrinsic(Opc, Op0, Op1, nullptr, "elt.sat");
3173     return RValue::get(Result);
3174   }
3175 
3176   case Builtin::BI__builtin_elementwise_max: {
3177     Value *Op0 = EmitScalarExpr(E->getArg(0));
3178     Value *Op1 = EmitScalarExpr(E->getArg(1));
3179     Value *Result;
3180     if (Op0->getType()->isIntOrIntVectorTy()) {
3181       QualType Ty = E->getArg(0)->getType();
3182       if (auto *VecTy = Ty->getAs<VectorType>())
3183         Ty = VecTy->getElementType();
3184       Result = Builder.CreateBinaryIntrinsic(Ty->isSignedIntegerType()
3185                                                  ? llvm::Intrinsic::smax
3186                                                  : llvm::Intrinsic::umax,
3187                                              Op0, Op1, nullptr, "elt.max");
3188     } else
3189       Result = Builder.CreateMaxNum(Op0, Op1, "elt.max");
3190     return RValue::get(Result);
3191   }
3192   case Builtin::BI__builtin_elementwise_min: {
3193     Value *Op0 = EmitScalarExpr(E->getArg(0));
3194     Value *Op1 = EmitScalarExpr(E->getArg(1));
3195     Value *Result;
3196     if (Op0->getType()->isIntOrIntVectorTy()) {
3197       QualType Ty = E->getArg(0)->getType();
3198       if (auto *VecTy = Ty->getAs<VectorType>())
3199         Ty = VecTy->getElementType();
3200       Result = Builder.CreateBinaryIntrinsic(Ty->isSignedIntegerType()
3201                                                  ? llvm::Intrinsic::smin
3202                                                  : llvm::Intrinsic::umin,
3203                                              Op0, Op1, nullptr, "elt.min");
3204     } else
3205       Result = Builder.CreateMinNum(Op0, Op1, "elt.min");
3206     return RValue::get(Result);
3207   }
3208 
3209   case Builtin::BI__builtin_reduce_max: {
3210     auto GetIntrinsicID = [](QualType QT) {
3211       if (auto *VecTy = QT->getAs<VectorType>())
3212         QT = VecTy->getElementType();
3213       if (QT->isSignedIntegerType())
3214         return llvm::Intrinsic::vector_reduce_smax;
3215       if (QT->isUnsignedIntegerType())
3216         return llvm::Intrinsic::vector_reduce_umax;
3217       assert(QT->isFloatingType() && "must have a float here");
3218       return llvm::Intrinsic::vector_reduce_fmax;
3219     };
3220     return RValue::get(emitUnaryBuiltin(
3221         *this, E, GetIntrinsicID(E->getArg(0)->getType()), "rdx.min"));
3222   }
3223 
3224   case Builtin::BI__builtin_reduce_min: {
3225     auto GetIntrinsicID = [](QualType QT) {
3226       if (auto *VecTy = QT->getAs<VectorType>())
3227         QT = VecTy->getElementType();
3228       if (QT->isSignedIntegerType())
3229         return llvm::Intrinsic::vector_reduce_smin;
3230       if (QT->isUnsignedIntegerType())
3231         return llvm::Intrinsic::vector_reduce_umin;
3232       assert(QT->isFloatingType() && "must have a float here");
3233       return llvm::Intrinsic::vector_reduce_fmin;
3234     };
3235 
3236     return RValue::get(emitUnaryBuiltin(
3237         *this, E, GetIntrinsicID(E->getArg(0)->getType()), "rdx.min"));
3238   }
3239 
3240   case Builtin::BI__builtin_reduce_xor:
3241     return RValue::get(emitUnaryBuiltin(
3242         *this, E, llvm::Intrinsic::vector_reduce_xor, "rdx.xor"));
3243   case Builtin::BI__builtin_reduce_or:
3244     return RValue::get(emitUnaryBuiltin(
3245         *this, E, llvm::Intrinsic::vector_reduce_or, "rdx.or"));
3246   case Builtin::BI__builtin_reduce_and:
3247     return RValue::get(emitUnaryBuiltin(
3248         *this, E, llvm::Intrinsic::vector_reduce_and, "rdx.and"));
3249 
3250   case Builtin::BI__builtin_matrix_transpose: {
3251     auto *MatrixTy = E->getArg(0)->getType()->castAs<ConstantMatrixType>();
3252     Value *MatValue = EmitScalarExpr(E->getArg(0));
3253     MatrixBuilder MB(Builder);
3254     Value *Result = MB.CreateMatrixTranspose(MatValue, MatrixTy->getNumRows(),
3255                                              MatrixTy->getNumColumns());
3256     return RValue::get(Result);
3257   }
3258 
3259   case Builtin::BI__builtin_matrix_column_major_load: {
3260     MatrixBuilder MB(Builder);
3261     // Emit everything that isn't dependent on the first parameter type
3262     Value *Stride = EmitScalarExpr(E->getArg(3));
3263     const auto *ResultTy = E->getType()->getAs<ConstantMatrixType>();
3264     auto *PtrTy = E->getArg(0)->getType()->getAs<PointerType>();
3265     assert(PtrTy && "arg0 must be of pointer type");
3266     bool IsVolatile = PtrTy->getPointeeType().isVolatileQualified();
3267 
3268     Address Src = EmitPointerWithAlignment(E->getArg(0));
3269     EmitNonNullArgCheck(RValue::get(Src.getPointer()), E->getArg(0)->getType(),
3270                         E->getArg(0)->getExprLoc(), FD, 0);
3271     Value *Result = MB.CreateColumnMajorLoad(
3272         Src.getElementType(), Src.getPointer(),
3273         Align(Src.getAlignment().getQuantity()), Stride, IsVolatile,
3274         ResultTy->getNumRows(), ResultTy->getNumColumns(),
3275         "matrix");
3276     return RValue::get(Result);
3277   }
3278 
3279   case Builtin::BI__builtin_matrix_column_major_store: {
3280     MatrixBuilder MB(Builder);
3281     Value *Matrix = EmitScalarExpr(E->getArg(0));
3282     Address Dst = EmitPointerWithAlignment(E->getArg(1));
3283     Value *Stride = EmitScalarExpr(E->getArg(2));
3284 
3285     const auto *MatrixTy = E->getArg(0)->getType()->getAs<ConstantMatrixType>();
3286     auto *PtrTy = E->getArg(1)->getType()->getAs<PointerType>();
3287     assert(PtrTy && "arg1 must be of pointer type");
3288     bool IsVolatile = PtrTy->getPointeeType().isVolatileQualified();
3289 
3290     EmitNonNullArgCheck(RValue::get(Dst.getPointer()), E->getArg(1)->getType(),
3291                         E->getArg(1)->getExprLoc(), FD, 0);
3292     Value *Result = MB.CreateColumnMajorStore(
3293         Matrix, Dst.getPointer(), Align(Dst.getAlignment().getQuantity()),
3294         Stride, IsVolatile, MatrixTy->getNumRows(), MatrixTy->getNumColumns());
3295     return RValue::get(Result);
3296   }
3297 
3298   case Builtin::BIfinite:
3299   case Builtin::BI__finite:
3300   case Builtin::BIfinitef:
3301   case Builtin::BI__finitef:
3302   case Builtin::BIfinitel:
3303   case Builtin::BI__finitel:
3304   case Builtin::BI__builtin_isinf:
3305   case Builtin::BI__builtin_isfinite: {
3306     // isinf(x)    --> fabs(x) == infinity
3307     // isfinite(x) --> fabs(x) != infinity
3308     // x != NaN via the ordered compare in either case.
3309     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3310     Value *V = EmitScalarExpr(E->getArg(0));
3311     llvm::Type *Ty = V->getType();
3312     if (!Builder.getIsFPConstrained() ||
3313         Builder.getDefaultConstrainedExcept() == fp::ebIgnore ||
3314         !Ty->isIEEE()) {
3315       Value *Fabs = EmitFAbs(*this, V);
3316       Constant *Infinity = ConstantFP::getInfinity(V->getType());
3317       CmpInst::Predicate Pred = (BuiltinID == Builtin::BI__builtin_isinf)
3318                                     ? CmpInst::FCMP_OEQ
3319                                     : CmpInst::FCMP_ONE;
3320       Value *FCmp = Builder.CreateFCmp(Pred, Fabs, Infinity, "cmpinf");
3321       return RValue::get(Builder.CreateZExt(FCmp, ConvertType(E->getType())));
3322     }
3323 
3324     if (Value *Result = getTargetHooks().testFPKind(V, BuiltinID, Builder, CGM))
3325       return RValue::get(Result);
3326 
3327     // Inf values have all exp bits set and a zero significand. Therefore:
3328     // isinf(V) == ((V << 1) == ((exp mask) << 1))
3329     // isfinite(V) == ((V << 1) < ((exp mask) << 1)) using unsigned comparison
3330     unsigned bitsize = Ty->getScalarSizeInBits();
3331     llvm::IntegerType *IntTy = Builder.getIntNTy(bitsize);
3332     Value *IntV = Builder.CreateBitCast(V, IntTy);
3333     Value *Shl1 = Builder.CreateShl(IntV, 1);
3334     const llvm::fltSemantics &Semantics = Ty->getFltSemantics();
3335     APInt ExpMask = APFloat::getInf(Semantics).bitcastToAPInt();
3336     Value *ExpMaskShl1 = llvm::ConstantInt::get(IntTy, ExpMask.shl(1));
3337     if (BuiltinID == Builtin::BI__builtin_isinf)
3338       V = Builder.CreateICmpEQ(Shl1, ExpMaskShl1);
3339     else
3340       V = Builder.CreateICmpULT(Shl1, ExpMaskShl1);
3341     return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType())));
3342   }
3343 
3344   case Builtin::BI__builtin_isinf_sign: {
3345     // isinf_sign(x) -> fabs(x) == infinity ? (signbit(x) ? -1 : 1) : 0
3346     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3347     // FIXME: for strictfp/IEEE-754 we need to not trap on SNaN here.
3348     Value *Arg = EmitScalarExpr(E->getArg(0));
3349     Value *AbsArg = EmitFAbs(*this, Arg);
3350     Value *IsInf = Builder.CreateFCmpOEQ(
3351         AbsArg, ConstantFP::getInfinity(Arg->getType()), "isinf");
3352     Value *IsNeg = EmitSignBit(*this, Arg);
3353 
3354     llvm::Type *IntTy = ConvertType(E->getType());
3355     Value *Zero = Constant::getNullValue(IntTy);
3356     Value *One = ConstantInt::get(IntTy, 1);
3357     Value *NegativeOne = ConstantInt::get(IntTy, -1);
3358     Value *SignResult = Builder.CreateSelect(IsNeg, NegativeOne, One);
3359     Value *Result = Builder.CreateSelect(IsInf, SignResult, Zero);
3360     return RValue::get(Result);
3361   }
3362 
3363   case Builtin::BI__builtin_isnormal: {
3364     // isnormal(x) --> x == x && fabsf(x) < infinity && fabsf(x) >= float_min
3365     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3366     // FIXME: for strictfp/IEEE-754 we need to not trap on SNaN here.
3367     Value *V = EmitScalarExpr(E->getArg(0));
3368     Value *Eq = Builder.CreateFCmpOEQ(V, V, "iseq");
3369 
3370     Value *Abs = EmitFAbs(*this, V);
3371     Value *IsLessThanInf =
3372       Builder.CreateFCmpULT(Abs, ConstantFP::getInfinity(V->getType()),"isinf");
3373     APFloat Smallest = APFloat::getSmallestNormalized(
3374                    getContext().getFloatTypeSemantics(E->getArg(0)->getType()));
3375     Value *IsNormal =
3376       Builder.CreateFCmpUGE(Abs, ConstantFP::get(V->getContext(), Smallest),
3377                             "isnormal");
3378     V = Builder.CreateAnd(Eq, IsLessThanInf, "and");
3379     V = Builder.CreateAnd(V, IsNormal, "and");
3380     return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType())));
3381   }
3382 
3383   case Builtin::BI__builtin_flt_rounds: {
3384     Function *F = CGM.getIntrinsic(Intrinsic::flt_rounds);
3385 
3386     llvm::Type *ResultType = ConvertType(E->getType());
3387     Value *Result = Builder.CreateCall(F);
3388     if (Result->getType() != ResultType)
3389       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
3390                                      "cast");
3391     return RValue::get(Result);
3392   }
3393 
  // Emits __builtin_fpclassify as a chain of compare-and-branch blocks that all
  // feed one PHI in a common end block. As used below, the call arguments are:
  //   arg 0: value for NaN          arg 1: value for infinity
  //   arg 2: value for normal       arg 3: value for subnormal
  //   arg 4: value for zero         arg 5: the floating-point value to classify
  case Builtin::BI__builtin_fpclassify: {
    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
    // FIXME: for strictfp/IEEE-754 we need to not trap on SNaN here.
    Value *V = EmitScalarExpr(E->getArg(5));
    llvm::Type *Ty = ConvertType(E->getArg(5)->getType());

    // Create Result: a PHI in the end block with room for four incoming
    // values (zero, NaN, infinity, and the normal/subnormal select).
    BasicBlock *Begin = Builder.GetInsertBlock();
    BasicBlock *End = createBasicBlock("fpclassify_end", this->CurFn);
    Builder.SetInsertPoint(End);
    PHINode *Result =
      Builder.CreatePHI(ConvertType(E->getArg(0)->getType()), 4,
                        "fpclassify_result");

    // if (V==0) return FP_ZERO
    Builder.SetInsertPoint(Begin);
    Value *IsZero = Builder.CreateFCmpOEQ(V, Constant::getNullValue(Ty),
                                          "iszero");
    Value *ZeroLiteral = EmitScalarExpr(E->getArg(4));
    BasicBlock *NotZero = createBasicBlock("fpclassify_not_zero", this->CurFn);
    Builder.CreateCondBr(IsZero, End, NotZero);
    Result->addIncoming(ZeroLiteral, Begin);

    // if (V != V) return FP_NAN
    Builder.SetInsertPoint(NotZero);
    Value *IsNan = Builder.CreateFCmpUNO(V, V, "cmp");
    Value *NanLiteral = EmitScalarExpr(E->getArg(0));
    BasicBlock *NotNan = createBasicBlock("fpclassify_not_nan", this->CurFn);
    Builder.CreateCondBr(IsNan, End, NotNan);
    Result->addIncoming(NanLiteral, NotZero);

    // if (fabs(V) == infinity) return FP_INFINITY
    Builder.SetInsertPoint(NotNan);
    Value *VAbs = EmitFAbs(*this, V);
    Value *IsInf =
      Builder.CreateFCmpOEQ(VAbs, ConstantFP::getInfinity(V->getType()),
                            "isinf");
    Value *InfLiteral = EmitScalarExpr(E->getArg(1));
    BasicBlock *NotInf = createBasicBlock("fpclassify_not_inf", this->CurFn);
    Builder.CreateCondBr(IsInf, End, NotInf);
    Result->addIncoming(InfLiteral, NotNan);

    // if (fabs(V) >= MIN_NORMAL) return FP_NORMAL else FP_SUBNORMAL
    // This last step needs no further branch: a select chooses between the
    // two remaining classifications.
    Builder.SetInsertPoint(NotInf);
    APFloat Smallest = APFloat::getSmallestNormalized(
        getContext().getFloatTypeSemantics(E->getArg(5)->getType()));
    Value *IsNormal =
      Builder.CreateFCmpUGE(VAbs, ConstantFP::get(V->getContext(), Smallest),
                            "isnormal");
    Value *NormalResult =
      Builder.CreateSelect(IsNormal, EmitScalarExpr(E->getArg(2)),
                           EmitScalarExpr(E->getArg(3)));
    Builder.CreateBr(End);
    Result->addIncoming(NormalResult, NotInf);

    // return Result, leaving the builder positioned in the end block.
    Builder.SetInsertPoint(End);
    return RValue::get(Result);
  }
3453 
3454   case Builtin::BIalloca:
3455   case Builtin::BI_alloca:
3456   case Builtin::BI__builtin_alloca_uninitialized:
3457   case Builtin::BI__builtin_alloca: {
3458     Value *Size = EmitScalarExpr(E->getArg(0));
3459     const TargetInfo &TI = getContext().getTargetInfo();
3460     // The alignment of the alloca should correspond to __BIGGEST_ALIGNMENT__.
3461     const Align SuitableAlignmentInBytes =
3462         CGM.getContext()
3463             .toCharUnitsFromBits(TI.getSuitableAlign())
3464             .getAsAlign();
3465     AllocaInst *AI = Builder.CreateAlloca(Builder.getInt8Ty(), Size);
3466     AI->setAlignment(SuitableAlignmentInBytes);
3467     if (BuiltinID != Builtin::BI__builtin_alloca_uninitialized)
3468       initializeAlloca(*this, AI, Size, SuitableAlignmentInBytes);
3469     return RValue::get(AI);
3470   }
3471 
3472   case Builtin::BI__builtin_alloca_with_align_uninitialized:
3473   case Builtin::BI__builtin_alloca_with_align: {
3474     Value *Size = EmitScalarExpr(E->getArg(0));
3475     Value *AlignmentInBitsValue = EmitScalarExpr(E->getArg(1));
3476     auto *AlignmentInBitsCI = cast<ConstantInt>(AlignmentInBitsValue);
3477     unsigned AlignmentInBits = AlignmentInBitsCI->getZExtValue();
3478     const Align AlignmentInBytes =
3479         CGM.getContext().toCharUnitsFromBits(AlignmentInBits).getAsAlign();
3480     AllocaInst *AI = Builder.CreateAlloca(Builder.getInt8Ty(), Size);
3481     AI->setAlignment(AlignmentInBytes);
3482     if (BuiltinID != Builtin::BI__builtin_alloca_with_align_uninitialized)
3483       initializeAlloca(*this, AI, Size, AlignmentInBytes);
3484     return RValue::get(AI);
3485   }
3486 
3487   case Builtin::BIbzero:
3488   case Builtin::BI__builtin_bzero: {
3489     Address Dest = EmitPointerWithAlignment(E->getArg(0));
3490     Value *SizeVal = EmitScalarExpr(E->getArg(1));
3491     EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
3492                         E->getArg(0)->getExprLoc(), FD, 0);
3493     Builder.CreateMemSet(Dest, Builder.getInt8(0), SizeVal, false);
3494     return RValue::get(nullptr);
3495   }
3496   case Builtin::BImemcpy:
3497   case Builtin::BI__builtin_memcpy:
3498   case Builtin::BImempcpy:
3499   case Builtin::BI__builtin_mempcpy: {
3500     Address Dest = EmitPointerWithAlignment(E->getArg(0));
3501     Address Src = EmitPointerWithAlignment(E->getArg(1));
3502     Value *SizeVal = EmitScalarExpr(E->getArg(2));
3503     EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
3504                         E->getArg(0)->getExprLoc(), FD, 0);
3505     EmitNonNullArgCheck(RValue::get(Src.getPointer()), E->getArg(1)->getType(),
3506                         E->getArg(1)->getExprLoc(), FD, 1);
3507     Builder.CreateMemCpy(Dest, Src, SizeVal, false);
3508     if (BuiltinID == Builtin::BImempcpy ||
3509         BuiltinID == Builtin::BI__builtin_mempcpy)
3510       return RValue::get(Builder.CreateInBoundsGEP(Dest.getElementType(),
3511                                                    Dest.getPointer(), SizeVal));
3512     else
3513       return RValue::get(Dest.getPointer());
3514   }
3515 
3516   case Builtin::BI__builtin_memcpy_inline: {
3517     Address Dest = EmitPointerWithAlignment(E->getArg(0));
3518     Address Src = EmitPointerWithAlignment(E->getArg(1));
3519     uint64_t Size =
3520         E->getArg(2)->EvaluateKnownConstInt(getContext()).getZExtValue();
3521     EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
3522                         E->getArg(0)->getExprLoc(), FD, 0);
3523     EmitNonNullArgCheck(RValue::get(Src.getPointer()), E->getArg(1)->getType(),
3524                         E->getArg(1)->getExprLoc(), FD, 1);
3525     Builder.CreateMemCpyInline(Dest, Src, Size);
3526     return RValue::get(nullptr);
3527   }
3528 
  // __builtin_char_memchr emits nothing here: BuiltinID is rewritten to
  // __builtin_memchr and control leaves the switch, so whatever code follows
  // the switch sees the memchr ID instead.
  case Builtin::BI__builtin_char_memchr:
    BuiltinID = Builtin::BI__builtin_memchr;
    break;
3532 
3533   case Builtin::BI__builtin___memcpy_chk: {
3534     // fold __builtin_memcpy_chk(x, y, cst1, cst2) to memcpy iff cst1<=cst2.
3535     Expr::EvalResult SizeResult, DstSizeResult;
3536     if (!E->getArg(2)->EvaluateAsInt(SizeResult, CGM.getContext()) ||
3537         !E->getArg(3)->EvaluateAsInt(DstSizeResult, CGM.getContext()))
3538       break;
3539     llvm::APSInt Size = SizeResult.Val.getInt();
3540     llvm::APSInt DstSize = DstSizeResult.Val.getInt();
3541     if (Size.ugt(DstSize))
3542       break;
3543     Address Dest = EmitPointerWithAlignment(E->getArg(0));
3544     Address Src = EmitPointerWithAlignment(E->getArg(1));
3545     Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size);
3546     Builder.CreateMemCpy(Dest, Src, SizeVal, false);
3547     return RValue::get(Dest.getPointer());
3548   }
3549 
3550   case Builtin::BI__builtin_objc_memmove_collectable: {
3551     Address DestAddr = EmitPointerWithAlignment(E->getArg(0));
3552     Address SrcAddr = EmitPointerWithAlignment(E->getArg(1));
3553     Value *SizeVal = EmitScalarExpr(E->getArg(2));
3554     CGM.getObjCRuntime().EmitGCMemmoveCollectable(*this,
3555                                                   DestAddr, SrcAddr, SizeVal);
3556     return RValue::get(DestAddr.getPointer());
3557   }
3558 
3559   case Builtin::BI__builtin___memmove_chk: {
3560     // fold __builtin_memmove_chk(x, y, cst1, cst2) to memmove iff cst1<=cst2.
3561     Expr::EvalResult SizeResult, DstSizeResult;
3562     if (!E->getArg(2)->EvaluateAsInt(SizeResult, CGM.getContext()) ||
3563         !E->getArg(3)->EvaluateAsInt(DstSizeResult, CGM.getContext()))
3564       break;
3565     llvm::APSInt Size = SizeResult.Val.getInt();
3566     llvm::APSInt DstSize = DstSizeResult.Val.getInt();
3567     if (Size.ugt(DstSize))
3568       break;
3569     Address Dest = EmitPointerWithAlignment(E->getArg(0));
3570     Address Src = EmitPointerWithAlignment(E->getArg(1));
3571     Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size);
3572     Builder.CreateMemMove(Dest, Src, SizeVal, false);
3573     return RValue::get(Dest.getPointer());
3574   }
3575 
3576   case Builtin::BImemmove:
3577   case Builtin::BI__builtin_memmove: {
3578     Address Dest = EmitPointerWithAlignment(E->getArg(0));
3579     Address Src = EmitPointerWithAlignment(E->getArg(1));
3580     Value *SizeVal = EmitScalarExpr(E->getArg(2));
3581     EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
3582                         E->getArg(0)->getExprLoc(), FD, 0);
3583     EmitNonNullArgCheck(RValue::get(Src.getPointer()), E->getArg(1)->getType(),
3584                         E->getArg(1)->getExprLoc(), FD, 1);
3585     Builder.CreateMemMove(Dest, Src, SizeVal, false);
3586     return RValue::get(Dest.getPointer());
3587   }
3588   case Builtin::BImemset:
3589   case Builtin::BI__builtin_memset: {
3590     Address Dest = EmitPointerWithAlignment(E->getArg(0));
3591     Value *ByteVal = Builder.CreateTrunc(EmitScalarExpr(E->getArg(1)),
3592                                          Builder.getInt8Ty());
3593     Value *SizeVal = EmitScalarExpr(E->getArg(2));
3594     EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
3595                         E->getArg(0)->getExprLoc(), FD, 0);
3596     Builder.CreateMemSet(Dest, ByteVal, SizeVal, false);
3597     return RValue::get(Dest.getPointer());
3598   }
3599   case Builtin::BI__builtin___memset_chk: {
3600     // fold __builtin_memset_chk(x, y, cst1, cst2) to memset iff cst1<=cst2.
3601     Expr::EvalResult SizeResult, DstSizeResult;
3602     if (!E->getArg(2)->EvaluateAsInt(SizeResult, CGM.getContext()) ||
3603         !E->getArg(3)->EvaluateAsInt(DstSizeResult, CGM.getContext()))
3604       break;
3605     llvm::APSInt Size = SizeResult.Val.getInt();
3606     llvm::APSInt DstSize = DstSizeResult.Val.getInt();
3607     if (Size.ugt(DstSize))
3608       break;
3609     Address Dest = EmitPointerWithAlignment(E->getArg(0));
3610     Value *ByteVal = Builder.CreateTrunc(EmitScalarExpr(E->getArg(1)),
3611                                          Builder.getInt8Ty());
3612     Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size);
3613     Builder.CreateMemSet(Dest, ByteVal, SizeVal, false);
3614     return RValue::get(Dest.getPointer());
3615   }
3616   case Builtin::BI__builtin_wmemchr: {
3617     // The MSVC runtime library does not provide a definition of wmemchr, so we
3618     // need an inline implementation.
3619     if (!getTarget().getTriple().isOSMSVCRT())
3620       break;
3621 
3622     llvm::Type *WCharTy = ConvertType(getContext().WCharTy);
3623     Value *Str = EmitScalarExpr(E->getArg(0));
3624     Value *Chr = EmitScalarExpr(E->getArg(1));
3625     Value *Size = EmitScalarExpr(E->getArg(2));
3626 
3627     BasicBlock *Entry = Builder.GetInsertBlock();
3628     BasicBlock *CmpEq = createBasicBlock("wmemchr.eq");
3629     BasicBlock *Next = createBasicBlock("wmemchr.next");
3630     BasicBlock *Exit = createBasicBlock("wmemchr.exit");
3631     Value *SizeEq0 = Builder.CreateICmpEQ(Size, ConstantInt::get(SizeTy, 0));
3632     Builder.CreateCondBr(SizeEq0, Exit, CmpEq);
3633 
3634     EmitBlock(CmpEq);
3635     PHINode *StrPhi = Builder.CreatePHI(Str->getType(), 2);
3636     StrPhi->addIncoming(Str, Entry);
3637     PHINode *SizePhi = Builder.CreatePHI(SizeTy, 2);
3638     SizePhi->addIncoming(Size, Entry);
3639     CharUnits WCharAlign =
3640         getContext().getTypeAlignInChars(getContext().WCharTy);
3641     Value *StrCh = Builder.CreateAlignedLoad(WCharTy, StrPhi, WCharAlign);
3642     Value *FoundChr = Builder.CreateConstInBoundsGEP1_32(WCharTy, StrPhi, 0);
3643     Value *StrEqChr = Builder.CreateICmpEQ(StrCh, Chr);
3644     Builder.CreateCondBr(StrEqChr, Exit, Next);
3645 
3646     EmitBlock(Next);
3647     Value *NextStr = Builder.CreateConstInBoundsGEP1_32(WCharTy, StrPhi, 1);
3648     Value *NextSize = Builder.CreateSub(SizePhi, ConstantInt::get(SizeTy, 1));
3649     Value *NextSizeEq0 =
3650         Builder.CreateICmpEQ(NextSize, ConstantInt::get(SizeTy, 0));
3651     Builder.CreateCondBr(NextSizeEq0, Exit, CmpEq);
3652     StrPhi->addIncoming(NextStr, Next);
3653     SizePhi->addIncoming(NextSize, Next);
3654 
3655     EmitBlock(Exit);
3656     PHINode *Ret = Builder.CreatePHI(Str->getType(), 3);
3657     Ret->addIncoming(llvm::Constant::getNullValue(Str->getType()), Entry);
3658     Ret->addIncoming(llvm::Constant::getNullValue(Str->getType()), Next);
3659     Ret->addIncoming(FoundChr, CmpEq);
3660     return RValue::get(Ret);
3661   }
3662   case Builtin::BI__builtin_wmemcmp: {
3663     // The MSVC runtime library does not provide a definition of wmemcmp, so we
3664     // need an inline implementation.
3665     if (!getTarget().getTriple().isOSMSVCRT())
3666       break;
3667 
3668     llvm::Type *WCharTy = ConvertType(getContext().WCharTy);
3669 
3670     Value *Dst = EmitScalarExpr(E->getArg(0));
3671     Value *Src = EmitScalarExpr(E->getArg(1));
3672     Value *Size = EmitScalarExpr(E->getArg(2));
3673 
3674     BasicBlock *Entry = Builder.GetInsertBlock();
3675     BasicBlock *CmpGT = createBasicBlock("wmemcmp.gt");
3676     BasicBlock *CmpLT = createBasicBlock("wmemcmp.lt");
3677     BasicBlock *Next = createBasicBlock("wmemcmp.next");
3678     BasicBlock *Exit = createBasicBlock("wmemcmp.exit");
3679     Value *SizeEq0 = Builder.CreateICmpEQ(Size, ConstantInt::get(SizeTy, 0));
3680     Builder.CreateCondBr(SizeEq0, Exit, CmpGT);
3681 
3682     EmitBlock(CmpGT);
3683     PHINode *DstPhi = Builder.CreatePHI(Dst->getType(), 2);
3684     DstPhi->addIncoming(Dst, Entry);
3685     PHINode *SrcPhi = Builder.CreatePHI(Src->getType(), 2);
3686     SrcPhi->addIncoming(Src, Entry);
3687     PHINode *SizePhi = Builder.CreatePHI(SizeTy, 2);
3688     SizePhi->addIncoming(Size, Entry);
3689     CharUnits WCharAlign =
3690         getContext().getTypeAlignInChars(getContext().WCharTy);
3691     Value *DstCh = Builder.CreateAlignedLoad(WCharTy, DstPhi, WCharAlign);
3692     Value *SrcCh = Builder.CreateAlignedLoad(WCharTy, SrcPhi, WCharAlign);
3693     Value *DstGtSrc = Builder.CreateICmpUGT(DstCh, SrcCh);
3694     Builder.CreateCondBr(DstGtSrc, Exit, CmpLT);
3695 
3696     EmitBlock(CmpLT);
3697     Value *DstLtSrc = Builder.CreateICmpULT(DstCh, SrcCh);
3698     Builder.CreateCondBr(DstLtSrc, Exit, Next);
3699 
3700     EmitBlock(Next);
3701     Value *NextDst = Builder.CreateConstInBoundsGEP1_32(WCharTy, DstPhi, 1);
3702     Value *NextSrc = Builder.CreateConstInBoundsGEP1_32(WCharTy, SrcPhi, 1);
3703     Value *NextSize = Builder.CreateSub(SizePhi, ConstantInt::get(SizeTy, 1));
3704     Value *NextSizeEq0 =
3705         Builder.CreateICmpEQ(NextSize, ConstantInt::get(SizeTy, 0));
3706     Builder.CreateCondBr(NextSizeEq0, Exit, CmpGT);
3707     DstPhi->addIncoming(NextDst, Next);
3708     SrcPhi->addIncoming(NextSrc, Next);
3709     SizePhi->addIncoming(NextSize, Next);
3710 
3711     EmitBlock(Exit);
3712     PHINode *Ret = Builder.CreatePHI(IntTy, 4);
3713     Ret->addIncoming(ConstantInt::get(IntTy, 0), Entry);
3714     Ret->addIncoming(ConstantInt::get(IntTy, 1), CmpGT);
3715     Ret->addIncoming(ConstantInt::get(IntTy, -1), CmpLT);
3716     Ret->addIncoming(ConstantInt::get(IntTy, 0), Next);
3717     return RValue::get(Ret);
3718   }
3719   case Builtin::BI__builtin_dwarf_cfa: {
3720     // The offset in bytes from the first argument to the CFA.
3721     //
3722     // Why on earth is this in the frontend?  Is there any reason at
3723     // all that the backend can't reasonably determine this while
3724     // lowering llvm.eh.dwarf.cfa()?
3725     //
3726     // TODO: If there's a satisfactory reason, add a target hook for
3727     // this instead of hard-coding 0, which is correct for most targets.
3728     int32_t Offset = 0;
3729 
3730     Function *F = CGM.getIntrinsic(Intrinsic::eh_dwarf_cfa);
3731     return RValue::get(Builder.CreateCall(F,
3732                                       llvm::ConstantInt::get(Int32Ty, Offset)));
3733   }
3734   case Builtin::BI__builtin_return_address: {
3735     Value *Depth = ConstantEmitter(*this).emitAbstract(E->getArg(0),
3736                                                    getContext().UnsignedIntTy);
3737     Function *F = CGM.getIntrinsic(Intrinsic::returnaddress);
3738     return RValue::get(Builder.CreateCall(F, Depth));
3739   }
3740   case Builtin::BI_ReturnAddress: {
3741     Function *F = CGM.getIntrinsic(Intrinsic::returnaddress);
3742     return RValue::get(Builder.CreateCall(F, Builder.getInt32(0)));
3743   }
3744   case Builtin::BI__builtin_frame_address: {
3745     Value *Depth = ConstantEmitter(*this).emitAbstract(E->getArg(0),
3746                                                    getContext().UnsignedIntTy);
3747     Function *F = CGM.getIntrinsic(Intrinsic::frameaddress, AllocaInt8PtrTy);
3748     return RValue::get(Builder.CreateCall(F, Depth));
3749   }
3750   case Builtin::BI__builtin_extract_return_addr: {
3751     Value *Address = EmitScalarExpr(E->getArg(0));
3752     Value *Result = getTargetHooks().decodeReturnAddress(*this, Address);
3753     return RValue::get(Result);
3754   }
3755   case Builtin::BI__builtin_frob_return_addr: {
3756     Value *Address = EmitScalarExpr(E->getArg(0));
3757     Value *Result = getTargetHooks().encodeReturnAddress(*this, Address);
3758     return RValue::get(Result);
3759   }
3760   case Builtin::BI__builtin_dwarf_sp_column: {
3761     llvm::IntegerType *Ty
3762       = cast<llvm::IntegerType>(ConvertType(E->getType()));
3763     int Column = getTargetHooks().getDwarfEHStackPointer(CGM);
3764     if (Column == -1) {
3765       CGM.ErrorUnsupported(E, "__builtin_dwarf_sp_column");
3766       return RValue::get(llvm::UndefValue::get(Ty));
3767     }
3768     return RValue::get(llvm::ConstantInt::get(Ty, Column, true));
3769   }
3770   case Builtin::BI__builtin_init_dwarf_reg_size_table: {
3771     Value *Address = EmitScalarExpr(E->getArg(0));
3772     if (getTargetHooks().initDwarfEHRegSizeTable(*this, Address))
3773       CGM.ErrorUnsupported(E, "__builtin_init_dwarf_reg_size_table");
3774     return RValue::get(llvm::UndefValue::get(ConvertType(E->getType())));
3775   }
3776   case Builtin::BI__builtin_eh_return: {
3777     Value *Int = EmitScalarExpr(E->getArg(0));
3778     Value *Ptr = EmitScalarExpr(E->getArg(1));
3779 
3780     llvm::IntegerType *IntTy = cast<llvm::IntegerType>(Int->getType());
3781     assert((IntTy->getBitWidth() == 32 || IntTy->getBitWidth() == 64) &&
3782            "LLVM's __builtin_eh_return only supports 32- and 64-bit variants");
3783     Function *F =
3784         CGM.getIntrinsic(IntTy->getBitWidth() == 32 ? Intrinsic::eh_return_i32
3785                                                     : Intrinsic::eh_return_i64);
3786     Builder.CreateCall(F, {Int, Ptr});
3787     Builder.CreateUnreachable();
3788 
3789     // We do need to preserve an insertion point.
3790     EmitBlock(createBasicBlock("builtin_eh_return.cont"));
3791 
3792     return RValue::get(nullptr);
3793   }
3794   case Builtin::BI__builtin_unwind_init: {
3795     Function *F = CGM.getIntrinsic(Intrinsic::eh_unwind_init);
3796     return RValue::get(Builder.CreateCall(F));
3797   }
3798   case Builtin::BI__builtin_extend_pointer: {
3799     // Extends a pointer to the size of an _Unwind_Word, which is
3800     // uint64_t on all platforms.  Generally this gets poked into a
3801     // register and eventually used as an address, so if the
3802     // addressing registers are wider than pointers and the platform
3803     // doesn't implicitly ignore high-order bits when doing
3804     // addressing, we need to make sure we zext / sext based on
3805     // the platform's expectations.
3806     //
3807     // See: http://gcc.gnu.org/ml/gcc-bugs/2002-02/msg00237.html
3808 
3809     // Cast the pointer to intptr_t.
3810     Value *Ptr = EmitScalarExpr(E->getArg(0));
3811     Value *Result = Builder.CreatePtrToInt(Ptr, IntPtrTy, "extend.cast");
3812 
3813     // If that's 64 bits, we're done.
3814     if (IntPtrTy->getBitWidth() == 64)
3815       return RValue::get(Result);
3816 
3817     // Otherwise, ask the codegen data what to do.
3818     if (getTargetHooks().extendPointerWithSExt())
3819       return RValue::get(Builder.CreateSExt(Result, Int64Ty, "extend.sext"));
3820     else
3821       return RValue::get(Builder.CreateZExt(Result, Int64Ty, "extend.zext"));
3822   }
3823   case Builtin::BI__builtin_setjmp: {
3824     // Buffer is a void**.
3825     Address Buf = EmitPointerWithAlignment(E->getArg(0));
3826 
3827     // Store the frame pointer to the setjmp buffer.
3828     Value *FrameAddr = Builder.CreateCall(
3829         CGM.getIntrinsic(Intrinsic::frameaddress, AllocaInt8PtrTy),
3830         ConstantInt::get(Int32Ty, 0));
3831     Builder.CreateStore(FrameAddr, Buf);
3832 
3833     // Store the stack pointer to the setjmp buffer.
3834     Value *StackAddr =
3835         Builder.CreateCall(CGM.getIntrinsic(Intrinsic::stacksave));
3836     Address StackSaveSlot = Builder.CreateConstInBoundsGEP(Buf, 2);
3837     Builder.CreateStore(StackAddr, StackSaveSlot);
3838 
3839     // Call LLVM's EH setjmp, which is lightweight.
3840     Function *F = CGM.getIntrinsic(Intrinsic::eh_sjlj_setjmp);
3841     Buf = Builder.CreateBitCast(Buf, Int8PtrTy);
3842     return RValue::get(Builder.CreateCall(F, Buf.getPointer()));
3843   }
3844   case Builtin::BI__builtin_longjmp: {
3845     Value *Buf = EmitScalarExpr(E->getArg(0));
3846     Buf = Builder.CreateBitCast(Buf, Int8PtrTy);
3847 
3848     // Call LLVM's EH longjmp, which is lightweight.
3849     Builder.CreateCall(CGM.getIntrinsic(Intrinsic::eh_sjlj_longjmp), Buf);
3850 
3851     // longjmp doesn't return; mark this as unreachable.
3852     Builder.CreateUnreachable();
3853 
3854     // We do need to preserve an insertion point.
3855     EmitBlock(createBasicBlock("longjmp.cont"));
3856 
3857     return RValue::get(nullptr);
3858   }
3859   case Builtin::BI__builtin_launder: {
3860     const Expr *Arg = E->getArg(0);
3861     QualType ArgTy = Arg->getType()->getPointeeType();
3862     Value *Ptr = EmitScalarExpr(Arg);
3863     if (TypeRequiresBuiltinLaunder(CGM, ArgTy))
3864       Ptr = Builder.CreateLaunderInvariantGroup(Ptr);
3865 
3866     return RValue::get(Ptr);
3867   }
  // The unsuffixed __sync_* builtin IDs are not expected to reach codegen
  // (see the message below); the size-suffixed variants (_1, _2, _4, _8, _16)
  // are handled by the cases that follow.
  case Builtin::BI__sync_fetch_and_add:
  case Builtin::BI__sync_fetch_and_sub:
  case Builtin::BI__sync_fetch_and_or:
  case Builtin::BI__sync_fetch_and_and:
  case Builtin::BI__sync_fetch_and_xor:
  case Builtin::BI__sync_fetch_and_nand:
  case Builtin::BI__sync_add_and_fetch:
  case Builtin::BI__sync_sub_and_fetch:
  case Builtin::BI__sync_and_and_fetch:
  case Builtin::BI__sync_or_and_fetch:
  case Builtin::BI__sync_xor_and_fetch:
  case Builtin::BI__sync_nand_and_fetch:
  case Builtin::BI__sync_val_compare_and_swap:
  case Builtin::BI__sync_bool_compare_and_swap:
  case Builtin::BI__sync_lock_test_and_set:
  case Builtin::BI__sync_lock_release:
  case Builtin::BI__sync_swap:
    llvm_unreachable("Shouldn't make it through sema");
  // __sync_fetch_and_<op>_N: every operand size of a given operation is
  // forwarded to EmitBinaryAtomic with the matching AtomicRMW operation.
  // NOTE(review): by their names these builtins yield the value held before
  // the operation; that is decided inside EmitBinaryAtomic, not here.
  case Builtin::BI__sync_fetch_and_add_1:
  case Builtin::BI__sync_fetch_and_add_2:
  case Builtin::BI__sync_fetch_and_add_4:
  case Builtin::BI__sync_fetch_and_add_8:
  case Builtin::BI__sync_fetch_and_add_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Add, E);
  case Builtin::BI__sync_fetch_and_sub_1:
  case Builtin::BI__sync_fetch_and_sub_2:
  case Builtin::BI__sync_fetch_and_sub_4:
  case Builtin::BI__sync_fetch_and_sub_8:
  case Builtin::BI__sync_fetch_and_sub_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Sub, E);
  case Builtin::BI__sync_fetch_and_or_1:
  case Builtin::BI__sync_fetch_and_or_2:
  case Builtin::BI__sync_fetch_and_or_4:
  case Builtin::BI__sync_fetch_and_or_8:
  case Builtin::BI__sync_fetch_and_or_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Or, E);
  case Builtin::BI__sync_fetch_and_and_1:
  case Builtin::BI__sync_fetch_and_and_2:
  case Builtin::BI__sync_fetch_and_and_4:
  case Builtin::BI__sync_fetch_and_and_8:
  case Builtin::BI__sync_fetch_and_and_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::And, E);
  case Builtin::BI__sync_fetch_and_xor_1:
  case Builtin::BI__sync_fetch_and_xor_2:
  case Builtin::BI__sync_fetch_and_xor_4:
  case Builtin::BI__sync_fetch_and_xor_8:
  case Builtin::BI__sync_fetch_and_xor_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xor, E);
  case Builtin::BI__sync_fetch_and_nand_1:
  case Builtin::BI__sync_fetch_and_nand_2:
  case Builtin::BI__sync_fetch_and_nand_4:
  case Builtin::BI__sync_fetch_and_nand_8:
  case Builtin::BI__sync_fetch_and_nand_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Nand, E);
3922 
  // Clang extensions: not overloaded yet, so there are no size-suffixed
  // variants here. Each is forwarded to EmitBinaryAtomic with the signed
  // (Min/Max) or unsigned (UMin/UMax) AtomicRMW operation.
  case Builtin::BI__sync_fetch_and_min:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Min, E);
  case Builtin::BI__sync_fetch_and_max:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Max, E);
  case Builtin::BI__sync_fetch_and_umin:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::UMin, E);
  case Builtin::BI__sync_fetch_and_umax:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::UMax, E);
3932 
  // __sync_<op>_and_fetch_N: every operand size of a given operation is
  // forwarded to EmitBinaryAtomicPost together with an AtomicRMW operation
  // and a plain binary instruction opcode.
  // NOTE(review): presumably the binary opcode is used to recompute the
  // post-operation value from the atomicrmw result -- confirm against
  // EmitBinaryAtomicPost.
  case Builtin::BI__sync_add_and_fetch_1:
  case Builtin::BI__sync_add_and_fetch_2:
  case Builtin::BI__sync_add_and_fetch_4:
  case Builtin::BI__sync_add_and_fetch_8:
  case Builtin::BI__sync_add_and_fetch_16:
    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Add, E,
                                llvm::Instruction::Add);
  case Builtin::BI__sync_sub_and_fetch_1:
  case Builtin::BI__sync_sub_and_fetch_2:
  case Builtin::BI__sync_sub_and_fetch_4:
  case Builtin::BI__sync_sub_and_fetch_8:
  case Builtin::BI__sync_sub_and_fetch_16:
    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Sub, E,
                                llvm::Instruction::Sub);
  case Builtin::BI__sync_and_and_fetch_1:
  case Builtin::BI__sync_and_and_fetch_2:
  case Builtin::BI__sync_and_and_fetch_4:
  case Builtin::BI__sync_and_and_fetch_8:
  case Builtin::BI__sync_and_and_fetch_16:
    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::And, E,
                                llvm::Instruction::And);
  case Builtin::BI__sync_or_and_fetch_1:
  case Builtin::BI__sync_or_and_fetch_2:
  case Builtin::BI__sync_or_and_fetch_4:
  case Builtin::BI__sync_or_and_fetch_8:
  case Builtin::BI__sync_or_and_fetch_16:
    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Or, E,
                                llvm::Instruction::Or);
  case Builtin::BI__sync_xor_and_fetch_1:
  case Builtin::BI__sync_xor_and_fetch_2:
  case Builtin::BI__sync_xor_and_fetch_4:
  case Builtin::BI__sync_xor_and_fetch_8:
  case Builtin::BI__sync_xor_and_fetch_16:
    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Xor, E,
                                llvm::Instruction::Xor);
  // Nand is the only variant that passes the extra trailing `true`.
  // NOTE(review): presumably this asks for the recomputed And result to be
  // inverted -- confirm against EmitBinaryAtomicPost.
  case Builtin::BI__sync_nand_and_fetch_1:
  case Builtin::BI__sync_nand_and_fetch_2:
  case Builtin::BI__sync_nand_and_fetch_4:
  case Builtin::BI__sync_nand_and_fetch_8:
  case Builtin::BI__sync_nand_and_fetch_16:
    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Nand, E,
                                llvm::Instruction::And, true);
3975 
  // __sync_val_compare_and_swap_N and __sync_bool_compare_and_swap_N share
  // MakeAtomicCmpXchgValue; they differ only in the final boolean argument
  // (false for the "val" form, true for the "bool" form).
  // NOTE(review): presumably that flag selects returning the success bit
  // instead of the old value -- confirm against MakeAtomicCmpXchgValue.
  case Builtin::BI__sync_val_compare_and_swap_1:
  case Builtin::BI__sync_val_compare_and_swap_2:
  case Builtin::BI__sync_val_compare_and_swap_4:
  case Builtin::BI__sync_val_compare_and_swap_8:
  case Builtin::BI__sync_val_compare_and_swap_16:
    return RValue::get(MakeAtomicCmpXchgValue(*this, E, false));

  case Builtin::BI__sync_bool_compare_and_swap_1:
  case Builtin::BI__sync_bool_compare_and_swap_2:
  case Builtin::BI__sync_bool_compare_and_swap_4:
  case Builtin::BI__sync_bool_compare_and_swap_8:
  case Builtin::BI__sync_bool_compare_and_swap_16:
    return RValue::get(MakeAtomicCmpXchgValue(*this, E, true));
3989 
  // __sync_swap_N and __sync_lock_test_and_set_N are emitted identically:
  // both are forwarded to EmitBinaryAtomic with the Xchg operation.
  case Builtin::BI__sync_swap_1:
  case Builtin::BI__sync_swap_2:
  case Builtin::BI__sync_swap_4:
  case Builtin::BI__sync_swap_8:
  case Builtin::BI__sync_swap_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xchg, E);

  case Builtin::BI__sync_lock_test_and_set_1:
  case Builtin::BI__sync_lock_test_and_set_2:
  case Builtin::BI__sync_lock_test_and_set_4:
  case Builtin::BI__sync_lock_test_and_set_8:
  case Builtin::BI__sync_lock_test_and_set_16:
    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xchg, E);
4003 
4004   case Builtin::BI__sync_lock_release_1:
4005   case Builtin::BI__sync_lock_release_2:
4006   case Builtin::BI__sync_lock_release_4:
4007   case Builtin::BI__sync_lock_release_8:
4008   case Builtin::BI__sync_lock_release_16: {
4009     Value *Ptr = EmitScalarExpr(E->getArg(0));
4010     QualType ElTy = E->getArg(0)->getType()->getPointeeType();
4011     CharUnits StoreSize = getContext().getTypeSizeInChars(ElTy);
4012     llvm::Type *ITy = llvm::IntegerType::get(getLLVMContext(),
4013                                              StoreSize.getQuantity() * 8);
4014     Ptr = Builder.CreateBitCast(Ptr, ITy->getPointerTo());
4015     llvm::StoreInst *Store =
4016       Builder.CreateAlignedStore(llvm::Constant::getNullValue(ITy), Ptr,
4017                                  StoreSize);
4018     Store->setAtomic(llvm::AtomicOrdering::Release);
4019     return RValue::get(nullptr);
4020   }
4021 
  // __sync_synchronize: emits a single sequentially-consistent fence and
  // produces no value.
  case Builtin::BI__sync_synchronize: {
    // We assume this is supposed to correspond to a C++0x-style
    // sequentially-consistent fence (i.e. this is only usable for
    // synchronization, not device I/O or anything like that). This intrinsic
    // is really badly designed in the sense that in theory, there isn't
    // any way to safely use it... but in practice, it mostly works
    // to use it with non-atomic loads and stores to get acquire/release
    // semantics.
    Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent);
    return RValue::get(nullptr);
  }
4033 
  // Nontemporal load/store: both are delegated to dedicated helpers, and the
  // call evaluates to whatever value the helper returns.
  case Builtin::BI__builtin_nontemporal_load:
    return RValue::get(EmitNontemporalLoad(*this, E));
  case Builtin::BI__builtin_nontemporal_store:
    return RValue::get(EmitNontemporalStore(*this, E));
4038   case Builtin::BI__c11_atomic_is_lock_free:
4039   case Builtin::BI__atomic_is_lock_free: {
4040     // Call "bool __atomic_is_lock_free(size_t size, void *ptr)". For the
4041     // __c11 builtin, ptr is 0 (indicating a properly-aligned object), since
4042     // _Atomic(T) is always properly-aligned.
4043     const char *LibCallName = "__atomic_is_lock_free";
4044     CallArgList Args;
4045     Args.add(RValue::get(EmitScalarExpr(E->getArg(0))),
4046              getContext().getSizeType());
4047     if (BuiltinID == Builtin::BI__atomic_is_lock_free)
4048       Args.add(RValue::get(EmitScalarExpr(E->getArg(1))),
4049                getContext().VoidPtrTy);
4050     else
4051       Args.add(RValue::get(llvm::Constant::getNullValue(VoidPtrTy)),
4052                getContext().VoidPtrTy);
4053     const CGFunctionInfo &FuncInfo =
4054         CGM.getTypes().arrangeBuiltinFunctionCall(E->getType(), Args);
4055     llvm::FunctionType *FTy = CGM.getTypes().GetFunctionType(FuncInfo);
4056     llvm::FunctionCallee Func = CGM.CreateRuntimeFunction(FTy, LibCallName);
4057     return EmitCall(FuncInfo, CGCallee::forDirect(Func),
4058                     ReturnValueSlot(), Args);
4059   }
4060 
4061   case Builtin::BI__atomic_test_and_set: {
4062     // Look at the argument type to determine whether this is a volatile
4063     // operation. The parameter type is always volatile.
4064     QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
4065     bool Volatile =
4066         PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
4067 
4068     Value *Ptr = EmitScalarExpr(E->getArg(0));
4069     unsigned AddrSpace = Ptr->getType()->getPointerAddressSpace();
4070     Ptr = Builder.CreateBitCast(Ptr, Int8Ty->getPointerTo(AddrSpace));
4071     Value *NewVal = Builder.getInt8(1);
4072     Value *Order = EmitScalarExpr(E->getArg(1));
4073     if (isa<llvm::ConstantInt>(Order)) {
4074       int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
4075       AtomicRMWInst *Result = nullptr;
4076       switch (ord) {
4077       case 0:  // memory_order_relaxed
4078       default: // invalid order
4079         Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
4080                                          llvm::AtomicOrdering::Monotonic);
4081         break;
4082       case 1: // memory_order_consume
4083       case 2: // memory_order_acquire
4084         Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
4085                                          llvm::AtomicOrdering::Acquire);
4086         break;
4087       case 3: // memory_order_release
4088         Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
4089                                          llvm::AtomicOrdering::Release);
4090         break;
4091       case 4: // memory_order_acq_rel
4092 
4093         Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
4094                                          llvm::AtomicOrdering::AcquireRelease);
4095         break;
4096       case 5: // memory_order_seq_cst
4097         Result = Builder.CreateAtomicRMW(
4098             llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
4099             llvm::AtomicOrdering::SequentiallyConsistent);
4100         break;
4101       }
4102       Result->setVolatile(Volatile);
4103       return RValue::get(Builder.CreateIsNotNull(Result, "tobool"));
4104     }
4105 
4106     llvm::BasicBlock *ContBB = createBasicBlock("atomic.continue", CurFn);
4107 
4108     llvm::BasicBlock *BBs[5] = {
4109       createBasicBlock("monotonic", CurFn),
4110       createBasicBlock("acquire", CurFn),
4111       createBasicBlock("release", CurFn),
4112       createBasicBlock("acqrel", CurFn),
4113       createBasicBlock("seqcst", CurFn)
4114     };
4115     llvm::AtomicOrdering Orders[5] = {
4116         llvm::AtomicOrdering::Monotonic, llvm::AtomicOrdering::Acquire,
4117         llvm::AtomicOrdering::Release, llvm::AtomicOrdering::AcquireRelease,
4118         llvm::AtomicOrdering::SequentiallyConsistent};
4119 
4120     Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
4121     llvm::SwitchInst *SI = Builder.CreateSwitch(Order, BBs[0]);
4122 
4123     Builder.SetInsertPoint(ContBB);
4124     PHINode *Result = Builder.CreatePHI(Int8Ty, 5, "was_set");
4125 
4126     for (unsigned i = 0; i < 5; ++i) {
4127       Builder.SetInsertPoint(BBs[i]);
4128       AtomicRMWInst *RMW = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg,
4129                                                    Ptr, NewVal, Orders[i]);
4130       RMW->setVolatile(Volatile);
4131       Result->addIncoming(RMW, BBs[i]);
4132       Builder.CreateBr(ContBB);
4133     }
4134 
4135     SI->addCase(Builder.getInt32(0), BBs[0]);
4136     SI->addCase(Builder.getInt32(1), BBs[1]);
4137     SI->addCase(Builder.getInt32(2), BBs[1]);
4138     SI->addCase(Builder.getInt32(3), BBs[2]);
4139     SI->addCase(Builder.getInt32(4), BBs[3]);
4140     SI->addCase(Builder.getInt32(5), BBs[4]);
4141 
4142     Builder.SetInsertPoint(ContBB);
4143     return RValue::get(Builder.CreateIsNotNull(Result, "tobool"));
4144   }
4145 
4146   case Builtin::BI__atomic_clear: {
4147     QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
4148     bool Volatile =
4149         PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
4150 
4151     Address Ptr = EmitPointerWithAlignment(E->getArg(0));
4152     unsigned AddrSpace = Ptr.getPointer()->getType()->getPointerAddressSpace();
4153     Ptr = Builder.CreateBitCast(Ptr, Int8Ty->getPointerTo(AddrSpace));
4154     Value *NewVal = Builder.getInt8(0);
4155     Value *Order = EmitScalarExpr(E->getArg(1));
4156     if (isa<llvm::ConstantInt>(Order)) {
4157       int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
4158       StoreInst *Store = Builder.CreateStore(NewVal, Ptr, Volatile);
4159       switch (ord) {
4160       case 0:  // memory_order_relaxed
4161       default: // invalid order
4162         Store->setOrdering(llvm::AtomicOrdering::Monotonic);
4163         break;
4164       case 3:  // memory_order_release
4165         Store->setOrdering(llvm::AtomicOrdering::Release);
4166         break;
4167       case 5:  // memory_order_seq_cst
4168         Store->setOrdering(llvm::AtomicOrdering::SequentiallyConsistent);
4169         break;
4170       }
4171       return RValue::get(nullptr);
4172     }
4173 
4174     llvm::BasicBlock *ContBB = createBasicBlock("atomic.continue", CurFn);
4175 
4176     llvm::BasicBlock *BBs[3] = {
4177       createBasicBlock("monotonic", CurFn),
4178       createBasicBlock("release", CurFn),
4179       createBasicBlock("seqcst", CurFn)
4180     };
4181     llvm::AtomicOrdering Orders[3] = {
4182         llvm::AtomicOrdering::Monotonic, llvm::AtomicOrdering::Release,
4183         llvm::AtomicOrdering::SequentiallyConsistent};
4184 
4185     Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
4186     llvm::SwitchInst *SI = Builder.CreateSwitch(Order, BBs[0]);
4187 
4188     for (unsigned i = 0; i < 3; ++i) {
4189       Builder.SetInsertPoint(BBs[i]);
4190       StoreInst *Store = Builder.CreateStore(NewVal, Ptr, Volatile);
4191       Store->setOrdering(Orders[i]);
4192       Builder.CreateBr(ContBB);
4193     }
4194 
4195     SI->addCase(Builder.getInt32(0), BBs[0]);
4196     SI->addCase(Builder.getInt32(3), BBs[1]);
4197     SI->addCase(Builder.getInt32(5), BBs[2]);
4198 
4199     Builder.SetInsertPoint(ContBB);
4200     return RValue::get(nullptr);
4201   }
4202 
4203   case Builtin::BI__atomic_thread_fence:
4204   case Builtin::BI__atomic_signal_fence:
4205   case Builtin::BI__c11_atomic_thread_fence:
4206   case Builtin::BI__c11_atomic_signal_fence: {
4207     llvm::SyncScope::ID SSID;
4208     if (BuiltinID == Builtin::BI__atomic_signal_fence ||
4209         BuiltinID == Builtin::BI__c11_atomic_signal_fence)
4210       SSID = llvm::SyncScope::SingleThread;
4211     else
4212       SSID = llvm::SyncScope::System;
4213     Value *Order = EmitScalarExpr(E->getArg(0));
4214     if (isa<llvm::ConstantInt>(Order)) {
4215       int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
4216       switch (ord) {
4217       case 0:  // memory_order_relaxed
4218       default: // invalid order
4219         break;
4220       case 1:  // memory_order_consume
4221       case 2:  // memory_order_acquire
4222         Builder.CreateFence(llvm::AtomicOrdering::Acquire, SSID);
4223         break;
4224       case 3:  // memory_order_release
4225         Builder.CreateFence(llvm::AtomicOrdering::Release, SSID);
4226         break;
4227       case 4:  // memory_order_acq_rel
4228         Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, SSID);
4229         break;
4230       case 5:  // memory_order_seq_cst
4231         Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, SSID);
4232         break;
4233       }
4234       return RValue::get(nullptr);
4235     }
4236 
4237     llvm::BasicBlock *AcquireBB, *ReleaseBB, *AcqRelBB, *SeqCstBB;
4238     AcquireBB = createBasicBlock("acquire", CurFn);
4239     ReleaseBB = createBasicBlock("release", CurFn);
4240     AcqRelBB = createBasicBlock("acqrel", CurFn);
4241     SeqCstBB = createBasicBlock("seqcst", CurFn);
4242     llvm::BasicBlock *ContBB = createBasicBlock("atomic.continue", CurFn);
4243 
4244     Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
4245     llvm::SwitchInst *SI = Builder.CreateSwitch(Order, ContBB);
4246 
4247     Builder.SetInsertPoint(AcquireBB);
4248     Builder.CreateFence(llvm::AtomicOrdering::Acquire, SSID);
4249     Builder.CreateBr(ContBB);
4250     SI->addCase(Builder.getInt32(1), AcquireBB);
4251     SI->addCase(Builder.getInt32(2), AcquireBB);
4252 
4253     Builder.SetInsertPoint(ReleaseBB);
4254     Builder.CreateFence(llvm::AtomicOrdering::Release, SSID);
4255     Builder.CreateBr(ContBB);
4256     SI->addCase(Builder.getInt32(3), ReleaseBB);
4257 
4258     Builder.SetInsertPoint(AcqRelBB);
4259     Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, SSID);
4260     Builder.CreateBr(ContBB);
4261     SI->addCase(Builder.getInt32(4), AcqRelBB);
4262 
4263     Builder.SetInsertPoint(SeqCstBB);
4264     Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, SSID);
4265     Builder.CreateBr(ContBB);
4266     SI->addCase(Builder.getInt32(5), SeqCstBB);
4267 
4268     Builder.SetInsertPoint(ContBB);
4269     return RValue::get(nullptr);
4270   }
4271 
4272   case Builtin::BI__builtin_signbit:
4273   case Builtin::BI__builtin_signbitf:
4274   case Builtin::BI__builtin_signbitl: {
4275     return RValue::get(
4276         Builder.CreateZExt(EmitSignBit(*this, EmitScalarExpr(E->getArg(0))),
4277                            ConvertType(E->getType())));
4278   }
  case Builtin::BI__warn_memset_zero_len:
    // Nothing is emitted for this builtin here; the call's value is ignored.
    return RValue::getIgnored();
4281   case Builtin::BI__annotation: {
4282     // Re-encode each wide string to UTF8 and make an MDString.
4283     SmallVector<Metadata *, 1> Strings;
4284     for (const Expr *Arg : E->arguments()) {
4285       const auto *Str = cast<StringLiteral>(Arg->IgnoreParenCasts());
4286       assert(Str->getCharByteWidth() == 2);
4287       StringRef WideBytes = Str->getBytes();
4288       std::string StrUtf8;
4289       if (!convertUTF16ToUTF8String(
4290               makeArrayRef(WideBytes.data(), WideBytes.size()), StrUtf8)) {
4291         CGM.ErrorUnsupported(E, "non-UTF16 __annotation argument");
4292         continue;
4293       }
4294       Strings.push_back(llvm::MDString::get(getLLVMContext(), StrUtf8));
4295     }
4296 
4297     // Build and MDTuple of MDStrings and emit the intrinsic call.
4298     llvm::Function *F =
4299         CGM.getIntrinsic(llvm::Intrinsic::codeview_annotation, {});
4300     MDTuple *StrTuple = MDTuple::get(getLLVMContext(), Strings);
4301     Builder.CreateCall(F, MetadataAsValue::get(getLLVMContext(), StrTuple));
4302     return RValue::getIgnored();
4303   }
4304   case Builtin::BI__builtin_annotation: {
4305     llvm::Value *AnnVal = EmitScalarExpr(E->getArg(0));
4306     llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::annotation,
4307                                       AnnVal->getType());
4308 
4309     // Get the annotation string, go through casts. Sema requires this to be a
4310     // non-wide string literal, potentially casted, so the cast<> is safe.
4311     const Expr *AnnotationStrExpr = E->getArg(1)->IgnoreParenCasts();
4312     StringRef Str = cast<StringLiteral>(AnnotationStrExpr)->getString();
4313     return RValue::get(
4314         EmitAnnotationCall(F, AnnVal, Str, E->getExprLoc(), nullptr));
4315   }
4316   case Builtin::BI__builtin_addcb:
4317   case Builtin::BI__builtin_addcs:
4318   case Builtin::BI__builtin_addc:
4319   case Builtin::BI__builtin_addcl:
4320   case Builtin::BI__builtin_addcll:
4321   case Builtin::BI__builtin_subcb:
4322   case Builtin::BI__builtin_subcs:
4323   case Builtin::BI__builtin_subc:
4324   case Builtin::BI__builtin_subcl:
4325   case Builtin::BI__builtin_subcll: {
4326 
4327     // We translate all of these builtins from expressions of the form:
4328     //   int x = ..., y = ..., carryin = ..., carryout, result;
4329     //   result = __builtin_addc(x, y, carryin, &carryout);
4330     //
4331     // to LLVM IR of the form:
4332     //
4333     //   %tmp1 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)
4334     //   %tmpsum1 = extractvalue {i32, i1} %tmp1, 0
4335     //   %carry1 = extractvalue {i32, i1} %tmp1, 1
4336     //   %tmp2 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %tmpsum1,
4337     //                                                       i32 %carryin)
4338     //   %result = extractvalue {i32, i1} %tmp2, 0
4339     //   %carry2 = extractvalue {i32, i1} %tmp2, 1
4340     //   %tmp3 = or i1 %carry1, %carry2
4341     //   %tmp4 = zext i1 %tmp3 to i32
4342     //   store i32 %tmp4, i32* %carryout
4343 
4344     // Scalarize our inputs.
4345     llvm::Value *X = EmitScalarExpr(E->getArg(0));
4346     llvm::Value *Y = EmitScalarExpr(E->getArg(1));
4347     llvm::Value *Carryin = EmitScalarExpr(E->getArg(2));
4348     Address CarryOutPtr = EmitPointerWithAlignment(E->getArg(3));
4349 
4350     // Decide if we are lowering to a uadd.with.overflow or usub.with.overflow.
4351     llvm::Intrinsic::ID IntrinsicId;
4352     switch (BuiltinID) {
4353     default: llvm_unreachable("Unknown multiprecision builtin id.");
4354     case Builtin::BI__builtin_addcb:
4355     case Builtin::BI__builtin_addcs:
4356     case Builtin::BI__builtin_addc:
4357     case Builtin::BI__builtin_addcl:
4358     case Builtin::BI__builtin_addcll:
4359       IntrinsicId = llvm::Intrinsic::uadd_with_overflow;
4360       break;
4361     case Builtin::BI__builtin_subcb:
4362     case Builtin::BI__builtin_subcs:
4363     case Builtin::BI__builtin_subc:
4364     case Builtin::BI__builtin_subcl:
4365     case Builtin::BI__builtin_subcll:
4366       IntrinsicId = llvm::Intrinsic::usub_with_overflow;
4367       break;
4368     }
4369 
4370     // Construct our resulting LLVM IR expression.
4371     llvm::Value *Carry1;
4372     llvm::Value *Sum1 = EmitOverflowIntrinsic(*this, IntrinsicId,
4373                                               X, Y, Carry1);
4374     llvm::Value *Carry2;
4375     llvm::Value *Sum2 = EmitOverflowIntrinsic(*this, IntrinsicId,
4376                                               Sum1, Carryin, Carry2);
4377     llvm::Value *CarryOut = Builder.CreateZExt(Builder.CreateOr(Carry1, Carry2),
4378                                                X->getType());
4379     Builder.CreateStore(CarryOut, CarryOutPtr);
4380     return RValue::get(Sum2);
4381   }
4382 
  // Type-generic checked arithmetic: __builtin_{add,sub,mul}_overflow(a, b,
  // &res). The operands and the result may have different integer types. The
  // operation is performed in a single "encompassing" integer type, the result
  // is truncated to the destination type if needed, and the builtin evaluates
  // to the overflow flag.
  case Builtin::BI__builtin_add_overflow:
  case Builtin::BI__builtin_sub_overflow:
  case Builtin::BI__builtin_mul_overflow: {
    const clang::Expr *LeftArg = E->getArg(0);
    const clang::Expr *RightArg = E->getArg(1);
    const clang::Expr *ResultArg = E->getArg(2);

    // The result type is the pointee type of the third (pointer) argument.
    clang::QualType ResultQTy =
        ResultArg->getType()->castAs<PointerType>()->getPointeeType();

    // Bit width and signedness of each of the three participating types.
    WidthAndSignedness LeftInfo =
        getIntegerWidthAndSignedness(CGM.getContext(), LeftArg->getType());
    WidthAndSignedness RightInfo =
        getIntegerWidthAndSignedness(CGM.getContext(), RightArg->getType());
    WidthAndSignedness ResultInfo =
        getIntegerWidthAndSignedness(CGM.getContext(), ResultQTy);

    // Handle mixed-sign multiplication as a special case, because adding
    // runtime or backend support for our generic irgen would be too expensive.
    if (isSpecialMixedSignMultiply(BuiltinID, LeftInfo, RightInfo, ResultInfo))
      return EmitCheckedMixedSignMultiply(*this, LeftArg, LeftInfo, RightArg,
                                          RightInfo, ResultArg, ResultQTy,
                                          ResultInfo);

    // Likewise, a second special-cased multiply form has its own emitter.
    if (isSpecialUnsignedMultiplySignedResult(BuiltinID, LeftInfo, RightInfo,
                                              ResultInfo))
      return EmitCheckedUnsignedMultiplySignedResult(
          *this, LeftArg, LeftInfo, RightArg, RightInfo, ResultArg, ResultQTy,
          ResultInfo);

    // Generic path: pick one integer type derived from all three types and
    // perform the operation in it.
    WidthAndSignedness EncompassingInfo =
        EncompassingIntegerType({LeftInfo, RightInfo, ResultInfo});

    llvm::Type *EncompassingLLVMTy =
        llvm::IntegerType::get(CGM.getLLVMContext(), EncompassingInfo.Width);

    llvm::Type *ResultLLVMTy = CGM.getTypes().ConvertType(ResultQTy);

    // Choose the signed or unsigned *.with.overflow intrinsic according to the
    // signedness of the encompassing type.
    llvm::Intrinsic::ID IntrinsicId;
    switch (BuiltinID) {
    default:
      llvm_unreachable("Unknown overflow builtin id.");
    case Builtin::BI__builtin_add_overflow:
      IntrinsicId = EncompassingInfo.Signed
                        ? llvm::Intrinsic::sadd_with_overflow
                        : llvm::Intrinsic::uadd_with_overflow;
      break;
    case Builtin::BI__builtin_sub_overflow:
      IntrinsicId = EncompassingInfo.Signed
                        ? llvm::Intrinsic::ssub_with_overflow
                        : llvm::Intrinsic::usub_with_overflow;
      break;
    case Builtin::BI__builtin_mul_overflow:
      IntrinsicId = EncompassingInfo.Signed
                        ? llvm::Intrinsic::smul_with_overflow
                        : llvm::Intrinsic::umul_with_overflow;
      break;
    }

    // Operands are evaluated left to right, then the result address.
    llvm::Value *Left = EmitScalarExpr(LeftArg);
    llvm::Value *Right = EmitScalarExpr(RightArg);
    Address ResultPtr = EmitPointerWithAlignment(ResultArg);

    // Extend each operand to the encompassing type.
    Left = Builder.CreateIntCast(Left, EncompassingLLVMTy, LeftInfo.Signed);
    Right = Builder.CreateIntCast(Right, EncompassingLLVMTy, RightInfo.Signed);

    // Perform the operation on the extended values.
    llvm::Value *Overflow, *Result;
    Result = EmitOverflowIntrinsic(*this, IntrinsicId, Left, Right, Overflow);

    if (EncompassingInfo.Width > ResultInfo.Width) {
      // The encompassing type is wider than the result type, so we need to
      // truncate it.
      llvm::Value *ResultTrunc = Builder.CreateTrunc(Result, ResultLLVMTy);

      // To see if the truncation caused an overflow, we will extend
      // the result and then compare it to the original result.
      llvm::Value *ResultTruncExt = Builder.CreateIntCast(
          ResultTrunc, EncompassingLLVMTy, ResultInfo.Signed);
      llvm::Value *TruncationOverflow =
          Builder.CreateICmpNE(Result, ResultTruncExt);

      // Overflow is reported if either the operation itself or the
      // truncation lost information.
      Overflow = Builder.CreateOr(Overflow, TruncationOverflow);
      Result = ResultTrunc;
    }

    // Finally, store the result using the pointer.
    // The store is volatile when the destination pointee type is.
    bool isVolatile =
      ResultArg->getType()->getPointeeType().isVolatileQualified();
    Builder.CreateStore(EmitToMemory(Result, ResultQTy), ResultPtr, isVolatile);

    // The builtin's value is the overflow flag.
    return RValue::get(Overflow);
  }
4477 
4478   case Builtin::BI__builtin_uadd_overflow:
4479   case Builtin::BI__builtin_uaddl_overflow:
4480   case Builtin::BI__builtin_uaddll_overflow:
4481   case Builtin::BI__builtin_usub_overflow:
4482   case Builtin::BI__builtin_usubl_overflow:
4483   case Builtin::BI__builtin_usubll_overflow:
4484   case Builtin::BI__builtin_umul_overflow:
4485   case Builtin::BI__builtin_umull_overflow:
4486   case Builtin::BI__builtin_umulll_overflow:
4487   case Builtin::BI__builtin_sadd_overflow:
4488   case Builtin::BI__builtin_saddl_overflow:
4489   case Builtin::BI__builtin_saddll_overflow:
4490   case Builtin::BI__builtin_ssub_overflow:
4491   case Builtin::BI__builtin_ssubl_overflow:
4492   case Builtin::BI__builtin_ssubll_overflow:
4493   case Builtin::BI__builtin_smul_overflow:
4494   case Builtin::BI__builtin_smull_overflow:
4495   case Builtin::BI__builtin_smulll_overflow: {
4496 
4497     // We translate all of these builtins directly to the relevant llvm IR node.
4498 
4499     // Scalarize our inputs.
4500     llvm::Value *X = EmitScalarExpr(E->getArg(0));
4501     llvm::Value *Y = EmitScalarExpr(E->getArg(1));
4502     Address SumOutPtr = EmitPointerWithAlignment(E->getArg(2));
4503 
4504     // Decide which of the overflow intrinsics we are lowering to:
4505     llvm::Intrinsic::ID IntrinsicId;
4506     switch (BuiltinID) {
4507     default: llvm_unreachable("Unknown overflow builtin id.");
4508     case Builtin::BI__builtin_uadd_overflow:
4509     case Builtin::BI__builtin_uaddl_overflow:
4510     case Builtin::BI__builtin_uaddll_overflow:
4511       IntrinsicId = llvm::Intrinsic::uadd_with_overflow;
4512       break;
4513     case Builtin::BI__builtin_usub_overflow:
4514     case Builtin::BI__builtin_usubl_overflow:
4515     case Builtin::BI__builtin_usubll_overflow:
4516       IntrinsicId = llvm::Intrinsic::usub_with_overflow;
4517       break;
4518     case Builtin::BI__builtin_umul_overflow:
4519     case Builtin::BI__builtin_umull_overflow:
4520     case Builtin::BI__builtin_umulll_overflow:
4521       IntrinsicId = llvm::Intrinsic::umul_with_overflow;
4522       break;
4523     case Builtin::BI__builtin_sadd_overflow:
4524     case Builtin::BI__builtin_saddl_overflow:
4525     case Builtin::BI__builtin_saddll_overflow:
4526       IntrinsicId = llvm::Intrinsic::sadd_with_overflow;
4527       break;
4528     case Builtin::BI__builtin_ssub_overflow:
4529     case Builtin::BI__builtin_ssubl_overflow:
4530     case Builtin::BI__builtin_ssubll_overflow:
4531       IntrinsicId = llvm::Intrinsic::ssub_with_overflow;
4532       break;
4533     case Builtin::BI__builtin_smul_overflow:
4534     case Builtin::BI__builtin_smull_overflow:
4535     case Builtin::BI__builtin_smulll_overflow:
4536       IntrinsicId = llvm::Intrinsic::smul_with_overflow;
4537       break;
4538     }
4539 
4540 
4541     llvm::Value *Carry;
4542     llvm::Value *Sum = EmitOverflowIntrinsic(*this, IntrinsicId, X, Y, Carry);
4543     Builder.CreateStore(Sum, SumOutPtr);
4544 
4545     return RValue::get(Carry);
4546   }
  // __builtin_addressof: emit the operand as an lvalue and return its address.
  case Builtin::BI__builtin_addressof:
    return RValue::get(EmitLValue(E->getArg(0)).getPointer(*this));
  // __builtin_function_start: resolve the argument to the declaration it
  // refers to and return that function's start address from CodeGenModule.
  case Builtin::BI__builtin_function_start:
    return RValue::get(CGM.GetFunctionStart(
        E->getArg(0)->getAsBuiltinConstantDeclRef(CGM.getContext())));
  // __builtin_operator_new / __builtin_operator_delete share one emitter that
  // takes the callee's prototype; the trailing bool is false for new and true
  // for delete.
  case Builtin::BI__builtin_operator_new:
    return EmitBuiltinNewDeleteCall(
        E->getCallee()->getType()->castAs<FunctionProtoType>(), E, false);
  case Builtin::BI__builtin_operator_delete:
    return EmitBuiltinNewDeleteCall(
        E->getCallee()->getType()->castAs<FunctionProtoType>(), E, true);
4558 
  // Alignment builtins are handled by dedicated emitters. align_up and
  // align_down share one emitter; the bool is true for align_up and false for
  // align_down.
  case Builtin::BI__builtin_is_aligned:
    return EmitBuiltinIsAligned(E);
  case Builtin::BI__builtin_align_up:
    return EmitBuiltinAlignTo(E, true);
  case Builtin::BI__builtin_align_down:
    return EmitBuiltinAlignTo(E, false);

  case Builtin::BI__noop:
    // __noop always evaluates to an integer literal zero.
    // Its arguments are not emitted here.
    return RValue::get(ConstantInt::get(IntTy, 0));
  case Builtin::BI__builtin_call_with_static_chain: {
    // Argument 0 is itself a call expression and argument 1 is the chain
    // value. The inner call is emitted through the normal call path with the
    // evaluated chain passed as the extra trailing argument.
    const CallExpr *Call = cast<CallExpr>(E->getArg(0));
    const Expr *Chain = E->getArg(1);
    // NOTE(review): EmitCallee(...) and EmitScalarExpr(Chain) are both
    // function arguments here, so their relative evaluation order is left to
    // the host compiler.
    return EmitCall(Call->getCallee()->getType(),
                    EmitCallee(Call->getCallee()), Call, ReturnValue,
                    EmitScalarExpr(Chain));
  }
  // All widths of _InterlockedExchange, including the pointer variant, are
  // lowered through the shared MSVC intrinsic emitter.
  case Builtin::BI_InterlockedExchange8:
  case Builtin::BI_InterlockedExchange16:
  case Builtin::BI_InterlockedExchange:
  case Builtin::BI_InterlockedExchangePointer:
    return RValue::get(
        EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchange, E));
4582   case Builtin::BI_InterlockedCompareExchangePointer:
4583   case Builtin::BI_InterlockedCompareExchangePointer_nf: {
4584     llvm::Type *RTy;
4585     llvm::IntegerType *IntType =
4586       IntegerType::get(getLLVMContext(),
4587                        getContext().getTypeSize(E->getType()));
4588     llvm::Type *IntPtrType = IntType->getPointerTo();
4589 
4590     llvm::Value *Destination =
4591       Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), IntPtrType);
4592 
4593     llvm::Value *Exchange = EmitScalarExpr(E->getArg(1));
4594     RTy = Exchange->getType();
4595     Exchange = Builder.CreatePtrToInt(Exchange, IntType);
4596 
4597     llvm::Value *Comparand =
4598       Builder.CreatePtrToInt(EmitScalarExpr(E->getArg(2)), IntType);
4599 
4600     auto Ordering =
4601       BuiltinID == Builtin::BI_InterlockedCompareExchangePointer_nf ?
4602       AtomicOrdering::Monotonic : AtomicOrdering::SequentiallyConsistent;
4603 
4604     auto Result = Builder.CreateAtomicCmpXchg(Destination, Comparand, Exchange,
4605                                               Ordering, Ordering);
4606     Result->setVolatile(true);
4607 
4608     return RValue::get(Builder.CreateIntToPtr(Builder.CreateExtractValue(Result,
4609                                                                          0),
4610                                               RTy));
4611   }
  // Integer compare-exchange of every width goes through a dedicated helper.
  case Builtin::BI_InterlockedCompareExchange8:
  case Builtin::BI_InterlockedCompareExchange16:
  case Builtin::BI_InterlockedCompareExchange:
  case Builtin::BI_InterlockedCompareExchange64:
    return RValue::get(EmitAtomicCmpXchgForMSIntrin(*this, E));
  // The remaining _Interlocked* read-modify-write builtins are grouped by
  // operation; each width of an operation maps to the same MSVCIntrin value
  // and is emitted by the shared MSVC intrinsic emitter.
  case Builtin::BI_InterlockedIncrement16:
  case Builtin::BI_InterlockedIncrement:
    return RValue::get(
        EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedIncrement, E));
  case Builtin::BI_InterlockedDecrement16:
  case Builtin::BI_InterlockedDecrement:
    return RValue::get(
        EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedDecrement, E));
  case Builtin::BI_InterlockedAnd8:
  case Builtin::BI_InterlockedAnd16:
  case Builtin::BI_InterlockedAnd:
    return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedAnd, E));
  case Builtin::BI_InterlockedExchangeAdd8:
  case Builtin::BI_InterlockedExchangeAdd16:
  case Builtin::BI_InterlockedExchangeAdd:
    return RValue::get(
        EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeAdd, E));
  case Builtin::BI_InterlockedExchangeSub8:
  case Builtin::BI_InterlockedExchangeSub16:
  case Builtin::BI_InterlockedExchangeSub:
    return RValue::get(
        EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeSub, E));
  case Builtin::BI_InterlockedOr8:
  case Builtin::BI_InterlockedOr16:
  case Builtin::BI_InterlockedOr:
    return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedOr, E));
  case Builtin::BI_InterlockedXor8:
  case Builtin::BI_InterlockedXor16:
  case Builtin::BI_InterlockedXor:
    return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedXor, E));
4647 
  // Every bit-test builtin (plain, complement/reset/set, and the interlocked
  // forms with their _acq/_rel/_nf suffixes) is handled by one helper, which
  // receives the builtin ID to tell the variants apart.
  case Builtin::BI_bittest64:
  case Builtin::BI_bittest:
  case Builtin::BI_bittestandcomplement64:
  case Builtin::BI_bittestandcomplement:
  case Builtin::BI_bittestandreset64:
  case Builtin::BI_bittestandreset:
  case Builtin::BI_bittestandset64:
  case Builtin::BI_bittestandset:
  case Builtin::BI_interlockedbittestandreset:
  case Builtin::BI_interlockedbittestandreset64:
  case Builtin::BI_interlockedbittestandset64:
  case Builtin::BI_interlockedbittestandset:
  case Builtin::BI_interlockedbittestandset_acq:
  case Builtin::BI_interlockedbittestandset_rel:
  case Builtin::BI_interlockedbittestandset_nf:
  case Builtin::BI_interlockedbittestandreset_acq:
  case Builtin::BI_interlockedbittestandreset_rel:
  case Builtin::BI_interlockedbittestandreset_nf:
    return RValue::get(EmitBitTestIntrinsic(*this, BuiltinID, E));
4667 
    // These builtins exist to emit regular volatile loads and stores not
    // affected by the -fms-volatile setting.
    // All load widths share one helper, and all store widths share another.
  case Builtin::BI__iso_volatile_load8:
  case Builtin::BI__iso_volatile_load16:
  case Builtin::BI__iso_volatile_load32:
  case Builtin::BI__iso_volatile_load64:
    return RValue::get(EmitISOVolatileLoad(*this, E));
  case Builtin::BI__iso_volatile_store8:
  case Builtin::BI__iso_volatile_store16:
  case Builtin::BI__iso_volatile_store32:
  case Builtin::BI__iso_volatile_store64:
    return RValue::get(EmitISOVolatileStore(*this, E));
4680 
  // SEH builtins. Each has two spellings (with one and with two leading
  // underscores) that are emitted identically by the matching SEH helper.
  case Builtin::BI__exception_code:
  case Builtin::BI_exception_code:
    return RValue::get(EmitSEHExceptionCode());
  case Builtin::BI__exception_info:
  case Builtin::BI_exception_info:
    return RValue::get(EmitSEHExceptionInfo());
  case Builtin::BI__abnormal_termination:
  case Builtin::BI_abnormal_termination:
    return RValue::get(EmitSEHAbnormalTermination());
4690   case Builtin::BI_setjmpex:
4691     if (getTarget().getTriple().isOSMSVCRT() && E->getNumArgs() == 1 &&
4692         E->getArg(0)->getType()->isPointerType())
4693       return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmpex, E);
4694     break;
4695   case Builtin::BI_setjmp:
4696     if (getTarget().getTriple().isOSMSVCRT() && E->getNumArgs() == 1 &&
4697         E->getArg(0)->getType()->isPointerType()) {
4698       if (getTarget().getTriple().getArch() == llvm::Triple::x86)
4699         return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmp3, E);
4700       else if (getTarget().getTriple().getArch() == llvm::Triple::aarch64)
4701         return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmpex, E);
4702       return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmp, E);
4703     }
4704     break;
4705 
4706   case Builtin::BI__GetExceptionInfo: {
4707     if (llvm::GlobalVariable *GV =
4708             CGM.getCXXABI().getThrowInfo(FD->getParamDecl(0)->getType()))
4709       return RValue::get(llvm::ConstantExpr::getBitCast(GV, CGM.Int8PtrTy));
4710     break;
4711   }
4712 
  // __fastfail is lowered by the shared MSVC intrinsic emitter.
  case Builtin::BI__fastfail:
    return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::__fastfail, E));
4715 
4716   case Builtin::BI__builtin_coro_size: {
4717     auto & Context = getContext();
4718     auto SizeTy = Context.getSizeType();
4719     auto T = Builder.getIntNTy(Context.getTypeSize(SizeTy));
4720     Function *F = CGM.getIntrinsic(Intrinsic::coro_size, T);
4721     return RValue::get(Builder.CreateCall(F));
4722   }
4723 
  // Each remaining __builtin_coro_* builtin maps one-to-one onto the
  // llvm.coro.* intrinsic of the same name and is emitted by a common helper.
  case Builtin::BI__builtin_coro_id:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_id);
  case Builtin::BI__builtin_coro_promise:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_promise);
  case Builtin::BI__builtin_coro_resume:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_resume);
  case Builtin::BI__builtin_coro_frame:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_frame);
  case Builtin::BI__builtin_coro_noop:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_noop);
  case Builtin::BI__builtin_coro_free:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_free);
  case Builtin::BI__builtin_coro_destroy:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_destroy);
  case Builtin::BI__builtin_coro_done:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_done);
  case Builtin::BI__builtin_coro_alloc:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_alloc);
  case Builtin::BI__builtin_coro_begin:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_begin);
  case Builtin::BI__builtin_coro_end:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_end);
  case Builtin::BI__builtin_coro_suspend:
    return EmitCoroutineIntrinsic(E, Intrinsic::coro_suspend);
4748 
4749   // OpenCL v2.0 s6.13.16.2, Built-in pipe read and write functions
4750   case Builtin::BIread_pipe:
4751   case Builtin::BIwrite_pipe: {
4752     Value *Arg0 = EmitScalarExpr(E->getArg(0)),
4753           *Arg1 = EmitScalarExpr(E->getArg(1));
4754     CGOpenCLRuntime OpenCLRT(CGM);
4755     Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
4756     Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));
4757 
4758     // Type of the generic packet parameter.
4759     unsigned GenericAS =
4760         getContext().getTargetAddressSpace(LangAS::opencl_generic);
4761     llvm::Type *I8PTy = llvm::PointerType::get(
4762         llvm::Type::getInt8Ty(getLLVMContext()), GenericAS);
4763 
4764     // Testing which overloaded version we should generate the call for.
4765     if (2U == E->getNumArgs()) {
4766       const char *Name = (BuiltinID == Builtin::BIread_pipe) ? "__read_pipe_2"
4767                                                              : "__write_pipe_2";
4768       // Creating a generic function type to be able to call with any builtin or
4769       // user defined type.
4770       llvm::Type *ArgTys[] = {Arg0->getType(), I8PTy, Int32Ty, Int32Ty};
4771       llvm::FunctionType *FTy = llvm::FunctionType::get(
4772           Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
4773       Value *BCast = Builder.CreatePointerCast(Arg1, I8PTy);
4774       return RValue::get(
4775           EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
4776                           {Arg0, BCast, PacketSize, PacketAlign}));
4777     } else {
4778       assert(4 == E->getNumArgs() &&
4779              "Illegal number of parameters to pipe function");
4780       const char *Name = (BuiltinID == Builtin::BIread_pipe) ? "__read_pipe_4"
4781                                                              : "__write_pipe_4";
4782 
4783       llvm::Type *ArgTys[] = {Arg0->getType(), Arg1->getType(), Int32Ty, I8PTy,
4784                               Int32Ty, Int32Ty};
4785       Value *Arg2 = EmitScalarExpr(E->getArg(2)),
4786             *Arg3 = EmitScalarExpr(E->getArg(3));
4787       llvm::FunctionType *FTy = llvm::FunctionType::get(
4788           Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
4789       Value *BCast = Builder.CreatePointerCast(Arg3, I8PTy);
4790       // We know the third argument is an integer type, but we may need to cast
4791       // it to i32.
4792       if (Arg2->getType() != Int32Ty)
4793         Arg2 = Builder.CreateZExtOrTrunc(Arg2, Int32Ty);
4794       return RValue::get(
4795           EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
4796                           {Arg0, Arg1, Arg2, BCast, PacketSize, PacketAlign}));
4797     }
4798   }
4799   // OpenCL v2.0 s6.13.16 ,s9.17.3.5 - Built-in pipe reserve read and write
4800   // functions
4801   case Builtin::BIreserve_read_pipe:
4802   case Builtin::BIreserve_write_pipe:
4803   case Builtin::BIwork_group_reserve_read_pipe:
4804   case Builtin::BIwork_group_reserve_write_pipe:
4805   case Builtin::BIsub_group_reserve_read_pipe:
4806   case Builtin::BIsub_group_reserve_write_pipe: {
4807     // Composing the mangled name for the function.
4808     const char *Name;
4809     if (BuiltinID == Builtin::BIreserve_read_pipe)
4810       Name = "__reserve_read_pipe";
4811     else if (BuiltinID == Builtin::BIreserve_write_pipe)
4812       Name = "__reserve_write_pipe";
4813     else if (BuiltinID == Builtin::BIwork_group_reserve_read_pipe)
4814       Name = "__work_group_reserve_read_pipe";
4815     else if (BuiltinID == Builtin::BIwork_group_reserve_write_pipe)
4816       Name = "__work_group_reserve_write_pipe";
4817     else if (BuiltinID == Builtin::BIsub_group_reserve_read_pipe)
4818       Name = "__sub_group_reserve_read_pipe";
4819     else
4820       Name = "__sub_group_reserve_write_pipe";
4821 
4822     Value *Arg0 = EmitScalarExpr(E->getArg(0)),
4823           *Arg1 = EmitScalarExpr(E->getArg(1));
4824     llvm::Type *ReservedIDTy = ConvertType(getContext().OCLReserveIDTy);
4825     CGOpenCLRuntime OpenCLRT(CGM);
4826     Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
4827     Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));
4828 
4829     // Building the generic function prototype.
4830     llvm::Type *ArgTys[] = {Arg0->getType(), Int32Ty, Int32Ty, Int32Ty};
4831     llvm::FunctionType *FTy = llvm::FunctionType::get(
4832         ReservedIDTy, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
4833     // We know the second argument is an integer type, but we may need to cast
4834     // it to i32.
4835     if (Arg1->getType() != Int32Ty)
4836       Arg1 = Builder.CreateZExtOrTrunc(Arg1, Int32Ty);
4837     return RValue::get(EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
4838                                        {Arg0, Arg1, PacketSize, PacketAlign}));
4839   }
4840   // OpenCL v2.0 s6.13.16, s9.17.3.5 - Built-in pipe commit read and write
4841   // functions
4842   case Builtin::BIcommit_read_pipe:
4843   case Builtin::BIcommit_write_pipe:
4844   case Builtin::BIwork_group_commit_read_pipe:
4845   case Builtin::BIwork_group_commit_write_pipe:
4846   case Builtin::BIsub_group_commit_read_pipe:
4847   case Builtin::BIsub_group_commit_write_pipe: {
4848     const char *Name;
4849     if (BuiltinID == Builtin::BIcommit_read_pipe)
4850       Name = "__commit_read_pipe";
4851     else if (BuiltinID == Builtin::BIcommit_write_pipe)
4852       Name = "__commit_write_pipe";
4853     else if (BuiltinID == Builtin::BIwork_group_commit_read_pipe)
4854       Name = "__work_group_commit_read_pipe";
4855     else if (BuiltinID == Builtin::BIwork_group_commit_write_pipe)
4856       Name = "__work_group_commit_write_pipe";
4857     else if (BuiltinID == Builtin::BIsub_group_commit_read_pipe)
4858       Name = "__sub_group_commit_read_pipe";
4859     else
4860       Name = "__sub_group_commit_write_pipe";
4861 
4862     Value *Arg0 = EmitScalarExpr(E->getArg(0)),
4863           *Arg1 = EmitScalarExpr(E->getArg(1));
4864     CGOpenCLRuntime OpenCLRT(CGM);
4865     Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
4866     Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));
4867 
4868     // Building the generic function prototype.
4869     llvm::Type *ArgTys[] = {Arg0->getType(), Arg1->getType(), Int32Ty, Int32Ty};
4870     llvm::FunctionType *FTy =
4871         llvm::FunctionType::get(llvm::Type::getVoidTy(getLLVMContext()),
4872                                 llvm::ArrayRef<llvm::Type *>(ArgTys), false);
4873 
4874     return RValue::get(EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
4875                                        {Arg0, Arg1, PacketSize, PacketAlign}));
4876   }
4877   // OpenCL v2.0 s6.13.16.4 Built-in pipe query functions
4878   case Builtin::BIget_pipe_num_packets:
4879   case Builtin::BIget_pipe_max_packets: {
4880     const char *BaseName;
4881     const auto *PipeTy = E->getArg(0)->getType()->castAs<PipeType>();
4882     if (BuiltinID == Builtin::BIget_pipe_num_packets)
4883       BaseName = "__get_pipe_num_packets";
4884     else
4885       BaseName = "__get_pipe_max_packets";
4886     std::string Name = std::string(BaseName) +
4887                        std::string(PipeTy->isReadOnly() ? "_ro" : "_wo");
4888 
4889     // Building the generic function prototype.
4890     Value *Arg0 = EmitScalarExpr(E->getArg(0));
4891     CGOpenCLRuntime OpenCLRT(CGM);
4892     Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
4893     Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));
4894     llvm::Type *ArgTys[] = {Arg0->getType(), Int32Ty, Int32Ty};
4895     llvm::FunctionType *FTy = llvm::FunctionType::get(
4896         Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
4897 
4898     return RValue::get(EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
4899                                        {Arg0, PacketSize, PacketAlign}));
4900   }
4901 
4902   // OpenCL v2.0 s6.13.9 - Address space qualifier functions.
4903   case Builtin::BIto_global:
4904   case Builtin::BIto_local:
4905   case Builtin::BIto_private: {
4906     auto Arg0 = EmitScalarExpr(E->getArg(0));
4907     auto NewArgT = llvm::PointerType::get(Int8Ty,
4908       CGM.getContext().getTargetAddressSpace(LangAS::opencl_generic));
4909     auto NewRetT = llvm::PointerType::get(Int8Ty,
4910       CGM.getContext().getTargetAddressSpace(
4911         E->getType()->getPointeeType().getAddressSpace()));
4912     auto FTy = llvm::FunctionType::get(NewRetT, {NewArgT}, false);
4913     llvm::Value *NewArg;
4914     if (Arg0->getType()->getPointerAddressSpace() !=
4915         NewArgT->getPointerAddressSpace())
4916       NewArg = Builder.CreateAddrSpaceCast(Arg0, NewArgT);
4917     else
4918       NewArg = Builder.CreateBitOrPointerCast(Arg0, NewArgT);
4919     auto NewName = std::string("__") + E->getDirectCallee()->getName().str();
4920     auto NewCall =
4921         EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, NewName), {NewArg});
4922     return RValue::get(Builder.CreateBitOrPointerCast(NewCall,
4923       ConvertType(E->getType())));
4924   }
4925 
4926   // OpenCL v2.0, s6.13.17 - Enqueue kernel function.
4927   // It contains four different overload formats specified in Table 6.13.17.1.
4928   case Builtin::BIenqueue_kernel: {
4929     StringRef Name; // Generated function call name
4930     unsigned NumArgs = E->getNumArgs();
4931 
4932     llvm::Type *QueueTy = ConvertType(getContext().OCLQueueTy);
4933     llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy(
4934         getContext().getTargetAddressSpace(LangAS::opencl_generic));
4935 
4936     llvm::Value *Queue = EmitScalarExpr(E->getArg(0));
4937     llvm::Value *Flags = EmitScalarExpr(E->getArg(1));
4938     LValue NDRangeL = EmitAggExprToLValue(E->getArg(2));
4939     llvm::Value *Range = NDRangeL.getAddress(*this).getPointer();
4940     llvm::Type *RangeTy = NDRangeL.getAddress(*this).getType();
4941 
4942     if (NumArgs == 4) {
4943       // The most basic form of the call with parameters:
4944       // queue_t, kernel_enqueue_flags_t, ndrange_t, block(void)
4945       Name = "__enqueue_kernel_basic";
4946       llvm::Type *ArgTys[] = {QueueTy, Int32Ty, RangeTy, GenericVoidPtrTy,
4947                               GenericVoidPtrTy};
4948       llvm::FunctionType *FTy = llvm::FunctionType::get(
4949           Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
4950 
4951       auto Info =
4952           CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(3));
4953       llvm::Value *Kernel =
4954           Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy);
4955       llvm::Value *Block =
4956           Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
4957 
4958       AttrBuilder B(Builder.getContext());
4959       B.addByValAttr(NDRangeL.getAddress(*this).getElementType());
4960       llvm::AttributeList ByValAttrSet =
4961           llvm::AttributeList::get(CGM.getModule().getContext(), 3U, B);
4962 
4963       auto RTCall =
4964           EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name, ByValAttrSet),
4965                           {Queue, Flags, Range, Kernel, Block});
4966       RTCall->setAttributes(ByValAttrSet);
4967       return RValue::get(RTCall);
4968     }
4969     assert(NumArgs >= 5 && "Invalid enqueue_kernel signature");
4970 
4971     // Create a temporary array to hold the sizes of local pointer arguments
4972     // for the block. \p First is the position of the first size argument.
4973     auto CreateArrayForSizeVar = [=](unsigned First)
4974         -> std::tuple<llvm::Value *, llvm::Value *, llvm::Value *> {
4975       llvm::APInt ArraySize(32, NumArgs - First);
4976       QualType SizeArrayTy = getContext().getConstantArrayType(
4977           getContext().getSizeType(), ArraySize, nullptr, ArrayType::Normal,
4978           /*IndexTypeQuals=*/0);
4979       auto Tmp = CreateMemTemp(SizeArrayTy, "block_sizes");
4980       llvm::Value *TmpPtr = Tmp.getPointer();
4981       llvm::Value *TmpSize = EmitLifetimeStart(
4982           CGM.getDataLayout().getTypeAllocSize(Tmp.getElementType()), TmpPtr);
4983       llvm::Value *ElemPtr;
4984       // Each of the following arguments specifies the size of the corresponding
4985       // argument passed to the enqueued block.
4986       auto *Zero = llvm::ConstantInt::get(IntTy, 0);
4987       for (unsigned I = First; I < NumArgs; ++I) {
4988         auto *Index = llvm::ConstantInt::get(IntTy, I - First);
4989         auto *GEP = Builder.CreateGEP(Tmp.getElementType(), TmpPtr,
4990                                       {Zero, Index});
4991         if (I == First)
4992           ElemPtr = GEP;
4993         auto *V =
4994             Builder.CreateZExtOrTrunc(EmitScalarExpr(E->getArg(I)), SizeTy);
4995         Builder.CreateAlignedStore(
4996             V, GEP, CGM.getDataLayout().getPrefTypeAlign(SizeTy));
4997       }
4998       return std::tie(ElemPtr, TmpSize, TmpPtr);
4999     };
5000 
5001     // Could have events and/or varargs.
5002     if (E->getArg(3)->getType()->isBlockPointerType()) {
5003       // No events passed, but has variadic arguments.
5004       Name = "__enqueue_kernel_varargs";
5005       auto Info =
5006           CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(3));
5007       llvm::Value *Kernel =
5008           Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy);
5009       auto *Block = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
5010       llvm::Value *ElemPtr, *TmpSize, *TmpPtr;
5011       std::tie(ElemPtr, TmpSize, TmpPtr) = CreateArrayForSizeVar(4);
5012 
5013       // Create a vector of the arguments, as well as a constant value to
5014       // express to the runtime the number of variadic arguments.
5015       llvm::Value *const Args[] = {Queue,  Flags,
5016                                    Range,  Kernel,
5017                                    Block,  ConstantInt::get(IntTy, NumArgs - 4),
5018                                    ElemPtr};
5019       llvm::Type *const ArgTys[] = {
5020           QueueTy,          IntTy, RangeTy,           GenericVoidPtrTy,
5021           GenericVoidPtrTy, IntTy, ElemPtr->getType()};
5022 
5023       llvm::FunctionType *FTy = llvm::FunctionType::get(Int32Ty, ArgTys, false);
5024       auto Call = RValue::get(
5025           EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Args));
5026       if (TmpSize)
5027         EmitLifetimeEnd(TmpSize, TmpPtr);
5028       return Call;
5029     }
5030     // Any calls now have event arguments passed.
5031     if (NumArgs >= 7) {
5032       llvm::Type *EventTy = ConvertType(getContext().OCLClkEventTy);
5033       llvm::PointerType *EventPtrTy = EventTy->getPointerTo(
5034           CGM.getContext().getTargetAddressSpace(LangAS::opencl_generic));
5035 
5036       llvm::Value *NumEvents =
5037           Builder.CreateZExtOrTrunc(EmitScalarExpr(E->getArg(3)), Int32Ty);
5038 
5039       // Since SemaOpenCLBuiltinEnqueueKernel allows fifth and sixth arguments
5040       // to be a null pointer constant (including `0` literal), we can take it
5041       // into account and emit null pointer directly.
5042       llvm::Value *EventWaitList = nullptr;
5043       if (E->getArg(4)->isNullPointerConstant(
5044               getContext(), Expr::NPC_ValueDependentIsNotNull)) {
5045         EventWaitList = llvm::ConstantPointerNull::get(EventPtrTy);
5046       } else {
5047         EventWaitList = E->getArg(4)->getType()->isArrayType()
5048                         ? EmitArrayToPointerDecay(E->getArg(4)).getPointer()
5049                         : EmitScalarExpr(E->getArg(4));
5050         // Convert to generic address space.
5051         EventWaitList = Builder.CreatePointerCast(EventWaitList, EventPtrTy);
5052       }
5053       llvm::Value *EventRet = nullptr;
5054       if (E->getArg(5)->isNullPointerConstant(
5055               getContext(), Expr::NPC_ValueDependentIsNotNull)) {
5056         EventRet = llvm::ConstantPointerNull::get(EventPtrTy);
5057       } else {
5058         EventRet =
5059             Builder.CreatePointerCast(EmitScalarExpr(E->getArg(5)), EventPtrTy);
5060       }
5061 
5062       auto Info =
5063           CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(6));
5064       llvm::Value *Kernel =
5065           Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy);
5066       llvm::Value *Block =
5067           Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
5068 
5069       std::vector<llvm::Type *> ArgTys = {
5070           QueueTy,    Int32Ty,    RangeTy,          Int32Ty,
5071           EventPtrTy, EventPtrTy, GenericVoidPtrTy, GenericVoidPtrTy};
5072 
5073       std::vector<llvm::Value *> Args = {Queue,     Flags,         Range,
5074                                          NumEvents, EventWaitList, EventRet,
5075                                          Kernel,    Block};
5076 
5077       if (NumArgs == 7) {
5078         // Has events but no variadics.
5079         Name = "__enqueue_kernel_basic_events";
5080         llvm::FunctionType *FTy = llvm::FunctionType::get(
5081             Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
5082         return RValue::get(
5083             EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
5084                             llvm::ArrayRef<llvm::Value *>(Args)));
5085       }
5086       // Has event info and variadics
5087       // Pass the number of variadics to the runtime function too.
5088       Args.push_back(ConstantInt::get(Int32Ty, NumArgs - 7));
5089       ArgTys.push_back(Int32Ty);
5090       Name = "__enqueue_kernel_events_varargs";
5091 
5092       llvm::Value *ElemPtr, *TmpSize, *TmpPtr;
5093       std::tie(ElemPtr, TmpSize, TmpPtr) = CreateArrayForSizeVar(7);
5094       Args.push_back(ElemPtr);
5095       ArgTys.push_back(ElemPtr->getType());
5096 
5097       llvm::FunctionType *FTy = llvm::FunctionType::get(
5098           Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
5099       auto Call =
5100           RValue::get(EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
5101                                       llvm::ArrayRef<llvm::Value *>(Args)));
5102       if (TmpSize)
5103         EmitLifetimeEnd(TmpSize, TmpPtr);
5104       return Call;
5105     }
5106     LLVM_FALLTHROUGH;
5107   }
5108   // OpenCL v2.0 s6.13.17.6 - Kernel query functions need bitcast of block
5109   // parameter.
5110   case Builtin::BIget_kernel_work_group_size: {
5111     llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy(
5112         getContext().getTargetAddressSpace(LangAS::opencl_generic));
5113     auto Info =
5114         CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(0));
5115     Value *Kernel = Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy);
5116     Value *Arg = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
5117     return RValue::get(EmitRuntimeCall(
5118         CGM.CreateRuntimeFunction(
5119             llvm::FunctionType::get(IntTy, {GenericVoidPtrTy, GenericVoidPtrTy},
5120                                     false),
5121             "__get_kernel_work_group_size_impl"),
5122         {Kernel, Arg}));
5123   }
5124   case Builtin::BIget_kernel_preferred_work_group_size_multiple: {
5125     llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy(
5126         getContext().getTargetAddressSpace(LangAS::opencl_generic));
5127     auto Info =
5128         CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(0));
5129     Value *Kernel = Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy);
5130     Value *Arg = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
5131     return RValue::get(EmitRuntimeCall(
5132         CGM.CreateRuntimeFunction(
5133             llvm::FunctionType::get(IntTy, {GenericVoidPtrTy, GenericVoidPtrTy},
5134                                     false),
5135             "__get_kernel_preferred_work_group_size_multiple_impl"),
5136         {Kernel, Arg}));
5137   }
5138   case Builtin::BIget_kernel_max_sub_group_size_for_ndrange:
5139   case Builtin::BIget_kernel_sub_group_count_for_ndrange: {
5140     llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy(
5141         getContext().getTargetAddressSpace(LangAS::opencl_generic));
5142     LValue NDRangeL = EmitAggExprToLValue(E->getArg(0));
5143     llvm::Value *NDRange = NDRangeL.getAddress(*this).getPointer();
5144     auto Info =
5145         CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(1));
5146     Value *Kernel = Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy);
5147     Value *Block = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
5148     const char *Name =
5149         BuiltinID == Builtin::BIget_kernel_max_sub_group_size_for_ndrange
5150             ? "__get_kernel_max_sub_group_size_for_ndrange_impl"
5151             : "__get_kernel_sub_group_count_for_ndrange_impl";
5152     return RValue::get(EmitRuntimeCall(
5153         CGM.CreateRuntimeFunction(
5154             llvm::FunctionType::get(
5155                 IntTy, {NDRange->getType(), GenericVoidPtrTy, GenericVoidPtrTy},
5156                 false),
5157             Name),
5158         {NDRange, Kernel, Block}));
5159   }
5160 
5161   case Builtin::BI__builtin_store_half:
5162   case Builtin::BI__builtin_store_halff: {
5163     Value *Val = EmitScalarExpr(E->getArg(0));
5164     Address Address = EmitPointerWithAlignment(E->getArg(1));
5165     Value *HalfVal = Builder.CreateFPTrunc(Val, Builder.getHalfTy());
5166     return RValue::get(Builder.CreateStore(HalfVal, Address));
5167   }
5168   case Builtin::BI__builtin_load_half: {
5169     Address Address = EmitPointerWithAlignment(E->getArg(0));
5170     Value *HalfVal = Builder.CreateLoad(Address);
5171     return RValue::get(Builder.CreateFPExt(HalfVal, Builder.getDoubleTy()));
5172   }
5173   case Builtin::BI__builtin_load_halff: {
5174     Address Address = EmitPointerWithAlignment(E->getArg(0));
5175     Value *HalfVal = Builder.CreateLoad(Address);
5176     return RValue::get(Builder.CreateFPExt(HalfVal, Builder.getFloatTy()));
5177   }
5178   case Builtin::BIprintf:
5179     if (getTarget().getTriple().isNVPTX() ||
5180         getTarget().getTriple().isAMDGCN()) {
5181       if (getLangOpts().OpenMPIsDevice)
5182         return EmitOpenMPDevicePrintfCallExpr(E);
5183       if (getTarget().getTriple().isNVPTX())
5184         return EmitNVPTXDevicePrintfCallExpr(E);
5185       if (getTarget().getTriple().isAMDGCN() && getLangOpts().HIP)
5186         return EmitAMDGPUDevicePrintfCallExpr(E);
5187     }
5188 
5189     break;
5190   case Builtin::BI__builtin_canonicalize:
5191   case Builtin::BI__builtin_canonicalizef:
5192   case Builtin::BI__builtin_canonicalizef16:
5193   case Builtin::BI__builtin_canonicalizel:
5194     return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::canonicalize));
5195 
5196   case Builtin::BI__builtin_thread_pointer: {
5197     if (!getContext().getTargetInfo().isTLSSupported())
5198       CGM.ErrorUnsupported(E, "__builtin_thread_pointer");
5199     // Fall through - it's already mapped to the intrinsic by GCCBuiltin.
5200     break;
5201   }
5202   case Builtin::BI__builtin_os_log_format:
5203     return emitBuiltinOSLogFormat(*E);
5204 
5205   case Builtin::BI__xray_customevent: {
5206     if (!ShouldXRayInstrumentFunction())
5207       return RValue::getIgnored();
5208 
5209     if (!CGM.getCodeGenOpts().XRayInstrumentationBundle.has(
5210             XRayInstrKind::Custom))
5211       return RValue::getIgnored();
5212 
5213     if (const auto *XRayAttr = CurFuncDecl->getAttr<XRayInstrumentAttr>())
5214       if (XRayAttr->neverXRayInstrument() && !AlwaysEmitXRayCustomEvents())
5215         return RValue::getIgnored();
5216 
5217     Function *F = CGM.getIntrinsic(Intrinsic::xray_customevent);
5218     auto FTy = F->getFunctionType();
5219     auto Arg0 = E->getArg(0);
5220     auto Arg0Val = EmitScalarExpr(Arg0);
5221     auto Arg0Ty = Arg0->getType();
5222     auto PTy0 = FTy->getParamType(0);
5223     if (PTy0 != Arg0Val->getType()) {
5224       if (Arg0Ty->isArrayType())
5225         Arg0Val = EmitArrayToPointerDecay(Arg0).getPointer();
5226       else
5227         Arg0Val = Builder.CreatePointerCast(Arg0Val, PTy0);
5228     }
5229     auto Arg1 = EmitScalarExpr(E->getArg(1));
5230     auto PTy1 = FTy->getParamType(1);
5231     if (PTy1 != Arg1->getType())
5232       Arg1 = Builder.CreateTruncOrBitCast(Arg1, PTy1);
5233     return RValue::get(Builder.CreateCall(F, {Arg0Val, Arg1}));
5234   }
5235 
5236   case Builtin::BI__xray_typedevent: {
5237     // TODO: There should be a way to always emit events even if the current
5238     // function is not instrumented. Losing events in a stream can cripple
5239     // a trace.
5240     if (!ShouldXRayInstrumentFunction())
5241       return RValue::getIgnored();
5242 
5243     if (!CGM.getCodeGenOpts().XRayInstrumentationBundle.has(
5244             XRayInstrKind::Typed))
5245       return RValue::getIgnored();
5246 
5247     if (const auto *XRayAttr = CurFuncDecl->getAttr<XRayInstrumentAttr>())
5248       if (XRayAttr->neverXRayInstrument() && !AlwaysEmitXRayTypedEvents())
5249         return RValue::getIgnored();
5250 
5251     Function *F = CGM.getIntrinsic(Intrinsic::xray_typedevent);
5252     auto FTy = F->getFunctionType();
5253     auto Arg0 = EmitScalarExpr(E->getArg(0));
5254     auto PTy0 = FTy->getParamType(0);
5255     if (PTy0 != Arg0->getType())
5256       Arg0 = Builder.CreateTruncOrBitCast(Arg0, PTy0);
5257     auto Arg1 = E->getArg(1);
5258     auto Arg1Val = EmitScalarExpr(Arg1);
5259     auto Arg1Ty = Arg1->getType();
5260     auto PTy1 = FTy->getParamType(1);
5261     if (PTy1 != Arg1Val->getType()) {
5262       if (Arg1Ty->isArrayType())
5263         Arg1Val = EmitArrayToPointerDecay(Arg1).getPointer();
5264       else
5265         Arg1Val = Builder.CreatePointerCast(Arg1Val, PTy1);
5266     }
5267     auto Arg2 = EmitScalarExpr(E->getArg(2));
5268     auto PTy2 = FTy->getParamType(2);
5269     if (PTy2 != Arg2->getType())
5270       Arg2 = Builder.CreateTruncOrBitCast(Arg2, PTy2);
5271     return RValue::get(Builder.CreateCall(F, {Arg0, Arg1Val, Arg2}));
5272   }
5273 
5274   case Builtin::BI__builtin_ms_va_start:
5275   case Builtin::BI__builtin_ms_va_end:
5276     return RValue::get(
5277         EmitVAStartEnd(EmitMSVAListRef(E->getArg(0)).getPointer(),
5278                        BuiltinID == Builtin::BI__builtin_ms_va_start));
5279 
5280   case Builtin::BI__builtin_ms_va_copy: {
5281     // Lower this manually. We can't reliably determine whether or not any
5282     // given va_copy() is for a Win64 va_list from the calling convention
5283     // alone, because it's legal to do this from a System V ABI function.
5284     // With opaque pointer types, we won't have enough information in LLVM
5285     // IR to determine this from the argument types, either. Best to do it
5286     // now, while we have enough information.
5287     Address DestAddr = EmitMSVAListRef(E->getArg(0));
5288     Address SrcAddr = EmitMSVAListRef(E->getArg(1));
5289 
5290     llvm::Type *BPP = Int8PtrPtrTy;
5291 
5292     DestAddr = Address(Builder.CreateBitCast(DestAddr.getPointer(), BPP, "cp"),
5293                        Int8PtrTy, DestAddr.getAlignment());
5294     SrcAddr = Address(Builder.CreateBitCast(SrcAddr.getPointer(), BPP, "ap"),
5295                       Int8PtrTy, SrcAddr.getAlignment());
5296 
5297     Value *ArgPtr = Builder.CreateLoad(SrcAddr, "ap.val");
5298     return RValue::get(Builder.CreateStore(ArgPtr, DestAddr));
5299   }
5300 
5301   case Builtin::BI__builtin_get_device_side_mangled_name: {
5302     auto Name = CGM.getCUDARuntime().getDeviceSideName(
5303         cast<DeclRefExpr>(E->getArg(0)->IgnoreImpCasts())->getDecl());
5304     auto Str = CGM.GetAddrOfConstantCString(Name, "");
5305     llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
5306                                llvm::ConstantInt::get(SizeTy, 0)};
5307     auto *Ptr = llvm::ConstantExpr::getGetElementPtr(Str.getElementType(),
5308                                                      Str.getPointer(), Zeros);
5309     return RValue::get(Ptr);
5310   }
5311   }
5312 
5313   // If this is an alias for a lib function (e.g. __builtin_sin), emit
5314   // the call using the normal call path, but using the unmangled
5315   // version of the function name.
5316   if (getContext().BuiltinInfo.isLibFunction(BuiltinID))
5317     return emitLibraryCall(*this, FD, E,
5318                            CGM.getBuiltinLibFunction(FD, BuiltinID));
5319 
5320   // If this is a predefined lib function (e.g. malloc), emit the call
5321   // using exactly the normal call path.
5322   if (getContext().BuiltinInfo.isPredefinedLibFunction(BuiltinID))
5323     return emitLibraryCall(*this, FD, E,
5324                       cast<llvm::Constant>(EmitScalarExpr(E->getCallee())));
5325 
5326   // Check that a call to a target specific builtin has the correct target
5327   // features.
5328   // This is down here to avoid non-target specific builtins, however, if
5329   // generic builtins start to require generic target features then we
5330   // can move this up to the beginning of the function.
5331   checkTargetFeatures(E, FD);
5332 
5333   if (unsigned VectorWidth = getContext().BuiltinInfo.getRequiredVectorWidth(BuiltinID))
5334     LargestVectorWidth = std::max(LargestVectorWidth, VectorWidth);
5335 
5336   // See if we have a target specific intrinsic.
5337   const char *Name = getContext().BuiltinInfo.getName(BuiltinID);
5338   Intrinsic::ID IntrinsicID = Intrinsic::not_intrinsic;
5339   StringRef Prefix =
5340       llvm::Triple::getArchTypePrefix(getTarget().getTriple().getArch());
5341   if (!Prefix.empty()) {
5342     IntrinsicID = Intrinsic::getIntrinsicForGCCBuiltin(Prefix.data(), Name);
5343     // NOTE we don't need to perform a compatibility flag check here since the
5344     // intrinsics are declared in Builtins*.def via LANGBUILTIN which filter the
5345     // MS builtins via ALL_MS_LANGUAGES and are filtered earlier.
5346     if (IntrinsicID == Intrinsic::not_intrinsic)
5347       IntrinsicID = Intrinsic::getIntrinsicForMSBuiltin(Prefix.data(), Name);
5348   }
5349 
5350   if (IntrinsicID != Intrinsic::not_intrinsic) {
5351     SmallVector<Value*, 16> Args;
5352 
5353     // Find out if any arguments are required to be integer constant
5354     // expressions.
5355     unsigned ICEArguments = 0;
5356     ASTContext::GetBuiltinTypeError Error;
5357     getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
5358     assert(Error == ASTContext::GE_None && "Should not codegen an error");
5359 
5360     Function *F = CGM.getIntrinsic(IntrinsicID);
5361     llvm::FunctionType *FTy = F->getFunctionType();
5362 
5363     for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) {
5364       Value *ArgValue;
5365       // If this is a normal argument, just emit it as a scalar.
5366       if ((ICEArguments & (1 << i)) == 0) {
5367         ArgValue = EmitScalarExpr(E->getArg(i));
5368       } else {
5369         // If this is required to be a constant, constant fold it so that we
5370         // know that the generated intrinsic gets a ConstantInt.
5371         ArgValue = llvm::ConstantInt::get(
5372             getLLVMContext(),
5373             *E->getArg(i)->getIntegerConstantExpr(getContext()));
5374       }
5375 
5376       // If the intrinsic arg type is different from the builtin arg type
5377       // we need to do a bit cast.
5378       llvm::Type *PTy = FTy->getParamType(i);
5379       if (PTy != ArgValue->getType()) {
5380         // XXX - vector of pointers?
5381         if (auto *PtrTy = dyn_cast<llvm::PointerType>(PTy)) {
5382           if (PtrTy->getAddressSpace() !=
5383               ArgValue->getType()->getPointerAddressSpace()) {
5384             ArgValue = Builder.CreateAddrSpaceCast(
5385               ArgValue,
5386               ArgValue->getType()->getPointerTo(PtrTy->getAddressSpace()));
5387           }
5388         }
5389 
5390         assert(PTy->canLosslesslyBitCastTo(FTy->getParamType(i)) &&
5391                "Must be able to losslessly bit cast to param");
5392         ArgValue = Builder.CreateBitCast(ArgValue, PTy);
5393       }
5394 
5395       Args.push_back(ArgValue);
5396     }
5397 
5398     Value *V = Builder.CreateCall(F, Args);
5399     QualType BuiltinRetType = E->getType();
5400 
5401     llvm::Type *RetTy = VoidTy;
5402     if (!BuiltinRetType->isVoidType())
5403       RetTy = ConvertType(BuiltinRetType);
5404 
5405     if (RetTy != V->getType()) {
5406       // XXX - vector of pointers?
5407       if (auto *PtrTy = dyn_cast<llvm::PointerType>(RetTy)) {
5408         if (PtrTy->getAddressSpace() != V->getType()->getPointerAddressSpace()) {
5409           V = Builder.CreateAddrSpaceCast(
5410             V, V->getType()->getPointerTo(PtrTy->getAddressSpace()));
5411         }
5412       }
5413 
5414       assert(V->getType()->canLosslesslyBitCastTo(RetTy) &&
5415              "Must be able to losslessly bit cast result type");
5416       V = Builder.CreateBitCast(V, RetTy);
5417     }
5418 
5419     return RValue::get(V);
5420   }
5421 
5422   // Some target-specific builtins can have aggregate return values, e.g.
5423   // __builtin_arm_mve_vld2q_u32. So if the result is an aggregate, force
5424   // ReturnValue to be non-null, so that the target-specific emission code can
5425   // always just emit into it.
5426   TypeEvaluationKind EvalKind = getEvaluationKind(E->getType());
5427   if (EvalKind == TEK_Aggregate && ReturnValue.isNull()) {
5428     Address DestPtr = CreateMemTemp(E->getType(), "agg.tmp");
5429     ReturnValue = ReturnValueSlot(DestPtr, false);
5430   }
5431 
5432   // Now see if we can emit a target-specific builtin.
5433   if (Value *V = EmitTargetBuiltinExpr(BuiltinID, E, ReturnValue)) {
5434     switch (EvalKind) {
5435     case TEK_Scalar:
5436       return RValue::get(V);
5437     case TEK_Aggregate:
5438       return RValue::getAggregate(ReturnValue.getValue(),
5439                                   ReturnValue.isVolatile());
5440     case TEK_Complex:
5441       llvm_unreachable("No current target builtin returns complex");
5442     }
5443     llvm_unreachable("Bad evaluation kind in EmitBuiltinExpr");
5444   }
5445 
5446   ErrorUnsupported(E, "builtin function");
5447 
5448   // Unknown builtin, for now just dump it out and return undef.
5449   return GetUndefRValue(E->getType());
5450 }
5451 
5452 static Value *EmitTargetArchBuiltinExpr(CodeGenFunction *CGF,
5453                                         unsigned BuiltinID, const CallExpr *E,
5454                                         ReturnValueSlot ReturnValue,
5455                                         llvm::Triple::ArchType Arch) {
5456   switch (Arch) {
5457   case llvm::Triple::arm:
5458   case llvm::Triple::armeb:
5459   case llvm::Triple::thumb:
5460   case llvm::Triple::thumbeb:
5461     return CGF->EmitARMBuiltinExpr(BuiltinID, E, ReturnValue, Arch);
5462   case llvm::Triple::aarch64:
5463   case llvm::Triple::aarch64_32:
5464   case llvm::Triple::aarch64_be:
5465     return CGF->EmitAArch64BuiltinExpr(BuiltinID, E, Arch);
5466   case llvm::Triple::bpfeb:
5467   case llvm::Triple::bpfel:
5468     return CGF->EmitBPFBuiltinExpr(BuiltinID, E);
5469   case llvm::Triple::x86:
5470   case llvm::Triple::x86_64:
5471     return CGF->EmitX86BuiltinExpr(BuiltinID, E);
5472   case llvm::Triple::ppc:
5473   case llvm::Triple::ppcle:
5474   case llvm::Triple::ppc64:
5475   case llvm::Triple::ppc64le:
5476     return CGF->EmitPPCBuiltinExpr(BuiltinID, E);
5477   case llvm::Triple::r600:
5478   case llvm::Triple::amdgcn:
5479     return CGF->EmitAMDGPUBuiltinExpr(BuiltinID, E);
5480   case llvm::Triple::systemz:
5481     return CGF->EmitSystemZBuiltinExpr(BuiltinID, E);
5482   case llvm::Triple::nvptx:
5483   case llvm::Triple::nvptx64:
5484     return CGF->EmitNVPTXBuiltinExpr(BuiltinID, E);
5485   case llvm::Triple::wasm32:
5486   case llvm::Triple::wasm64:
5487     return CGF->EmitWebAssemblyBuiltinExpr(BuiltinID, E);
5488   case llvm::Triple::hexagon:
5489     return CGF->EmitHexagonBuiltinExpr(BuiltinID, E);
5490   case llvm::Triple::riscv32:
5491   case llvm::Triple::riscv64:
5492     return CGF->EmitRISCVBuiltinExpr(BuiltinID, E, ReturnValue);
5493   default:
5494     return nullptr;
5495   }
5496 }
5497 
5498 Value *CodeGenFunction::EmitTargetBuiltinExpr(unsigned BuiltinID,
5499                                               const CallExpr *E,
5500                                               ReturnValueSlot ReturnValue) {
5501   if (getContext().BuiltinInfo.isAuxBuiltinID(BuiltinID)) {
5502     assert(getContext().getAuxTargetInfo() && "Missing aux target info");
5503     return EmitTargetArchBuiltinExpr(
5504         this, getContext().BuiltinInfo.getAuxBuiltinID(BuiltinID), E,
5505         ReturnValue, getContext().getAuxTargetInfo()->getTriple().getArch());
5506   }
5507 
5508   return EmitTargetArchBuiltinExpr(this, BuiltinID, E, ReturnValue,
5509                                    getTarget().getTriple().getArch());
5510 }
5511 
5512 static llvm::FixedVectorType *GetNeonType(CodeGenFunction *CGF,
5513                                           NeonTypeFlags TypeFlags,
5514                                           bool HasLegalHalfType = true,
5515                                           bool V1Ty = false,
5516                                           bool AllowBFloatArgsAndRet = true) {
5517   int IsQuad = TypeFlags.isQuad();
5518   switch (TypeFlags.getEltType()) {
5519   case NeonTypeFlags::Int8:
5520   case NeonTypeFlags::Poly8:
5521     return llvm::FixedVectorType::get(CGF->Int8Ty, V1Ty ? 1 : (8 << IsQuad));
5522   case NeonTypeFlags::Int16:
5523   case NeonTypeFlags::Poly16:
5524     return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
5525   case NeonTypeFlags::BFloat16:
5526     if (AllowBFloatArgsAndRet)
5527       return llvm::FixedVectorType::get(CGF->BFloatTy, V1Ty ? 1 : (4 << IsQuad));
5528     else
5529       return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
5530   case NeonTypeFlags::Float16:
5531     if (HasLegalHalfType)
5532       return llvm::FixedVectorType::get(CGF->HalfTy, V1Ty ? 1 : (4 << IsQuad));
5533     else
5534       return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
5535   case NeonTypeFlags::Int32:
5536     return llvm::FixedVectorType::get(CGF->Int32Ty, V1Ty ? 1 : (2 << IsQuad));
5537   case NeonTypeFlags::Int64:
5538   case NeonTypeFlags::Poly64:
5539     return llvm::FixedVectorType::get(CGF->Int64Ty, V1Ty ? 1 : (1 << IsQuad));
5540   case NeonTypeFlags::Poly128:
5541     // FIXME: i128 and f128 doesn't get fully support in Clang and llvm.
5542     // There is a lot of i128 and f128 API missing.
5543     // so we use v16i8 to represent poly128 and get pattern matched.
5544     return llvm::FixedVectorType::get(CGF->Int8Ty, 16);
5545   case NeonTypeFlags::Float32:
5546     return llvm::FixedVectorType::get(CGF->FloatTy, V1Ty ? 1 : (2 << IsQuad));
5547   case NeonTypeFlags::Float64:
5548     return llvm::FixedVectorType::get(CGF->DoubleTy, V1Ty ? 1 : (1 << IsQuad));
5549   }
5550   llvm_unreachable("Unknown vector element type!");
5551 }
5552 
5553 static llvm::VectorType *GetFloatNeonType(CodeGenFunction *CGF,
5554                                           NeonTypeFlags IntTypeFlags) {
5555   int IsQuad = IntTypeFlags.isQuad();
5556   switch (IntTypeFlags.getEltType()) {
5557   case NeonTypeFlags::Int16:
5558     return llvm::FixedVectorType::get(CGF->HalfTy, (4 << IsQuad));
5559   case NeonTypeFlags::Int32:
5560     return llvm::FixedVectorType::get(CGF->FloatTy, (2 << IsQuad));
5561   case NeonTypeFlags::Int64:
5562     return llvm::FixedVectorType::get(CGF->DoubleTy, (1 << IsQuad));
5563   default:
5564     llvm_unreachable("Type can't be converted to floating-point!");
5565   }
5566 }
5567 
5568 Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C,
5569                                       const ElementCount &Count) {
5570   Value *SV = llvm::ConstantVector::getSplat(Count, C);
5571   return Builder.CreateShuffleVector(V, V, SV, "lane");
5572 }
5573 
5574 Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C) {
5575   ElementCount EC = cast<llvm::VectorType>(V->getType())->getElementCount();
5576   return EmitNeonSplat(V, C, EC);
5577 }
5578 
5579 Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value*> &Ops,
5580                                      const char *name,
5581                                      unsigned shift, bool rightshift) {
5582   unsigned j = 0;
5583   for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
5584        ai != ae; ++ai, ++j) {
5585     if (F->isConstrainedFPIntrinsic())
5586       if (ai->getType()->isMetadataTy())
5587         continue;
5588     if (shift > 0 && shift == j)
5589       Ops[j] = EmitNeonShiftVector(Ops[j], ai->getType(), rightshift);
5590     else
5591       Ops[j] = Builder.CreateBitCast(Ops[j], ai->getType(), name);
5592   }
5593 
5594   if (F->isConstrainedFPIntrinsic())
5595     return Builder.CreateConstrainedFPCall(F, Ops, name);
5596   else
5597     return Builder.CreateCall(F, Ops, name);
5598 }
5599 
5600 Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
5601                                             bool neg) {
5602   int SV = cast<ConstantInt>(V)->getSExtValue();
5603   return ConstantInt::get(Ty, neg ? -SV : SV);
5604 }
5605 
5606 // Right-shift a vector by a constant.
5607 Value *CodeGenFunction::EmitNeonRShiftImm(Value *Vec, Value *Shift,
5608                                           llvm::Type *Ty, bool usgn,
5609                                           const char *name) {
5610   llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);
5611 
5612   int ShiftAmt = cast<ConstantInt>(Shift)->getSExtValue();
5613   int EltSize = VTy->getScalarSizeInBits();
5614 
5615   Vec = Builder.CreateBitCast(Vec, Ty);
5616 
5617   // lshr/ashr are undefined when the shift amount is equal to the vector
5618   // element size.
5619   if (ShiftAmt == EltSize) {
5620     if (usgn) {
5621       // Right-shifting an unsigned value by its size yields 0.
5622       return llvm::ConstantAggregateZero::get(VTy);
5623     } else {
5624       // Right-shifting a signed value by its size is equivalent
5625       // to a shift of size-1.
5626       --ShiftAmt;
5627       Shift = ConstantInt::get(VTy->getElementType(), ShiftAmt);
5628     }
5629   }
5630 
5631   Shift = EmitNeonShiftVector(Shift, Ty, false);
5632   if (usgn)
5633     return Builder.CreateLShr(Vec, Shift, name);
5634   else
5635     return Builder.CreateAShr(Vec, Shift, name);
5636 }
5637 
// Bit flags that are OR-ed together to form the TypeModifier column of the
// NEON intrinsic tables.
// NOTE(review): how each flag is interpreted is decided by the code that
// consumes the tables, which is outside this section — confirm there before
// changing a value.
enum {
  // Individual flag bits.
  AddRetType = 0x001,
  Add1ArgType = 0x002,
  Add2ArgTypes = 0x004,
  VectorizeRetType = 0x008,
  VectorizeArgTypes = 0x010,
  InventFloatType = 0x020,
  UnsignedAlts = 0x040,
  Use64BitVectors = 0x080,
  Use128BitVectors = 0x100,

  // Named combinations of the bits above.
  Vectorize1ArgType = VectorizeArgTypes | Add1ArgType,
  VectorRet = VectorizeRetType | AddRetType,
  VectorRetGetArgs01 =
      VectorizeArgTypes | VectorizeRetType | Add2ArgTypes | AddRetType,
  FpCmpzModifiers =
      InventFloatType | Add1ArgType | VectorizeRetType | AddRetType
};
5659 
namespace {
/// One row of a NEON builtin lookup table, as built by the NEONMAP0/1/2
/// macros.
struct ARMVectorIntrinsicInfo {
  const char *NameHint;      // Stringified builtin base name.
  unsigned BuiltinID;        // NEON::BI__builtin_neon_<name>.
  unsigned LLVMIntrinsic;    // Intrinsic ID, or 0 when the row has none.
  unsigned AltLLVMIntrinsic; // Second intrinsic ID, or 0 when there is none.
  uint64_t TypeModifier;     // OR of the type-modifier flag bits.

  /// Rows are ordered by builtin ID.
  bool operator<(const ARMVectorIntrinsicInfo &Other) const {
    return *this < Other.BuiltinID;
  }

  /// Compare a row directly against a bare builtin ID.
  bool operator<(unsigned RHSBuiltinID) const {
    return RHSBuiltinID > BuiltinID;
  }
};
} // end anonymous namespace
5676 
// Row builders for the NEON intrinsic tables. Each expands to a braced
// initializer for one ARMVectorIntrinsicInfo: the stringified builtin name,
// its NEON::BI__builtin_neon_<name> ID, up to two Intrinsic:: IDs and the
// type-modifier flags.

// Row with no intrinsic and no modifiers.
#define NEONMAP0(Name)                                                         \
  { #Name, NEON::BI__builtin_neon_##Name, 0, 0, 0 }

// Row with a single intrinsic.
#define NEONMAP1(Name, Intr, Mods)                                             \
  { #Name, NEON::BI__builtin_neon_##Name, Intrinsic::Intr, 0, Mods }

// Row with a primary and an alternate intrinsic.
#define NEONMAP2(Name, Intr, AltIntr, Mods)                                    \
  {                                                                            \
    #Name, NEON::BI__builtin_neon_##Name, Intrinsic::Intr,                     \
        Intrinsic::AltIntr, Mods                                               \
  }
5688 
5689 static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
5690   NEONMAP1(__a32_vcvt_bf16_v, arm_neon_vcvtfp2bf, 0),
5691   NEONMAP0(splat_lane_v),
5692   NEONMAP0(splat_laneq_v),
5693   NEONMAP0(splatq_lane_v),
5694   NEONMAP0(splatq_laneq_v),
5695   NEONMAP2(vabd_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
5696   NEONMAP2(vabdq_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
5697   NEONMAP1(vabs_v, arm_neon_vabs, 0),
5698   NEONMAP1(vabsq_v, arm_neon_vabs, 0),
5699   NEONMAP0(vadd_v),
5700   NEONMAP0(vaddhn_v),
5701   NEONMAP0(vaddq_v),
5702   NEONMAP1(vaesdq_v, arm_neon_aesd, 0),
5703   NEONMAP1(vaeseq_v, arm_neon_aese, 0),
5704   NEONMAP1(vaesimcq_v, arm_neon_aesimc, 0),
5705   NEONMAP1(vaesmcq_v, arm_neon_aesmc, 0),
5706   NEONMAP1(vbfdot_v, arm_neon_bfdot, 0),
5707   NEONMAP1(vbfdotq_v, arm_neon_bfdot, 0),
5708   NEONMAP1(vbfmlalbq_v, arm_neon_bfmlalb, 0),
5709   NEONMAP1(vbfmlaltq_v, arm_neon_bfmlalt, 0),
5710   NEONMAP1(vbfmmlaq_v, arm_neon_bfmmla, 0),
5711   NEONMAP1(vbsl_v, arm_neon_vbsl, AddRetType),
5712   NEONMAP1(vbslq_v, arm_neon_vbsl, AddRetType),
5713   NEONMAP1(vcadd_rot270_v, arm_neon_vcadd_rot270, Add1ArgType),
5714   NEONMAP1(vcadd_rot90_v, arm_neon_vcadd_rot90, Add1ArgType),
5715   NEONMAP1(vcaddq_rot270_v, arm_neon_vcadd_rot270, Add1ArgType),
5716   NEONMAP1(vcaddq_rot90_v, arm_neon_vcadd_rot90, Add1ArgType),
5717   NEONMAP1(vcage_v, arm_neon_vacge, 0),
5718   NEONMAP1(vcageq_v, arm_neon_vacge, 0),
5719   NEONMAP1(vcagt_v, arm_neon_vacgt, 0),
5720   NEONMAP1(vcagtq_v, arm_neon_vacgt, 0),
5721   NEONMAP1(vcale_v, arm_neon_vacge, 0),
5722   NEONMAP1(vcaleq_v, arm_neon_vacge, 0),
5723   NEONMAP1(vcalt_v, arm_neon_vacgt, 0),
5724   NEONMAP1(vcaltq_v, arm_neon_vacgt, 0),
5725   NEONMAP0(vceqz_v),
5726   NEONMAP0(vceqzq_v),
5727   NEONMAP0(vcgez_v),
5728   NEONMAP0(vcgezq_v),
5729   NEONMAP0(vcgtz_v),
5730   NEONMAP0(vcgtzq_v),
5731   NEONMAP0(vclez_v),
5732   NEONMAP0(vclezq_v),
5733   NEONMAP1(vcls_v, arm_neon_vcls, Add1ArgType),
5734   NEONMAP1(vclsq_v, arm_neon_vcls, Add1ArgType),
5735   NEONMAP0(vcltz_v),
5736   NEONMAP0(vcltzq_v),
5737   NEONMAP1(vclz_v, ctlz, Add1ArgType),
5738   NEONMAP1(vclzq_v, ctlz, Add1ArgType),
5739   NEONMAP1(vcnt_v, ctpop, Add1ArgType),
5740   NEONMAP1(vcntq_v, ctpop, Add1ArgType),
5741   NEONMAP1(vcvt_f16_f32, arm_neon_vcvtfp2hf, 0),
5742   NEONMAP0(vcvt_f16_v),
5743   NEONMAP1(vcvt_f32_f16, arm_neon_vcvthf2fp, 0),
5744   NEONMAP0(vcvt_f32_v),
5745   NEONMAP2(vcvt_n_f16_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
5746   NEONMAP2(vcvt_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
5747   NEONMAP1(vcvt_n_s16_v, arm_neon_vcvtfp2fxs, 0),
5748   NEONMAP1(vcvt_n_s32_v, arm_neon_vcvtfp2fxs, 0),
5749   NEONMAP1(vcvt_n_s64_v, arm_neon_vcvtfp2fxs, 0),
5750   NEONMAP1(vcvt_n_u16_v, arm_neon_vcvtfp2fxu, 0),
5751   NEONMAP1(vcvt_n_u32_v, arm_neon_vcvtfp2fxu, 0),
5752   NEONMAP1(vcvt_n_u64_v, arm_neon_vcvtfp2fxu, 0),
5753   NEONMAP0(vcvt_s16_v),
5754   NEONMAP0(vcvt_s32_v),
5755   NEONMAP0(vcvt_s64_v),
5756   NEONMAP0(vcvt_u16_v),
5757   NEONMAP0(vcvt_u32_v),
5758   NEONMAP0(vcvt_u64_v),
5759   NEONMAP1(vcvta_s16_v, arm_neon_vcvtas, 0),
5760   NEONMAP1(vcvta_s32_v, arm_neon_vcvtas, 0),
5761   NEONMAP1(vcvta_s64_v, arm_neon_vcvtas, 0),
5762   NEONMAP1(vcvta_u16_v, arm_neon_vcvtau, 0),
5763   NEONMAP1(vcvta_u32_v, arm_neon_vcvtau, 0),
5764   NEONMAP1(vcvta_u64_v, arm_neon_vcvtau, 0),
5765   NEONMAP1(vcvtaq_s16_v, arm_neon_vcvtas, 0),
5766   NEONMAP1(vcvtaq_s32_v, arm_neon_vcvtas, 0),
5767   NEONMAP1(vcvtaq_s64_v, arm_neon_vcvtas, 0),
5768   NEONMAP1(vcvtaq_u16_v, arm_neon_vcvtau, 0),
5769   NEONMAP1(vcvtaq_u32_v, arm_neon_vcvtau, 0),
5770   NEONMAP1(vcvtaq_u64_v, arm_neon_vcvtau, 0),
5771   NEONMAP1(vcvth_bf16_f32, arm_neon_vcvtbfp2bf, 0),
5772   NEONMAP1(vcvtm_s16_v, arm_neon_vcvtms, 0),
5773   NEONMAP1(vcvtm_s32_v, arm_neon_vcvtms, 0),
5774   NEONMAP1(vcvtm_s64_v, arm_neon_vcvtms, 0),
5775   NEONMAP1(vcvtm_u16_v, arm_neon_vcvtmu, 0),
5776   NEONMAP1(vcvtm_u32_v, arm_neon_vcvtmu, 0),
5777   NEONMAP1(vcvtm_u64_v, arm_neon_vcvtmu, 0),
5778   NEONMAP1(vcvtmq_s16_v, arm_neon_vcvtms, 0),
5779   NEONMAP1(vcvtmq_s32_v, arm_neon_vcvtms, 0),
5780   NEONMAP1(vcvtmq_s64_v, arm_neon_vcvtms, 0),
5781   NEONMAP1(vcvtmq_u16_v, arm_neon_vcvtmu, 0),
5782   NEONMAP1(vcvtmq_u32_v, arm_neon_vcvtmu, 0),
5783   NEONMAP1(vcvtmq_u64_v, arm_neon_vcvtmu, 0),
5784   NEONMAP1(vcvtn_s16_v, arm_neon_vcvtns, 0),
5785   NEONMAP1(vcvtn_s32_v, arm_neon_vcvtns, 0),
5786   NEONMAP1(vcvtn_s64_v, arm_neon_vcvtns, 0),
5787   NEONMAP1(vcvtn_u16_v, arm_neon_vcvtnu, 0),
5788   NEONMAP1(vcvtn_u32_v, arm_neon_vcvtnu, 0),
5789   NEONMAP1(vcvtn_u64_v, arm_neon_vcvtnu, 0),
5790   NEONMAP1(vcvtnq_s16_v, arm_neon_vcvtns, 0),
5791   NEONMAP1(vcvtnq_s32_v, arm_neon_vcvtns, 0),
5792   NEONMAP1(vcvtnq_s64_v, arm_neon_vcvtns, 0),
5793   NEONMAP1(vcvtnq_u16_v, arm_neon_vcvtnu, 0),
5794   NEONMAP1(vcvtnq_u32_v, arm_neon_vcvtnu, 0),
5795   NEONMAP1(vcvtnq_u64_v, arm_neon_vcvtnu, 0),
5796   NEONMAP1(vcvtp_s16_v, arm_neon_vcvtps, 0),
5797   NEONMAP1(vcvtp_s32_v, arm_neon_vcvtps, 0),
5798   NEONMAP1(vcvtp_s64_v, arm_neon_vcvtps, 0),
5799   NEONMAP1(vcvtp_u16_v, arm_neon_vcvtpu, 0),
5800   NEONMAP1(vcvtp_u32_v, arm_neon_vcvtpu, 0),
5801   NEONMAP1(vcvtp_u64_v, arm_neon_vcvtpu, 0),
5802   NEONMAP1(vcvtpq_s16_v, arm_neon_vcvtps, 0),
5803   NEONMAP1(vcvtpq_s32_v, arm_neon_vcvtps, 0),
5804   NEONMAP1(vcvtpq_s64_v, arm_neon_vcvtps, 0),
5805   NEONMAP1(vcvtpq_u16_v, arm_neon_vcvtpu, 0),
5806   NEONMAP1(vcvtpq_u32_v, arm_neon_vcvtpu, 0),
5807   NEONMAP1(vcvtpq_u64_v, arm_neon_vcvtpu, 0),
5808   NEONMAP0(vcvtq_f16_v),
5809   NEONMAP0(vcvtq_f32_v),
5810   NEONMAP2(vcvtq_n_f16_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
5811   NEONMAP2(vcvtq_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
5812   NEONMAP1(vcvtq_n_s16_v, arm_neon_vcvtfp2fxs, 0),
5813   NEONMAP1(vcvtq_n_s32_v, arm_neon_vcvtfp2fxs, 0),
5814   NEONMAP1(vcvtq_n_s64_v, arm_neon_vcvtfp2fxs, 0),
5815   NEONMAP1(vcvtq_n_u16_v, arm_neon_vcvtfp2fxu, 0),
5816   NEONMAP1(vcvtq_n_u32_v, arm_neon_vcvtfp2fxu, 0),
5817   NEONMAP1(vcvtq_n_u64_v, arm_neon_vcvtfp2fxu, 0),
5818   NEONMAP0(vcvtq_s16_v),
5819   NEONMAP0(vcvtq_s32_v),
5820   NEONMAP0(vcvtq_s64_v),
5821   NEONMAP0(vcvtq_u16_v),
5822   NEONMAP0(vcvtq_u32_v),
5823   NEONMAP0(vcvtq_u64_v),
5824   NEONMAP2(vdot_v, arm_neon_udot, arm_neon_sdot, 0),
5825   NEONMAP2(vdotq_v, arm_neon_udot, arm_neon_sdot, 0),
5826   NEONMAP0(vext_v),
5827   NEONMAP0(vextq_v),
5828   NEONMAP0(vfma_v),
5829   NEONMAP0(vfmaq_v),
5830   NEONMAP2(vhadd_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
5831   NEONMAP2(vhaddq_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
5832   NEONMAP2(vhsub_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
5833   NEONMAP2(vhsubq_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
5834   NEONMAP0(vld1_dup_v),
5835   NEONMAP1(vld1_v, arm_neon_vld1, 0),
5836   NEONMAP1(vld1_x2_v, arm_neon_vld1x2, 0),
5837   NEONMAP1(vld1_x3_v, arm_neon_vld1x3, 0),
5838   NEONMAP1(vld1_x4_v, arm_neon_vld1x4, 0),
5839   NEONMAP0(vld1q_dup_v),
5840   NEONMAP1(vld1q_v, arm_neon_vld1, 0),
5841   NEONMAP1(vld1q_x2_v, arm_neon_vld1x2, 0),
5842   NEONMAP1(vld1q_x3_v, arm_neon_vld1x3, 0),
5843   NEONMAP1(vld1q_x4_v, arm_neon_vld1x4, 0),
5844   NEONMAP1(vld2_dup_v, arm_neon_vld2dup, 0),
5845   NEONMAP1(vld2_lane_v, arm_neon_vld2lane, 0),
5846   NEONMAP1(vld2_v, arm_neon_vld2, 0),
5847   NEONMAP1(vld2q_dup_v, arm_neon_vld2dup, 0),
5848   NEONMAP1(vld2q_lane_v, arm_neon_vld2lane, 0),
5849   NEONMAP1(vld2q_v, arm_neon_vld2, 0),
5850   NEONMAP1(vld3_dup_v, arm_neon_vld3dup, 0),
5851   NEONMAP1(vld3_lane_v, arm_neon_vld3lane, 0),
5852   NEONMAP1(vld3_v, arm_neon_vld3, 0),
5853   NEONMAP1(vld3q_dup_v, arm_neon_vld3dup, 0),
5854   NEONMAP1(vld3q_lane_v, arm_neon_vld3lane, 0),
5855   NEONMAP1(vld3q_v, arm_neon_vld3, 0),
5856   NEONMAP1(vld4_dup_v, arm_neon_vld4dup, 0),
5857   NEONMAP1(vld4_lane_v, arm_neon_vld4lane, 0),
5858   NEONMAP1(vld4_v, arm_neon_vld4, 0),
5859   NEONMAP1(vld4q_dup_v, arm_neon_vld4dup, 0),
5860   NEONMAP1(vld4q_lane_v, arm_neon_vld4lane, 0),
5861   NEONMAP1(vld4q_v, arm_neon_vld4, 0),
5862   NEONMAP2(vmax_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
5863   NEONMAP1(vmaxnm_v, arm_neon_vmaxnm, Add1ArgType),
5864   NEONMAP1(vmaxnmq_v, arm_neon_vmaxnm, Add1ArgType),
5865   NEONMAP2(vmaxq_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
5866   NEONMAP2(vmin_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
5867   NEONMAP1(vminnm_v, arm_neon_vminnm, Add1ArgType),
5868   NEONMAP1(vminnmq_v, arm_neon_vminnm, Add1ArgType),
5869   NEONMAP2(vminq_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
5870   NEONMAP2(vmmlaq_v, arm_neon_ummla, arm_neon_smmla, 0),
5871   NEONMAP0(vmovl_v),
5872   NEONMAP0(vmovn_v),
5873   NEONMAP1(vmul_v, arm_neon_vmulp, Add1ArgType),
5874   NEONMAP0(vmull_v),
5875   NEONMAP1(vmulq_v, arm_neon_vmulp, Add1ArgType),
5876   NEONMAP2(vpadal_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
5877   NEONMAP2(vpadalq_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
5878   NEONMAP1(vpadd_v, arm_neon_vpadd, Add1ArgType),
5879   NEONMAP2(vpaddl_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
5880   NEONMAP2(vpaddlq_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
5881   NEONMAP1(vpaddq_v, arm_neon_vpadd, Add1ArgType),
5882   NEONMAP2(vpmax_v, arm_neon_vpmaxu, arm_neon_vpmaxs, Add1ArgType | UnsignedAlts),
5883   NEONMAP2(vpmin_v, arm_neon_vpminu, arm_neon_vpmins, Add1ArgType | UnsignedAlts),
5884   NEONMAP1(vqabs_v, arm_neon_vqabs, Add1ArgType),
5885   NEONMAP1(vqabsq_v, arm_neon_vqabs, Add1ArgType),
5886   NEONMAP2(vqadd_v, uadd_sat, sadd_sat, Add1ArgType | UnsignedAlts),
5887   NEONMAP2(vqaddq_v, uadd_sat, sadd_sat, Add1ArgType | UnsignedAlts),
5888   NEONMAP2(vqdmlal_v, arm_neon_vqdmull, sadd_sat, 0),
5889   NEONMAP2(vqdmlsl_v, arm_neon_vqdmull, ssub_sat, 0),
5890   NEONMAP1(vqdmulh_v, arm_neon_vqdmulh, Add1ArgType),
5891   NEONMAP1(vqdmulhq_v, arm_neon_vqdmulh, Add1ArgType),
5892   NEONMAP1(vqdmull_v, arm_neon_vqdmull, Add1ArgType),
5893   NEONMAP2(vqmovn_v, arm_neon_vqmovnu, arm_neon_vqmovns, Add1ArgType | UnsignedAlts),
5894   NEONMAP1(vqmovun_v, arm_neon_vqmovnsu, Add1ArgType),
5895   NEONMAP1(vqneg_v, arm_neon_vqneg, Add1ArgType),
5896   NEONMAP1(vqnegq_v, arm_neon_vqneg, Add1ArgType),
5897   NEONMAP1(vqrdmlah_v, arm_neon_vqrdmlah, Add1ArgType),
5898   NEONMAP1(vqrdmlahq_v, arm_neon_vqrdmlah, Add1ArgType),
5899   NEONMAP1(vqrdmlsh_v, arm_neon_vqrdmlsh, Add1ArgType),
5900   NEONMAP1(vqrdmlshq_v, arm_neon_vqrdmlsh, Add1ArgType),
5901   NEONMAP1(vqrdmulh_v, arm_neon_vqrdmulh, Add1ArgType),
5902   NEONMAP1(vqrdmulhq_v, arm_neon_vqrdmulh, Add1ArgType),
5903   NEONMAP2(vqrshl_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
5904   NEONMAP2(vqrshlq_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
5905   NEONMAP2(vqshl_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
5906   NEONMAP2(vqshl_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
5907   NEONMAP2(vqshlq_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
5908   NEONMAP2(vqshlq_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
5909   NEONMAP1(vqshlu_n_v, arm_neon_vqshiftsu, 0),
5910   NEONMAP1(vqshluq_n_v, arm_neon_vqshiftsu, 0),
5911   NEONMAP2(vqsub_v, usub_sat, ssub_sat, Add1ArgType | UnsignedAlts),
5912   NEONMAP2(vqsubq_v, usub_sat, ssub_sat, Add1ArgType | UnsignedAlts),
5913   NEONMAP1(vraddhn_v, arm_neon_vraddhn, Add1ArgType),
5914   NEONMAP2(vrecpe_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
5915   NEONMAP2(vrecpeq_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
5916   NEONMAP1(vrecps_v, arm_neon_vrecps, Add1ArgType),
5917   NEONMAP1(vrecpsq_v, arm_neon_vrecps, Add1ArgType),
5918   NEONMAP2(vrhadd_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
5919   NEONMAP2(vrhaddq_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
5920   NEONMAP1(vrnd_v, arm_neon_vrintz, Add1ArgType),
5921   NEONMAP1(vrnda_v, arm_neon_vrinta, Add1ArgType),
5922   NEONMAP1(vrndaq_v, arm_neon_vrinta, Add1ArgType),
5923   NEONMAP0(vrndi_v),
5924   NEONMAP0(vrndiq_v),
5925   NEONMAP1(vrndm_v, arm_neon_vrintm, Add1ArgType),
5926   NEONMAP1(vrndmq_v, arm_neon_vrintm, Add1ArgType),
5927   NEONMAP1(vrndn_v, arm_neon_vrintn, Add1ArgType),
5928   NEONMAP1(vrndnq_v, arm_neon_vrintn, Add1ArgType),
5929   NEONMAP1(vrndp_v, arm_neon_vrintp, Add1ArgType),
5930   NEONMAP1(vrndpq_v, arm_neon_vrintp, Add1ArgType),
5931   NEONMAP1(vrndq_v, arm_neon_vrintz, Add1ArgType),
5932   NEONMAP1(vrndx_v, arm_neon_vrintx, Add1ArgType),
5933   NEONMAP1(vrndxq_v, arm_neon_vrintx, Add1ArgType),
5934   NEONMAP2(vrshl_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
5935   NEONMAP2(vrshlq_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
5936   NEONMAP2(vrshr_n_v, arm_neon_vrshiftu, arm_neon_vrshifts, UnsignedAlts),
5937   NEONMAP2(vrshrq_n_v, arm_neon_vrshiftu, arm_neon_vrshifts, UnsignedAlts),
5938   NEONMAP2(vrsqrte_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
5939   NEONMAP2(vrsqrteq_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
5940   NEONMAP1(vrsqrts_v, arm_neon_vrsqrts, Add1ArgType),
5941   NEONMAP1(vrsqrtsq_v, arm_neon_vrsqrts, Add1ArgType),
5942   NEONMAP1(vrsubhn_v, arm_neon_vrsubhn, Add1ArgType),
5943   NEONMAP1(vsha1su0q_v, arm_neon_sha1su0, 0),
5944   NEONMAP1(vsha1su1q_v, arm_neon_sha1su1, 0),
5945   NEONMAP1(vsha256h2q_v, arm_neon_sha256h2, 0),
5946   NEONMAP1(vsha256hq_v, arm_neon_sha256h, 0),
5947   NEONMAP1(vsha256su0q_v, arm_neon_sha256su0, 0),
5948   NEONMAP1(vsha256su1q_v, arm_neon_sha256su1, 0),
5949   NEONMAP0(vshl_n_v),
5950   NEONMAP2(vshl_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
5951   NEONMAP0(vshll_n_v),
5952   NEONMAP0(vshlq_n_v),
5953   NEONMAP2(vshlq_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
5954   NEONMAP0(vshr_n_v),
5955   NEONMAP0(vshrn_n_v),
5956   NEONMAP0(vshrq_n_v),
5957   NEONMAP1(vst1_v, arm_neon_vst1, 0),
5958   NEONMAP1(vst1_x2_v, arm_neon_vst1x2, 0),
5959   NEONMAP1(vst1_x3_v, arm_neon_vst1x3, 0),
5960   NEONMAP1(vst1_x4_v, arm_neon_vst1x4, 0),
5961   NEONMAP1(vst1q_v, arm_neon_vst1, 0),
5962   NEONMAP1(vst1q_x2_v, arm_neon_vst1x2, 0),
5963   NEONMAP1(vst1q_x3_v, arm_neon_vst1x3, 0),
5964   NEONMAP1(vst1q_x4_v, arm_neon_vst1x4, 0),
5965   NEONMAP1(vst2_lane_v, arm_neon_vst2lane, 0),
5966   NEONMAP1(vst2_v, arm_neon_vst2, 0),
5967   NEONMAP1(vst2q_lane_v, arm_neon_vst2lane, 0),
5968   NEONMAP1(vst2q_v, arm_neon_vst2, 0),
5969   NEONMAP1(vst3_lane_v, arm_neon_vst3lane, 0),
5970   NEONMAP1(vst3_v, arm_neon_vst3, 0),
5971   NEONMAP1(vst3q_lane_v, arm_neon_vst3lane, 0),
5972   NEONMAP1(vst3q_v, arm_neon_vst3, 0),
5973   NEONMAP1(vst4_lane_v, arm_neon_vst4lane, 0),
5974   NEONMAP1(vst4_v, arm_neon_vst4, 0),
5975   NEONMAP1(vst4q_lane_v, arm_neon_vst4lane, 0),
5976   NEONMAP1(vst4q_v, arm_neon_vst4, 0),
5977   NEONMAP0(vsubhn_v),
5978   NEONMAP0(vtrn_v),
5979   NEONMAP0(vtrnq_v),
5980   NEONMAP0(vtst_v),
5981   NEONMAP0(vtstq_v),
5982   NEONMAP1(vusdot_v, arm_neon_usdot, 0),
5983   NEONMAP1(vusdotq_v, arm_neon_usdot, 0),
5984   NEONMAP1(vusmmlaq_v, arm_neon_usmmla, 0),
5985   NEONMAP0(vuzp_v),
5986   NEONMAP0(vuzpq_v),
5987   NEONMAP0(vzip_v),
5988   NEONMAP0(vzipq_v)
5989 };
5990 
5991 static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
5992   NEONMAP1(__a64_vcvtq_low_bf16_v, aarch64_neon_bfcvtn, 0),
5993   NEONMAP0(splat_lane_v),
5994   NEONMAP0(splat_laneq_v),
5995   NEONMAP0(splatq_lane_v),
5996   NEONMAP0(splatq_laneq_v),
5997   NEONMAP1(vabs_v, aarch64_neon_abs, 0),
5998   NEONMAP1(vabsq_v, aarch64_neon_abs, 0),
5999   NEONMAP0(vadd_v),
6000   NEONMAP0(vaddhn_v),
6001   NEONMAP0(vaddq_p128),
6002   NEONMAP0(vaddq_v),
6003   NEONMAP1(vaesdq_v, aarch64_crypto_aesd, 0),
6004   NEONMAP1(vaeseq_v, aarch64_crypto_aese, 0),
6005   NEONMAP1(vaesimcq_v, aarch64_crypto_aesimc, 0),
6006   NEONMAP1(vaesmcq_v, aarch64_crypto_aesmc, 0),
6007   NEONMAP2(vbcaxq_v, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6008   NEONMAP1(vbfdot_v, aarch64_neon_bfdot, 0),
6009   NEONMAP1(vbfdotq_v, aarch64_neon_bfdot, 0),
6010   NEONMAP1(vbfmlalbq_v, aarch64_neon_bfmlalb, 0),
6011   NEONMAP1(vbfmlaltq_v, aarch64_neon_bfmlalt, 0),
6012   NEONMAP1(vbfmmlaq_v, aarch64_neon_bfmmla, 0),
6013   NEONMAP1(vcadd_rot270_v, aarch64_neon_vcadd_rot270, Add1ArgType),
6014   NEONMAP1(vcadd_rot90_v, aarch64_neon_vcadd_rot90, Add1ArgType),
6015   NEONMAP1(vcaddq_rot270_v, aarch64_neon_vcadd_rot270, Add1ArgType),
6016   NEONMAP1(vcaddq_rot90_v, aarch64_neon_vcadd_rot90, Add1ArgType),
6017   NEONMAP1(vcage_v, aarch64_neon_facge, 0),
6018   NEONMAP1(vcageq_v, aarch64_neon_facge, 0),
6019   NEONMAP1(vcagt_v, aarch64_neon_facgt, 0),
6020   NEONMAP1(vcagtq_v, aarch64_neon_facgt, 0),
6021   NEONMAP1(vcale_v, aarch64_neon_facge, 0),
6022   NEONMAP1(vcaleq_v, aarch64_neon_facge, 0),
6023   NEONMAP1(vcalt_v, aarch64_neon_facgt, 0),
6024   NEONMAP1(vcaltq_v, aarch64_neon_facgt, 0),
6025   NEONMAP0(vceqz_v),
6026   NEONMAP0(vceqzq_v),
6027   NEONMAP0(vcgez_v),
6028   NEONMAP0(vcgezq_v),
6029   NEONMAP0(vcgtz_v),
6030   NEONMAP0(vcgtzq_v),
6031   NEONMAP0(vclez_v),
6032   NEONMAP0(vclezq_v),
6033   NEONMAP1(vcls_v, aarch64_neon_cls, Add1ArgType),
6034   NEONMAP1(vclsq_v, aarch64_neon_cls, Add1ArgType),
6035   NEONMAP0(vcltz_v),
6036   NEONMAP0(vcltzq_v),
6037   NEONMAP1(vclz_v, ctlz, Add1ArgType),
6038   NEONMAP1(vclzq_v, ctlz, Add1ArgType),
6039   NEONMAP1(vcmla_rot180_v, aarch64_neon_vcmla_rot180, Add1ArgType),
6040   NEONMAP1(vcmla_rot270_v, aarch64_neon_vcmla_rot270, Add1ArgType),
6041   NEONMAP1(vcmla_rot90_v, aarch64_neon_vcmla_rot90, Add1ArgType),
6042   NEONMAP1(vcmla_v, aarch64_neon_vcmla_rot0, Add1ArgType),
6043   NEONMAP1(vcmlaq_rot180_v, aarch64_neon_vcmla_rot180, Add1ArgType),
6044   NEONMAP1(vcmlaq_rot270_v, aarch64_neon_vcmla_rot270, Add1ArgType),
6045   NEONMAP1(vcmlaq_rot90_v, aarch64_neon_vcmla_rot90, Add1ArgType),
6046   NEONMAP1(vcmlaq_v, aarch64_neon_vcmla_rot0, Add1ArgType),
6047   NEONMAP1(vcnt_v, ctpop, Add1ArgType),
6048   NEONMAP1(vcntq_v, ctpop, Add1ArgType),
6049   NEONMAP1(vcvt_f16_f32, aarch64_neon_vcvtfp2hf, 0),
6050   NEONMAP0(vcvt_f16_v),
6051   NEONMAP1(vcvt_f32_f16, aarch64_neon_vcvthf2fp, 0),
6052   NEONMAP0(vcvt_f32_v),
6053   NEONMAP2(vcvt_n_f16_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
6054   NEONMAP2(vcvt_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
6055   NEONMAP2(vcvt_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
6056   NEONMAP1(vcvt_n_s16_v, aarch64_neon_vcvtfp2fxs, 0),
6057   NEONMAP1(vcvt_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
6058   NEONMAP1(vcvt_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
6059   NEONMAP1(vcvt_n_u16_v, aarch64_neon_vcvtfp2fxu, 0),
6060   NEONMAP1(vcvt_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
6061   NEONMAP1(vcvt_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
6062   NEONMAP0(vcvtq_f16_v),
6063   NEONMAP0(vcvtq_f32_v),
6064   NEONMAP1(vcvtq_high_bf16_v, aarch64_neon_bfcvtn2, 0),
6065   NEONMAP2(vcvtq_n_f16_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
6066   NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
6067   NEONMAP2(vcvtq_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
6068   NEONMAP1(vcvtq_n_s16_v, aarch64_neon_vcvtfp2fxs, 0),
6069   NEONMAP1(vcvtq_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
6070   NEONMAP1(vcvtq_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
6071   NEONMAP1(vcvtq_n_u16_v, aarch64_neon_vcvtfp2fxu, 0),
6072   NEONMAP1(vcvtq_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
6073   NEONMAP1(vcvtq_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
6074   NEONMAP1(vcvtx_f32_v, aarch64_neon_fcvtxn, AddRetType | Add1ArgType),
6075   NEONMAP2(vdot_v, aarch64_neon_udot, aarch64_neon_sdot, 0),
6076   NEONMAP2(vdotq_v, aarch64_neon_udot, aarch64_neon_sdot, 0),
6077   NEONMAP2(veor3q_v, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6078   NEONMAP0(vext_v),
6079   NEONMAP0(vextq_v),
6080   NEONMAP0(vfma_v),
6081   NEONMAP0(vfmaq_v),
6082   NEONMAP1(vfmlal_high_v, aarch64_neon_fmlal2, 0),
6083   NEONMAP1(vfmlal_low_v, aarch64_neon_fmlal, 0),
6084   NEONMAP1(vfmlalq_high_v, aarch64_neon_fmlal2, 0),
6085   NEONMAP1(vfmlalq_low_v, aarch64_neon_fmlal, 0),
6086   NEONMAP1(vfmlsl_high_v, aarch64_neon_fmlsl2, 0),
6087   NEONMAP1(vfmlsl_low_v, aarch64_neon_fmlsl, 0),
6088   NEONMAP1(vfmlslq_high_v, aarch64_neon_fmlsl2, 0),
6089   NEONMAP1(vfmlslq_low_v, aarch64_neon_fmlsl, 0),
6090   NEONMAP2(vhadd_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
6091   NEONMAP2(vhaddq_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
6092   NEONMAP2(vhsub_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
6093   NEONMAP2(vhsubq_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
6094   NEONMAP1(vld1_x2_v, aarch64_neon_ld1x2, 0),
6095   NEONMAP1(vld1_x3_v, aarch64_neon_ld1x3, 0),
6096   NEONMAP1(vld1_x4_v, aarch64_neon_ld1x4, 0),
6097   NEONMAP1(vld1q_x2_v, aarch64_neon_ld1x2, 0),
6098   NEONMAP1(vld1q_x3_v, aarch64_neon_ld1x3, 0),
6099   NEONMAP1(vld1q_x4_v, aarch64_neon_ld1x4, 0),
6100   NEONMAP2(vmmlaq_v, aarch64_neon_ummla, aarch64_neon_smmla, 0),
6101   NEONMAP0(vmovl_v),
6102   NEONMAP0(vmovn_v),
6103   NEONMAP1(vmul_v, aarch64_neon_pmul, Add1ArgType),
6104   NEONMAP1(vmulq_v, aarch64_neon_pmul, Add1ArgType),
6105   NEONMAP1(vpadd_v, aarch64_neon_addp, Add1ArgType),
6106   NEONMAP2(vpaddl_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
6107   NEONMAP2(vpaddlq_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
6108   NEONMAP1(vpaddq_v, aarch64_neon_addp, Add1ArgType),
6109   NEONMAP1(vqabs_v, aarch64_neon_sqabs, Add1ArgType),
6110   NEONMAP1(vqabsq_v, aarch64_neon_sqabs, Add1ArgType),
6111   NEONMAP2(vqadd_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
6112   NEONMAP2(vqaddq_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
6113   NEONMAP2(vqdmlal_v, aarch64_neon_sqdmull, aarch64_neon_sqadd, 0),
6114   NEONMAP2(vqdmlsl_v, aarch64_neon_sqdmull, aarch64_neon_sqsub, 0),
6115   NEONMAP1(vqdmulh_lane_v, aarch64_neon_sqdmulh_lane, 0),
6116   NEONMAP1(vqdmulh_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
6117   NEONMAP1(vqdmulh_v, aarch64_neon_sqdmulh, Add1ArgType),
6118   NEONMAP1(vqdmulhq_lane_v, aarch64_neon_sqdmulh_lane, 0),
6119   NEONMAP1(vqdmulhq_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
6120   NEONMAP1(vqdmulhq_v, aarch64_neon_sqdmulh, Add1ArgType),
6121   NEONMAP1(vqdmull_v, aarch64_neon_sqdmull, Add1ArgType),
6122   NEONMAP2(vqmovn_v, aarch64_neon_uqxtn, aarch64_neon_sqxtn, Add1ArgType | UnsignedAlts),
6123   NEONMAP1(vqmovun_v, aarch64_neon_sqxtun, Add1ArgType),
6124   NEONMAP1(vqneg_v, aarch64_neon_sqneg, Add1ArgType),
6125   NEONMAP1(vqnegq_v, aarch64_neon_sqneg, Add1ArgType),
6126   NEONMAP1(vqrdmlah_v, aarch64_neon_sqrdmlah, Add1ArgType),
6127   NEONMAP1(vqrdmlahq_v, aarch64_neon_sqrdmlah, Add1ArgType),
6128   NEONMAP1(vqrdmlsh_v, aarch64_neon_sqrdmlsh, Add1ArgType),
6129   NEONMAP1(vqrdmlshq_v, aarch64_neon_sqrdmlsh, Add1ArgType),
6130   NEONMAP1(vqrdmulh_lane_v, aarch64_neon_sqrdmulh_lane, 0),
6131   NEONMAP1(vqrdmulh_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
6132   NEONMAP1(vqrdmulh_v, aarch64_neon_sqrdmulh, Add1ArgType),
6133   NEONMAP1(vqrdmulhq_lane_v, aarch64_neon_sqrdmulh_lane, 0),
6134   NEONMAP1(vqrdmulhq_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
6135   NEONMAP1(vqrdmulhq_v, aarch64_neon_sqrdmulh, Add1ArgType),
6136   NEONMAP2(vqrshl_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
6137   NEONMAP2(vqrshlq_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
6138   NEONMAP2(vqshl_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl, UnsignedAlts),
6139   NEONMAP2(vqshl_v, aarch64_neon_uqshl, aarch64_neon_sqshl, Add1ArgType | UnsignedAlts),
6140   NEONMAP2(vqshlq_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl,UnsignedAlts),
6141   NEONMAP2(vqshlq_v, aarch64_neon_uqshl, aarch64_neon_sqshl, Add1ArgType | UnsignedAlts),
6142   NEONMAP1(vqshlu_n_v, aarch64_neon_sqshlu, 0),
6143   NEONMAP1(vqshluq_n_v, aarch64_neon_sqshlu, 0),
6144   NEONMAP2(vqsub_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts),
6145   NEONMAP2(vqsubq_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts),
6146   NEONMAP1(vraddhn_v, aarch64_neon_raddhn, Add1ArgType),
6147   NEONMAP1(vrax1q_v, aarch64_crypto_rax1, 0),
6148   NEONMAP2(vrecpe_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
6149   NEONMAP2(vrecpeq_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
6150   NEONMAP1(vrecps_v, aarch64_neon_frecps, Add1ArgType),
6151   NEONMAP1(vrecpsq_v, aarch64_neon_frecps, Add1ArgType),
6152   NEONMAP2(vrhadd_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts),
6153   NEONMAP2(vrhaddq_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts),
6154   NEONMAP1(vrnd32x_v, aarch64_neon_frint32x, Add1ArgType),
6155   NEONMAP1(vrnd32xq_v, aarch64_neon_frint32x, Add1ArgType),
6156   NEONMAP1(vrnd32z_v, aarch64_neon_frint32z, Add1ArgType),
6157   NEONMAP1(vrnd32zq_v, aarch64_neon_frint32z, Add1ArgType),
6158   NEONMAP1(vrnd64x_v, aarch64_neon_frint64x, Add1ArgType),
6159   NEONMAP1(vrnd64xq_v, aarch64_neon_frint64x, Add1ArgType),
6160   NEONMAP1(vrnd64z_v, aarch64_neon_frint64z, Add1ArgType),
6161   NEONMAP1(vrnd64zq_v, aarch64_neon_frint64z, Add1ArgType),
6162   NEONMAP0(vrndi_v),
6163   NEONMAP0(vrndiq_v),
6164   NEONMAP2(vrshl_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts),
6165   NEONMAP2(vrshlq_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts),
6166   NEONMAP2(vrshr_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
6167   NEONMAP2(vrshrq_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
6168   NEONMAP2(vrsqrte_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
6169   NEONMAP2(vrsqrteq_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
6170   NEONMAP1(vrsqrts_v, aarch64_neon_frsqrts, Add1ArgType),
6171   NEONMAP1(vrsqrtsq_v, aarch64_neon_frsqrts, Add1ArgType),
6172   NEONMAP1(vrsubhn_v, aarch64_neon_rsubhn, Add1ArgType),
6173   NEONMAP1(vsha1su0q_v, aarch64_crypto_sha1su0, 0),
6174   NEONMAP1(vsha1su1q_v, aarch64_crypto_sha1su1, 0),
6175   NEONMAP1(vsha256h2q_v, aarch64_crypto_sha256h2, 0),
6176   NEONMAP1(vsha256hq_v, aarch64_crypto_sha256h, 0),
6177   NEONMAP1(vsha256su0q_v, aarch64_crypto_sha256su0, 0),
6178   NEONMAP1(vsha256su1q_v, aarch64_crypto_sha256su1, 0),
6179   NEONMAP1(vsha512h2q_v, aarch64_crypto_sha512h2, 0),
6180   NEONMAP1(vsha512hq_v, aarch64_crypto_sha512h, 0),
6181   NEONMAP1(vsha512su0q_v, aarch64_crypto_sha512su0, 0),
6182   NEONMAP1(vsha512su1q_v, aarch64_crypto_sha512su1, 0),
6183   NEONMAP0(vshl_n_v),
6184   NEONMAP2(vshl_v, aarch64_neon_ushl, aarch64_neon_sshl, Add1ArgType | UnsignedAlts),
6185   NEONMAP0(vshll_n_v),
6186   NEONMAP0(vshlq_n_v),
6187   NEONMAP2(vshlq_v, aarch64_neon_ushl, aarch64_neon_sshl, Add1ArgType | UnsignedAlts),
6188   NEONMAP0(vshr_n_v),
6189   NEONMAP0(vshrn_n_v),
6190   NEONMAP0(vshrq_n_v),
6191   NEONMAP1(vsm3partw1q_v, aarch64_crypto_sm3partw1, 0),
6192   NEONMAP1(vsm3partw2q_v, aarch64_crypto_sm3partw2, 0),
6193   NEONMAP1(vsm3ss1q_v, aarch64_crypto_sm3ss1, 0),
6194   NEONMAP1(vsm3tt1aq_v, aarch64_crypto_sm3tt1a, 0),
6195   NEONMAP1(vsm3tt1bq_v, aarch64_crypto_sm3tt1b, 0),
6196   NEONMAP1(vsm3tt2aq_v, aarch64_crypto_sm3tt2a, 0),
6197   NEONMAP1(vsm3tt2bq_v, aarch64_crypto_sm3tt2b, 0),
6198   NEONMAP1(vsm4ekeyq_v, aarch64_crypto_sm4ekey, 0),
6199   NEONMAP1(vsm4eq_v, aarch64_crypto_sm4e, 0),
6200   NEONMAP1(vst1_x2_v, aarch64_neon_st1x2, 0),
6201   NEONMAP1(vst1_x3_v, aarch64_neon_st1x3, 0),
6202   NEONMAP1(vst1_x4_v, aarch64_neon_st1x4, 0),
6203   NEONMAP1(vst1q_x2_v, aarch64_neon_st1x2, 0),
6204   NEONMAP1(vst1q_x3_v, aarch64_neon_st1x3, 0),
6205   NEONMAP1(vst1q_x4_v, aarch64_neon_st1x4, 0),
6206   NEONMAP0(vsubhn_v),
6207   NEONMAP0(vtst_v),
6208   NEONMAP0(vtstq_v),
6209   NEONMAP1(vusdot_v, aarch64_neon_usdot, 0),
6210   NEONMAP1(vusdotq_v, aarch64_neon_usdot, 0),
6211   NEONMAP1(vusmmlaq_v, aarch64_neon_usmmla, 0),
6212   NEONMAP1(vxarq_v, aarch64_crypto_xar, 0),
6213 };
6214 
// Lookup table for the AArch64 scalar ("SISD") NEON builtins. Every row pairs
// a builtin name with the LLVM intrinsic it maps to and a type-modifier
// expression (0 when no modifier flags apply).
// NOTE(review): findARMVectorIntrinsicInMap binary-searches the table it is
// given by builtin ID and, in asserts builds, checks the table is sorted, so a
// new row presumably has to be inserted at its sorted position (the FP16
// scalar rows are grouped at the end) -- confirm against the builtin ID order.
static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
  NEONMAP1(vabdd_f64, aarch64_sisd_fabd, Add1ArgType),
  NEONMAP1(vabds_f32, aarch64_sisd_fabd, Add1ArgType),
  NEONMAP1(vabsd_s64, aarch64_neon_abs, Add1ArgType),
  NEONMAP1(vaddlv_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType),
  NEONMAP1(vaddlv_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType),
  NEONMAP1(vaddlvq_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType),
  NEONMAP1(vaddlvq_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType),
  NEONMAP1(vaddv_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
  NEONMAP1(vaddv_s32, aarch64_neon_saddv, AddRetType | Add1ArgType),
  NEONMAP1(vaddv_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType),
  NEONMAP1(vaddvq_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
  NEONMAP1(vaddvq_f64, aarch64_neon_faddv, AddRetType | Add1ArgType),
  NEONMAP1(vaddvq_s32, aarch64_neon_saddv, AddRetType | Add1ArgType),
  NEONMAP1(vaddvq_s64, aarch64_neon_saddv, AddRetType | Add1ArgType),
  NEONMAP1(vaddvq_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType),
  NEONMAP1(vaddvq_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
  NEONMAP1(vcaged_f64, aarch64_neon_facge, AddRetType | Add1ArgType),
  NEONMAP1(vcages_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
  NEONMAP1(vcagtd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
  NEONMAP1(vcagts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType),
  NEONMAP1(vcaled_f64, aarch64_neon_facge, AddRetType | Add1ArgType),
  NEONMAP1(vcales_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
  NEONMAP1(vcaltd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
  NEONMAP1(vcalts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType),
  NEONMAP1(vcvtad_s64_f64, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
  NEONMAP1(vcvtad_u64_f64, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
  NEONMAP1(vcvtas_s32_f32, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
  NEONMAP1(vcvtas_u32_f32, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
  NEONMAP1(vcvtd_n_f64_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
  NEONMAP1(vcvtd_n_f64_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
  NEONMAP1(vcvtd_n_s64_f64, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
  NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
  NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
  NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
  NEONMAP1(vcvth_bf16_f32, aarch64_neon_bfcvt, 0),
  NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
  NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
  NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
  NEONMAP1(vcvtms_u32_f32, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
  NEONMAP1(vcvtnd_s64_f64, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
  NEONMAP1(vcvtnd_u64_f64, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
  NEONMAP1(vcvtns_s32_f32, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
  NEONMAP1(vcvtns_u32_f32, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
  NEONMAP1(vcvtpd_s64_f64, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
  NEONMAP1(vcvtpd_u64_f64, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
  NEONMAP1(vcvtps_s32_f32, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
  NEONMAP1(vcvtps_u32_f32, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
  NEONMAP1(vcvts_n_f32_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
  NEONMAP1(vcvts_n_f32_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
  NEONMAP1(vcvts_n_s32_f32, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
  NEONMAP1(vcvts_n_u32_f32, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
  NEONMAP1(vcvts_s32_f32, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
  NEONMAP1(vcvts_u32_f32, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
  NEONMAP1(vcvtxd_f32_f64, aarch64_sisd_fcvtxn, 0),
  NEONMAP1(vmaxnmv_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
  NEONMAP1(vmaxnmvq_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
  NEONMAP1(vmaxnmvq_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
  NEONMAP1(vmaxv_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
  NEONMAP1(vmaxv_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType),
  NEONMAP1(vmaxv_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType),
  NEONMAP1(vmaxvq_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
  NEONMAP1(vmaxvq_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
  NEONMAP1(vmaxvq_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType),
  NEONMAP1(vmaxvq_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType),
  NEONMAP1(vminnmv_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
  NEONMAP1(vminnmvq_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
  NEONMAP1(vminnmvq_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
  NEONMAP1(vminv_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
  NEONMAP1(vminv_s32, aarch64_neon_sminv, AddRetType | Add1ArgType),
  NEONMAP1(vminv_u32, aarch64_neon_uminv, AddRetType | Add1ArgType),
  NEONMAP1(vminvq_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
  NEONMAP1(vminvq_f64, aarch64_neon_fminv, AddRetType | Add1ArgType),
  NEONMAP1(vminvq_s32, aarch64_neon_sminv, AddRetType | Add1ArgType),
  NEONMAP1(vminvq_u32, aarch64_neon_uminv, AddRetType | Add1ArgType),
  NEONMAP1(vmull_p64, aarch64_neon_pmull64, 0),
  NEONMAP1(vmulxd_f64, aarch64_neon_fmulx, Add1ArgType),
  NEONMAP1(vmulxs_f32, aarch64_neon_fmulx, Add1ArgType),
  NEONMAP1(vpaddd_s64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
  NEONMAP1(vpaddd_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
  NEONMAP1(vpmaxnmqd_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
  NEONMAP1(vpmaxnms_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
  NEONMAP1(vpmaxqd_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
  NEONMAP1(vpmaxs_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
  NEONMAP1(vpminnmqd_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
  NEONMAP1(vpminnms_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
  NEONMAP1(vpminqd_f64, aarch64_neon_fminv, AddRetType | Add1ArgType),
  NEONMAP1(vpmins_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
  NEONMAP1(vqabsb_s8, aarch64_neon_sqabs, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqabsd_s64, aarch64_neon_sqabs, Add1ArgType),
  NEONMAP1(vqabsh_s16, aarch64_neon_sqabs, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqabss_s32, aarch64_neon_sqabs, Add1ArgType),
  NEONMAP1(vqaddb_s8, aarch64_neon_sqadd, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqaddb_u8, aarch64_neon_uqadd, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqaddd_s64, aarch64_neon_sqadd, Add1ArgType),
  NEONMAP1(vqaddd_u64, aarch64_neon_uqadd, Add1ArgType),
  NEONMAP1(vqaddh_s16, aarch64_neon_sqadd, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqaddh_u16, aarch64_neon_uqadd, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqadds_s32, aarch64_neon_sqadd, Add1ArgType),
  NEONMAP1(vqadds_u32, aarch64_neon_uqadd, Add1ArgType),
  NEONMAP1(vqdmulhh_s16, aarch64_neon_sqdmulh, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqdmulhs_s32, aarch64_neon_sqdmulh, Add1ArgType),
  NEONMAP1(vqdmullh_s16, aarch64_neon_sqdmull, VectorRet | Use128BitVectors),
  NEONMAP1(vqdmulls_s32, aarch64_neon_sqdmulls_scalar, 0),
  NEONMAP1(vqmovnd_s64, aarch64_neon_scalar_sqxtn, AddRetType | Add1ArgType),
  NEONMAP1(vqmovnd_u64, aarch64_neon_scalar_uqxtn, AddRetType | Add1ArgType),
  NEONMAP1(vqmovnh_s16, aarch64_neon_sqxtn, VectorRet | Use64BitVectors),
  NEONMAP1(vqmovnh_u16, aarch64_neon_uqxtn, VectorRet | Use64BitVectors),
  NEONMAP1(vqmovns_s32, aarch64_neon_sqxtn, VectorRet | Use64BitVectors),
  NEONMAP1(vqmovns_u32, aarch64_neon_uqxtn, VectorRet | Use64BitVectors),
  NEONMAP1(vqmovund_s64, aarch64_neon_scalar_sqxtun, AddRetType | Add1ArgType),
  NEONMAP1(vqmovunh_s16, aarch64_neon_sqxtun, VectorRet | Use64BitVectors),
  NEONMAP1(vqmovuns_s32, aarch64_neon_sqxtun, VectorRet | Use64BitVectors),
  NEONMAP1(vqnegb_s8, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqnegd_s64, aarch64_neon_sqneg, Add1ArgType),
  NEONMAP1(vqnegh_s16, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqnegs_s32, aarch64_neon_sqneg, Add1ArgType),
  NEONMAP1(vqrdmlahh_s16, aarch64_neon_sqrdmlah, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqrdmlahs_s32, aarch64_neon_sqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlshh_s16, aarch64_neon_sqrdmlsh, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqrdmlshs_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmulhh_s16, aarch64_neon_sqrdmulh, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqrdmulhs_s32, aarch64_neon_sqrdmulh, Add1ArgType),
  NEONMAP1(vqrshlb_s8, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqrshlb_u8, aarch64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqrshld_s64, aarch64_neon_sqrshl, Add1ArgType),
  NEONMAP1(vqrshld_u64, aarch64_neon_uqrshl, Add1ArgType),
  NEONMAP1(vqrshlh_s16, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqrshlh_u16, aarch64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqrshls_s32, aarch64_neon_sqrshl, Add1ArgType),
  NEONMAP1(vqrshls_u32, aarch64_neon_uqrshl, Add1ArgType),
  NEONMAP1(vqrshrnd_n_s64, aarch64_neon_sqrshrn, AddRetType),
  NEONMAP1(vqrshrnd_n_u64, aarch64_neon_uqrshrn, AddRetType),
  NEONMAP1(vqrshrnh_n_s16, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors),
  NEONMAP1(vqrshrnh_n_u16, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors),
  NEONMAP1(vqrshrns_n_s32, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors),
  NEONMAP1(vqrshrns_n_u32, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors),
  NEONMAP1(vqrshrund_n_s64, aarch64_neon_sqrshrun, AddRetType),
  NEONMAP1(vqrshrunh_n_s16, aarch64_neon_sqrshrun, VectorRet | Use64BitVectors),
  NEONMAP1(vqrshruns_n_s32, aarch64_neon_sqrshrun, VectorRet | Use64BitVectors),
  NEONMAP1(vqshlb_n_s8, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqshlb_n_u8, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqshlb_s8, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqshlb_u8, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqshld_s64, aarch64_neon_sqshl, Add1ArgType),
  NEONMAP1(vqshld_u64, aarch64_neon_uqshl, Add1ArgType),
  NEONMAP1(vqshlh_n_s16, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqshlh_n_u16, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqshlh_s16, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqshlh_u16, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqshls_n_s32, aarch64_neon_sqshl, Add1ArgType),
  NEONMAP1(vqshls_n_u32, aarch64_neon_uqshl, Add1ArgType),
  NEONMAP1(vqshls_s32, aarch64_neon_sqshl, Add1ArgType),
  NEONMAP1(vqshls_u32, aarch64_neon_uqshl, Add1ArgType),
  NEONMAP1(vqshlub_n_s8, aarch64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqshluh_n_s16, aarch64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqshlus_n_s32, aarch64_neon_sqshlu, Add1ArgType),
  NEONMAP1(vqshrnd_n_s64, aarch64_neon_sqshrn, AddRetType),
  NEONMAP1(vqshrnd_n_u64, aarch64_neon_uqshrn, AddRetType),
  NEONMAP1(vqshrnh_n_s16, aarch64_neon_sqshrn, VectorRet | Use64BitVectors),
  NEONMAP1(vqshrnh_n_u16, aarch64_neon_uqshrn, VectorRet | Use64BitVectors),
  NEONMAP1(vqshrns_n_s32, aarch64_neon_sqshrn, VectorRet | Use64BitVectors),
  NEONMAP1(vqshrns_n_u32, aarch64_neon_uqshrn, VectorRet | Use64BitVectors),
  NEONMAP1(vqshrund_n_s64, aarch64_neon_sqshrun, AddRetType),
  NEONMAP1(vqshrunh_n_s16, aarch64_neon_sqshrun, VectorRet | Use64BitVectors),
  NEONMAP1(vqshruns_n_s32, aarch64_neon_sqshrun, VectorRet | Use64BitVectors),
  NEONMAP1(vqsubb_s8, aarch64_neon_sqsub, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqsubb_u8, aarch64_neon_uqsub, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqsubd_s64, aarch64_neon_sqsub, Add1ArgType),
  NEONMAP1(vqsubd_u64, aarch64_neon_uqsub, Add1ArgType),
  NEONMAP1(vqsubh_s16, aarch64_neon_sqsub, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqsubh_u16, aarch64_neon_uqsub, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vqsubs_s32, aarch64_neon_sqsub, Add1ArgType),
  NEONMAP1(vqsubs_u32, aarch64_neon_uqsub, Add1ArgType),
  NEONMAP1(vrecped_f64, aarch64_neon_frecpe, Add1ArgType),
  NEONMAP1(vrecpes_f32, aarch64_neon_frecpe, Add1ArgType),
  NEONMAP1(vrecpxd_f64, aarch64_neon_frecpx, Add1ArgType),
  NEONMAP1(vrecpxs_f32, aarch64_neon_frecpx, Add1ArgType),
  NEONMAP1(vrshld_s64, aarch64_neon_srshl, Add1ArgType),
  NEONMAP1(vrshld_u64, aarch64_neon_urshl, Add1ArgType),
  NEONMAP1(vrsqrted_f64, aarch64_neon_frsqrte, Add1ArgType),
  NEONMAP1(vrsqrtes_f32, aarch64_neon_frsqrte, Add1ArgType),
  NEONMAP1(vrsqrtsd_f64, aarch64_neon_frsqrts, Add1ArgType),
  NEONMAP1(vrsqrtss_f32, aarch64_neon_frsqrts, Add1ArgType),
  NEONMAP1(vsha1cq_u32, aarch64_crypto_sha1c, 0),
  NEONMAP1(vsha1h_u32, aarch64_crypto_sha1h, 0),
  NEONMAP1(vsha1mq_u32, aarch64_crypto_sha1m, 0),
  NEONMAP1(vsha1pq_u32, aarch64_crypto_sha1p, 0),
  NEONMAP1(vshld_s64, aarch64_neon_sshl, Add1ArgType),
  NEONMAP1(vshld_u64, aarch64_neon_ushl, Add1ArgType),
  NEONMAP1(vslid_n_s64, aarch64_neon_vsli, Vectorize1ArgType),
  NEONMAP1(vslid_n_u64, aarch64_neon_vsli, Vectorize1ArgType),
  NEONMAP1(vsqaddb_u8, aarch64_neon_usqadd, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vsqaddd_u64, aarch64_neon_usqadd, Add1ArgType),
  NEONMAP1(vsqaddh_u16, aarch64_neon_usqadd, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vsqadds_u32, aarch64_neon_usqadd, Add1ArgType),
  NEONMAP1(vsrid_n_s64, aarch64_neon_vsri, Vectorize1ArgType),
  NEONMAP1(vsrid_n_u64, aarch64_neon_vsri, Vectorize1ArgType),
  NEONMAP1(vuqaddb_s8, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vuqaddd_s64, aarch64_neon_suqadd, Add1ArgType),
  NEONMAP1(vuqaddh_s16, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
  NEONMAP1(vuqadds_s32, aarch64_neon_suqadd, Add1ArgType),
  // FP16 scalar intrinsics go here.
  NEONMAP1(vabdh_f16, aarch64_sisd_fabd, Add1ArgType),
  NEONMAP1(vcvtah_s32_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
  NEONMAP1(vcvtah_s64_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
  NEONMAP1(vcvtah_u32_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
  NEONMAP1(vcvtah_u64_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
  NEONMAP1(vcvth_n_f16_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
  NEONMAP1(vcvth_n_f16_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
  NEONMAP1(vcvth_n_f16_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
  NEONMAP1(vcvth_n_f16_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
  NEONMAP1(vcvth_n_s32_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
  NEONMAP1(vcvth_n_s64_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
  NEONMAP1(vcvth_n_u32_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
  NEONMAP1(vcvth_n_u64_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
  NEONMAP1(vcvth_s32_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
  NEONMAP1(vcvth_s64_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
  NEONMAP1(vcvth_u32_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
  NEONMAP1(vcvth_u64_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
  NEONMAP1(vcvtmh_s32_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
  NEONMAP1(vcvtmh_s64_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
  NEONMAP1(vcvtmh_u32_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
  NEONMAP1(vcvtmh_u64_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
  NEONMAP1(vcvtnh_s32_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
  NEONMAP1(vcvtnh_s64_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
  NEONMAP1(vcvtnh_u32_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
  NEONMAP1(vcvtnh_u64_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
  NEONMAP1(vcvtph_s32_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
  NEONMAP1(vcvtph_s64_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
  NEONMAP1(vcvtph_u32_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
  NEONMAP1(vcvtph_u64_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
  NEONMAP1(vmulxh_f16, aarch64_neon_fmulx, Add1ArgType),
  NEONMAP1(vrecpeh_f16, aarch64_neon_frecpe, Add1ArgType),
  NEONMAP1(vrecpxh_f16, aarch64_neon_frecpx, Add1ArgType),
  NEONMAP1(vrsqrteh_f16, aarch64_neon_frsqrte, Add1ArgType),
  NEONMAP1(vrsqrtsh_f16, aarch64_neon_frsqrts, Add1ArgType),
};
6453 
6454 #undef NEONMAP0
6455 #undef NEONMAP1
6456 #undef NEONMAP2
6457 
// SVEMAP1 expands to one ARMVectorIntrinsicInfo initializer for an SVE builtin
// that maps onto a single LLVM intrinsic: the stringized NameBase, the
// SVE::BI__builtin_sve_<NameBase> builtin ID, Intrinsic::<LLVMIntrinsic>, a
// literal 0 in the fourth slot (presumably the alternate-intrinsic field --
// verify against the ARMVectorIntrinsicInfo declaration), and TypeModifier.
#define SVEMAP1(NameBase, LLVMIntrinsic, TypeModifier)                         \
  {                                                                            \
    #NameBase, SVE::BI__builtin_sve_##NameBase, Intrinsic::LLVMIntrinsic, 0,   \
        TypeModifier                                                           \
  }

// SVEMAP2 is the same initializer shape as SVEMAP1, but with 0 in the
// intrinsic slot as well, i.e. for rows that name no LLVM intrinsic.
#define SVEMAP2(NameBase, TypeModifier)                                        \
  { #NameBase, SVE::BI__builtin_sve_##NameBase, 0, 0, TypeModifier }
// Table of SVE builtins. Its rows are not written here; they come from the two
// files included below while GET_SVE_LLVM_INTRINSIC_MAP is defined.
// NOTE(review): those files presumably emit SVEMAP1/SVEMAP2 rows -- confirm.
static const ARMVectorIntrinsicInfo AArch64SVEIntrinsicMap[] = {
#define GET_SVE_LLVM_INTRINSIC_MAP
#include "clang/Basic/arm_sve_builtin_cg.inc"
#include "clang/Basic/BuiltinsAArch64NeonSVEBridge_cg.def"
#undef GET_SVE_LLVM_INTRINSIC_MAP
};

// The row-builder macros are only meaningful while the table is being defined.
#undef SVEMAP1
#undef SVEMAP2
6475 
// Per-table "already checked for sortedness" flags, all initially false.
// NOTE(review): each is presumably passed as the MapProvenSorted argument of
// findARMVectorIntrinsicInMap for the matching table, which sets it to true
// after the debug-only is_sorted check has passed once -- confirm at the call
// sites.
static bool NEONSIMDIntrinsicsProvenSorted{false};

static bool AArch64SIMDIntrinsicsProvenSorted{false};
static bool AArch64SISDIntrinsicsProvenSorted{false};
static bool AArch64SVEIntrinsicsProvenSorted{false};
6481 
6482 static const ARMVectorIntrinsicInfo *
6483 findARMVectorIntrinsicInMap(ArrayRef<ARMVectorIntrinsicInfo> IntrinsicMap,
6484                             unsigned BuiltinID, bool &MapProvenSorted) {
6485 
6486 #ifndef NDEBUG
6487   if (!MapProvenSorted) {
6488     assert(llvm::is_sorted(IntrinsicMap));
6489     MapProvenSorted = true;
6490   }
6491 #endif
6492 
6493   const ARMVectorIntrinsicInfo *Builtin =
6494       llvm::lower_bound(IntrinsicMap, BuiltinID);
6495 
6496   if (Builtin != IntrinsicMap.end() && Builtin->BuiltinID == BuiltinID)
6497     return Builtin;
6498 
6499   return nullptr;
6500 }
6501 
6502 Function *CodeGenFunction::LookupNeonLLVMIntrinsic(unsigned IntrinsicID,
6503                                                    unsigned Modifier,
6504                                                    llvm::Type *ArgType,
6505                                                    const CallExpr *E) {
6506   int VectorSize = 0;
6507   if (Modifier & Use64BitVectors)
6508     VectorSize = 64;
6509   else if (Modifier & Use128BitVectors)
6510     VectorSize = 128;
6511 
6512   // Return type.
6513   SmallVector<llvm::Type *, 3> Tys;
6514   if (Modifier & AddRetType) {
6515     llvm::Type *Ty = ConvertType(E->getCallReturnType(getContext()));
6516     if (Modifier & VectorizeRetType)
6517       Ty = llvm::FixedVectorType::get(
6518           Ty, VectorSize ? VectorSize / Ty->getPrimitiveSizeInBits() : 1);
6519 
6520     Tys.push_back(Ty);
6521   }
6522 
6523   // Arguments.
6524   if (Modifier & VectorizeArgTypes) {
6525     int Elts = VectorSize ? VectorSize / ArgType->getPrimitiveSizeInBits() : 1;
6526     ArgType = llvm::FixedVectorType::get(ArgType, Elts);
6527   }
6528 
6529   if (Modifier & (Add1ArgType | Add2ArgTypes))
6530     Tys.push_back(ArgType);
6531 
6532   if (Modifier & Add2ArgTypes)
6533     Tys.push_back(ArgType);
6534 
6535   if (Modifier & InventFloatType)
6536     Tys.push_back(FloatTy);
6537 
6538   return CGM.getIntrinsic(IntrinsicID, Tys);
6539 }
6540 
6541 static Value *EmitCommonNeonSISDBuiltinExpr(
6542     CodeGenFunction &CGF, const ARMVectorIntrinsicInfo &SISDInfo,
6543     SmallVectorImpl<Value *> &Ops, const CallExpr *E) {
6544   unsigned BuiltinID = SISDInfo.BuiltinID;
6545   unsigned int Int = SISDInfo.LLVMIntrinsic;
6546   unsigned Modifier = SISDInfo.TypeModifier;
6547   const char *s = SISDInfo.NameHint;
6548 
6549   switch (BuiltinID) {
6550   case NEON::BI__builtin_neon_vcled_s64:
6551   case NEON::BI__builtin_neon_vcled_u64:
6552   case NEON::BI__builtin_neon_vcles_f32:
6553   case NEON::BI__builtin_neon_vcled_f64:
6554   case NEON::BI__builtin_neon_vcltd_s64:
6555   case NEON::BI__builtin_neon_vcltd_u64:
6556   case NEON::BI__builtin_neon_vclts_f32:
6557   case NEON::BI__builtin_neon_vcltd_f64:
6558   case NEON::BI__builtin_neon_vcales_f32:
6559   case NEON::BI__builtin_neon_vcaled_f64:
6560   case NEON::BI__builtin_neon_vcalts_f32:
6561   case NEON::BI__builtin_neon_vcaltd_f64:
6562     // Only one direction of comparisons actually exist, cmle is actually a cmge
6563     // with swapped operands. The table gives us the right intrinsic but we
6564     // still need to do the swap.
6565     std::swap(Ops[0], Ops[1]);
6566     break;
6567   }
6568 
6569   assert(Int && "Generic code assumes a valid intrinsic");
6570 
6571   // Determine the type(s) of this overloaded AArch64 intrinsic.
6572   const Expr *Arg = E->getArg(0);
6573   llvm::Type *ArgTy = CGF.ConvertType(Arg->getType());
6574   Function *F = CGF.LookupNeonLLVMIntrinsic(Int, Modifier, ArgTy, E);
6575 
6576   int j = 0;
6577   ConstantInt *C0 = ConstantInt::get(CGF.SizeTy, 0);
6578   for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
6579        ai != ae; ++ai, ++j) {
6580     llvm::Type *ArgTy = ai->getType();
6581     if (Ops[j]->getType()->getPrimitiveSizeInBits() ==
6582              ArgTy->getPrimitiveSizeInBits())
6583       continue;
6584 
6585     assert(ArgTy->isVectorTy() && !Ops[j]->getType()->isVectorTy());
6586     // The constant argument to an _n_ intrinsic always has Int32Ty, so truncate
6587     // it before inserting.
6588     Ops[j] = CGF.Builder.CreateTruncOrBitCast(
6589         Ops[j], cast<llvm::VectorType>(ArgTy)->getElementType());
6590     Ops[j] =
6591         CGF.Builder.CreateInsertElement(UndefValue::get(ArgTy), Ops[j], C0);
6592   }
6593 
6594   Value *Result = CGF.EmitNeonCall(F, Ops, s);
6595   llvm::Type *ResultType = CGF.ConvertType(E->getType());
6596   if (ResultType->getPrimitiveSizeInBits().getFixedSize() <
6597       Result->getType()->getPrimitiveSizeInBits().getFixedSize())
6598     return CGF.Builder.CreateExtractElement(Result, C0);
6599 
6600   return CGF.Builder.CreateBitCast(Result, ResultType, s);
6601 }
6602 
6603 Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
6604     unsigned BuiltinID, unsigned LLVMIntrinsic, unsigned AltLLVMIntrinsic,
6605     const char *NameHint, unsigned Modifier, const CallExpr *E,
6606     SmallVectorImpl<llvm::Value *> &Ops, Address PtrOp0, Address PtrOp1,
6607     llvm::Triple::ArchType Arch) {
6608   // Get the last argument, which specifies the vector type.
6609   const Expr *Arg = E->getArg(E->getNumArgs() - 1);
6610   Optional<llvm::APSInt> NeonTypeConst =
6611       Arg->getIntegerConstantExpr(getContext());
6612   if (!NeonTypeConst)
6613     return nullptr;
6614 
6615   // Determine the type of this overloaded NEON intrinsic.
6616   NeonTypeFlags Type(NeonTypeConst->getZExtValue());
6617   bool Usgn = Type.isUnsigned();
6618   bool Quad = Type.isQuad();
6619   const bool HasLegalHalfType = getTarget().hasLegalHalfType();
6620   const bool AllowBFloatArgsAndRet =
6621       getTargetHooks().getABIInfo().allowBFloatArgsAndRet();
6622 
6623   llvm::FixedVectorType *VTy =
6624       GetNeonType(this, Type, HasLegalHalfType, false, AllowBFloatArgsAndRet);
6625   llvm::Type *Ty = VTy;
6626   if (!Ty)
6627     return nullptr;
6628 
6629   auto getAlignmentValue32 = [&](Address addr) -> Value* {
6630     return Builder.getInt32(addr.getAlignment().getQuantity());
6631   };
6632 
6633   unsigned Int = LLVMIntrinsic;
6634   if ((Modifier & UnsignedAlts) && !Usgn)
6635     Int = AltLLVMIntrinsic;
6636 
6637   switch (BuiltinID) {
6638   default: break;
6639   case NEON::BI__builtin_neon_splat_lane_v:
6640   case NEON::BI__builtin_neon_splat_laneq_v:
6641   case NEON::BI__builtin_neon_splatq_lane_v:
6642   case NEON::BI__builtin_neon_splatq_laneq_v: {
6643     auto NumElements = VTy->getElementCount();
6644     if (BuiltinID == NEON::BI__builtin_neon_splatq_lane_v)
6645       NumElements = NumElements * 2;
6646     if (BuiltinID == NEON::BI__builtin_neon_splat_laneq_v)
6647       NumElements = NumElements.divideCoefficientBy(2);
6648 
6649     Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
6650     return EmitNeonSplat(Ops[0], cast<ConstantInt>(Ops[1]), NumElements);
6651   }
6652   case NEON::BI__builtin_neon_vpadd_v:
6653   case NEON::BI__builtin_neon_vpaddq_v:
6654     // We don't allow fp/int overloading of intrinsics.
6655     if (VTy->getElementType()->isFloatingPointTy() &&
6656         Int == Intrinsic::aarch64_neon_addp)
6657       Int = Intrinsic::aarch64_neon_faddp;
6658     break;
6659   case NEON::BI__builtin_neon_vabs_v:
6660   case NEON::BI__builtin_neon_vabsq_v:
6661     if (VTy->getElementType()->isFloatingPointTy())
6662       return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, Ty), Ops, "vabs");
6663     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), Ops, "vabs");
6664   case NEON::BI__builtin_neon_vadd_v:
6665   case NEON::BI__builtin_neon_vaddq_v: {
6666     llvm::Type *VTy = llvm::FixedVectorType::get(Int8Ty, Quad ? 16 : 8);
6667     Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
6668     Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
6669     Ops[0] =  Builder.CreateXor(Ops[0], Ops[1]);
6670     return Builder.CreateBitCast(Ops[0], Ty);
6671   }
6672   case NEON::BI__builtin_neon_vaddhn_v: {
6673     llvm::FixedVectorType *SrcTy =
6674         llvm::FixedVectorType::getExtendedElementVectorType(VTy);
6675 
6676     // %sum = add <4 x i32> %lhs, %rhs
6677     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
6678     Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy);
6679     Ops[0] = Builder.CreateAdd(Ops[0], Ops[1], "vaddhn");
6680 
6681     // %high = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
6682     Constant *ShiftAmt =
6683         ConstantInt::get(SrcTy, SrcTy->getScalarSizeInBits() / 2);
6684     Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vaddhn");
6685 
6686     // %res = trunc <4 x i32> %high to <4 x i16>
6687     return Builder.CreateTrunc(Ops[0], VTy, "vaddhn");
6688   }
6689   case NEON::BI__builtin_neon_vcale_v:
6690   case NEON::BI__builtin_neon_vcaleq_v:
6691   case NEON::BI__builtin_neon_vcalt_v:
6692   case NEON::BI__builtin_neon_vcaltq_v:
6693     std::swap(Ops[0], Ops[1]);
6694     LLVM_FALLTHROUGH;
6695   case NEON::BI__builtin_neon_vcage_v:
6696   case NEON::BI__builtin_neon_vcageq_v:
6697   case NEON::BI__builtin_neon_vcagt_v:
6698   case NEON::BI__builtin_neon_vcagtq_v: {
6699     llvm::Type *Ty;
6700     switch (VTy->getScalarSizeInBits()) {
6701     default: llvm_unreachable("unexpected type");
6702     case 32:
6703       Ty = FloatTy;
6704       break;
6705     case 64:
6706       Ty = DoubleTy;
6707       break;
6708     case 16:
6709       Ty = HalfTy;
6710       break;
6711     }
6712     auto *VecFlt = llvm::FixedVectorType::get(Ty, VTy->getNumElements());
6713     llvm::Type *Tys[] = { VTy, VecFlt };
6714     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
6715     return EmitNeonCall(F, Ops, NameHint);
6716   }
6717   case NEON::BI__builtin_neon_vceqz_v:
6718   case NEON::BI__builtin_neon_vceqzq_v:
6719     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OEQ,
6720                                          ICmpInst::ICMP_EQ, "vceqz");
6721   case NEON::BI__builtin_neon_vcgez_v:
6722   case NEON::BI__builtin_neon_vcgezq_v:
6723     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGE,
6724                                          ICmpInst::ICMP_SGE, "vcgez");
6725   case NEON::BI__builtin_neon_vclez_v:
6726   case NEON::BI__builtin_neon_vclezq_v:
6727     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLE,
6728                                          ICmpInst::ICMP_SLE, "vclez");
6729   case NEON::BI__builtin_neon_vcgtz_v:
6730   case NEON::BI__builtin_neon_vcgtzq_v:
6731     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGT,
6732                                          ICmpInst::ICMP_SGT, "vcgtz");
6733   case NEON::BI__builtin_neon_vcltz_v:
6734   case NEON::BI__builtin_neon_vcltzq_v:
6735     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLT,
6736                                          ICmpInst::ICMP_SLT, "vcltz");
6737   case NEON::BI__builtin_neon_vclz_v:
6738   case NEON::BI__builtin_neon_vclzq_v:
6739     // We generate target-independent intrinsic, which needs a second argument
6740     // for whether or not clz of zero is undefined; on ARM it isn't.
6741     Ops.push_back(Builder.getInt1(getTarget().isCLZForZeroUndef()));
6742     break;
6743   case NEON::BI__builtin_neon_vcvt_f32_v:
6744   case NEON::BI__builtin_neon_vcvtq_f32_v:
6745     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
6746     Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, Quad),
6747                      HasLegalHalfType);
6748     return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
6749                 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
6750   case NEON::BI__builtin_neon_vcvt_f16_v:
6751   case NEON::BI__builtin_neon_vcvtq_f16_v:
6752     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
6753     Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float16, false, Quad),
6754                      HasLegalHalfType);
6755     return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
6756                 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
6757   case NEON::BI__builtin_neon_vcvt_n_f16_v:
6758   case NEON::BI__builtin_neon_vcvt_n_f32_v:
6759   case NEON::BI__builtin_neon_vcvt_n_f64_v:
6760   case NEON::BI__builtin_neon_vcvtq_n_f16_v:
6761   case NEON::BI__builtin_neon_vcvtq_n_f32_v:
6762   case NEON::BI__builtin_neon_vcvtq_n_f64_v: {
6763     llvm::Type *Tys[2] = { GetFloatNeonType(this, Type), Ty };
6764     Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
6765     Function *F = CGM.getIntrinsic(Int, Tys);
6766     return EmitNeonCall(F, Ops, "vcvt_n");
6767   }
6768   case NEON::BI__builtin_neon_vcvt_n_s16_v:
6769   case NEON::BI__builtin_neon_vcvt_n_s32_v:
6770   case NEON::BI__builtin_neon_vcvt_n_u16_v:
6771   case NEON::BI__builtin_neon_vcvt_n_u32_v:
6772   case NEON::BI__builtin_neon_vcvt_n_s64_v:
6773   case NEON::BI__builtin_neon_vcvt_n_u64_v:
6774   case NEON::BI__builtin_neon_vcvtq_n_s16_v:
6775   case NEON::BI__builtin_neon_vcvtq_n_s32_v:
6776   case NEON::BI__builtin_neon_vcvtq_n_u16_v:
6777   case NEON::BI__builtin_neon_vcvtq_n_u32_v:
6778   case NEON::BI__builtin_neon_vcvtq_n_s64_v:
6779   case NEON::BI__builtin_neon_vcvtq_n_u64_v: {
6780     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
6781     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
6782     return EmitNeonCall(F, Ops, "vcvt_n");
6783   }
6784   case NEON::BI__builtin_neon_vcvt_s32_v:
6785   case NEON::BI__builtin_neon_vcvt_u32_v:
6786   case NEON::BI__builtin_neon_vcvt_s64_v:
6787   case NEON::BI__builtin_neon_vcvt_u64_v:
6788   case NEON::BI__builtin_neon_vcvt_s16_v:
6789   case NEON::BI__builtin_neon_vcvt_u16_v:
6790   case NEON::BI__builtin_neon_vcvtq_s32_v:
6791   case NEON::BI__builtin_neon_vcvtq_u32_v:
6792   case NEON::BI__builtin_neon_vcvtq_s64_v:
6793   case NEON::BI__builtin_neon_vcvtq_u64_v:
6794   case NEON::BI__builtin_neon_vcvtq_s16_v:
6795   case NEON::BI__builtin_neon_vcvtq_u16_v: {
6796     Ops[0] = Builder.CreateBitCast(Ops[0], GetFloatNeonType(this, Type));
6797     return Usgn ? Builder.CreateFPToUI(Ops[0], Ty, "vcvt")
6798                 : Builder.CreateFPToSI(Ops[0], Ty, "vcvt");
6799   }
6800   case NEON::BI__builtin_neon_vcvta_s16_v:
6801   case NEON::BI__builtin_neon_vcvta_s32_v:
6802   case NEON::BI__builtin_neon_vcvta_s64_v:
6803   case NEON::BI__builtin_neon_vcvta_u16_v:
6804   case NEON::BI__builtin_neon_vcvta_u32_v:
6805   case NEON::BI__builtin_neon_vcvta_u64_v:
6806   case NEON::BI__builtin_neon_vcvtaq_s16_v:
6807   case NEON::BI__builtin_neon_vcvtaq_s32_v:
6808   case NEON::BI__builtin_neon_vcvtaq_s64_v:
6809   case NEON::BI__builtin_neon_vcvtaq_u16_v:
6810   case NEON::BI__builtin_neon_vcvtaq_u32_v:
6811   case NEON::BI__builtin_neon_vcvtaq_u64_v:
6812   case NEON::BI__builtin_neon_vcvtn_s16_v:
6813   case NEON::BI__builtin_neon_vcvtn_s32_v:
6814   case NEON::BI__builtin_neon_vcvtn_s64_v:
6815   case NEON::BI__builtin_neon_vcvtn_u16_v:
6816   case NEON::BI__builtin_neon_vcvtn_u32_v:
6817   case NEON::BI__builtin_neon_vcvtn_u64_v:
6818   case NEON::BI__builtin_neon_vcvtnq_s16_v:
6819   case NEON::BI__builtin_neon_vcvtnq_s32_v:
6820   case NEON::BI__builtin_neon_vcvtnq_s64_v:
6821   case NEON::BI__builtin_neon_vcvtnq_u16_v:
6822   case NEON::BI__builtin_neon_vcvtnq_u32_v:
6823   case NEON::BI__builtin_neon_vcvtnq_u64_v:
6824   case NEON::BI__builtin_neon_vcvtp_s16_v:
6825   case NEON::BI__builtin_neon_vcvtp_s32_v:
6826   case NEON::BI__builtin_neon_vcvtp_s64_v:
6827   case NEON::BI__builtin_neon_vcvtp_u16_v:
6828   case NEON::BI__builtin_neon_vcvtp_u32_v:
6829   case NEON::BI__builtin_neon_vcvtp_u64_v:
6830   case NEON::BI__builtin_neon_vcvtpq_s16_v:
6831   case NEON::BI__builtin_neon_vcvtpq_s32_v:
6832   case NEON::BI__builtin_neon_vcvtpq_s64_v:
6833   case NEON::BI__builtin_neon_vcvtpq_u16_v:
6834   case NEON::BI__builtin_neon_vcvtpq_u32_v:
6835   case NEON::BI__builtin_neon_vcvtpq_u64_v:
6836   case NEON::BI__builtin_neon_vcvtm_s16_v:
6837   case NEON::BI__builtin_neon_vcvtm_s32_v:
6838   case NEON::BI__builtin_neon_vcvtm_s64_v:
6839   case NEON::BI__builtin_neon_vcvtm_u16_v:
6840   case NEON::BI__builtin_neon_vcvtm_u32_v:
6841   case NEON::BI__builtin_neon_vcvtm_u64_v:
6842   case NEON::BI__builtin_neon_vcvtmq_s16_v:
6843   case NEON::BI__builtin_neon_vcvtmq_s32_v:
6844   case NEON::BI__builtin_neon_vcvtmq_s64_v:
6845   case NEON::BI__builtin_neon_vcvtmq_u16_v:
6846   case NEON::BI__builtin_neon_vcvtmq_u32_v:
6847   case NEON::BI__builtin_neon_vcvtmq_u64_v: {
6848     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
6849     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, NameHint);
6850   }
6851   case NEON::BI__builtin_neon_vcvtx_f32_v: {
6852     llvm::Type *Tys[2] = { VTy->getTruncatedElementVectorType(VTy), Ty};
6853     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, NameHint);
6854 
6855   }
6856   case NEON::BI__builtin_neon_vext_v:
6857   case NEON::BI__builtin_neon_vextq_v: {
6858     int CV = cast<ConstantInt>(Ops[2])->getSExtValue();
6859     SmallVector<int, 16> Indices;
6860     for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
6861       Indices.push_back(i+CV);
6862 
6863     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
6864     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
6865     return Builder.CreateShuffleVector(Ops[0], Ops[1], Indices, "vext");
6866   }
6867   case NEON::BI__builtin_neon_vfma_v:
6868   case NEON::BI__builtin_neon_vfmaq_v: {
6869     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
6870     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
6871     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
6872 
6873     // NEON intrinsic puts accumulator first, unlike the LLVM fma.
6874     return emitCallMaybeConstrainedFPBuiltin(
6875         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
6876         {Ops[1], Ops[2], Ops[0]});
6877   }
6878   case NEON::BI__builtin_neon_vld1_v:
6879   case NEON::BI__builtin_neon_vld1q_v: {
6880     llvm::Type *Tys[] = {Ty, Int8PtrTy};
6881     Ops.push_back(getAlignmentValue32(PtrOp0));
6882     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "vld1");
6883   }
6884   case NEON::BI__builtin_neon_vld1_x2_v:
6885   case NEON::BI__builtin_neon_vld1q_x2_v:
6886   case NEON::BI__builtin_neon_vld1_x3_v:
6887   case NEON::BI__builtin_neon_vld1q_x3_v:
6888   case NEON::BI__builtin_neon_vld1_x4_v:
6889   case NEON::BI__builtin_neon_vld1q_x4_v: {
6890     llvm::Type *PTy = llvm::PointerType::getUnqual(VTy->getElementType());
6891     Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
6892     llvm::Type *Tys[2] = { VTy, PTy };
6893     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
6894     Ops[1] = Builder.CreateCall(F, Ops[1], "vld1xN");
6895     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
6896     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
6897     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
6898   }
6899   case NEON::BI__builtin_neon_vld2_v:
6900   case NEON::BI__builtin_neon_vld2q_v:
6901   case NEON::BI__builtin_neon_vld3_v:
6902   case NEON::BI__builtin_neon_vld3q_v:
6903   case NEON::BI__builtin_neon_vld4_v:
6904   case NEON::BI__builtin_neon_vld4q_v:
6905   case NEON::BI__builtin_neon_vld2_dup_v:
6906   case NEON::BI__builtin_neon_vld2q_dup_v:
6907   case NEON::BI__builtin_neon_vld3_dup_v:
6908   case NEON::BI__builtin_neon_vld3q_dup_v:
6909   case NEON::BI__builtin_neon_vld4_dup_v:
6910   case NEON::BI__builtin_neon_vld4q_dup_v: {
6911     llvm::Type *Tys[] = {Ty, Int8PtrTy};
6912     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
6913     Value *Align = getAlignmentValue32(PtrOp1);
6914     Ops[1] = Builder.CreateCall(F, {Ops[1], Align}, NameHint);
6915     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
6916     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
6917     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
6918   }
6919   case NEON::BI__builtin_neon_vld1_dup_v:
6920   case NEON::BI__builtin_neon_vld1q_dup_v: {
6921     Value *V = UndefValue::get(Ty);
6922     Ty = llvm::PointerType::getUnqual(VTy->getElementType());
6923     PtrOp0 = Builder.CreateBitCast(PtrOp0, Ty);
6924     LoadInst *Ld = Builder.CreateLoad(PtrOp0);
6925     llvm::Constant *CI = ConstantInt::get(SizeTy, 0);
6926     Ops[0] = Builder.CreateInsertElement(V, Ld, CI);
6927     return EmitNeonSplat(Ops[0], CI);
6928   }
6929   case NEON::BI__builtin_neon_vld2_lane_v:
6930   case NEON::BI__builtin_neon_vld2q_lane_v:
6931   case NEON::BI__builtin_neon_vld3_lane_v:
6932   case NEON::BI__builtin_neon_vld3q_lane_v:
6933   case NEON::BI__builtin_neon_vld4_lane_v:
6934   case NEON::BI__builtin_neon_vld4q_lane_v: {
6935     llvm::Type *Tys[] = {Ty, Int8PtrTy};
6936     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
6937     for (unsigned I = 2; I < Ops.size() - 1; ++I)
6938       Ops[I] = Builder.CreateBitCast(Ops[I], Ty);
6939     Ops.push_back(getAlignmentValue32(PtrOp1));
6940     Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), NameHint);
6941     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
6942     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
6943     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
6944   }
6945   case NEON::BI__builtin_neon_vmovl_v: {
6946     llvm::FixedVectorType *DTy =
6947         llvm::FixedVectorType::getTruncatedElementVectorType(VTy);
6948     Ops[0] = Builder.CreateBitCast(Ops[0], DTy);
6949     if (Usgn)
6950       return Builder.CreateZExt(Ops[0], Ty, "vmovl");
6951     return Builder.CreateSExt(Ops[0], Ty, "vmovl");
6952   }
6953   case NEON::BI__builtin_neon_vmovn_v: {
6954     llvm::FixedVectorType *QTy =
6955         llvm::FixedVectorType::getExtendedElementVectorType(VTy);
6956     Ops[0] = Builder.CreateBitCast(Ops[0], QTy);
6957     return Builder.CreateTrunc(Ops[0], Ty, "vmovn");
6958   }
6959   case NEON::BI__builtin_neon_vmull_v:
6960     // FIXME: the integer vmull operations could be emitted in terms of pure
6961     // LLVM IR (2 exts followed by a mul). Unfortunately LLVM has a habit of
6962     // hoisting the exts outside loops. Until global ISel comes along that can
6963     // see through such movement this leads to bad CodeGen. So we need an
6964     // intrinsic for now.
6965     Int = Usgn ? Intrinsic::arm_neon_vmullu : Intrinsic::arm_neon_vmulls;
6966     Int = Type.isPoly() ? (unsigned)Intrinsic::arm_neon_vmullp : Int;
6967     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull");
6968   case NEON::BI__builtin_neon_vpadal_v:
6969   case NEON::BI__builtin_neon_vpadalq_v: {
6970     // The source operand type has twice as many elements of half the size.
6971     unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
6972     llvm::Type *EltTy =
6973       llvm::IntegerType::get(getLLVMContext(), EltBits / 2);
6974     auto *NarrowTy =
6975         llvm::FixedVectorType::get(EltTy, VTy->getNumElements() * 2);
6976     llvm::Type *Tys[2] = { Ty, NarrowTy };
6977     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
6978   }
6979   case NEON::BI__builtin_neon_vpaddl_v:
6980   case NEON::BI__builtin_neon_vpaddlq_v: {
6981     // The source operand type has twice as many elements of half the size.
6982     unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
6983     llvm::Type *EltTy = llvm::IntegerType::get(getLLVMContext(), EltBits / 2);
6984     auto *NarrowTy =
6985         llvm::FixedVectorType::get(EltTy, VTy->getNumElements() * 2);
6986     llvm::Type *Tys[2] = { Ty, NarrowTy };
6987     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vpaddl");
6988   }
6989   case NEON::BI__builtin_neon_vqdmlal_v:
6990   case NEON::BI__builtin_neon_vqdmlsl_v: {
6991     SmallVector<Value *, 2> MulOps(Ops.begin() + 1, Ops.end());
6992     Ops[1] =
6993         EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), MulOps, "vqdmlal");
6994     Ops.resize(2);
6995     return EmitNeonCall(CGM.getIntrinsic(AltLLVMIntrinsic, Ty), Ops, NameHint);
6996   }
6997   case NEON::BI__builtin_neon_vqdmulhq_lane_v:
6998   case NEON::BI__builtin_neon_vqdmulh_lane_v:
6999   case NEON::BI__builtin_neon_vqrdmulhq_lane_v:
7000   case NEON::BI__builtin_neon_vqrdmulh_lane_v: {
7001     auto *RTy = cast<llvm::FixedVectorType>(Ty);
7002     if (BuiltinID == NEON::BI__builtin_neon_vqdmulhq_lane_v ||
7003         BuiltinID == NEON::BI__builtin_neon_vqrdmulhq_lane_v)
7004       RTy = llvm::FixedVectorType::get(RTy->getElementType(),
7005                                        RTy->getNumElements() * 2);
7006     llvm::Type *Tys[2] = {
7007         RTy, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
7008                                              /*isQuad*/ false))};
7009     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
7010   }
7011   case NEON::BI__builtin_neon_vqdmulhq_laneq_v:
7012   case NEON::BI__builtin_neon_vqdmulh_laneq_v:
7013   case NEON::BI__builtin_neon_vqrdmulhq_laneq_v:
7014   case NEON::BI__builtin_neon_vqrdmulh_laneq_v: {
7015     llvm::Type *Tys[2] = {
7016         Ty, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
7017                                             /*isQuad*/ true))};
7018     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
7019   }
7020   case NEON::BI__builtin_neon_vqshl_n_v:
7021   case NEON::BI__builtin_neon_vqshlq_n_v:
7022     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshl_n",
7023                         1, false);
7024   case NEON::BI__builtin_neon_vqshlu_n_v:
7025   case NEON::BI__builtin_neon_vqshluq_n_v:
7026     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshlu_n",
7027                         1, false);
7028   case NEON::BI__builtin_neon_vrecpe_v:
7029   case NEON::BI__builtin_neon_vrecpeq_v:
7030   case NEON::BI__builtin_neon_vrsqrte_v:
7031   case NEON::BI__builtin_neon_vrsqrteq_v:
7032     Int = Ty->isFPOrFPVectorTy() ? LLVMIntrinsic : AltLLVMIntrinsic;
7033     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint);
7034   case NEON::BI__builtin_neon_vrndi_v:
7035   case NEON::BI__builtin_neon_vrndiq_v:
7036     Int = Builder.getIsFPConstrained()
7037               ? Intrinsic::experimental_constrained_nearbyint
7038               : Intrinsic::nearbyint;
7039     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint);
7040   case NEON::BI__builtin_neon_vrshr_n_v:
7041   case NEON::BI__builtin_neon_vrshrq_n_v:
7042     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshr_n",
7043                         1, true);
7044   case NEON::BI__builtin_neon_vsha512hq_v:
7045   case NEON::BI__builtin_neon_vsha512h2q_v:
7046   case NEON::BI__builtin_neon_vsha512su0q_v:
7047   case NEON::BI__builtin_neon_vsha512su1q_v: {
7048     Function *F = CGM.getIntrinsic(Int);
7049     return EmitNeonCall(F, Ops, "");
7050   }
7051   case NEON::BI__builtin_neon_vshl_n_v:
7052   case NEON::BI__builtin_neon_vshlq_n_v:
7053     Ops[1] = EmitNeonShiftVector(Ops[1], Ty, false);
7054     return Builder.CreateShl(Builder.CreateBitCast(Ops[0],Ty), Ops[1],
7055                              "vshl_n");
7056   case NEON::BI__builtin_neon_vshll_n_v: {
7057     llvm::FixedVectorType *SrcTy =
7058         llvm::FixedVectorType::getTruncatedElementVectorType(VTy);
7059     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
7060     if (Usgn)
7061       Ops[0] = Builder.CreateZExt(Ops[0], VTy);
7062     else
7063       Ops[0] = Builder.CreateSExt(Ops[0], VTy);
7064     Ops[1] = EmitNeonShiftVector(Ops[1], VTy, false);
7065     return Builder.CreateShl(Ops[0], Ops[1], "vshll_n");
7066   }
7067   case NEON::BI__builtin_neon_vshrn_n_v: {
7068     llvm::FixedVectorType *SrcTy =
7069         llvm::FixedVectorType::getExtendedElementVectorType(VTy);
7070     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
7071     Ops[1] = EmitNeonShiftVector(Ops[1], SrcTy, false);
7072     if (Usgn)
7073       Ops[0] = Builder.CreateLShr(Ops[0], Ops[1]);
7074     else
7075       Ops[0] = Builder.CreateAShr(Ops[0], Ops[1]);
7076     return Builder.CreateTrunc(Ops[0], Ty, "vshrn_n");
7077   }
7078   case NEON::BI__builtin_neon_vshr_n_v:
7079   case NEON::BI__builtin_neon_vshrq_n_v:
7080     return EmitNeonRShiftImm(Ops[0], Ops[1], Ty, Usgn, "vshr_n");
7081   case NEON::BI__builtin_neon_vst1_v:
7082   case NEON::BI__builtin_neon_vst1q_v:
7083   case NEON::BI__builtin_neon_vst2_v:
7084   case NEON::BI__builtin_neon_vst2q_v:
7085   case NEON::BI__builtin_neon_vst3_v:
7086   case NEON::BI__builtin_neon_vst3q_v:
7087   case NEON::BI__builtin_neon_vst4_v:
7088   case NEON::BI__builtin_neon_vst4q_v:
7089   case NEON::BI__builtin_neon_vst2_lane_v:
7090   case NEON::BI__builtin_neon_vst2q_lane_v:
7091   case NEON::BI__builtin_neon_vst3_lane_v:
7092   case NEON::BI__builtin_neon_vst3q_lane_v:
7093   case NEON::BI__builtin_neon_vst4_lane_v:
7094   case NEON::BI__builtin_neon_vst4q_lane_v: {
7095     llvm::Type *Tys[] = {Int8PtrTy, Ty};
7096     Ops.push_back(getAlignmentValue32(PtrOp0));
7097     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "");
7098   }
7099   case NEON::BI__builtin_neon_vsm3partw1q_v:
7100   case NEON::BI__builtin_neon_vsm3partw2q_v:
7101   case NEON::BI__builtin_neon_vsm3ss1q_v:
7102   case NEON::BI__builtin_neon_vsm4ekeyq_v:
7103   case NEON::BI__builtin_neon_vsm4eq_v: {
7104     Function *F = CGM.getIntrinsic(Int);
7105     return EmitNeonCall(F, Ops, "");
7106   }
7107   case NEON::BI__builtin_neon_vsm3tt1aq_v:
7108   case NEON::BI__builtin_neon_vsm3tt1bq_v:
7109   case NEON::BI__builtin_neon_vsm3tt2aq_v:
7110   case NEON::BI__builtin_neon_vsm3tt2bq_v: {
7111     Function *F = CGM.getIntrinsic(Int);
7112     Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty);
7113     return EmitNeonCall(F, Ops, "");
7114   }
7115   case NEON::BI__builtin_neon_vst1_x2_v:
7116   case NEON::BI__builtin_neon_vst1q_x2_v:
7117   case NEON::BI__builtin_neon_vst1_x3_v:
7118   case NEON::BI__builtin_neon_vst1q_x3_v:
7119   case NEON::BI__builtin_neon_vst1_x4_v:
7120   case NEON::BI__builtin_neon_vst1q_x4_v: {
7121     llvm::Type *PTy = llvm::PointerType::getUnqual(VTy->getElementType());
7122     // TODO: Currently in AArch32 mode the pointer operand comes first, whereas
7123     // in AArch64 it comes last. We may want to stick to one or another.
7124     if (Arch == llvm::Triple::aarch64 || Arch == llvm::Triple::aarch64_be ||
7125         Arch == llvm::Triple::aarch64_32) {
7126       llvm::Type *Tys[2] = { VTy, PTy };
7127       std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
7128       return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "");
7129     }
7130     llvm::Type *Tys[2] = { PTy, VTy };
7131     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "");
7132   }
7133   case NEON::BI__builtin_neon_vsubhn_v: {
7134     llvm::FixedVectorType *SrcTy =
7135         llvm::FixedVectorType::getExtendedElementVectorType(VTy);
7136 
7137     // %sum = add <4 x i32> %lhs, %rhs
7138     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
7139     Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy);
7140     Ops[0] = Builder.CreateSub(Ops[0], Ops[1], "vsubhn");
7141 
7142     // %high = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
7143     Constant *ShiftAmt =
7144         ConstantInt::get(SrcTy, SrcTy->getScalarSizeInBits() / 2);
7145     Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vsubhn");
7146 
7147     // %res = trunc <4 x i32> %high to <4 x i16>
7148     return Builder.CreateTrunc(Ops[0], VTy, "vsubhn");
7149   }
7150   case NEON::BI__builtin_neon_vtrn_v:
7151   case NEON::BI__builtin_neon_vtrnq_v: {
7152     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
7153     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7154     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
7155     Value *SV = nullptr;
7156 
7157     for (unsigned vi = 0; vi != 2; ++vi) {
7158       SmallVector<int, 16> Indices;
7159       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
7160         Indices.push_back(i+vi);
7161         Indices.push_back(i+e+vi);
7162       }
7163       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
7164       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vtrn");
7165       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
7166     }
7167     return SV;
7168   }
7169   case NEON::BI__builtin_neon_vtst_v:
7170   case NEON::BI__builtin_neon_vtstq_v: {
7171     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7172     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7173     Ops[0] = Builder.CreateAnd(Ops[0], Ops[1]);
7174     Ops[0] = Builder.CreateICmp(ICmpInst::ICMP_NE, Ops[0],
7175                                 ConstantAggregateZero::get(Ty));
7176     return Builder.CreateSExt(Ops[0], Ty, "vtst");
7177   }
7178   case NEON::BI__builtin_neon_vuzp_v:
7179   case NEON::BI__builtin_neon_vuzpq_v: {
7180     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
7181     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7182     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
7183     Value *SV = nullptr;
7184 
7185     for (unsigned vi = 0; vi != 2; ++vi) {
7186       SmallVector<int, 16> Indices;
7187       for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
7188         Indices.push_back(2*i+vi);
7189 
7190       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
7191       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vuzp");
7192       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
7193     }
7194     return SV;
7195   }
7196   case NEON::BI__builtin_neon_vxarq_v: {
7197     Function *F = CGM.getIntrinsic(Int);
7198     Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
7199     return EmitNeonCall(F, Ops, "");
7200   }
7201   case NEON::BI__builtin_neon_vzip_v:
7202   case NEON::BI__builtin_neon_vzipq_v: {
7203     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
7204     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7205     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
7206     Value *SV = nullptr;
7207 
7208     for (unsigned vi = 0; vi != 2; ++vi) {
7209       SmallVector<int, 16> Indices;
7210       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
7211         Indices.push_back((i + vi*e) >> 1);
7212         Indices.push_back(((i + vi*e) >> 1)+e);
7213       }
7214       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
7215       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vzip");
7216       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
7217     }
7218     return SV;
7219   }
7220   case NEON::BI__builtin_neon_vdot_v:
7221   case NEON::BI__builtin_neon_vdotq_v: {
7222     auto *InputTy =
7223         llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
7224     llvm::Type *Tys[2] = { Ty, InputTy };
7225     Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
7226     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vdot");
7227   }
7228   case NEON::BI__builtin_neon_vfmlal_low_v:
7229   case NEON::BI__builtin_neon_vfmlalq_low_v: {
7230     auto *InputTy =
7231         llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
7232     llvm::Type *Tys[2] = { Ty, InputTy };
7233     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_low");
7234   }
7235   case NEON::BI__builtin_neon_vfmlsl_low_v:
7236   case NEON::BI__builtin_neon_vfmlslq_low_v: {
7237     auto *InputTy =
7238         llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
7239     llvm::Type *Tys[2] = { Ty, InputTy };
7240     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_low");
7241   }
7242   case NEON::BI__builtin_neon_vfmlal_high_v:
7243   case NEON::BI__builtin_neon_vfmlalq_high_v: {
7244     auto *InputTy =
7245         llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
7246     llvm::Type *Tys[2] = { Ty, InputTy };
7247     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_high");
7248   }
7249   case NEON::BI__builtin_neon_vfmlsl_high_v:
7250   case NEON::BI__builtin_neon_vfmlslq_high_v: {
7251     auto *InputTy =
7252         llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
7253     llvm::Type *Tys[2] = { Ty, InputTy };
7254     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_high");
7255   }
7256   case NEON::BI__builtin_neon_vmmlaq_v: {
7257     auto *InputTy =
7258         llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
7259     llvm::Type *Tys[2] = { Ty, InputTy };
7260     Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
7261     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmmla");
7262   }
7263   case NEON::BI__builtin_neon_vusmmlaq_v: {
7264     auto *InputTy =
7265         llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
7266     llvm::Type *Tys[2] = { Ty, InputTy };
7267     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusmmla");
7268   }
7269   case NEON::BI__builtin_neon_vusdot_v:
7270   case NEON::BI__builtin_neon_vusdotq_v: {
7271     auto *InputTy =
7272         llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
7273     llvm::Type *Tys[2] = { Ty, InputTy };
7274     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusdot");
7275   }
7276   case NEON::BI__builtin_neon_vbfdot_v:
7277   case NEON::BI__builtin_neon_vbfdotq_v: {
7278     llvm::Type *InputTy =
7279         llvm::FixedVectorType::get(BFloatTy, Ty->getPrimitiveSizeInBits() / 16);
7280     llvm::Type *Tys[2] = { Ty, InputTy };
7281     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfdot");
7282   }
7283   case NEON::BI__builtin_neon___a32_vcvt_bf16_v: {
7284     llvm::Type *Tys[1] = { Ty };
7285     Function *F = CGM.getIntrinsic(Int, Tys);
7286     return EmitNeonCall(F, Ops, "vcvtfp2bf");
7287   }
7288 
7289   }
7290 
7291   assert(Int && "Expected valid intrinsic number");
7292 
7293   // Determine the type(s) of this overloaded AArch64 intrinsic.
7294   Function *F = LookupNeonLLVMIntrinsic(Int, Modifier, Ty, E);
7295 
7296   Value *Result = EmitNeonCall(F, Ops, NameHint);
7297   llvm::Type *ResultType = ConvertType(E->getType());
7298   // AArch64 intrinsic one-element vector type cast to
7299   // scalar type expected by the builtin
7300   return Builder.CreateBitCast(Result, ResultType, NameHint);
7301 }
7302 
7303 Value *CodeGenFunction::EmitAArch64CompareBuiltinExpr(
7304     Value *Op, llvm::Type *Ty, const CmpInst::Predicate Fp,
7305     const CmpInst::Predicate Ip, const Twine &Name) {
7306   llvm::Type *OTy = Op->getType();
7307 
7308   // FIXME: this is utterly horrific. We should not be looking at previous
7309   // codegen context to find out what needs doing. Unfortunately TableGen
7310   // currently gives us exactly the same calls for vceqz_f32 and vceqz_s32
7311   // (etc).
7312   if (BitCastInst *BI = dyn_cast<BitCastInst>(Op))
7313     OTy = BI->getOperand(0)->getType();
7314 
7315   Op = Builder.CreateBitCast(Op, OTy);
7316   if (OTy->getScalarType()->isFloatingPointTy()) {
7317     if (Fp == CmpInst::FCMP_OEQ)
7318       Op = Builder.CreateFCmp(Fp, Op, Constant::getNullValue(OTy));
7319     else
7320       Op = Builder.CreateFCmpS(Fp, Op, Constant::getNullValue(OTy));
7321   } else {
7322     Op = Builder.CreateICmp(Ip, Op, Constant::getNullValue(OTy));
7323   }
7324   return Builder.CreateSExt(Op, Ty, Name);
7325 }
7326 
7327 static Value *packTBLDVectorList(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
7328                                  Value *ExtOp, Value *IndexOp,
7329                                  llvm::Type *ResTy, unsigned IntID,
7330                                  const char *Name) {
7331   SmallVector<Value *, 2> TblOps;
7332   if (ExtOp)
7333     TblOps.push_back(ExtOp);
7334 
7335   // Build a vector containing sequential number like (0, 1, 2, ..., 15)
7336   SmallVector<int, 16> Indices;
7337   auto *TblTy = cast<llvm::FixedVectorType>(Ops[0]->getType());
7338   for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) {
7339     Indices.push_back(2*i);
7340     Indices.push_back(2*i+1);
7341   }
7342 
7343   int PairPos = 0, End = Ops.size() - 1;
7344   while (PairPos < End) {
7345     TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos],
7346                                                      Ops[PairPos+1], Indices,
7347                                                      Name));
7348     PairPos += 2;
7349   }
7350 
7351   // If there's an odd number of 64-bit lookup table, fill the high 64-bit
7352   // of the 128-bit lookup table with zero.
7353   if (PairPos == End) {
7354     Value *ZeroTbl = ConstantAggregateZero::get(TblTy);
7355     TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos],
7356                                                      ZeroTbl, Indices, Name));
7357   }
7358 
7359   Function *TblF;
7360   TblOps.push_back(IndexOp);
7361   TblF = CGF.CGM.getIntrinsic(IntID, ResTy);
7362 
7363   return CGF.EmitNeonCall(TblF, TblOps, Name);
7364 }
7365 
7366 Value *CodeGenFunction::GetValueForARMHint(unsigned BuiltinID) {
7367   unsigned Value;
7368   switch (BuiltinID) {
7369   default:
7370     return nullptr;
7371   case ARM::BI__builtin_arm_nop:
7372     Value = 0;
7373     break;
7374   case ARM::BI__builtin_arm_yield:
7375   case ARM::BI__yield:
7376     Value = 1;
7377     break;
7378   case ARM::BI__builtin_arm_wfe:
7379   case ARM::BI__wfe:
7380     Value = 2;
7381     break;
7382   case ARM::BI__builtin_arm_wfi:
7383   case ARM::BI__wfi:
7384     Value = 3;
7385     break;
7386   case ARM::BI__builtin_arm_sev:
7387   case ARM::BI__sev:
7388     Value = 4;
7389     break;
7390   case ARM::BI__builtin_arm_sevl:
7391   case ARM::BI__sevl:
7392     Value = 5;
7393     break;
7394   }
7395 
7396   return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_hint),
7397                             llvm::ConstantInt::get(Int32Ty, Value));
7398 }
7399 
// Selects which register intrinsic EmitSpecialRegisterBuiltin emits.
enum SpecialRegisterAccessKind {
  // Read through llvm::Intrinsic::read_register.
  NormalRead,
  // Read through llvm::Intrinsic::read_volatile_register.
  VolatileRead,
  // Write through llvm::Intrinsic::write_register.
  Write,
};
7405 
7406 // Generates the IR for the read/write special register builtin,
7407 // ValueType is the type of the value that is to be written or read,
7408 // RegisterType is the type of the register being written to or read from.
7409 static Value *EmitSpecialRegisterBuiltin(CodeGenFunction &CGF,
7410                                          const CallExpr *E,
7411                                          llvm::Type *RegisterType,
7412                                          llvm::Type *ValueType,
7413                                          SpecialRegisterAccessKind AccessKind,
7414                                          StringRef SysReg = "") {
7415   // write and register intrinsics only support 32 and 64 bit operations.
7416   assert((RegisterType->isIntegerTy(32) || RegisterType->isIntegerTy(64))
7417           && "Unsupported size for register.");
7418 
7419   CodeGen::CGBuilderTy &Builder = CGF.Builder;
7420   CodeGen::CodeGenModule &CGM = CGF.CGM;
7421   LLVMContext &Context = CGM.getLLVMContext();
7422 
7423   if (SysReg.empty()) {
7424     const Expr *SysRegStrExpr = E->getArg(0)->IgnoreParenCasts();
7425     SysReg = cast<clang::StringLiteral>(SysRegStrExpr)->getString();
7426   }
7427 
7428   llvm::Metadata *Ops[] = { llvm::MDString::get(Context, SysReg) };
7429   llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
7430   llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
7431 
7432   llvm::Type *Types[] = { RegisterType };
7433 
7434   bool MixedTypes = RegisterType->isIntegerTy(64) && ValueType->isIntegerTy(32);
7435   assert(!(RegisterType->isIntegerTy(32) && ValueType->isIntegerTy(64))
7436             && "Can't fit 64-bit value in 32-bit register");
7437 
7438   if (AccessKind != Write) {
7439     assert(AccessKind == NormalRead || AccessKind == VolatileRead);
7440     llvm::Function *F = CGM.getIntrinsic(
7441         AccessKind == VolatileRead ? llvm::Intrinsic::read_volatile_register
7442                                    : llvm::Intrinsic::read_register,
7443         Types);
7444     llvm::Value *Call = Builder.CreateCall(F, Metadata);
7445 
7446     if (MixedTypes)
7447       // Read into 64 bit register and then truncate result to 32 bit.
7448       return Builder.CreateTrunc(Call, ValueType);
7449 
7450     if (ValueType->isPointerTy())
7451       // Have i32/i64 result (Call) but want to return a VoidPtrTy (i8*).
7452       return Builder.CreateIntToPtr(Call, ValueType);
7453 
7454     return Call;
7455   }
7456 
7457   llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::write_register, Types);
7458   llvm::Value *ArgValue = CGF.EmitScalarExpr(E->getArg(1));
7459   if (MixedTypes) {
7460     // Extend 32 bit write value to 64 bit to pass to write.
7461     ArgValue = Builder.CreateZExt(ArgValue, RegisterType);
7462     return Builder.CreateCall(F, { Metadata, ArgValue });
7463   }
7464 
7465   if (ValueType->isPointerTy()) {
7466     // Have VoidPtrTy ArgValue but want to return an i32/i64.
7467     ArgValue = Builder.CreatePtrToInt(ArgValue, RegisterType);
7468     return Builder.CreateCall(F, { Metadata, ArgValue });
7469   }
7470 
7471   return Builder.CreateCall(F, { Metadata, ArgValue });
7472 }
7473 
7474 /// Return true if BuiltinID is an overloaded Neon intrinsic with an extra
7475 /// argument that specifies the vector type.
7476 static bool HasExtraNeonArgument(unsigned BuiltinID) {
7477   switch (BuiltinID) {
7478   default: break;
7479   case NEON::BI__builtin_neon_vget_lane_i8:
7480   case NEON::BI__builtin_neon_vget_lane_i16:
7481   case NEON::BI__builtin_neon_vget_lane_bf16:
7482   case NEON::BI__builtin_neon_vget_lane_i32:
7483   case NEON::BI__builtin_neon_vget_lane_i64:
7484   case NEON::BI__builtin_neon_vget_lane_f32:
7485   case NEON::BI__builtin_neon_vgetq_lane_i8:
7486   case NEON::BI__builtin_neon_vgetq_lane_i16:
7487   case NEON::BI__builtin_neon_vgetq_lane_bf16:
7488   case NEON::BI__builtin_neon_vgetq_lane_i32:
7489   case NEON::BI__builtin_neon_vgetq_lane_i64:
7490   case NEON::BI__builtin_neon_vgetq_lane_f32:
7491   case NEON::BI__builtin_neon_vduph_lane_bf16:
7492   case NEON::BI__builtin_neon_vduph_laneq_bf16:
7493   case NEON::BI__builtin_neon_vset_lane_i8:
7494   case NEON::BI__builtin_neon_vset_lane_i16:
7495   case NEON::BI__builtin_neon_vset_lane_bf16:
7496   case NEON::BI__builtin_neon_vset_lane_i32:
7497   case NEON::BI__builtin_neon_vset_lane_i64:
7498   case NEON::BI__builtin_neon_vset_lane_f32:
7499   case NEON::BI__builtin_neon_vsetq_lane_i8:
7500   case NEON::BI__builtin_neon_vsetq_lane_i16:
7501   case NEON::BI__builtin_neon_vsetq_lane_bf16:
7502   case NEON::BI__builtin_neon_vsetq_lane_i32:
7503   case NEON::BI__builtin_neon_vsetq_lane_i64:
7504   case NEON::BI__builtin_neon_vsetq_lane_f32:
7505   case NEON::BI__builtin_neon_vsha1h_u32:
7506   case NEON::BI__builtin_neon_vsha1cq_u32:
7507   case NEON::BI__builtin_neon_vsha1pq_u32:
7508   case NEON::BI__builtin_neon_vsha1mq_u32:
7509   case NEON::BI__builtin_neon_vcvth_bf16_f32:
7510   case clang::ARM::BI_MoveToCoprocessor:
7511   case clang::ARM::BI_MoveToCoprocessor2:
7512     return false;
7513   }
7514   return true;
7515 }
7516 
7517 Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
7518                                            const CallExpr *E,
7519                                            ReturnValueSlot ReturnValue,
7520                                            llvm::Triple::ArchType Arch) {
7521   if (auto Hint = GetValueForARMHint(BuiltinID))
7522     return Hint;
7523 
7524   if (BuiltinID == ARM::BI__emit) {
7525     bool IsThumb = getTarget().getTriple().getArch() == llvm::Triple::thumb;
7526     llvm::FunctionType *FTy =
7527         llvm::FunctionType::get(VoidTy, /*Variadic=*/false);
7528 
7529     Expr::EvalResult Result;
7530     if (!E->getArg(0)->EvaluateAsInt(Result, CGM.getContext()))
7531       llvm_unreachable("Sema will ensure that the parameter is constant");
7532 
7533     llvm::APSInt Value = Result.Val.getInt();
7534     uint64_t ZExtValue = Value.zextOrTrunc(IsThumb ? 16 : 32).getZExtValue();
7535 
7536     llvm::InlineAsm *Emit =
7537         IsThumb ? InlineAsm::get(FTy, ".inst.n 0x" + utohexstr(ZExtValue), "",
7538                                  /*hasSideEffects=*/true)
7539                 : InlineAsm::get(FTy, ".inst 0x" + utohexstr(ZExtValue), "",
7540                                  /*hasSideEffects=*/true);
7541 
7542     return Builder.CreateCall(Emit);
7543   }
7544 
7545   if (BuiltinID == ARM::BI__builtin_arm_dbg) {
7546     Value *Option = EmitScalarExpr(E->getArg(0));
7547     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_dbg), Option);
7548   }
7549 
7550   if (BuiltinID == ARM::BI__builtin_arm_prefetch) {
7551     Value *Address = EmitScalarExpr(E->getArg(0));
7552     Value *RW      = EmitScalarExpr(E->getArg(1));
7553     Value *IsData  = EmitScalarExpr(E->getArg(2));
7554 
7555     // Locality is not supported on ARM target
7556     Value *Locality = llvm::ConstantInt::get(Int32Ty, 3);
7557 
7558     Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType());
7559     return Builder.CreateCall(F, {Address, RW, Locality, IsData});
7560   }
7561 
7562   if (BuiltinID == ARM::BI__builtin_arm_rbit) {
7563     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
7564     return Builder.CreateCall(
7565         CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
7566   }
7567 
7568   if (BuiltinID == ARM::BI__builtin_arm_cls) {
7569     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
7570     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_cls), Arg, "cls");
7571   }
7572   if (BuiltinID == ARM::BI__builtin_arm_cls64) {
7573     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
7574     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_cls64), Arg,
7575                               "cls");
7576   }
7577 
7578   if (BuiltinID == ARM::BI__clear_cache) {
7579     assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
7580     const FunctionDecl *FD = E->getDirectCallee();
7581     Value *Ops[2];
7582     for (unsigned i = 0; i < 2; i++)
7583       Ops[i] = EmitScalarExpr(E->getArg(i));
7584     llvm::Type *Ty = CGM.getTypes().ConvertType(FD->getType());
7585     llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
7586     StringRef Name = FD->getName();
7587     return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops);
7588   }
7589 
7590   if (BuiltinID == ARM::BI__builtin_arm_mcrr ||
7591       BuiltinID == ARM::BI__builtin_arm_mcrr2) {
7592     Function *F;
7593 
7594     switch (BuiltinID) {
7595     default: llvm_unreachable("unexpected builtin");
7596     case ARM::BI__builtin_arm_mcrr:
7597       F = CGM.getIntrinsic(Intrinsic::arm_mcrr);
7598       break;
7599     case ARM::BI__builtin_arm_mcrr2:
7600       F = CGM.getIntrinsic(Intrinsic::arm_mcrr2);
7601       break;
7602     }
7603 
7604     // MCRR{2} instruction has 5 operands but
7605     // the intrinsic has 4 because Rt and Rt2
7606     // are represented as a single unsigned 64
7607     // bit integer in the intrinsic definition
7608     // but internally it's represented as 2 32
7609     // bit integers.
7610 
7611     Value *Coproc = EmitScalarExpr(E->getArg(0));
7612     Value *Opc1 = EmitScalarExpr(E->getArg(1));
7613     Value *RtAndRt2 = EmitScalarExpr(E->getArg(2));
7614     Value *CRm = EmitScalarExpr(E->getArg(3));
7615 
7616     Value *C1 = llvm::ConstantInt::get(Int64Ty, 32);
7617     Value *Rt = Builder.CreateTruncOrBitCast(RtAndRt2, Int32Ty);
7618     Value *Rt2 = Builder.CreateLShr(RtAndRt2, C1);
7619     Rt2 = Builder.CreateTruncOrBitCast(Rt2, Int32Ty);
7620 
7621     return Builder.CreateCall(F, {Coproc, Opc1, Rt, Rt2, CRm});
7622   }
7623 
7624   if (BuiltinID == ARM::BI__builtin_arm_mrrc ||
7625       BuiltinID == ARM::BI__builtin_arm_mrrc2) {
7626     Function *F;
7627 
7628     switch (BuiltinID) {
7629     default: llvm_unreachable("unexpected builtin");
7630     case ARM::BI__builtin_arm_mrrc:
7631       F = CGM.getIntrinsic(Intrinsic::arm_mrrc);
7632       break;
7633     case ARM::BI__builtin_arm_mrrc2:
7634       F = CGM.getIntrinsic(Intrinsic::arm_mrrc2);
7635       break;
7636     }
7637 
7638     Value *Coproc = EmitScalarExpr(E->getArg(0));
7639     Value *Opc1 = EmitScalarExpr(E->getArg(1));
7640     Value *CRm  = EmitScalarExpr(E->getArg(2));
7641     Value *RtAndRt2 = Builder.CreateCall(F, {Coproc, Opc1, CRm});
7642 
7643     // Returns an unsigned 64 bit integer, represented
7644     // as two 32 bit integers.
7645 
7646     Value *Rt = Builder.CreateExtractValue(RtAndRt2, 1);
7647     Value *Rt1 = Builder.CreateExtractValue(RtAndRt2, 0);
7648     Rt = Builder.CreateZExt(Rt, Int64Ty);
7649     Rt1 = Builder.CreateZExt(Rt1, Int64Ty);
7650 
7651     Value *ShiftCast = llvm::ConstantInt::get(Int64Ty, 32);
7652     RtAndRt2 = Builder.CreateShl(Rt, ShiftCast, "shl", true);
7653     RtAndRt2 = Builder.CreateOr(RtAndRt2, Rt1);
7654 
7655     return Builder.CreateBitCast(RtAndRt2, ConvertType(E->getType()));
7656   }
7657 
7658   if (BuiltinID == ARM::BI__builtin_arm_ldrexd ||
7659       ((BuiltinID == ARM::BI__builtin_arm_ldrex ||
7660         BuiltinID == ARM::BI__builtin_arm_ldaex) &&
7661        getContext().getTypeSize(E->getType()) == 64) ||
7662       BuiltinID == ARM::BI__ldrexd) {
7663     Function *F;
7664 
7665     switch (BuiltinID) {
7666     default: llvm_unreachable("unexpected builtin");
7667     case ARM::BI__builtin_arm_ldaex:
7668       F = CGM.getIntrinsic(Intrinsic::arm_ldaexd);
7669       break;
7670     case ARM::BI__builtin_arm_ldrexd:
7671     case ARM::BI__builtin_arm_ldrex:
7672     case ARM::BI__ldrexd:
7673       F = CGM.getIntrinsic(Intrinsic::arm_ldrexd);
7674       break;
7675     }
7676 
7677     Value *LdPtr = EmitScalarExpr(E->getArg(0));
7678     Value *Val = Builder.CreateCall(F, Builder.CreateBitCast(LdPtr, Int8PtrTy),
7679                                     "ldrexd");
7680 
7681     Value *Val0 = Builder.CreateExtractValue(Val, 1);
7682     Value *Val1 = Builder.CreateExtractValue(Val, 0);
7683     Val0 = Builder.CreateZExt(Val0, Int64Ty);
7684     Val1 = Builder.CreateZExt(Val1, Int64Ty);
7685 
7686     Value *ShiftCst = llvm::ConstantInt::get(Int64Ty, 32);
7687     Val = Builder.CreateShl(Val0, ShiftCst, "shl", true /* nuw */);
7688     Val = Builder.CreateOr(Val, Val1);
7689     return Builder.CreateBitCast(Val, ConvertType(E->getType()));
7690   }
7691 
7692   if (BuiltinID == ARM::BI__builtin_arm_ldrex ||
7693       BuiltinID == ARM::BI__builtin_arm_ldaex) {
7694     Value *LoadAddr = EmitScalarExpr(E->getArg(0));
7695 
7696     QualType Ty = E->getType();
7697     llvm::Type *RealResTy = ConvertType(Ty);
7698     llvm::Type *PtrTy = llvm::IntegerType::get(
7699         getLLVMContext(), getContext().getTypeSize(Ty))->getPointerTo();
7700     LoadAddr = Builder.CreateBitCast(LoadAddr, PtrTy);
7701 
7702     Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI__builtin_arm_ldaex
7703                                        ? Intrinsic::arm_ldaex
7704                                        : Intrinsic::arm_ldrex,
7705                                    PtrTy);
7706     Value *Val = Builder.CreateCall(F, LoadAddr, "ldrex");
7707 
7708     if (RealResTy->isPointerTy())
7709       return Builder.CreateIntToPtr(Val, RealResTy);
7710     else {
7711       llvm::Type *IntResTy = llvm::IntegerType::get(
7712           getLLVMContext(), CGM.getDataLayout().getTypeSizeInBits(RealResTy));
7713       Val = Builder.CreateTruncOrBitCast(Val, IntResTy);
7714       return Builder.CreateBitCast(Val, RealResTy);
7715     }
7716   }
7717 
7718   if (BuiltinID == ARM::BI__builtin_arm_strexd ||
7719       ((BuiltinID == ARM::BI__builtin_arm_stlex ||
7720         BuiltinID == ARM::BI__builtin_arm_strex) &&
7721        getContext().getTypeSize(E->getArg(0)->getType()) == 64)) {
7722     Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI__builtin_arm_stlex
7723                                        ? Intrinsic::arm_stlexd
7724                                        : Intrinsic::arm_strexd);
7725     llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty);
7726 
7727     Address Tmp = CreateMemTemp(E->getArg(0)->getType());
7728     Value *Val = EmitScalarExpr(E->getArg(0));
7729     Builder.CreateStore(Val, Tmp);
7730 
7731     Address LdPtr = Builder.CreateBitCast(Tmp,llvm::PointerType::getUnqual(STy));
7732     Val = Builder.CreateLoad(LdPtr);
7733 
7734     Value *Arg0 = Builder.CreateExtractValue(Val, 0);
7735     Value *Arg1 = Builder.CreateExtractValue(Val, 1);
7736     Value *StPtr = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)), Int8PtrTy);
7737     return Builder.CreateCall(F, {Arg0, Arg1, StPtr}, "strexd");
7738   }
7739 
7740   if (BuiltinID == ARM::BI__builtin_arm_strex ||
7741       BuiltinID == ARM::BI__builtin_arm_stlex) {
7742     Value *StoreVal = EmitScalarExpr(E->getArg(0));
7743     Value *StoreAddr = EmitScalarExpr(E->getArg(1));
7744 
7745     QualType Ty = E->getArg(0)->getType();
7746     llvm::Type *StoreTy = llvm::IntegerType::get(getLLVMContext(),
7747                                                  getContext().getTypeSize(Ty));
7748     StoreAddr = Builder.CreateBitCast(StoreAddr, StoreTy->getPointerTo());
7749 
7750     if (StoreVal->getType()->isPointerTy())
7751       StoreVal = Builder.CreatePtrToInt(StoreVal, Int32Ty);
7752     else {
7753       llvm::Type *IntTy = llvm::IntegerType::get(
7754           getLLVMContext(),
7755           CGM.getDataLayout().getTypeSizeInBits(StoreVal->getType()));
7756       StoreVal = Builder.CreateBitCast(StoreVal, IntTy);
7757       StoreVal = Builder.CreateZExtOrBitCast(StoreVal, Int32Ty);
7758     }
7759 
7760     Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI__builtin_arm_stlex
7761                                        ? Intrinsic::arm_stlex
7762                                        : Intrinsic::arm_strex,
7763                                    StoreAddr->getType());
7764     return Builder.CreateCall(F, {StoreVal, StoreAddr}, "strex");
7765   }
7766 
7767   if (BuiltinID == ARM::BI__builtin_arm_clrex) {
7768     Function *F = CGM.getIntrinsic(Intrinsic::arm_clrex);
7769     return Builder.CreateCall(F);
7770   }
7771 
7772   // CRC32
7773   Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic;
7774   switch (BuiltinID) {
7775   case ARM::BI__builtin_arm_crc32b:
7776     CRCIntrinsicID = Intrinsic::arm_crc32b; break;
7777   case ARM::BI__builtin_arm_crc32cb:
7778     CRCIntrinsicID = Intrinsic::arm_crc32cb; break;
7779   case ARM::BI__builtin_arm_crc32h:
7780     CRCIntrinsicID = Intrinsic::arm_crc32h; break;
7781   case ARM::BI__builtin_arm_crc32ch:
7782     CRCIntrinsicID = Intrinsic::arm_crc32ch; break;
7783   case ARM::BI__builtin_arm_crc32w:
7784   case ARM::BI__builtin_arm_crc32d:
7785     CRCIntrinsicID = Intrinsic::arm_crc32w; break;
7786   case ARM::BI__builtin_arm_crc32cw:
7787   case ARM::BI__builtin_arm_crc32cd:
7788     CRCIntrinsicID = Intrinsic::arm_crc32cw; break;
7789   }
7790 
7791   if (CRCIntrinsicID != Intrinsic::not_intrinsic) {
7792     Value *Arg0 = EmitScalarExpr(E->getArg(0));
7793     Value *Arg1 = EmitScalarExpr(E->getArg(1));
7794 
    // crc32{c,}d intrinsics are implemented as two calls to crc32{c,}w
7796     // intrinsics, hence we need different codegen for these cases.
7797     if (BuiltinID == ARM::BI__builtin_arm_crc32d ||
7798         BuiltinID == ARM::BI__builtin_arm_crc32cd) {
7799       Value *C1 = llvm::ConstantInt::get(Int64Ty, 32);
7800       Value *Arg1a = Builder.CreateTruncOrBitCast(Arg1, Int32Ty);
7801       Value *Arg1b = Builder.CreateLShr(Arg1, C1);
7802       Arg1b = Builder.CreateTruncOrBitCast(Arg1b, Int32Ty);
7803 
7804       Function *F = CGM.getIntrinsic(CRCIntrinsicID);
7805       Value *Res = Builder.CreateCall(F, {Arg0, Arg1a});
7806       return Builder.CreateCall(F, {Res, Arg1b});
7807     } else {
7808       Arg1 = Builder.CreateZExtOrBitCast(Arg1, Int32Ty);
7809 
7810       Function *F = CGM.getIntrinsic(CRCIntrinsicID);
7811       return Builder.CreateCall(F, {Arg0, Arg1});
7812     }
7813   }
7814 
7815   if (BuiltinID == ARM::BI__builtin_arm_rsr ||
7816       BuiltinID == ARM::BI__builtin_arm_rsr64 ||
7817       BuiltinID == ARM::BI__builtin_arm_rsrp ||
7818       BuiltinID == ARM::BI__builtin_arm_wsr ||
7819       BuiltinID == ARM::BI__builtin_arm_wsr64 ||
7820       BuiltinID == ARM::BI__builtin_arm_wsrp) {
7821 
7822     SpecialRegisterAccessKind AccessKind = Write;
7823     if (BuiltinID == ARM::BI__builtin_arm_rsr ||
7824         BuiltinID == ARM::BI__builtin_arm_rsr64 ||
7825         BuiltinID == ARM::BI__builtin_arm_rsrp)
7826       AccessKind = VolatileRead;
7827 
7828     bool IsPointerBuiltin = BuiltinID == ARM::BI__builtin_arm_rsrp ||
7829                             BuiltinID == ARM::BI__builtin_arm_wsrp;
7830 
7831     bool Is64Bit = BuiltinID == ARM::BI__builtin_arm_rsr64 ||
7832                    BuiltinID == ARM::BI__builtin_arm_wsr64;
7833 
7834     llvm::Type *ValueType;
7835     llvm::Type *RegisterType;
7836     if (IsPointerBuiltin) {
7837       ValueType = VoidPtrTy;
7838       RegisterType = Int32Ty;
7839     } else if (Is64Bit) {
7840       ValueType = RegisterType = Int64Ty;
7841     } else {
7842       ValueType = RegisterType = Int32Ty;
7843     }
7844 
7845     return EmitSpecialRegisterBuiltin(*this, E, RegisterType, ValueType,
7846                                       AccessKind);
7847   }
7848 
7849   // Handle MSVC intrinsics before argument evaluation to prevent double
7850   // evaluation.
7851   if (Optional<MSVCIntrin> MsvcIntId = translateArmToMsvcIntrin(BuiltinID))
7852     return EmitMSVCBuiltinExpr(*MsvcIntId, E);
7853 
7854   // Deal with MVE builtins
7855   if (Value *Result = EmitARMMVEBuiltinExpr(BuiltinID, E, ReturnValue, Arch))
7856     return Result;
7857   // Handle CDE builtins
7858   if (Value *Result = EmitARMCDEBuiltinExpr(BuiltinID, E, ReturnValue, Arch))
7859     return Result;
7860 
7861   // Find out if any arguments are required to be integer constant
7862   // expressions.
7863   unsigned ICEArguments = 0;
7864   ASTContext::GetBuiltinTypeError Error;
7865   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
7866   assert(Error == ASTContext::GE_None && "Should not codegen an error");
7867 
7868   auto getAlignmentValue32 = [&](Address addr) -> Value* {
7869     return Builder.getInt32(addr.getAlignment().getQuantity());
7870   };
7871 
7872   Address PtrOp0 = Address::invalid();
7873   Address PtrOp1 = Address::invalid();
7874   SmallVector<Value*, 4> Ops;
7875   bool HasExtraArg = HasExtraNeonArgument(BuiltinID);
7876   unsigned NumArgs = E->getNumArgs() - (HasExtraArg ? 1 : 0);
7877   for (unsigned i = 0, e = NumArgs; i != e; i++) {
7878     if (i == 0) {
7879       switch (BuiltinID) {
7880       case NEON::BI__builtin_neon_vld1_v:
7881       case NEON::BI__builtin_neon_vld1q_v:
7882       case NEON::BI__builtin_neon_vld1q_lane_v:
7883       case NEON::BI__builtin_neon_vld1_lane_v:
7884       case NEON::BI__builtin_neon_vld1_dup_v:
7885       case NEON::BI__builtin_neon_vld1q_dup_v:
7886       case NEON::BI__builtin_neon_vst1_v:
7887       case NEON::BI__builtin_neon_vst1q_v:
7888       case NEON::BI__builtin_neon_vst1q_lane_v:
7889       case NEON::BI__builtin_neon_vst1_lane_v:
7890       case NEON::BI__builtin_neon_vst2_v:
7891       case NEON::BI__builtin_neon_vst2q_v:
7892       case NEON::BI__builtin_neon_vst2_lane_v:
7893       case NEON::BI__builtin_neon_vst2q_lane_v:
7894       case NEON::BI__builtin_neon_vst3_v:
7895       case NEON::BI__builtin_neon_vst3q_v:
7896       case NEON::BI__builtin_neon_vst3_lane_v:
7897       case NEON::BI__builtin_neon_vst3q_lane_v:
7898       case NEON::BI__builtin_neon_vst4_v:
7899       case NEON::BI__builtin_neon_vst4q_v:
7900       case NEON::BI__builtin_neon_vst4_lane_v:
7901       case NEON::BI__builtin_neon_vst4q_lane_v:
7902         // Get the alignment for the argument in addition to the value;
7903         // we'll use it later.
7904         PtrOp0 = EmitPointerWithAlignment(E->getArg(0));
7905         Ops.push_back(PtrOp0.getPointer());
7906         continue;
7907       }
7908     }
7909     if (i == 1) {
7910       switch (BuiltinID) {
7911       case NEON::BI__builtin_neon_vld2_v:
7912       case NEON::BI__builtin_neon_vld2q_v:
7913       case NEON::BI__builtin_neon_vld3_v:
7914       case NEON::BI__builtin_neon_vld3q_v:
7915       case NEON::BI__builtin_neon_vld4_v:
7916       case NEON::BI__builtin_neon_vld4q_v:
7917       case NEON::BI__builtin_neon_vld2_lane_v:
7918       case NEON::BI__builtin_neon_vld2q_lane_v:
7919       case NEON::BI__builtin_neon_vld3_lane_v:
7920       case NEON::BI__builtin_neon_vld3q_lane_v:
7921       case NEON::BI__builtin_neon_vld4_lane_v:
7922       case NEON::BI__builtin_neon_vld4q_lane_v:
7923       case NEON::BI__builtin_neon_vld2_dup_v:
7924       case NEON::BI__builtin_neon_vld2q_dup_v:
7925       case NEON::BI__builtin_neon_vld3_dup_v:
7926       case NEON::BI__builtin_neon_vld3q_dup_v:
7927       case NEON::BI__builtin_neon_vld4_dup_v:
7928       case NEON::BI__builtin_neon_vld4q_dup_v:
7929         // Get the alignment for the argument in addition to the value;
7930         // we'll use it later.
7931         PtrOp1 = EmitPointerWithAlignment(E->getArg(1));
7932         Ops.push_back(PtrOp1.getPointer());
7933         continue;
7934       }
7935     }
7936 
7937     if ((ICEArguments & (1 << i)) == 0) {
7938       Ops.push_back(EmitScalarExpr(E->getArg(i)));
7939     } else {
7940       // If this is required to be a constant, constant fold it so that we know
7941       // that the generated intrinsic gets a ConstantInt.
7942       Ops.push_back(llvm::ConstantInt::get(
7943           getLLVMContext(),
7944           *E->getArg(i)->getIntegerConstantExpr(getContext())));
7945     }
7946   }
7947 
7948   switch (BuiltinID) {
7949   default: break;
7950 
7951   case NEON::BI__builtin_neon_vget_lane_i8:
7952   case NEON::BI__builtin_neon_vget_lane_i16:
7953   case NEON::BI__builtin_neon_vget_lane_i32:
7954   case NEON::BI__builtin_neon_vget_lane_i64:
7955   case NEON::BI__builtin_neon_vget_lane_bf16:
7956   case NEON::BI__builtin_neon_vget_lane_f32:
7957   case NEON::BI__builtin_neon_vgetq_lane_i8:
7958   case NEON::BI__builtin_neon_vgetq_lane_i16:
7959   case NEON::BI__builtin_neon_vgetq_lane_i32:
7960   case NEON::BI__builtin_neon_vgetq_lane_i64:
7961   case NEON::BI__builtin_neon_vgetq_lane_bf16:
7962   case NEON::BI__builtin_neon_vgetq_lane_f32:
7963   case NEON::BI__builtin_neon_vduph_lane_bf16:
7964   case NEON::BI__builtin_neon_vduph_laneq_bf16:
7965     return Builder.CreateExtractElement(Ops[0], Ops[1], "vget_lane");
7966 
7967   case NEON::BI__builtin_neon_vrndns_f32: {
7968     Value *Arg = EmitScalarExpr(E->getArg(0));
7969     llvm::Type *Tys[] = {Arg->getType()};
7970     Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vrintn, Tys);
7971     return Builder.CreateCall(F, {Arg}, "vrndn"); }
7972 
7973   case NEON::BI__builtin_neon_vset_lane_i8:
7974   case NEON::BI__builtin_neon_vset_lane_i16:
7975   case NEON::BI__builtin_neon_vset_lane_i32:
7976   case NEON::BI__builtin_neon_vset_lane_i64:
7977   case NEON::BI__builtin_neon_vset_lane_bf16:
7978   case NEON::BI__builtin_neon_vset_lane_f32:
7979   case NEON::BI__builtin_neon_vsetq_lane_i8:
7980   case NEON::BI__builtin_neon_vsetq_lane_i16:
7981   case NEON::BI__builtin_neon_vsetq_lane_i32:
7982   case NEON::BI__builtin_neon_vsetq_lane_i64:
7983   case NEON::BI__builtin_neon_vsetq_lane_bf16:
7984   case NEON::BI__builtin_neon_vsetq_lane_f32:
7985     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
7986 
7987   case NEON::BI__builtin_neon_vsha1h_u32:
7988     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1h), Ops,
7989                         "vsha1h");
7990   case NEON::BI__builtin_neon_vsha1cq_u32:
7991     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1c), Ops,
7992                         "vsha1h");
7993   case NEON::BI__builtin_neon_vsha1pq_u32:
7994     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1p), Ops,
7995                         "vsha1h");
7996   case NEON::BI__builtin_neon_vsha1mq_u32:
7997     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1m), Ops,
7998                         "vsha1h");
7999 
8000   case NEON::BI__builtin_neon_vcvth_bf16_f32: {
8001     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vcvtbfp2bf), Ops,
8002                         "vcvtbfp2bf");
8003   }
8004 
8005   // The ARM _MoveToCoprocessor builtins put the input register value as
8006   // the first argument, but the LLVM intrinsic expects it as the third one.
8007   case ARM::BI_MoveToCoprocessor:
8008   case ARM::BI_MoveToCoprocessor2: {
8009     Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI_MoveToCoprocessor ?
8010                                    Intrinsic::arm_mcr : Intrinsic::arm_mcr2);
8011     return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0],
8012                                   Ops[3], Ops[4], Ops[5]});
8013   }
8014   }
8015 
8016   // Get the last argument, which specifies the vector type.
8017   assert(HasExtraArg);
8018   const Expr *Arg = E->getArg(E->getNumArgs()-1);
8019   Optional<llvm::APSInt> Result = Arg->getIntegerConstantExpr(getContext());
8020   if (!Result)
8021     return nullptr;
8022 
8023   if (BuiltinID == ARM::BI__builtin_arm_vcvtr_f ||
8024       BuiltinID == ARM::BI__builtin_arm_vcvtr_d) {
8025     // Determine the overloaded type of this builtin.
8026     llvm::Type *Ty;
8027     if (BuiltinID == ARM::BI__builtin_arm_vcvtr_f)
8028       Ty = FloatTy;
8029     else
8030       Ty = DoubleTy;
8031 
8032     // Determine whether this is an unsigned conversion or not.
8033     bool usgn = Result->getZExtValue() == 1;
8034     unsigned Int = usgn ? Intrinsic::arm_vcvtru : Intrinsic::arm_vcvtr;
8035 
8036     // Call the appropriate intrinsic.
8037     Function *F = CGM.getIntrinsic(Int, Ty);
8038     return Builder.CreateCall(F, Ops, "vcvtr");
8039   }
8040 
8041   // Determine the type of this overloaded NEON intrinsic.
8042   NeonTypeFlags Type = Result->getZExtValue();
8043   bool usgn = Type.isUnsigned();
8044   bool rightShift = false;
8045 
8046   llvm::FixedVectorType *VTy =
8047       GetNeonType(this, Type, getTarget().hasLegalHalfType(), false,
8048                   getTarget().hasBFloat16Type());
8049   llvm::Type *Ty = VTy;
8050   if (!Ty)
8051     return nullptr;
8052 
8053   // Many NEON builtins have identical semantics and uses in ARM and
8054   // AArch64. Emit these in a single function.
8055   auto IntrinsicMap = makeArrayRef(ARMSIMDIntrinsicMap);
8056   const ARMVectorIntrinsicInfo *Builtin = findARMVectorIntrinsicInMap(
8057       IntrinsicMap, BuiltinID, NEONSIMDIntrinsicsProvenSorted);
8058   if (Builtin)
8059     return EmitCommonNeonBuiltinExpr(
8060         Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
8061         Builtin->NameHint, Builtin->TypeModifier, E, Ops, PtrOp0, PtrOp1, Arch);
8062 
8063   unsigned Int;
8064   switch (BuiltinID) {
8065   default: return nullptr;
8066   case NEON::BI__builtin_neon_vld1q_lane_v:
8067     // Handle 64-bit integer elements as a special case.  Use shuffles of
8068     // one-element vectors to avoid poor code for i64 in the backend.
8069     if (VTy->getElementType()->isIntegerTy(64)) {
8070       // Extract the other lane.
8071       Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8072       int Lane = cast<ConstantInt>(Ops[2])->getZExtValue();
8073       Value *SV = llvm::ConstantVector::get(ConstantInt::get(Int32Ty, 1-Lane));
8074       Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV);
8075       // Load the value as a one-element vector.
8076       Ty = llvm::FixedVectorType::get(VTy->getElementType(), 1);
8077       llvm::Type *Tys[] = {Ty, Int8PtrTy};
8078       Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld1, Tys);
8079       Value *Align = getAlignmentValue32(PtrOp0);
8080       Value *Ld = Builder.CreateCall(F, {Ops[0], Align});
8081       // Combine them.
8082       int Indices[] = {1 - Lane, Lane};
8083       return Builder.CreateShuffleVector(Ops[1], Ld, Indices, "vld1q_lane");
8084     }
8085     LLVM_FALLTHROUGH;
8086   case NEON::BI__builtin_neon_vld1_lane_v: {
8087     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8088     PtrOp0 = Builder.CreateElementBitCast(PtrOp0, VTy->getElementType());
8089     Value *Ld = Builder.CreateLoad(PtrOp0);
8090     return Builder.CreateInsertElement(Ops[1], Ld, Ops[2], "vld1_lane");
8091   }
8092   case NEON::BI__builtin_neon_vqrshrn_n_v:
8093     Int =
8094       usgn ? Intrinsic::arm_neon_vqrshiftnu : Intrinsic::arm_neon_vqrshiftns;
8095     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n",
8096                         1, true);
8097   case NEON::BI__builtin_neon_vqrshrun_n_v:
8098     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqrshiftnsu, Ty),
8099                         Ops, "vqrshrun_n", 1, true);
8100   case NEON::BI__builtin_neon_vqshrn_n_v:
8101     Int = usgn ? Intrinsic::arm_neon_vqshiftnu : Intrinsic::arm_neon_vqshiftns;
8102     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n",
8103                         1, true);
8104   case NEON::BI__builtin_neon_vqshrun_n_v:
8105     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqshiftnsu, Ty),
8106                         Ops, "vqshrun_n", 1, true);
8107   case NEON::BI__builtin_neon_vrecpe_v:
8108   case NEON::BI__builtin_neon_vrecpeq_v:
8109     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrecpe, Ty),
8110                         Ops, "vrecpe");
8111   case NEON::BI__builtin_neon_vrshrn_n_v:
8112     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrshiftn, Ty),
8113                         Ops, "vrshrn_n", 1, true);
8114   case NEON::BI__builtin_neon_vrsra_n_v:
8115   case NEON::BI__builtin_neon_vrsraq_n_v:
8116     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
8117     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8118     Ops[2] = EmitNeonShiftVector(Ops[2], Ty, true);
8119     Int = usgn ? Intrinsic::arm_neon_vrshiftu : Intrinsic::arm_neon_vrshifts;
8120     Ops[1] = Builder.CreateCall(CGM.getIntrinsic(Int, Ty), {Ops[1], Ops[2]});
8121     return Builder.CreateAdd(Ops[0], Ops[1], "vrsra_n");
8122   case NEON::BI__builtin_neon_vsri_n_v:
8123   case NEON::BI__builtin_neon_vsriq_n_v:
8124     rightShift = true;
8125     LLVM_FALLTHROUGH;
8126   case NEON::BI__builtin_neon_vsli_n_v:
8127   case NEON::BI__builtin_neon_vsliq_n_v:
8128     Ops[2] = EmitNeonShiftVector(Ops[2], Ty, rightShift);
8129     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vshiftins, Ty),
8130                         Ops, "vsli_n");
8131   case NEON::BI__builtin_neon_vsra_n_v:
8132   case NEON::BI__builtin_neon_vsraq_n_v:
8133     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
8134     Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n");
8135     return Builder.CreateAdd(Ops[0], Ops[1]);
8136   case NEON::BI__builtin_neon_vst1q_lane_v:
8137     // Handle 64-bit integer elements as a special case.  Use a shuffle to get
8138     // a one-element vector and avoid poor code for i64 in the backend.
8139     if (VTy->getElementType()->isIntegerTy(64)) {
8140       Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8141       Value *SV = llvm::ConstantVector::get(cast<llvm::Constant>(Ops[2]));
8142       Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV);
8143       Ops[2] = getAlignmentValue32(PtrOp0);
8144       llvm::Type *Tys[] = {Int8PtrTy, Ops[1]->getType()};
8145       return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst1,
8146                                                  Tys), Ops);
8147     }
8148     LLVM_FALLTHROUGH;
8149   case NEON::BI__builtin_neon_vst1_lane_v: {
8150     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8151     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
8152     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
8153     auto St = Builder.CreateStore(Ops[1], Builder.CreateBitCast(PtrOp0, Ty));
8154     return St;
8155   }
8156   case NEON::BI__builtin_neon_vtbl1_v:
8157     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl1),
8158                         Ops, "vtbl1");
8159   case NEON::BI__builtin_neon_vtbl2_v:
8160     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl2),
8161                         Ops, "vtbl2");
8162   case NEON::BI__builtin_neon_vtbl3_v:
8163     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl3),
8164                         Ops, "vtbl3");
8165   case NEON::BI__builtin_neon_vtbl4_v:
8166     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl4),
8167                         Ops, "vtbl4");
8168   case NEON::BI__builtin_neon_vtbx1_v:
8169     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx1),
8170                         Ops, "vtbx1");
8171   case NEON::BI__builtin_neon_vtbx2_v:
8172     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx2),
8173                         Ops, "vtbx2");
8174   case NEON::BI__builtin_neon_vtbx3_v:
8175     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx3),
8176                         Ops, "vtbx3");
8177   case NEON::BI__builtin_neon_vtbx4_v:
8178     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx4),
8179                         Ops, "vtbx4");
8180   }
8181 }
8182 
8183 template<typename Integer>
8184 static Integer GetIntegerConstantValue(const Expr *E, ASTContext &Context) {
8185   return E->getIntegerConstantExpr(Context)->getExtValue();
8186 }
8187 
8188 static llvm::Value *SignOrZeroExtend(CGBuilderTy &Builder, llvm::Value *V,
8189                                      llvm::Type *T, bool Unsigned) {
8190   // Helper function called by Tablegen-constructed ARM MVE builtin codegen,
8191   // which finds it convenient to specify signed/unsigned as a boolean flag.
8192   return Unsigned ? Builder.CreateZExt(V, T) : Builder.CreateSExt(V, T);
8193 }
8194 
8195 static llvm::Value *MVEImmediateShr(CGBuilderTy &Builder, llvm::Value *V,
8196                                     uint32_t Shift, bool Unsigned) {
8197   // MVE helper function for integer shift right. This must handle signed vs
8198   // unsigned, and also deal specially with the case where the shift count is
8199   // equal to the lane size. In LLVM IR, an LShr with that parameter would be
8200   // undefined behavior, but in MVE it's legal, so we must convert it to code
8201   // that is not undefined in IR.
8202   unsigned LaneBits = cast<llvm::VectorType>(V->getType())
8203                           ->getElementType()
8204                           ->getPrimitiveSizeInBits();
8205   if (Shift == LaneBits) {
8206     // An unsigned shift of the full lane size always generates zero, so we can
8207     // simply emit a zero vector. A signed shift of the full lane size does the
8208     // same thing as shifting by one bit fewer.
8209     if (Unsigned)
8210       return llvm::Constant::getNullValue(V->getType());
8211     else
8212       --Shift;
8213   }
8214   return Unsigned ? Builder.CreateLShr(V, Shift) : Builder.CreateAShr(V, Shift);
8215 }
8216 
8217 static llvm::Value *ARMMVEVectorSplat(CGBuilderTy &Builder, llvm::Value *V) {
8218   // MVE-specific helper function for a vector splat, which infers the element
8219   // count of the output vector by knowing that MVE vectors are all 128 bits
8220   // wide.
8221   unsigned Elements = 128 / V->getType()->getPrimitiveSizeInBits();
8222   return Builder.CreateVectorSplat(Elements, V);
8223 }
8224 
8225 static llvm::Value *ARMMVEVectorReinterpret(CGBuilderTy &Builder,
8226                                             CodeGenFunction *CGF,
8227                                             llvm::Value *V,
8228                                             llvm::Type *DestType) {
8229   // Convert one MVE vector type into another by reinterpreting its in-register
8230   // format.
8231   //
8232   // Little-endian, this is identical to a bitcast (which reinterprets the
8233   // memory format). But big-endian, they're not necessarily the same, because
8234   // the register and memory formats map to each other differently depending on
8235   // the lane size.
8236   //
8237   // We generate a bitcast whenever we can (if we're little-endian, or if the
8238   // lane sizes are the same anyway). Otherwise we fall back to an IR intrinsic
8239   // that performs the different kind of reinterpretation.
8240   if (CGF->getTarget().isBigEndian() &&
8241       V->getType()->getScalarSizeInBits() != DestType->getScalarSizeInBits()) {
8242     return Builder.CreateCall(
8243         CGF->CGM.getIntrinsic(Intrinsic::arm_mve_vreinterpretq,
8244                               {DestType, V->getType()}),
8245         V);
8246   } else {
8247     return Builder.CreateBitCast(V, DestType);
8248   }
8249 }
8250 
8251 static llvm::Value *VectorUnzip(CGBuilderTy &Builder, llvm::Value *V, bool Odd) {
8252   // Make a shufflevector that extracts every other element of a vector (evens
8253   // or odds, as desired).
8254   SmallVector<int, 16> Indices;
8255   unsigned InputElements =
8256       cast<llvm::FixedVectorType>(V->getType())->getNumElements();
8257   for (unsigned i = 0; i < InputElements; i += 2)
8258     Indices.push_back(i + Odd);
8259   return Builder.CreateShuffleVector(V, Indices);
8260 }
8261 
8262 static llvm::Value *VectorZip(CGBuilderTy &Builder, llvm::Value *V0,
8263                               llvm::Value *V1) {
8264   // Make a shufflevector that interleaves two vectors element by element.
8265   assert(V0->getType() == V1->getType() && "Can't zip different vector types");
8266   SmallVector<int, 16> Indices;
8267   unsigned InputElements =
8268       cast<llvm::FixedVectorType>(V0->getType())->getNumElements();
8269   for (unsigned i = 0; i < InputElements; i++) {
8270     Indices.push_back(i);
8271     Indices.push_back(i + InputElements);
8272   }
8273   return Builder.CreateShuffleVector(V0, V1, Indices);
8274 }
8275 
8276 template<unsigned HighBit, unsigned OtherBits>
8277 static llvm::Value *ARMMVEConstantSplat(CGBuilderTy &Builder, llvm::Type *VT) {
8278   // MVE-specific helper function to make a vector splat of a constant such as
8279   // UINT_MAX or INT_MIN, in which all bits below the highest one are equal.
8280   llvm::Type *T = cast<llvm::VectorType>(VT)->getElementType();
8281   unsigned LaneBits = T->getPrimitiveSizeInBits();
8282   uint32_t Value = HighBit << (LaneBits - 1);
8283   if (OtherBits)
8284     Value |= (1UL << (LaneBits - 1)) - 1;
8285   llvm::Value *Lane = llvm::ConstantInt::get(T, Value);
8286   return ARMMVEVectorSplat(Builder, Lane);
8287 }
8288 
8289 static llvm::Value *ARMMVEVectorElementReverse(CGBuilderTy &Builder,
8290                                                llvm::Value *V,
8291                                                unsigned ReverseWidth) {
8292   // MVE-specific helper function which reverses the elements of a
8293   // vector within every (ReverseWidth)-bit collection of lanes.
8294   SmallVector<int, 16> Indices;
8295   unsigned LaneSize = V->getType()->getScalarSizeInBits();
8296   unsigned Elements = 128 / LaneSize;
8297   unsigned Mask = ReverseWidth / LaneSize - 1;
8298   for (unsigned i = 0; i < Elements; i++)
8299     Indices.push_back(i ^ Mask);
8300   return Builder.CreateShuffleVector(V, Indices);
8301 }
8302 
8303 Value *CodeGenFunction::EmitARMMVEBuiltinExpr(unsigned BuiltinID,
8304                                               const CallExpr *E,
8305                                               ReturnValueSlot ReturnValue,
8306                                               llvm::Triple::ArchType Arch) {
8307   enum class CustomCodeGen { VLD24, VST24 } CustomCodeGenType;
8308   Intrinsic::ID IRIntr;
8309   unsigned NumVectors;
8310 
8311   // Code autogenerated by Tablegen will handle all the simple builtins.
8312   switch (BuiltinID) {
8313     #include "clang/Basic/arm_mve_builtin_cg.inc"
8314 
8315     // If we didn't match an MVE builtin id at all, go back to the
8316     // main EmitARMBuiltinExpr.
8317   default:
8318     return nullptr;
8319   }
8320 
8321   // Anything that breaks from that switch is an MVE builtin that
8322   // needs handwritten code to generate.
8323 
8324   switch (CustomCodeGenType) {
8325 
8326   case CustomCodeGen::VLD24: {
8327     llvm::SmallVector<Value *, 4> Ops;
8328     llvm::SmallVector<llvm::Type *, 4> Tys;
8329 
8330     auto MvecCType = E->getType();
8331     auto MvecLType = ConvertType(MvecCType);
8332     assert(MvecLType->isStructTy() &&
8333            "Return type for vld[24]q should be a struct");
8334     assert(MvecLType->getStructNumElements() == 1 &&
8335            "Return-type struct for vld[24]q should have one element");
8336     auto MvecLTypeInner = MvecLType->getStructElementType(0);
8337     assert(MvecLTypeInner->isArrayTy() &&
8338            "Return-type struct for vld[24]q should contain an array");
8339     assert(MvecLTypeInner->getArrayNumElements() == NumVectors &&
8340            "Array member of return-type struct vld[24]q has wrong length");
8341     auto VecLType = MvecLTypeInner->getArrayElementType();
8342 
8343     Tys.push_back(VecLType);
8344 
8345     auto Addr = E->getArg(0);
8346     Ops.push_back(EmitScalarExpr(Addr));
8347     Tys.push_back(ConvertType(Addr->getType()));
8348 
8349     Function *F = CGM.getIntrinsic(IRIntr, makeArrayRef(Tys));
8350     Value *LoadResult = Builder.CreateCall(F, Ops);
8351     Value *MvecOut = UndefValue::get(MvecLType);
8352     for (unsigned i = 0; i < NumVectors; ++i) {
8353       Value *Vec = Builder.CreateExtractValue(LoadResult, i);
8354       MvecOut = Builder.CreateInsertValue(MvecOut, Vec, {0, i});
8355     }
8356 
8357     if (ReturnValue.isNull())
8358       return MvecOut;
8359     else
8360       return Builder.CreateStore(MvecOut, ReturnValue.getValue());
8361   }
8362 
8363   case CustomCodeGen::VST24: {
8364     llvm::SmallVector<Value *, 4> Ops;
8365     llvm::SmallVector<llvm::Type *, 4> Tys;
8366 
8367     auto Addr = E->getArg(0);
8368     Ops.push_back(EmitScalarExpr(Addr));
8369     Tys.push_back(ConvertType(Addr->getType()));
8370 
8371     auto MvecCType = E->getArg(1)->getType();
8372     auto MvecLType = ConvertType(MvecCType);
8373     assert(MvecLType->isStructTy() && "Data type for vst2q should be a struct");
8374     assert(MvecLType->getStructNumElements() == 1 &&
8375            "Data-type struct for vst2q should have one element");
8376     auto MvecLTypeInner = MvecLType->getStructElementType(0);
8377     assert(MvecLTypeInner->isArrayTy() &&
8378            "Data-type struct for vst2q should contain an array");
8379     assert(MvecLTypeInner->getArrayNumElements() == NumVectors &&
8380            "Array member of return-type struct vld[24]q has wrong length");
8381     auto VecLType = MvecLTypeInner->getArrayElementType();
8382 
8383     Tys.push_back(VecLType);
8384 
8385     AggValueSlot MvecSlot = CreateAggTemp(MvecCType);
8386     EmitAggExpr(E->getArg(1), MvecSlot);
8387     auto Mvec = Builder.CreateLoad(MvecSlot.getAddress());
8388     for (unsigned i = 0; i < NumVectors; i++)
8389       Ops.push_back(Builder.CreateExtractValue(Mvec, {0, i}));
8390 
8391     Function *F = CGM.getIntrinsic(IRIntr, makeArrayRef(Tys));
8392     Value *ToReturn = nullptr;
8393     for (unsigned i = 0; i < NumVectors; i++) {
8394       Ops.push_back(llvm::ConstantInt::get(Int32Ty, i));
8395       ToReturn = Builder.CreateCall(F, Ops);
8396       Ops.pop_back();
8397     }
8398     return ToReturn;
8399   }
8400   }
8401   llvm_unreachable("unknown custom codegen type.");
8402 }
8403 
/// Emit IR for a call to an ARM CDE builtin.
///
/// All lowering is done by the cases pulled in from the generated
/// arm_cde_builtin_cg.inc; this function contains no handwritten codegen.
/// Returns nullptr when \p BuiltinID matches none of those cases.
Value *CodeGenFunction::EmitARMCDEBuiltinExpr(unsigned BuiltinID,
                                              const CallExpr *E,
                                              ReturnValueSlot ReturnValue,
                                              llvm::Triple::ArchType Arch) {
  switch (BuiltinID) {
  default:
    // Not a builtin handled by the generated cases below.
    return nullptr;
  // NOTE(review): the generated cases are presumably expected to return on
  // their own, since nothing follows the switch -- confirm against the .inc.
#include "clang/Basic/arm_cde_builtin_cg.inc"
  }
}
8414 
8415 static Value *EmitAArch64TblBuiltinExpr(CodeGenFunction &CGF, unsigned BuiltinID,
8416                                       const CallExpr *E,
8417                                       SmallVectorImpl<Value *> &Ops,
8418                                       llvm::Triple::ArchType Arch) {
8419   unsigned int Int = 0;
8420   const char *s = nullptr;
8421 
8422   switch (BuiltinID) {
8423   default:
8424     return nullptr;
8425   case NEON::BI__builtin_neon_vtbl1_v:
8426   case NEON::BI__builtin_neon_vqtbl1_v:
8427   case NEON::BI__builtin_neon_vqtbl1q_v:
8428   case NEON::BI__builtin_neon_vtbl2_v:
8429   case NEON::BI__builtin_neon_vqtbl2_v:
8430   case NEON::BI__builtin_neon_vqtbl2q_v:
8431   case NEON::BI__builtin_neon_vtbl3_v:
8432   case NEON::BI__builtin_neon_vqtbl3_v:
8433   case NEON::BI__builtin_neon_vqtbl3q_v:
8434   case NEON::BI__builtin_neon_vtbl4_v:
8435   case NEON::BI__builtin_neon_vqtbl4_v:
8436   case NEON::BI__builtin_neon_vqtbl4q_v:
8437     break;
8438   case NEON::BI__builtin_neon_vtbx1_v:
8439   case NEON::BI__builtin_neon_vqtbx1_v:
8440   case NEON::BI__builtin_neon_vqtbx1q_v:
8441   case NEON::BI__builtin_neon_vtbx2_v:
8442   case NEON::BI__builtin_neon_vqtbx2_v:
8443   case NEON::BI__builtin_neon_vqtbx2q_v:
8444   case NEON::BI__builtin_neon_vtbx3_v:
8445   case NEON::BI__builtin_neon_vqtbx3_v:
8446   case NEON::BI__builtin_neon_vqtbx3q_v:
8447   case NEON::BI__builtin_neon_vtbx4_v:
8448   case NEON::BI__builtin_neon_vqtbx4_v:
8449   case NEON::BI__builtin_neon_vqtbx4q_v:
8450     break;
8451   }
8452 
8453   assert(E->getNumArgs() >= 3);
8454 
8455   // Get the last argument, which specifies the vector type.
8456   const Expr *Arg = E->getArg(E->getNumArgs() - 1);
8457   Optional<llvm::APSInt> Result = Arg->getIntegerConstantExpr(CGF.getContext());
8458   if (!Result)
8459     return nullptr;
8460 
8461   // Determine the type of this overloaded NEON intrinsic.
8462   NeonTypeFlags Type = Result->getZExtValue();
8463   llvm::FixedVectorType *Ty = GetNeonType(&CGF, Type);
8464   if (!Ty)
8465     return nullptr;
8466 
8467   CodeGen::CGBuilderTy &Builder = CGF.Builder;
8468 
8469   // AArch64 scalar builtins are not overloaded, they do not have an extra
8470   // argument that specifies the vector type, need to handle each case.
8471   switch (BuiltinID) {
8472   case NEON::BI__builtin_neon_vtbl1_v: {
8473     return packTBLDVectorList(CGF, makeArrayRef(Ops).slice(0, 1), nullptr,
8474                               Ops[1], Ty, Intrinsic::aarch64_neon_tbl1,
8475                               "vtbl1");
8476   }
8477   case NEON::BI__builtin_neon_vtbl2_v: {
8478     return packTBLDVectorList(CGF, makeArrayRef(Ops).slice(0, 2), nullptr,
8479                               Ops[2], Ty, Intrinsic::aarch64_neon_tbl1,
8480                               "vtbl1");
8481   }
8482   case NEON::BI__builtin_neon_vtbl3_v: {
8483     return packTBLDVectorList(CGF, makeArrayRef(Ops).slice(0, 3), nullptr,
8484                               Ops[3], Ty, Intrinsic::aarch64_neon_tbl2,
8485                               "vtbl2");
8486   }
8487   case NEON::BI__builtin_neon_vtbl4_v: {
8488     return packTBLDVectorList(CGF, makeArrayRef(Ops).slice(0, 4), nullptr,
8489                               Ops[4], Ty, Intrinsic::aarch64_neon_tbl2,
8490                               "vtbl2");
8491   }
8492   case NEON::BI__builtin_neon_vtbx1_v: {
8493     Value *TblRes =
8494         packTBLDVectorList(CGF, makeArrayRef(Ops).slice(1, 1), nullptr, Ops[2],
8495                            Ty, Intrinsic::aarch64_neon_tbl1, "vtbl1");
8496 
8497     llvm::Constant *EightV = ConstantInt::get(Ty, 8);
8498     Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[2], EightV);
8499     CmpRes = Builder.CreateSExt(CmpRes, Ty);
8500 
8501     Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]);
8502     Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes);
8503     return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx");
8504   }
8505   case NEON::BI__builtin_neon_vtbx2_v: {
8506     return packTBLDVectorList(CGF, makeArrayRef(Ops).slice(1, 2), Ops[0],
8507                               Ops[3], Ty, Intrinsic::aarch64_neon_tbx1,
8508                               "vtbx1");
8509   }
8510   case NEON::BI__builtin_neon_vtbx3_v: {
8511     Value *TblRes =
8512         packTBLDVectorList(CGF, makeArrayRef(Ops).slice(1, 3), nullptr, Ops[4],
8513                            Ty, Intrinsic::aarch64_neon_tbl2, "vtbl2");
8514 
8515     llvm::Constant *TwentyFourV = ConstantInt::get(Ty, 24);
8516     Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[4],
8517                                            TwentyFourV);
8518     CmpRes = Builder.CreateSExt(CmpRes, Ty);
8519 
8520     Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]);
8521     Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes);
8522     return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx");
8523   }
8524   case NEON::BI__builtin_neon_vtbx4_v: {
8525     return packTBLDVectorList(CGF, makeArrayRef(Ops).slice(1, 4), Ops[0],
8526                               Ops[5], Ty, Intrinsic::aarch64_neon_tbx2,
8527                               "vtbx2");
8528   }
8529   case NEON::BI__builtin_neon_vqtbl1_v:
8530   case NEON::BI__builtin_neon_vqtbl1q_v:
8531     Int = Intrinsic::aarch64_neon_tbl1; s = "vtbl1"; break;
8532   case NEON::BI__builtin_neon_vqtbl2_v:
8533   case NEON::BI__builtin_neon_vqtbl2q_v: {
8534     Int = Intrinsic::aarch64_neon_tbl2; s = "vtbl2"; break;
8535   case NEON::BI__builtin_neon_vqtbl3_v:
8536   case NEON::BI__builtin_neon_vqtbl3q_v:
8537     Int = Intrinsic::aarch64_neon_tbl3; s = "vtbl3"; break;
8538   case NEON::BI__builtin_neon_vqtbl4_v:
8539   case NEON::BI__builtin_neon_vqtbl4q_v:
8540     Int = Intrinsic::aarch64_neon_tbl4; s = "vtbl4"; break;
8541   case NEON::BI__builtin_neon_vqtbx1_v:
8542   case NEON::BI__builtin_neon_vqtbx1q_v:
8543     Int = Intrinsic::aarch64_neon_tbx1; s = "vtbx1"; break;
8544   case NEON::BI__builtin_neon_vqtbx2_v:
8545   case NEON::BI__builtin_neon_vqtbx2q_v:
8546     Int = Intrinsic::aarch64_neon_tbx2; s = "vtbx2"; break;
8547   case NEON::BI__builtin_neon_vqtbx3_v:
8548   case NEON::BI__builtin_neon_vqtbx3q_v:
8549     Int = Intrinsic::aarch64_neon_tbx3; s = "vtbx3"; break;
8550   case NEON::BI__builtin_neon_vqtbx4_v:
8551   case NEON::BI__builtin_neon_vqtbx4q_v:
8552     Int = Intrinsic::aarch64_neon_tbx4; s = "vtbx4"; break;
8553   }
8554   }
8555 
8556   if (!Int)
8557     return nullptr;
8558 
8559   Function *F = CGF.CGM.getIntrinsic(Int, Ty);
8560   return CGF.EmitNeonCall(F, Ops, s);
8561 }
8562 
8563 Value *CodeGenFunction::vectorWrapScalar16(Value *Op) {
8564   auto *VTy = llvm::FixedVectorType::get(Int16Ty, 4);
8565   Op = Builder.CreateBitCast(Op, Int16Ty);
8566   Value *V = UndefValue::get(VTy);
8567   llvm::Constant *CI = ConstantInt::get(SizeTy, 0);
8568   Op = Builder.CreateInsertElement(V, Op, CI);
8569   return Op;
8570 }
8571 
8572 /// SVEBuiltinMemEltTy - Returns the memory element type for this memory
8573 /// access builtin.  Only required if it can't be inferred from the base pointer
8574 /// operand.
8575 llvm::Type *CodeGenFunction::SVEBuiltinMemEltTy(const SVETypeFlags &TypeFlags) {
8576   switch (TypeFlags.getMemEltType()) {
8577   case SVETypeFlags::MemEltTyDefault:
8578     return getEltType(TypeFlags);
8579   case SVETypeFlags::MemEltTyInt8:
8580     return Builder.getInt8Ty();
8581   case SVETypeFlags::MemEltTyInt16:
8582     return Builder.getInt16Ty();
8583   case SVETypeFlags::MemEltTyInt32:
8584     return Builder.getInt32Ty();
8585   case SVETypeFlags::MemEltTyInt64:
8586     return Builder.getInt64Ty();
8587   }
8588   llvm_unreachable("Unknown MemEltType");
8589 }
8590 
8591 llvm::Type *CodeGenFunction::getEltType(const SVETypeFlags &TypeFlags) {
8592   switch (TypeFlags.getEltType()) {
8593   default:
8594     llvm_unreachable("Invalid SVETypeFlag!");
8595 
8596   case SVETypeFlags::EltTyInt8:
8597     return Builder.getInt8Ty();
8598   case SVETypeFlags::EltTyInt16:
8599     return Builder.getInt16Ty();
8600   case SVETypeFlags::EltTyInt32:
8601     return Builder.getInt32Ty();
8602   case SVETypeFlags::EltTyInt64:
8603     return Builder.getInt64Ty();
8604 
8605   case SVETypeFlags::EltTyFloat16:
8606     return Builder.getHalfTy();
8607   case SVETypeFlags::EltTyFloat32:
8608     return Builder.getFloatTy();
8609   case SVETypeFlags::EltTyFloat64:
8610     return Builder.getDoubleTy();
8611 
8612   case SVETypeFlags::EltTyBFloat16:
8613     return Builder.getBFloatTy();
8614 
8615   case SVETypeFlags::EltTyBool8:
8616   case SVETypeFlags::EltTyBool16:
8617   case SVETypeFlags::EltTyBool32:
8618   case SVETypeFlags::EltTyBool64:
8619     return Builder.getInt1Ty();
8620   }
8621 }
8622 
8623 // Return the llvm predicate vector type corresponding to the specified element
8624 // TypeFlags.
8625 llvm::ScalableVectorType *
8626 CodeGenFunction::getSVEPredType(const SVETypeFlags &TypeFlags) {
8627   switch (TypeFlags.getEltType()) {
8628   default: llvm_unreachable("Unhandled SVETypeFlag!");
8629 
8630   case SVETypeFlags::EltTyInt8:
8631     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
8632   case SVETypeFlags::EltTyInt16:
8633     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
8634   case SVETypeFlags::EltTyInt32:
8635     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
8636   case SVETypeFlags::EltTyInt64:
8637     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);
8638 
8639   case SVETypeFlags::EltTyBFloat16:
8640     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
8641   case SVETypeFlags::EltTyFloat16:
8642     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
8643   case SVETypeFlags::EltTyFloat32:
8644     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
8645   case SVETypeFlags::EltTyFloat64:
8646     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);
8647 
8648   case SVETypeFlags::EltTyBool8:
8649     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
8650   case SVETypeFlags::EltTyBool16:
8651     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
8652   case SVETypeFlags::EltTyBool32:
8653     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
8654   case SVETypeFlags::EltTyBool64:
8655     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);
8656   }
8657 }
8658 
8659 // Return the llvm vector type corresponding to the specified element TypeFlags.
8660 llvm::ScalableVectorType *
8661 CodeGenFunction::getSVEType(const SVETypeFlags &TypeFlags) {
8662   switch (TypeFlags.getEltType()) {
8663   default:
8664     llvm_unreachable("Invalid SVETypeFlag!");
8665 
8666   case SVETypeFlags::EltTyInt8:
8667     return llvm::ScalableVectorType::get(Builder.getInt8Ty(), 16);
8668   case SVETypeFlags::EltTyInt16:
8669     return llvm::ScalableVectorType::get(Builder.getInt16Ty(), 8);
8670   case SVETypeFlags::EltTyInt32:
8671     return llvm::ScalableVectorType::get(Builder.getInt32Ty(), 4);
8672   case SVETypeFlags::EltTyInt64:
8673     return llvm::ScalableVectorType::get(Builder.getInt64Ty(), 2);
8674 
8675   case SVETypeFlags::EltTyFloat16:
8676     return llvm::ScalableVectorType::get(Builder.getHalfTy(), 8);
8677   case SVETypeFlags::EltTyBFloat16:
8678     return llvm::ScalableVectorType::get(Builder.getBFloatTy(), 8);
8679   case SVETypeFlags::EltTyFloat32:
8680     return llvm::ScalableVectorType::get(Builder.getFloatTy(), 4);
8681   case SVETypeFlags::EltTyFloat64:
8682     return llvm::ScalableVectorType::get(Builder.getDoubleTy(), 2);
8683 
8684   case SVETypeFlags::EltTyBool8:
8685     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
8686   case SVETypeFlags::EltTyBool16:
8687     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
8688   case SVETypeFlags::EltTyBool32:
8689     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
8690   case SVETypeFlags::EltTyBool64:
8691     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);
8692   }
8693 }
8694 
8695 llvm::Value *
8696 CodeGenFunction::EmitSVEAllTruePred(const SVETypeFlags &TypeFlags) {
8697   Function *Ptrue =
8698       CGM.getIntrinsic(Intrinsic::aarch64_sve_ptrue, getSVEPredType(TypeFlags));
8699   return Builder.CreateCall(Ptrue, {Builder.getInt32(/*SV_ALL*/ 31)});
8700 }
8701 
8702 constexpr unsigned SVEBitsPerBlock = 128;
8703 
8704 static llvm::ScalableVectorType *getSVEVectorForElementType(llvm::Type *EltTy) {
8705   unsigned NumElts = SVEBitsPerBlock / EltTy->getScalarSizeInBits();
8706   return llvm::ScalableVectorType::get(EltTy, NumElts);
8707 }
8708 
8709 // Reinterpret the input predicate so that it can be used to correctly isolate
8710 // the elements of the specified datatype.
8711 Value *CodeGenFunction::EmitSVEPredicateCast(Value *Pred,
8712                                              llvm::ScalableVectorType *VTy) {
8713   auto *RTy = llvm::VectorType::get(IntegerType::get(getLLVMContext(), 1), VTy);
8714   if (Pred->getType() == RTy)
8715     return Pred;
8716 
8717   unsigned IntID;
8718   llvm::Type *IntrinsicTy;
8719   switch (VTy->getMinNumElements()) {
8720   default:
8721     llvm_unreachable("unsupported element count!");
8722   case 2:
8723   case 4:
8724   case 8:
8725     IntID = Intrinsic::aarch64_sve_convert_from_svbool;
8726     IntrinsicTy = RTy;
8727     break;
8728   case 16:
8729     IntID = Intrinsic::aarch64_sve_convert_to_svbool;
8730     IntrinsicTy = Pred->getType();
8731     break;
8732   }
8733 
8734   Function *F = CGM.getIntrinsic(IntID, IntrinsicTy);
8735   Value *C = Builder.CreateCall(F, Pred);
8736   assert(C->getType() == RTy && "Unexpected return type!");
8737   return C;
8738 }
8739 
8740 Value *CodeGenFunction::EmitSVEGatherLoad(const SVETypeFlags &TypeFlags,
8741                                           SmallVectorImpl<Value *> &Ops,
8742                                           unsigned IntID) {
8743   auto *ResultTy = getSVEType(TypeFlags);
8744   auto *OverloadedTy =
8745       llvm::ScalableVectorType::get(SVEBuiltinMemEltTy(TypeFlags), ResultTy);
8746 
8747   // At the ACLE level there's only one predicate type, svbool_t, which is
8748   // mapped to <n x 16 x i1>. However, this might be incompatible with the
8749   // actual type being loaded. For example, when loading doubles (i64) the
8750   // predicated should be <n x 2 x i1> instead. At the IR level the type of
8751   // the predicate and the data being loaded must match. Cast accordingly.
8752   Ops[0] = EmitSVEPredicateCast(Ops[0], OverloadedTy);
8753 
8754   Function *F = nullptr;
8755   if (Ops[1]->getType()->isVectorTy())
8756     // This is the "vector base, scalar offset" case. In order to uniquely
8757     // map this built-in to an LLVM IR intrinsic, we need both the return type
8758     // and the type of the vector base.
8759     F = CGM.getIntrinsic(IntID, {OverloadedTy, Ops[1]->getType()});
8760   else
8761     // This is the "scalar base, vector offset case". The type of the offset
8762     // is encoded in the name of the intrinsic. We only need to specify the
8763     // return type in order to uniquely map this built-in to an LLVM IR
8764     // intrinsic.
8765     F = CGM.getIntrinsic(IntID, OverloadedTy);
8766 
8767   // Pass 0 when the offset is missing. This can only be applied when using
8768   // the "vector base" addressing mode for which ACLE allows no offset. The
8769   // corresponding LLVM IR always requires an offset.
8770   if (Ops.size() == 2) {
8771     assert(Ops[1]->getType()->isVectorTy() && "Scalar base requires an offset");
8772     Ops.push_back(ConstantInt::get(Int64Ty, 0));
8773   }
8774 
8775   // For "vector base, scalar index" scale the index so that it becomes a
8776   // scalar offset.
8777   if (!TypeFlags.isByteIndexed() && Ops[1]->getType()->isVectorTy()) {
8778     unsigned BytesPerElt =
8779         OverloadedTy->getElementType()->getScalarSizeInBits() / 8;
8780     Value *Scale = ConstantInt::get(Int64Ty, BytesPerElt);
8781     Ops[2] = Builder.CreateMul(Ops[2], Scale);
8782   }
8783 
8784   Value *Call = Builder.CreateCall(F, Ops);
8785 
8786   // The following sext/zext is only needed when ResultTy != OverloadedTy. In
8787   // other cases it's folded into a nop.
8788   return TypeFlags.isZExtReturn() ? Builder.CreateZExt(Call, ResultTy)
8789                                   : Builder.CreateSExt(Call, ResultTy);
8790 }
8791 
8792 Value *CodeGenFunction::EmitSVEScatterStore(const SVETypeFlags &TypeFlags,
8793                                             SmallVectorImpl<Value *> &Ops,
8794                                             unsigned IntID) {
8795   auto *SrcDataTy = getSVEType(TypeFlags);
8796   auto *OverloadedTy =
8797       llvm::ScalableVectorType::get(SVEBuiltinMemEltTy(TypeFlags), SrcDataTy);
8798 
8799   // In ACLE the source data is passed in the last argument, whereas in LLVM IR
8800   // it's the first argument. Move it accordingly.
8801   Ops.insert(Ops.begin(), Ops.pop_back_val());
8802 
8803   Function *F = nullptr;
8804   if (Ops[2]->getType()->isVectorTy())
8805     // This is the "vector base, scalar offset" case. In order to uniquely
8806     // map this built-in to an LLVM IR intrinsic, we need both the return type
8807     // and the type of the vector base.
8808     F = CGM.getIntrinsic(IntID, {OverloadedTy, Ops[2]->getType()});
8809   else
8810     // This is the "scalar base, vector offset case". The type of the offset
8811     // is encoded in the name of the intrinsic. We only need to specify the
8812     // return type in order to uniquely map this built-in to an LLVM IR
8813     // intrinsic.
8814     F = CGM.getIntrinsic(IntID, OverloadedTy);
8815 
8816   // Pass 0 when the offset is missing. This can only be applied when using
8817   // the "vector base" addressing mode for which ACLE allows no offset. The
8818   // corresponding LLVM IR always requires an offset.
8819   if (Ops.size() == 3) {
8820     assert(Ops[1]->getType()->isVectorTy() && "Scalar base requires an offset");
8821     Ops.push_back(ConstantInt::get(Int64Ty, 0));
8822   }
8823 
8824   // Truncation is needed when SrcDataTy != OverloadedTy. In other cases it's
8825   // folded into a nop.
8826   Ops[0] = Builder.CreateTrunc(Ops[0], OverloadedTy);
8827 
8828   // At the ACLE level there's only one predicate type, svbool_t, which is
8829   // mapped to <n x 16 x i1>. However, this might be incompatible with the
8830   // actual type being stored. For example, when storing doubles (i64) the
8831   // predicated should be <n x 2 x i1> instead. At the IR level the type of
8832   // the predicate and the data being stored must match. Cast accordingly.
8833   Ops[1] = EmitSVEPredicateCast(Ops[1], OverloadedTy);
8834 
8835   // For "vector base, scalar index" scale the index so that it becomes a
8836   // scalar offset.
8837   if (!TypeFlags.isByteIndexed() && Ops[2]->getType()->isVectorTy()) {
8838     unsigned BytesPerElt =
8839         OverloadedTy->getElementType()->getScalarSizeInBits() / 8;
8840     Value *Scale = ConstantInt::get(Int64Ty, BytesPerElt);
8841     Ops[3] = Builder.CreateMul(Ops[3], Scale);
8842   }
8843 
8844   return Builder.CreateCall(F, Ops);
8845 }
8846 
8847 Value *CodeGenFunction::EmitSVEGatherPrefetch(const SVETypeFlags &TypeFlags,
8848                                               SmallVectorImpl<Value *> &Ops,
8849                                               unsigned IntID) {
8850   // The gather prefetches are overloaded on the vector input - this can either
8851   // be the vector of base addresses or vector of offsets.
8852   auto *OverloadedTy = dyn_cast<llvm::ScalableVectorType>(Ops[1]->getType());
8853   if (!OverloadedTy)
8854     OverloadedTy = cast<llvm::ScalableVectorType>(Ops[2]->getType());
8855 
8856   // Cast the predicate from svbool_t to the right number of elements.
8857   Ops[0] = EmitSVEPredicateCast(Ops[0], OverloadedTy);
8858 
8859   // vector + imm addressing modes
8860   if (Ops[1]->getType()->isVectorTy()) {
8861     if (Ops.size() == 3) {
8862       // Pass 0 for 'vector+imm' when the index is omitted.
8863       Ops.push_back(ConstantInt::get(Int64Ty, 0));
8864 
8865       // The sv_prfop is the last operand in the builtin and IR intrinsic.
8866       std::swap(Ops[2], Ops[3]);
8867     } else {
8868       // Index needs to be passed as scaled offset.
8869       llvm::Type *MemEltTy = SVEBuiltinMemEltTy(TypeFlags);
8870       unsigned BytesPerElt = MemEltTy->getPrimitiveSizeInBits() / 8;
8871       Value *Scale = ConstantInt::get(Int64Ty, BytesPerElt);
8872       Ops[2] = Builder.CreateMul(Ops[2], Scale);
8873     }
8874   }
8875 
8876   Function *F = CGM.getIntrinsic(IntID, OverloadedTy);
8877   return Builder.CreateCall(F, Ops);
8878 }
8879 
8880 Value *CodeGenFunction::EmitSVEStructLoad(const SVETypeFlags &TypeFlags,
8881                                           SmallVectorImpl<Value*> &Ops,
8882                                           unsigned IntID) {
8883   llvm::ScalableVectorType *VTy = getSVEType(TypeFlags);
8884   auto VecPtrTy = llvm::PointerType::getUnqual(VTy);
8885   auto EltPtrTy = llvm::PointerType::getUnqual(VTy->getElementType());
8886 
8887   unsigned N;
8888   switch (IntID) {
8889   case Intrinsic::aarch64_sve_ld2:
8890     N = 2;
8891     break;
8892   case Intrinsic::aarch64_sve_ld3:
8893     N = 3;
8894     break;
8895   case Intrinsic::aarch64_sve_ld4:
8896     N = 4;
8897     break;
8898   default:
8899     llvm_unreachable("unknown intrinsic!");
8900   }
8901   auto RetTy = llvm::VectorType::get(VTy->getElementType(),
8902                                      VTy->getElementCount() * N);
8903 
8904 	Value *Predicate = EmitSVEPredicateCast(Ops[0], VTy);
8905   Value *BasePtr= Builder.CreateBitCast(Ops[1], VecPtrTy);
8906   Value *Offset = Ops.size() > 2 ? Ops[2] : Builder.getInt32(0);
8907   BasePtr = Builder.CreateGEP(VTy, BasePtr, Offset);
8908   BasePtr = Builder.CreateBitCast(BasePtr, EltPtrTy);
8909 
8910   Function *F = CGM.getIntrinsic(IntID, {RetTy, Predicate->getType()});
8911   return Builder.CreateCall(F, { Predicate, BasePtr });
8912 }
8913 
8914 Value *CodeGenFunction::EmitSVEStructStore(const SVETypeFlags &TypeFlags,
8915                                            SmallVectorImpl<Value*> &Ops,
8916                                            unsigned IntID) {
8917   llvm::ScalableVectorType *VTy = getSVEType(TypeFlags);
8918   auto VecPtrTy = llvm::PointerType::getUnqual(VTy);
8919   auto EltPtrTy = llvm::PointerType::getUnqual(VTy->getElementType());
8920 
8921   unsigned N;
8922   switch (IntID) {
8923   case Intrinsic::aarch64_sve_st2:
8924     N = 2;
8925     break;
8926   case Intrinsic::aarch64_sve_st3:
8927     N = 3;
8928     break;
8929   case Intrinsic::aarch64_sve_st4:
8930     N = 4;
8931     break;
8932   default:
8933     llvm_unreachable("unknown intrinsic!");
8934   }
8935   auto TupleTy =
8936       llvm::VectorType::get(VTy->getElementType(), VTy->getElementCount() * N);
8937 
8938   Value *Predicate = EmitSVEPredicateCast(Ops[0], VTy);
8939   Value *BasePtr = Builder.CreateBitCast(Ops[1], VecPtrTy);
8940   Value *Offset = Ops.size() > 3 ? Ops[2] : Builder.getInt32(0);
8941   Value *Val = Ops.back();
8942   BasePtr = Builder.CreateGEP(VTy, BasePtr, Offset);
8943   BasePtr = Builder.CreateBitCast(BasePtr, EltPtrTy);
8944 
8945   // The llvm.aarch64.sve.st2/3/4 intrinsics take legal part vectors, so we
8946   // need to break up the tuple vector.
8947   SmallVector<llvm::Value*, 5> Operands;
8948   Function *FExtr =
8949       CGM.getIntrinsic(Intrinsic::aarch64_sve_tuple_get, {VTy, TupleTy});
8950   for (unsigned I = 0; I < N; ++I)
8951     Operands.push_back(Builder.CreateCall(FExtr, {Val, Builder.getInt32(I)}));
8952   Operands.append({Predicate, BasePtr});
8953 
8954   Function *F = CGM.getIntrinsic(IntID, { VTy });
8955   return Builder.CreateCall(F, Operands);
8956 }
8957 
8958 // SVE2's svpmullb and svpmullt builtins are similar to the svpmullb_pair and
8959 // svpmullt_pair intrinsics, with the exception that their results are bitcast
8960 // to a wider type.
8961 Value *CodeGenFunction::EmitSVEPMull(const SVETypeFlags &TypeFlags,
8962                                      SmallVectorImpl<Value *> &Ops,
8963                                      unsigned BuiltinID) {
8964   // Splat scalar operand to vector (intrinsics with _n infix)
8965   if (TypeFlags.hasSplatOperand()) {
8966     unsigned OpNo = TypeFlags.getSplatOperand();
8967     Ops[OpNo] = EmitSVEDupX(Ops[OpNo]);
8968   }
8969 
8970   // The pair-wise function has a narrower overloaded type.
8971   Function *F = CGM.getIntrinsic(BuiltinID, Ops[0]->getType());
8972   Value *Call = Builder.CreateCall(F, {Ops[0], Ops[1]});
8973 
8974   // Now bitcast to the wider result type.
8975   llvm::ScalableVectorType *Ty = getSVEType(TypeFlags);
8976   return EmitSVEReinterpret(Call, Ty);
8977 }
8978 
8979 Value *CodeGenFunction::EmitSVEMovl(const SVETypeFlags &TypeFlags,
8980                                     ArrayRef<Value *> Ops, unsigned BuiltinID) {
8981   llvm::Type *OverloadedTy = getSVEType(TypeFlags);
8982   Function *F = CGM.getIntrinsic(BuiltinID, OverloadedTy);
8983   return Builder.CreateCall(F, {Ops[0], Builder.getInt32(0)});
8984 }
8985 
8986 Value *CodeGenFunction::EmitSVEPrefetchLoad(const SVETypeFlags &TypeFlags,
8987                                             SmallVectorImpl<Value *> &Ops,
8988                                             unsigned BuiltinID) {
8989   auto *MemEltTy = SVEBuiltinMemEltTy(TypeFlags);
8990   auto *VectorTy = getSVEVectorForElementType(MemEltTy);
8991   auto *MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
8992 
8993   Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
8994   Value *BasePtr = Ops[1];
8995 
8996   // Implement the index operand if not omitted.
8997   if (Ops.size() > 3) {
8998     BasePtr = Builder.CreateBitCast(BasePtr, MemoryTy->getPointerTo());
8999     BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);
9000   }
9001 
9002   // Prefetch intriniscs always expect an i8*
9003   BasePtr = Builder.CreateBitCast(BasePtr, llvm::PointerType::getUnqual(Int8Ty));
9004   Value *PrfOp = Ops.back();
9005 
9006   Function *F = CGM.getIntrinsic(BuiltinID, Predicate->getType());
9007   return Builder.CreateCall(F, {Predicate, BasePtr, PrfOp});
9008 }
9009 
9010 Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E,
9011                                           llvm::Type *ReturnTy,
9012                                           SmallVectorImpl<Value *> &Ops,
9013                                           unsigned BuiltinID,
9014                                           bool IsZExtReturn) {
9015   QualType LangPTy = E->getArg(1)->getType();
9016   llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
9017       LangPTy->castAs<PointerType>()->getPointeeType());
9018 
9019   // The vector type that is returned may be different from the
9020   // eventual type loaded from memory.
9021   auto VectorTy = cast<llvm::ScalableVectorType>(ReturnTy);
9022   auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
9023 
9024   Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
9025   Value *BasePtr = Builder.CreateBitCast(Ops[1], MemoryTy->getPointerTo());
9026   Value *Offset = Ops.size() > 2 ? Ops[2] : Builder.getInt32(0);
9027   BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Offset);
9028 
9029   BasePtr = Builder.CreateBitCast(BasePtr, MemEltTy->getPointerTo());
9030   Function *F = CGM.getIntrinsic(BuiltinID, MemoryTy);
9031   Value *Load = Builder.CreateCall(F, {Predicate, BasePtr});
9032 
9033   return IsZExtReturn ? Builder.CreateZExt(Load, VectorTy)
9034                      : Builder.CreateSExt(Load, VectorTy);
9035 }
9036 
9037 Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E,
9038                                            SmallVectorImpl<Value *> &Ops,
9039                                            unsigned BuiltinID) {
9040   QualType LangPTy = E->getArg(1)->getType();
9041   llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
9042       LangPTy->castAs<PointerType>()->getPointeeType());
9043 
9044   // The vector type that is stored may be different from the
9045   // eventual type stored to memory.
9046   auto VectorTy = cast<llvm::ScalableVectorType>(Ops.back()->getType());
9047   auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
9048 
9049   Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
9050   Value *BasePtr = Builder.CreateBitCast(Ops[1], MemoryTy->getPointerTo());
9051   Value *Offset = Ops.size() == 4 ? Ops[2] : Builder.getInt32(0);
9052   BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Offset);
9053 
9054   // Last value is always the data
9055   llvm::Value *Val = Builder.CreateTrunc(Ops.back(), MemoryTy);
9056 
9057   BasePtr = Builder.CreateBitCast(BasePtr, MemEltTy->getPointerTo());
9058   Function *F = CGM.getIntrinsic(BuiltinID, MemoryTy);
9059   return Builder.CreateCall(F, {Val, Predicate, BasePtr});
9060 }
9061 
9062 // Limit the usage of scalable llvm IR generated by the ACLE by using the
9063 // sve dup.x intrinsic instead of IRBuilder::CreateVectorSplat.
9064 Value *CodeGenFunction::EmitSVEDupX(Value *Scalar, llvm::Type *Ty) {
9065   auto F = CGM.getIntrinsic(Intrinsic::aarch64_sve_dup_x, Ty);
9066   return Builder.CreateCall(F, Scalar);
9067 }
9068 
9069 Value *CodeGenFunction::EmitSVEDupX(Value* Scalar) {
9070   return EmitSVEDupX(Scalar, getSVEVectorForElementType(Scalar->getType()));
9071 }
9072 
9073 Value *CodeGenFunction::EmitSVEReinterpret(Value *Val, llvm::Type *Ty) {
9074   // FIXME: For big endian this needs an additional REV, or needs a separate
9075   // intrinsic that is code-generated as a no-op, because the LLVM bitcast
9076   // instruction is defined as 'bitwise' equivalent from memory point of
9077   // view (when storing/reloading), whereas the svreinterpret builtin
9078   // implements bitwise equivalent cast from register point of view.
9079   // LLVM CodeGen for a bitcast must add an explicit REV for big-endian.
9080   return Builder.CreateBitCast(Val, Ty);
9081 }
9082 
9083 static void InsertExplicitZeroOperand(CGBuilderTy &Builder, llvm::Type *Ty,
9084                                       SmallVectorImpl<Value *> &Ops) {
9085   auto *SplatZero = Constant::getNullValue(Ty);
9086   Ops.insert(Ops.begin(), SplatZero);
9087 }
9088 
9089 static void InsertExplicitUndefOperand(CGBuilderTy &Builder, llvm::Type *Ty,
9090                                        SmallVectorImpl<Value *> &Ops) {
9091   auto *SplatUndef = UndefValue::get(Ty);
9092   Ops.insert(Ops.begin(), SplatUndef);
9093 }
9094 
9095 SmallVector<llvm::Type *, 2>
9096 CodeGenFunction::getSVEOverloadTypes(const SVETypeFlags &TypeFlags,
9097                                      llvm::Type *ResultType,
9098                                      ArrayRef<Value *> Ops) {
9099   if (TypeFlags.isOverloadNone())
9100     return {};
9101 
9102   llvm::Type *DefaultType = getSVEType(TypeFlags);
9103 
9104   if (TypeFlags.isOverloadWhile())
9105     return {DefaultType, Ops[1]->getType()};
9106 
9107   if (TypeFlags.isOverloadWhileRW())
9108     return {getSVEPredType(TypeFlags), Ops[0]->getType()};
9109 
9110   if (TypeFlags.isOverloadCvt() || TypeFlags.isTupleSet())
9111     return {Ops[0]->getType(), Ops.back()->getType()};
9112 
9113   if (TypeFlags.isTupleCreate() || TypeFlags.isTupleGet())
9114     return {ResultType, Ops[0]->getType()};
9115 
9116   assert(TypeFlags.isOverloadDefault() && "Unexpected value for overloads");
9117   return {DefaultType};
9118 }
9119 
9120 Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
9121                                                   const CallExpr *E) {
9122   // Find out if any arguments are required to be integer constant expressions.
9123   unsigned ICEArguments = 0;
9124   ASTContext::GetBuiltinTypeError Error;
9125   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
9126   assert(Error == ASTContext::GE_None && "Should not codegen an error");
9127 
9128   llvm::Type *Ty = ConvertType(E->getType());
9129   if (BuiltinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 &&
9130       BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64) {
9131     Value *Val = EmitScalarExpr(E->getArg(0));
9132     return EmitSVEReinterpret(Val, Ty);
9133   }
9134 
9135   llvm::SmallVector<Value *, 4> Ops;
9136   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
9137     if ((ICEArguments & (1 << i)) == 0)
9138       Ops.push_back(EmitScalarExpr(E->getArg(i)));
9139     else {
9140       // If this is required to be a constant, constant fold it so that we know
9141       // that the generated intrinsic gets a ConstantInt.
9142       Optional<llvm::APSInt> Result =
9143           E->getArg(i)->getIntegerConstantExpr(getContext());
9144       assert(Result && "Expected argument to be a constant");
9145 
9146       // Immediates for SVE llvm intrinsics are always 32bit.  We can safely
9147       // truncate because the immediate has been range checked and no valid
9148       // immediate requires more than a handful of bits.
9149       *Result = Result->extOrTrunc(32);
9150       Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), *Result));
9151     }
9152   }
9153 
9154   auto *Builtin = findARMVectorIntrinsicInMap(AArch64SVEIntrinsicMap, BuiltinID,
9155                                               AArch64SVEIntrinsicsProvenSorted);
9156   SVETypeFlags TypeFlags(Builtin->TypeModifier);
9157   if (TypeFlags.isLoad())
9158     return EmitSVEMaskedLoad(E, Ty, Ops, Builtin->LLVMIntrinsic,
9159                              TypeFlags.isZExtReturn());
9160   else if (TypeFlags.isStore())
9161     return EmitSVEMaskedStore(E, Ops, Builtin->LLVMIntrinsic);
9162   else if (TypeFlags.isGatherLoad())
9163     return EmitSVEGatherLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
9164   else if (TypeFlags.isScatterStore())
9165     return EmitSVEScatterStore(TypeFlags, Ops, Builtin->LLVMIntrinsic);
9166   else if (TypeFlags.isPrefetch())
9167     return EmitSVEPrefetchLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
9168   else if (TypeFlags.isGatherPrefetch())
9169     return EmitSVEGatherPrefetch(TypeFlags, Ops, Builtin->LLVMIntrinsic);
9170 	else if (TypeFlags.isStructLoad())
9171 		return EmitSVEStructLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
9172 	else if (TypeFlags.isStructStore())
9173 		return EmitSVEStructStore(TypeFlags, Ops, Builtin->LLVMIntrinsic);
9174   else if (TypeFlags.isUndef())
9175     return UndefValue::get(Ty);
9176   else if (Builtin->LLVMIntrinsic != 0) {
9177     if (TypeFlags.getMergeType() == SVETypeFlags::MergeZeroExp)
9178       InsertExplicitZeroOperand(Builder, Ty, Ops);
9179 
9180     if (TypeFlags.getMergeType() == SVETypeFlags::MergeAnyExp)
9181       InsertExplicitUndefOperand(Builder, Ty, Ops);
9182 
9183     // Some ACLE builtins leave out the argument to specify the predicate
9184     // pattern, which is expected to be expanded to an SV_ALL pattern.
9185     if (TypeFlags.isAppendSVALL())
9186       Ops.push_back(Builder.getInt32(/*SV_ALL*/ 31));
9187     if (TypeFlags.isInsertOp1SVALL())
9188       Ops.insert(&Ops[1], Builder.getInt32(/*SV_ALL*/ 31));
9189 
9190     // Predicates must match the main datatype.
9191     for (unsigned i = 0, e = Ops.size(); i != e; ++i)
9192       if (auto PredTy = dyn_cast<llvm::VectorType>(Ops[i]->getType()))
9193         if (PredTy->getElementType()->isIntegerTy(1))
9194           Ops[i] = EmitSVEPredicateCast(Ops[i], getSVEType(TypeFlags));
9195 
9196     // Splat scalar operand to vector (intrinsics with _n infix)
9197     if (TypeFlags.hasSplatOperand()) {
9198       unsigned OpNo = TypeFlags.getSplatOperand();
9199       Ops[OpNo] = EmitSVEDupX(Ops[OpNo]);
9200     }
9201 
9202     if (TypeFlags.isReverseCompare())
9203       std::swap(Ops[1], Ops[2]);
9204 
9205     if (TypeFlags.isReverseUSDOT())
9206       std::swap(Ops[1], Ops[2]);
9207 
9208     // Predicated intrinsics with _z suffix need a select w/ zeroinitializer.
9209     if (TypeFlags.getMergeType() == SVETypeFlags::MergeZero) {
9210       llvm::Type *OpndTy = Ops[1]->getType();
9211       auto *SplatZero = Constant::getNullValue(OpndTy);
9212       Function *Sel = CGM.getIntrinsic(Intrinsic::aarch64_sve_sel, OpndTy);
9213       Ops[1] = Builder.CreateCall(Sel, {Ops[0], Ops[1], SplatZero});
9214     }
9215 
9216     Function *F = CGM.getIntrinsic(Builtin->LLVMIntrinsic,
9217                                    getSVEOverloadTypes(TypeFlags, Ty, Ops));
9218     Value *Call = Builder.CreateCall(F, Ops);
9219 
9220     // Predicate results must be converted to svbool_t.
9221     if (auto PredTy = dyn_cast<llvm::VectorType>(Call->getType()))
9222       if (PredTy->getScalarType()->isIntegerTy(1))
9223         Call = EmitSVEPredicateCast(Call, cast<llvm::ScalableVectorType>(Ty));
9224 
9225     return Call;
9226   }
9227 
9228   switch (BuiltinID) {
9229   default:
9230     return nullptr;
9231 
9232   case SVE::BI__builtin_sve_svmov_b_z: {
9233     // svmov_b_z(pg, op) <=> svand_b_z(pg, op, op)
9234     SVETypeFlags TypeFlags(Builtin->TypeModifier);
9235     llvm::Type* OverloadedTy = getSVEType(TypeFlags);
9236     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_and_z, OverloadedTy);
9237     return Builder.CreateCall(F, {Ops[0], Ops[1], Ops[1]});
9238   }
9239 
9240   case SVE::BI__builtin_sve_svnot_b_z: {
9241     // svnot_b_z(pg, op) <=> sveor_b_z(pg, op, pg)
9242     SVETypeFlags TypeFlags(Builtin->TypeModifier);
9243     llvm::Type* OverloadedTy = getSVEType(TypeFlags);
9244     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_eor_z, OverloadedTy);
9245     return Builder.CreateCall(F, {Ops[0], Ops[1], Ops[0]});
9246   }
9247 
9248   case SVE::BI__builtin_sve_svmovlb_u16:
9249   case SVE::BI__builtin_sve_svmovlb_u32:
9250   case SVE::BI__builtin_sve_svmovlb_u64:
9251     return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_ushllb);
9252 
9253   case SVE::BI__builtin_sve_svmovlb_s16:
9254   case SVE::BI__builtin_sve_svmovlb_s32:
9255   case SVE::BI__builtin_sve_svmovlb_s64:
9256     return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_sshllb);
9257 
9258   case SVE::BI__builtin_sve_svmovlt_u16:
9259   case SVE::BI__builtin_sve_svmovlt_u32:
9260   case SVE::BI__builtin_sve_svmovlt_u64:
9261     return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_ushllt);
9262 
9263   case SVE::BI__builtin_sve_svmovlt_s16:
9264   case SVE::BI__builtin_sve_svmovlt_s32:
9265   case SVE::BI__builtin_sve_svmovlt_s64:
9266     return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_sshllt);
9267 
9268   case SVE::BI__builtin_sve_svpmullt_u16:
9269   case SVE::BI__builtin_sve_svpmullt_u64:
9270   case SVE::BI__builtin_sve_svpmullt_n_u16:
9271   case SVE::BI__builtin_sve_svpmullt_n_u64:
9272     return EmitSVEPMull(TypeFlags, Ops, Intrinsic::aarch64_sve_pmullt_pair);
9273 
9274   case SVE::BI__builtin_sve_svpmullb_u16:
9275   case SVE::BI__builtin_sve_svpmullb_u64:
9276   case SVE::BI__builtin_sve_svpmullb_n_u16:
9277   case SVE::BI__builtin_sve_svpmullb_n_u64:
9278     return EmitSVEPMull(TypeFlags, Ops, Intrinsic::aarch64_sve_pmullb_pair);
9279 
9280   case SVE::BI__builtin_sve_svdup_n_b8:
9281   case SVE::BI__builtin_sve_svdup_n_b16:
9282   case SVE::BI__builtin_sve_svdup_n_b32:
9283   case SVE::BI__builtin_sve_svdup_n_b64: {
9284     Value *CmpNE =
9285         Builder.CreateICmpNE(Ops[0], Constant::getNullValue(Ops[0]->getType()));
9286     llvm::ScalableVectorType *OverloadedTy = getSVEType(TypeFlags);
9287     Value *Dup = EmitSVEDupX(CmpNE, OverloadedTy);
9288     return EmitSVEPredicateCast(Dup, cast<llvm::ScalableVectorType>(Ty));
9289   }
9290 
9291   case SVE::BI__builtin_sve_svdupq_n_b8:
9292   case SVE::BI__builtin_sve_svdupq_n_b16:
9293   case SVE::BI__builtin_sve_svdupq_n_b32:
9294   case SVE::BI__builtin_sve_svdupq_n_b64:
9295   case SVE::BI__builtin_sve_svdupq_n_u8:
9296   case SVE::BI__builtin_sve_svdupq_n_s8:
9297   case SVE::BI__builtin_sve_svdupq_n_u64:
9298   case SVE::BI__builtin_sve_svdupq_n_f64:
9299   case SVE::BI__builtin_sve_svdupq_n_s64:
9300   case SVE::BI__builtin_sve_svdupq_n_u16:
9301   case SVE::BI__builtin_sve_svdupq_n_f16:
9302   case SVE::BI__builtin_sve_svdupq_n_bf16:
9303   case SVE::BI__builtin_sve_svdupq_n_s16:
9304   case SVE::BI__builtin_sve_svdupq_n_u32:
9305   case SVE::BI__builtin_sve_svdupq_n_f32:
9306   case SVE::BI__builtin_sve_svdupq_n_s32: {
9307     // These builtins are implemented by storing each element to an array and using
9308     // ld1rq to materialize a vector.
9309     unsigned NumOpnds = Ops.size();
9310 
9311     bool IsBoolTy =
9312         cast<llvm::VectorType>(Ty)->getElementType()->isIntegerTy(1);
9313 
9314     // For svdupq_n_b* the element type of is an integer of type 128/numelts,
9315     // so that the compare can use the width that is natural for the expected
9316     // number of predicate lanes.
9317     llvm::Type *EltTy = Ops[0]->getType();
9318     if (IsBoolTy)
9319       EltTy = IntegerType::get(getLLVMContext(), SVEBitsPerBlock / NumOpnds);
9320 
9321     SmallVector<llvm::Value *, 16> VecOps;
9322     for (unsigned I = 0; I < NumOpnds; ++I)
9323         VecOps.push_back(Builder.CreateZExt(Ops[I], EltTy));
9324     Value *Vec = BuildVector(VecOps);
9325 
9326     SVETypeFlags TypeFlags(Builtin->TypeModifier);
9327     Value *Pred = EmitSVEAllTruePred(TypeFlags);
9328 
9329     llvm::Type *OverloadedTy = getSVEVectorForElementType(EltTy);
9330     Value *InsertSubVec = Builder.CreateInsertVector(
9331         OverloadedTy, UndefValue::get(OverloadedTy), Vec, Builder.getInt64(0));
9332 
9333     Function *F =
9334         CGM.getIntrinsic(Intrinsic::aarch64_sve_dupq_lane, OverloadedTy);
9335     Value *DupQLane =
9336         Builder.CreateCall(F, {InsertSubVec, Builder.getInt64(0)});
9337 
9338     if (!IsBoolTy)
9339       return DupQLane;
9340 
9341     // For svdupq_n_b* we need to add an additional 'cmpne' with '0'.
9342     F = CGM.getIntrinsic(NumOpnds == 2 ? Intrinsic::aarch64_sve_cmpne
9343                                        : Intrinsic::aarch64_sve_cmpne_wide,
9344                          OverloadedTy);
9345     Value *Call = Builder.CreateCall(
9346         F, {Pred, DupQLane, EmitSVEDupX(Builder.getInt64(0))});
9347     return EmitSVEPredicateCast(Call, cast<llvm::ScalableVectorType>(Ty));
9348   }
9349 
9350   case SVE::BI__builtin_sve_svpfalse_b:
9351     return ConstantInt::getFalse(Ty);
9352 
9353   case SVE::BI__builtin_sve_svlen_bf16:
9354   case SVE::BI__builtin_sve_svlen_f16:
9355   case SVE::BI__builtin_sve_svlen_f32:
9356   case SVE::BI__builtin_sve_svlen_f64:
9357   case SVE::BI__builtin_sve_svlen_s8:
9358   case SVE::BI__builtin_sve_svlen_s16:
9359   case SVE::BI__builtin_sve_svlen_s32:
9360   case SVE::BI__builtin_sve_svlen_s64:
9361   case SVE::BI__builtin_sve_svlen_u8:
9362   case SVE::BI__builtin_sve_svlen_u16:
9363   case SVE::BI__builtin_sve_svlen_u32:
9364   case SVE::BI__builtin_sve_svlen_u64: {
9365     SVETypeFlags TF(Builtin->TypeModifier);
9366     auto VTy = cast<llvm::VectorType>(getSVEType(TF));
9367     auto *NumEls =
9368         llvm::ConstantInt::get(Ty, VTy->getElementCount().getKnownMinValue());
9369 
9370     Function *F = CGM.getIntrinsic(Intrinsic::vscale, Ty);
9371     return Builder.CreateMul(NumEls, Builder.CreateCall(F));
9372   }
9373 
9374   case SVE::BI__builtin_sve_svtbl2_u8:
9375   case SVE::BI__builtin_sve_svtbl2_s8:
9376   case SVE::BI__builtin_sve_svtbl2_u16:
9377   case SVE::BI__builtin_sve_svtbl2_s16:
9378   case SVE::BI__builtin_sve_svtbl2_u32:
9379   case SVE::BI__builtin_sve_svtbl2_s32:
9380   case SVE::BI__builtin_sve_svtbl2_u64:
9381   case SVE::BI__builtin_sve_svtbl2_s64:
9382   case SVE::BI__builtin_sve_svtbl2_f16:
9383   case SVE::BI__builtin_sve_svtbl2_bf16:
9384   case SVE::BI__builtin_sve_svtbl2_f32:
9385   case SVE::BI__builtin_sve_svtbl2_f64: {
9386     SVETypeFlags TF(Builtin->TypeModifier);
9387     auto VTy = cast<llvm::VectorType>(getSVEType(TF));
9388     auto TupleTy = llvm::VectorType::getDoubleElementsVectorType(VTy);
9389     Function *FExtr =
9390         CGM.getIntrinsic(Intrinsic::aarch64_sve_tuple_get, {VTy, TupleTy});
9391     Value *V0 = Builder.CreateCall(FExtr, {Ops[0], Builder.getInt32(0)});
9392     Value *V1 = Builder.CreateCall(FExtr, {Ops[0], Builder.getInt32(1)});
9393     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_tbl2, VTy);
9394     return Builder.CreateCall(F, {V0, V1, Ops[1]});
9395   }
9396 
9397   case SVE::BI__builtin_sve_svset_neonq_s8:
9398   case SVE::BI__builtin_sve_svset_neonq_s16:
9399   case SVE::BI__builtin_sve_svset_neonq_s32:
9400   case SVE::BI__builtin_sve_svset_neonq_s64:
9401   case SVE::BI__builtin_sve_svset_neonq_u8:
9402   case SVE::BI__builtin_sve_svset_neonq_u16:
9403   case SVE::BI__builtin_sve_svset_neonq_u32:
9404   case SVE::BI__builtin_sve_svset_neonq_u64:
9405   case SVE::BI__builtin_sve_svset_neonq_f16:
9406   case SVE::BI__builtin_sve_svset_neonq_f32:
9407   case SVE::BI__builtin_sve_svset_neonq_f64:
9408   case SVE::BI__builtin_sve_svset_neonq_bf16: {
9409     return Builder.CreateInsertVector(Ty, Ops[0], Ops[1], Builder.getInt64(0));
9410   }
9411 
9412   case SVE::BI__builtin_sve_svget_neonq_s8:
9413   case SVE::BI__builtin_sve_svget_neonq_s16:
9414   case SVE::BI__builtin_sve_svget_neonq_s32:
9415   case SVE::BI__builtin_sve_svget_neonq_s64:
9416   case SVE::BI__builtin_sve_svget_neonq_u8:
9417   case SVE::BI__builtin_sve_svget_neonq_u16:
9418   case SVE::BI__builtin_sve_svget_neonq_u32:
9419   case SVE::BI__builtin_sve_svget_neonq_u64:
9420   case SVE::BI__builtin_sve_svget_neonq_f16:
9421   case SVE::BI__builtin_sve_svget_neonq_f32:
9422   case SVE::BI__builtin_sve_svget_neonq_f64:
9423   case SVE::BI__builtin_sve_svget_neonq_bf16: {
9424     return Builder.CreateExtractVector(Ty, Ops[0], Builder.getInt64(0));
9425   }
9426 
9427   case SVE::BI__builtin_sve_svdup_neonq_s8:
9428   case SVE::BI__builtin_sve_svdup_neonq_s16:
9429   case SVE::BI__builtin_sve_svdup_neonq_s32:
9430   case SVE::BI__builtin_sve_svdup_neonq_s64:
9431   case SVE::BI__builtin_sve_svdup_neonq_u8:
9432   case SVE::BI__builtin_sve_svdup_neonq_u16:
9433   case SVE::BI__builtin_sve_svdup_neonq_u32:
9434   case SVE::BI__builtin_sve_svdup_neonq_u64:
9435   case SVE::BI__builtin_sve_svdup_neonq_f16:
9436   case SVE::BI__builtin_sve_svdup_neonq_f32:
9437   case SVE::BI__builtin_sve_svdup_neonq_f64:
9438   case SVE::BI__builtin_sve_svdup_neonq_bf16: {
9439     Value *Insert = Builder.CreateInsertVector(Ty, UndefValue::get(Ty), Ops[0],
9440                                                Builder.getInt64(0));
9441     return Builder.CreateIntrinsic(Intrinsic::aarch64_sve_dupq_lane, {Ty},
9442                                    {Insert, Builder.getInt64(0)});
9443   }
9444   }
9445 
9446   /// Should not happen
9447   return nullptr;
9448 }
9449 
9450 Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
9451                                                const CallExpr *E,
9452                                                llvm::Triple::ArchType Arch) {
9453   if (BuiltinID >= AArch64::FirstSVEBuiltin &&
9454       BuiltinID <= AArch64::LastSVEBuiltin)
9455     return EmitAArch64SVEBuiltinExpr(BuiltinID, E);
9456 
9457   unsigned HintID = static_cast<unsigned>(-1);
9458   switch (BuiltinID) {
9459   default: break;
9460   case AArch64::BI__builtin_arm_nop:
9461     HintID = 0;
9462     break;
9463   case AArch64::BI__builtin_arm_yield:
9464   case AArch64::BI__yield:
9465     HintID = 1;
9466     break;
9467   case AArch64::BI__builtin_arm_wfe:
9468   case AArch64::BI__wfe:
9469     HintID = 2;
9470     break;
9471   case AArch64::BI__builtin_arm_wfi:
9472   case AArch64::BI__wfi:
9473     HintID = 3;
9474     break;
9475   case AArch64::BI__builtin_arm_sev:
9476   case AArch64::BI__sev:
9477     HintID = 4;
9478     break;
9479   case AArch64::BI__builtin_arm_sevl:
9480   case AArch64::BI__sevl:
9481     HintID = 5;
9482     break;
9483   }
9484 
9485   if (HintID != static_cast<unsigned>(-1)) {
9486     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_hint);
9487     return Builder.CreateCall(F, llvm::ConstantInt::get(Int32Ty, HintID));
9488   }
9489 
9490   if (BuiltinID == AArch64::BI__builtin_arm_prefetch) {
9491     Value *Address         = EmitScalarExpr(E->getArg(0));
9492     Value *RW              = EmitScalarExpr(E->getArg(1));
9493     Value *CacheLevel      = EmitScalarExpr(E->getArg(2));
9494     Value *RetentionPolicy = EmitScalarExpr(E->getArg(3));
9495     Value *IsData          = EmitScalarExpr(E->getArg(4));
9496 
9497     Value *Locality = nullptr;
9498     if (cast<llvm::ConstantInt>(RetentionPolicy)->isZero()) {
9499       // Temporal fetch, needs to convert cache level to locality.
9500       Locality = llvm::ConstantInt::get(Int32Ty,
9501         -cast<llvm::ConstantInt>(CacheLevel)->getValue() + 3);
9502     } else {
9503       // Streaming fetch.
9504       Locality = llvm::ConstantInt::get(Int32Ty, 0);
9505     }
9506 
    // FIXME: We need an AArch64-specific LLVM intrinsic if we want to specify
    // PLDL3STRM or PLDL2STRM.
9509     Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType());
9510     return Builder.CreateCall(F, {Address, RW, Locality, IsData});
9511   }
9512 
9513   if (BuiltinID == AArch64::BI__builtin_arm_rbit) {
9514     assert((getContext().getTypeSize(E->getType()) == 32) &&
9515            "rbit of unusual size!");
9516     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
9517     return Builder.CreateCall(
9518         CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
9519   }
9520   if (BuiltinID == AArch64::BI__builtin_arm_rbit64) {
9521     assert((getContext().getTypeSize(E->getType()) == 64) &&
9522            "rbit of unusual size!");
9523     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
9524     return Builder.CreateCall(
9525         CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
9526   }
9527 
9528   if (BuiltinID == AArch64::BI__builtin_arm_cls) {
9529     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
9530     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_cls), Arg,
9531                               "cls");
9532   }
9533   if (BuiltinID == AArch64::BI__builtin_arm_cls64) {
9534     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
9535     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_cls64), Arg,
9536                               "cls");
9537   }
9538 
9539   if (BuiltinID == AArch64::BI__builtin_arm_frint32zf ||
9540       BuiltinID == AArch64::BI__builtin_arm_frint32z) {
9541     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
9542     llvm::Type *Ty = Arg->getType();
9543     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint32z, Ty),
9544                               Arg, "frint32z");
9545   }
9546 
9547   if (BuiltinID == AArch64::BI__builtin_arm_frint64zf ||
9548       BuiltinID == AArch64::BI__builtin_arm_frint64z) {
9549     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
9550     llvm::Type *Ty = Arg->getType();
9551     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint64z, Ty),
9552                               Arg, "frint64z");
9553   }
9554 
9555   if (BuiltinID == AArch64::BI__builtin_arm_frint32xf ||
9556       BuiltinID == AArch64::BI__builtin_arm_frint32x) {
9557     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
9558     llvm::Type *Ty = Arg->getType();
9559     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint32x, Ty),
9560                               Arg, "frint32x");
9561   }
9562 
9563   if (BuiltinID == AArch64::BI__builtin_arm_frint64xf ||
9564       BuiltinID == AArch64::BI__builtin_arm_frint64x) {
9565     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
9566     llvm::Type *Ty = Arg->getType();
9567     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint64x, Ty),
9568                               Arg, "frint64x");
9569   }
9570 
9571   if (BuiltinID == AArch64::BI__builtin_arm_jcvt) {
9572     assert((getContext().getTypeSize(E->getType()) == 32) &&
9573            "__jcvt of unusual size!");
9574     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
9575     return Builder.CreateCall(
9576         CGM.getIntrinsic(Intrinsic::aarch64_fjcvtzs), Arg);
9577   }
9578 
9579   if (BuiltinID == AArch64::BI__builtin_arm_ld64b ||
9580       BuiltinID == AArch64::BI__builtin_arm_st64b ||
9581       BuiltinID == AArch64::BI__builtin_arm_st64bv ||
9582       BuiltinID == AArch64::BI__builtin_arm_st64bv0) {
9583     llvm::Value *MemAddr = EmitScalarExpr(E->getArg(0));
9584     llvm::Value *ValPtr = EmitScalarExpr(E->getArg(1));
9585 
9586     if (BuiltinID == AArch64::BI__builtin_arm_ld64b) {
9587       // Load from the address via an LLVM intrinsic, receiving a
9588       // tuple of 8 i64 words, and store each one to ValPtr.
9589       Function *F = CGM.getIntrinsic(Intrinsic::aarch64_ld64b);
9590       llvm::Value *Val = Builder.CreateCall(F, MemAddr);
9591       llvm::Value *ToRet;
9592       for (size_t i = 0; i < 8; i++) {
9593         llvm::Value *ValOffsetPtr =
9594             Builder.CreateGEP(Int64Ty, ValPtr, Builder.getInt32(i));
9595         Address Addr(ValOffsetPtr, CharUnits::fromQuantity(8));
9596         ToRet = Builder.CreateStore(Builder.CreateExtractValue(Val, i), Addr);
9597       }
9598       return ToRet;
9599     } else {
9600       // Load 8 i64 words from ValPtr, and store them to the address
9601       // via an LLVM intrinsic.
9602       SmallVector<llvm::Value *, 9> Args;
9603       Args.push_back(MemAddr);
9604       for (size_t i = 0; i < 8; i++) {
9605         llvm::Value *ValOffsetPtr =
9606             Builder.CreateGEP(Int64Ty, ValPtr, Builder.getInt32(i));
9607         Address Addr(ValOffsetPtr, CharUnits::fromQuantity(8));
9608         Args.push_back(Builder.CreateLoad(Addr));
9609       }
9610 
9611       auto Intr = (BuiltinID == AArch64::BI__builtin_arm_st64b
9612                        ? Intrinsic::aarch64_st64b
9613                        : BuiltinID == AArch64::BI__builtin_arm_st64bv
9614                              ? Intrinsic::aarch64_st64bv
9615                              : Intrinsic::aarch64_st64bv0);
9616       Function *F = CGM.getIntrinsic(Intr);
9617       return Builder.CreateCall(F, Args);
9618     }
9619   }
9620 
9621   if (BuiltinID == AArch64::BI__builtin_arm_rndr ||
9622       BuiltinID == AArch64::BI__builtin_arm_rndrrs) {
9623 
9624     auto Intr = (BuiltinID == AArch64::BI__builtin_arm_rndr
9625                      ? Intrinsic::aarch64_rndr
9626                      : Intrinsic::aarch64_rndrrs);
9627     Function *F = CGM.getIntrinsic(Intr);
9628     llvm::Value *Val = Builder.CreateCall(F);
9629     Value *RandomValue = Builder.CreateExtractValue(Val, 0);
9630     Value *Status = Builder.CreateExtractValue(Val, 1);
9631 
9632     Address MemAddress = EmitPointerWithAlignment(E->getArg(0));
9633     Builder.CreateStore(RandomValue, MemAddress);
9634     Status = Builder.CreateZExt(Status, Int32Ty);
9635     return Status;
9636   }
9637 
9638   if (BuiltinID == AArch64::BI__clear_cache) {
9639     assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
9640     const FunctionDecl *FD = E->getDirectCallee();
9641     Value *Ops[2];
9642     for (unsigned i = 0; i < 2; i++)
9643       Ops[i] = EmitScalarExpr(E->getArg(i));
9644     llvm::Type *Ty = CGM.getTypes().ConvertType(FD->getType());
9645     llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
9646     StringRef Name = FD->getName();
9647     return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops);
9648   }
9649 
9650   if ((BuiltinID == AArch64::BI__builtin_arm_ldrex ||
9651       BuiltinID == AArch64::BI__builtin_arm_ldaex) &&
9652       getContext().getTypeSize(E->getType()) == 128) {
9653     Function *F = CGM.getIntrinsic(BuiltinID == AArch64::BI__builtin_arm_ldaex
9654                                        ? Intrinsic::aarch64_ldaxp
9655                                        : Intrinsic::aarch64_ldxp);
9656 
9657     Value *LdPtr = EmitScalarExpr(E->getArg(0));
9658     Value *Val = Builder.CreateCall(F, Builder.CreateBitCast(LdPtr, Int8PtrTy),
9659                                     "ldxp");
9660 
9661     Value *Val0 = Builder.CreateExtractValue(Val, 1);
9662     Value *Val1 = Builder.CreateExtractValue(Val, 0);
9663     llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
9664     Val0 = Builder.CreateZExt(Val0, Int128Ty);
9665     Val1 = Builder.CreateZExt(Val1, Int128Ty);
9666 
9667     Value *ShiftCst = llvm::ConstantInt::get(Int128Ty, 64);
9668     Val = Builder.CreateShl(Val0, ShiftCst, "shl", true /* nuw */);
9669     Val = Builder.CreateOr(Val, Val1);
9670     return Builder.CreateBitCast(Val, ConvertType(E->getType()));
9671   } else if (BuiltinID == AArch64::BI__builtin_arm_ldrex ||
9672              BuiltinID == AArch64::BI__builtin_arm_ldaex) {
9673     Value *LoadAddr = EmitScalarExpr(E->getArg(0));
9674 
9675     QualType Ty = E->getType();
9676     llvm::Type *RealResTy = ConvertType(Ty);
9677     llvm::Type *PtrTy = llvm::IntegerType::get(
9678         getLLVMContext(), getContext().getTypeSize(Ty))->getPointerTo();
9679     LoadAddr = Builder.CreateBitCast(LoadAddr, PtrTy);
9680 
9681     Function *F = CGM.getIntrinsic(BuiltinID == AArch64::BI__builtin_arm_ldaex
9682                                        ? Intrinsic::aarch64_ldaxr
9683                                        : Intrinsic::aarch64_ldxr,
9684                                    PtrTy);
9685     Value *Val = Builder.CreateCall(F, LoadAddr, "ldxr");
9686 
9687     if (RealResTy->isPointerTy())
9688       return Builder.CreateIntToPtr(Val, RealResTy);
9689 
9690     llvm::Type *IntResTy = llvm::IntegerType::get(
9691         getLLVMContext(), CGM.getDataLayout().getTypeSizeInBits(RealResTy));
9692     Val = Builder.CreateTruncOrBitCast(Val, IntResTy);
9693     return Builder.CreateBitCast(Val, RealResTy);
9694   }
9695 
9696   if ((BuiltinID == AArch64::BI__builtin_arm_strex ||
9697        BuiltinID == AArch64::BI__builtin_arm_stlex) &&
9698       getContext().getTypeSize(E->getArg(0)->getType()) == 128) {
9699     Function *F = CGM.getIntrinsic(BuiltinID == AArch64::BI__builtin_arm_stlex
9700                                        ? Intrinsic::aarch64_stlxp
9701                                        : Intrinsic::aarch64_stxp);
9702     llvm::Type *STy = llvm::StructType::get(Int64Ty, Int64Ty);
9703 
9704     Address Tmp = CreateMemTemp(E->getArg(0)->getType());
9705     EmitAnyExprToMem(E->getArg(0), Tmp, Qualifiers(), /*init*/ true);
9706 
9707     Tmp = Builder.CreateBitCast(Tmp, llvm::PointerType::getUnqual(STy));
9708     llvm::Value *Val = Builder.CreateLoad(Tmp);
9709 
9710     Value *Arg0 = Builder.CreateExtractValue(Val, 0);
9711     Value *Arg1 = Builder.CreateExtractValue(Val, 1);
9712     Value *StPtr = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)),
9713                                          Int8PtrTy);
9714     return Builder.CreateCall(F, {Arg0, Arg1, StPtr}, "stxp");
9715   }
9716 
9717   if (BuiltinID == AArch64::BI__builtin_arm_strex ||
9718       BuiltinID == AArch64::BI__builtin_arm_stlex) {
9719     Value *StoreVal = EmitScalarExpr(E->getArg(0));
9720     Value *StoreAddr = EmitScalarExpr(E->getArg(1));
9721 
9722     QualType Ty = E->getArg(0)->getType();
9723     llvm::Type *StoreTy = llvm::IntegerType::get(getLLVMContext(),
9724                                                  getContext().getTypeSize(Ty));
9725     StoreAddr = Builder.CreateBitCast(StoreAddr, StoreTy->getPointerTo());
9726 
9727     if (StoreVal->getType()->isPointerTy())
9728       StoreVal = Builder.CreatePtrToInt(StoreVal, Int64Ty);
9729     else {
9730       llvm::Type *IntTy = llvm::IntegerType::get(
9731           getLLVMContext(),
9732           CGM.getDataLayout().getTypeSizeInBits(StoreVal->getType()));
9733       StoreVal = Builder.CreateBitCast(StoreVal, IntTy);
9734       StoreVal = Builder.CreateZExtOrBitCast(StoreVal, Int64Ty);
9735     }
9736 
9737     Function *F = CGM.getIntrinsic(BuiltinID == AArch64::BI__builtin_arm_stlex
9738                                        ? Intrinsic::aarch64_stlxr
9739                                        : Intrinsic::aarch64_stxr,
9740                                    StoreAddr->getType());
9741     return Builder.CreateCall(F, {StoreVal, StoreAddr}, "stxr");
9742   }
9743 
9744   if (BuiltinID == AArch64::BI__getReg) {
9745     Expr::EvalResult Result;
9746     if (!E->getArg(0)->EvaluateAsInt(Result, CGM.getContext()))
9747       llvm_unreachable("Sema will ensure that the parameter is constant");
9748 
9749     llvm::APSInt Value = Result.Val.getInt();
9750     LLVMContext &Context = CGM.getLLVMContext();
9751     std::string Reg = Value == 31 ? "sp" : "x" + toString(Value, 10);
9752 
9753     llvm::Metadata *Ops[] = {llvm::MDString::get(Context, Reg)};
9754     llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
9755     llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
9756 
9757     llvm::Function *F =
9758         CGM.getIntrinsic(llvm::Intrinsic::read_register, {Int64Ty});
9759     return Builder.CreateCall(F, Metadata);
9760   }
9761 
9762   if (BuiltinID == AArch64::BI__builtin_arm_clrex) {
9763     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_clrex);
9764     return Builder.CreateCall(F);
9765   }
9766 
9767   if (BuiltinID == AArch64::BI_ReadWriteBarrier)
9768     return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
9769                                llvm::SyncScope::SingleThread);
9770 
9771   // CRC32
9772   Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic;
9773   switch (BuiltinID) {
9774   case AArch64::BI__builtin_arm_crc32b:
9775     CRCIntrinsicID = Intrinsic::aarch64_crc32b; break;
9776   case AArch64::BI__builtin_arm_crc32cb:
9777     CRCIntrinsicID = Intrinsic::aarch64_crc32cb; break;
9778   case AArch64::BI__builtin_arm_crc32h:
9779     CRCIntrinsicID = Intrinsic::aarch64_crc32h; break;
9780   case AArch64::BI__builtin_arm_crc32ch:
9781     CRCIntrinsicID = Intrinsic::aarch64_crc32ch; break;
9782   case AArch64::BI__builtin_arm_crc32w:
9783     CRCIntrinsicID = Intrinsic::aarch64_crc32w; break;
9784   case AArch64::BI__builtin_arm_crc32cw:
9785     CRCIntrinsicID = Intrinsic::aarch64_crc32cw; break;
9786   case AArch64::BI__builtin_arm_crc32d:
9787     CRCIntrinsicID = Intrinsic::aarch64_crc32x; break;
9788   case AArch64::BI__builtin_arm_crc32cd:
9789     CRCIntrinsicID = Intrinsic::aarch64_crc32cx; break;
9790   }
9791 
9792   if (CRCIntrinsicID != Intrinsic::not_intrinsic) {
9793     Value *Arg0 = EmitScalarExpr(E->getArg(0));
9794     Value *Arg1 = EmitScalarExpr(E->getArg(1));
9795     Function *F = CGM.getIntrinsic(CRCIntrinsicID);
9796 
9797     llvm::Type *DataTy = F->getFunctionType()->getParamType(1);
9798     Arg1 = Builder.CreateZExtOrBitCast(Arg1, DataTy);
9799 
9800     return Builder.CreateCall(F, {Arg0, Arg1});
9801   }
9802 
9803   // Memory Operations (MOPS)
9804   if (BuiltinID == AArch64::BI__builtin_arm_mops_memset_tag) {
9805     Value *Dst = EmitScalarExpr(E->getArg(0));
9806     Value *Val = EmitScalarExpr(E->getArg(1));
9807     Value *Size = EmitScalarExpr(E->getArg(2));
9808     Dst = Builder.CreatePointerCast(Dst, Int8PtrTy);
9809     Val = Builder.CreateTrunc(Val, Int8Ty);
9810     Size = Builder.CreateIntCast(Size, Int64Ty, false);
9811     return Builder.CreateCall(
9812         CGM.getIntrinsic(Intrinsic::aarch64_mops_memset_tag), {Dst, Val, Size});
9813   }
9814 
9815   // Memory Tagging Extensions (MTE) Intrinsics
9816   Intrinsic::ID MTEIntrinsicID = Intrinsic::not_intrinsic;
9817   switch (BuiltinID) {
9818   case AArch64::BI__builtin_arm_irg:
9819     MTEIntrinsicID = Intrinsic::aarch64_irg; break;
9820   case  AArch64::BI__builtin_arm_addg:
9821     MTEIntrinsicID = Intrinsic::aarch64_addg; break;
9822   case  AArch64::BI__builtin_arm_gmi:
9823     MTEIntrinsicID = Intrinsic::aarch64_gmi; break;
9824   case  AArch64::BI__builtin_arm_ldg:
9825     MTEIntrinsicID = Intrinsic::aarch64_ldg; break;
9826   case AArch64::BI__builtin_arm_stg:
9827     MTEIntrinsicID = Intrinsic::aarch64_stg; break;
9828   case AArch64::BI__builtin_arm_subp:
9829     MTEIntrinsicID = Intrinsic::aarch64_subp; break;
9830   }
9831 
9832   if (MTEIntrinsicID != Intrinsic::not_intrinsic) {
9833     llvm::Type *T = ConvertType(E->getType());
9834 
9835     if (MTEIntrinsicID == Intrinsic::aarch64_irg) {
9836       Value *Pointer = EmitScalarExpr(E->getArg(0));
9837       Value *Mask = EmitScalarExpr(E->getArg(1));
9838 
9839       Pointer = Builder.CreatePointerCast(Pointer, Int8PtrTy);
9840       Mask = Builder.CreateZExt(Mask, Int64Ty);
9841       Value *RV = Builder.CreateCall(
9842                        CGM.getIntrinsic(MTEIntrinsicID), {Pointer, Mask});
9843        return Builder.CreatePointerCast(RV, T);
9844     }
9845     if (MTEIntrinsicID == Intrinsic::aarch64_addg) {
9846       Value *Pointer = EmitScalarExpr(E->getArg(0));
9847       Value *TagOffset = EmitScalarExpr(E->getArg(1));
9848 
9849       Pointer = Builder.CreatePointerCast(Pointer, Int8PtrTy);
9850       TagOffset = Builder.CreateZExt(TagOffset, Int64Ty);
9851       Value *RV = Builder.CreateCall(
9852                        CGM.getIntrinsic(MTEIntrinsicID), {Pointer, TagOffset});
9853       return Builder.CreatePointerCast(RV, T);
9854     }
9855     if (MTEIntrinsicID == Intrinsic::aarch64_gmi) {
9856       Value *Pointer = EmitScalarExpr(E->getArg(0));
9857       Value *ExcludedMask = EmitScalarExpr(E->getArg(1));
9858 
9859       ExcludedMask = Builder.CreateZExt(ExcludedMask, Int64Ty);
9860       Pointer = Builder.CreatePointerCast(Pointer, Int8PtrTy);
9861       return Builder.CreateCall(
9862                        CGM.getIntrinsic(MTEIntrinsicID), {Pointer, ExcludedMask});
9863     }
    // Although it is possible to supply a different return
    // address (first arg) to this intrinsic, for now we set the
    // return address to be the same as the input address.
9867     if (MTEIntrinsicID == Intrinsic::aarch64_ldg) {
9868       Value *TagAddress = EmitScalarExpr(E->getArg(0));
9869       TagAddress = Builder.CreatePointerCast(TagAddress, Int8PtrTy);
9870       Value *RV = Builder.CreateCall(
9871                     CGM.getIntrinsic(MTEIntrinsicID), {TagAddress, TagAddress});
9872       return Builder.CreatePointerCast(RV, T);
9873     }
    // Although it is possible to supply a different tag (to set)
    // to this intrinsic (as first arg), for now we supply
    // the tag that is in the input address arg (common use case).
9877     if (MTEIntrinsicID == Intrinsic::aarch64_stg) {
9878         Value *TagAddress = EmitScalarExpr(E->getArg(0));
9879         TagAddress = Builder.CreatePointerCast(TagAddress, Int8PtrTy);
9880         return Builder.CreateCall(
9881                  CGM.getIntrinsic(MTEIntrinsicID), {TagAddress, TagAddress});
9882     }
9883     if (MTEIntrinsicID == Intrinsic::aarch64_subp) {
9884       Value *PointerA = EmitScalarExpr(E->getArg(0));
9885       Value *PointerB = EmitScalarExpr(E->getArg(1));
9886       PointerA = Builder.CreatePointerCast(PointerA, Int8PtrTy);
9887       PointerB = Builder.CreatePointerCast(PointerB, Int8PtrTy);
9888       return Builder.CreateCall(
9889                        CGM.getIntrinsic(MTEIntrinsicID), {PointerA, PointerB});
9890     }
9891   }
9892 
9893   if (BuiltinID == AArch64::BI__builtin_arm_rsr ||
9894       BuiltinID == AArch64::BI__builtin_arm_rsr64 ||
9895       BuiltinID == AArch64::BI__builtin_arm_rsrp ||
9896       BuiltinID == AArch64::BI__builtin_arm_wsr ||
9897       BuiltinID == AArch64::BI__builtin_arm_wsr64 ||
9898       BuiltinID == AArch64::BI__builtin_arm_wsrp) {
9899 
9900     SpecialRegisterAccessKind AccessKind = Write;
9901     if (BuiltinID == AArch64::BI__builtin_arm_rsr ||
9902         BuiltinID == AArch64::BI__builtin_arm_rsr64 ||
9903         BuiltinID == AArch64::BI__builtin_arm_rsrp)
9904       AccessKind = VolatileRead;
9905 
9906     bool IsPointerBuiltin = BuiltinID == AArch64::BI__builtin_arm_rsrp ||
9907                             BuiltinID == AArch64::BI__builtin_arm_wsrp;
9908 
9909     bool Is64Bit = BuiltinID != AArch64::BI__builtin_arm_rsr &&
9910                    BuiltinID != AArch64::BI__builtin_arm_wsr;
9911 
9912     llvm::Type *ValueType;
9913     llvm::Type *RegisterType = Int64Ty;
9914     if (IsPointerBuiltin) {
9915       ValueType = VoidPtrTy;
9916     } else if (Is64Bit) {
9917       ValueType = Int64Ty;
9918     } else {
9919       ValueType = Int32Ty;
9920     }
9921 
9922     return EmitSpecialRegisterBuiltin(*this, E, RegisterType, ValueType,
9923                                       AccessKind);
9924   }
9925 
9926   if (BuiltinID == AArch64::BI_ReadStatusReg ||
9927       BuiltinID == AArch64::BI_WriteStatusReg) {
9928     LLVMContext &Context = CGM.getLLVMContext();
9929 
9930     unsigned SysReg =
9931       E->getArg(0)->EvaluateKnownConstInt(getContext()).getZExtValue();
9932 
9933     std::string SysRegStr;
9934     llvm::raw_string_ostream(SysRegStr) <<
9935                        ((1 << 1) | ((SysReg >> 14) & 1))  << ":" <<
9936                        ((SysReg >> 11) & 7)               << ":" <<
9937                        ((SysReg >> 7)  & 15)              << ":" <<
9938                        ((SysReg >> 3)  & 15)              << ":" <<
9939                        ( SysReg        & 7);
9940 
9941     llvm::Metadata *Ops[] = { llvm::MDString::get(Context, SysRegStr) };
9942     llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
9943     llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
9944 
9945     llvm::Type *RegisterType = Int64Ty;
9946     llvm::Type *Types[] = { RegisterType };
9947 
9948     if (BuiltinID == AArch64::BI_ReadStatusReg) {
9949       llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::read_register, Types);
9950 
9951       return Builder.CreateCall(F, Metadata);
9952     }
9953 
9954     llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::write_register, Types);
9955     llvm::Value *ArgValue = EmitScalarExpr(E->getArg(1));
9956 
9957     return Builder.CreateCall(F, { Metadata, ArgValue });
9958   }
9959 
9960   if (BuiltinID == AArch64::BI_AddressOfReturnAddress) {
9961     llvm::Function *F =
9962         CGM.getIntrinsic(Intrinsic::addressofreturnaddress, AllocaInt8PtrTy);
9963     return Builder.CreateCall(F);
9964   }
9965 
9966   if (BuiltinID == AArch64::BI__builtin_sponentry) {
9967     llvm::Function *F = CGM.getIntrinsic(Intrinsic::sponentry, AllocaInt8PtrTy);
9968     return Builder.CreateCall(F);
9969   }
9970 
9971   if (BuiltinID == AArch64::BI__mulh || BuiltinID == AArch64::BI__umulh) {
9972     llvm::Type *ResType = ConvertType(E->getType());
9973     llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
9974 
9975     bool IsSigned = BuiltinID == AArch64::BI__mulh;
9976     Value *LHS =
9977         Builder.CreateIntCast(EmitScalarExpr(E->getArg(0)), Int128Ty, IsSigned);
9978     Value *RHS =
9979         Builder.CreateIntCast(EmitScalarExpr(E->getArg(1)), Int128Ty, IsSigned);
9980 
9981     Value *MulResult, *HigherBits;
9982     if (IsSigned) {
9983       MulResult = Builder.CreateNSWMul(LHS, RHS);
9984       HigherBits = Builder.CreateAShr(MulResult, 64);
9985     } else {
9986       MulResult = Builder.CreateNUWMul(LHS, RHS);
9987       HigherBits = Builder.CreateLShr(MulResult, 64);
9988     }
9989     HigherBits = Builder.CreateIntCast(HigherBits, ResType, IsSigned);
9990 
9991     return HigherBits;
9992   }
9993 
9994   // Handle MSVC intrinsics before argument evaluation to prevent double
9995   // evaluation.
9996   if (Optional<MSVCIntrin> MsvcIntId = translateAarch64ToMsvcIntrin(BuiltinID))
9997     return EmitMSVCBuiltinExpr(*MsvcIntId, E);
9998 
9999   // Find out if any arguments are required to be integer constant
10000   // expressions.
10001   unsigned ICEArguments = 0;
10002   ASTContext::GetBuiltinTypeError Error;
10003   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
10004   assert(Error == ASTContext::GE_None && "Should not codegen an error");
10005 
10006   llvm::SmallVector<Value*, 4> Ops;
10007   Address PtrOp0 = Address::invalid();
10008   for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++) {
10009     if (i == 0) {
10010       switch (BuiltinID) {
10011       case NEON::BI__builtin_neon_vld1_v:
10012       case NEON::BI__builtin_neon_vld1q_v:
10013       case NEON::BI__builtin_neon_vld1_dup_v:
10014       case NEON::BI__builtin_neon_vld1q_dup_v:
10015       case NEON::BI__builtin_neon_vld1_lane_v:
10016       case NEON::BI__builtin_neon_vld1q_lane_v:
10017       case NEON::BI__builtin_neon_vst1_v:
10018       case NEON::BI__builtin_neon_vst1q_v:
10019       case NEON::BI__builtin_neon_vst1_lane_v:
10020       case NEON::BI__builtin_neon_vst1q_lane_v:
10021         // Get the alignment for the argument in addition to the value;
10022         // we'll use it later.
10023         PtrOp0 = EmitPointerWithAlignment(E->getArg(0));
10024         Ops.push_back(PtrOp0.getPointer());
10025         continue;
10026       }
10027     }
10028     if ((ICEArguments & (1 << i)) == 0) {
10029       Ops.push_back(EmitScalarExpr(E->getArg(i)));
10030     } else {
10031       // If this is required to be a constant, constant fold it so that we know
10032       // that the generated intrinsic gets a ConstantInt.
10033       Ops.push_back(llvm::ConstantInt::get(
10034           getLLVMContext(),
10035           *E->getArg(i)->getIntegerConstantExpr(getContext())));
10036     }
10037   }
10038 
10039   auto SISDMap = makeArrayRef(AArch64SISDIntrinsicMap);
10040   const ARMVectorIntrinsicInfo *Builtin = findARMVectorIntrinsicInMap(
10041       SISDMap, BuiltinID, AArch64SISDIntrinsicsProvenSorted);
10042 
10043   if (Builtin) {
10044     Ops.push_back(EmitScalarExpr(E->getArg(E->getNumArgs() - 1)));
10045     Value *Result = EmitCommonNeonSISDBuiltinExpr(*this, *Builtin, Ops, E);
10046     assert(Result && "SISD intrinsic should have been handled");
10047     return Result;
10048   }
10049 
10050   const Expr *Arg = E->getArg(E->getNumArgs()-1);
10051   NeonTypeFlags Type(0);
10052   if (Optional<llvm::APSInt> Result = Arg->getIntegerConstantExpr(getContext()))
10053     // Determine the type of this overloaded NEON intrinsic.
10054     Type = NeonTypeFlags(Result->getZExtValue());
10055 
10056   bool usgn = Type.isUnsigned();
10057   bool quad = Type.isQuad();
10058 
10059   // Handle non-overloaded intrinsics first.
10060   switch (BuiltinID) {
10061   default: break;
10062   case NEON::BI__builtin_neon_vabsh_f16:
10063     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10064     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, HalfTy), Ops, "vabs");
10065   case NEON::BI__builtin_neon_vaddq_p128: {
10066     llvm::Type *Ty = GetNeonType(this, NeonTypeFlags::Poly128);
10067     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10068     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
10069     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
10070     Ops[0] =  Builder.CreateXor(Ops[0], Ops[1]);
10071     llvm::Type *Int128Ty = llvm::Type::getIntNTy(getLLVMContext(), 128);
10072     return Builder.CreateBitCast(Ops[0], Int128Ty);
10073   }
10074   case NEON::BI__builtin_neon_vldrq_p128: {
10075     llvm::Type *Int128Ty = llvm::Type::getIntNTy(getLLVMContext(), 128);
10076     llvm::Type *Int128PTy = llvm::PointerType::get(Int128Ty, 0);
10077     Value *Ptr = Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), Int128PTy);
10078     return Builder.CreateAlignedLoad(Int128Ty, Ptr,
10079                                      CharUnits::fromQuantity(16));
10080   }
10081   case NEON::BI__builtin_neon_vstrq_p128: {
10082     llvm::Type *Int128PTy = llvm::Type::getIntNPtrTy(getLLVMContext(), 128);
10083     Value *Ptr = Builder.CreateBitCast(Ops[0], Int128PTy);
10084     return Builder.CreateDefaultAlignedStore(EmitScalarExpr(E->getArg(1)), Ptr);
10085   }
10086   case NEON::BI__builtin_neon_vcvts_f32_u32:
10087   case NEON::BI__builtin_neon_vcvtd_f64_u64:
10088     usgn = true;
10089     LLVM_FALLTHROUGH;
10090   case NEON::BI__builtin_neon_vcvts_f32_s32:
10091   case NEON::BI__builtin_neon_vcvtd_f64_s64: {
10092     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10093     bool Is64 = Ops[0]->getType()->getPrimitiveSizeInBits() == 64;
10094     llvm::Type *InTy = Is64 ? Int64Ty : Int32Ty;
10095     llvm::Type *FTy = Is64 ? DoubleTy : FloatTy;
10096     Ops[0] = Builder.CreateBitCast(Ops[0], InTy);
10097     if (usgn)
10098       return Builder.CreateUIToFP(Ops[0], FTy);
10099     return Builder.CreateSIToFP(Ops[0], FTy);
10100   }
10101   case NEON::BI__builtin_neon_vcvth_f16_u16:
10102   case NEON::BI__builtin_neon_vcvth_f16_u32:
10103   case NEON::BI__builtin_neon_vcvth_f16_u64:
10104     usgn = true;
10105     LLVM_FALLTHROUGH;
10106   case NEON::BI__builtin_neon_vcvth_f16_s16:
10107   case NEON::BI__builtin_neon_vcvth_f16_s32:
10108   case NEON::BI__builtin_neon_vcvth_f16_s64: {
10109     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10110     llvm::Type *FTy = HalfTy;
10111     llvm::Type *InTy;
10112     if (Ops[0]->getType()->getPrimitiveSizeInBits() == 64)
10113       InTy = Int64Ty;
10114     else if (Ops[0]->getType()->getPrimitiveSizeInBits() == 32)
10115       InTy = Int32Ty;
10116     else
10117       InTy = Int16Ty;
10118     Ops[0] = Builder.CreateBitCast(Ops[0], InTy);
10119     if (usgn)
10120       return Builder.CreateUIToFP(Ops[0], FTy);
10121     return Builder.CreateSIToFP(Ops[0], FTy);
10122   }
10123   case NEON::BI__builtin_neon_vcvtah_u16_f16:
10124   case NEON::BI__builtin_neon_vcvtmh_u16_f16:
10125   case NEON::BI__builtin_neon_vcvtnh_u16_f16:
10126   case NEON::BI__builtin_neon_vcvtph_u16_f16:
10127   case NEON::BI__builtin_neon_vcvth_u16_f16:
10128   case NEON::BI__builtin_neon_vcvtah_s16_f16:
10129   case NEON::BI__builtin_neon_vcvtmh_s16_f16:
10130   case NEON::BI__builtin_neon_vcvtnh_s16_f16:
10131   case NEON::BI__builtin_neon_vcvtph_s16_f16:
10132   case NEON::BI__builtin_neon_vcvth_s16_f16: {
10133     unsigned Int;
10134     llvm::Type* InTy = Int32Ty;
10135     llvm::Type* FTy  = HalfTy;
10136     llvm::Type *Tys[2] = {InTy, FTy};
10137     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10138     switch (BuiltinID) {
10139     default: llvm_unreachable("missing builtin ID in switch!");
10140     case NEON::BI__builtin_neon_vcvtah_u16_f16:
10141       Int = Intrinsic::aarch64_neon_fcvtau; break;
10142     case NEON::BI__builtin_neon_vcvtmh_u16_f16:
10143       Int = Intrinsic::aarch64_neon_fcvtmu; break;
10144     case NEON::BI__builtin_neon_vcvtnh_u16_f16:
10145       Int = Intrinsic::aarch64_neon_fcvtnu; break;
10146     case NEON::BI__builtin_neon_vcvtph_u16_f16:
10147       Int = Intrinsic::aarch64_neon_fcvtpu; break;
10148     case NEON::BI__builtin_neon_vcvth_u16_f16:
10149       Int = Intrinsic::aarch64_neon_fcvtzu; break;
10150     case NEON::BI__builtin_neon_vcvtah_s16_f16:
10151       Int = Intrinsic::aarch64_neon_fcvtas; break;
10152     case NEON::BI__builtin_neon_vcvtmh_s16_f16:
10153       Int = Intrinsic::aarch64_neon_fcvtms; break;
10154     case NEON::BI__builtin_neon_vcvtnh_s16_f16:
10155       Int = Intrinsic::aarch64_neon_fcvtns; break;
10156     case NEON::BI__builtin_neon_vcvtph_s16_f16:
10157       Int = Intrinsic::aarch64_neon_fcvtps; break;
10158     case NEON::BI__builtin_neon_vcvth_s16_f16:
10159       Int = Intrinsic::aarch64_neon_fcvtzs; break;
10160     }
10161     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvt");
10162     return Builder.CreateTrunc(Ops[0], Int16Ty);
10163   }
10164   case NEON::BI__builtin_neon_vcaleh_f16:
10165   case NEON::BI__builtin_neon_vcalth_f16:
10166   case NEON::BI__builtin_neon_vcageh_f16:
10167   case NEON::BI__builtin_neon_vcagth_f16: {
10168     unsigned Int;
10169     llvm::Type* InTy = Int32Ty;
10170     llvm::Type* FTy  = HalfTy;
10171     llvm::Type *Tys[2] = {InTy, FTy};
10172     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10173     switch (BuiltinID) {
10174     default: llvm_unreachable("missing builtin ID in switch!");
10175     case NEON::BI__builtin_neon_vcageh_f16:
10176       Int = Intrinsic::aarch64_neon_facge; break;
10177     case NEON::BI__builtin_neon_vcagth_f16:
10178       Int = Intrinsic::aarch64_neon_facgt; break;
10179     case NEON::BI__builtin_neon_vcaleh_f16:
10180       Int = Intrinsic::aarch64_neon_facge; std::swap(Ops[0], Ops[1]); break;
10181     case NEON::BI__builtin_neon_vcalth_f16:
10182       Int = Intrinsic::aarch64_neon_facgt; std::swap(Ops[0], Ops[1]); break;
10183     }
10184     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "facg");
10185     return Builder.CreateTrunc(Ops[0], Int16Ty);
10186   }
10187   case NEON::BI__builtin_neon_vcvth_n_s16_f16:
10188   case NEON::BI__builtin_neon_vcvth_n_u16_f16: {
10189     unsigned Int;
10190     llvm::Type* InTy = Int32Ty;
10191     llvm::Type* FTy  = HalfTy;
10192     llvm::Type *Tys[2] = {InTy, FTy};
10193     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10194     switch (BuiltinID) {
10195     default: llvm_unreachable("missing builtin ID in switch!");
10196     case NEON::BI__builtin_neon_vcvth_n_s16_f16:
10197       Int = Intrinsic::aarch64_neon_vcvtfp2fxs; break;
10198     case NEON::BI__builtin_neon_vcvth_n_u16_f16:
10199       Int = Intrinsic::aarch64_neon_vcvtfp2fxu; break;
10200     }
10201     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvth_n");
10202     return Builder.CreateTrunc(Ops[0], Int16Ty);
10203   }
10204   case NEON::BI__builtin_neon_vcvth_n_f16_s16:
10205   case NEON::BI__builtin_neon_vcvth_n_f16_u16: {
10206     unsigned Int;
10207     llvm::Type* FTy  = HalfTy;
10208     llvm::Type* InTy = Int32Ty;
10209     llvm::Type *Tys[2] = {FTy, InTy};
10210     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10211     switch (BuiltinID) {
10212     default: llvm_unreachable("missing builtin ID in switch!");
10213     case NEON::BI__builtin_neon_vcvth_n_f16_s16:
10214       Int = Intrinsic::aarch64_neon_vcvtfxs2fp;
10215       Ops[0] = Builder.CreateSExt(Ops[0], InTy, "sext");
10216       break;
10217     case NEON::BI__builtin_neon_vcvth_n_f16_u16:
10218       Int = Intrinsic::aarch64_neon_vcvtfxu2fp;
10219       Ops[0] = Builder.CreateZExt(Ops[0], InTy);
10220       break;
10221     }
10222     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvth_n");
10223   }
10224   case NEON::BI__builtin_neon_vpaddd_s64: {
10225     auto *Ty = llvm::FixedVectorType::get(Int64Ty, 2);
10226     Value *Vec = EmitScalarExpr(E->getArg(0));
    // The vector is v2i64, so make sure it's bitcast to that.
10228     Vec = Builder.CreateBitCast(Vec, Ty, "v2i64");
10229     llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
10230     llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
10231     Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
10232     Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
    // Pairwise addition of a v2i64 into a scalar i64.
10234     return Builder.CreateAdd(Op0, Op1, "vpaddd");
10235   }
10236   case NEON::BI__builtin_neon_vpaddd_f64: {
10237     auto *Ty = llvm::FixedVectorType::get(DoubleTy, 2);
10238     Value *Vec = EmitScalarExpr(E->getArg(0));
10239     // The vector is v2f64, so make sure it's bitcast to that.
10240     Vec = Builder.CreateBitCast(Vec, Ty, "v2f64");
10241     llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
10242     llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
10243     Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
10244     Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
10245     // Pairwise addition of a v2f64 into a scalar f64.
10246     return Builder.CreateFAdd(Op0, Op1, "vpaddd");
10247   }
10248   case NEON::BI__builtin_neon_vpadds_f32: {
10249     auto *Ty = llvm::FixedVectorType::get(FloatTy, 2);
10250     Value *Vec = EmitScalarExpr(E->getArg(0));
10251     // The vector is v2f32, so make sure it's bitcast to that.
10252     Vec = Builder.CreateBitCast(Vec, Ty, "v2f32");
10253     llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
10254     llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
10255     Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
10256     Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
10257     // Pairwise addition of a v2f32 into a scalar f32.
10258     return Builder.CreateFAdd(Op0, Op1, "vpaddd");
10259   }
10260   case NEON::BI__builtin_neon_vceqzd_s64:
10261   case NEON::BI__builtin_neon_vceqzd_f64:
10262   case NEON::BI__builtin_neon_vceqzs_f32:
10263   case NEON::BI__builtin_neon_vceqzh_f16:
10264     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10265     return EmitAArch64CompareBuiltinExpr(
10266         Ops[0], ConvertType(E->getCallReturnType(getContext())),
10267         ICmpInst::FCMP_OEQ, ICmpInst::ICMP_EQ, "vceqz");
10268   case NEON::BI__builtin_neon_vcgezd_s64:
10269   case NEON::BI__builtin_neon_vcgezd_f64:
10270   case NEON::BI__builtin_neon_vcgezs_f32:
10271   case NEON::BI__builtin_neon_vcgezh_f16:
10272     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10273     return EmitAArch64CompareBuiltinExpr(
10274         Ops[0], ConvertType(E->getCallReturnType(getContext())),
10275         ICmpInst::FCMP_OGE, ICmpInst::ICMP_SGE, "vcgez");
10276   case NEON::BI__builtin_neon_vclezd_s64:
10277   case NEON::BI__builtin_neon_vclezd_f64:
10278   case NEON::BI__builtin_neon_vclezs_f32:
10279   case NEON::BI__builtin_neon_vclezh_f16:
10280     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10281     return EmitAArch64CompareBuiltinExpr(
10282         Ops[0], ConvertType(E->getCallReturnType(getContext())),
10283         ICmpInst::FCMP_OLE, ICmpInst::ICMP_SLE, "vclez");
10284   case NEON::BI__builtin_neon_vcgtzd_s64:
10285   case NEON::BI__builtin_neon_vcgtzd_f64:
10286   case NEON::BI__builtin_neon_vcgtzs_f32:
10287   case NEON::BI__builtin_neon_vcgtzh_f16:
10288     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10289     return EmitAArch64CompareBuiltinExpr(
10290         Ops[0], ConvertType(E->getCallReturnType(getContext())),
10291         ICmpInst::FCMP_OGT, ICmpInst::ICMP_SGT, "vcgtz");
10292   case NEON::BI__builtin_neon_vcltzd_s64:
10293   case NEON::BI__builtin_neon_vcltzd_f64:
10294   case NEON::BI__builtin_neon_vcltzs_f32:
10295   case NEON::BI__builtin_neon_vcltzh_f16:
10296     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10297     return EmitAArch64CompareBuiltinExpr(
10298         Ops[0], ConvertType(E->getCallReturnType(getContext())),
10299         ICmpInst::FCMP_OLT, ICmpInst::ICMP_SLT, "vcltz");
10300 
10301   case NEON::BI__builtin_neon_vceqzd_u64: {
10302     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10303     Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
10304     Ops[0] =
10305         Builder.CreateICmpEQ(Ops[0], llvm::Constant::getNullValue(Int64Ty));
10306     return Builder.CreateSExt(Ops[0], Int64Ty, "vceqzd");
10307   }
10308   case NEON::BI__builtin_neon_vceqd_f64:
10309   case NEON::BI__builtin_neon_vcled_f64:
10310   case NEON::BI__builtin_neon_vcltd_f64:
10311   case NEON::BI__builtin_neon_vcged_f64:
10312   case NEON::BI__builtin_neon_vcgtd_f64: {
10313     llvm::CmpInst::Predicate P;
10314     switch (BuiltinID) {
10315     default: llvm_unreachable("missing builtin ID in switch!");
10316     case NEON::BI__builtin_neon_vceqd_f64: P = llvm::FCmpInst::FCMP_OEQ; break;
10317     case NEON::BI__builtin_neon_vcled_f64: P = llvm::FCmpInst::FCMP_OLE; break;
10318     case NEON::BI__builtin_neon_vcltd_f64: P = llvm::FCmpInst::FCMP_OLT; break;
10319     case NEON::BI__builtin_neon_vcged_f64: P = llvm::FCmpInst::FCMP_OGE; break;
10320     case NEON::BI__builtin_neon_vcgtd_f64: P = llvm::FCmpInst::FCMP_OGT; break;
10321     }
10322     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10323     Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
10324     Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy);
10325     if (P == llvm::FCmpInst::FCMP_OEQ)
10326       Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
10327     else
10328       Ops[0] = Builder.CreateFCmpS(P, Ops[0], Ops[1]);
10329     return Builder.CreateSExt(Ops[0], Int64Ty, "vcmpd");
10330   }
10331   case NEON::BI__builtin_neon_vceqs_f32:
10332   case NEON::BI__builtin_neon_vcles_f32:
10333   case NEON::BI__builtin_neon_vclts_f32:
10334   case NEON::BI__builtin_neon_vcges_f32:
10335   case NEON::BI__builtin_neon_vcgts_f32: {
10336     llvm::CmpInst::Predicate P;
10337     switch (BuiltinID) {
10338     default: llvm_unreachable("missing builtin ID in switch!");
10339     case NEON::BI__builtin_neon_vceqs_f32: P = llvm::FCmpInst::FCMP_OEQ; break;
10340     case NEON::BI__builtin_neon_vcles_f32: P = llvm::FCmpInst::FCMP_OLE; break;
10341     case NEON::BI__builtin_neon_vclts_f32: P = llvm::FCmpInst::FCMP_OLT; break;
10342     case NEON::BI__builtin_neon_vcges_f32: P = llvm::FCmpInst::FCMP_OGE; break;
10343     case NEON::BI__builtin_neon_vcgts_f32: P = llvm::FCmpInst::FCMP_OGT; break;
10344     }
10345     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10346     Ops[0] = Builder.CreateBitCast(Ops[0], FloatTy);
10347     Ops[1] = Builder.CreateBitCast(Ops[1], FloatTy);
10348     if (P == llvm::FCmpInst::FCMP_OEQ)
10349       Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
10350     else
10351       Ops[0] = Builder.CreateFCmpS(P, Ops[0], Ops[1]);
10352     return Builder.CreateSExt(Ops[0], Int32Ty, "vcmpd");
10353   }
10354   case NEON::BI__builtin_neon_vceqh_f16:
10355   case NEON::BI__builtin_neon_vcleh_f16:
10356   case NEON::BI__builtin_neon_vclth_f16:
10357   case NEON::BI__builtin_neon_vcgeh_f16:
10358   case NEON::BI__builtin_neon_vcgth_f16: {
10359     llvm::CmpInst::Predicate P;
10360     switch (BuiltinID) {
10361     default: llvm_unreachable("missing builtin ID in switch!");
10362     case NEON::BI__builtin_neon_vceqh_f16: P = llvm::FCmpInst::FCMP_OEQ; break;
10363     case NEON::BI__builtin_neon_vcleh_f16: P = llvm::FCmpInst::FCMP_OLE; break;
10364     case NEON::BI__builtin_neon_vclth_f16: P = llvm::FCmpInst::FCMP_OLT; break;
10365     case NEON::BI__builtin_neon_vcgeh_f16: P = llvm::FCmpInst::FCMP_OGE; break;
10366     case NEON::BI__builtin_neon_vcgth_f16: P = llvm::FCmpInst::FCMP_OGT; break;
10367     }
10368     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10369     Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
10370     Ops[1] = Builder.CreateBitCast(Ops[1], HalfTy);
10371     if (P == llvm::FCmpInst::FCMP_OEQ)
10372       Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
10373     else
10374       Ops[0] = Builder.CreateFCmpS(P, Ops[0], Ops[1]);
10375     return Builder.CreateSExt(Ops[0], Int16Ty, "vcmpd");
10376   }
10377   case NEON::BI__builtin_neon_vceqd_s64:
10378   case NEON::BI__builtin_neon_vceqd_u64:
10379   case NEON::BI__builtin_neon_vcgtd_s64:
10380   case NEON::BI__builtin_neon_vcgtd_u64:
10381   case NEON::BI__builtin_neon_vcltd_s64:
10382   case NEON::BI__builtin_neon_vcltd_u64:
10383   case NEON::BI__builtin_neon_vcged_u64:
10384   case NEON::BI__builtin_neon_vcged_s64:
10385   case NEON::BI__builtin_neon_vcled_u64:
10386   case NEON::BI__builtin_neon_vcled_s64: {
10387     llvm::CmpInst::Predicate P;
10388     switch (BuiltinID) {
10389     default: llvm_unreachable("missing builtin ID in switch!");
10390     case NEON::BI__builtin_neon_vceqd_s64:
10391     case NEON::BI__builtin_neon_vceqd_u64:P = llvm::ICmpInst::ICMP_EQ;break;
10392     case NEON::BI__builtin_neon_vcgtd_s64:P = llvm::ICmpInst::ICMP_SGT;break;
10393     case NEON::BI__builtin_neon_vcgtd_u64:P = llvm::ICmpInst::ICMP_UGT;break;
10394     case NEON::BI__builtin_neon_vcltd_s64:P = llvm::ICmpInst::ICMP_SLT;break;
10395     case NEON::BI__builtin_neon_vcltd_u64:P = llvm::ICmpInst::ICMP_ULT;break;
10396     case NEON::BI__builtin_neon_vcged_u64:P = llvm::ICmpInst::ICMP_UGE;break;
10397     case NEON::BI__builtin_neon_vcged_s64:P = llvm::ICmpInst::ICMP_SGE;break;
10398     case NEON::BI__builtin_neon_vcled_u64:P = llvm::ICmpInst::ICMP_ULE;break;
10399     case NEON::BI__builtin_neon_vcled_s64:P = llvm::ICmpInst::ICMP_SLE;break;
10400     }
10401     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10402     Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
10403     Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
10404     Ops[0] = Builder.CreateICmp(P, Ops[0], Ops[1]);
10405     return Builder.CreateSExt(Ops[0], Int64Ty, "vceqd");
10406   }
10407   case NEON::BI__builtin_neon_vtstd_s64:
10408   case NEON::BI__builtin_neon_vtstd_u64: {
10409     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10410     Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
10411     Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
10412     Ops[0] = Builder.CreateAnd(Ops[0], Ops[1]);
10413     Ops[0] = Builder.CreateICmp(ICmpInst::ICMP_NE, Ops[0],
10414                                 llvm::Constant::getNullValue(Int64Ty));
10415     return Builder.CreateSExt(Ops[0], Int64Ty, "vtstd");
10416   }
10417   case NEON::BI__builtin_neon_vset_lane_i8:
10418   case NEON::BI__builtin_neon_vset_lane_i16:
10419   case NEON::BI__builtin_neon_vset_lane_i32:
10420   case NEON::BI__builtin_neon_vset_lane_i64:
10421   case NEON::BI__builtin_neon_vset_lane_bf16:
10422   case NEON::BI__builtin_neon_vset_lane_f32:
10423   case NEON::BI__builtin_neon_vsetq_lane_i8:
10424   case NEON::BI__builtin_neon_vsetq_lane_i16:
10425   case NEON::BI__builtin_neon_vsetq_lane_i32:
10426   case NEON::BI__builtin_neon_vsetq_lane_i64:
10427   case NEON::BI__builtin_neon_vsetq_lane_bf16:
10428   case NEON::BI__builtin_neon_vsetq_lane_f32:
10429     Ops.push_back(EmitScalarExpr(E->getArg(2)));
10430     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
10431   case NEON::BI__builtin_neon_vset_lane_f64:
10432     // The vector type needs a cast for the v1f64 variant.
10433     Ops[1] =
10434         Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(DoubleTy, 1));
10435     Ops.push_back(EmitScalarExpr(E->getArg(2)));
10436     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
10437   case NEON::BI__builtin_neon_vsetq_lane_f64:
10438     // The vector type needs a cast for the v2f64 variant.
10439     Ops[1] =
10440         Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(DoubleTy, 2));
10441     Ops.push_back(EmitScalarExpr(E->getArg(2)));
10442     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
10443 
10444   case NEON::BI__builtin_neon_vget_lane_i8:
10445   case NEON::BI__builtin_neon_vdupb_lane_i8:
10446     Ops[0] =
10447         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int8Ty, 8));
10448     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
10449                                         "vget_lane");
10450   case NEON::BI__builtin_neon_vgetq_lane_i8:
10451   case NEON::BI__builtin_neon_vdupb_laneq_i8:
10452     Ops[0] =
10453         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int8Ty, 16));
10454     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
10455                                         "vgetq_lane");
10456   case NEON::BI__builtin_neon_vget_lane_i16:
10457   case NEON::BI__builtin_neon_vduph_lane_i16:
10458     Ops[0] =
10459         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int16Ty, 4));
10460     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
10461                                         "vget_lane");
10462   case NEON::BI__builtin_neon_vgetq_lane_i16:
10463   case NEON::BI__builtin_neon_vduph_laneq_i16:
10464     Ops[0] =
10465         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int16Ty, 8));
10466     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
10467                                         "vgetq_lane");
10468   case NEON::BI__builtin_neon_vget_lane_i32:
10469   case NEON::BI__builtin_neon_vdups_lane_i32:
10470     Ops[0] =
10471         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int32Ty, 2));
10472     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
10473                                         "vget_lane");
10474   case NEON::BI__builtin_neon_vdups_lane_f32:
10475     Ops[0] =
10476         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(FloatTy, 2));
10477     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
10478                                         "vdups_lane");
10479   case NEON::BI__builtin_neon_vgetq_lane_i32:
10480   case NEON::BI__builtin_neon_vdups_laneq_i32:
10481     Ops[0] =
10482         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int32Ty, 4));
10483     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
10484                                         "vgetq_lane");
10485   case NEON::BI__builtin_neon_vget_lane_i64:
10486   case NEON::BI__builtin_neon_vdupd_lane_i64:
10487     Ops[0] =
10488         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int64Ty, 1));
10489     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
10490                                         "vget_lane");
10491   case NEON::BI__builtin_neon_vdupd_lane_f64:
10492     Ops[0] =
10493         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(DoubleTy, 1));
10494     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
10495                                         "vdupd_lane");
10496   case NEON::BI__builtin_neon_vgetq_lane_i64:
10497   case NEON::BI__builtin_neon_vdupd_laneq_i64:
10498     Ops[0] =
10499         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int64Ty, 2));
10500     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
10501                                         "vgetq_lane");
10502   case NEON::BI__builtin_neon_vget_lane_f32:
10503     Ops[0] =
10504         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(FloatTy, 2));
10505     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
10506                                         "vget_lane");
10507   case NEON::BI__builtin_neon_vget_lane_f64:
10508     Ops[0] =
10509         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(DoubleTy, 1));
10510     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
10511                                         "vget_lane");
10512   case NEON::BI__builtin_neon_vgetq_lane_f32:
10513   case NEON::BI__builtin_neon_vdups_laneq_f32:
10514     Ops[0] =
10515         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(FloatTy, 4));
10516     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
10517                                         "vgetq_lane");
10518   case NEON::BI__builtin_neon_vgetq_lane_f64:
10519   case NEON::BI__builtin_neon_vdupd_laneq_f64:
10520     Ops[0] =
10521         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(DoubleTy, 2));
10522     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
10523                                         "vgetq_lane");
10524   case NEON::BI__builtin_neon_vaddh_f16:
10525     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10526     return Builder.CreateFAdd(Ops[0], Ops[1], "vaddh");
10527   case NEON::BI__builtin_neon_vsubh_f16:
10528     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10529     return Builder.CreateFSub(Ops[0], Ops[1], "vsubh");
10530   case NEON::BI__builtin_neon_vmulh_f16:
10531     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10532     return Builder.CreateFMul(Ops[0], Ops[1], "vmulh");
10533   case NEON::BI__builtin_neon_vdivh_f16:
10534     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10535     return Builder.CreateFDiv(Ops[0], Ops[1], "vdivh");
10536   case NEON::BI__builtin_neon_vfmah_f16:
10537     // NEON intrinsic puts accumulator first, unlike the LLVM fma.
10538     return emitCallMaybeConstrainedFPBuiltin(
10539         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, HalfTy,
10540         {EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)), Ops[0]});
10541   case NEON::BI__builtin_neon_vfmsh_f16: {
10542     // FIXME: This should be an fneg instruction:
10543     Value *Zero = llvm::ConstantFP::getZeroValueForNegation(HalfTy);
10544     Value* Sub = Builder.CreateFSub(Zero, EmitScalarExpr(E->getArg(1)), "vsubh");
10545 
10546     // NEON intrinsic puts accumulator first, unlike the LLVM fma.
10547     return emitCallMaybeConstrainedFPBuiltin(
10548         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, HalfTy,
10549         {Sub, EmitScalarExpr(E->getArg(2)), Ops[0]});
10550   }
10551   case NEON::BI__builtin_neon_vaddd_s64:
10552   case NEON::BI__builtin_neon_vaddd_u64:
10553     return Builder.CreateAdd(Ops[0], EmitScalarExpr(E->getArg(1)), "vaddd");
10554   case NEON::BI__builtin_neon_vsubd_s64:
10555   case NEON::BI__builtin_neon_vsubd_u64:
10556     return Builder.CreateSub(Ops[0], EmitScalarExpr(E->getArg(1)), "vsubd");
10557   case NEON::BI__builtin_neon_vqdmlalh_s16:
10558   case NEON::BI__builtin_neon_vqdmlslh_s16: {
10559     SmallVector<Value *, 2> ProductOps;
10560     ProductOps.push_back(vectorWrapScalar16(Ops[1]));
10561     ProductOps.push_back(vectorWrapScalar16(EmitScalarExpr(E->getArg(2))));
10562     auto *VTy = llvm::FixedVectorType::get(Int32Ty, 4);
10563     Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmull, VTy),
10564                           ProductOps, "vqdmlXl");
10565     Constant *CI = ConstantInt::get(SizeTy, 0);
10566     Ops[1] = Builder.CreateExtractElement(Ops[1], CI, "lane0");
10567 
10568     unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlalh_s16
10569                                         ? Intrinsic::aarch64_neon_sqadd
10570                                         : Intrinsic::aarch64_neon_sqsub;
10571     return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int32Ty), Ops, "vqdmlXl");
10572   }
10573   case NEON::BI__builtin_neon_vqshlud_n_s64: {
10574     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10575     Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty);
10576     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqshlu, Int64Ty),
10577                         Ops, "vqshlu_n");
10578   }
10579   case NEON::BI__builtin_neon_vqshld_n_u64:
10580   case NEON::BI__builtin_neon_vqshld_n_s64: {
10581     unsigned Int = BuiltinID == NEON::BI__builtin_neon_vqshld_n_u64
10582                                    ? Intrinsic::aarch64_neon_uqshl
10583                                    : Intrinsic::aarch64_neon_sqshl;
10584     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10585     Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty);
10586     return EmitNeonCall(CGM.getIntrinsic(Int, Int64Ty), Ops, "vqshl_n");
10587   }
10588   case NEON::BI__builtin_neon_vrshrd_n_u64:
10589   case NEON::BI__builtin_neon_vrshrd_n_s64: {
10590     unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrshrd_n_u64
10591                                    ? Intrinsic::aarch64_neon_urshl
10592                                    : Intrinsic::aarch64_neon_srshl;
10593     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10594     int SV = cast<ConstantInt>(Ops[1])->getSExtValue();
10595     Ops[1] = ConstantInt::get(Int64Ty, -SV);
10596     return EmitNeonCall(CGM.getIntrinsic(Int, Int64Ty), Ops, "vrshr_n");
10597   }
10598   case NEON::BI__builtin_neon_vrsrad_n_u64:
10599   case NEON::BI__builtin_neon_vrsrad_n_s64: {
10600     unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrsrad_n_u64
10601                                    ? Intrinsic::aarch64_neon_urshl
10602                                    : Intrinsic::aarch64_neon_srshl;
10603     Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
10604     Ops.push_back(Builder.CreateNeg(EmitScalarExpr(E->getArg(2))));
10605     Ops[1] = Builder.CreateCall(CGM.getIntrinsic(Int, Int64Ty),
10606                                 {Ops[1], Builder.CreateSExt(Ops[2], Int64Ty)});
10607     return Builder.CreateAdd(Ops[0], Builder.CreateBitCast(Ops[1], Int64Ty));
10608   }
10609   case NEON::BI__builtin_neon_vshld_n_s64:
10610   case NEON::BI__builtin_neon_vshld_n_u64: {
10611     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
10612     return Builder.CreateShl(
10613         Ops[0], ConstantInt::get(Int64Ty, Amt->getZExtValue()), "shld_n");
10614   }
10615   case NEON::BI__builtin_neon_vshrd_n_s64: {
10616     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
10617     return Builder.CreateAShr(
10618         Ops[0], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
10619                                                    Amt->getZExtValue())),
10620         "shrd_n");
10621   }
10622   case NEON::BI__builtin_neon_vshrd_n_u64: {
10623     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
10624     uint64_t ShiftAmt = Amt->getZExtValue();
10625     // Right-shifting an unsigned value by its size yields 0.
10626     if (ShiftAmt == 64)
10627       return ConstantInt::get(Int64Ty, 0);
10628     return Builder.CreateLShr(Ops[0], ConstantInt::get(Int64Ty, ShiftAmt),
10629                               "shrd_n");
10630   }
10631   case NEON::BI__builtin_neon_vsrad_n_s64: {
10632     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(2)));
10633     Ops[1] = Builder.CreateAShr(
10634         Ops[1], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
10635                                                    Amt->getZExtValue())),
10636         "shrd_n");
10637     return Builder.CreateAdd(Ops[0], Ops[1]);
10638   }
10639   case NEON::BI__builtin_neon_vsrad_n_u64: {
10640     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(2)));
10641     uint64_t ShiftAmt = Amt->getZExtValue();
10642     // Right-shifting an unsigned value by its size yields 0.
10643     // As Op + 0 = Op, return Ops[0] directly.
10644     if (ShiftAmt == 64)
10645       return Ops[0];
10646     Ops[1] = Builder.CreateLShr(Ops[1], ConstantInt::get(Int64Ty, ShiftAmt),
10647                                 "shrd_n");
10648     return Builder.CreateAdd(Ops[0], Ops[1]);
10649   }
10650   case NEON::BI__builtin_neon_vqdmlalh_lane_s16:
10651   case NEON::BI__builtin_neon_vqdmlalh_laneq_s16:
10652   case NEON::BI__builtin_neon_vqdmlslh_lane_s16:
10653   case NEON::BI__builtin_neon_vqdmlslh_laneq_s16: {
10654     Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)),
10655                                           "lane");
10656     SmallVector<Value *, 2> ProductOps;
10657     ProductOps.push_back(vectorWrapScalar16(Ops[1]));
10658     ProductOps.push_back(vectorWrapScalar16(Ops[2]));
10659     auto *VTy = llvm::FixedVectorType::get(Int32Ty, 4);
10660     Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmull, VTy),
10661                           ProductOps, "vqdmlXl");
10662     Constant *CI = ConstantInt::get(SizeTy, 0);
10663     Ops[1] = Builder.CreateExtractElement(Ops[1], CI, "lane0");
10664     Ops.pop_back();
10665 
10666     unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlalh_lane_s16 ||
10667                        BuiltinID == NEON::BI__builtin_neon_vqdmlalh_laneq_s16)
10668                           ? Intrinsic::aarch64_neon_sqadd
10669                           : Intrinsic::aarch64_neon_sqsub;
10670     return EmitNeonCall(CGM.getIntrinsic(AccInt, Int32Ty), Ops, "vqdmlXl");
10671   }
10672   case NEON::BI__builtin_neon_vqdmlals_s32:
10673   case NEON::BI__builtin_neon_vqdmlsls_s32: {
10674     SmallVector<Value *, 2> ProductOps;
10675     ProductOps.push_back(Ops[1]);
10676     ProductOps.push_back(EmitScalarExpr(E->getArg(2)));
10677     Ops[1] =
10678         EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmulls_scalar),
10679                      ProductOps, "vqdmlXl");
10680 
10681     unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlals_s32
10682                                         ? Intrinsic::aarch64_neon_sqadd
10683                                         : Intrinsic::aarch64_neon_sqsub;
10684     return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int64Ty), Ops, "vqdmlXl");
10685   }
10686   case NEON::BI__builtin_neon_vqdmlals_lane_s32:
10687   case NEON::BI__builtin_neon_vqdmlals_laneq_s32:
10688   case NEON::BI__builtin_neon_vqdmlsls_lane_s32:
10689   case NEON::BI__builtin_neon_vqdmlsls_laneq_s32: {
10690     Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)),
10691                                           "lane");
10692     SmallVector<Value *, 2> ProductOps;
10693     ProductOps.push_back(Ops[1]);
10694     ProductOps.push_back(Ops[2]);
10695     Ops[1] =
10696         EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmulls_scalar),
10697                      ProductOps, "vqdmlXl");
10698     Ops.pop_back();
10699 
10700     unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlals_lane_s32 ||
10701                        BuiltinID == NEON::BI__builtin_neon_vqdmlals_laneq_s32)
10702                           ? Intrinsic::aarch64_neon_sqadd
10703                           : Intrinsic::aarch64_neon_sqsub;
10704     return EmitNeonCall(CGM.getIntrinsic(AccInt, Int64Ty), Ops, "vqdmlXl");
10705   }
10706   case NEON::BI__builtin_neon_vget_lane_bf16:
10707   case NEON::BI__builtin_neon_vduph_lane_bf16:
10708   case NEON::BI__builtin_neon_vduph_lane_f16: {
10709     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
10710                                         "vget_lane");
10711   }
10712   case NEON::BI__builtin_neon_vgetq_lane_bf16:
10713   case NEON::BI__builtin_neon_vduph_laneq_bf16:
10714   case NEON::BI__builtin_neon_vduph_laneq_f16: {
10715     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
10716                                         "vgetq_lane");
10717   }
10718 
10719   case AArch64::BI_InterlockedAdd: {
10720     Value *Arg0 = EmitScalarExpr(E->getArg(0));
10721     Value *Arg1 = EmitScalarExpr(E->getArg(1));
10722     AtomicRMWInst *RMWI = Builder.CreateAtomicRMW(
10723       AtomicRMWInst::Add, Arg0, Arg1,
10724       llvm::AtomicOrdering::SequentiallyConsistent);
10725     return Builder.CreateAdd(RMWI, Arg1);
10726   }
10727   }
10728 
10729   llvm::FixedVectorType *VTy = GetNeonType(this, Type);
10730   llvm::Type *Ty = VTy;
10731   if (!Ty)
10732     return nullptr;
10733 
10734   // Not all intrinsics handled by the common case work for AArch64 yet, so only
10735   // defer to common code if it's been added to our special map.
10736   Builtin = findARMVectorIntrinsicInMap(AArch64SIMDIntrinsicMap, BuiltinID,
10737                                         AArch64SIMDIntrinsicsProvenSorted);
10738 
10739   if (Builtin)
10740     return EmitCommonNeonBuiltinExpr(
10741         Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
10742         Builtin->NameHint, Builtin->TypeModifier, E, Ops,
10743         /*never use addresses*/ Address::invalid(), Address::invalid(), Arch);
10744 
10745   if (Value *V = EmitAArch64TblBuiltinExpr(*this, BuiltinID, E, Ops, Arch))
10746     return V;
10747 
10748   unsigned Int;
10749   switch (BuiltinID) {
10750   default: return nullptr;
10751   case NEON::BI__builtin_neon_vbsl_v:
10752   case NEON::BI__builtin_neon_vbslq_v: {
10753     llvm::Type *BitTy = llvm::VectorType::getInteger(VTy);
10754     Ops[0] = Builder.CreateBitCast(Ops[0], BitTy, "vbsl");
10755     Ops[1] = Builder.CreateBitCast(Ops[1], BitTy, "vbsl");
10756     Ops[2] = Builder.CreateBitCast(Ops[2], BitTy, "vbsl");
10757 
10758     Ops[1] = Builder.CreateAnd(Ops[0], Ops[1], "vbsl");
10759     Ops[2] = Builder.CreateAnd(Builder.CreateNot(Ops[0]), Ops[2], "vbsl");
10760     Ops[0] = Builder.CreateOr(Ops[1], Ops[2], "vbsl");
10761     return Builder.CreateBitCast(Ops[0], Ty);
10762   }
10763   case NEON::BI__builtin_neon_vfma_lane_v:
10764   case NEON::BI__builtin_neon_vfmaq_lane_v: { // Only used for FP types
10765     // The ARM builtins (and instructions) have the addend as the first
10766     // operand, but the 'fma' intrinsics have it last. Swap it around here.
10767     Value *Addend = Ops[0];
10768     Value *Multiplicand = Ops[1];
10769     Value *LaneSource = Ops[2];
10770     Ops[0] = Multiplicand;
10771     Ops[1] = LaneSource;
10772     Ops[2] = Addend;
10773 
10774     // Now adjust things to handle the lane access.
10775     auto *SourceTy = BuiltinID == NEON::BI__builtin_neon_vfmaq_lane_v
10776                          ? llvm::FixedVectorType::get(VTy->getElementType(),
10777                                                       VTy->getNumElements() / 2)
10778                          : VTy;
10779     llvm::Constant *cst = cast<Constant>(Ops[3]);
10780     Value *SV = llvm::ConstantVector::getSplat(VTy->getElementCount(), cst);
10781     Ops[1] = Builder.CreateBitCast(Ops[1], SourceTy);
10782     Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV, "lane");
10783 
10784     Ops.pop_back();
10785     Int = Builder.getIsFPConstrained() ? Intrinsic::experimental_constrained_fma
10786                                        : Intrinsic::fma;
10787     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fmla");
10788   }
10789   case NEON::BI__builtin_neon_vfma_laneq_v: {
10790     auto *VTy = cast<llvm::FixedVectorType>(Ty);
10791     // v1f64 fma should be mapped to Neon scalar f64 fma
10792     if (VTy && VTy->getElementType() == DoubleTy) {
10793       Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
10794       Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy);
10795       llvm::FixedVectorType *VTy =
10796           GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, true));
10797       Ops[2] = Builder.CreateBitCast(Ops[2], VTy);
10798       Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
10799       Value *Result;
10800       Result = emitCallMaybeConstrainedFPBuiltin(
10801           *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma,
10802           DoubleTy, {Ops[1], Ops[2], Ops[0]});
10803       return Builder.CreateBitCast(Result, Ty);
10804     }
10805     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
10806     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
10807 
10808     auto *STy = llvm::FixedVectorType::get(VTy->getElementType(),
10809                                            VTy->getNumElements() * 2);
10810     Ops[2] = Builder.CreateBitCast(Ops[2], STy);
10811     Value *SV = llvm::ConstantVector::getSplat(VTy->getElementCount(),
10812                                                cast<ConstantInt>(Ops[3]));
10813     Ops[2] = Builder.CreateShuffleVector(Ops[2], Ops[2], SV, "lane");
10814 
10815     return emitCallMaybeConstrainedFPBuiltin(
10816         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
10817         {Ops[2], Ops[1], Ops[0]});
10818   }
10819   case NEON::BI__builtin_neon_vfmaq_laneq_v: {
10820     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
10821     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
10822 
10823     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
10824     Ops[2] = EmitNeonSplat(Ops[2], cast<ConstantInt>(Ops[3]));
10825     return emitCallMaybeConstrainedFPBuiltin(
10826         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
10827         {Ops[2], Ops[1], Ops[0]});
10828   }
10829   case NEON::BI__builtin_neon_vfmah_lane_f16:
10830   case NEON::BI__builtin_neon_vfmas_lane_f32:
10831   case NEON::BI__builtin_neon_vfmah_laneq_f16:
10832   case NEON::BI__builtin_neon_vfmas_laneq_f32:
10833   case NEON::BI__builtin_neon_vfmad_lane_f64:
10834   case NEON::BI__builtin_neon_vfmad_laneq_f64: {
10835     Ops.push_back(EmitScalarExpr(E->getArg(3)));
10836     llvm::Type *Ty = ConvertType(E->getCallReturnType(getContext()));
10837     Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
10838     return emitCallMaybeConstrainedFPBuiltin(
10839         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
10840         {Ops[1], Ops[2], Ops[0]});
10841   }
10842   case NEON::BI__builtin_neon_vmull_v:
10843     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
10844     Int = usgn ? Intrinsic::aarch64_neon_umull : Intrinsic::aarch64_neon_smull;
10845     if (Type.isPoly()) Int = Intrinsic::aarch64_neon_pmull;
10846     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull");
10847   case NEON::BI__builtin_neon_vmax_v:
10848   case NEON::BI__builtin_neon_vmaxq_v:
10849     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
10850     Int = usgn ? Intrinsic::aarch64_neon_umax : Intrinsic::aarch64_neon_smax;
10851     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmax;
10852     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmax");
10853   case NEON::BI__builtin_neon_vmaxh_f16: {
10854     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10855     Int = Intrinsic::aarch64_neon_fmax;
10856     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmax");
10857   }
10858   case NEON::BI__builtin_neon_vmin_v:
10859   case NEON::BI__builtin_neon_vminq_v:
10860     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
10861     Int = usgn ? Intrinsic::aarch64_neon_umin : Intrinsic::aarch64_neon_smin;
10862     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmin;
10863     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmin");
10864   case NEON::BI__builtin_neon_vminh_f16: {
10865     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10866     Int = Intrinsic::aarch64_neon_fmin;
10867     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmin");
10868   }
10869   case NEON::BI__builtin_neon_vabd_v:
10870   case NEON::BI__builtin_neon_vabdq_v:
10871     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
10872     Int = usgn ? Intrinsic::aarch64_neon_uabd : Intrinsic::aarch64_neon_sabd;
10873     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fabd;
10874     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vabd");
10875   case NEON::BI__builtin_neon_vpadal_v:
10876   case NEON::BI__builtin_neon_vpadalq_v: {
10877     unsigned ArgElts = VTy->getNumElements();
10878     llvm::IntegerType *EltTy = cast<IntegerType>(VTy->getElementType());
10879     unsigned BitWidth = EltTy->getBitWidth();
10880     auto *ArgTy = llvm::FixedVectorType::get(
10881         llvm::IntegerType::get(getLLVMContext(), BitWidth / 2), 2 * ArgElts);
10882     llvm::Type* Tys[2] = { VTy, ArgTy };
10883     Int = usgn ? Intrinsic::aarch64_neon_uaddlp : Intrinsic::aarch64_neon_saddlp;
10884     SmallVector<llvm::Value*, 1> TmpOps;
10885     TmpOps.push_back(Ops[1]);
10886     Function *F = CGM.getIntrinsic(Int, Tys);
10887     llvm::Value *tmp = EmitNeonCall(F, TmpOps, "vpadal");
10888     llvm::Value *addend = Builder.CreateBitCast(Ops[0], tmp->getType());
10889     return Builder.CreateAdd(tmp, addend);
10890   }
10891   case NEON::BI__builtin_neon_vpmin_v:
10892   case NEON::BI__builtin_neon_vpminq_v:
10893     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
10894     Int = usgn ? Intrinsic::aarch64_neon_uminp : Intrinsic::aarch64_neon_sminp;
10895     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fminp;
10896     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmin");
10897   case NEON::BI__builtin_neon_vpmax_v:
10898   case NEON::BI__builtin_neon_vpmaxq_v:
10899     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
10900     Int = usgn ? Intrinsic::aarch64_neon_umaxp : Intrinsic::aarch64_neon_smaxp;
10901     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmaxp;
10902     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmax");
10903   case NEON::BI__builtin_neon_vminnm_v:
10904   case NEON::BI__builtin_neon_vminnmq_v:
10905     Int = Intrinsic::aarch64_neon_fminnm;
10906     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vminnm");
10907   case NEON::BI__builtin_neon_vminnmh_f16:
10908     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10909     Int = Intrinsic::aarch64_neon_fminnm;
10910     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vminnm");
10911   case NEON::BI__builtin_neon_vmaxnm_v:
10912   case NEON::BI__builtin_neon_vmaxnmq_v:
10913     Int = Intrinsic::aarch64_neon_fmaxnm;
10914     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmaxnm");
10915   case NEON::BI__builtin_neon_vmaxnmh_f16:
10916     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10917     Int = Intrinsic::aarch64_neon_fmaxnm;
10918     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmaxnm");
10919   case NEON::BI__builtin_neon_vrecpss_f32: {
10920     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10921     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, FloatTy),
10922                         Ops, "vrecps");
10923   }
10924   case NEON::BI__builtin_neon_vrecpsd_f64:
10925     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10926     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, DoubleTy),
10927                         Ops, "vrecps");
10928   case NEON::BI__builtin_neon_vrecpsh_f16:
10929     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10930     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, HalfTy),
10931                         Ops, "vrecps");
10932   case NEON::BI__builtin_neon_vqshrun_n_v:
10933     Int = Intrinsic::aarch64_neon_sqshrun;
10934     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrun_n");
10935   case NEON::BI__builtin_neon_vqrshrun_n_v:
10936     Int = Intrinsic::aarch64_neon_sqrshrun;
10937     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrun_n");
10938   case NEON::BI__builtin_neon_vqshrn_n_v:
10939     Int = usgn ? Intrinsic::aarch64_neon_uqshrn : Intrinsic::aarch64_neon_sqshrn;
10940     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n");
10941   case NEON::BI__builtin_neon_vrshrn_n_v:
10942     Int = Intrinsic::aarch64_neon_rshrn;
10943     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshrn_n");
10944   case NEON::BI__builtin_neon_vqrshrn_n_v:
10945     Int = usgn ? Intrinsic::aarch64_neon_uqrshrn : Intrinsic::aarch64_neon_sqrshrn;
10946     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n");
10947   case NEON::BI__builtin_neon_vrndah_f16: {
10948     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10949     Int = Builder.getIsFPConstrained()
10950               ? Intrinsic::experimental_constrained_round
10951               : Intrinsic::round;
10952     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrnda");
10953   }
10954   case NEON::BI__builtin_neon_vrnda_v:
10955   case NEON::BI__builtin_neon_vrndaq_v: {
10956     Int = Builder.getIsFPConstrained()
10957               ? Intrinsic::experimental_constrained_round
10958               : Intrinsic::round;
10959     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnda");
10960   }
10961   case NEON::BI__builtin_neon_vrndih_f16: {
10962     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10963     Int = Builder.getIsFPConstrained()
10964               ? Intrinsic::experimental_constrained_nearbyint
10965               : Intrinsic::nearbyint;
10966     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndi");
10967   }
10968   case NEON::BI__builtin_neon_vrndmh_f16: {
10969     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10970     Int = Builder.getIsFPConstrained()
10971               ? Intrinsic::experimental_constrained_floor
10972               : Intrinsic::floor;
10973     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndm");
10974   }
10975   case NEON::BI__builtin_neon_vrndm_v:
10976   case NEON::BI__builtin_neon_vrndmq_v: {
10977     Int = Builder.getIsFPConstrained()
10978               ? Intrinsic::experimental_constrained_floor
10979               : Intrinsic::floor;
10980     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndm");
10981   }
10982   case NEON::BI__builtin_neon_vrndnh_f16: {
10983     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10984     Int = Builder.getIsFPConstrained()
10985               ? Intrinsic::experimental_constrained_roundeven
10986               : Intrinsic::roundeven;
10987     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndn");
10988   }
10989   case NEON::BI__builtin_neon_vrndn_v:
10990   case NEON::BI__builtin_neon_vrndnq_v: {
10991     Int = Builder.getIsFPConstrained()
10992               ? Intrinsic::experimental_constrained_roundeven
10993               : Intrinsic::roundeven;
10994     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndn");
10995   }
10996   case NEON::BI__builtin_neon_vrndns_f32: {
10997     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10998     Int = Builder.getIsFPConstrained()
10999               ? Intrinsic::experimental_constrained_roundeven
11000               : Intrinsic::roundeven;
11001     return EmitNeonCall(CGM.getIntrinsic(Int, FloatTy), Ops, "vrndn");
11002   }
11003   case NEON::BI__builtin_neon_vrndph_f16: {
11004     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11005     Int = Builder.getIsFPConstrained()
11006               ? Intrinsic::experimental_constrained_ceil
11007               : Intrinsic::ceil;
11008     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndp");
11009   }
11010   case NEON::BI__builtin_neon_vrndp_v:
11011   case NEON::BI__builtin_neon_vrndpq_v: {
11012     Int = Builder.getIsFPConstrained()
11013               ? Intrinsic::experimental_constrained_ceil
11014               : Intrinsic::ceil;
11015     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndp");
11016   }
11017   case NEON::BI__builtin_neon_vrndxh_f16: {
11018     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11019     Int = Builder.getIsFPConstrained()
11020               ? Intrinsic::experimental_constrained_rint
11021               : Intrinsic::rint;
11022     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndx");
11023   }
11024   case NEON::BI__builtin_neon_vrndx_v:
11025   case NEON::BI__builtin_neon_vrndxq_v: {
11026     Int = Builder.getIsFPConstrained()
11027               ? Intrinsic::experimental_constrained_rint
11028               : Intrinsic::rint;
11029     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndx");
11030   }
11031   case NEON::BI__builtin_neon_vrndh_f16: {
11032     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11033     Int = Builder.getIsFPConstrained()
11034               ? Intrinsic::experimental_constrained_trunc
11035               : Intrinsic::trunc;
11036     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndz");
11037   }
11038   case NEON::BI__builtin_neon_vrnd32x_v:
11039   case NEON::BI__builtin_neon_vrnd32xq_v: {
11040     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11041     Int = Intrinsic::aarch64_neon_frint32x;
11042     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd32x");
11043   }
11044   case NEON::BI__builtin_neon_vrnd32z_v:
11045   case NEON::BI__builtin_neon_vrnd32zq_v: {
11046     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11047     Int = Intrinsic::aarch64_neon_frint32z;
11048     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd32z");
11049   }
11050   case NEON::BI__builtin_neon_vrnd64x_v:
11051   case NEON::BI__builtin_neon_vrnd64xq_v: {
11052     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11053     Int = Intrinsic::aarch64_neon_frint64x;
11054     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd64x");
11055   }
11056   case NEON::BI__builtin_neon_vrnd64z_v:
11057   case NEON::BI__builtin_neon_vrnd64zq_v: {
11058     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11059     Int = Intrinsic::aarch64_neon_frint64z;
11060     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd64z");
11061   }
11062   case NEON::BI__builtin_neon_vrnd_v:
11063   case NEON::BI__builtin_neon_vrndq_v: {
11064     Int = Builder.getIsFPConstrained()
11065               ? Intrinsic::experimental_constrained_trunc
11066               : Intrinsic::trunc;
11067     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndz");
11068   }
11069   case NEON::BI__builtin_neon_vcvt_f64_v:
11070   case NEON::BI__builtin_neon_vcvtq_f64_v:
11071     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
11072     Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad));
11073     return usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
11074                 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
11075   case NEON::BI__builtin_neon_vcvt_f64_f32: {
11076     assert(Type.getEltType() == NeonTypeFlags::Float64 && quad &&
11077            "unexpected vcvt_f64_f32 builtin");
11078     NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float32, false, false);
11079     Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));
11080 
11081     return Builder.CreateFPExt(Ops[0], Ty, "vcvt");
11082   }
11083   case NEON::BI__builtin_neon_vcvt_f32_f64: {
11084     assert(Type.getEltType() == NeonTypeFlags::Float32 &&
11085            "unexpected vcvt_f32_f64 builtin");
11086     NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float64, false, true);
11087     Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));
11088 
11089     return Builder.CreateFPTrunc(Ops[0], Ty, "vcvt");
11090   }
11091   case NEON::BI__builtin_neon_vcvt_s32_v:
11092   case NEON::BI__builtin_neon_vcvt_u32_v:
11093   case NEON::BI__builtin_neon_vcvt_s64_v:
11094   case NEON::BI__builtin_neon_vcvt_u64_v:
11095   case NEON::BI__builtin_neon_vcvt_s16_v:
11096   case NEON::BI__builtin_neon_vcvt_u16_v:
11097   case NEON::BI__builtin_neon_vcvtq_s32_v:
11098   case NEON::BI__builtin_neon_vcvtq_u32_v:
11099   case NEON::BI__builtin_neon_vcvtq_s64_v:
11100   case NEON::BI__builtin_neon_vcvtq_u64_v:
11101   case NEON::BI__builtin_neon_vcvtq_s16_v:
11102   case NEON::BI__builtin_neon_vcvtq_u16_v: {
11103     Int =
11104         usgn ? Intrinsic::aarch64_neon_fcvtzu : Intrinsic::aarch64_neon_fcvtzs;
11105     llvm::Type *Tys[2] = {Ty, GetFloatNeonType(this, Type)};
11106     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtz");
11107   }
11108   case NEON::BI__builtin_neon_vcvta_s16_v:
11109   case NEON::BI__builtin_neon_vcvta_u16_v:
11110   case NEON::BI__builtin_neon_vcvta_s32_v:
11111   case NEON::BI__builtin_neon_vcvtaq_s16_v:
11112   case NEON::BI__builtin_neon_vcvtaq_s32_v:
11113   case NEON::BI__builtin_neon_vcvta_u32_v:
11114   case NEON::BI__builtin_neon_vcvtaq_u16_v:
11115   case NEON::BI__builtin_neon_vcvtaq_u32_v:
11116   case NEON::BI__builtin_neon_vcvta_s64_v:
11117   case NEON::BI__builtin_neon_vcvtaq_s64_v:
11118   case NEON::BI__builtin_neon_vcvta_u64_v:
11119   case NEON::BI__builtin_neon_vcvtaq_u64_v: {
11120     Int = usgn ? Intrinsic::aarch64_neon_fcvtau : Intrinsic::aarch64_neon_fcvtas;
11121     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
11122     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvta");
11123   }
11124   case NEON::BI__builtin_neon_vcvtm_s16_v:
11125   case NEON::BI__builtin_neon_vcvtm_s32_v:
11126   case NEON::BI__builtin_neon_vcvtmq_s16_v:
11127   case NEON::BI__builtin_neon_vcvtmq_s32_v:
11128   case NEON::BI__builtin_neon_vcvtm_u16_v:
11129   case NEON::BI__builtin_neon_vcvtm_u32_v:
11130   case NEON::BI__builtin_neon_vcvtmq_u16_v:
11131   case NEON::BI__builtin_neon_vcvtmq_u32_v:
11132   case NEON::BI__builtin_neon_vcvtm_s64_v:
11133   case NEON::BI__builtin_neon_vcvtmq_s64_v:
11134   case NEON::BI__builtin_neon_vcvtm_u64_v:
11135   case NEON::BI__builtin_neon_vcvtmq_u64_v: {
11136     Int = usgn ? Intrinsic::aarch64_neon_fcvtmu : Intrinsic::aarch64_neon_fcvtms;
11137     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
11138     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtm");
11139   }
11140   case NEON::BI__builtin_neon_vcvtn_s16_v:
11141   case NEON::BI__builtin_neon_vcvtn_s32_v:
11142   case NEON::BI__builtin_neon_vcvtnq_s16_v:
11143   case NEON::BI__builtin_neon_vcvtnq_s32_v:
11144   case NEON::BI__builtin_neon_vcvtn_u16_v:
11145   case NEON::BI__builtin_neon_vcvtn_u32_v:
11146   case NEON::BI__builtin_neon_vcvtnq_u16_v:
11147   case NEON::BI__builtin_neon_vcvtnq_u32_v:
11148   case NEON::BI__builtin_neon_vcvtn_s64_v:
11149   case NEON::BI__builtin_neon_vcvtnq_s64_v:
11150   case NEON::BI__builtin_neon_vcvtn_u64_v:
11151   case NEON::BI__builtin_neon_vcvtnq_u64_v: {
11152     Int = usgn ? Intrinsic::aarch64_neon_fcvtnu : Intrinsic::aarch64_neon_fcvtns;
11153     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
11154     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtn");
11155   }
11156   case NEON::BI__builtin_neon_vcvtp_s16_v:
11157   case NEON::BI__builtin_neon_vcvtp_s32_v:
11158   case NEON::BI__builtin_neon_vcvtpq_s16_v:
11159   case NEON::BI__builtin_neon_vcvtpq_s32_v:
11160   case NEON::BI__builtin_neon_vcvtp_u16_v:
11161   case NEON::BI__builtin_neon_vcvtp_u32_v:
11162   case NEON::BI__builtin_neon_vcvtpq_u16_v:
11163   case NEON::BI__builtin_neon_vcvtpq_u32_v:
11164   case NEON::BI__builtin_neon_vcvtp_s64_v:
11165   case NEON::BI__builtin_neon_vcvtpq_s64_v:
11166   case NEON::BI__builtin_neon_vcvtp_u64_v:
11167   case NEON::BI__builtin_neon_vcvtpq_u64_v: {
11168     Int = usgn ? Intrinsic::aarch64_neon_fcvtpu : Intrinsic::aarch64_neon_fcvtps;
11169     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
11170     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtp");
11171   }
11172   case NEON::BI__builtin_neon_vmulx_v:
11173   case NEON::BI__builtin_neon_vmulxq_v: {
11174     Int = Intrinsic::aarch64_neon_fmulx;
11175     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx");
11176   }
11177   case NEON::BI__builtin_neon_vmulxh_lane_f16:
11178   case NEON::BI__builtin_neon_vmulxh_laneq_f16: {
11179     // vmulx_lane should be mapped to Neon scalar mulx after
11180     // extracting the scalar element
11181     Ops.push_back(EmitScalarExpr(E->getArg(2)));
11182     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
11183     Ops.pop_back();
11184     Int = Intrinsic::aarch64_neon_fmulx;
11185     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmulx");
11186   }
11187   case NEON::BI__builtin_neon_vmul_lane_v:
11188   case NEON::BI__builtin_neon_vmul_laneq_v: {
11189     // v1f64 vmul_lane should be mapped to Neon scalar mul lane
11190     bool Quad = false;
11191     if (BuiltinID == NEON::BI__builtin_neon_vmul_laneq_v)
11192       Quad = true;
11193     Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
11194     llvm::FixedVectorType *VTy =
11195         GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, Quad));
11196     Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
11197     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
11198     Value *Result = Builder.CreateFMul(Ops[0], Ops[1]);
11199     return Builder.CreateBitCast(Result, Ty);
11200   }
11201   case NEON::BI__builtin_neon_vnegd_s64:
11202     return Builder.CreateNeg(EmitScalarExpr(E->getArg(0)), "vnegd");
11203   case NEON::BI__builtin_neon_vnegh_f16:
11204     return Builder.CreateFNeg(EmitScalarExpr(E->getArg(0)), "vnegh");
11205   case NEON::BI__builtin_neon_vpmaxnm_v:
11206   case NEON::BI__builtin_neon_vpmaxnmq_v: {
11207     Int = Intrinsic::aarch64_neon_fmaxnmp;
11208     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmaxnm");
11209   }
11210   case NEON::BI__builtin_neon_vpminnm_v:
11211   case NEON::BI__builtin_neon_vpminnmq_v: {
11212     Int = Intrinsic::aarch64_neon_fminnmp;
11213     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpminnm");
11214   }
11215   case NEON::BI__builtin_neon_vsqrth_f16: {
11216     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11217     Int = Builder.getIsFPConstrained()
11218               ? Intrinsic::experimental_constrained_sqrt
11219               : Intrinsic::sqrt;
11220     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vsqrt");
11221   }
11222   case NEON::BI__builtin_neon_vsqrt_v:
11223   case NEON::BI__builtin_neon_vsqrtq_v: {
11224     Int = Builder.getIsFPConstrained()
11225               ? Intrinsic::experimental_constrained_sqrt
11226               : Intrinsic::sqrt;
11227     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
11228     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqrt");
11229   }
11230   case NEON::BI__builtin_neon_vrbit_v:
11231   case NEON::BI__builtin_neon_vrbitq_v: {
11232     Int = Intrinsic::bitreverse;
11233     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrbit");
11234   }
11235   case NEON::BI__builtin_neon_vaddv_u8:
11236     // FIXME: These are handled by the AArch64 scalar code.
11237     usgn = true;
11238     LLVM_FALLTHROUGH;
11239   case NEON::BI__builtin_neon_vaddv_s8: {
11240     Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
11241     Ty = Int32Ty;
11242     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
11243     llvm::Type *Tys[2] = { Ty, VTy };
11244     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11245     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
11246     return Builder.CreateTrunc(Ops[0], Int8Ty);
11247   }
11248   case NEON::BI__builtin_neon_vaddv_u16:
11249     usgn = true;
11250     LLVM_FALLTHROUGH;
11251   case NEON::BI__builtin_neon_vaddv_s16: {
11252     Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
11253     Ty = Int32Ty;
11254     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
11255     llvm::Type *Tys[2] = { Ty, VTy };
11256     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11257     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
11258     return Builder.CreateTrunc(Ops[0], Int16Ty);
11259   }
11260   case NEON::BI__builtin_neon_vaddvq_u8:
11261     usgn = true;
11262     LLVM_FALLTHROUGH;
11263   case NEON::BI__builtin_neon_vaddvq_s8: {
11264     Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
11265     Ty = Int32Ty;
11266     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
11267     llvm::Type *Tys[2] = { Ty, VTy };
11268     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11269     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
11270     return Builder.CreateTrunc(Ops[0], Int8Ty);
11271   }
11272   case NEON::BI__builtin_neon_vaddvq_u16:
11273     usgn = true;
11274     LLVM_FALLTHROUGH;
11275   case NEON::BI__builtin_neon_vaddvq_s16: {
11276     Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
11277     Ty = Int32Ty;
11278     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
11279     llvm::Type *Tys[2] = { Ty, VTy };
11280     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11281     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
11282     return Builder.CreateTrunc(Ops[0], Int16Ty);
11283   }
11284   case NEON::BI__builtin_neon_vmaxv_u8: {
11285     Int = Intrinsic::aarch64_neon_umaxv;
11286     Ty = Int32Ty;
11287     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
11288     llvm::Type *Tys[2] = { Ty, VTy };
11289     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11290     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
11291     return Builder.CreateTrunc(Ops[0], Int8Ty);
11292   }
11293   case NEON::BI__builtin_neon_vmaxv_u16: {
11294     Int = Intrinsic::aarch64_neon_umaxv;
11295     Ty = Int32Ty;
11296     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
11297     llvm::Type *Tys[2] = { Ty, VTy };
11298     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11299     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
11300     return Builder.CreateTrunc(Ops[0], Int16Ty);
11301   }
11302   case NEON::BI__builtin_neon_vmaxvq_u8: {
11303     Int = Intrinsic::aarch64_neon_umaxv;
11304     Ty = Int32Ty;
11305     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
11306     llvm::Type *Tys[2] = { Ty, VTy };
11307     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11308     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
11309     return Builder.CreateTrunc(Ops[0], Int8Ty);
11310   }
11311   case NEON::BI__builtin_neon_vmaxvq_u16: {
11312     Int = Intrinsic::aarch64_neon_umaxv;
11313     Ty = Int32Ty;
11314     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
11315     llvm::Type *Tys[2] = { Ty, VTy };
11316     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11317     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
11318     return Builder.CreateTrunc(Ops[0], Int16Ty);
11319   }
11320   case NEON::BI__builtin_neon_vmaxv_s8: {
11321     Int = Intrinsic::aarch64_neon_smaxv;
11322     Ty = Int32Ty;
11323     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
11324     llvm::Type *Tys[2] = { Ty, VTy };
11325     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11326     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
11327     return Builder.CreateTrunc(Ops[0], Int8Ty);
11328   }
11329   case NEON::BI__builtin_neon_vmaxv_s16: {
11330     Int = Intrinsic::aarch64_neon_smaxv;
11331     Ty = Int32Ty;
11332     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
11333     llvm::Type *Tys[2] = { Ty, VTy };
11334     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11335     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
11336     return Builder.CreateTrunc(Ops[0], Int16Ty);
11337   }
11338   case NEON::BI__builtin_neon_vmaxvq_s8: {
11339     Int = Intrinsic::aarch64_neon_smaxv;
11340     Ty = Int32Ty;
11341     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
11342     llvm::Type *Tys[2] = { Ty, VTy };
11343     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11344     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
11345     return Builder.CreateTrunc(Ops[0], Int8Ty);
11346   }
11347   case NEON::BI__builtin_neon_vmaxvq_s16: {
11348     Int = Intrinsic::aarch64_neon_smaxv;
11349     Ty = Int32Ty;
11350     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
11351     llvm::Type *Tys[2] = { Ty, VTy };
11352     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11353     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
11354     return Builder.CreateTrunc(Ops[0], Int16Ty);
11355   }
11356   case NEON::BI__builtin_neon_vmaxv_f16: {
11357     Int = Intrinsic::aarch64_neon_fmaxv;
11358     Ty = HalfTy;
11359     VTy = llvm::FixedVectorType::get(HalfTy, 4);
11360     llvm::Type *Tys[2] = { Ty, VTy };
11361     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11362     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
11363     return Builder.CreateTrunc(Ops[0], HalfTy);
11364   }
11365   case NEON::BI__builtin_neon_vmaxvq_f16: {
11366     Int = Intrinsic::aarch64_neon_fmaxv;
11367     Ty = HalfTy;
11368     VTy = llvm::FixedVectorType::get(HalfTy, 8);
11369     llvm::Type *Tys[2] = { Ty, VTy };
11370     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11371     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
11372     return Builder.CreateTrunc(Ops[0], HalfTy);
11373   }
11374   case NEON::BI__builtin_neon_vminv_u8: {
11375     Int = Intrinsic::aarch64_neon_uminv;
11376     Ty = Int32Ty;
11377     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
11378     llvm::Type *Tys[2] = { Ty, VTy };
11379     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11380     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
11381     return Builder.CreateTrunc(Ops[0], Int8Ty);
11382   }
11383   case NEON::BI__builtin_neon_vminv_u16: {
11384     Int = Intrinsic::aarch64_neon_uminv;
11385     Ty = Int32Ty;
11386     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
11387     llvm::Type *Tys[2] = { Ty, VTy };
11388     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11389     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
11390     return Builder.CreateTrunc(Ops[0], Int16Ty);
11391   }
11392   case NEON::BI__builtin_neon_vminvq_u8: {
11393     Int = Intrinsic::aarch64_neon_uminv;
11394     Ty = Int32Ty;
11395     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
11396     llvm::Type *Tys[2] = { Ty, VTy };
11397     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11398     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
11399     return Builder.CreateTrunc(Ops[0], Int8Ty);
11400   }
11401   case NEON::BI__builtin_neon_vminvq_u16: {
11402     Int = Intrinsic::aarch64_neon_uminv;
11403     Ty = Int32Ty;
11404     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
11405     llvm::Type *Tys[2] = { Ty, VTy };
11406     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11407     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
11408     return Builder.CreateTrunc(Ops[0], Int16Ty);
11409   }
11410   case NEON::BI__builtin_neon_vminv_s8: {
11411     Int = Intrinsic::aarch64_neon_sminv;
11412     Ty = Int32Ty;
11413     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
11414     llvm::Type *Tys[2] = { Ty, VTy };
11415     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11416     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
11417     return Builder.CreateTrunc(Ops[0], Int8Ty);
11418   }
11419   case NEON::BI__builtin_neon_vminv_s16: {
11420     Int = Intrinsic::aarch64_neon_sminv;
11421     Ty = Int32Ty;
11422     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
11423     llvm::Type *Tys[2] = { Ty, VTy };
11424     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11425     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
11426     return Builder.CreateTrunc(Ops[0], Int16Ty);
11427   }
11428   case NEON::BI__builtin_neon_vminvq_s8: {
11429     Int = Intrinsic::aarch64_neon_sminv;
11430     Ty = Int32Ty;
11431     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
11432     llvm::Type *Tys[2] = { Ty, VTy };
11433     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11434     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
11435     return Builder.CreateTrunc(Ops[0], Int8Ty);
11436   }
11437   case NEON::BI__builtin_neon_vminvq_s16: {
11438     Int = Intrinsic::aarch64_neon_sminv;
11439     Ty = Int32Ty;
11440     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
11441     llvm::Type *Tys[2] = { Ty, VTy };
11442     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11443     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
11444     return Builder.CreateTrunc(Ops[0], Int16Ty);
11445   }
11446   case NEON::BI__builtin_neon_vminv_f16: {
11447     Int = Intrinsic::aarch64_neon_fminv;
11448     Ty = HalfTy;
11449     VTy = llvm::FixedVectorType::get(HalfTy, 4);
11450     llvm::Type *Tys[2] = { Ty, VTy };
11451     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11452     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
11453     return Builder.CreateTrunc(Ops[0], HalfTy);
11454   }
11455   case NEON::BI__builtin_neon_vminvq_f16: {
11456     Int = Intrinsic::aarch64_neon_fminv;
11457     Ty = HalfTy;
11458     VTy = llvm::FixedVectorType::get(HalfTy, 8);
11459     llvm::Type *Tys[2] = { Ty, VTy };
11460     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11461     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
11462     return Builder.CreateTrunc(Ops[0], HalfTy);
11463   }
11464   case NEON::BI__builtin_neon_vmaxnmv_f16: {
11465     Int = Intrinsic::aarch64_neon_fmaxnmv;
11466     Ty = HalfTy;
11467     VTy = llvm::FixedVectorType::get(HalfTy, 4);
11468     llvm::Type *Tys[2] = { Ty, VTy };
11469     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11470     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv");
11471     return Builder.CreateTrunc(Ops[0], HalfTy);
11472   }
11473   case NEON::BI__builtin_neon_vmaxnmvq_f16: {
11474     Int = Intrinsic::aarch64_neon_fmaxnmv;
11475     Ty = HalfTy;
11476     VTy = llvm::FixedVectorType::get(HalfTy, 8);
11477     llvm::Type *Tys[2] = { Ty, VTy };
11478     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11479     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv");
11480     return Builder.CreateTrunc(Ops[0], HalfTy);
11481   }
11482   case NEON::BI__builtin_neon_vminnmv_f16: {
11483     Int = Intrinsic::aarch64_neon_fminnmv;
11484     Ty = HalfTy;
11485     VTy = llvm::FixedVectorType::get(HalfTy, 4);
11486     llvm::Type *Tys[2] = { Ty, VTy };
11487     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11488     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv");
11489     return Builder.CreateTrunc(Ops[0], HalfTy);
11490   }
11491   case NEON::BI__builtin_neon_vminnmvq_f16: {
11492     Int = Intrinsic::aarch64_neon_fminnmv;
11493     Ty = HalfTy;
11494     VTy = llvm::FixedVectorType::get(HalfTy, 8);
11495     llvm::Type *Tys[2] = { Ty, VTy };
11496     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11497     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv");
11498     return Builder.CreateTrunc(Ops[0], HalfTy);
11499   }
11500   case NEON::BI__builtin_neon_vmul_n_f64: {
11501     Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
11502     Value *RHS = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)), DoubleTy);
11503     return Builder.CreateFMul(Ops[0], RHS);
11504   }
11505   case NEON::BI__builtin_neon_vaddlv_u8: {
11506     Int = Intrinsic::aarch64_neon_uaddlv;
11507     Ty = Int32Ty;
11508     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
11509     llvm::Type *Tys[2] = { Ty, VTy };
11510     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11511     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
11512     return Builder.CreateTrunc(Ops[0], Int16Ty);
11513   }
11514   case NEON::BI__builtin_neon_vaddlv_u16: {
11515     Int = Intrinsic::aarch64_neon_uaddlv;
11516     Ty = Int32Ty;
11517     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
11518     llvm::Type *Tys[2] = { Ty, VTy };
11519     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11520     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
11521   }
11522   case NEON::BI__builtin_neon_vaddlvq_u8: {
11523     Int = Intrinsic::aarch64_neon_uaddlv;
11524     Ty = Int32Ty;
11525     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
11526     llvm::Type *Tys[2] = { Ty, VTy };
11527     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11528     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
11529     return Builder.CreateTrunc(Ops[0], Int16Ty);
11530   }
11531   case NEON::BI__builtin_neon_vaddlvq_u16: {
11532     Int = Intrinsic::aarch64_neon_uaddlv;
11533     Ty = Int32Ty;
11534     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
11535     llvm::Type *Tys[2] = { Ty, VTy };
11536     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11537     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
11538   }
11539   case NEON::BI__builtin_neon_vaddlv_s8: {
11540     Int = Intrinsic::aarch64_neon_saddlv;
11541     Ty = Int32Ty;
11542     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
11543     llvm::Type *Tys[2] = { Ty, VTy };
11544     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11545     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
11546     return Builder.CreateTrunc(Ops[0], Int16Ty);
11547   }
11548   case NEON::BI__builtin_neon_vaddlv_s16: {
11549     Int = Intrinsic::aarch64_neon_saddlv;
11550     Ty = Int32Ty;
11551     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
11552     llvm::Type *Tys[2] = { Ty, VTy };
11553     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11554     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
11555   }
11556   case NEON::BI__builtin_neon_vaddlvq_s8: {
11557     Int = Intrinsic::aarch64_neon_saddlv;
11558     Ty = Int32Ty;
11559     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
11560     llvm::Type *Tys[2] = { Ty, VTy };
11561     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11562     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
11563     return Builder.CreateTrunc(Ops[0], Int16Ty);
11564   }
11565   case NEON::BI__builtin_neon_vaddlvq_s16: {
11566     Int = Intrinsic::aarch64_neon_saddlv;
11567     Ty = Int32Ty;
11568     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
11569     llvm::Type *Tys[2] = { Ty, VTy };
11570     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11571     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
11572   }
11573   case NEON::BI__builtin_neon_vsri_n_v:
11574   case NEON::BI__builtin_neon_vsriq_n_v: {
11575     Int = Intrinsic::aarch64_neon_vsri;
11576     llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty);
11577     return EmitNeonCall(Intrin, Ops, "vsri_n");
11578   }
11579   case NEON::BI__builtin_neon_vsli_n_v:
11580   case NEON::BI__builtin_neon_vsliq_n_v: {
11581     Int = Intrinsic::aarch64_neon_vsli;
11582     llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty);
11583     return EmitNeonCall(Intrin, Ops, "vsli_n");
11584   }
11585   case NEON::BI__builtin_neon_vsra_n_v:
11586   case NEON::BI__builtin_neon_vsraq_n_v:
11587     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
11588     Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n");
11589     return Builder.CreateAdd(Ops[0], Ops[1]);
11590   case NEON::BI__builtin_neon_vrsra_n_v:
11591   case NEON::BI__builtin_neon_vrsraq_n_v: {
11592     Int = usgn ? Intrinsic::aarch64_neon_urshl : Intrinsic::aarch64_neon_srshl;
11593     SmallVector<llvm::Value*,2> TmpOps;
11594     TmpOps.push_back(Ops[1]);
11595     TmpOps.push_back(Ops[2]);
11596     Function* F = CGM.getIntrinsic(Int, Ty);
11597     llvm::Value *tmp = EmitNeonCall(F, TmpOps, "vrshr_n", 1, true);
11598     Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
11599     return Builder.CreateAdd(Ops[0], tmp);
11600   }
11601   case NEON::BI__builtin_neon_vld1_v:
11602   case NEON::BI__builtin_neon_vld1q_v: {
11603     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(VTy));
11604     return Builder.CreateAlignedLoad(VTy, Ops[0], PtrOp0.getAlignment());
11605   }
11606   case NEON::BI__builtin_neon_vst1_v:
11607   case NEON::BI__builtin_neon_vst1q_v:
11608     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(VTy));
11609     Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
11610     return Builder.CreateAlignedStore(Ops[1], Ops[0], PtrOp0.getAlignment());
11611   case NEON::BI__builtin_neon_vld1_lane_v:
11612   case NEON::BI__builtin_neon_vld1q_lane_v: {
11613     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
11614     Ty = llvm::PointerType::getUnqual(VTy->getElementType());
11615     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
11616     Ops[0] = Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0],
11617                                        PtrOp0.getAlignment());
11618     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vld1_lane");
11619   }
11620   case NEON::BI__builtin_neon_vld1_dup_v:
11621   case NEON::BI__builtin_neon_vld1q_dup_v: {
11622     Value *V = UndefValue::get(Ty);
11623     Ty = llvm::PointerType::getUnqual(VTy->getElementType());
11624     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
11625     Ops[0] = Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0],
11626                                        PtrOp0.getAlignment());
11627     llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
11628     Ops[0] = Builder.CreateInsertElement(V, Ops[0], CI);
11629     return EmitNeonSplat(Ops[0], CI);
11630   }
11631   case NEON::BI__builtin_neon_vst1_lane_v:
11632   case NEON::BI__builtin_neon_vst1q_lane_v:
11633     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
11634     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
11635     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
11636     return Builder.CreateAlignedStore(Ops[1], Builder.CreateBitCast(Ops[0], Ty),
11637                                       PtrOp0.getAlignment());
11638   case NEON::BI__builtin_neon_vld2_v:
11639   case NEON::BI__builtin_neon_vld2q_v: {
11640     llvm::Type *PTy = llvm::PointerType::getUnqual(VTy);
11641     Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
11642     llvm::Type *Tys[2] = { VTy, PTy };
11643     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2, Tys);
11644     Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
11645     Ops[0] = Builder.CreateBitCast(Ops[0],
11646                 llvm::PointerType::getUnqual(Ops[1]->getType()));
11647     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
11648   }
11649   case NEON::BI__builtin_neon_vld3_v:
11650   case NEON::BI__builtin_neon_vld3q_v: {
11651     llvm::Type *PTy = llvm::PointerType::getUnqual(VTy);
11652     Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
11653     llvm::Type *Tys[2] = { VTy, PTy };
11654     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3, Tys);
11655     Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
11656     Ops[0] = Builder.CreateBitCast(Ops[0],
11657                 llvm::PointerType::getUnqual(Ops[1]->getType()));
11658     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
11659   }
11660   case NEON::BI__builtin_neon_vld4_v:
11661   case NEON::BI__builtin_neon_vld4q_v: {
11662     llvm::Type *PTy = llvm::PointerType::getUnqual(VTy);
11663     Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
11664     llvm::Type *Tys[2] = { VTy, PTy };
11665     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4, Tys);
11666     Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
11667     Ops[0] = Builder.CreateBitCast(Ops[0],
11668                 llvm::PointerType::getUnqual(Ops[1]->getType()));
11669     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
11670   }
11671   case NEON::BI__builtin_neon_vld2_dup_v:
11672   case NEON::BI__builtin_neon_vld2q_dup_v: {
11673     llvm::Type *PTy =
11674       llvm::PointerType::getUnqual(VTy->getElementType());
11675     Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
11676     llvm::Type *Tys[2] = { VTy, PTy };
11677     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2r, Tys);
11678     Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
11679     Ops[0] = Builder.CreateBitCast(Ops[0],
11680                 llvm::PointerType::getUnqual(Ops[1]->getType()));
11681     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
11682   }
11683   case NEON::BI__builtin_neon_vld3_dup_v:
11684   case NEON::BI__builtin_neon_vld3q_dup_v: {
11685     llvm::Type *PTy =
11686       llvm::PointerType::getUnqual(VTy->getElementType());
11687     Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
11688     llvm::Type *Tys[2] = { VTy, PTy };
11689     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3r, Tys);
11690     Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
11691     Ops[0] = Builder.CreateBitCast(Ops[0],
11692                 llvm::PointerType::getUnqual(Ops[1]->getType()));
11693     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
11694   }
11695   case NEON::BI__builtin_neon_vld4_dup_v:
11696   case NEON::BI__builtin_neon_vld4q_dup_v: {
11697     llvm::Type *PTy =
11698       llvm::PointerType::getUnqual(VTy->getElementType());
11699     Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
11700     llvm::Type *Tys[2] = { VTy, PTy };
11701     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4r, Tys);
11702     Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
11703     Ops[0] = Builder.CreateBitCast(Ops[0],
11704                 llvm::PointerType::getUnqual(Ops[1]->getType()));
11705     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
11706   }
11707   case NEON::BI__builtin_neon_vld2_lane_v:
11708   case NEON::BI__builtin_neon_vld2q_lane_v: {
11709     llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
11710     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2lane, Tys);
11711     std::rotate(Ops.begin() + 1, Ops.begin() + 2, Ops.end());
11712     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
11713     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
11714     Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty);
11715     Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), "vld2_lane");
11716     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
11717     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
11718     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
11719   }
11720   case NEON::BI__builtin_neon_vld3_lane_v:
11721   case NEON::BI__builtin_neon_vld3q_lane_v: {
11722     llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
11723     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3lane, Tys);
11724     std::rotate(Ops.begin() + 1, Ops.begin() + 2, Ops.end());
11725     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
11726     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
11727     Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
11728     Ops[4] = Builder.CreateZExt(Ops[4], Int64Ty);
11729     Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), "vld3_lane");
11730     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
11731     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
11732     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
11733   }
11734   case NEON::BI__builtin_neon_vld4_lane_v:
11735   case NEON::BI__builtin_neon_vld4q_lane_v: {
11736     llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
11737     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4lane, Tys);
11738     std::rotate(Ops.begin() + 1, Ops.begin() + 2, Ops.end());
11739     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
11740     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
11741     Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
11742     Ops[4] = Builder.CreateBitCast(Ops[4], Ty);
11743     Ops[5] = Builder.CreateZExt(Ops[5], Int64Ty);
11744     Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), "vld4_lane");
11745     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
11746     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
11747     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
11748   }
11749   case NEON::BI__builtin_neon_vst2_v:
11750   case NEON::BI__builtin_neon_vst2q_v: {
11751     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
11752     llvm::Type *Tys[2] = { VTy, Ops[2]->getType() };
11753     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st2, Tys),
11754                         Ops, "");
11755   }
11756   case NEON::BI__builtin_neon_vst2_lane_v:
11757   case NEON::BI__builtin_neon_vst2q_lane_v: {
11758     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
11759     Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
11760     llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
11761     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st2lane, Tys),
11762                         Ops, "");
11763   }
11764   case NEON::BI__builtin_neon_vst3_v:
11765   case NEON::BI__builtin_neon_vst3q_v: {
11766     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
11767     llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
11768     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st3, Tys),
11769                         Ops, "");
11770   }
11771   case NEON::BI__builtin_neon_vst3_lane_v:
11772   case NEON::BI__builtin_neon_vst3q_lane_v: {
11773     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
11774     Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty);
11775     llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
11776     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st3lane, Tys),
11777                         Ops, "");
11778   }
11779   case NEON::BI__builtin_neon_vst4_v:
11780   case NEON::BI__builtin_neon_vst4q_v: {
11781     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
11782     llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
11783     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st4, Tys),
11784                         Ops, "");
11785   }
11786   case NEON::BI__builtin_neon_vst4_lane_v:
11787   case NEON::BI__builtin_neon_vst4q_lane_v: {
11788     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
11789     Ops[4] = Builder.CreateZExt(Ops[4], Int64Ty);
11790     llvm::Type *Tys[2] = { VTy, Ops[5]->getType() };
11791     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st4lane, Tys),
11792                         Ops, "");
11793   }
11794   case NEON::BI__builtin_neon_vtrn_v:
11795   case NEON::BI__builtin_neon_vtrnq_v: {
11796     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
11797     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
11798     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
11799     Value *SV = nullptr;
11800 
11801     for (unsigned vi = 0; vi != 2; ++vi) {
11802       SmallVector<int, 16> Indices;
11803       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
11804         Indices.push_back(i+vi);
11805         Indices.push_back(i+e+vi);
11806       }
11807       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
11808       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vtrn");
11809       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
11810     }
11811     return SV;
11812   }
11813   case NEON::BI__builtin_neon_vuzp_v:
11814   case NEON::BI__builtin_neon_vuzpq_v: {
11815     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
11816     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
11817     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
11818     Value *SV = nullptr;
11819 
11820     for (unsigned vi = 0; vi != 2; ++vi) {
11821       SmallVector<int, 16> Indices;
11822       for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
11823         Indices.push_back(2*i+vi);
11824 
11825       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
11826       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vuzp");
11827       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
11828     }
11829     return SV;
11830   }
11831   case NEON::BI__builtin_neon_vzip_v:
11832   case NEON::BI__builtin_neon_vzipq_v: {
11833     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
11834     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
11835     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
11836     Value *SV = nullptr;
11837 
11838     for (unsigned vi = 0; vi != 2; ++vi) {
11839       SmallVector<int, 16> Indices;
11840       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
11841         Indices.push_back((i + vi*e) >> 1);
11842         Indices.push_back(((i + vi*e) >> 1)+e);
11843       }
11844       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
11845       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vzip");
11846       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
11847     }
11848     return SV;
11849   }
11850   case NEON::BI__builtin_neon_vqtbl1q_v: {
11851     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl1, Ty),
11852                         Ops, "vtbl1");
11853   }
11854   case NEON::BI__builtin_neon_vqtbl2q_v: {
11855     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl2, Ty),
11856                         Ops, "vtbl2");
11857   }
11858   case NEON::BI__builtin_neon_vqtbl3q_v: {
11859     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl3, Ty),
11860                         Ops, "vtbl3");
11861   }
11862   case NEON::BI__builtin_neon_vqtbl4q_v: {
11863     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl4, Ty),
11864                         Ops, "vtbl4");
11865   }
11866   case NEON::BI__builtin_neon_vqtbx1q_v: {
11867     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx1, Ty),
11868                         Ops, "vtbx1");
11869   }
11870   case NEON::BI__builtin_neon_vqtbx2q_v: {
11871     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx2, Ty),
11872                         Ops, "vtbx2");
11873   }
11874   case NEON::BI__builtin_neon_vqtbx3q_v: {
11875     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx3, Ty),
11876                         Ops, "vtbx3");
11877   }
11878   case NEON::BI__builtin_neon_vqtbx4q_v: {
11879     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx4, Ty),
11880                         Ops, "vtbx4");
11881   }
11882   case NEON::BI__builtin_neon_vsqadd_v:
11883   case NEON::BI__builtin_neon_vsqaddq_v: {
11884     Int = Intrinsic::aarch64_neon_usqadd;
11885     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqadd");
11886   }
11887   case NEON::BI__builtin_neon_vuqadd_v:
11888   case NEON::BI__builtin_neon_vuqaddq_v: {
11889     Int = Intrinsic::aarch64_neon_suqadd;
11890     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd");
11891   }
11892   }
11893 }
11894 
11895 Value *CodeGenFunction::EmitBPFBuiltinExpr(unsigned BuiltinID,
11896                                            const CallExpr *E) {
11897   assert((BuiltinID == BPF::BI__builtin_preserve_field_info ||
11898           BuiltinID == BPF::BI__builtin_btf_type_id ||
11899           BuiltinID == BPF::BI__builtin_preserve_type_info ||
11900           BuiltinID == BPF::BI__builtin_preserve_enum_value) &&
11901          "unexpected BPF builtin");
11902 
11903   // A sequence number, injected into IR builtin functions, to
11904   // prevent CSE given the only difference of the funciton
11905   // may just be the debuginfo metadata.
11906   static uint32_t BuiltinSeqNum;
11907 
11908   switch (BuiltinID) {
11909   default:
11910     llvm_unreachable("Unexpected BPF builtin");
11911   case BPF::BI__builtin_preserve_field_info: {
11912     const Expr *Arg = E->getArg(0);
11913     bool IsBitField = Arg->IgnoreParens()->getObjectKind() == OK_BitField;
11914 
11915     if (!getDebugInfo()) {
11916       CGM.Error(E->getExprLoc(),
11917                 "using __builtin_preserve_field_info() without -g");
11918       return IsBitField ? EmitLValue(Arg).getBitFieldPointer()
11919                         : EmitLValue(Arg).getPointer(*this);
11920     }
11921 
11922     // Enable underlying preserve_*_access_index() generation.
11923     bool OldIsInPreservedAIRegion = IsInPreservedAIRegion;
11924     IsInPreservedAIRegion = true;
11925     Value *FieldAddr = IsBitField ? EmitLValue(Arg).getBitFieldPointer()
11926                                   : EmitLValue(Arg).getPointer(*this);
11927     IsInPreservedAIRegion = OldIsInPreservedAIRegion;
11928 
11929     ConstantInt *C = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
11930     Value *InfoKind = ConstantInt::get(Int64Ty, C->getSExtValue());
11931 
11932     // Built the IR for the preserve_field_info intrinsic.
11933     llvm::Function *FnGetFieldInfo = llvm::Intrinsic::getDeclaration(
11934         &CGM.getModule(), llvm::Intrinsic::bpf_preserve_field_info,
11935         {FieldAddr->getType()});
11936     return Builder.CreateCall(FnGetFieldInfo, {FieldAddr, InfoKind});
11937   }
11938   case BPF::BI__builtin_btf_type_id:
11939   case BPF::BI__builtin_preserve_type_info: {
11940     if (!getDebugInfo()) {
11941       CGM.Error(E->getExprLoc(), "using builtin function without -g");
11942       return nullptr;
11943     }
11944 
11945     const Expr *Arg0 = E->getArg(0);
11946     llvm::DIType *DbgInfo = getDebugInfo()->getOrCreateStandaloneType(
11947         Arg0->getType(), Arg0->getExprLoc());
11948 
11949     ConstantInt *Flag = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
11950     Value *FlagValue = ConstantInt::get(Int64Ty, Flag->getSExtValue());
11951     Value *SeqNumVal = ConstantInt::get(Int32Ty, BuiltinSeqNum++);
11952 
11953     llvm::Function *FnDecl;
11954     if (BuiltinID == BPF::BI__builtin_btf_type_id)
11955       FnDecl = llvm::Intrinsic::getDeclaration(
11956           &CGM.getModule(), llvm::Intrinsic::bpf_btf_type_id, {});
11957     else
11958       FnDecl = llvm::Intrinsic::getDeclaration(
11959           &CGM.getModule(), llvm::Intrinsic::bpf_preserve_type_info, {});
11960     CallInst *Fn = Builder.CreateCall(FnDecl, {SeqNumVal, FlagValue});
11961     Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo);
11962     return Fn;
11963   }
11964   case BPF::BI__builtin_preserve_enum_value: {
11965     if (!getDebugInfo()) {
11966       CGM.Error(E->getExprLoc(), "using builtin function without -g");
11967       return nullptr;
11968     }
11969 
11970     const Expr *Arg0 = E->getArg(0);
11971     llvm::DIType *DbgInfo = getDebugInfo()->getOrCreateStandaloneType(
11972         Arg0->getType(), Arg0->getExprLoc());
11973 
11974     // Find enumerator
11975     const auto *UO = cast<UnaryOperator>(Arg0->IgnoreParens());
11976     const auto *CE = cast<CStyleCastExpr>(UO->getSubExpr());
11977     const auto *DR = cast<DeclRefExpr>(CE->getSubExpr());
11978     const auto *Enumerator = cast<EnumConstantDecl>(DR->getDecl());
11979 
11980     auto &InitVal = Enumerator->getInitVal();
11981     std::string InitValStr;
11982     if (InitVal.isNegative() || InitVal > uint64_t(INT64_MAX))
11983       InitValStr = std::to_string(InitVal.getSExtValue());
11984     else
11985       InitValStr = std::to_string(InitVal.getZExtValue());
11986     std::string EnumStr = Enumerator->getNameAsString() + ":" + InitValStr;
11987     Value *EnumStrVal = Builder.CreateGlobalStringPtr(EnumStr);
11988 
11989     ConstantInt *Flag = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
11990     Value *FlagValue = ConstantInt::get(Int64Ty, Flag->getSExtValue());
11991     Value *SeqNumVal = ConstantInt::get(Int32Ty, BuiltinSeqNum++);
11992 
11993     llvm::Function *IntrinsicFn = llvm::Intrinsic::getDeclaration(
11994         &CGM.getModule(), llvm::Intrinsic::bpf_preserve_enum_value, {});
11995     CallInst *Fn =
11996         Builder.CreateCall(IntrinsicFn, {SeqNumVal, EnumStrVal, FlagValue});
11997     Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo);
11998     return Fn;
11999   }
12000   }
12001 }
12002 
12003 llvm::Value *CodeGenFunction::
12004 BuildVector(ArrayRef<llvm::Value*> Ops) {
12005   assert((Ops.size() & (Ops.size() - 1)) == 0 &&
12006          "Not a power-of-two sized vector!");
12007   bool AllConstants = true;
12008   for (unsigned i = 0, e = Ops.size(); i != e && AllConstants; ++i)
12009     AllConstants &= isa<Constant>(Ops[i]);
12010 
12011   // If this is a constant vector, create a ConstantVector.
12012   if (AllConstants) {
12013     SmallVector<llvm::Constant*, 16> CstOps;
12014     for (unsigned i = 0, e = Ops.size(); i != e; ++i)
12015       CstOps.push_back(cast<Constant>(Ops[i]));
12016     return llvm::ConstantVector::get(CstOps);
12017   }
12018 
12019   // Otherwise, insertelement the values to build the vector.
12020   Value *Result = llvm::UndefValue::get(
12021       llvm::FixedVectorType::get(Ops[0]->getType(), Ops.size()));
12022 
12023   for (unsigned i = 0, e = Ops.size(); i != e; ++i)
12024     Result = Builder.CreateInsertElement(Result, Ops[i], Builder.getInt32(i));
12025 
12026   return Result;
12027 }
12028 
12029 // Convert the mask from an integer type to a vector of i1.
12030 static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask,
12031                               unsigned NumElts) {
12032 
12033   auto *MaskTy = llvm::FixedVectorType::get(
12034       CGF.Builder.getInt1Ty(),
12035       cast<IntegerType>(Mask->getType())->getBitWidth());
12036   Value *MaskVec = CGF.Builder.CreateBitCast(Mask, MaskTy);
12037 
12038   // If we have less than 8 elements, then the starting mask was an i8 and
12039   // we need to extract down to the right number of elements.
12040   if (NumElts < 8) {
12041     int Indices[4];
12042     for (unsigned i = 0; i != NumElts; ++i)
12043       Indices[i] = i;
12044     MaskVec = CGF.Builder.CreateShuffleVector(MaskVec, MaskVec,
12045                                              makeArrayRef(Indices, NumElts),
12046                                              "extract");
12047   }
12048   return MaskVec;
12049 }
12050 
12051 static Value *EmitX86MaskedStore(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
12052                                  Align Alignment) {
12053   // Cast the pointer to right type.
12054   Value *Ptr = CGF.Builder.CreateBitCast(Ops[0],
12055                                llvm::PointerType::getUnqual(Ops[1]->getType()));
12056 
12057   Value *MaskVec = getMaskVecValue(
12058       CGF, Ops[2],
12059       cast<llvm::FixedVectorType>(Ops[1]->getType())->getNumElements());
12060 
12061   return CGF.Builder.CreateMaskedStore(Ops[1], Ptr, Alignment, MaskVec);
12062 }
12063 
12064 static Value *EmitX86MaskedLoad(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
12065                                 Align Alignment) {
12066   // Cast the pointer to right type.
12067   llvm::Type *Ty = Ops[1]->getType();
12068   Value *Ptr =
12069       CGF.Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
12070 
12071   Value *MaskVec = getMaskVecValue(
12072       CGF, Ops[2], cast<llvm::FixedVectorType>(Ty)->getNumElements());
12073 
12074   return CGF.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, MaskVec, Ops[1]);
12075 }
12076 
12077 static Value *EmitX86ExpandLoad(CodeGenFunction &CGF,
12078                                 ArrayRef<Value *> Ops) {
12079   auto *ResultTy = cast<llvm::VectorType>(Ops[1]->getType());
12080   llvm::Type *PtrTy = ResultTy->getElementType();
12081 
12082   // Cast the pointer to element type.
12083   Value *Ptr = CGF.Builder.CreateBitCast(Ops[0],
12084                                          llvm::PointerType::getUnqual(PtrTy));
12085 
12086   Value *MaskVec = getMaskVecValue(
12087       CGF, Ops[2], cast<FixedVectorType>(ResultTy)->getNumElements());
12088 
12089   llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_expandload,
12090                                            ResultTy);
12091   return CGF.Builder.CreateCall(F, { Ptr, MaskVec, Ops[1] });
12092 }
12093 
12094 static Value *EmitX86CompressExpand(CodeGenFunction &CGF,
12095                                     ArrayRef<Value *> Ops,
12096                                     bool IsCompress) {
12097   auto *ResultTy = cast<llvm::FixedVectorType>(Ops[1]->getType());
12098 
12099   Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements());
12100 
12101   Intrinsic::ID IID = IsCompress ? Intrinsic::x86_avx512_mask_compress
12102                                  : Intrinsic::x86_avx512_mask_expand;
12103   llvm::Function *F = CGF.CGM.getIntrinsic(IID, ResultTy);
12104   return CGF.Builder.CreateCall(F, { Ops[0], Ops[1], MaskVec });
12105 }
12106 
12107 static Value *EmitX86CompressStore(CodeGenFunction &CGF,
12108                                    ArrayRef<Value *> Ops) {
12109   auto *ResultTy = cast<llvm::FixedVectorType>(Ops[1]->getType());
12110   llvm::Type *PtrTy = ResultTy->getElementType();
12111 
12112   // Cast the pointer to element type.
12113   Value *Ptr = CGF.Builder.CreateBitCast(Ops[0],
12114                                          llvm::PointerType::getUnqual(PtrTy));
12115 
12116   Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements());
12117 
12118   llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_compressstore,
12119                                            ResultTy);
12120   return CGF.Builder.CreateCall(F, { Ops[1], Ptr, MaskVec });
12121 }
12122 
12123 static Value *EmitX86MaskLogic(CodeGenFunction &CGF, Instruction::BinaryOps Opc,
12124                               ArrayRef<Value *> Ops,
12125                               bool InvertLHS = false) {
12126   unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
12127   Value *LHS = getMaskVecValue(CGF, Ops[0], NumElts);
12128   Value *RHS = getMaskVecValue(CGF, Ops[1], NumElts);
12129 
12130   if (InvertLHS)
12131     LHS = CGF.Builder.CreateNot(LHS);
12132 
12133   return CGF.Builder.CreateBitCast(CGF.Builder.CreateBinOp(Opc, LHS, RHS),
12134                                    Ops[0]->getType());
12135 }
12136 
12137 static Value *EmitX86FunnelShift(CodeGenFunction &CGF, Value *Op0, Value *Op1,
12138                                  Value *Amt, bool IsRight) {
12139   llvm::Type *Ty = Op0->getType();
12140 
12141   // Amount may be scalar immediate, in which case create a splat vector.
12142   // Funnel shifts amounts are treated as modulo and types are all power-of-2 so
12143   // we only care about the lowest log2 bits anyway.
12144   if (Amt->getType() != Ty) {
12145     unsigned NumElts = cast<llvm::FixedVectorType>(Ty)->getNumElements();
12146     Amt = CGF.Builder.CreateIntCast(Amt, Ty->getScalarType(), false);
12147     Amt = CGF.Builder.CreateVectorSplat(NumElts, Amt);
12148   }
12149 
12150   unsigned IID = IsRight ? Intrinsic::fshr : Intrinsic::fshl;
12151   Function *F = CGF.CGM.getIntrinsic(IID, Ty);
12152   return CGF.Builder.CreateCall(F, {Op0, Op1, Amt});
12153 }
12154 
12155 static Value *EmitX86vpcom(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
12156                            bool IsSigned) {
12157   Value *Op0 = Ops[0];
12158   Value *Op1 = Ops[1];
12159   llvm::Type *Ty = Op0->getType();
12160   uint64_t Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
12161 
12162   CmpInst::Predicate Pred;
12163   switch (Imm) {
12164   case 0x0:
12165     Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
12166     break;
12167   case 0x1:
12168     Pred = IsSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
12169     break;
12170   case 0x2:
12171     Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
12172     break;
12173   case 0x3:
12174     Pred = IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE;
12175     break;
12176   case 0x4:
12177     Pred = ICmpInst::ICMP_EQ;
12178     break;
12179   case 0x5:
12180     Pred = ICmpInst::ICMP_NE;
12181     break;
12182   case 0x6:
12183     return llvm::Constant::getNullValue(Ty); // FALSE
12184   case 0x7:
12185     return llvm::Constant::getAllOnesValue(Ty); // TRUE
12186   default:
12187     llvm_unreachable("Unexpected XOP vpcom/vpcomu predicate");
12188   }
12189 
12190   Value *Cmp = CGF.Builder.CreateICmp(Pred, Op0, Op1);
12191   Value *Res = CGF.Builder.CreateSExt(Cmp, Ty);
12192   return Res;
12193 }
12194 
12195 static Value *EmitX86Select(CodeGenFunction &CGF,
12196                             Value *Mask, Value *Op0, Value *Op1) {
12197 
12198   // If the mask is all ones just return first argument.
12199   if (const auto *C = dyn_cast<Constant>(Mask))
12200     if (C->isAllOnesValue())
12201       return Op0;
12202 
12203   Mask = getMaskVecValue(
12204       CGF, Mask, cast<llvm::FixedVectorType>(Op0->getType())->getNumElements());
12205 
12206   return CGF.Builder.CreateSelect(Mask, Op0, Op1);
12207 }
12208 
12209 static Value *EmitX86ScalarSelect(CodeGenFunction &CGF,
12210                                   Value *Mask, Value *Op0, Value *Op1) {
12211   // If the mask is all ones just return first argument.
12212   if (const auto *C = dyn_cast<Constant>(Mask))
12213     if (C->isAllOnesValue())
12214       return Op0;
12215 
12216   auto *MaskTy = llvm::FixedVectorType::get(
12217       CGF.Builder.getInt1Ty(), Mask->getType()->getIntegerBitWidth());
12218   Mask = CGF.Builder.CreateBitCast(Mask, MaskTy);
12219   Mask = CGF.Builder.CreateExtractElement(Mask, (uint64_t)0);
12220   return CGF.Builder.CreateSelect(Mask, Op0, Op1);
12221 }
12222 
12223 static Value *EmitX86MaskedCompareResult(CodeGenFunction &CGF, Value *Cmp,
12224                                          unsigned NumElts, Value *MaskIn) {
12225   if (MaskIn) {
12226     const auto *C = dyn_cast<Constant>(MaskIn);
12227     if (!C || !C->isAllOnesValue())
12228       Cmp = CGF.Builder.CreateAnd(Cmp, getMaskVecValue(CGF, MaskIn, NumElts));
12229   }
12230 
12231   if (NumElts < 8) {
12232     int Indices[8];
12233     for (unsigned i = 0; i != NumElts; ++i)
12234       Indices[i] = i;
12235     for (unsigned i = NumElts; i != 8; ++i)
12236       Indices[i] = i % NumElts + NumElts;
12237     Cmp = CGF.Builder.CreateShuffleVector(
12238         Cmp, llvm::Constant::getNullValue(Cmp->getType()), Indices);
12239   }
12240 
12241   return CGF.Builder.CreateBitCast(Cmp,
12242                                    IntegerType::get(CGF.getLLVMContext(),
12243                                                     std::max(NumElts, 8U)));
12244 }
12245 
12246 static Value *EmitX86MaskedCompare(CodeGenFunction &CGF, unsigned CC,
12247                                    bool Signed, ArrayRef<Value *> Ops) {
12248   assert((Ops.size() == 2 || Ops.size() == 4) &&
12249          "Unexpected number of arguments");
12250   unsigned NumElts =
12251       cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
12252   Value *Cmp;
12253 
12254   if (CC == 3) {
12255     Cmp = Constant::getNullValue(
12256         llvm::FixedVectorType::get(CGF.Builder.getInt1Ty(), NumElts));
12257   } else if (CC == 7) {
12258     Cmp = Constant::getAllOnesValue(
12259         llvm::FixedVectorType::get(CGF.Builder.getInt1Ty(), NumElts));
12260   } else {
12261     ICmpInst::Predicate Pred;
12262     switch (CC) {
12263     default: llvm_unreachable("Unknown condition code");
12264     case 0: Pred = ICmpInst::ICMP_EQ;  break;
12265     case 1: Pred = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break;
12266     case 2: Pred = Signed ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break;
12267     case 4: Pred = ICmpInst::ICMP_NE;  break;
12268     case 5: Pred = Signed ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break;
12269     case 6: Pred = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break;
12270     }
12271     Cmp = CGF.Builder.CreateICmp(Pred, Ops[0], Ops[1]);
12272   }
12273 
12274   Value *MaskIn = nullptr;
12275   if (Ops.size() == 4)
12276     MaskIn = Ops[3];
12277 
12278   return EmitX86MaskedCompareResult(CGF, Cmp, NumElts, MaskIn);
12279 }
12280 
12281 static Value *EmitX86ConvertToMask(CodeGenFunction &CGF, Value *In) {
12282   Value *Zero = Constant::getNullValue(In->getType());
12283   return EmitX86MaskedCompare(CGF, 1, true, { In, Zero });
12284 }
12285 
12286 static Value *EmitX86ConvertIntToFp(CodeGenFunction &CGF, const CallExpr *E,
12287                                     ArrayRef<Value *> Ops, bool IsSigned) {
12288   unsigned Rnd = cast<llvm::ConstantInt>(Ops[3])->getZExtValue();
12289   llvm::Type *Ty = Ops[1]->getType();
12290 
12291   Value *Res;
12292   if (Rnd != 4) {
12293     Intrinsic::ID IID = IsSigned ? Intrinsic::x86_avx512_sitofp_round
12294                                  : Intrinsic::x86_avx512_uitofp_round;
12295     Function *F = CGF.CGM.getIntrinsic(IID, { Ty, Ops[0]->getType() });
12296     Res = CGF.Builder.CreateCall(F, { Ops[0], Ops[3] });
12297   } else {
12298     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
12299     Res = IsSigned ? CGF.Builder.CreateSIToFP(Ops[0], Ty)
12300                    : CGF.Builder.CreateUIToFP(Ops[0], Ty);
12301   }
12302 
12303   return EmitX86Select(CGF, Ops[2], Res, Ops[1]);
12304 }
12305 
// Lowers X86 FMA intrinsics to IR.
//
// Ops[0], Ops[1] and Ops[2] are the three FMA operands (A, B, C). For the
// masked 512-bit builtins handled below, Ops[3] is the mask and Ops.back() is
// the rounding-mode operand.
//
// The lowering happens in three steps:
//  1. Map the builtin to a target intrinsic (if any) and note whether the
//     third operand has to be negated (the vfmsub*/vfmsubadd* "_mask3" forms).
//  2. Emit either that target intrinsic or a generic (possibly constrained)
//     llvm.fma call.
//  3. Apply the mask with a pass-through value that depends on the builtin
//     flavour (_mask, _maskz or _mask3).
static Value *EmitX86FMAExpr(CodeGenFunction &CGF, const CallExpr *E,
                             ArrayRef<Value *> Ops, unsigned BuiltinID,
                             bool IsAddSub) {

  // Step 1: pick the intrinsic. Each "Subtract = true" case deliberately falls
  // through into the matching add form, so both share one intrinsic ID.
  // Builtins not listed here leave IID as not_intrinsic.
  bool Subtract = false;
  Intrinsic::ID IID = Intrinsic::not_intrinsic;
  switch (BuiltinID) {
  default: break;
  case clang::X86::BI__builtin_ia32_vfmsubph512_mask3:
    Subtract = true;
    LLVM_FALLTHROUGH;
  case clang::X86::BI__builtin_ia32_vfmaddph512_mask:
  case clang::X86::BI__builtin_ia32_vfmaddph512_maskz:
  case clang::X86::BI__builtin_ia32_vfmaddph512_mask3:
    IID = llvm::Intrinsic::x86_avx512fp16_vfmadd_ph_512;
    break;
  case clang::X86::BI__builtin_ia32_vfmsubaddph512_mask3:
    Subtract = true;
    LLVM_FALLTHROUGH;
  case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask:
  case clang::X86::BI__builtin_ia32_vfmaddsubph512_maskz:
  case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask3:
    IID = llvm::Intrinsic::x86_avx512fp16_vfmaddsub_ph_512;
    break;
  case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
    Subtract = true;
    LLVM_FALLTHROUGH;
  case clang::X86::BI__builtin_ia32_vfmaddps512_mask:
  case clang::X86::BI__builtin_ia32_vfmaddps512_maskz:
  case clang::X86::BI__builtin_ia32_vfmaddps512_mask3:
    IID = llvm::Intrinsic::x86_avx512_vfmadd_ps_512; break;
  case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3:
    Subtract = true;
    LLVM_FALLTHROUGH;
  case clang::X86::BI__builtin_ia32_vfmaddpd512_mask:
  case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz:
  case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3:
    IID = llvm::Intrinsic::x86_avx512_vfmadd_pd_512; break;
  case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3:
    Subtract = true;
    LLVM_FALLTHROUGH;
  case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
  case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
  case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
    IID = llvm::Intrinsic::x86_avx512_vfmaddsub_ps_512;
    break;
  case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
    Subtract = true;
    LLVM_FALLTHROUGH;
  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
    IID = llvm::Intrinsic::x86_avx512_vfmaddsub_pd_512;
    break;
  }

  Value *A = Ops[0];
  Value *B = Ops[1];
  Value *C = Ops[2];

  // The subtract forms reuse the add intrinsic with a negated third operand.
  // Only the local C is negated; Ops[2] keeps the original value.
  if (Subtract)
    C = CGF.Builder.CreateFNeg(C);

  Value *Res;

  // Step 2: emit the operation. The target intrinsic is used when one was
  // selected and either an explicit rounding mode was requested or this is an
  // add/sub form; everything else becomes a generic fma call. Ops.back() is
  // only inspected when an intrinsic was selected (short-circuit evaluation).
  // Only handle in case of _MM_FROUND_CUR_DIRECTION/4 (no rounding).
  if (IID != Intrinsic::not_intrinsic &&
      (cast<llvm::ConstantInt>(Ops.back())->getZExtValue() != (uint64_t)4 ||
       IsAddSub)) {
    Function *Intr = CGF.CGM.getIntrinsic(IID);
    Res = CGF.Builder.CreateCall(Intr, {A, B, C, Ops.back() });
  } else {
    llvm::Type *Ty = A->getType();
    Function *FMA;
    if (CGF.Builder.getIsFPConstrained()) {
      // In constrained-FP mode emit the constrained fma intrinsic, with the
      // FP options attached to the call expression in effect.
      CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
      FMA = CGF.CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, Ty);
      Res = CGF.Builder.CreateConstrainedFPCall(FMA, {A, B, C});
    } else {
      FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty);
      Res = CGF.Builder.CreateCall(FMA, {A, B, C});
    }
  }

  // Step 3: choose the value used for masked-off lanes:
  //   _mask  -> the first operand (Ops[0]),
  //   _maskz -> zero,
  //   _mask3 -> the third operand as passed in (Ops[2], not the negated C).
  // Builtins without a case here are returned unmasked.
  // Handle any required masking.
  Value *MaskFalseVal = nullptr;
  switch (BuiltinID) {
  case clang::X86::BI__builtin_ia32_vfmaddph512_mask:
  case clang::X86::BI__builtin_ia32_vfmaddps512_mask:
  case clang::X86::BI__builtin_ia32_vfmaddpd512_mask:
  case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask:
  case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
    MaskFalseVal = Ops[0];
    break;
  case clang::X86::BI__builtin_ia32_vfmaddph512_maskz:
  case clang::X86::BI__builtin_ia32_vfmaddps512_maskz:
  case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz:
  case clang::X86::BI__builtin_ia32_vfmaddsubph512_maskz:
  case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
    MaskFalseVal = Constant::getNullValue(Ops[0]->getType());
    break;
  case clang::X86::BI__builtin_ia32_vfmsubph512_mask3:
  case clang::X86::BI__builtin_ia32_vfmaddph512_mask3:
  case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
  case clang::X86::BI__builtin_ia32_vfmaddps512_mask3:
  case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3:
  case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3:
  case clang::X86::BI__builtin_ia32_vfmsubaddph512_mask3:
  case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask3:
  case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3:
  case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
  case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
    MaskFalseVal = Ops[2];
    break;
  }

  // Ops[3] is the mask operand of the masked builtins listed above.
  if (MaskFalseVal)
    return EmitX86Select(CGF, Ops[3], Res, MaskFalseVal);

  return Res;
}
12431 
// Lowers a scalar (element 0 only) X86 FMA builtin to IR.
//
// Ops[0..2] are the vector operands whose element 0 feeds the FMA. When
// present, Ops[3] is the mask and Ops[4] the constant rounding-mode operand.
// The scalar result is inserted into element 0 of \p Upper, which is returned.
//
// \param ZeroMask  use zero instead of Ops[PTIdx] as the masked-off value.
// \param PTIdx     index into Ops of the pass-through operand.
// \param NegAcc    negate the third operand before the FMA.
//
// Note that Ops is mutable and is overwritten in place: Ops[2] may be negated
// and Ops[0..2] are replaced by their extracted element 0.
static Value *EmitScalarFMAExpr(CodeGenFunction &CGF, const CallExpr *E,
                                MutableArrayRef<Value *> Ops, Value *Upper,
                                bool ZeroMask = false, unsigned PTIdx = 0,
                                bool NegAcc = false) {
  // Without an explicit rounding operand, behave as if it were 4, which
  // selects the generic fma paths below.
  unsigned Rnd = 4;
  if (Ops.size() > 4)
    Rnd = cast<llvm::ConstantInt>(Ops[4])->getZExtValue();

  // The negation is applied to the whole vector, before element 0 is
  // extracted.
  if (NegAcc)
    Ops[2] = CGF.Builder.CreateFNeg(Ops[2]);

  // From here on Ops[0..2] hold scalars rather than vectors.
  Ops[0] = CGF.Builder.CreateExtractElement(Ops[0], (uint64_t)0);
  Ops[1] = CGF.Builder.CreateExtractElement(Ops[1], (uint64_t)0);
  Ops[2] = CGF.Builder.CreateExtractElement(Ops[2], (uint64_t)0);
  Value *Res;
  if (Rnd != 4) {
    // An explicit rounding mode needs the target intrinsic, chosen by the
    // bit width of the scalar type.
    Intrinsic::ID IID;

    switch (Ops[0]->getType()->getPrimitiveSizeInBits()) {
    case 16:
      IID = Intrinsic::x86_avx512fp16_vfmadd_f16;
      break;
    case 32:
      IID = Intrinsic::x86_avx512_vfmadd_f32;
      break;
    case 64:
      IID = Intrinsic::x86_avx512_vfmadd_f64;
      break;
    default:
      llvm_unreachable("Unexpected size");
    }
    Res = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID),
                                 {Ops[0], Ops[1], Ops[2], Ops[4]});
  } else if (CGF.Builder.getIsFPConstrained()) {
    // Constrained-FP mode: emit the constrained fma intrinsic with the FP
    // options attached to the call expression in effect.
    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
    Function *FMA = CGF.CGM.getIntrinsic(
        Intrinsic::experimental_constrained_fma, Ops[0]->getType());
    Res = CGF.Builder.CreateConstrainedFPCall(FMA, Ops.slice(0, 3));
  } else {
    Function *FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ops[0]->getType());
    Res = CGF.Builder.CreateCall(FMA, Ops.slice(0, 3));
  }
  // If we have more than 3 arguments, we need to do masking.
  if (Ops.size() > 3) {
    // Ops[PTIdx] is read after the in-place rewrite above, so for PTIdx in
    // [0, 2] it is already the extracted scalar.
    Value *PassThru = ZeroMask ? Constant::getNullValue(Res->getType())
                               : Ops[PTIdx];

    // If we negated the accumulator and it is the PassThru value we need to
    // bypass the negate. Conveniently Upper should be the same thing in this
    // case.
    if (NegAcc && PTIdx == 2)
      PassThru = CGF.Builder.CreateExtractElement(Upper, (uint64_t)0);

    Res = EmitX86ScalarSelect(CGF, Ops[3], Res, PassThru);
  }
  return CGF.Builder.CreateInsertElement(Upper, Res, (uint64_t)0);
}
12489 
12490 static Value *EmitX86Muldq(CodeGenFunction &CGF, bool IsSigned,
12491                            ArrayRef<Value *> Ops) {
12492   llvm::Type *Ty = Ops[0]->getType();
12493   // Arguments have a vXi32 type so cast to vXi64.
12494   Ty = llvm::FixedVectorType::get(CGF.Int64Ty,
12495                                   Ty->getPrimitiveSizeInBits() / 64);
12496   Value *LHS = CGF.Builder.CreateBitCast(Ops[0], Ty);
12497   Value *RHS = CGF.Builder.CreateBitCast(Ops[1], Ty);
12498 
12499   if (IsSigned) {
12500     // Shift left then arithmetic shift right.
12501     Constant *ShiftAmt = ConstantInt::get(Ty, 32);
12502     LHS = CGF.Builder.CreateShl(LHS, ShiftAmt);
12503     LHS = CGF.Builder.CreateAShr(LHS, ShiftAmt);
12504     RHS = CGF.Builder.CreateShl(RHS, ShiftAmt);
12505     RHS = CGF.Builder.CreateAShr(RHS, ShiftAmt);
12506   } else {
12507     // Clear the upper bits.
12508     Constant *Mask = ConstantInt::get(Ty, 0xffffffff);
12509     LHS = CGF.Builder.CreateAnd(LHS, Mask);
12510     RHS = CGF.Builder.CreateAnd(RHS, Mask);
12511   }
12512 
12513   return CGF.Builder.CreateMul(LHS, RHS);
12514 }
12515 
12516 // Emit a masked pternlog intrinsic. This only exists because the header has to
12517 // use a macro and we aren't able to pass the input argument to a pternlog
12518 // builtin and a select builtin without evaluating it twice.
12519 static Value *EmitX86Ternlog(CodeGenFunction &CGF, bool ZeroMask,
12520                              ArrayRef<Value *> Ops) {
12521   llvm::Type *Ty = Ops[0]->getType();
12522 
12523   unsigned VecWidth = Ty->getPrimitiveSizeInBits();
12524   unsigned EltWidth = Ty->getScalarSizeInBits();
12525   Intrinsic::ID IID;
12526   if (VecWidth == 128 && EltWidth == 32)
12527     IID = Intrinsic::x86_avx512_pternlog_d_128;
12528   else if (VecWidth == 256 && EltWidth == 32)
12529     IID = Intrinsic::x86_avx512_pternlog_d_256;
12530   else if (VecWidth == 512 && EltWidth == 32)
12531     IID = Intrinsic::x86_avx512_pternlog_d_512;
12532   else if (VecWidth == 128 && EltWidth == 64)
12533     IID = Intrinsic::x86_avx512_pternlog_q_128;
12534   else if (VecWidth == 256 && EltWidth == 64)
12535     IID = Intrinsic::x86_avx512_pternlog_q_256;
12536   else if (VecWidth == 512 && EltWidth == 64)
12537     IID = Intrinsic::x86_avx512_pternlog_q_512;
12538   else
12539     llvm_unreachable("Unexpected intrinsic");
12540 
12541   Value *Ternlog = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID),
12542                                           Ops.drop_back());
12543   Value *PassThru = ZeroMask ? ConstantAggregateZero::get(Ty) : Ops[0];
12544   return EmitX86Select(CGF, Ops[4], Ternlog, PassThru);
12545 }
12546 
12547 static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op,
12548                               llvm::Type *DstTy) {
12549   unsigned NumberOfElements =
12550       cast<llvm::FixedVectorType>(DstTy)->getNumElements();
12551   Value *Mask = getMaskVecValue(CGF, Op, NumberOfElements);
12552   return CGF.Builder.CreateSExt(Mask, DstTy, "vpmovm2");
12553 }
12554 
12555 Value *CodeGenFunction::EmitX86CpuIs(const CallExpr *E) {
12556   const Expr *CPUExpr = E->getArg(0)->IgnoreParenCasts();
12557   StringRef CPUStr = cast<clang::StringLiteral>(CPUExpr)->getString();
12558   return EmitX86CpuIs(CPUStr);
12559 }
12560 
12561 // Convert F16 halfs to floats.
12562 static Value *EmitX86CvtF16ToFloatExpr(CodeGenFunction &CGF,
12563                                        ArrayRef<Value *> Ops,
12564                                        llvm::Type *DstTy) {
12565   assert((Ops.size() == 1 || Ops.size() == 3 || Ops.size() == 4) &&
12566          "Unknown cvtph2ps intrinsic");
12567 
12568   // If the SAE intrinsic doesn't use default rounding then we can't upgrade.
12569   if (Ops.size() == 4 && cast<llvm::ConstantInt>(Ops[3])->getZExtValue() != 4) {
12570     Function *F =
12571         CGF.CGM.getIntrinsic(Intrinsic::x86_avx512_mask_vcvtph2ps_512);
12572     return CGF.Builder.CreateCall(F, {Ops[0], Ops[1], Ops[2], Ops[3]});
12573   }
12574 
12575   unsigned NumDstElts = cast<llvm::FixedVectorType>(DstTy)->getNumElements();
12576   Value *Src = Ops[0];
12577 
12578   // Extract the subvector.
12579   if (NumDstElts !=
12580       cast<llvm::FixedVectorType>(Src->getType())->getNumElements()) {
12581     assert(NumDstElts == 4 && "Unexpected vector size");
12582     Src = CGF.Builder.CreateShuffleVector(Src, ArrayRef<int>{0, 1, 2, 3});
12583   }
12584 
12585   // Bitcast from vXi16 to vXf16.
12586   auto *HalfTy = llvm::FixedVectorType::get(
12587       llvm::Type::getHalfTy(CGF.getLLVMContext()), NumDstElts);
12588   Src = CGF.Builder.CreateBitCast(Src, HalfTy);
12589 
12590   // Perform the fp-extension.
12591   Value *Res = CGF.Builder.CreateFPExt(Src, DstTy, "cvtph2ps");
12592 
12593   if (Ops.size() >= 3)
12594     Res = EmitX86Select(CGF, Ops[2], Res, Ops[1]);
12595   return Res;
12596 }
12597 
12598 // Convert a BF16 to a float.
12599 static Value *EmitX86CvtBF16ToFloatExpr(CodeGenFunction &CGF,
12600                                         const CallExpr *E,
12601                                         ArrayRef<Value *> Ops) {
12602   llvm::Type *Int32Ty = CGF.Builder.getInt32Ty();
12603   Value *ZeroExt = CGF.Builder.CreateZExt(Ops[0], Int32Ty);
12604   Value *Shl = CGF.Builder.CreateShl(ZeroExt, 16);
12605   llvm::Type *ResultType = CGF.ConvertType(E->getType());
12606   Value *BitCast = CGF.Builder.CreateBitCast(Shl, ResultType);
12607   return BitCast;
12608 }
12609 
/// Emit an i1 that is true when the field of the runtime global __cpu_model
/// selected by \p CPUStr equals the enumerator associated with that string.
///
/// \p CPUStr is looked up among the vendor, CPU type (including aliases) and
/// CPU subtype names listed in X86TargetParser.def. An unknown string maps to
/// the value 0, which trips the assertion below.
Value *CodeGenFunction::EmitX86CpuIs(StringRef CPUStr) {

  llvm::Type *Int32Ty = Builder.getInt32Ty();

  // Matching the struct layout from the compiler-rt/libgcc structure that is
  // filled in:
  // unsigned int __cpu_vendor;
  // unsigned int __cpu_type;
  // unsigned int __cpu_subtype;
  // unsigned int __cpu_features[1];
  llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty,
                                          llvm::ArrayType::get(Int32Ty, 1));

  // Grab the global __cpu_model.
  llvm::Constant *CpuModel = CGM.CreateRuntimeVariable(STy, "__cpu_model");
  cast<llvm::GlobalValue>(CpuModel)->setDSOLocal(true);

  // Calculate the index needed to access the correct field based on the
  // range. Also adjust the expected value.
  //
  // Every entry of X86TargetParser.def expands to one .Case() that maps a name
  // to {struct field index, expected enumerator}: vendors use field 0, CPU
  // types and their aliases field 1, and CPU subtypes field 2.
  //
  // Note: the local variable 'Value' declared here hides the llvm::Value type
  // name for the rest of this function, which is why the type is spelled
  // llvm::Value below.
  unsigned Index;
  unsigned Value;
  std::tie(Index, Value) = StringSwitch<std::pair<unsigned, unsigned>>(CPUStr)
#define X86_VENDOR(ENUM, STRING)                                               \
  .Case(STRING, {0u, static_cast<unsigned>(llvm::X86::ENUM)})
#define X86_CPU_TYPE_ALIAS(ENUM, ALIAS)                                        \
  .Case(ALIAS, {1u, static_cast<unsigned>(llvm::X86::ENUM)})
#define X86_CPU_TYPE(ENUM, STR)                                                \
  .Case(STR, {1u, static_cast<unsigned>(llvm::X86::ENUM)})
#define X86_CPU_SUBTYPE(ENUM, STR)                                             \
  .Case(STR, {2u, static_cast<unsigned>(llvm::X86::ENUM)})
#include "llvm/Support/X86TargetParser.def"
                               .Default({0, 0});
  assert(Value != 0 && "Invalid CPUStr passed to CpuIs");

  // Grab the appropriate field from __cpu_model.
  llvm::Value *Idxs[] = {ConstantInt::get(Int32Ty, 0),
                         ConstantInt::get(Int32Ty, Index)};
  llvm::Value *CpuValue = Builder.CreateGEP(STy, CpuModel, Idxs);
  CpuValue = Builder.CreateAlignedLoad(Int32Ty, CpuValue,
                                       CharUnits::fromQuantity(4));

  // Check the value of the field against the requested value.
  return Builder.CreateICmpEQ(CpuValue,
                                  llvm::ConstantInt::get(Int32Ty, Value));
}
12655 
12656 Value *CodeGenFunction::EmitX86CpuSupports(const CallExpr *E) {
12657   const Expr *FeatureExpr = E->getArg(0)->IgnoreParenCasts();
12658   StringRef FeatureStr = cast<StringLiteral>(FeatureExpr)->getString();
12659   return EmitX86CpuSupports(FeatureStr);
12660 }
12661 
12662 Value *CodeGenFunction::EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs) {
12663   return EmitX86CpuSupports(llvm::X86::getCpuSupportsMask(FeatureStrs));
12664 }
12665 
12666 llvm::Value *CodeGenFunction::EmitX86CpuSupports(uint64_t FeaturesMask) {
12667   uint32_t Features1 = Lo_32(FeaturesMask);
12668   uint32_t Features2 = Hi_32(FeaturesMask);
12669 
12670   Value *Result = Builder.getTrue();
12671 
12672   if (Features1 != 0) {
12673     // Matching the struct layout from the compiler-rt/libgcc structure that is
12674     // filled in:
12675     // unsigned int __cpu_vendor;
12676     // unsigned int __cpu_type;
12677     // unsigned int __cpu_subtype;
12678     // unsigned int __cpu_features[1];
12679     llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty,
12680                                             llvm::ArrayType::get(Int32Ty, 1));
12681 
12682     // Grab the global __cpu_model.
12683     llvm::Constant *CpuModel = CGM.CreateRuntimeVariable(STy, "__cpu_model");
12684     cast<llvm::GlobalValue>(CpuModel)->setDSOLocal(true);
12685 
12686     // Grab the first (0th) element from the field __cpu_features off of the
12687     // global in the struct STy.
12688     Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(3),
12689                      Builder.getInt32(0)};
12690     Value *CpuFeatures = Builder.CreateGEP(STy, CpuModel, Idxs);
12691     Value *Features = Builder.CreateAlignedLoad(Int32Ty, CpuFeatures,
12692                                                 CharUnits::fromQuantity(4));
12693 
12694     // Check the value of the bit corresponding to the feature requested.
12695     Value *Mask = Builder.getInt32(Features1);
12696     Value *Bitset = Builder.CreateAnd(Features, Mask);
12697     Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
12698     Result = Builder.CreateAnd(Result, Cmp);
12699   }
12700 
12701   if (Features2 != 0) {
12702     llvm::Constant *CpuFeatures2 = CGM.CreateRuntimeVariable(Int32Ty,
12703                                                              "__cpu_features2");
12704     cast<llvm::GlobalValue>(CpuFeatures2)->setDSOLocal(true);
12705 
12706     Value *Features = Builder.CreateAlignedLoad(Int32Ty, CpuFeatures2,
12707                                                 CharUnits::fromQuantity(4));
12708 
12709     // Check the value of the bit corresponding to the feature requested.
12710     Value *Mask = Builder.getInt32(Features2);
12711     Value *Bitset = Builder.CreateAnd(Features, Mask);
12712     Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
12713     Result = Builder.CreateAnd(Result, Cmp);
12714   }
12715 
12716   return Result;
12717 }
12718 
12719 Value *CodeGenFunction::EmitX86CpuInit() {
12720   llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy,
12721                                                     /*Variadic*/ false);
12722   llvm::FunctionCallee Func =
12723       CGM.CreateRuntimeFunction(FTy, "__cpu_indicator_init");
12724   cast<llvm::GlobalValue>(Func.getCallee())->setDSOLocal(true);
12725   cast<llvm::GlobalValue>(Func.getCallee())
12726       ->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass);
12727   return Builder.CreateCall(Func);
12728 }
12729 
12730 Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
12731                                            const CallExpr *E) {
12732   if (BuiltinID == X86::BI__builtin_cpu_is)
12733     return EmitX86CpuIs(E);
12734   if (BuiltinID == X86::BI__builtin_cpu_supports)
12735     return EmitX86CpuSupports(E);
12736   if (BuiltinID == X86::BI__builtin_cpu_init)
12737     return EmitX86CpuInit();
12738 
12739   // Handle MSVC intrinsics before argument evaluation to prevent double
12740   // evaluation.
12741   if (Optional<MSVCIntrin> MsvcIntId = translateX86ToMsvcIntrin(BuiltinID))
12742     return EmitMSVCBuiltinExpr(*MsvcIntId, E);
12743 
12744   SmallVector<Value*, 4> Ops;
12745   bool IsMaskFCmp = false;
12746   bool IsConjFMA = false;
12747 
12748   // Find out if any arguments are required to be integer constant expressions.
12749   unsigned ICEArguments = 0;
12750   ASTContext::GetBuiltinTypeError Error;
12751   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
12752   assert(Error == ASTContext::GE_None && "Should not codegen an error");
12753 
12754   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
12755     // If this is a normal argument, just emit it as a scalar.
12756     if ((ICEArguments & (1 << i)) == 0) {
12757       Ops.push_back(EmitScalarExpr(E->getArg(i)));
12758       continue;
12759     }
12760 
12761     // If this is required to be a constant, constant fold it so that we know
12762     // that the generated intrinsic gets a ConstantInt.
12763     Ops.push_back(llvm::ConstantInt::get(
12764         getLLVMContext(), *E->getArg(i)->getIntegerConstantExpr(getContext())));
12765   }
12766 
12767   // These exist so that the builtin that takes an immediate can be bounds
12768   // checked by clang to avoid passing bad immediates to the backend. Since
12769   // AVX has a larger immediate than SSE we would need separate builtins to
12770   // do the different bounds checking. Rather than create a clang specific
12771   // SSE only builtin, this implements eight separate builtins to match gcc
12772   // implementation.
12773   auto getCmpIntrinsicCall = [this, &Ops](Intrinsic::ID ID, unsigned Imm) {
12774     Ops.push_back(llvm::ConstantInt::get(Int8Ty, Imm));
12775     llvm::Function *F = CGM.getIntrinsic(ID);
12776     return Builder.CreateCall(F, Ops);
12777   };
12778 
12779   // For the vector forms of FP comparisons, translate the builtins directly to
12780   // IR.
12781   // TODO: The builtins could be removed if the SSE header files used vector
12782   // extension comparisons directly (vector ordered/unordered may need
12783   // additional support via __builtin_isnan()).
12784   auto getVectorFCmpIR = [this, &Ops, E](CmpInst::Predicate Pred,
12785                                          bool IsSignaling) {
12786     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
12787     Value *Cmp;
12788     if (IsSignaling)
12789       Cmp = Builder.CreateFCmpS(Pred, Ops[0], Ops[1]);
12790     else
12791       Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]);
12792     llvm::VectorType *FPVecTy = cast<llvm::VectorType>(Ops[0]->getType());
12793     llvm::VectorType *IntVecTy = llvm::VectorType::getInteger(FPVecTy);
12794     Value *Sext = Builder.CreateSExt(Cmp, IntVecTy);
12795     return Builder.CreateBitCast(Sext, FPVecTy);
12796   };
12797 
12798   switch (BuiltinID) {
12799   default: return nullptr;
12800   case X86::BI_mm_prefetch: {
12801     Value *Address = Ops[0];
12802     ConstantInt *C = cast<ConstantInt>(Ops[1]);
12803     Value *RW = ConstantInt::get(Int32Ty, (C->getZExtValue() >> 2) & 0x1);
12804     Value *Locality = ConstantInt::get(Int32Ty, C->getZExtValue() & 0x3);
12805     Value *Data = ConstantInt::get(Int32Ty, 1);
12806     Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType());
12807     return Builder.CreateCall(F, {Address, RW, Locality, Data});
12808   }
12809   case X86::BI_mm_clflush: {
12810     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_clflush),
12811                               Ops[0]);
12812   }
12813   case X86::BI_mm_lfence: {
12814     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_lfence));
12815   }
12816   case X86::BI_mm_mfence: {
12817     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_mfence));
12818   }
12819   case X86::BI_mm_sfence: {
12820     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_sfence));
12821   }
12822   case X86::BI_mm_pause: {
12823     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_pause));
12824   }
12825   case X86::BI__rdtsc: {
12826     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_rdtsc));
12827   }
12828   case X86::BI__builtin_ia32_rdtscp: {
12829     Value *Call = Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_rdtscp));
12830     Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 1),
12831                                       Ops[0]);
12832     return Builder.CreateExtractValue(Call, 0);
12833   }
12834   case X86::BI__builtin_ia32_lzcnt_u16:
12835   case X86::BI__builtin_ia32_lzcnt_u32:
12836   case X86::BI__builtin_ia32_lzcnt_u64: {
12837     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType());
12838     return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
12839   }
12840   case X86::BI__builtin_ia32_tzcnt_u16:
12841   case X86::BI__builtin_ia32_tzcnt_u32:
12842   case X86::BI__builtin_ia32_tzcnt_u64: {
12843     Function *F = CGM.getIntrinsic(Intrinsic::cttz, Ops[0]->getType());
12844     return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
12845   }
12846   case X86::BI__builtin_ia32_undef128:
12847   case X86::BI__builtin_ia32_undef256:
12848   case X86::BI__builtin_ia32_undef512:
12849     // The x86 definition of "undef" is not the same as the LLVM definition
12850     // (PR32176). We leave optimizing away an unnecessary zero constant to the
12851     // IR optimizer and backend.
12852     // TODO: If we had a "freeze" IR instruction to generate a fixed undef
12853     // value, we should use that here instead of a zero.
12854     return llvm::Constant::getNullValue(ConvertType(E->getType()));
12855   case X86::BI__builtin_ia32_vec_init_v8qi:
12856   case X86::BI__builtin_ia32_vec_init_v4hi:
12857   case X86::BI__builtin_ia32_vec_init_v2si:
12858     return Builder.CreateBitCast(BuildVector(Ops),
12859                                  llvm::Type::getX86_MMXTy(getLLVMContext()));
12860   case X86::BI__builtin_ia32_vec_ext_v2si:
12861   case X86::BI__builtin_ia32_vec_ext_v16qi:
12862   case X86::BI__builtin_ia32_vec_ext_v8hi:
12863   case X86::BI__builtin_ia32_vec_ext_v4si:
12864   case X86::BI__builtin_ia32_vec_ext_v4sf:
12865   case X86::BI__builtin_ia32_vec_ext_v2di:
12866   case X86::BI__builtin_ia32_vec_ext_v32qi:
12867   case X86::BI__builtin_ia32_vec_ext_v16hi:
12868   case X86::BI__builtin_ia32_vec_ext_v8si:
12869   case X86::BI__builtin_ia32_vec_ext_v4di: {
12870     unsigned NumElts =
12871         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
12872     uint64_t Index = cast<ConstantInt>(Ops[1])->getZExtValue();
12873     Index &= NumElts - 1;
12874     // These builtins exist so we can ensure the index is an ICE and in range.
12875     // Otherwise we could just do this in the header file.
12876     return Builder.CreateExtractElement(Ops[0], Index);
12877   }
12878   case X86::BI__builtin_ia32_vec_set_v16qi:
12879   case X86::BI__builtin_ia32_vec_set_v8hi:
12880   case X86::BI__builtin_ia32_vec_set_v4si:
12881   case X86::BI__builtin_ia32_vec_set_v2di:
12882   case X86::BI__builtin_ia32_vec_set_v32qi:
12883   case X86::BI__builtin_ia32_vec_set_v16hi:
12884   case X86::BI__builtin_ia32_vec_set_v8si:
12885   case X86::BI__builtin_ia32_vec_set_v4di: {
12886     unsigned NumElts =
12887         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
12888     unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue();
12889     Index &= NumElts - 1;
12890     // These builtins exist so we can ensure the index is an ICE and in range.
12891     // Otherwise we could just do this in the header file.
12892     return Builder.CreateInsertElement(Ops[0], Ops[1], Index);
12893   }
12894   case X86::BI_mm_setcsr:
12895   case X86::BI__builtin_ia32_ldmxcsr: {
12896     Address Tmp = CreateMemTemp(E->getArg(0)->getType());
12897     Builder.CreateStore(Ops[0], Tmp);
12898     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_ldmxcsr),
12899                           Builder.CreateBitCast(Tmp.getPointer(), Int8PtrTy));
12900   }
12901   case X86::BI_mm_getcsr:
12902   case X86::BI__builtin_ia32_stmxcsr: {
12903     Address Tmp = CreateMemTemp(E->getType());
12904     Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_stmxcsr),
12905                        Builder.CreateBitCast(Tmp.getPointer(), Int8PtrTy));
12906     return Builder.CreateLoad(Tmp, "stmxcsr");
12907   }
12908   case X86::BI__builtin_ia32_xsave:
12909   case X86::BI__builtin_ia32_xsave64:
12910   case X86::BI__builtin_ia32_xrstor:
12911   case X86::BI__builtin_ia32_xrstor64:
12912   case X86::BI__builtin_ia32_xsaveopt:
12913   case X86::BI__builtin_ia32_xsaveopt64:
12914   case X86::BI__builtin_ia32_xrstors:
12915   case X86::BI__builtin_ia32_xrstors64:
12916   case X86::BI__builtin_ia32_xsavec:
12917   case X86::BI__builtin_ia32_xsavec64:
12918   case X86::BI__builtin_ia32_xsaves:
12919   case X86::BI__builtin_ia32_xsaves64:
12920   case X86::BI__builtin_ia32_xsetbv:
12921   case X86::BI_xsetbv: {
12922     Intrinsic::ID ID;
12923 #define INTRINSIC_X86_XSAVE_ID(NAME) \
12924     case X86::BI__builtin_ia32_##NAME: \
12925       ID = Intrinsic::x86_##NAME; \
12926       break
12927     switch (BuiltinID) {
12928     default: llvm_unreachable("Unsupported intrinsic!");
12929     INTRINSIC_X86_XSAVE_ID(xsave);
12930     INTRINSIC_X86_XSAVE_ID(xsave64);
12931     INTRINSIC_X86_XSAVE_ID(xrstor);
12932     INTRINSIC_X86_XSAVE_ID(xrstor64);
12933     INTRINSIC_X86_XSAVE_ID(xsaveopt);
12934     INTRINSIC_X86_XSAVE_ID(xsaveopt64);
12935     INTRINSIC_X86_XSAVE_ID(xrstors);
12936     INTRINSIC_X86_XSAVE_ID(xrstors64);
12937     INTRINSIC_X86_XSAVE_ID(xsavec);
12938     INTRINSIC_X86_XSAVE_ID(xsavec64);
12939     INTRINSIC_X86_XSAVE_ID(xsaves);
12940     INTRINSIC_X86_XSAVE_ID(xsaves64);
12941     INTRINSIC_X86_XSAVE_ID(xsetbv);
12942     case X86::BI_xsetbv:
12943       ID = Intrinsic::x86_xsetbv;
12944       break;
12945     }
12946 #undef INTRINSIC_X86_XSAVE_ID
12947     Value *Mhi = Builder.CreateTrunc(
12948       Builder.CreateLShr(Ops[1], ConstantInt::get(Int64Ty, 32)), Int32Ty);
12949     Value *Mlo = Builder.CreateTrunc(Ops[1], Int32Ty);
12950     Ops[1] = Mhi;
12951     Ops.push_back(Mlo);
12952     return Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
12953   }
12954   case X86::BI__builtin_ia32_xgetbv:
12955   case X86::BI_xgetbv:
12956     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_xgetbv), Ops);
12957   case X86::BI__builtin_ia32_storedqudi128_mask:
12958   case X86::BI__builtin_ia32_storedqusi128_mask:
12959   case X86::BI__builtin_ia32_storedquhi128_mask:
12960   case X86::BI__builtin_ia32_storedquqi128_mask:
12961   case X86::BI__builtin_ia32_storeupd128_mask:
12962   case X86::BI__builtin_ia32_storeups128_mask:
12963   case X86::BI__builtin_ia32_storedqudi256_mask:
12964   case X86::BI__builtin_ia32_storedqusi256_mask:
12965   case X86::BI__builtin_ia32_storedquhi256_mask:
12966   case X86::BI__builtin_ia32_storedquqi256_mask:
12967   case X86::BI__builtin_ia32_storeupd256_mask:
12968   case X86::BI__builtin_ia32_storeups256_mask:
12969   case X86::BI__builtin_ia32_storedqudi512_mask:
12970   case X86::BI__builtin_ia32_storedqusi512_mask:
12971   case X86::BI__builtin_ia32_storedquhi512_mask:
12972   case X86::BI__builtin_ia32_storedquqi512_mask:
12973   case X86::BI__builtin_ia32_storeupd512_mask:
12974   case X86::BI__builtin_ia32_storeups512_mask:
12975     return EmitX86MaskedStore(*this, Ops, Align(1));
12976 
12977   case X86::BI__builtin_ia32_storesh128_mask:
12978   case X86::BI__builtin_ia32_storess128_mask:
12979   case X86::BI__builtin_ia32_storesd128_mask:
12980     return EmitX86MaskedStore(*this, Ops, Align(1));
12981 
12982   case X86::BI__builtin_ia32_vpopcntb_128:
12983   case X86::BI__builtin_ia32_vpopcntd_128:
12984   case X86::BI__builtin_ia32_vpopcntq_128:
12985   case X86::BI__builtin_ia32_vpopcntw_128:
12986   case X86::BI__builtin_ia32_vpopcntb_256:
12987   case X86::BI__builtin_ia32_vpopcntd_256:
12988   case X86::BI__builtin_ia32_vpopcntq_256:
12989   case X86::BI__builtin_ia32_vpopcntw_256:
12990   case X86::BI__builtin_ia32_vpopcntb_512:
12991   case X86::BI__builtin_ia32_vpopcntd_512:
12992   case X86::BI__builtin_ia32_vpopcntq_512:
12993   case X86::BI__builtin_ia32_vpopcntw_512: {
12994     llvm::Type *ResultType = ConvertType(E->getType());
12995     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ResultType);
12996     return Builder.CreateCall(F, Ops);
12997   }
12998   case X86::BI__builtin_ia32_cvtmask2b128:
12999   case X86::BI__builtin_ia32_cvtmask2b256:
13000   case X86::BI__builtin_ia32_cvtmask2b512:
13001   case X86::BI__builtin_ia32_cvtmask2w128:
13002   case X86::BI__builtin_ia32_cvtmask2w256:
13003   case X86::BI__builtin_ia32_cvtmask2w512:
13004   case X86::BI__builtin_ia32_cvtmask2d128:
13005   case X86::BI__builtin_ia32_cvtmask2d256:
13006   case X86::BI__builtin_ia32_cvtmask2d512:
13007   case X86::BI__builtin_ia32_cvtmask2q128:
13008   case X86::BI__builtin_ia32_cvtmask2q256:
13009   case X86::BI__builtin_ia32_cvtmask2q512:
13010     return EmitX86SExtMask(*this, Ops[0], ConvertType(E->getType()));
13011 
13012   case X86::BI__builtin_ia32_cvtb2mask128:
13013   case X86::BI__builtin_ia32_cvtb2mask256:
13014   case X86::BI__builtin_ia32_cvtb2mask512:
13015   case X86::BI__builtin_ia32_cvtw2mask128:
13016   case X86::BI__builtin_ia32_cvtw2mask256:
13017   case X86::BI__builtin_ia32_cvtw2mask512:
13018   case X86::BI__builtin_ia32_cvtd2mask128:
13019   case X86::BI__builtin_ia32_cvtd2mask256:
13020   case X86::BI__builtin_ia32_cvtd2mask512:
13021   case X86::BI__builtin_ia32_cvtq2mask128:
13022   case X86::BI__builtin_ia32_cvtq2mask256:
13023   case X86::BI__builtin_ia32_cvtq2mask512:
13024     return EmitX86ConvertToMask(*this, Ops[0]);
13025 
13026   case X86::BI__builtin_ia32_cvtdq2ps512_mask:
13027   case X86::BI__builtin_ia32_cvtqq2ps512_mask:
13028   case X86::BI__builtin_ia32_cvtqq2pd512_mask:
13029   case X86::BI__builtin_ia32_vcvtw2ph512_mask:
13030   case X86::BI__builtin_ia32_vcvtdq2ph512_mask:
13031   case X86::BI__builtin_ia32_vcvtqq2ph512_mask:
13032     return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ true);
13033   case X86::BI__builtin_ia32_cvtudq2ps512_mask:
13034   case X86::BI__builtin_ia32_cvtuqq2ps512_mask:
13035   case X86::BI__builtin_ia32_cvtuqq2pd512_mask:
13036   case X86::BI__builtin_ia32_vcvtuw2ph512_mask:
13037   case X86::BI__builtin_ia32_vcvtudq2ph512_mask:
13038   case X86::BI__builtin_ia32_vcvtuqq2ph512_mask:
13039     return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ false);
13040 
13041   case X86::BI__builtin_ia32_vfmaddss3:
13042   case X86::BI__builtin_ia32_vfmaddsd3:
13043   case X86::BI__builtin_ia32_vfmaddsh3_mask:
13044   case X86::BI__builtin_ia32_vfmaddss3_mask:
13045   case X86::BI__builtin_ia32_vfmaddsd3_mask:
13046     return EmitScalarFMAExpr(*this, E, Ops, Ops[0]);
13047   case X86::BI__builtin_ia32_vfmaddss:
13048   case X86::BI__builtin_ia32_vfmaddsd:
13049     return EmitScalarFMAExpr(*this, E, Ops,
13050                              Constant::getNullValue(Ops[0]->getType()));
13051   case X86::BI__builtin_ia32_vfmaddsh3_maskz:
13052   case X86::BI__builtin_ia32_vfmaddss3_maskz:
13053   case X86::BI__builtin_ia32_vfmaddsd3_maskz:
13054     return EmitScalarFMAExpr(*this, E, Ops, Ops[0], /*ZeroMask*/ true);
13055   case X86::BI__builtin_ia32_vfmaddsh3_mask3:
13056   case X86::BI__builtin_ia32_vfmaddss3_mask3:
13057   case X86::BI__builtin_ia32_vfmaddsd3_mask3:
13058     return EmitScalarFMAExpr(*this, E, Ops, Ops[2], /*ZeroMask*/ false, 2);
13059   case X86::BI__builtin_ia32_vfmsubsh3_mask3:
13060   case X86::BI__builtin_ia32_vfmsubss3_mask3:
13061   case X86::BI__builtin_ia32_vfmsubsd3_mask3:
13062     return EmitScalarFMAExpr(*this, E, Ops, Ops[2], /*ZeroMask*/ false, 2,
13063                              /*NegAcc*/ true);
13064   case X86::BI__builtin_ia32_vfmaddph:
13065   case X86::BI__builtin_ia32_vfmaddps:
13066   case X86::BI__builtin_ia32_vfmaddpd:
13067   case X86::BI__builtin_ia32_vfmaddph256:
13068   case X86::BI__builtin_ia32_vfmaddps256:
13069   case X86::BI__builtin_ia32_vfmaddpd256:
13070   case X86::BI__builtin_ia32_vfmaddph512_mask:
13071   case X86::BI__builtin_ia32_vfmaddph512_maskz:
13072   case X86::BI__builtin_ia32_vfmaddph512_mask3:
13073   case X86::BI__builtin_ia32_vfmaddps512_mask:
13074   case X86::BI__builtin_ia32_vfmaddps512_maskz:
13075   case X86::BI__builtin_ia32_vfmaddps512_mask3:
13076   case X86::BI__builtin_ia32_vfmsubps512_mask3:
13077   case X86::BI__builtin_ia32_vfmaddpd512_mask:
13078   case X86::BI__builtin_ia32_vfmaddpd512_maskz:
13079   case X86::BI__builtin_ia32_vfmaddpd512_mask3:
13080   case X86::BI__builtin_ia32_vfmsubpd512_mask3:
13081   case X86::BI__builtin_ia32_vfmsubph512_mask3:
13082     return EmitX86FMAExpr(*this, E, Ops, BuiltinID, /*IsAddSub*/ false);
13083   case X86::BI__builtin_ia32_vfmaddsubph512_mask:
13084   case X86::BI__builtin_ia32_vfmaddsubph512_maskz:
13085   case X86::BI__builtin_ia32_vfmaddsubph512_mask3:
13086   case X86::BI__builtin_ia32_vfmsubaddph512_mask3:
13087   case X86::BI__builtin_ia32_vfmaddsubps512_mask:
13088   case X86::BI__builtin_ia32_vfmaddsubps512_maskz:
13089   case X86::BI__builtin_ia32_vfmaddsubps512_mask3:
13090   case X86::BI__builtin_ia32_vfmsubaddps512_mask3:
13091   case X86::BI__builtin_ia32_vfmaddsubpd512_mask:
13092   case X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
13093   case X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
13094   case X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
13095     return EmitX86FMAExpr(*this, E, Ops, BuiltinID, /*IsAddSub*/ true);
13096 
13097   case X86::BI__builtin_ia32_movdqa32store128_mask:
13098   case X86::BI__builtin_ia32_movdqa64store128_mask:
13099   case X86::BI__builtin_ia32_storeaps128_mask:
13100   case X86::BI__builtin_ia32_storeapd128_mask:
13101   case X86::BI__builtin_ia32_movdqa32store256_mask:
13102   case X86::BI__builtin_ia32_movdqa64store256_mask:
13103   case X86::BI__builtin_ia32_storeaps256_mask:
13104   case X86::BI__builtin_ia32_storeapd256_mask:
13105   case X86::BI__builtin_ia32_movdqa32store512_mask:
13106   case X86::BI__builtin_ia32_movdqa64store512_mask:
13107   case X86::BI__builtin_ia32_storeaps512_mask:
13108   case X86::BI__builtin_ia32_storeapd512_mask:
13109     return EmitX86MaskedStore(
13110         *this, Ops,
13111         getContext().getTypeAlignInChars(E->getArg(1)->getType()).getAsAlign());
13112 
13113   case X86::BI__builtin_ia32_loadups128_mask:
13114   case X86::BI__builtin_ia32_loadups256_mask:
13115   case X86::BI__builtin_ia32_loadups512_mask:
13116   case X86::BI__builtin_ia32_loadupd128_mask:
13117   case X86::BI__builtin_ia32_loadupd256_mask:
13118   case X86::BI__builtin_ia32_loadupd512_mask:
13119   case X86::BI__builtin_ia32_loaddquqi128_mask:
13120   case X86::BI__builtin_ia32_loaddquqi256_mask:
13121   case X86::BI__builtin_ia32_loaddquqi512_mask:
13122   case X86::BI__builtin_ia32_loaddquhi128_mask:
13123   case X86::BI__builtin_ia32_loaddquhi256_mask:
13124   case X86::BI__builtin_ia32_loaddquhi512_mask:
13125   case X86::BI__builtin_ia32_loaddqusi128_mask:
13126   case X86::BI__builtin_ia32_loaddqusi256_mask:
13127   case X86::BI__builtin_ia32_loaddqusi512_mask:
13128   case X86::BI__builtin_ia32_loaddqudi128_mask:
13129   case X86::BI__builtin_ia32_loaddqudi256_mask:
13130   case X86::BI__builtin_ia32_loaddqudi512_mask:
13131     return EmitX86MaskedLoad(*this, Ops, Align(1));
13132 
13133   case X86::BI__builtin_ia32_loadsh128_mask:
13134   case X86::BI__builtin_ia32_loadss128_mask:
13135   case X86::BI__builtin_ia32_loadsd128_mask:
13136     return EmitX86MaskedLoad(*this, Ops, Align(1));
13137 
13138   case X86::BI__builtin_ia32_loadaps128_mask:
13139   case X86::BI__builtin_ia32_loadaps256_mask:
13140   case X86::BI__builtin_ia32_loadaps512_mask:
13141   case X86::BI__builtin_ia32_loadapd128_mask:
13142   case X86::BI__builtin_ia32_loadapd256_mask:
13143   case X86::BI__builtin_ia32_loadapd512_mask:
13144   case X86::BI__builtin_ia32_movdqa32load128_mask:
13145   case X86::BI__builtin_ia32_movdqa32load256_mask:
13146   case X86::BI__builtin_ia32_movdqa32load512_mask:
13147   case X86::BI__builtin_ia32_movdqa64load128_mask:
13148   case X86::BI__builtin_ia32_movdqa64load256_mask:
13149   case X86::BI__builtin_ia32_movdqa64load512_mask:
13150     return EmitX86MaskedLoad(
13151         *this, Ops,
13152         getContext().getTypeAlignInChars(E->getArg(1)->getType()).getAsAlign());
13153 
13154   case X86::BI__builtin_ia32_expandloaddf128_mask:
13155   case X86::BI__builtin_ia32_expandloaddf256_mask:
13156   case X86::BI__builtin_ia32_expandloaddf512_mask:
13157   case X86::BI__builtin_ia32_expandloadsf128_mask:
13158   case X86::BI__builtin_ia32_expandloadsf256_mask:
13159   case X86::BI__builtin_ia32_expandloadsf512_mask:
13160   case X86::BI__builtin_ia32_expandloaddi128_mask:
13161   case X86::BI__builtin_ia32_expandloaddi256_mask:
13162   case X86::BI__builtin_ia32_expandloaddi512_mask:
13163   case X86::BI__builtin_ia32_expandloadsi128_mask:
13164   case X86::BI__builtin_ia32_expandloadsi256_mask:
13165   case X86::BI__builtin_ia32_expandloadsi512_mask:
13166   case X86::BI__builtin_ia32_expandloadhi128_mask:
13167   case X86::BI__builtin_ia32_expandloadhi256_mask:
13168   case X86::BI__builtin_ia32_expandloadhi512_mask:
13169   case X86::BI__builtin_ia32_expandloadqi128_mask:
13170   case X86::BI__builtin_ia32_expandloadqi256_mask:
13171   case X86::BI__builtin_ia32_expandloadqi512_mask:
13172     return EmitX86ExpandLoad(*this, Ops);
13173 
13174   case X86::BI__builtin_ia32_compressstoredf128_mask:
13175   case X86::BI__builtin_ia32_compressstoredf256_mask:
13176   case X86::BI__builtin_ia32_compressstoredf512_mask:
13177   case X86::BI__builtin_ia32_compressstoresf128_mask:
13178   case X86::BI__builtin_ia32_compressstoresf256_mask:
13179   case X86::BI__builtin_ia32_compressstoresf512_mask:
13180   case X86::BI__builtin_ia32_compressstoredi128_mask:
13181   case X86::BI__builtin_ia32_compressstoredi256_mask:
13182   case X86::BI__builtin_ia32_compressstoredi512_mask:
13183   case X86::BI__builtin_ia32_compressstoresi128_mask:
13184   case X86::BI__builtin_ia32_compressstoresi256_mask:
13185   case X86::BI__builtin_ia32_compressstoresi512_mask:
13186   case X86::BI__builtin_ia32_compressstorehi128_mask:
13187   case X86::BI__builtin_ia32_compressstorehi256_mask:
13188   case X86::BI__builtin_ia32_compressstorehi512_mask:
13189   case X86::BI__builtin_ia32_compressstoreqi128_mask:
13190   case X86::BI__builtin_ia32_compressstoreqi256_mask:
13191   case X86::BI__builtin_ia32_compressstoreqi512_mask:
13192     return EmitX86CompressStore(*this, Ops);
13193 
13194   case X86::BI__builtin_ia32_expanddf128_mask:
13195   case X86::BI__builtin_ia32_expanddf256_mask:
13196   case X86::BI__builtin_ia32_expanddf512_mask:
13197   case X86::BI__builtin_ia32_expandsf128_mask:
13198   case X86::BI__builtin_ia32_expandsf256_mask:
13199   case X86::BI__builtin_ia32_expandsf512_mask:
13200   case X86::BI__builtin_ia32_expanddi128_mask:
13201   case X86::BI__builtin_ia32_expanddi256_mask:
13202   case X86::BI__builtin_ia32_expanddi512_mask:
13203   case X86::BI__builtin_ia32_expandsi128_mask:
13204   case X86::BI__builtin_ia32_expandsi256_mask:
13205   case X86::BI__builtin_ia32_expandsi512_mask:
13206   case X86::BI__builtin_ia32_expandhi128_mask:
13207   case X86::BI__builtin_ia32_expandhi256_mask:
13208   case X86::BI__builtin_ia32_expandhi512_mask:
13209   case X86::BI__builtin_ia32_expandqi128_mask:
13210   case X86::BI__builtin_ia32_expandqi256_mask:
13211   case X86::BI__builtin_ia32_expandqi512_mask:
13212     return EmitX86CompressExpand(*this, Ops, /*IsCompress*/false);
13213 
13214   case X86::BI__builtin_ia32_compressdf128_mask:
13215   case X86::BI__builtin_ia32_compressdf256_mask:
13216   case X86::BI__builtin_ia32_compressdf512_mask:
13217   case X86::BI__builtin_ia32_compresssf128_mask:
13218   case X86::BI__builtin_ia32_compresssf256_mask:
13219   case X86::BI__builtin_ia32_compresssf512_mask:
13220   case X86::BI__builtin_ia32_compressdi128_mask:
13221   case X86::BI__builtin_ia32_compressdi256_mask:
13222   case X86::BI__builtin_ia32_compressdi512_mask:
13223   case X86::BI__builtin_ia32_compresssi128_mask:
13224   case X86::BI__builtin_ia32_compresssi256_mask:
13225   case X86::BI__builtin_ia32_compresssi512_mask:
13226   case X86::BI__builtin_ia32_compresshi128_mask:
13227   case X86::BI__builtin_ia32_compresshi256_mask:
13228   case X86::BI__builtin_ia32_compresshi512_mask:
13229   case X86::BI__builtin_ia32_compressqi128_mask:
13230   case X86::BI__builtin_ia32_compressqi256_mask:
13231   case X86::BI__builtin_ia32_compressqi512_mask:
13232     return EmitX86CompressExpand(*this, Ops, /*IsCompress*/true);
13233 
13234   case X86::BI__builtin_ia32_gather3div2df:
13235   case X86::BI__builtin_ia32_gather3div2di:
13236   case X86::BI__builtin_ia32_gather3div4df:
13237   case X86::BI__builtin_ia32_gather3div4di:
13238   case X86::BI__builtin_ia32_gather3div4sf:
13239   case X86::BI__builtin_ia32_gather3div4si:
13240   case X86::BI__builtin_ia32_gather3div8sf:
13241   case X86::BI__builtin_ia32_gather3div8si:
13242   case X86::BI__builtin_ia32_gather3siv2df:
13243   case X86::BI__builtin_ia32_gather3siv2di:
13244   case X86::BI__builtin_ia32_gather3siv4df:
13245   case X86::BI__builtin_ia32_gather3siv4di:
13246   case X86::BI__builtin_ia32_gather3siv4sf:
13247   case X86::BI__builtin_ia32_gather3siv4si:
13248   case X86::BI__builtin_ia32_gather3siv8sf:
13249   case X86::BI__builtin_ia32_gather3siv8si:
13250   case X86::BI__builtin_ia32_gathersiv8df:
13251   case X86::BI__builtin_ia32_gathersiv16sf:
13252   case X86::BI__builtin_ia32_gatherdiv8df:
13253   case X86::BI__builtin_ia32_gatherdiv16sf:
13254   case X86::BI__builtin_ia32_gathersiv8di:
13255   case X86::BI__builtin_ia32_gathersiv16si:
13256   case X86::BI__builtin_ia32_gatherdiv8di:
13257   case X86::BI__builtin_ia32_gatherdiv16si: {
13258     Intrinsic::ID IID;
13259     switch (BuiltinID) {
13260     default: llvm_unreachable("Unexpected builtin");
13261     case X86::BI__builtin_ia32_gather3div2df:
13262       IID = Intrinsic::x86_avx512_mask_gather3div2_df;
13263       break;
13264     case X86::BI__builtin_ia32_gather3div2di:
13265       IID = Intrinsic::x86_avx512_mask_gather3div2_di;
13266       break;
13267     case X86::BI__builtin_ia32_gather3div4df:
13268       IID = Intrinsic::x86_avx512_mask_gather3div4_df;
13269       break;
13270     case X86::BI__builtin_ia32_gather3div4di:
13271       IID = Intrinsic::x86_avx512_mask_gather3div4_di;
13272       break;
13273     case X86::BI__builtin_ia32_gather3div4sf:
13274       IID = Intrinsic::x86_avx512_mask_gather3div4_sf;
13275       break;
13276     case X86::BI__builtin_ia32_gather3div4si:
13277       IID = Intrinsic::x86_avx512_mask_gather3div4_si;
13278       break;
13279     case X86::BI__builtin_ia32_gather3div8sf:
13280       IID = Intrinsic::x86_avx512_mask_gather3div8_sf;
13281       break;
13282     case X86::BI__builtin_ia32_gather3div8si:
13283       IID = Intrinsic::x86_avx512_mask_gather3div8_si;
13284       break;
13285     case X86::BI__builtin_ia32_gather3siv2df:
13286       IID = Intrinsic::x86_avx512_mask_gather3siv2_df;
13287       break;
13288     case X86::BI__builtin_ia32_gather3siv2di:
13289       IID = Intrinsic::x86_avx512_mask_gather3siv2_di;
13290       break;
13291     case X86::BI__builtin_ia32_gather3siv4df:
13292       IID = Intrinsic::x86_avx512_mask_gather3siv4_df;
13293       break;
13294     case X86::BI__builtin_ia32_gather3siv4di:
13295       IID = Intrinsic::x86_avx512_mask_gather3siv4_di;
13296       break;
13297     case X86::BI__builtin_ia32_gather3siv4sf:
13298       IID = Intrinsic::x86_avx512_mask_gather3siv4_sf;
13299       break;
13300     case X86::BI__builtin_ia32_gather3siv4si:
13301       IID = Intrinsic::x86_avx512_mask_gather3siv4_si;
13302       break;
13303     case X86::BI__builtin_ia32_gather3siv8sf:
13304       IID = Intrinsic::x86_avx512_mask_gather3siv8_sf;
13305       break;
13306     case X86::BI__builtin_ia32_gather3siv8si:
13307       IID = Intrinsic::x86_avx512_mask_gather3siv8_si;
13308       break;
13309     case X86::BI__builtin_ia32_gathersiv8df:
13310       IID = Intrinsic::x86_avx512_mask_gather_dpd_512;
13311       break;
13312     case X86::BI__builtin_ia32_gathersiv16sf:
13313       IID = Intrinsic::x86_avx512_mask_gather_dps_512;
13314       break;
13315     case X86::BI__builtin_ia32_gatherdiv8df:
13316       IID = Intrinsic::x86_avx512_mask_gather_qpd_512;
13317       break;
13318     case X86::BI__builtin_ia32_gatherdiv16sf:
13319       IID = Intrinsic::x86_avx512_mask_gather_qps_512;
13320       break;
13321     case X86::BI__builtin_ia32_gathersiv8di:
13322       IID = Intrinsic::x86_avx512_mask_gather_dpq_512;
13323       break;
13324     case X86::BI__builtin_ia32_gathersiv16si:
13325       IID = Intrinsic::x86_avx512_mask_gather_dpi_512;
13326       break;
13327     case X86::BI__builtin_ia32_gatherdiv8di:
13328       IID = Intrinsic::x86_avx512_mask_gather_qpq_512;
13329       break;
13330     case X86::BI__builtin_ia32_gatherdiv16si:
13331       IID = Intrinsic::x86_avx512_mask_gather_qpi_512;
13332       break;
13333     }
13334 
13335     unsigned MinElts = std::min(
13336         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(),
13337         cast<llvm::FixedVectorType>(Ops[2]->getType())->getNumElements());
13338     Ops[3] = getMaskVecValue(*this, Ops[3], MinElts);
13339     Function *Intr = CGM.getIntrinsic(IID);
13340     return Builder.CreateCall(Intr, Ops);
13341   }
13342 
13343   case X86::BI__builtin_ia32_scattersiv8df:
13344   case X86::BI__builtin_ia32_scattersiv16sf:
13345   case X86::BI__builtin_ia32_scatterdiv8df:
13346   case X86::BI__builtin_ia32_scatterdiv16sf:
13347   case X86::BI__builtin_ia32_scattersiv8di:
13348   case X86::BI__builtin_ia32_scattersiv16si:
13349   case X86::BI__builtin_ia32_scatterdiv8di:
13350   case X86::BI__builtin_ia32_scatterdiv16si:
13351   case X86::BI__builtin_ia32_scatterdiv2df:
13352   case X86::BI__builtin_ia32_scatterdiv2di:
13353   case X86::BI__builtin_ia32_scatterdiv4df:
13354   case X86::BI__builtin_ia32_scatterdiv4di:
13355   case X86::BI__builtin_ia32_scatterdiv4sf:
13356   case X86::BI__builtin_ia32_scatterdiv4si:
13357   case X86::BI__builtin_ia32_scatterdiv8sf:
13358   case X86::BI__builtin_ia32_scatterdiv8si:
13359   case X86::BI__builtin_ia32_scattersiv2df:
13360   case X86::BI__builtin_ia32_scattersiv2di:
13361   case X86::BI__builtin_ia32_scattersiv4df:
13362   case X86::BI__builtin_ia32_scattersiv4di:
13363   case X86::BI__builtin_ia32_scattersiv4sf:
13364   case X86::BI__builtin_ia32_scattersiv4si:
13365   case X86::BI__builtin_ia32_scattersiv8sf:
13366   case X86::BI__builtin_ia32_scattersiv8si: {
13367     Intrinsic::ID IID;
13368     switch (BuiltinID) {
13369     default: llvm_unreachable("Unexpected builtin");
13370     case X86::BI__builtin_ia32_scattersiv8df:
13371       IID = Intrinsic::x86_avx512_mask_scatter_dpd_512;
13372       break;
13373     case X86::BI__builtin_ia32_scattersiv16sf:
13374       IID = Intrinsic::x86_avx512_mask_scatter_dps_512;
13375       break;
13376     case X86::BI__builtin_ia32_scatterdiv8df:
13377       IID = Intrinsic::x86_avx512_mask_scatter_qpd_512;
13378       break;
13379     case X86::BI__builtin_ia32_scatterdiv16sf:
13380       IID = Intrinsic::x86_avx512_mask_scatter_qps_512;
13381       break;
13382     case X86::BI__builtin_ia32_scattersiv8di:
13383       IID = Intrinsic::x86_avx512_mask_scatter_dpq_512;
13384       break;
13385     case X86::BI__builtin_ia32_scattersiv16si:
13386       IID = Intrinsic::x86_avx512_mask_scatter_dpi_512;
13387       break;
13388     case X86::BI__builtin_ia32_scatterdiv8di:
13389       IID = Intrinsic::x86_avx512_mask_scatter_qpq_512;
13390       break;
13391     case X86::BI__builtin_ia32_scatterdiv16si:
13392       IID = Intrinsic::x86_avx512_mask_scatter_qpi_512;
13393       break;
13394     case X86::BI__builtin_ia32_scatterdiv2df:
13395       IID = Intrinsic::x86_avx512_mask_scatterdiv2_df;
13396       break;
13397     case X86::BI__builtin_ia32_scatterdiv2di:
13398       IID = Intrinsic::x86_avx512_mask_scatterdiv2_di;
13399       break;
13400     case X86::BI__builtin_ia32_scatterdiv4df:
13401       IID = Intrinsic::x86_avx512_mask_scatterdiv4_df;
13402       break;
13403     case X86::BI__builtin_ia32_scatterdiv4di:
13404       IID = Intrinsic::x86_avx512_mask_scatterdiv4_di;
13405       break;
13406     case X86::BI__builtin_ia32_scatterdiv4sf:
13407       IID = Intrinsic::x86_avx512_mask_scatterdiv4_sf;
13408       break;
13409     case X86::BI__builtin_ia32_scatterdiv4si:
13410       IID = Intrinsic::x86_avx512_mask_scatterdiv4_si;
13411       break;
13412     case X86::BI__builtin_ia32_scatterdiv8sf:
13413       IID = Intrinsic::x86_avx512_mask_scatterdiv8_sf;
13414       break;
13415     case X86::BI__builtin_ia32_scatterdiv8si:
13416       IID = Intrinsic::x86_avx512_mask_scatterdiv8_si;
13417       break;
13418     case X86::BI__builtin_ia32_scattersiv2df:
13419       IID = Intrinsic::x86_avx512_mask_scattersiv2_df;
13420       break;
13421     case X86::BI__builtin_ia32_scattersiv2di:
13422       IID = Intrinsic::x86_avx512_mask_scattersiv2_di;
13423       break;
13424     case X86::BI__builtin_ia32_scattersiv4df:
13425       IID = Intrinsic::x86_avx512_mask_scattersiv4_df;
13426       break;
13427     case X86::BI__builtin_ia32_scattersiv4di:
13428       IID = Intrinsic::x86_avx512_mask_scattersiv4_di;
13429       break;
13430     case X86::BI__builtin_ia32_scattersiv4sf:
13431       IID = Intrinsic::x86_avx512_mask_scattersiv4_sf;
13432       break;
13433     case X86::BI__builtin_ia32_scattersiv4si:
13434       IID = Intrinsic::x86_avx512_mask_scattersiv4_si;
13435       break;
13436     case X86::BI__builtin_ia32_scattersiv8sf:
13437       IID = Intrinsic::x86_avx512_mask_scattersiv8_sf;
13438       break;
13439     case X86::BI__builtin_ia32_scattersiv8si:
13440       IID = Intrinsic::x86_avx512_mask_scattersiv8_si;
13441       break;
13442     }
13443 
13444     unsigned MinElts = std::min(
13445         cast<llvm::FixedVectorType>(Ops[2]->getType())->getNumElements(),
13446         cast<llvm::FixedVectorType>(Ops[3]->getType())->getNumElements());
13447     Ops[1] = getMaskVecValue(*this, Ops[1], MinElts);
13448     Function *Intr = CGM.getIntrinsic(IID);
13449     return Builder.CreateCall(Intr, Ops);
13450   }
13451 
13452   case X86::BI__builtin_ia32_vextractf128_pd256:
13453   case X86::BI__builtin_ia32_vextractf128_ps256:
13454   case X86::BI__builtin_ia32_vextractf128_si256:
13455   case X86::BI__builtin_ia32_extract128i256:
13456   case X86::BI__builtin_ia32_extractf64x4_mask:
13457   case X86::BI__builtin_ia32_extractf32x4_mask:
13458   case X86::BI__builtin_ia32_extracti64x4_mask:
13459   case X86::BI__builtin_ia32_extracti32x4_mask:
13460   case X86::BI__builtin_ia32_extractf32x8_mask:
13461   case X86::BI__builtin_ia32_extracti32x8_mask:
13462   case X86::BI__builtin_ia32_extractf32x4_256_mask:
13463   case X86::BI__builtin_ia32_extracti32x4_256_mask:
13464   case X86::BI__builtin_ia32_extractf64x2_256_mask:
13465   case X86::BI__builtin_ia32_extracti64x2_256_mask:
13466   case X86::BI__builtin_ia32_extractf64x2_512_mask:
13467   case X86::BI__builtin_ia32_extracti64x2_512_mask: {
13468     auto *DstTy = cast<llvm::FixedVectorType>(ConvertType(E->getType()));
13469     unsigned NumElts = DstTy->getNumElements();
13470     unsigned SrcNumElts =
13471         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
13472     unsigned SubVectors = SrcNumElts / NumElts;
13473     unsigned Index = cast<ConstantInt>(Ops[1])->getZExtValue();
13474     assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors");
13475     Index &= SubVectors - 1; // Remove any extra bits.
13476     Index *= NumElts;
13477 
13478     int Indices[16];
13479     for (unsigned i = 0; i != NumElts; ++i)
13480       Indices[i] = i + Index;
13481 
13482     Value *Res = Builder.CreateShuffleVector(Ops[0],
13483                                              makeArrayRef(Indices, NumElts),
13484                                              "extract");
13485 
13486     if (Ops.size() == 4)
13487       Res = EmitX86Select(*this, Ops[3], Res, Ops[2]);
13488 
13489     return Res;
13490   }
13491   case X86::BI__builtin_ia32_vinsertf128_pd256:
13492   case X86::BI__builtin_ia32_vinsertf128_ps256:
13493   case X86::BI__builtin_ia32_vinsertf128_si256:
13494   case X86::BI__builtin_ia32_insert128i256:
13495   case X86::BI__builtin_ia32_insertf64x4:
13496   case X86::BI__builtin_ia32_insertf32x4:
13497   case X86::BI__builtin_ia32_inserti64x4:
13498   case X86::BI__builtin_ia32_inserti32x4:
13499   case X86::BI__builtin_ia32_insertf32x8:
13500   case X86::BI__builtin_ia32_inserti32x8:
13501   case X86::BI__builtin_ia32_insertf32x4_256:
13502   case X86::BI__builtin_ia32_inserti32x4_256:
13503   case X86::BI__builtin_ia32_insertf64x2_256:
13504   case X86::BI__builtin_ia32_inserti64x2_256:
13505   case X86::BI__builtin_ia32_insertf64x2_512:
13506   case X86::BI__builtin_ia32_inserti64x2_512: {
13507     unsigned DstNumElts =
13508         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
13509     unsigned SrcNumElts =
13510         cast<llvm::FixedVectorType>(Ops[1]->getType())->getNumElements();
13511     unsigned SubVectors = DstNumElts / SrcNumElts;
13512     unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue();
13513     assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors");
13514     Index &= SubVectors - 1; // Remove any extra bits.
13515     Index *= SrcNumElts;
13516 
13517     int Indices[16];
13518     for (unsigned i = 0; i != DstNumElts; ++i)
13519       Indices[i] = (i >= SrcNumElts) ? SrcNumElts + (i % SrcNumElts) : i;
13520 
13521     Value *Op1 = Builder.CreateShuffleVector(Ops[1],
13522                                              makeArrayRef(Indices, DstNumElts),
13523                                              "widen");
13524 
13525     for (unsigned i = 0; i != DstNumElts; ++i) {
13526       if (i >= Index && i < (Index + SrcNumElts))
13527         Indices[i] = (i - Index) + DstNumElts;
13528       else
13529         Indices[i] = i;
13530     }
13531 
13532     return Builder.CreateShuffleVector(Ops[0], Op1,
13533                                        makeArrayRef(Indices, DstNumElts),
13534                                        "insert");
13535   }
13536   case X86::BI__builtin_ia32_pmovqd512_mask:
13537   case X86::BI__builtin_ia32_pmovwb512_mask: {
13538     Value *Res = Builder.CreateTrunc(Ops[0], Ops[1]->getType());
13539     return EmitX86Select(*this, Ops[2], Res, Ops[1]);
13540   }
13541   case X86::BI__builtin_ia32_pmovdb512_mask:
13542   case X86::BI__builtin_ia32_pmovdw512_mask:
13543   case X86::BI__builtin_ia32_pmovqw512_mask: {
13544     if (const auto *C = dyn_cast<Constant>(Ops[2]))
13545       if (C->isAllOnesValue())
13546         return Builder.CreateTrunc(Ops[0], Ops[1]->getType());
13547 
13548     Intrinsic::ID IID;
13549     switch (BuiltinID) {
13550     default: llvm_unreachable("Unsupported intrinsic!");
13551     case X86::BI__builtin_ia32_pmovdb512_mask:
13552       IID = Intrinsic::x86_avx512_mask_pmov_db_512;
13553       break;
13554     case X86::BI__builtin_ia32_pmovdw512_mask:
13555       IID = Intrinsic::x86_avx512_mask_pmov_dw_512;
13556       break;
13557     case X86::BI__builtin_ia32_pmovqw512_mask:
13558       IID = Intrinsic::x86_avx512_mask_pmov_qw_512;
13559       break;
13560     }
13561 
13562     Function *Intr = CGM.getIntrinsic(IID);
13563     return Builder.CreateCall(Intr, Ops);
13564   }
13565   case X86::BI__builtin_ia32_pblendw128:
13566   case X86::BI__builtin_ia32_blendpd:
13567   case X86::BI__builtin_ia32_blendps:
13568   case X86::BI__builtin_ia32_blendpd256:
13569   case X86::BI__builtin_ia32_blendps256:
13570   case X86::BI__builtin_ia32_pblendw256:
13571   case X86::BI__builtin_ia32_pblendd128:
13572   case X86::BI__builtin_ia32_pblendd256: {
13573     unsigned NumElts =
13574         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
13575     unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
13576 
13577     int Indices[16];
13578     // If there are more than 8 elements, the immediate is used twice so make
13579     // sure we handle that.
13580     for (unsigned i = 0; i != NumElts; ++i)
13581       Indices[i] = ((Imm >> (i % 8)) & 0x1) ? NumElts + i : i;
13582 
13583     return Builder.CreateShuffleVector(Ops[0], Ops[1],
13584                                        makeArrayRef(Indices, NumElts),
13585                                        "blend");
13586   }
13587   case X86::BI__builtin_ia32_pshuflw:
13588   case X86::BI__builtin_ia32_pshuflw256:
13589   case X86::BI__builtin_ia32_pshuflw512: {
13590     uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
13591     auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
13592     unsigned NumElts = Ty->getNumElements();
13593 
13594     // Splat the 8-bits of immediate 4 times to help the loop wrap around.
13595     Imm = (Imm & 0xff) * 0x01010101;
13596 
13597     int Indices[32];
13598     for (unsigned l = 0; l != NumElts; l += 8) {
13599       for (unsigned i = 0; i != 4; ++i) {
13600         Indices[l + i] = l + (Imm & 3);
13601         Imm >>= 2;
13602       }
13603       for (unsigned i = 4; i != 8; ++i)
13604         Indices[l + i] = l + i;
13605     }
13606 
13607     return Builder.CreateShuffleVector(Ops[0], makeArrayRef(Indices, NumElts),
13608                                        "pshuflw");
13609   }
13610   case X86::BI__builtin_ia32_pshufhw:
13611   case X86::BI__builtin_ia32_pshufhw256:
13612   case X86::BI__builtin_ia32_pshufhw512: {
13613     uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
13614     auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
13615     unsigned NumElts = Ty->getNumElements();
13616 
13617     // Splat the 8-bits of immediate 4 times to help the loop wrap around.
13618     Imm = (Imm & 0xff) * 0x01010101;
13619 
13620     int Indices[32];
13621     for (unsigned l = 0; l != NumElts; l += 8) {
13622       for (unsigned i = 0; i != 4; ++i)
13623         Indices[l + i] = l + i;
13624       for (unsigned i = 4; i != 8; ++i) {
13625         Indices[l + i] = l + 4 + (Imm & 3);
13626         Imm >>= 2;
13627       }
13628     }
13629 
13630     return Builder.CreateShuffleVector(Ops[0], makeArrayRef(Indices, NumElts),
13631                                        "pshufhw");
13632   }
13633   case X86::BI__builtin_ia32_pshufd:
13634   case X86::BI__builtin_ia32_pshufd256:
13635   case X86::BI__builtin_ia32_pshufd512:
13636   case X86::BI__builtin_ia32_vpermilpd:
13637   case X86::BI__builtin_ia32_vpermilps:
13638   case X86::BI__builtin_ia32_vpermilpd256:
13639   case X86::BI__builtin_ia32_vpermilps256:
13640   case X86::BI__builtin_ia32_vpermilpd512:
13641   case X86::BI__builtin_ia32_vpermilps512: {
13642     uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
13643     auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
13644     unsigned NumElts = Ty->getNumElements();
13645     unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128;
13646     unsigned NumLaneElts = NumElts / NumLanes;
13647 
13648     // Splat the 8-bits of immediate 4 times to help the loop wrap around.
13649     Imm = (Imm & 0xff) * 0x01010101;
13650 
13651     int Indices[16];
13652     for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
13653       for (unsigned i = 0; i != NumLaneElts; ++i) {
13654         Indices[i + l] = (Imm % NumLaneElts) + l;
13655         Imm /= NumLaneElts;
13656       }
13657     }
13658 
13659     return Builder.CreateShuffleVector(Ops[0], makeArrayRef(Indices, NumElts),
13660                                        "permil");
13661   }
13662   case X86::BI__builtin_ia32_shufpd:
13663   case X86::BI__builtin_ia32_shufpd256:
13664   case X86::BI__builtin_ia32_shufpd512:
13665   case X86::BI__builtin_ia32_shufps:
13666   case X86::BI__builtin_ia32_shufps256:
13667   case X86::BI__builtin_ia32_shufps512: {
13668     uint32_t Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
13669     auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
13670     unsigned NumElts = Ty->getNumElements();
13671     unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128;
13672     unsigned NumLaneElts = NumElts / NumLanes;
13673 
13674     // Splat the 8-bits of immediate 4 times to help the loop wrap around.
13675     Imm = (Imm & 0xff) * 0x01010101;
13676 
13677     int Indices[16];
13678     for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
13679       for (unsigned i = 0; i != NumLaneElts; ++i) {
13680         unsigned Index = Imm % NumLaneElts;
13681         Imm /= NumLaneElts;
13682         if (i >= (NumLaneElts / 2))
13683           Index += NumElts;
13684         Indices[l + i] = l + Index;
13685       }
13686     }
13687 
13688     return Builder.CreateShuffleVector(Ops[0], Ops[1],
13689                                        makeArrayRef(Indices, NumElts),
13690                                        "shufp");
13691   }
13692   case X86::BI__builtin_ia32_permdi256:
13693   case X86::BI__builtin_ia32_permdf256:
13694   case X86::BI__builtin_ia32_permdi512:
13695   case X86::BI__builtin_ia32_permdf512: {
13696     unsigned Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
13697     auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
13698     unsigned NumElts = Ty->getNumElements();
13699 
13700     // These intrinsics operate on 256-bit lanes of four 64-bit elements.
13701     int Indices[8];
13702     for (unsigned l = 0; l != NumElts; l += 4)
13703       for (unsigned i = 0; i != 4; ++i)
13704         Indices[l + i] = l + ((Imm >> (2 * i)) & 0x3);
13705 
13706     return Builder.CreateShuffleVector(Ops[0], makeArrayRef(Indices, NumElts),
13707                                        "perm");
13708   }
13709   case X86::BI__builtin_ia32_palignr128:
13710   case X86::BI__builtin_ia32_palignr256:
13711   case X86::BI__builtin_ia32_palignr512: {
13712     unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0xff;
13713 
13714     unsigned NumElts =
13715         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
13716     assert(NumElts % 16 == 0);
13717 
13718     // If palignr is shifting the pair of vectors more than the size of two
13719     // lanes, emit zero.
13720     if (ShiftVal >= 32)
13721       return llvm::Constant::getNullValue(ConvertType(E->getType()));
13722 
13723     // If palignr is shifting the pair of input vectors more than one lane,
13724     // but less than two lanes, convert to shifting in zeroes.
13725     if (ShiftVal > 16) {
13726       ShiftVal -= 16;
13727       Ops[1] = Ops[0];
13728       Ops[0] = llvm::Constant::getNullValue(Ops[0]->getType());
13729     }
13730 
13731     int Indices[64];
13732     // 256-bit palignr operates on 128-bit lanes so we need to handle that
13733     for (unsigned l = 0; l != NumElts; l += 16) {
13734       for (unsigned i = 0; i != 16; ++i) {
13735         unsigned Idx = ShiftVal + i;
13736         if (Idx >= 16)
13737           Idx += NumElts - 16; // End of lane, switch operand.
13738         Indices[l + i] = Idx + l;
13739       }
13740     }
13741 
13742     return Builder.CreateShuffleVector(Ops[1], Ops[0],
13743                                        makeArrayRef(Indices, NumElts),
13744                                        "palignr");
13745   }
13746   case X86::BI__builtin_ia32_alignd128:
13747   case X86::BI__builtin_ia32_alignd256:
13748   case X86::BI__builtin_ia32_alignd512:
13749   case X86::BI__builtin_ia32_alignq128:
13750   case X86::BI__builtin_ia32_alignq256:
13751   case X86::BI__builtin_ia32_alignq512: {
13752     unsigned NumElts =
13753         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
13754     unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0xff;
13755 
13756     // Mask the shift amount to width of a vector.
13757     ShiftVal &= NumElts - 1;
13758 
13759     int Indices[16];
13760     for (unsigned i = 0; i != NumElts; ++i)
13761       Indices[i] = i + ShiftVal;
13762 
13763     return Builder.CreateShuffleVector(Ops[1], Ops[0],
13764                                        makeArrayRef(Indices, NumElts),
13765                                        "valign");
13766   }
13767   case X86::BI__builtin_ia32_shuf_f32x4_256:
13768   case X86::BI__builtin_ia32_shuf_f64x2_256:
13769   case X86::BI__builtin_ia32_shuf_i32x4_256:
13770   case X86::BI__builtin_ia32_shuf_i64x2_256:
13771   case X86::BI__builtin_ia32_shuf_f32x4:
13772   case X86::BI__builtin_ia32_shuf_f64x2:
13773   case X86::BI__builtin_ia32_shuf_i32x4:
13774   case X86::BI__builtin_ia32_shuf_i64x2: {
13775     unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
13776     auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
13777     unsigned NumElts = Ty->getNumElements();
13778     unsigned NumLanes = Ty->getPrimitiveSizeInBits() == 512 ? 4 : 2;
13779     unsigned NumLaneElts = NumElts / NumLanes;
13780 
13781     int Indices[16];
13782     for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
13783       unsigned Index = (Imm % NumLanes) * NumLaneElts;
13784       Imm /= NumLanes; // Discard the bits we just used.
13785       if (l >= (NumElts / 2))
13786         Index += NumElts; // Switch to other source.
13787       for (unsigned i = 0; i != NumLaneElts; ++i) {
13788         Indices[l + i] = Index + i;
13789       }
13790     }
13791 
13792     return Builder.CreateShuffleVector(Ops[0], Ops[1],
13793                                        makeArrayRef(Indices, NumElts),
13794                                        "shuf");
13795   }
13796 
13797   case X86::BI__builtin_ia32_vperm2f128_pd256:
13798   case X86::BI__builtin_ia32_vperm2f128_ps256:
13799   case X86::BI__builtin_ia32_vperm2f128_si256:
13800   case X86::BI__builtin_ia32_permti256: {
13801     unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
13802     unsigned NumElts =
13803         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
13804 
13805     // This takes a very simple approach since there are two lanes and a
13806     // shuffle can have 2 inputs. So we reserve the first input for the first
13807     // lane and the second input for the second lane. This may result in
13808     // duplicate sources, but this can be dealt with in the backend.
13809 
13810     Value *OutOps[2];
13811     int Indices[8];
13812     for (unsigned l = 0; l != 2; ++l) {
13813       // Determine the source for this lane.
13814       if (Imm & (1 << ((l * 4) + 3)))
13815         OutOps[l] = llvm::ConstantAggregateZero::get(Ops[0]->getType());
13816       else if (Imm & (1 << ((l * 4) + 1)))
13817         OutOps[l] = Ops[1];
13818       else
13819         OutOps[l] = Ops[0];
13820 
13821       for (unsigned i = 0; i != NumElts/2; ++i) {
13822         // Start with ith element of the source for this lane.
13823         unsigned Idx = (l * NumElts) + i;
13824         // If bit 0 of the immediate half is set, switch to the high half of
13825         // the source.
13826         if (Imm & (1 << (l * 4)))
13827           Idx += NumElts/2;
13828         Indices[(l * (NumElts/2)) + i] = Idx;
13829       }
13830     }
13831 
13832     return Builder.CreateShuffleVector(OutOps[0], OutOps[1],
13833                                        makeArrayRef(Indices, NumElts),
13834                                        "vperm");
13835   }
13836 
13837   case X86::BI__builtin_ia32_pslldqi128_byteshift:
13838   case X86::BI__builtin_ia32_pslldqi256_byteshift:
13839   case X86::BI__builtin_ia32_pslldqi512_byteshift: {
13840     unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
13841     auto *ResultType = cast<llvm::FixedVectorType>(Ops[0]->getType());
13842     // Builtin type is vXi64 so multiply by 8 to get bytes.
13843     unsigned NumElts = ResultType->getNumElements() * 8;
13844 
13845     // If pslldq is shifting the vector more than 15 bytes, emit zero.
13846     if (ShiftVal >= 16)
13847       return llvm::Constant::getNullValue(ResultType);
13848 
13849     int Indices[64];
13850     // 256/512-bit pslldq operates on 128-bit lanes so we need to handle that
13851     for (unsigned l = 0; l != NumElts; l += 16) {
13852       for (unsigned i = 0; i != 16; ++i) {
13853         unsigned Idx = NumElts + i - ShiftVal;
13854         if (Idx < NumElts) Idx -= NumElts - 16; // end of lane, switch operand.
13855         Indices[l + i] = Idx + l;
13856       }
13857     }
13858 
13859     auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts);
13860     Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast");
13861     Value *Zero = llvm::Constant::getNullValue(VecTy);
13862     Value *SV = Builder.CreateShuffleVector(Zero, Cast,
13863                                             makeArrayRef(Indices, NumElts),
13864                                             "pslldq");
13865     return Builder.CreateBitCast(SV, Ops[0]->getType(), "cast");
13866   }
13867   case X86::BI__builtin_ia32_psrldqi128_byteshift:
13868   case X86::BI__builtin_ia32_psrldqi256_byteshift:
13869   case X86::BI__builtin_ia32_psrldqi512_byteshift: {
13870     unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
13871     auto *ResultType = cast<llvm::FixedVectorType>(Ops[0]->getType());
13872     // Builtin type is vXi64 so multiply by 8 to get bytes.
13873     unsigned NumElts = ResultType->getNumElements() * 8;
13874 
13875     // If psrldq is shifting the vector more than 15 bytes, emit zero.
13876     if (ShiftVal >= 16)
13877       return llvm::Constant::getNullValue(ResultType);
13878 
13879     int Indices[64];
13880     // 256/512-bit psrldq operates on 128-bit lanes so we need to handle that
13881     for (unsigned l = 0; l != NumElts; l += 16) {
13882       for (unsigned i = 0; i != 16; ++i) {
13883         unsigned Idx = i + ShiftVal;
13884         if (Idx >= 16) Idx += NumElts - 16; // end of lane, switch operand.
13885         Indices[l + i] = Idx + l;
13886       }
13887     }
13888 
13889     auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts);
13890     Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast");
13891     Value *Zero = llvm::Constant::getNullValue(VecTy);
13892     Value *SV = Builder.CreateShuffleVector(Cast, Zero,
13893                                             makeArrayRef(Indices, NumElts),
13894                                             "psrldq");
13895     return Builder.CreateBitCast(SV, ResultType, "cast");
13896   }
13897   case X86::BI__builtin_ia32_kshiftliqi:
13898   case X86::BI__builtin_ia32_kshiftlihi:
13899   case X86::BI__builtin_ia32_kshiftlisi:
13900   case X86::BI__builtin_ia32_kshiftlidi: {
13901     unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
13902     unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
13903 
13904     if (ShiftVal >= NumElts)
13905       return llvm::Constant::getNullValue(Ops[0]->getType());
13906 
13907     Value *In = getMaskVecValue(*this, Ops[0], NumElts);
13908 
13909     int Indices[64];
13910     for (unsigned i = 0; i != NumElts; ++i)
13911       Indices[i] = NumElts + i - ShiftVal;
13912 
13913     Value *Zero = llvm::Constant::getNullValue(In->getType());
13914     Value *SV = Builder.CreateShuffleVector(Zero, In,
13915                                             makeArrayRef(Indices, NumElts),
13916                                             "kshiftl");
13917     return Builder.CreateBitCast(SV, Ops[0]->getType());
13918   }
13919   case X86::BI__builtin_ia32_kshiftriqi:
13920   case X86::BI__builtin_ia32_kshiftrihi:
13921   case X86::BI__builtin_ia32_kshiftrisi:
13922   case X86::BI__builtin_ia32_kshiftridi: {
13923     unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
13924     unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
13925 
13926     if (ShiftVal >= NumElts)
13927       return llvm::Constant::getNullValue(Ops[0]->getType());
13928 
13929     Value *In = getMaskVecValue(*this, Ops[0], NumElts);
13930 
13931     int Indices[64];
13932     for (unsigned i = 0; i != NumElts; ++i)
13933       Indices[i] = i + ShiftVal;
13934 
13935     Value *Zero = llvm::Constant::getNullValue(In->getType());
13936     Value *SV = Builder.CreateShuffleVector(In, Zero,
13937                                             makeArrayRef(Indices, NumElts),
13938                                             "kshiftr");
13939     return Builder.CreateBitCast(SV, Ops[0]->getType());
13940   }
13941   case X86::BI__builtin_ia32_movnti:
13942   case X86::BI__builtin_ia32_movnti64:
13943   case X86::BI__builtin_ia32_movntsd:
13944   case X86::BI__builtin_ia32_movntss: {
13945     llvm::MDNode *Node = llvm::MDNode::get(
13946         getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
13947 
13948     Value *Ptr = Ops[0];
13949     Value *Src = Ops[1];
13950 
13951     // Extract the 0'th element of the source vector.
13952     if (BuiltinID == X86::BI__builtin_ia32_movntsd ||
13953         BuiltinID == X86::BI__builtin_ia32_movntss)
13954       Src = Builder.CreateExtractElement(Src, (uint64_t)0, "extract");
13955 
13956     // Convert the type of the pointer to a pointer to the stored type.
13957     Value *BC = Builder.CreateBitCast(
13958         Ptr, llvm::PointerType::getUnqual(Src->getType()), "cast");
13959 
13960     // Unaligned nontemporal store of the scalar value.
13961     StoreInst *SI = Builder.CreateDefaultAlignedStore(Src, BC);
13962     SI->setMetadata(CGM.getModule().getMDKindID("nontemporal"), Node);
13963     SI->setAlignment(llvm::Align(1));
13964     return SI;
13965   }
13966   // Rotate is a special case of funnel shift - 1st 2 args are the same.
13967   case X86::BI__builtin_ia32_vprotb:
13968   case X86::BI__builtin_ia32_vprotw:
13969   case X86::BI__builtin_ia32_vprotd:
13970   case X86::BI__builtin_ia32_vprotq:
13971   case X86::BI__builtin_ia32_vprotbi:
13972   case X86::BI__builtin_ia32_vprotwi:
13973   case X86::BI__builtin_ia32_vprotdi:
13974   case X86::BI__builtin_ia32_vprotqi:
13975   case X86::BI__builtin_ia32_prold128:
13976   case X86::BI__builtin_ia32_prold256:
13977   case X86::BI__builtin_ia32_prold512:
13978   case X86::BI__builtin_ia32_prolq128:
13979   case X86::BI__builtin_ia32_prolq256:
13980   case X86::BI__builtin_ia32_prolq512:
13981   case X86::BI__builtin_ia32_prolvd128:
13982   case X86::BI__builtin_ia32_prolvd256:
13983   case X86::BI__builtin_ia32_prolvd512:
13984   case X86::BI__builtin_ia32_prolvq128:
13985   case X86::BI__builtin_ia32_prolvq256:
13986   case X86::BI__builtin_ia32_prolvq512:
13987     return EmitX86FunnelShift(*this, Ops[0], Ops[0], Ops[1], false);
13988   case X86::BI__builtin_ia32_prord128:
13989   case X86::BI__builtin_ia32_prord256:
13990   case X86::BI__builtin_ia32_prord512:
13991   case X86::BI__builtin_ia32_prorq128:
13992   case X86::BI__builtin_ia32_prorq256:
13993   case X86::BI__builtin_ia32_prorq512:
13994   case X86::BI__builtin_ia32_prorvd128:
13995   case X86::BI__builtin_ia32_prorvd256:
13996   case X86::BI__builtin_ia32_prorvd512:
13997   case X86::BI__builtin_ia32_prorvq128:
13998   case X86::BI__builtin_ia32_prorvq256:
13999   case X86::BI__builtin_ia32_prorvq512:
14000     return EmitX86FunnelShift(*this, Ops[0], Ops[0], Ops[1], true);
14001   case X86::BI__builtin_ia32_selectb_128:
14002   case X86::BI__builtin_ia32_selectb_256:
14003   case X86::BI__builtin_ia32_selectb_512:
14004   case X86::BI__builtin_ia32_selectw_128:
14005   case X86::BI__builtin_ia32_selectw_256:
14006   case X86::BI__builtin_ia32_selectw_512:
14007   case X86::BI__builtin_ia32_selectd_128:
14008   case X86::BI__builtin_ia32_selectd_256:
14009   case X86::BI__builtin_ia32_selectd_512:
14010   case X86::BI__builtin_ia32_selectq_128:
14011   case X86::BI__builtin_ia32_selectq_256:
14012   case X86::BI__builtin_ia32_selectq_512:
14013   case X86::BI__builtin_ia32_selectph_128:
14014   case X86::BI__builtin_ia32_selectph_256:
14015   case X86::BI__builtin_ia32_selectph_512:
14016   case X86::BI__builtin_ia32_selectps_128:
14017   case X86::BI__builtin_ia32_selectps_256:
14018   case X86::BI__builtin_ia32_selectps_512:
14019   case X86::BI__builtin_ia32_selectpd_128:
14020   case X86::BI__builtin_ia32_selectpd_256:
14021   case X86::BI__builtin_ia32_selectpd_512:
14022     return EmitX86Select(*this, Ops[0], Ops[1], Ops[2]);
14023   case X86::BI__builtin_ia32_selectsh_128:
14024   case X86::BI__builtin_ia32_selectss_128:
14025   case X86::BI__builtin_ia32_selectsd_128: {
14026     Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
14027     Value *B = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
14028     A = EmitX86ScalarSelect(*this, Ops[0], A, B);
14029     return Builder.CreateInsertElement(Ops[1], A, (uint64_t)0);
14030   }
14031   case X86::BI__builtin_ia32_cmpb128_mask:
14032   case X86::BI__builtin_ia32_cmpb256_mask:
14033   case X86::BI__builtin_ia32_cmpb512_mask:
14034   case X86::BI__builtin_ia32_cmpw128_mask:
14035   case X86::BI__builtin_ia32_cmpw256_mask:
14036   case X86::BI__builtin_ia32_cmpw512_mask:
14037   case X86::BI__builtin_ia32_cmpd128_mask:
14038   case X86::BI__builtin_ia32_cmpd256_mask:
14039   case X86::BI__builtin_ia32_cmpd512_mask:
14040   case X86::BI__builtin_ia32_cmpq128_mask:
14041   case X86::BI__builtin_ia32_cmpq256_mask:
14042   case X86::BI__builtin_ia32_cmpq512_mask: {
14043     unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
14044     return EmitX86MaskedCompare(*this, CC, true, Ops);
14045   }
14046   case X86::BI__builtin_ia32_ucmpb128_mask:
14047   case X86::BI__builtin_ia32_ucmpb256_mask:
14048   case X86::BI__builtin_ia32_ucmpb512_mask:
14049   case X86::BI__builtin_ia32_ucmpw128_mask:
14050   case X86::BI__builtin_ia32_ucmpw256_mask:
14051   case X86::BI__builtin_ia32_ucmpw512_mask:
14052   case X86::BI__builtin_ia32_ucmpd128_mask:
14053   case X86::BI__builtin_ia32_ucmpd256_mask:
14054   case X86::BI__builtin_ia32_ucmpd512_mask:
14055   case X86::BI__builtin_ia32_ucmpq128_mask:
14056   case X86::BI__builtin_ia32_ucmpq256_mask:
14057   case X86::BI__builtin_ia32_ucmpq512_mask: {
14058     unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
14059     return EmitX86MaskedCompare(*this, CC, false, Ops);
14060   }
14061   case X86::BI__builtin_ia32_vpcomb:
14062   case X86::BI__builtin_ia32_vpcomw:
14063   case X86::BI__builtin_ia32_vpcomd:
14064   case X86::BI__builtin_ia32_vpcomq:
14065     return EmitX86vpcom(*this, Ops, true);
14066   case X86::BI__builtin_ia32_vpcomub:
14067   case X86::BI__builtin_ia32_vpcomuw:
14068   case X86::BI__builtin_ia32_vpcomud:
14069   case X86::BI__builtin_ia32_vpcomuq:
14070     return EmitX86vpcom(*this, Ops, false);
14071 
14072   case X86::BI__builtin_ia32_kortestcqi:
14073   case X86::BI__builtin_ia32_kortestchi:
14074   case X86::BI__builtin_ia32_kortestcsi:
14075   case X86::BI__builtin_ia32_kortestcdi: {
14076     Value *Or = EmitX86MaskLogic(*this, Instruction::Or, Ops);
14077     Value *C = llvm::Constant::getAllOnesValue(Ops[0]->getType());
14078     Value *Cmp = Builder.CreateICmpEQ(Or, C);
14079     return Builder.CreateZExt(Cmp, ConvertType(E->getType()));
14080   }
14081   case X86::BI__builtin_ia32_kortestzqi:
14082   case X86::BI__builtin_ia32_kortestzhi:
14083   case X86::BI__builtin_ia32_kortestzsi:
14084   case X86::BI__builtin_ia32_kortestzdi: {
14085     Value *Or = EmitX86MaskLogic(*this, Instruction::Or, Ops);
14086     Value *C = llvm::Constant::getNullValue(Ops[0]->getType());
14087     Value *Cmp = Builder.CreateICmpEQ(Or, C);
14088     return Builder.CreateZExt(Cmp, ConvertType(E->getType()));
14089   }
14090 
14091   case X86::BI__builtin_ia32_ktestcqi:
14092   case X86::BI__builtin_ia32_ktestzqi:
14093   case X86::BI__builtin_ia32_ktestchi:
14094   case X86::BI__builtin_ia32_ktestzhi:
14095   case X86::BI__builtin_ia32_ktestcsi:
14096   case X86::BI__builtin_ia32_ktestzsi:
14097   case X86::BI__builtin_ia32_ktestcdi:
14098   case X86::BI__builtin_ia32_ktestzdi: {
14099     Intrinsic::ID IID;
14100     switch (BuiltinID) {
14101     default: llvm_unreachable("Unsupported intrinsic!");
14102     case X86::BI__builtin_ia32_ktestcqi:
14103       IID = Intrinsic::x86_avx512_ktestc_b;
14104       break;
14105     case X86::BI__builtin_ia32_ktestzqi:
14106       IID = Intrinsic::x86_avx512_ktestz_b;
14107       break;
14108     case X86::BI__builtin_ia32_ktestchi:
14109       IID = Intrinsic::x86_avx512_ktestc_w;
14110       break;
14111     case X86::BI__builtin_ia32_ktestzhi:
14112       IID = Intrinsic::x86_avx512_ktestz_w;
14113       break;
14114     case X86::BI__builtin_ia32_ktestcsi:
14115       IID = Intrinsic::x86_avx512_ktestc_d;
14116       break;
14117     case X86::BI__builtin_ia32_ktestzsi:
14118       IID = Intrinsic::x86_avx512_ktestz_d;
14119       break;
14120     case X86::BI__builtin_ia32_ktestcdi:
14121       IID = Intrinsic::x86_avx512_ktestc_q;
14122       break;
14123     case X86::BI__builtin_ia32_ktestzdi:
14124       IID = Intrinsic::x86_avx512_ktestz_q;
14125       break;
14126     }
14127 
14128     unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
14129     Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
14130     Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
14131     Function *Intr = CGM.getIntrinsic(IID);
14132     return Builder.CreateCall(Intr, {LHS, RHS});
14133   }
14134 
14135   case X86::BI__builtin_ia32_kaddqi:
14136   case X86::BI__builtin_ia32_kaddhi:
14137   case X86::BI__builtin_ia32_kaddsi:
14138   case X86::BI__builtin_ia32_kadddi: {
14139     Intrinsic::ID IID;
14140     switch (BuiltinID) {
14141     default: llvm_unreachable("Unsupported intrinsic!");
14142     case X86::BI__builtin_ia32_kaddqi:
14143       IID = Intrinsic::x86_avx512_kadd_b;
14144       break;
14145     case X86::BI__builtin_ia32_kaddhi:
14146       IID = Intrinsic::x86_avx512_kadd_w;
14147       break;
14148     case X86::BI__builtin_ia32_kaddsi:
14149       IID = Intrinsic::x86_avx512_kadd_d;
14150       break;
14151     case X86::BI__builtin_ia32_kadddi:
14152       IID = Intrinsic::x86_avx512_kadd_q;
14153       break;
14154     }
14155 
14156     unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
14157     Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
14158     Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
14159     Function *Intr = CGM.getIntrinsic(IID);
14160     Value *Res = Builder.CreateCall(Intr, {LHS, RHS});
14161     return Builder.CreateBitCast(Res, Ops[0]->getType());
14162   }
  // Bitwise logic on mask operands. Every group below forwards to
  // EmitX86MaskLogic with the matching binary opcode.
  case X86::BI__builtin_ia32_kandqi:
  case X86::BI__builtin_ia32_kandhi:
  case X86::BI__builtin_ia32_kandsi:
  case X86::BI__builtin_ia32_kanddi:
    return EmitX86MaskLogic(*this, Instruction::And, Ops);
  // NOTE(review): the extra `true` presumably asks the helper to invert one
  // operand before the AND (and-not) -- confirm against EmitX86MaskLogic.
  case X86::BI__builtin_ia32_kandnqi:
  case X86::BI__builtin_ia32_kandnhi:
  case X86::BI__builtin_ia32_kandnsi:
  case X86::BI__builtin_ia32_kandndi:
    return EmitX86MaskLogic(*this, Instruction::And, Ops, true);
  case X86::BI__builtin_ia32_korqi:
  case X86::BI__builtin_ia32_korhi:
  case X86::BI__builtin_ia32_korsi:
  case X86::BI__builtin_ia32_kordi:
    return EmitX86MaskLogic(*this, Instruction::Or, Ops);
  // kxnor is emitted as an XOR with the same extra `true` flag that kandn
  // passes (presumably yielding xnor -- confirm against EmitX86MaskLogic).
  case X86::BI__builtin_ia32_kxnorqi:
  case X86::BI__builtin_ia32_kxnorhi:
  case X86::BI__builtin_ia32_kxnorsi:
  case X86::BI__builtin_ia32_kxnordi:
    return EmitX86MaskLogic(*this, Instruction::Xor, Ops, true);
  case X86::BI__builtin_ia32_kxorqi:
  case X86::BI__builtin_ia32_kxorhi:
  case X86::BI__builtin_ia32_kxorsi:
  case X86::BI__builtin_ia32_kxordi:
    return EmitX86MaskLogic(*this, Instruction::Xor,  Ops);
14188   case X86::BI__builtin_ia32_knotqi:
14189   case X86::BI__builtin_ia32_knothi:
14190   case X86::BI__builtin_ia32_knotsi:
14191   case X86::BI__builtin_ia32_knotdi: {
14192     unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
14193     Value *Res = getMaskVecValue(*this, Ops[0], NumElts);
14194     return Builder.CreateBitCast(Builder.CreateNot(Res),
14195                                  Ops[0]->getType());
14196   }
14197   case X86::BI__builtin_ia32_kmovb:
14198   case X86::BI__builtin_ia32_kmovw:
14199   case X86::BI__builtin_ia32_kmovd:
14200   case X86::BI__builtin_ia32_kmovq: {
14201     // Bitcast to vXi1 type and then back to integer. This gets the mask
14202     // register type into the IR, but might be optimized out depending on
14203     // what's around it.
14204     unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
14205     Value *Res = getMaskVecValue(*this, Ops[0], NumElts);
14206     return Builder.CreateBitCast(Res, Ops[0]->getType());
14207   }
14208 
14209   case X86::BI__builtin_ia32_kunpckdi:
14210   case X86::BI__builtin_ia32_kunpcksi:
14211   case X86::BI__builtin_ia32_kunpckhi: {
14212     unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
14213     Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
14214     Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
14215     int Indices[64];
14216     for (unsigned i = 0; i != NumElts; ++i)
14217       Indices[i] = i;
14218 
14219     // First extract half of each vector. This gives better codegen than
14220     // doing it in a single shuffle.
14221     LHS = Builder.CreateShuffleVector(LHS, LHS,
14222                                       makeArrayRef(Indices, NumElts / 2));
14223     RHS = Builder.CreateShuffleVector(RHS, RHS,
14224                                       makeArrayRef(Indices, NumElts / 2));
14225     // Concat the vectors.
14226     // NOTE: Operands are swapped to match the intrinsic definition.
14227     Value *Res = Builder.CreateShuffleVector(RHS, LHS,
14228                                              makeArrayRef(Indices, NumElts));
14229     return Builder.CreateBitCast(Res, Ops[0]->getType());
14230   }
14231 
14232   case X86::BI__builtin_ia32_vplzcntd_128:
14233   case X86::BI__builtin_ia32_vplzcntd_256:
14234   case X86::BI__builtin_ia32_vplzcntd_512:
14235   case X86::BI__builtin_ia32_vplzcntq_128:
14236   case X86::BI__builtin_ia32_vplzcntq_256:
14237   case X86::BI__builtin_ia32_vplzcntq_512: {
14238     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType());
14239     return Builder.CreateCall(F, {Ops[0],Builder.getInt1(false)});
14240   }
14241   case X86::BI__builtin_ia32_sqrtss:
14242   case X86::BI__builtin_ia32_sqrtsd: {
14243     Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0);
14244     Function *F;
14245     if (Builder.getIsFPConstrained()) {
14246       CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
14247       F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
14248                            A->getType());
14249       A = Builder.CreateConstrainedFPCall(F, {A});
14250     } else {
14251       F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
14252       A = Builder.CreateCall(F, {A});
14253     }
14254     return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
14255   }
14256   case X86::BI__builtin_ia32_sqrtsh_round_mask:
14257   case X86::BI__builtin_ia32_sqrtsd_round_mask:
14258   case X86::BI__builtin_ia32_sqrtss_round_mask: {
14259     unsigned CC = cast<llvm::ConstantInt>(Ops[4])->getZExtValue();
14260     // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
14261     // otherwise keep the intrinsic.
14262     if (CC != 4) {
14263       Intrinsic::ID IID;
14264 
14265       switch (BuiltinID) {
14266       default:
14267         llvm_unreachable("Unsupported intrinsic!");
14268       case X86::BI__builtin_ia32_sqrtsh_round_mask:
14269         IID = Intrinsic::x86_avx512fp16_mask_sqrt_sh;
14270         break;
14271       case X86::BI__builtin_ia32_sqrtsd_round_mask:
14272         IID = Intrinsic::x86_avx512_mask_sqrt_sd;
14273         break;
14274       case X86::BI__builtin_ia32_sqrtss_round_mask:
14275         IID = Intrinsic::x86_avx512_mask_sqrt_ss;
14276         break;
14277       }
14278       return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
14279     }
14280     Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
14281     Function *F;
14282     if (Builder.getIsFPConstrained()) {
14283       CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
14284       F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
14285                            A->getType());
14286       A = Builder.CreateConstrainedFPCall(F, A);
14287     } else {
14288       F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
14289       A = Builder.CreateCall(F, A);
14290     }
14291     Value *Src = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
14292     A = EmitX86ScalarSelect(*this, Ops[3], A, Src);
14293     return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
14294   }
14295   case X86::BI__builtin_ia32_sqrtpd256:
14296   case X86::BI__builtin_ia32_sqrtpd:
14297   case X86::BI__builtin_ia32_sqrtps256:
14298   case X86::BI__builtin_ia32_sqrtps:
14299   case X86::BI__builtin_ia32_sqrtph256:
14300   case X86::BI__builtin_ia32_sqrtph:
14301   case X86::BI__builtin_ia32_sqrtph512:
14302   case X86::BI__builtin_ia32_sqrtps512:
14303   case X86::BI__builtin_ia32_sqrtpd512: {
14304     if (Ops.size() == 2) {
14305       unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
14306       // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
14307       // otherwise keep the intrinsic.
14308       if (CC != 4) {
14309         Intrinsic::ID IID;
14310 
14311         switch (BuiltinID) {
14312         default:
14313           llvm_unreachable("Unsupported intrinsic!");
14314         case X86::BI__builtin_ia32_sqrtph512:
14315           IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
14316           break;
14317         case X86::BI__builtin_ia32_sqrtps512:
14318           IID = Intrinsic::x86_avx512_sqrt_ps_512;
14319           break;
14320         case X86::BI__builtin_ia32_sqrtpd512:
14321           IID = Intrinsic::x86_avx512_sqrt_pd_512;
14322           break;
14323         }
14324         return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
14325       }
14326     }
14327     if (Builder.getIsFPConstrained()) {
14328       CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
14329       Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
14330                                      Ops[0]->getType());
14331       return Builder.CreateConstrainedFPCall(F, Ops[0]);
14332     } else {
14333       Function *F = CGM.getIntrinsic(Intrinsic::sqrt, Ops[0]->getType());
14334       return Builder.CreateCall(F, Ops[0]);
14335     }
14336   }
14337 
  // pmuludq*: forwarded to the shared EmitX86Muldq helper with IsSigned=false.
  case X86::BI__builtin_ia32_pmuludq128:
  case X86::BI__builtin_ia32_pmuludq256:
  case X86::BI__builtin_ia32_pmuludq512:
    return EmitX86Muldq(*this, /*IsSigned*/false, Ops);

  // pmuldq*: same helper, with IsSigned=true.
  case X86::BI__builtin_ia32_pmuldq128:
  case X86::BI__builtin_ia32_pmuldq256:
  case X86::BI__builtin_ia32_pmuldq512:
    return EmitX86Muldq(*this, /*IsSigned*/true, Ops);
14347 
  // pternlog*_mask: forwarded to EmitX86Ternlog with ZeroMask=false.
  case X86::BI__builtin_ia32_pternlogd512_mask:
  case X86::BI__builtin_ia32_pternlogq512_mask:
  case X86::BI__builtin_ia32_pternlogd128_mask:
  case X86::BI__builtin_ia32_pternlogd256_mask:
  case X86::BI__builtin_ia32_pternlogq128_mask:
  case X86::BI__builtin_ia32_pternlogq256_mask:
    return EmitX86Ternlog(*this, /*ZeroMask*/false, Ops);

  // pternlog*_maskz: same helper, with ZeroMask=true.
  case X86::BI__builtin_ia32_pternlogd512_maskz:
  case X86::BI__builtin_ia32_pternlogq512_maskz:
  case X86::BI__builtin_ia32_pternlogd128_maskz:
  case X86::BI__builtin_ia32_pternlogd256_maskz:
  case X86::BI__builtin_ia32_pternlogq128_maskz:
  case X86::BI__builtin_ia32_pternlogq256_maskz:
    return EmitX86Ternlog(*this, /*ZeroMask*/true, Ops);
14363 
  // vpshld*: forwarded to EmitX86FunnelShift with the operands in builtin
  // order and the trailing flag set to `false`.
  // NOTE(review): the flag presumably selects a right shift when true --
  // confirm against EmitX86FunnelShift.
  case X86::BI__builtin_ia32_vpshldd128:
  case X86::BI__builtin_ia32_vpshldd256:
  case X86::BI__builtin_ia32_vpshldd512:
  case X86::BI__builtin_ia32_vpshldq128:
  case X86::BI__builtin_ia32_vpshldq256:
  case X86::BI__builtin_ia32_vpshldq512:
  case X86::BI__builtin_ia32_vpshldw128:
  case X86::BI__builtin_ia32_vpshldw256:
  case X86::BI__builtin_ia32_vpshldw512:
    return EmitX86FunnelShift(*this, Ops[0], Ops[1], Ops[2], false);

  // vpshrd*: same helper with the trailing flag set to `true`; note that the
  // first two operands are passed in reverse order.
  case X86::BI__builtin_ia32_vpshrdd128:
  case X86::BI__builtin_ia32_vpshrdd256:
  case X86::BI__builtin_ia32_vpshrdd512:
  case X86::BI__builtin_ia32_vpshrdq128:
  case X86::BI__builtin_ia32_vpshrdq256:
  case X86::BI__builtin_ia32_vpshrdq512:
  case X86::BI__builtin_ia32_vpshrdw128:
  case X86::BI__builtin_ia32_vpshrdw256:
  case X86::BI__builtin_ia32_vpshrdw512:
    // Ops 0 and 1 are swapped.
    return EmitX86FunnelShift(*this, Ops[1], Ops[0], Ops[2], true);

  // vpshldv*: emitted exactly like the vpshld* group above (same operand
  // order, flag `false`).
  case X86::BI__builtin_ia32_vpshldvd128:
  case X86::BI__builtin_ia32_vpshldvd256:
  case X86::BI__builtin_ia32_vpshldvd512:
  case X86::BI__builtin_ia32_vpshldvq128:
  case X86::BI__builtin_ia32_vpshldvq256:
  case X86::BI__builtin_ia32_vpshldvq512:
  case X86::BI__builtin_ia32_vpshldvw128:
  case X86::BI__builtin_ia32_vpshldvw256:
  case X86::BI__builtin_ia32_vpshldvw512:
    return EmitX86FunnelShift(*this, Ops[0], Ops[1], Ops[2], false);

  // vpshrdv*: emitted exactly like the vpshrd* group above (first two
  // operands reversed, flag `true`).
  case X86::BI__builtin_ia32_vpshrdvd128:
  case X86::BI__builtin_ia32_vpshrdvd256:
  case X86::BI__builtin_ia32_vpshrdvd512:
  case X86::BI__builtin_ia32_vpshrdvq128:
  case X86::BI__builtin_ia32_vpshrdvq256:
  case X86::BI__builtin_ia32_vpshrdvq512:
  case X86::BI__builtin_ia32_vpshrdvw128:
  case X86::BI__builtin_ia32_vpshrdvw256:
  case X86::BI__builtin_ia32_vpshrdvw512:
    // Ops 0 and 1 are swapped.
    return EmitX86FunnelShift(*this, Ops[1], Ops[0], Ops[2], true);
14409 
14410   // Reductions
14411   case X86::BI__builtin_ia32_reduce_add_d512:
14412   case X86::BI__builtin_ia32_reduce_add_q512: {
14413     Function *F =
14414         CGM.getIntrinsic(Intrinsic::vector_reduce_add, Ops[0]->getType());
14415     return Builder.CreateCall(F, {Ops[0]});
14416   }
14417   case X86::BI__builtin_ia32_reduce_fadd_pd512:
14418   case X86::BI__builtin_ia32_reduce_fadd_ps512:
14419   case X86::BI__builtin_ia32_reduce_fadd_ph512:
14420   case X86::BI__builtin_ia32_reduce_fadd_ph256:
14421   case X86::BI__builtin_ia32_reduce_fadd_ph128: {
14422     Function *F =
14423         CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Ops[1]->getType());
14424     Builder.getFastMathFlags().setAllowReassoc();
14425     return Builder.CreateCall(F, {Ops[0], Ops[1]});
14426   }
14427   case X86::BI__builtin_ia32_reduce_fmul_pd512:
14428   case X86::BI__builtin_ia32_reduce_fmul_ps512:
14429   case X86::BI__builtin_ia32_reduce_fmul_ph512:
14430   case X86::BI__builtin_ia32_reduce_fmul_ph256:
14431   case X86::BI__builtin_ia32_reduce_fmul_ph128: {
14432     Function *F =
14433         CGM.getIntrinsic(Intrinsic::vector_reduce_fmul, Ops[1]->getType());
14434     Builder.getFastMathFlags().setAllowReassoc();
14435     return Builder.CreateCall(F, {Ops[0], Ops[1]});
14436   }
14437   case X86::BI__builtin_ia32_reduce_fmax_pd512:
14438   case X86::BI__builtin_ia32_reduce_fmax_ps512:
14439   case X86::BI__builtin_ia32_reduce_fmax_ph512:
14440   case X86::BI__builtin_ia32_reduce_fmax_ph256:
14441   case X86::BI__builtin_ia32_reduce_fmax_ph128: {
14442     Function *F =
14443         CGM.getIntrinsic(Intrinsic::vector_reduce_fmax, Ops[0]->getType());
14444     Builder.getFastMathFlags().setNoNaNs();
14445     return Builder.CreateCall(F, {Ops[0]});
14446   }
14447   case X86::BI__builtin_ia32_reduce_fmin_pd512:
14448   case X86::BI__builtin_ia32_reduce_fmin_ps512:
14449   case X86::BI__builtin_ia32_reduce_fmin_ph512:
14450   case X86::BI__builtin_ia32_reduce_fmin_ph256:
14451   case X86::BI__builtin_ia32_reduce_fmin_ph128: {
14452     Function *F =
14453         CGM.getIntrinsic(Intrinsic::vector_reduce_fmin, Ops[0]->getType());
14454     Builder.getFastMathFlags().setNoNaNs();
14455     return Builder.CreateCall(F, {Ops[0]});
14456   }
14457   case X86::BI__builtin_ia32_reduce_mul_d512:
14458   case X86::BI__builtin_ia32_reduce_mul_q512: {
14459     Function *F =
14460         CGM.getIntrinsic(Intrinsic::vector_reduce_mul, Ops[0]->getType());
14461     return Builder.CreateCall(F, {Ops[0]});
14462   }
14463 
14464   // 3DNow!
14465   case X86::BI__builtin_ia32_pswapdsf:
14466   case X86::BI__builtin_ia32_pswapdsi: {
14467     llvm::Type *MMXTy = llvm::Type::getX86_MMXTy(getLLVMContext());
14468     Ops[0] = Builder.CreateBitCast(Ops[0], MMXTy, "cast");
14469     llvm::Function *F = CGM.getIntrinsic(Intrinsic::x86_3dnowa_pswapd);
14470     return Builder.CreateCall(F, Ops, "pswapd");
14471   }
14472   case X86::BI__builtin_ia32_rdrand16_step:
14473   case X86::BI__builtin_ia32_rdrand32_step:
14474   case X86::BI__builtin_ia32_rdrand64_step:
14475   case X86::BI__builtin_ia32_rdseed16_step:
14476   case X86::BI__builtin_ia32_rdseed32_step:
14477   case X86::BI__builtin_ia32_rdseed64_step: {
14478     Intrinsic::ID ID;
14479     switch (BuiltinID) {
14480     default: llvm_unreachable("Unsupported intrinsic!");
14481     case X86::BI__builtin_ia32_rdrand16_step:
14482       ID = Intrinsic::x86_rdrand_16;
14483       break;
14484     case X86::BI__builtin_ia32_rdrand32_step:
14485       ID = Intrinsic::x86_rdrand_32;
14486       break;
14487     case X86::BI__builtin_ia32_rdrand64_step:
14488       ID = Intrinsic::x86_rdrand_64;
14489       break;
14490     case X86::BI__builtin_ia32_rdseed16_step:
14491       ID = Intrinsic::x86_rdseed_16;
14492       break;
14493     case X86::BI__builtin_ia32_rdseed32_step:
14494       ID = Intrinsic::x86_rdseed_32;
14495       break;
14496     case X86::BI__builtin_ia32_rdseed64_step:
14497       ID = Intrinsic::x86_rdseed_64;
14498       break;
14499     }
14500 
14501     Value *Call = Builder.CreateCall(CGM.getIntrinsic(ID));
14502     Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 0),
14503                                       Ops[0]);
14504     return Builder.CreateExtractValue(Call, 1);
14505   }
14506   case X86::BI__builtin_ia32_addcarryx_u32:
14507   case X86::BI__builtin_ia32_addcarryx_u64:
14508   case X86::BI__builtin_ia32_subborrow_u32:
14509   case X86::BI__builtin_ia32_subborrow_u64: {
14510     Intrinsic::ID IID;
14511     switch (BuiltinID) {
14512     default: llvm_unreachable("Unsupported intrinsic!");
14513     case X86::BI__builtin_ia32_addcarryx_u32:
14514       IID = Intrinsic::x86_addcarry_32;
14515       break;
14516     case X86::BI__builtin_ia32_addcarryx_u64:
14517       IID = Intrinsic::x86_addcarry_64;
14518       break;
14519     case X86::BI__builtin_ia32_subborrow_u32:
14520       IID = Intrinsic::x86_subborrow_32;
14521       break;
14522     case X86::BI__builtin_ia32_subborrow_u64:
14523       IID = Intrinsic::x86_subborrow_64;
14524       break;
14525     }
14526 
14527     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID),
14528                                      { Ops[0], Ops[1], Ops[2] });
14529     Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 1),
14530                                       Ops[3]);
14531     return Builder.CreateExtractValue(Call, 0);
14532   }
14533 
14534   case X86::BI__builtin_ia32_fpclassps128_mask:
14535   case X86::BI__builtin_ia32_fpclassps256_mask:
14536   case X86::BI__builtin_ia32_fpclassps512_mask:
14537   case X86::BI__builtin_ia32_fpclassph128_mask:
14538   case X86::BI__builtin_ia32_fpclassph256_mask:
14539   case X86::BI__builtin_ia32_fpclassph512_mask:
14540   case X86::BI__builtin_ia32_fpclasspd128_mask:
14541   case X86::BI__builtin_ia32_fpclasspd256_mask:
14542   case X86::BI__builtin_ia32_fpclasspd512_mask: {
14543     unsigned NumElts =
14544         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
14545     Value *MaskIn = Ops[2];
14546     Ops.erase(&Ops[2]);
14547 
14548     Intrinsic::ID ID;
14549     switch (BuiltinID) {
14550     default: llvm_unreachable("Unsupported intrinsic!");
14551     case X86::BI__builtin_ia32_fpclassph128_mask:
14552       ID = Intrinsic::x86_avx512fp16_fpclass_ph_128;
14553       break;
14554     case X86::BI__builtin_ia32_fpclassph256_mask:
14555       ID = Intrinsic::x86_avx512fp16_fpclass_ph_256;
14556       break;
14557     case X86::BI__builtin_ia32_fpclassph512_mask:
14558       ID = Intrinsic::x86_avx512fp16_fpclass_ph_512;
14559       break;
14560     case X86::BI__builtin_ia32_fpclassps128_mask:
14561       ID = Intrinsic::x86_avx512_fpclass_ps_128;
14562       break;
14563     case X86::BI__builtin_ia32_fpclassps256_mask:
14564       ID = Intrinsic::x86_avx512_fpclass_ps_256;
14565       break;
14566     case X86::BI__builtin_ia32_fpclassps512_mask:
14567       ID = Intrinsic::x86_avx512_fpclass_ps_512;
14568       break;
14569     case X86::BI__builtin_ia32_fpclasspd128_mask:
14570       ID = Intrinsic::x86_avx512_fpclass_pd_128;
14571       break;
14572     case X86::BI__builtin_ia32_fpclasspd256_mask:
14573       ID = Intrinsic::x86_avx512_fpclass_pd_256;
14574       break;
14575     case X86::BI__builtin_ia32_fpclasspd512_mask:
14576       ID = Intrinsic::x86_avx512_fpclass_pd_512;
14577       break;
14578     }
14579 
14580     Value *Fpclass = Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
14581     return EmitX86MaskedCompareResult(*this, Fpclass, NumElts, MaskIn);
14582   }
14583 
14584   case X86::BI__builtin_ia32_vp2intersect_q_512:
14585   case X86::BI__builtin_ia32_vp2intersect_q_256:
14586   case X86::BI__builtin_ia32_vp2intersect_q_128:
14587   case X86::BI__builtin_ia32_vp2intersect_d_512:
14588   case X86::BI__builtin_ia32_vp2intersect_d_256:
14589   case X86::BI__builtin_ia32_vp2intersect_d_128: {
14590     unsigned NumElts =
14591         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
14592     Intrinsic::ID ID;
14593 
14594     switch (BuiltinID) {
14595     default: llvm_unreachable("Unsupported intrinsic!");
14596     case X86::BI__builtin_ia32_vp2intersect_q_512:
14597       ID = Intrinsic::x86_avx512_vp2intersect_q_512;
14598       break;
14599     case X86::BI__builtin_ia32_vp2intersect_q_256:
14600       ID = Intrinsic::x86_avx512_vp2intersect_q_256;
14601       break;
14602     case X86::BI__builtin_ia32_vp2intersect_q_128:
14603       ID = Intrinsic::x86_avx512_vp2intersect_q_128;
14604       break;
14605     case X86::BI__builtin_ia32_vp2intersect_d_512:
14606       ID = Intrinsic::x86_avx512_vp2intersect_d_512;
14607       break;
14608     case X86::BI__builtin_ia32_vp2intersect_d_256:
14609       ID = Intrinsic::x86_avx512_vp2intersect_d_256;
14610       break;
14611     case X86::BI__builtin_ia32_vp2intersect_d_128:
14612       ID = Intrinsic::x86_avx512_vp2intersect_d_128;
14613       break;
14614     }
14615 
14616     Value *Call = Builder.CreateCall(CGM.getIntrinsic(ID), {Ops[0], Ops[1]});
14617     Value *Result = Builder.CreateExtractValue(Call, 0);
14618     Result = EmitX86MaskedCompareResult(*this, Result, NumElts, nullptr);
14619     Builder.CreateDefaultAlignedStore(Result, Ops[2]);
14620 
14621     Result = Builder.CreateExtractValue(Call, 1);
14622     Result = EmitX86MaskedCompareResult(*this, Result, NumElts, nullptr);
14623     return Builder.CreateDefaultAlignedStore(Result, Ops[3]);
14624   }
14625 
14626   case X86::BI__builtin_ia32_vpmultishiftqb128:
14627   case X86::BI__builtin_ia32_vpmultishiftqb256:
14628   case X86::BI__builtin_ia32_vpmultishiftqb512: {
14629     Intrinsic::ID ID;
14630     switch (BuiltinID) {
14631     default: llvm_unreachable("Unsupported intrinsic!");
14632     case X86::BI__builtin_ia32_vpmultishiftqb128:
14633       ID = Intrinsic::x86_avx512_pmultishift_qb_128;
14634       break;
14635     case X86::BI__builtin_ia32_vpmultishiftqb256:
14636       ID = Intrinsic::x86_avx512_pmultishift_qb_256;
14637       break;
14638     case X86::BI__builtin_ia32_vpmultishiftqb512:
14639       ID = Intrinsic::x86_avx512_pmultishift_qb_512;
14640       break;
14641     }
14642 
14643     return Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
14644   }
14645 
14646   case X86::BI__builtin_ia32_vpshufbitqmb128_mask:
14647   case X86::BI__builtin_ia32_vpshufbitqmb256_mask:
14648   case X86::BI__builtin_ia32_vpshufbitqmb512_mask: {
14649     unsigned NumElts =
14650         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
14651     Value *MaskIn = Ops[2];
14652     Ops.erase(&Ops[2]);
14653 
14654     Intrinsic::ID ID;
14655     switch (BuiltinID) {
14656     default: llvm_unreachable("Unsupported intrinsic!");
14657     case X86::BI__builtin_ia32_vpshufbitqmb128_mask:
14658       ID = Intrinsic::x86_avx512_vpshufbitqmb_128;
14659       break;
14660     case X86::BI__builtin_ia32_vpshufbitqmb256_mask:
14661       ID = Intrinsic::x86_avx512_vpshufbitqmb_256;
14662       break;
14663     case X86::BI__builtin_ia32_vpshufbitqmb512_mask:
14664       ID = Intrinsic::x86_avx512_vpshufbitqmb_512;
14665       break;
14666     }
14667 
14668     Value *Shufbit = Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
14669     return EmitX86MaskedCompareResult(*this, Shufbit, NumElts, MaskIn);
14670   }
14671 
14672   // packed comparison intrinsics
  // Each fixed-predicate packed compare maps to one fcmp predicate, handed to
  // getVectorFCmpIR together with a flag requesting the signaling form. The
  // lt/le/nlt/nle builtins request signaling compares; eq/unord/neq/ord do
  // not. The negated forms (neq/nlt/nle) use the unordered predicates
  // UNE/UGE/UGT.
  case X86::BI__builtin_ia32_cmpeqps:
  case X86::BI__builtin_ia32_cmpeqpd:
    return getVectorFCmpIR(CmpInst::FCMP_OEQ, /*IsSignaling*/false);
  case X86::BI__builtin_ia32_cmpltps:
  case X86::BI__builtin_ia32_cmpltpd:
    return getVectorFCmpIR(CmpInst::FCMP_OLT, /*IsSignaling*/true);
  case X86::BI__builtin_ia32_cmpleps:
  case X86::BI__builtin_ia32_cmplepd:
    return getVectorFCmpIR(CmpInst::FCMP_OLE, /*IsSignaling*/true);
  case X86::BI__builtin_ia32_cmpunordps:
  case X86::BI__builtin_ia32_cmpunordpd:
    return getVectorFCmpIR(CmpInst::FCMP_UNO, /*IsSignaling*/false);
  case X86::BI__builtin_ia32_cmpneqps:
  case X86::BI__builtin_ia32_cmpneqpd:
    return getVectorFCmpIR(CmpInst::FCMP_UNE, /*IsSignaling*/false);
  case X86::BI__builtin_ia32_cmpnltps:
  case X86::BI__builtin_ia32_cmpnltpd:
    return getVectorFCmpIR(CmpInst::FCMP_UGE, /*IsSignaling*/true);
  case X86::BI__builtin_ia32_cmpnleps:
  case X86::BI__builtin_ia32_cmpnlepd:
    return getVectorFCmpIR(CmpInst::FCMP_UGT, /*IsSignaling*/true);
  case X86::BI__builtin_ia32_cmpordps:
  case X86::BI__builtin_ia32_cmpordpd:
    return getVectorFCmpIR(CmpInst::FCMP_ORD, /*IsSignaling*/false);
14697   case X86::BI__builtin_ia32_cmpph128_mask:
14698   case X86::BI__builtin_ia32_cmpph256_mask:
14699   case X86::BI__builtin_ia32_cmpph512_mask:
14700   case X86::BI__builtin_ia32_cmpps128_mask:
14701   case X86::BI__builtin_ia32_cmpps256_mask:
14702   case X86::BI__builtin_ia32_cmpps512_mask:
14703   case X86::BI__builtin_ia32_cmppd128_mask:
14704   case X86::BI__builtin_ia32_cmppd256_mask:
14705   case X86::BI__builtin_ia32_cmppd512_mask:
14706     IsMaskFCmp = true;
14707     LLVM_FALLTHROUGH;
14708   case X86::BI__builtin_ia32_cmpps:
14709   case X86::BI__builtin_ia32_cmpps256:
14710   case X86::BI__builtin_ia32_cmppd:
14711   case X86::BI__builtin_ia32_cmppd256: {
14712     // Lowering vector comparisons to fcmp instructions, while
14713     // ignoring signalling behaviour requested
14714     // ignoring rounding mode requested
14715     // This is only possible if fp-model is not strict and FENV_ACCESS is off.
14716 
14717     // The third argument is the comparison condition, and integer in the
14718     // range [0, 31]
14719     unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x1f;
14720 
14721     // Lowering to IR fcmp instruction.
14722     // Ignoring requested signaling behaviour,
14723     // e.g. both _CMP_GT_OS & _CMP_GT_OQ are translated to FCMP_OGT.
14724     FCmpInst::Predicate Pred;
14725     bool IsSignaling;
14726     // Predicates for 16-31 repeat the 0-15 predicates. Only the signalling
14727     // behavior is inverted. We'll handle that after the switch.
14728     switch (CC & 0xf) {
14729     case 0x00: Pred = FCmpInst::FCMP_OEQ;   IsSignaling = false; break;
14730     case 0x01: Pred = FCmpInst::FCMP_OLT;   IsSignaling = true;  break;
14731     case 0x02: Pred = FCmpInst::FCMP_OLE;   IsSignaling = true;  break;
14732     case 0x03: Pred = FCmpInst::FCMP_UNO;   IsSignaling = false; break;
14733     case 0x04: Pred = FCmpInst::FCMP_UNE;   IsSignaling = false; break;
14734     case 0x05: Pred = FCmpInst::FCMP_UGE;   IsSignaling = true;  break;
14735     case 0x06: Pred = FCmpInst::FCMP_UGT;   IsSignaling = true;  break;
14736     case 0x07: Pred = FCmpInst::FCMP_ORD;   IsSignaling = false; break;
14737     case 0x08: Pred = FCmpInst::FCMP_UEQ;   IsSignaling = false; break;
14738     case 0x09: Pred = FCmpInst::FCMP_ULT;   IsSignaling = true;  break;
14739     case 0x0a: Pred = FCmpInst::FCMP_ULE;   IsSignaling = true;  break;
14740     case 0x0b: Pred = FCmpInst::FCMP_FALSE; IsSignaling = false; break;
14741     case 0x0c: Pred = FCmpInst::FCMP_ONE;   IsSignaling = false; break;
14742     case 0x0d: Pred = FCmpInst::FCMP_OGE;   IsSignaling = true;  break;
14743     case 0x0e: Pred = FCmpInst::FCMP_OGT;   IsSignaling = true;  break;
14744     case 0x0f: Pred = FCmpInst::FCMP_TRUE;  IsSignaling = false; break;
14745     default: llvm_unreachable("Unhandled CC");
14746     }
14747 
14748     // Invert the signalling behavior for 16-31.
14749     if (CC & 0x10)
14750       IsSignaling = !IsSignaling;
14751 
14752     // If the predicate is true or false and we're using constrained intrinsics,
14753     // we don't have a compare intrinsic we can use. Just use the legacy X86
14754     // specific intrinsic.
14755     // If the intrinsic is mask enabled and we're using constrained intrinsics,
14756     // use the legacy X86 specific intrinsic.
14757     if (Builder.getIsFPConstrained() &&
14758         (Pred == FCmpInst::FCMP_TRUE || Pred == FCmpInst::FCMP_FALSE ||
14759          IsMaskFCmp)) {
14760 
14761       Intrinsic::ID IID;
14762       switch (BuiltinID) {
14763       default: llvm_unreachable("Unexpected builtin");
14764       case X86::BI__builtin_ia32_cmpps:
14765         IID = Intrinsic::x86_sse_cmp_ps;
14766         break;
14767       case X86::BI__builtin_ia32_cmpps256:
14768         IID = Intrinsic::x86_avx_cmp_ps_256;
14769         break;
14770       case X86::BI__builtin_ia32_cmppd:
14771         IID = Intrinsic::x86_sse2_cmp_pd;
14772         break;
14773       case X86::BI__builtin_ia32_cmppd256:
14774         IID = Intrinsic::x86_avx_cmp_pd_256;
14775         break;
14776       case X86::BI__builtin_ia32_cmpps512_mask:
14777         IID = Intrinsic::x86_avx512_mask_cmp_ps_512;
14778         break;
14779       case X86::BI__builtin_ia32_cmppd512_mask:
14780         IID = Intrinsic::x86_avx512_mask_cmp_pd_512;
14781         break;
14782       case X86::BI__builtin_ia32_cmpps128_mask:
14783         IID = Intrinsic::x86_avx512_mask_cmp_ps_128;
14784         break;
14785       case X86::BI__builtin_ia32_cmpps256_mask:
14786         IID = Intrinsic::x86_avx512_mask_cmp_ps_256;
14787         break;
14788       case X86::BI__builtin_ia32_cmppd128_mask:
14789         IID = Intrinsic::x86_avx512_mask_cmp_pd_128;
14790         break;
14791       case X86::BI__builtin_ia32_cmppd256_mask:
14792         IID = Intrinsic::x86_avx512_mask_cmp_pd_256;
14793         break;
14794       }
14795 
14796       Function *Intr = CGM.getIntrinsic(IID);
14797       if (IsMaskFCmp) {
14798         unsigned NumElts =
14799             cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
14800         Ops[3] = getMaskVecValue(*this, Ops[3], NumElts);
14801         Value *Cmp = Builder.CreateCall(Intr, Ops);
14802         return EmitX86MaskedCompareResult(*this, Cmp, NumElts, nullptr);
14803       }
14804 
14805       return Builder.CreateCall(Intr, Ops);
14806     }
14807 
14808     // Builtins without the _mask suffix return a vector of integers
14809     // of the same width as the input vectors
14810     if (IsMaskFCmp) {
14811       // We ignore SAE if strict FP is disabled. We only keep precise
14812       // exception behavior under strict FP.
14813       // NOTE: If strict FP does ever go through here a CGFPOptionsRAII
14814       // object will be required.
14815       unsigned NumElts =
14816           cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
14817       Value *Cmp;
14818       if (IsSignaling)
14819         Cmp = Builder.CreateFCmpS(Pred, Ops[0], Ops[1]);
14820       else
14821         Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]);
14822       return EmitX86MaskedCompareResult(*this, Cmp, NumElts, Ops[3]);
14823     }
14824 
14825     return getVectorFCmpIR(Pred, IsSignaling);
14826   }
14827 
14828   // SSE scalar comparison intrinsics
14829   case X86::BI__builtin_ia32_cmpeqss:
14830     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 0);
14831   case X86::BI__builtin_ia32_cmpltss:
14832     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 1);
14833   case X86::BI__builtin_ia32_cmpless:
14834     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 2);
14835   case X86::BI__builtin_ia32_cmpunordss:
14836     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 3);
14837   case X86::BI__builtin_ia32_cmpneqss:
14838     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 4);
14839   case X86::BI__builtin_ia32_cmpnltss:
14840     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 5);
14841   case X86::BI__builtin_ia32_cmpnless:
14842     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 6);
14843   case X86::BI__builtin_ia32_cmpordss:
14844     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 7);
14845   case X86::BI__builtin_ia32_cmpeqsd:
14846     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 0);
14847   case X86::BI__builtin_ia32_cmpltsd:
14848     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 1);
14849   case X86::BI__builtin_ia32_cmplesd:
14850     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 2);
14851   case X86::BI__builtin_ia32_cmpunordsd:
14852     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 3);
14853   case X86::BI__builtin_ia32_cmpneqsd:
14854     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 4);
14855   case X86::BI__builtin_ia32_cmpnltsd:
14856     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 5);
14857   case X86::BI__builtin_ia32_cmpnlesd:
14858     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 6);
14859   case X86::BI__builtin_ia32_cmpordsd:
14860     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 7);
14861 
14862   // f16c half2float intrinsics
14863   case X86::BI__builtin_ia32_vcvtph2ps:
14864   case X86::BI__builtin_ia32_vcvtph2ps256:
14865   case X86::BI__builtin_ia32_vcvtph2ps_mask:
14866   case X86::BI__builtin_ia32_vcvtph2ps256_mask:
14867   case X86::BI__builtin_ia32_vcvtph2ps512_mask: {
14868     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
14869     return EmitX86CvtF16ToFloatExpr(*this, Ops, ConvertType(E->getType()));
14870   }
14871 
14872 // AVX512 bf16 intrinsics
14873   case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: {
14874     Ops[2] = getMaskVecValue(
14875         *this, Ops[2],
14876         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements());
14877     Intrinsic::ID IID = Intrinsic::x86_avx512bf16_mask_cvtneps2bf16_128;
14878     return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
14879   }
14880   case X86::BI__builtin_ia32_cvtsbf162ss_32:
14881     return EmitX86CvtBF16ToFloatExpr(*this, E, Ops);
14882 
14883   case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
14884   case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: {
14885     Intrinsic::ID IID;
14886     switch (BuiltinID) {
14887     default: llvm_unreachable("Unsupported intrinsic!");
14888     case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
14889       IID = Intrinsic::x86_avx512bf16_cvtneps2bf16_256;
14890       break;
14891     case X86::BI__builtin_ia32_cvtneps2bf16_512_mask:
14892       IID = Intrinsic::x86_avx512bf16_cvtneps2bf16_512;
14893       break;
14894     }
14895     Value *Res = Builder.CreateCall(CGM.getIntrinsic(IID), Ops[0]);
14896     return EmitX86Select(*this, Ops[2], Res, Ops[1]);
14897   }
14898 
14899   case X86::BI__emul:
14900   case X86::BI__emulu: {
14901     llvm::Type *Int64Ty = llvm::IntegerType::get(getLLVMContext(), 64);
14902     bool isSigned = (BuiltinID == X86::BI__emul);
14903     Value *LHS = Builder.CreateIntCast(Ops[0], Int64Ty, isSigned);
14904     Value *RHS = Builder.CreateIntCast(Ops[1], Int64Ty, isSigned);
14905     return Builder.CreateMul(LHS, RHS, "", !isSigned, isSigned);
14906   }
14907   case X86::BI__mulh:
14908   case X86::BI__umulh:
14909   case X86::BI_mul128:
14910   case X86::BI_umul128: {
14911     llvm::Type *ResType = ConvertType(E->getType());
14912     llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
14913 
14914     bool IsSigned = (BuiltinID == X86::BI__mulh || BuiltinID == X86::BI_mul128);
14915     Value *LHS = Builder.CreateIntCast(Ops[0], Int128Ty, IsSigned);
14916     Value *RHS = Builder.CreateIntCast(Ops[1], Int128Ty, IsSigned);
14917 
14918     Value *MulResult, *HigherBits;
14919     if (IsSigned) {
14920       MulResult = Builder.CreateNSWMul(LHS, RHS);
14921       HigherBits = Builder.CreateAShr(MulResult, 64);
14922     } else {
14923       MulResult = Builder.CreateNUWMul(LHS, RHS);
14924       HigherBits = Builder.CreateLShr(MulResult, 64);
14925     }
14926     HigherBits = Builder.CreateIntCast(HigherBits, ResType, IsSigned);
14927 
14928     if (BuiltinID == X86::BI__mulh || BuiltinID == X86::BI__umulh)
14929       return HigherBits;
14930 
14931     Address HighBitsAddress = EmitPointerWithAlignment(E->getArg(2));
14932     Builder.CreateStore(HigherBits, HighBitsAddress);
14933     return Builder.CreateIntCast(MulResult, ResType, IsSigned);
14934   }
14935 
14936   case X86::BI__faststorefence: {
14937     return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
14938                                llvm::SyncScope::System);
14939   }
14940   case X86::BI__shiftleft128:
14941   case X86::BI__shiftright128: {
14942     llvm::Function *F = CGM.getIntrinsic(
14943         BuiltinID == X86::BI__shiftleft128 ? Intrinsic::fshl : Intrinsic::fshr,
14944         Int64Ty);
14945     // Flip low/high ops and zero-extend amount to matching type.
14946     // shiftleft128(Low, High, Amt) -> fshl(High, Low, Amt)
14947     // shiftright128(Low, High, Amt) -> fshr(High, Low, Amt)
14948     std::swap(Ops[0], Ops[1]);
14949     Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
14950     return Builder.CreateCall(F, Ops);
14951   }
14952   case X86::BI_ReadWriteBarrier:
14953   case X86::BI_ReadBarrier:
14954   case X86::BI_WriteBarrier: {
14955     return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
14956                                llvm::SyncScope::SingleThread);
14957   }
14958 
14959   case X86::BI_AddressOfReturnAddress: {
14960     Function *F =
14961         CGM.getIntrinsic(Intrinsic::addressofreturnaddress, AllocaInt8PtrTy);
14962     return Builder.CreateCall(F);
14963   }
14964   case X86::BI__stosb: {
14965     // We treat __stosb as a volatile memset - it may not generate "rep stosb"
14966     // instruction, but it will create a memset that won't be optimized away.
14967     return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true);
14968   }
14969   case X86::BI__ud2:
14970     // llvm.trap makes a ud2a instruction on x86.
14971     return EmitTrapCall(Intrinsic::trap);
14972   case X86::BI__int2c: {
14973     // This syscall signals a driver assertion failure in x86 NT kernels.
14974     llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, false);
14975     llvm::InlineAsm *IA =
14976         llvm::InlineAsm::get(FTy, "int $$0x2c", "", /*hasSideEffects=*/true);
14977     llvm::AttributeList NoReturnAttr = llvm::AttributeList::get(
14978         getLLVMContext(), llvm::AttributeList::FunctionIndex,
14979         llvm::Attribute::NoReturn);
14980     llvm::CallInst *CI = Builder.CreateCall(IA);
14981     CI->setAttributes(NoReturnAttr);
14982     return CI;
14983   }
14984   case X86::BI__readfsbyte:
14985   case X86::BI__readfsword:
14986   case X86::BI__readfsdword:
14987   case X86::BI__readfsqword: {
14988     llvm::Type *IntTy = ConvertType(E->getType());
14989     Value *Ptr =
14990         Builder.CreateIntToPtr(Ops[0], llvm::PointerType::get(IntTy, 257));
14991     LoadInst *Load = Builder.CreateAlignedLoad(
14992         IntTy, Ptr, getContext().getTypeAlignInChars(E->getType()));
14993     Load->setVolatile(true);
14994     return Load;
14995   }
14996   case X86::BI__readgsbyte:
14997   case X86::BI__readgsword:
14998   case X86::BI__readgsdword:
14999   case X86::BI__readgsqword: {
15000     llvm::Type *IntTy = ConvertType(E->getType());
15001     Value *Ptr =
15002         Builder.CreateIntToPtr(Ops[0], llvm::PointerType::get(IntTy, 256));
15003     LoadInst *Load = Builder.CreateAlignedLoad(
15004         IntTy, Ptr, getContext().getTypeAlignInChars(E->getType()));
15005     Load->setVolatile(true);
15006     return Load;
15007   }
15008   case X86::BI__builtin_ia32_encodekey128_u32: {
15009     Intrinsic::ID IID = Intrinsic::x86_encodekey128;
15010 
15011     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[0], Ops[1]});
15012 
15013     for (int i = 0; i < 3; ++i) {
15014       Value *Extract = Builder.CreateExtractValue(Call, i + 1);
15015       Value *Ptr = Builder.CreateConstGEP1_32(Int8Ty, Ops[2], i * 16);
15016       Ptr = Builder.CreateBitCast(
15017           Ptr, llvm::PointerType::getUnqual(Extract->getType()));
15018       Builder.CreateAlignedStore(Extract, Ptr, Align(1));
15019     }
15020 
15021     return Builder.CreateExtractValue(Call, 0);
15022   }
15023   case X86::BI__builtin_ia32_encodekey256_u32: {
15024     Intrinsic::ID IID = Intrinsic::x86_encodekey256;
15025 
15026     Value *Call =
15027         Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[0], Ops[1], Ops[2]});
15028 
15029     for (int i = 0; i < 4; ++i) {
15030       Value *Extract = Builder.CreateExtractValue(Call, i + 1);
15031       Value *Ptr = Builder.CreateConstGEP1_32(Int8Ty, Ops[3], i * 16);
15032       Ptr = Builder.CreateBitCast(
15033           Ptr, llvm::PointerType::getUnqual(Extract->getType()));
15034       Builder.CreateAlignedStore(Extract, Ptr, Align(1));
15035     }
15036 
15037     return Builder.CreateExtractValue(Call, 0);
15038   }
15039   case X86::BI__builtin_ia32_aesenc128kl_u8:
15040   case X86::BI__builtin_ia32_aesdec128kl_u8:
15041   case X86::BI__builtin_ia32_aesenc256kl_u8:
15042   case X86::BI__builtin_ia32_aesdec256kl_u8: {
15043     Intrinsic::ID IID;
15044     StringRef BlockName;
15045     switch (BuiltinID) {
15046     default:
15047       llvm_unreachable("Unexpected builtin");
15048     case X86::BI__builtin_ia32_aesenc128kl_u8:
15049       IID = Intrinsic::x86_aesenc128kl;
15050       BlockName = "aesenc128kl";
15051       break;
15052     case X86::BI__builtin_ia32_aesdec128kl_u8:
15053       IID = Intrinsic::x86_aesdec128kl;
15054       BlockName = "aesdec128kl";
15055       break;
15056     case X86::BI__builtin_ia32_aesenc256kl_u8:
15057       IID = Intrinsic::x86_aesenc256kl;
15058       BlockName = "aesenc256kl";
15059       break;
15060     case X86::BI__builtin_ia32_aesdec256kl_u8:
15061       IID = Intrinsic::x86_aesdec256kl;
15062       BlockName = "aesdec256kl";
15063       break;
15064     }
15065 
15066     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[1], Ops[2]});
15067 
15068     BasicBlock *NoError =
15069         createBasicBlock(BlockName + "_no_error", this->CurFn);
15070     BasicBlock *Error = createBasicBlock(BlockName + "_error", this->CurFn);
15071     BasicBlock *End = createBasicBlock(BlockName + "_end", this->CurFn);
15072 
15073     Value *Ret = Builder.CreateExtractValue(Call, 0);
15074     Value *Succ = Builder.CreateTrunc(Ret, Builder.getInt1Ty());
15075     Value *Out = Builder.CreateExtractValue(Call, 1);
15076     Builder.CreateCondBr(Succ, NoError, Error);
15077 
15078     Builder.SetInsertPoint(NoError);
15079     Builder.CreateDefaultAlignedStore(Out, Ops[0]);
15080     Builder.CreateBr(End);
15081 
15082     Builder.SetInsertPoint(Error);
15083     Constant *Zero = llvm::Constant::getNullValue(Out->getType());
15084     Builder.CreateDefaultAlignedStore(Zero, Ops[0]);
15085     Builder.CreateBr(End);
15086 
15087     Builder.SetInsertPoint(End);
15088     return Builder.CreateExtractValue(Call, 0);
15089   }
15090   case X86::BI__builtin_ia32_aesencwide128kl_u8:
15091   case X86::BI__builtin_ia32_aesdecwide128kl_u8:
15092   case X86::BI__builtin_ia32_aesencwide256kl_u8:
15093   case X86::BI__builtin_ia32_aesdecwide256kl_u8: {
15094     Intrinsic::ID IID;
15095     StringRef BlockName;
15096     switch (BuiltinID) {
15097     case X86::BI__builtin_ia32_aesencwide128kl_u8:
15098       IID = Intrinsic::x86_aesencwide128kl;
15099       BlockName = "aesencwide128kl";
15100       break;
15101     case X86::BI__builtin_ia32_aesdecwide128kl_u8:
15102       IID = Intrinsic::x86_aesdecwide128kl;
15103       BlockName = "aesdecwide128kl";
15104       break;
15105     case X86::BI__builtin_ia32_aesencwide256kl_u8:
15106       IID = Intrinsic::x86_aesencwide256kl;
15107       BlockName = "aesencwide256kl";
15108       break;
15109     case X86::BI__builtin_ia32_aesdecwide256kl_u8:
15110       IID = Intrinsic::x86_aesdecwide256kl;
15111       BlockName = "aesdecwide256kl";
15112       break;
15113     }
15114 
15115     llvm::Type *Ty = FixedVectorType::get(Builder.getInt64Ty(), 2);
15116     Value *InOps[9];
15117     InOps[0] = Ops[2];
15118     for (int i = 0; i != 8; ++i) {
15119       Value *Ptr = Builder.CreateConstGEP1_32(Ty, Ops[1], i);
15120       InOps[i + 1] = Builder.CreateAlignedLoad(Ty, Ptr, Align(16));
15121     }
15122 
15123     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), InOps);
15124 
15125     BasicBlock *NoError =
15126         createBasicBlock(BlockName + "_no_error", this->CurFn);
15127     BasicBlock *Error = createBasicBlock(BlockName + "_error", this->CurFn);
15128     BasicBlock *End = createBasicBlock(BlockName + "_end", this->CurFn);
15129 
15130     Value *Ret = Builder.CreateExtractValue(Call, 0);
15131     Value *Succ = Builder.CreateTrunc(Ret, Builder.getInt1Ty());
15132     Builder.CreateCondBr(Succ, NoError, Error);
15133 
15134     Builder.SetInsertPoint(NoError);
15135     for (int i = 0; i != 8; ++i) {
15136       Value *Extract = Builder.CreateExtractValue(Call, i + 1);
15137       Value *Ptr = Builder.CreateConstGEP1_32(Extract->getType(), Ops[0], i);
15138       Builder.CreateAlignedStore(Extract, Ptr, Align(16));
15139     }
15140     Builder.CreateBr(End);
15141 
15142     Builder.SetInsertPoint(Error);
15143     for (int i = 0; i != 8; ++i) {
15144       Value *Out = Builder.CreateExtractValue(Call, i + 1);
15145       Constant *Zero = llvm::Constant::getNullValue(Out->getType());
15146       Value *Ptr = Builder.CreateConstGEP1_32(Out->getType(), Ops[0], i);
15147       Builder.CreateAlignedStore(Zero, Ptr, Align(16));
15148     }
15149     Builder.CreateBr(End);
15150 
15151     Builder.SetInsertPoint(End);
15152     return Builder.CreateExtractValue(Call, 0);
15153   }
15154   case X86::BI__builtin_ia32_vfcmaddcph512_mask:
15155     IsConjFMA = true;
15156     LLVM_FALLTHROUGH;
15157   case X86::BI__builtin_ia32_vfmaddcph512_mask: {
15158     Intrinsic::ID IID = IsConjFMA
15159                             ? Intrinsic::x86_avx512fp16_mask_vfcmadd_cph_512
15160                             : Intrinsic::x86_avx512fp16_mask_vfmadd_cph_512;
15161     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
15162     return EmitX86Select(*this, Ops[3], Call, Ops[0]);
15163   }
15164   case X86::BI__builtin_ia32_vfcmaddcsh_round_mask:
15165     IsConjFMA = true;
15166     LLVM_FALLTHROUGH;
15167   case X86::BI__builtin_ia32_vfmaddcsh_round_mask: {
15168     Intrinsic::ID IID = IsConjFMA ? Intrinsic::x86_avx512fp16_mask_vfcmadd_csh
15169                                   : Intrinsic::x86_avx512fp16_mask_vfmadd_csh;
15170     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
15171     Value *And = Builder.CreateAnd(Ops[3], llvm::ConstantInt::get(Int8Ty, 1));
15172     return EmitX86Select(*this, And, Call, Ops[0]);
15173   }
15174   case X86::BI__builtin_ia32_vfcmaddcsh_round_mask3:
15175     IsConjFMA = true;
15176     LLVM_FALLTHROUGH;
15177   case X86::BI__builtin_ia32_vfmaddcsh_round_mask3: {
15178     Intrinsic::ID IID = IsConjFMA ? Intrinsic::x86_avx512fp16_mask_vfcmadd_csh
15179                                   : Intrinsic::x86_avx512fp16_mask_vfmadd_csh;
15180     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
15181     static constexpr int Mask[] = {0, 5, 6, 7};
15182     return Builder.CreateShuffleVector(Call, Ops[2], Mask);
15183   }
15184   }
15185 }
15186 
15187 Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
15188                                            const CallExpr *E) {
15189   SmallVector<Value*, 4> Ops;
15190 
15191   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
15192     if (E->getArg(i)->getType()->isArrayType())
15193       Ops.push_back(EmitArrayToPointerDecay(E->getArg(i)).getPointer());
15194     else
15195       Ops.push_back(EmitScalarExpr(E->getArg(i)));
15196   }
15197 
15198   Intrinsic::ID ID = Intrinsic::not_intrinsic;
15199 
15200   switch (BuiltinID) {
15201   default: return nullptr;
15202 
15203   // __builtin_ppc_get_timebase is GCC 4.8+'s PowerPC-specific name for what we
15204   // call __builtin_readcyclecounter.
15205   case PPC::BI__builtin_ppc_get_timebase:
15206     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::readcyclecounter));
15207 
15208   // vec_ld, vec_xl_be, vec_lvsl, vec_lvsr
15209   case PPC::BI__builtin_altivec_lvx:
15210   case PPC::BI__builtin_altivec_lvxl:
15211   case PPC::BI__builtin_altivec_lvebx:
15212   case PPC::BI__builtin_altivec_lvehx:
15213   case PPC::BI__builtin_altivec_lvewx:
15214   case PPC::BI__builtin_altivec_lvsl:
15215   case PPC::BI__builtin_altivec_lvsr:
15216   case PPC::BI__builtin_vsx_lxvd2x:
15217   case PPC::BI__builtin_vsx_lxvw4x:
15218   case PPC::BI__builtin_vsx_lxvd2x_be:
15219   case PPC::BI__builtin_vsx_lxvw4x_be:
15220   case PPC::BI__builtin_vsx_lxvl:
15221   case PPC::BI__builtin_vsx_lxvll:
15222   {
15223     if(BuiltinID == PPC::BI__builtin_vsx_lxvl ||
15224        BuiltinID == PPC::BI__builtin_vsx_lxvll){
15225       Ops[0] = Builder.CreateBitCast(Ops[0], Int8PtrTy);
15226     }else {
15227       Ops[1] = Builder.CreateBitCast(Ops[1], Int8PtrTy);
15228       Ops[0] = Builder.CreateGEP(Int8Ty, Ops[1], Ops[0]);
15229       Ops.pop_back();
15230     }
15231 
15232     switch (BuiltinID) {
15233     default: llvm_unreachable("Unsupported ld/lvsl/lvsr intrinsic!");
15234     case PPC::BI__builtin_altivec_lvx:
15235       ID = Intrinsic::ppc_altivec_lvx;
15236       break;
15237     case PPC::BI__builtin_altivec_lvxl:
15238       ID = Intrinsic::ppc_altivec_lvxl;
15239       break;
15240     case PPC::BI__builtin_altivec_lvebx:
15241       ID = Intrinsic::ppc_altivec_lvebx;
15242       break;
15243     case PPC::BI__builtin_altivec_lvehx:
15244       ID = Intrinsic::ppc_altivec_lvehx;
15245       break;
15246     case PPC::BI__builtin_altivec_lvewx:
15247       ID = Intrinsic::ppc_altivec_lvewx;
15248       break;
15249     case PPC::BI__builtin_altivec_lvsl:
15250       ID = Intrinsic::ppc_altivec_lvsl;
15251       break;
15252     case PPC::BI__builtin_altivec_lvsr:
15253       ID = Intrinsic::ppc_altivec_lvsr;
15254       break;
15255     case PPC::BI__builtin_vsx_lxvd2x:
15256       ID = Intrinsic::ppc_vsx_lxvd2x;
15257       break;
15258     case PPC::BI__builtin_vsx_lxvw4x:
15259       ID = Intrinsic::ppc_vsx_lxvw4x;
15260       break;
15261     case PPC::BI__builtin_vsx_lxvd2x_be:
15262       ID = Intrinsic::ppc_vsx_lxvd2x_be;
15263       break;
15264     case PPC::BI__builtin_vsx_lxvw4x_be:
15265       ID = Intrinsic::ppc_vsx_lxvw4x_be;
15266       break;
15267     case PPC::BI__builtin_vsx_lxvl:
15268       ID = Intrinsic::ppc_vsx_lxvl;
15269       break;
15270     case PPC::BI__builtin_vsx_lxvll:
15271       ID = Intrinsic::ppc_vsx_lxvll;
15272       break;
15273     }
15274     llvm::Function *F = CGM.getIntrinsic(ID);
15275     return Builder.CreateCall(F, Ops, "");
15276   }
15277 
15278   // vec_st, vec_xst_be
15279   case PPC::BI__builtin_altivec_stvx:
15280   case PPC::BI__builtin_altivec_stvxl:
15281   case PPC::BI__builtin_altivec_stvebx:
15282   case PPC::BI__builtin_altivec_stvehx:
15283   case PPC::BI__builtin_altivec_stvewx:
15284   case PPC::BI__builtin_vsx_stxvd2x:
15285   case PPC::BI__builtin_vsx_stxvw4x:
15286   case PPC::BI__builtin_vsx_stxvd2x_be:
15287   case PPC::BI__builtin_vsx_stxvw4x_be:
15288   case PPC::BI__builtin_vsx_stxvl:
15289   case PPC::BI__builtin_vsx_stxvll:
15290   {
15291     if(BuiltinID == PPC::BI__builtin_vsx_stxvl ||
15292       BuiltinID == PPC::BI__builtin_vsx_stxvll ){
15293       Ops[1] = Builder.CreateBitCast(Ops[1], Int8PtrTy);
15294     }else {
15295       Ops[2] = Builder.CreateBitCast(Ops[2], Int8PtrTy);
15296       Ops[1] = Builder.CreateGEP(Int8Ty, Ops[2], Ops[1]);
15297       Ops.pop_back();
15298     }
15299 
15300     switch (BuiltinID) {
15301     default: llvm_unreachable("Unsupported st intrinsic!");
15302     case PPC::BI__builtin_altivec_stvx:
15303       ID = Intrinsic::ppc_altivec_stvx;
15304       break;
15305     case PPC::BI__builtin_altivec_stvxl:
15306       ID = Intrinsic::ppc_altivec_stvxl;
15307       break;
15308     case PPC::BI__builtin_altivec_stvebx:
15309       ID = Intrinsic::ppc_altivec_stvebx;
15310       break;
15311     case PPC::BI__builtin_altivec_stvehx:
15312       ID = Intrinsic::ppc_altivec_stvehx;
15313       break;
15314     case PPC::BI__builtin_altivec_stvewx:
15315       ID = Intrinsic::ppc_altivec_stvewx;
15316       break;
15317     case PPC::BI__builtin_vsx_stxvd2x:
15318       ID = Intrinsic::ppc_vsx_stxvd2x;
15319       break;
15320     case PPC::BI__builtin_vsx_stxvw4x:
15321       ID = Intrinsic::ppc_vsx_stxvw4x;
15322       break;
15323     case PPC::BI__builtin_vsx_stxvd2x_be:
15324       ID = Intrinsic::ppc_vsx_stxvd2x_be;
15325       break;
15326     case PPC::BI__builtin_vsx_stxvw4x_be:
15327       ID = Intrinsic::ppc_vsx_stxvw4x_be;
15328       break;
15329     case PPC::BI__builtin_vsx_stxvl:
15330       ID = Intrinsic::ppc_vsx_stxvl;
15331       break;
15332     case PPC::BI__builtin_vsx_stxvll:
15333       ID = Intrinsic::ppc_vsx_stxvll;
15334       break;
15335     }
15336     llvm::Function *F = CGM.getIntrinsic(ID);
15337     return Builder.CreateCall(F, Ops, "");
15338   }
15339   case PPC::BI__builtin_vsx_ldrmb: {
15340     // Essentially boils down to performing an unaligned VMX load sequence so
15341     // as to avoid crossing a page boundary and then shuffling the elements
15342     // into the right side of the vector register.
15343     int64_t NumBytes = cast<ConstantInt>(Ops[1])->getZExtValue();
15344     llvm::Type *ResTy = ConvertType(E->getType());
15345     bool IsLE = getTarget().isLittleEndian();
15346 
15347     // If the user wants the entire vector, just load the entire vector.
15348     if (NumBytes == 16) {
15349       Value *BC = Builder.CreateBitCast(Ops[0], ResTy->getPointerTo());
15350       Value *LD =
15351           Builder.CreateLoad(Address(BC, ResTy, CharUnits::fromQuantity(1)));
15352       if (!IsLE)
15353         return LD;
15354 
15355       // Reverse the bytes on LE.
15356       SmallVector<int, 16> RevMask;
15357       for (int Idx = 0; Idx < 16; Idx++)
15358         RevMask.push_back(15 - Idx);
15359       return Builder.CreateShuffleVector(LD, LD, RevMask);
15360     }
15361 
15362     llvm::Function *Lvx = CGM.getIntrinsic(Intrinsic::ppc_altivec_lvx);
15363     llvm::Function *Lvs = CGM.getIntrinsic(IsLE ? Intrinsic::ppc_altivec_lvsr
15364                                                 : Intrinsic::ppc_altivec_lvsl);
15365     llvm::Function *Vperm = CGM.getIntrinsic(Intrinsic::ppc_altivec_vperm);
15366     Value *HiMem = Builder.CreateGEP(
15367         Int8Ty, Ops[0], ConstantInt::get(Ops[1]->getType(), NumBytes - 1));
15368     Value *LoLd = Builder.CreateCall(Lvx, Ops[0], "ld.lo");
15369     Value *HiLd = Builder.CreateCall(Lvx, HiMem, "ld.hi");
15370     Value *Mask1 = Builder.CreateCall(Lvs, Ops[0], "mask1");
15371 
15372     Ops.clear();
15373     Ops.push_back(IsLE ? HiLd : LoLd);
15374     Ops.push_back(IsLE ? LoLd : HiLd);
15375     Ops.push_back(Mask1);
15376     Value *AllElts = Builder.CreateCall(Vperm, Ops, "shuffle1");
15377     Constant *Zero = llvm::Constant::getNullValue(IsLE ? ResTy : AllElts->getType());
15378 
15379     if (IsLE) {
15380       SmallVector<int, 16> Consts;
15381       for (int Idx = 0; Idx < 16; Idx++) {
15382         int Val = (NumBytes - Idx - 1 >= 0) ? (NumBytes - Idx - 1)
15383                                             : 16 - (NumBytes - Idx);
15384         Consts.push_back(Val);
15385       }
15386       return Builder.CreateShuffleVector(Builder.CreateBitCast(AllElts, ResTy),
15387                                          Zero, Consts);
15388     }
15389     SmallVector<Constant *, 16> Consts;
15390     for (int Idx = 0; Idx < 16; Idx++)
15391       Consts.push_back(Builder.getInt8(NumBytes + Idx));
15392     Value *Mask2 = ConstantVector::get(Consts);
15393     return Builder.CreateBitCast(
15394         Builder.CreateCall(Vperm, {Zero, AllElts, Mask2}, "shuffle2"), ResTy);
15395   }
15396   case PPC::BI__builtin_vsx_strmb: {
15397     int64_t NumBytes = cast<ConstantInt>(Ops[1])->getZExtValue();
15398     bool IsLE = getTarget().isLittleEndian();
15399     auto StoreSubVec = [&](unsigned Width, unsigned Offset, unsigned EltNo) {
15400       // Storing the whole vector, simply store it on BE and reverse bytes and
15401       // store on LE.
15402       if (Width == 16) {
15403         Value *BC =
15404             Builder.CreateBitCast(Ops[0], Ops[2]->getType()->getPointerTo());
15405         Value *StVec = Ops[2];
15406         if (IsLE) {
15407           SmallVector<int, 16> RevMask;
15408           for (int Idx = 0; Idx < 16; Idx++)
15409             RevMask.push_back(15 - Idx);
15410           StVec = Builder.CreateShuffleVector(Ops[2], Ops[2], RevMask);
15411         }
15412         return Builder.CreateStore(
15413             StVec, Address(BC, Ops[2]->getType(), CharUnits::fromQuantity(1)));
15414       }
15415       auto *ConvTy = Int64Ty;
15416       unsigned NumElts = 0;
15417       switch (Width) {
15418       default:
15419         llvm_unreachable("width for stores must be a power of 2");
15420       case 8:
15421         ConvTy = Int64Ty;
15422         NumElts = 2;
15423         break;
15424       case 4:
15425         ConvTy = Int32Ty;
15426         NumElts = 4;
15427         break;
15428       case 2:
15429         ConvTy = Int16Ty;
15430         NumElts = 8;
15431         break;
15432       case 1:
15433         ConvTy = Int8Ty;
15434         NumElts = 16;
15435         break;
15436       }
15437       Value *Vec = Builder.CreateBitCast(
15438           Ops[2], llvm::FixedVectorType::get(ConvTy, NumElts));
15439       Value *Ptr = Builder.CreateGEP(Int8Ty, Ops[0],
15440                                      ConstantInt::get(Int64Ty, Offset));
15441       Value *PtrBC = Builder.CreateBitCast(Ptr, ConvTy->getPointerTo());
15442       Value *Elt = Builder.CreateExtractElement(Vec, EltNo);
15443       if (IsLE && Width > 1) {
15444         Function *F = CGM.getIntrinsic(Intrinsic::bswap, ConvTy);
15445         Elt = Builder.CreateCall(F, Elt);
15446       }
15447       return Builder.CreateStore(
15448           Elt, Address(PtrBC, ConvTy, CharUnits::fromQuantity(1)));
15449     };
15450     unsigned Stored = 0;
15451     unsigned RemainingBytes = NumBytes;
15452     Value *Result;
15453     if (NumBytes == 16)
15454       return StoreSubVec(16, 0, 0);
15455     if (NumBytes >= 8) {
15456       Result = StoreSubVec(8, NumBytes - 8, IsLE ? 0 : 1);
15457       RemainingBytes -= 8;
15458       Stored += 8;
15459     }
15460     if (RemainingBytes >= 4) {
15461       Result = StoreSubVec(4, NumBytes - Stored - 4,
15462                            IsLE ? (Stored >> 2) : 3 - (Stored >> 2));
15463       RemainingBytes -= 4;
15464       Stored += 4;
15465     }
15466     if (RemainingBytes >= 2) {
15467       Result = StoreSubVec(2, NumBytes - Stored - 2,
15468                            IsLE ? (Stored >> 1) : 7 - (Stored >> 1));
15469       RemainingBytes -= 2;
15470       Stored += 2;
15471     }
15472     if (RemainingBytes)
15473       Result =
15474           StoreSubVec(1, NumBytes - Stored - 1, IsLE ? Stored : 15 - Stored);
15475     return Result;
15476   }
15477   // Square root
15478   case PPC::BI__builtin_vsx_xvsqrtsp:
15479   case PPC::BI__builtin_vsx_xvsqrtdp: {
15480     llvm::Type *ResultType = ConvertType(E->getType());
15481     Value *X = EmitScalarExpr(E->getArg(0));
15482     if (Builder.getIsFPConstrained()) {
15483       llvm::Function *F = CGM.getIntrinsic(
15484           Intrinsic::experimental_constrained_sqrt, ResultType);
15485       return Builder.CreateConstrainedFPCall(F, X);
15486     } else {
15487       llvm::Function *F = CGM.getIntrinsic(Intrinsic::sqrt, ResultType);
15488       return Builder.CreateCall(F, X);
15489     }
15490   }
15491   // Count leading zeros
15492   case PPC::BI__builtin_altivec_vclzb:
15493   case PPC::BI__builtin_altivec_vclzh:
15494   case PPC::BI__builtin_altivec_vclzw:
15495   case PPC::BI__builtin_altivec_vclzd: {
15496     llvm::Type *ResultType = ConvertType(E->getType());
15497     Value *X = EmitScalarExpr(E->getArg(0));
15498     Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
15499     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ResultType);
15500     return Builder.CreateCall(F, {X, Undef});
15501   }
15502   case PPC::BI__builtin_altivec_vctzb:
15503   case PPC::BI__builtin_altivec_vctzh:
15504   case PPC::BI__builtin_altivec_vctzw:
15505   case PPC::BI__builtin_altivec_vctzd: {
15506     llvm::Type *ResultType = ConvertType(E->getType());
15507     Value *X = EmitScalarExpr(E->getArg(0));
15508     Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
15509     Function *F = CGM.getIntrinsic(Intrinsic::cttz, ResultType);
15510     return Builder.CreateCall(F, {X, Undef});
15511   }
  // vec_replace_elt / vec_replace_unaligned: insert the scalar Ops[1] into the
  // vector Ops[0] at the constant position Ops[2], by calling the vinsw
  // (32-bit scalar) or vinsd (64-bit scalar) intrinsic. Ops[] is rewritten in
  // place so that it can be passed directly as the intrinsic's operand list.
  case PPC::BI__builtin_altivec_vec_replace_elt:
  case PPC::BI__builtin_altivec_vec_replace_unaligned: {
    // The third argument of vec_replace_elt and vec_replace_unaligned must
    // be a compile time constant and will be emitted either to the vinsw
    // or vinsd instruction.
    ConstantInt *ArgCI = dyn_cast<ConstantInt>(Ops[2]);
    assert(ArgCI &&
           "Third Arg to vinsw/vinsd intrinsic must be a constant integer!");
    llvm::Type *ResultType = ConvertType(E->getType());
    llvm::Function *F = nullptr;
    Value *Call = nullptr;
    int64_t ConstArg = ArgCI->getSExtValue();
    // The width of the scalar being inserted selects vinsw vs. vinsd below.
    unsigned ArgWidth = Ops[1]->getType()->getPrimitiveSizeInBits();
    bool Is32Bit = false;
    assert((ArgWidth == 32 || ArgWidth == 64) && "Invalid argument width");
    // The input to vec_replace_elt is an element index, not a byte index.
    // Scale it by the element size in bytes; vec_replace_unaligned already
    // supplies a byte offset and is left untouched.
    if (BuiltinID == PPC::BI__builtin_altivec_vec_replace_elt)
      ConstArg *= ArgWidth / 8;
    if (ArgWidth == 32) {
      Is32Bit = true;
      // When the second argument is 32 bits, it can either be an integer or
      // a float. The vinsw intrinsic is used in this case.
      F = CGM.getIntrinsic(Intrinsic::ppc_altivec_vinsw);
      // Fix the constant according to endianness.
      // NOTE(review): 12 is presumably the 16-byte vector size minus the
      // 4-byte element size, i.e. the mirrored byte offset -- confirm.
      if (getTarget().isLittleEndian())
        ConstArg = 12 - ConstArg;
    } else {
      // When the second argument is 64 bits, it can either be a long long or
      // a double. The vinsd intrinsic is used in this case.
      F = CGM.getIntrinsic(Intrinsic::ppc_altivec_vinsd);
      // Fix the constant for little endian.
      // NOTE(review): 8 is presumably the 16-byte vector size minus the
      // 8-byte element size -- confirm.
      if (getTarget().isLittleEndian())
        ConstArg = 8 - ConstArg;
    }
    Ops[2] = ConstantInt::getSigned(Int32Ty, ConstArg);
    // Depending on ArgWidth, the input vector could be a float or a double.
    // If the input vector is a float type, bitcast the inputs to integers. Or,
    // if the input vector is a double, bitcast the inputs to 64-bit integers.
    if (!Ops[1]->getType()->isIntegerTy(ArgWidth)) {
      Ops[0] = Builder.CreateBitCast(
          Ops[0], Is32Bit ? llvm::FixedVectorType::get(Int32Ty, 4)
                          : llvm::FixedVectorType::get(Int64Ty, 2));
      Ops[1] = Builder.CreateBitCast(Ops[1], Is32Bit ? Int32Ty : Int64Ty);
    }
    // Emit the call to vinsw or vinsd.
    Call = Builder.CreateCall(F, Ops);
    // Depending on the builtin, bitcast to the appropriate result type.
    // NOTE(review): Ops[1] is always an integer by this point -- either it
    // already was one, or the bitcast above replaced it -- so the
    // `!isIntegerTy()` branch below looks unreachable and a float/double
    // vec_replace_elt returns the integer-vector call result without a cast
    // to ResultType. Confirm whether the check was meant to look at the
    // original (pre-bitcast) argument type.
    if (BuiltinID == PPC::BI__builtin_altivec_vec_replace_elt &&
        !Ops[1]->getType()->isIntegerTy())
      return Builder.CreateBitCast(Call, ResultType);
    else if (BuiltinID == PPC::BI__builtin_altivec_vec_replace_elt &&
             Ops[1]->getType()->isIntegerTy())
      return Call;
    else
      // vec_replace_unaligned: the result is returned as a <16 x i8> vector.
      return Builder.CreateBitCast(Call,
                                   llvm::FixedVectorType::get(Int8Ty, 16));
  }
15569   case PPC::BI__builtin_altivec_vpopcntb:
15570   case PPC::BI__builtin_altivec_vpopcnth:
15571   case PPC::BI__builtin_altivec_vpopcntw:
15572   case PPC::BI__builtin_altivec_vpopcntd: {
15573     llvm::Type *ResultType = ConvertType(E->getType());
15574     Value *X = EmitScalarExpr(E->getArg(0));
15575     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ResultType);
15576     return Builder.CreateCall(F, X);
15577   }
15578   case PPC::BI__builtin_altivec_vadduqm:
15579   case PPC::BI__builtin_altivec_vsubuqm: {
15580     llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
15581     Ops[0] =
15582         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int128Ty, 1));
15583     Ops[1] =
15584         Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(Int128Ty, 1));
15585     if (BuiltinID == PPC::BI__builtin_altivec_vadduqm)
15586       return Builder.CreateAdd(Ops[0], Ops[1], "vadduqm");
15587     else
15588       return Builder.CreateSub(Ops[0], Ops[1], "vsubuqm");
15589   }
15590   // Rotate and insert under mask operation.
15591   // __rldimi(rs, is, shift, mask)
15592   // (rotl64(rs, shift) & mask) | (is & ~mask)
15593   // __rlwimi(rs, is, shift, mask)
15594   // (rotl(rs, shift) & mask) | (is & ~mask)
15595   case PPC::BI__builtin_ppc_rldimi:
15596   case PPC::BI__builtin_ppc_rlwimi: {
15597     llvm::Type *Ty = Ops[0]->getType();
15598     Function *F = CGM.getIntrinsic(Intrinsic::fshl, Ty);
15599     if (BuiltinID == PPC::BI__builtin_ppc_rldimi)
15600       Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
15601     Value *Shift = Builder.CreateCall(F, {Ops[0], Ops[0], Ops[2]});
15602     Value *X = Builder.CreateAnd(Shift, Ops[3]);
15603     Value *Y = Builder.CreateAnd(Ops[1], Builder.CreateNot(Ops[3]));
15604     return Builder.CreateOr(X, Y);
15605   }
  // Rotate and AND with mask operation.
15607   // __rlwnm(rs, shift, mask)
15608   // rotl(rs, shift) & mask
15609   case PPC::BI__builtin_ppc_rlwnm: {
15610     llvm::Type *Ty = Ops[0]->getType();
15611     Function *F = CGM.getIntrinsic(Intrinsic::fshl, Ty);
15612     Value *Shift = Builder.CreateCall(F, {Ops[0], Ops[0], Ops[1]});
15613     return Builder.CreateAnd(Shift, Ops[2]);
15614   }
15615   case PPC::BI__builtin_ppc_poppar4:
15616   case PPC::BI__builtin_ppc_poppar8: {
15617     llvm::Type *ArgType = Ops[0]->getType();
15618     Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType);
15619     Value *Tmp = Builder.CreateCall(F, Ops[0]);
15620 
15621     llvm::Type *ResultType = ConvertType(E->getType());
15622     Value *Result = Builder.CreateAnd(Tmp, llvm::ConstantInt::get(ArgType, 1));
15623     if (Result->getType() != ResultType)
15624       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
15625                                      "cast");
15626     return Result;
15627   }
15628   case PPC::BI__builtin_ppc_cmpb: {
15629     if (getTarget().getTriple().isPPC64()) {
15630       Function *F =
15631           CGM.getIntrinsic(Intrinsic::ppc_cmpb, {Int64Ty, Int64Ty, Int64Ty});
15632       return Builder.CreateCall(F, Ops, "cmpb");
15633     }
15634     // For 32 bit, emit the code as below:
15635     // %conv = trunc i64 %a to i32
15636     // %conv1 = trunc i64 %b to i32
15637     // %shr = lshr i64 %a, 32
15638     // %conv2 = trunc i64 %shr to i32
15639     // %shr3 = lshr i64 %b, 32
15640     // %conv4 = trunc i64 %shr3 to i32
15641     // %0 = tail call i32 @llvm.ppc.cmpb32(i32 %conv, i32 %conv1)
15642     // %conv5 = zext i32 %0 to i64
15643     // %1 = tail call i32 @llvm.ppc.cmpb32(i32 %conv2, i32 %conv4)
15644     // %conv614 = zext i32 %1 to i64
15645     // %shl = shl nuw i64 %conv614, 32
15646     // %or = or i64 %shl, %conv5
15647     // ret i64 %or
15648     Function *F =
15649         CGM.getIntrinsic(Intrinsic::ppc_cmpb, {Int32Ty, Int32Ty, Int32Ty});
15650     Value *ArgOneLo = Builder.CreateTrunc(Ops[0], Int32Ty);
15651     Value *ArgTwoLo = Builder.CreateTrunc(Ops[1], Int32Ty);
15652     Constant *ShiftAmt = ConstantInt::get(Int64Ty, 32);
15653     Value *ArgOneHi =
15654         Builder.CreateTrunc(Builder.CreateLShr(Ops[0], ShiftAmt), Int32Ty);
15655     Value *ArgTwoHi =
15656         Builder.CreateTrunc(Builder.CreateLShr(Ops[1], ShiftAmt), Int32Ty);
15657     Value *ResLo = Builder.CreateZExt(
15658         Builder.CreateCall(F, {ArgOneLo, ArgTwoLo}, "cmpb"), Int64Ty);
15659     Value *ResHiShift = Builder.CreateZExt(
15660         Builder.CreateCall(F, {ArgOneHi, ArgTwoHi}, "cmpb"), Int64Ty);
15661     Value *ResHi = Builder.CreateShl(ResHiShift, ShiftAmt);
15662     return Builder.CreateOr(ResLo, ResHi);
15663   }
15664   // Copy sign
15665   case PPC::BI__builtin_vsx_xvcpsgnsp:
15666   case PPC::BI__builtin_vsx_xvcpsgndp: {
15667     llvm::Type *ResultType = ConvertType(E->getType());
15668     Value *X = EmitScalarExpr(E->getArg(0));
15669     Value *Y = EmitScalarExpr(E->getArg(1));
15670     ID = Intrinsic::copysign;
15671     llvm::Function *F = CGM.getIntrinsic(ID, ResultType);
15672     return Builder.CreateCall(F, {X, Y});
15673   }
15674   // Rounding/truncation
15675   case PPC::BI__builtin_vsx_xvrspip:
15676   case PPC::BI__builtin_vsx_xvrdpip:
15677   case PPC::BI__builtin_vsx_xvrdpim:
15678   case PPC::BI__builtin_vsx_xvrspim:
15679   case PPC::BI__builtin_vsx_xvrdpi:
15680   case PPC::BI__builtin_vsx_xvrspi:
15681   case PPC::BI__builtin_vsx_xvrdpic:
15682   case PPC::BI__builtin_vsx_xvrspic:
15683   case PPC::BI__builtin_vsx_xvrdpiz:
15684   case PPC::BI__builtin_vsx_xvrspiz: {
15685     llvm::Type *ResultType = ConvertType(E->getType());
15686     Value *X = EmitScalarExpr(E->getArg(0));
15687     if (BuiltinID == PPC::BI__builtin_vsx_xvrdpim ||
15688         BuiltinID == PPC::BI__builtin_vsx_xvrspim)
15689       ID = Builder.getIsFPConstrained()
15690                ? Intrinsic::experimental_constrained_floor
15691                : Intrinsic::floor;
15692     else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpi ||
15693              BuiltinID == PPC::BI__builtin_vsx_xvrspi)
15694       ID = Builder.getIsFPConstrained()
15695                ? Intrinsic::experimental_constrained_round
15696                : Intrinsic::round;
15697     else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpic ||
15698              BuiltinID == PPC::BI__builtin_vsx_xvrspic)
15699       ID = Builder.getIsFPConstrained()
15700                ? Intrinsic::experimental_constrained_rint
15701                : Intrinsic::rint;
15702     else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpip ||
15703              BuiltinID == PPC::BI__builtin_vsx_xvrspip)
15704       ID = Builder.getIsFPConstrained()
15705                ? Intrinsic::experimental_constrained_ceil
15706                : Intrinsic::ceil;
15707     else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpiz ||
15708              BuiltinID == PPC::BI__builtin_vsx_xvrspiz)
15709       ID = Builder.getIsFPConstrained()
15710                ? Intrinsic::experimental_constrained_trunc
15711                : Intrinsic::trunc;
15712     llvm::Function *F = CGM.getIntrinsic(ID, ResultType);
15713     return Builder.getIsFPConstrained() ? Builder.CreateConstrainedFPCall(F, X)
15714                                         : Builder.CreateCall(F, X);
15715   }
15716 
15717   // Absolute value
15718   case PPC::BI__builtin_vsx_xvabsdp:
15719   case PPC::BI__builtin_vsx_xvabssp: {
15720     llvm::Type *ResultType = ConvertType(E->getType());
15721     Value *X = EmitScalarExpr(E->getArg(0));
15722     llvm::Function *F = CGM.getIntrinsic(Intrinsic::fabs, ResultType);
15723     return Builder.CreateCall(F, X);
15724   }
15725 
15726   // Fastmath by default
15727   case PPC::BI__builtin_ppc_recipdivf:
15728   case PPC::BI__builtin_ppc_recipdivd:
15729   case PPC::BI__builtin_ppc_rsqrtf:
15730   case PPC::BI__builtin_ppc_rsqrtd: {
15731     FastMathFlags FMF = Builder.getFastMathFlags();
15732     Builder.getFastMathFlags().setFast();
15733     llvm::Type *ResultType = ConvertType(E->getType());
15734     Value *X = EmitScalarExpr(E->getArg(0));
15735 
15736     if (BuiltinID == PPC::BI__builtin_ppc_recipdivf ||
15737         BuiltinID == PPC::BI__builtin_ppc_recipdivd) {
15738       Value *Y = EmitScalarExpr(E->getArg(1));
15739       Value *FDiv = Builder.CreateFDiv(X, Y, "recipdiv");
15740       Builder.getFastMathFlags() &= (FMF);
15741       return FDiv;
15742     }
15743     auto *One = ConstantFP::get(ResultType, 1.0);
15744     llvm::Function *F = CGM.getIntrinsic(Intrinsic::sqrt, ResultType);
15745     Value *FDiv = Builder.CreateFDiv(One, Builder.CreateCall(F, X), "rsqrt");
15746     Builder.getFastMathFlags() &= (FMF);
15747     return FDiv;
15748   }
15749   case PPC::BI__builtin_ppc_alignx: {
15750     ConstantInt *AlignmentCI = cast<ConstantInt>(Ops[0]);
15751     if (AlignmentCI->getValue().ugt(llvm::Value::MaximumAlignment))
15752       AlignmentCI = ConstantInt::get(AlignmentCI->getType(),
15753                                      llvm::Value::MaximumAlignment);
15754 
15755     emitAlignmentAssumption(Ops[1], E->getArg(1),
15756                             /*The expr loc is sufficient.*/ SourceLocation(),
15757                             AlignmentCI, nullptr);
15758     return Ops[1];
15759   }
15760   case PPC::BI__builtin_ppc_rdlam: {
15761     llvm::Type *Ty = Ops[0]->getType();
15762     Value *ShiftAmt = Builder.CreateIntCast(Ops[1], Ty, false);
15763     Function *F = CGM.getIntrinsic(Intrinsic::fshl, Ty);
15764     Value *Rotate = Builder.CreateCall(F, {Ops[0], Ops[0], ShiftAmt});
15765     return Builder.CreateAnd(Rotate, Ops[2]);
15766   }
15767   case PPC::BI__builtin_ppc_load2r: {
15768     Function *F = CGM.getIntrinsic(Intrinsic::ppc_load2r);
15769     Ops[0] = Builder.CreateBitCast(Ops[0], Int8PtrTy);
15770     Value *LoadIntrinsic = Builder.CreateCall(F, Ops);
15771     return Builder.CreateTrunc(LoadIntrinsic, Int16Ty);
15772   }
15773   // FMA variations
15774   case PPC::BI__builtin_vsx_xvmaddadp:
15775   case PPC::BI__builtin_vsx_xvmaddasp:
15776   case PPC::BI__builtin_vsx_xvnmaddadp:
15777   case PPC::BI__builtin_vsx_xvnmaddasp:
15778   case PPC::BI__builtin_vsx_xvmsubadp:
15779   case PPC::BI__builtin_vsx_xvmsubasp:
15780   case PPC::BI__builtin_vsx_xvnmsubadp:
15781   case PPC::BI__builtin_vsx_xvnmsubasp: {
15782     llvm::Type *ResultType = ConvertType(E->getType());
15783     Value *X = EmitScalarExpr(E->getArg(0));
15784     Value *Y = EmitScalarExpr(E->getArg(1));
15785     Value *Z = EmitScalarExpr(E->getArg(2));
15786     llvm::Function *F;
15787     if (Builder.getIsFPConstrained())
15788       F = CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, ResultType);
15789     else
15790       F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
15791     switch (BuiltinID) {
15792       case PPC::BI__builtin_vsx_xvmaddadp:
15793       case PPC::BI__builtin_vsx_xvmaddasp:
15794         if (Builder.getIsFPConstrained())
15795           return Builder.CreateConstrainedFPCall(F, {X, Y, Z});
15796         else
15797           return Builder.CreateCall(F, {X, Y, Z});
15798       case PPC::BI__builtin_vsx_xvnmaddadp:
15799       case PPC::BI__builtin_vsx_xvnmaddasp:
15800         if (Builder.getIsFPConstrained())
15801           return Builder.CreateFNeg(
15802               Builder.CreateConstrainedFPCall(F, {X, Y, Z}), "neg");
15803         else
15804           return Builder.CreateFNeg(Builder.CreateCall(F, {X, Y, Z}), "neg");
15805       case PPC::BI__builtin_vsx_xvmsubadp:
15806       case PPC::BI__builtin_vsx_xvmsubasp:
15807         if (Builder.getIsFPConstrained())
15808           return Builder.CreateConstrainedFPCall(
15809               F, {X, Y, Builder.CreateFNeg(Z, "neg")});
15810         else
15811           return Builder.CreateCall(F, {X, Y, Builder.CreateFNeg(Z, "neg")});
15812       case PPC::BI__builtin_vsx_xvnmsubadp:
15813       case PPC::BI__builtin_vsx_xvnmsubasp:
15814         if (Builder.getIsFPConstrained())
15815           return Builder.CreateFNeg(
15816               Builder.CreateConstrainedFPCall(
15817                   F, {X, Y, Builder.CreateFNeg(Z, "neg")}),
15818               "neg");
15819         else
15820           return Builder.CreateFNeg(
15821               Builder.CreateCall(F, {X, Y, Builder.CreateFNeg(Z, "neg")}),
15822               "neg");
15823     }
15824     llvm_unreachable("Unknown FMA operation");
15825     return nullptr; // Suppress no-return warning
15826   }
15827 
15828   case PPC::BI__builtin_vsx_insertword: {
15829     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_vsx_xxinsertw);
15830 
    // Third argument is a compile time constant int. It must be clamped to
    // the range [0, 12].
15833     ConstantInt *ArgCI = dyn_cast<ConstantInt>(Ops[2]);
15834     assert(ArgCI &&
15835            "Third arg to xxinsertw intrinsic must be constant integer");
15836     const int64_t MaxIndex = 12;
15837     int64_t Index = clamp(ArgCI->getSExtValue(), 0, MaxIndex);
15838 
15839     // The builtin semantics don't exactly match the xxinsertw instructions
15840     // semantics (which ppc_vsx_xxinsertw follows). The builtin extracts the
15841     // word from the first argument, and inserts it in the second argument. The
15842     // instruction extracts the word from its second input register and inserts
15843     // it into its first input register, so swap the first and second arguments.
15844     std::swap(Ops[0], Ops[1]);
15845 
15846     // Need to cast the second argument from a vector of unsigned int to a
15847     // vector of long long.
15848     Ops[1] =
15849         Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(Int64Ty, 2));
15850 
15851     if (getTarget().isLittleEndian()) {
15852       // Reverse the double words in the vector we will extract from.
15853       Ops[0] =
15854           Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int64Ty, 2));
15855       Ops[0] = Builder.CreateShuffleVector(Ops[0], Ops[0], ArrayRef<int>{1, 0});
15856 
15857       // Reverse the index.
15858       Index = MaxIndex - Index;
15859     }
15860 
15861     // Intrinsic expects the first arg to be a vector of int.
15862     Ops[0] =
15863         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int32Ty, 4));
15864     Ops[2] = ConstantInt::getSigned(Int32Ty, Index);
15865     return Builder.CreateCall(F, Ops);
15866   }
15867 
15868   case PPC::BI__builtin_vsx_extractuword: {
15869     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_vsx_xxextractuw);
15870 
15871     // Intrinsic expects the first argument to be a vector of doublewords.
15872     Ops[0] =
15873         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int64Ty, 2));
15874 
15875     // The second argument is a compile time constant int that needs to
15876     // be clamped to the range [0, 12].
15877     ConstantInt *ArgCI = dyn_cast<ConstantInt>(Ops[1]);
15878     assert(ArgCI &&
15879            "Second Arg to xxextractuw intrinsic must be a constant integer!");
15880     const int64_t MaxIndex = 12;
15881     int64_t Index = clamp(ArgCI->getSExtValue(), 0, MaxIndex);
15882 
15883     if (getTarget().isLittleEndian()) {
15884       // Reverse the index.
15885       Index = MaxIndex - Index;
15886       Ops[1] = ConstantInt::getSigned(Int32Ty, Index);
15887 
15888       // Emit the call, then reverse the double words of the results vector.
15889       Value *Call = Builder.CreateCall(F, Ops);
15890 
15891       Value *ShuffleCall =
15892           Builder.CreateShuffleVector(Call, Call, ArrayRef<int>{1, 0});
15893       return ShuffleCall;
15894     } else {
15895       Ops[1] = ConstantInt::getSigned(Int32Ty, Index);
15896       return Builder.CreateCall(F, Ops);
15897     }
15898   }
15899 
15900   case PPC::BI__builtin_vsx_xxpermdi: {
15901     ConstantInt *ArgCI = dyn_cast<ConstantInt>(Ops[2]);
15902     assert(ArgCI && "Third arg must be constant integer!");
15903 
15904     unsigned Index = ArgCI->getZExtValue();
15905     Ops[0] =
15906         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int64Ty, 2));
15907     Ops[1] =
15908         Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(Int64Ty, 2));
15909 
15910     // Account for endianness by treating this as just a shuffle. So we use the
15911     // same indices for both LE and BE in order to produce expected results in
15912     // both cases.
15913     int ElemIdx0 = (Index & 2) >> 1;
15914     int ElemIdx1 = 2 + (Index & 1);
15915 
15916     int ShuffleElts[2] = {ElemIdx0, ElemIdx1};
15917     Value *ShuffleCall =
15918         Builder.CreateShuffleVector(Ops[0], Ops[1], ShuffleElts);
15919     QualType BIRetType = E->getType();
15920     auto RetTy = ConvertType(BIRetType);
15921     return Builder.CreateBitCast(ShuffleCall, RetTy);
15922   }
15923 
15924   case PPC::BI__builtin_vsx_xxsldwi: {
15925     ConstantInt *ArgCI = dyn_cast<ConstantInt>(Ops[2]);
15926     assert(ArgCI && "Third argument must be a compile time constant");
15927     unsigned Index = ArgCI->getZExtValue() & 0x3;
15928     Ops[0] =
15929         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int32Ty, 4));
15930     Ops[1] =
15931         Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(Int32Ty, 4));
15932 
15933     // Create a shuffle mask
15934     int ElemIdx0;
15935     int ElemIdx1;
15936     int ElemIdx2;
15937     int ElemIdx3;
15938     if (getTarget().isLittleEndian()) {
15939       // Little endian element N comes from element 8+N-Index of the
15940       // concatenated wide vector (of course, using modulo arithmetic on
15941       // the total number of elements).
15942       ElemIdx0 = (8 - Index) % 8;
15943       ElemIdx1 = (9 - Index) % 8;
15944       ElemIdx2 = (10 - Index) % 8;
15945       ElemIdx3 = (11 - Index) % 8;
15946     } else {
15947       // Big endian ElemIdx<N> = Index + N
15948       ElemIdx0 = Index;
15949       ElemIdx1 = Index + 1;
15950       ElemIdx2 = Index + 2;
15951       ElemIdx3 = Index + 3;
15952     }
15953 
15954     int ShuffleElts[4] = {ElemIdx0, ElemIdx1, ElemIdx2, ElemIdx3};
15955     Value *ShuffleCall =
15956         Builder.CreateShuffleVector(Ops[0], Ops[1], ShuffleElts);
15957     QualType BIRetType = E->getType();
15958     auto RetTy = ConvertType(BIRetType);
15959     return Builder.CreateBitCast(ShuffleCall, RetTy);
15960   }
15961 
15962   case PPC::BI__builtin_pack_vector_int128: {
15963     bool isLittleEndian = getTarget().isLittleEndian();
15964     Value *UndefValue =
15965         llvm::UndefValue::get(llvm::FixedVectorType::get(Ops[0]->getType(), 2));
15966     Value *Res = Builder.CreateInsertElement(
15967         UndefValue, Ops[0], (uint64_t)(isLittleEndian ? 1 : 0));
15968     Res = Builder.CreateInsertElement(Res, Ops[1],
15969                                       (uint64_t)(isLittleEndian ? 0 : 1));
15970     return Builder.CreateBitCast(Res, ConvertType(E->getType()));
15971   }
15972 
15973   case PPC::BI__builtin_unpack_vector_int128: {
15974     ConstantInt *Index = cast<ConstantInt>(Ops[1]);
15975     Value *Unpacked = Builder.CreateBitCast(
15976         Ops[0], llvm::FixedVectorType::get(ConvertType(E->getType()), 2));
15977 
15978     if (getTarget().isLittleEndian())
15979       Index = ConstantInt::get(Index->getType(), 1 - Index->getZExtValue());
15980 
15981     return Builder.CreateExtractElement(Unpacked, Index);
15982   }
15983 
15984   case PPC::BI__builtin_ppc_sthcx: {
15985     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_sthcx);
15986     Ops[0] = Builder.CreateBitCast(Ops[0], Int8PtrTy);
15987     Ops[1] = Builder.CreateSExt(Ops[1], Int32Ty);
15988     return Builder.CreateCall(F, Ops);
15989   }
15990 
15991   // The PPC MMA builtins take a pointer to a __vector_quad as an argument.
15992   // Some of the MMA instructions accumulate their result into an existing
15993   // accumulator whereas the others generate a new accumulator. So we need to
15994   // use custom code generation to expand a builtin call with a pointer to a
15995   // load (if the corresponding instruction accumulates its result) followed by
15996   // the call to the intrinsic and a store of the result.
15997 #define CUSTOM_BUILTIN(Name, Intr, Types, Accumulate) \
15998   case PPC::BI__builtin_##Name:
15999 #include "clang/Basic/BuiltinsPPC.def"
16000   {
    // The first argument of these builtins is a pointer used to store their
16002     // result. However, the llvm intrinsics return their result in multiple
16003     // return values. So, here we emit code extracting these values from the
16004     // intrinsic results and storing them using that pointer.
16005     if (BuiltinID == PPC::BI__builtin_mma_disassemble_acc ||
16006         BuiltinID == PPC::BI__builtin_vsx_disassemble_pair ||
16007         BuiltinID == PPC::BI__builtin_mma_disassemble_pair) {
16008       unsigned NumVecs = 2;
16009       auto Intrinsic = Intrinsic::ppc_vsx_disassemble_pair;
16010       if (BuiltinID == PPC::BI__builtin_mma_disassemble_acc) {
16011         NumVecs = 4;
16012         Intrinsic = Intrinsic::ppc_mma_disassemble_acc;
16013       }
16014       llvm::Function *F = CGM.getIntrinsic(Intrinsic);
16015       Address Addr = EmitPointerWithAlignment(E->getArg(1));
16016       Value *Vec = Builder.CreateLoad(Addr);
16017       Value *Call = Builder.CreateCall(F, {Vec});
16018       llvm::Type *VTy = llvm::FixedVectorType::get(Int8Ty, 16);
16019       Value *Ptr = Builder.CreateBitCast(Ops[0], VTy->getPointerTo());
16020       for (unsigned i=0; i<NumVecs; i++) {
16021         Value *Vec = Builder.CreateExtractValue(Call, i);
16022         llvm::ConstantInt* Index = llvm::ConstantInt::get(IntTy, i);
16023         Value *GEP = Builder.CreateInBoundsGEP(VTy, Ptr, Index);
16024         Builder.CreateAlignedStore(Vec, GEP, MaybeAlign(16));
16025       }
16026       return Call;
16027     }
16028     if (BuiltinID == PPC::BI__builtin_vsx_build_pair ||
16029         BuiltinID == PPC::BI__builtin_mma_build_acc) {
16030       // Reverse the order of the operands for LE, so the
16031       // same builtin call can be used on both LE and BE
16032       // without the need for the programmer to swap operands.
16033       // The operands are reversed starting from the second argument,
16034       // the first operand is the pointer to the pair/accumulator
16035       // that is being built.
16036       if (getTarget().isLittleEndian())
16037         std::reverse(Ops.begin() + 1, Ops.end());
16038     }
16039     bool Accumulate;
16040     switch (BuiltinID) {
16041   #define CUSTOM_BUILTIN(Name, Intr, Types, Acc) \
16042     case PPC::BI__builtin_##Name: \
16043       ID = Intrinsic::ppc_##Intr; \
16044       Accumulate = Acc; \
16045       break;
16046   #include "clang/Basic/BuiltinsPPC.def"
16047     }
16048     if (BuiltinID == PPC::BI__builtin_vsx_lxvp ||
16049         BuiltinID == PPC::BI__builtin_vsx_stxvp ||
16050         BuiltinID == PPC::BI__builtin_mma_lxvp ||
16051         BuiltinID == PPC::BI__builtin_mma_stxvp) {
16052       if (BuiltinID == PPC::BI__builtin_vsx_lxvp ||
16053           BuiltinID == PPC::BI__builtin_mma_lxvp) {
16054         Ops[1] = Builder.CreateBitCast(Ops[1], Int8PtrTy);
16055         Ops[0] = Builder.CreateGEP(Int8Ty, Ops[1], Ops[0]);
16056       } else {
16057         Ops[2] = Builder.CreateBitCast(Ops[2], Int8PtrTy);
16058         Ops[1] = Builder.CreateGEP(Int8Ty, Ops[2], Ops[1]);
16059       }
16060       Ops.pop_back();
16061       llvm::Function *F = CGM.getIntrinsic(ID);
16062       return Builder.CreateCall(F, Ops, "");
16063     }
16064     SmallVector<Value*, 4> CallOps;
16065     if (Accumulate) {
16066       Address Addr = EmitPointerWithAlignment(E->getArg(0));
16067       Value *Acc = Builder.CreateLoad(Addr);
16068       CallOps.push_back(Acc);
16069     }
16070     for (unsigned i=1; i<Ops.size(); i++)
16071       CallOps.push_back(Ops[i]);
16072     llvm::Function *F = CGM.getIntrinsic(ID);
16073     Value *Call = Builder.CreateCall(F, CallOps);
16074     return Builder.CreateAlignedStore(Call, Ops[0], MaybeAlign(64));
16075   }
16076 
16077   case PPC::BI__builtin_ppc_compare_and_swap:
16078   case PPC::BI__builtin_ppc_compare_and_swaplp: {
16079     Address Addr = EmitPointerWithAlignment(E->getArg(0));
16080     Address OldValAddr = EmitPointerWithAlignment(E->getArg(1));
16081     Value *OldVal = Builder.CreateLoad(OldValAddr);
16082     QualType AtomicTy = E->getArg(0)->getType()->getPointeeType();
16083     LValue LV = MakeAddrLValue(Addr, AtomicTy);
16084     auto Pair = EmitAtomicCompareExchange(
16085         LV, RValue::get(OldVal), RValue::get(Ops[2]), E->getExprLoc(),
16086         llvm::AtomicOrdering::Monotonic, llvm::AtomicOrdering::Monotonic, true);
    // Unlike C11's atomic_compare_exchange, according to
16088     // https://www.ibm.com/docs/en/xl-c-and-cpp-aix/16.1?topic=functions-compare-swap-compare-swaplp
16089     // > In either case, the contents of the memory location specified by addr
16090     // > are copied into the memory location specified by old_val_addr.
    // However, it does not specify whether the store to OldValAddr is atomic
    // or which memory ordering it uses. For now, follow XL's codegen and
    // treat it as a normal store.
16094     Value *LoadedVal = Pair.first.getScalarVal();
16095     Builder.CreateStore(LoadedVal, OldValAddr);
16096     return Builder.CreateZExt(Pair.second, Builder.getInt32Ty());
16097   }
16098   case PPC::BI__builtin_ppc_fetch_and_add:
16099   case PPC::BI__builtin_ppc_fetch_and_addlp: {
16100     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E,
16101                                  llvm::AtomicOrdering::Monotonic);
16102   }
16103   case PPC::BI__builtin_ppc_fetch_and_and:
16104   case PPC::BI__builtin_ppc_fetch_and_andlp: {
16105     return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E,
16106                                  llvm::AtomicOrdering::Monotonic);
16107   }
16108 
16109   case PPC::BI__builtin_ppc_fetch_and_or:
16110   case PPC::BI__builtin_ppc_fetch_and_orlp: {
16111     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E,
16112                                  llvm::AtomicOrdering::Monotonic);
16113   }
16114   case PPC::BI__builtin_ppc_fetch_and_swap:
16115   case PPC::BI__builtin_ppc_fetch_and_swaplp: {
16116     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E,
16117                                  llvm::AtomicOrdering::Monotonic);
16118   }
16119   case PPC::BI__builtin_ppc_ldarx:
16120   case PPC::BI__builtin_ppc_lwarx:
16121   case PPC::BI__builtin_ppc_lharx:
16122   case PPC::BI__builtin_ppc_lbarx:
16123     return emitPPCLoadReserveIntrinsic(*this, BuiltinID, E);
16124   case PPC::BI__builtin_ppc_mfspr: {
16125     llvm::Type *RetType = CGM.getDataLayout().getTypeSizeInBits(VoidPtrTy) == 32
16126                               ? Int32Ty
16127                               : Int64Ty;
16128     Function *F = CGM.getIntrinsic(Intrinsic::ppc_mfspr, RetType);
16129     return Builder.CreateCall(F, Ops);
16130   }
16131   case PPC::BI__builtin_ppc_mtspr: {
16132     llvm::Type *RetType = CGM.getDataLayout().getTypeSizeInBits(VoidPtrTy) == 32
16133                               ? Int32Ty
16134                               : Int64Ty;
16135     Function *F = CGM.getIntrinsic(Intrinsic::ppc_mtspr, RetType);
16136     return Builder.CreateCall(F, Ops);
16137   }
16138   case PPC::BI__builtin_ppc_popcntb: {
16139     Value *ArgValue = EmitScalarExpr(E->getArg(0));
16140     llvm::Type *ArgType = ArgValue->getType();
16141     Function *F = CGM.getIntrinsic(Intrinsic::ppc_popcntb, {ArgType, ArgType});
16142     return Builder.CreateCall(F, Ops, "popcntb");
16143   }
16144   case PPC::BI__builtin_ppc_mtfsf: {
16145     // The builtin takes a uint32 that needs to be cast to an
16146     // f64 to be passed to the intrinsic.
16147     Value *Cast = Builder.CreateUIToFP(Ops[1], DoubleTy);
16148     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_mtfsf);
16149     return Builder.CreateCall(F, {Ops[0], Cast}, "");
16150   }
16151 
16152   case PPC::BI__builtin_ppc_swdiv_nochk:
16153   case PPC::BI__builtin_ppc_swdivs_nochk: {
16154     FastMathFlags FMF = Builder.getFastMathFlags();
16155     Builder.getFastMathFlags().setFast();
16156     Value *FDiv = Builder.CreateFDiv(Ops[0], Ops[1], "swdiv_nochk");
16157     Builder.getFastMathFlags() &= (FMF);
16158     return FDiv;
16159   }
16160   case PPC::BI__builtin_ppc_fric:
16161     return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
16162                            *this, E, Intrinsic::rint,
16163                            Intrinsic::experimental_constrained_rint))
16164         .getScalarVal();
16165   case PPC::BI__builtin_ppc_frim:
16166   case PPC::BI__builtin_ppc_frims:
16167     return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
16168                            *this, E, Intrinsic::floor,
16169                            Intrinsic::experimental_constrained_floor))
16170         .getScalarVal();
16171   case PPC::BI__builtin_ppc_frin:
16172   case PPC::BI__builtin_ppc_frins:
16173     return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
16174                            *this, E, Intrinsic::round,
16175                            Intrinsic::experimental_constrained_round))
16176         .getScalarVal();
16177   case PPC::BI__builtin_ppc_frip:
16178   case PPC::BI__builtin_ppc_frips:
16179     return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
16180                            *this, E, Intrinsic::ceil,
16181                            Intrinsic::experimental_constrained_ceil))
16182         .getScalarVal();
16183   case PPC::BI__builtin_ppc_friz:
16184   case PPC::BI__builtin_ppc_frizs:
16185     return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
16186                            *this, E, Intrinsic::trunc,
16187                            Intrinsic::experimental_constrained_trunc))
16188         .getScalarVal();
16189   case PPC::BI__builtin_ppc_fsqrt:
16190   case PPC::BI__builtin_ppc_fsqrts:
16191     return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
16192                            *this, E, Intrinsic::sqrt,
16193                            Intrinsic::experimental_constrained_sqrt))
16194         .getScalarVal();
16195   case PPC::BI__builtin_ppc_test_data_class: {
16196     llvm::Type *ArgType = EmitScalarExpr(E->getArg(0))->getType();
16197     unsigned IntrinsicID;
16198     if (ArgType->isDoubleTy())
16199       IntrinsicID = Intrinsic::ppc_test_data_class_d;
16200     else if (ArgType->isFloatTy())
16201       IntrinsicID = Intrinsic::ppc_test_data_class_f;
16202     else
16203       llvm_unreachable("Invalid Argument Type");
16204     return Builder.CreateCall(CGM.getIntrinsic(IntrinsicID), Ops,
16205                               "test_data_class");
16206   }
16207   case PPC::BI__builtin_ppc_swdiv:
16208   case PPC::BI__builtin_ppc_swdivs:
16209     return Builder.CreateFDiv(Ops[0], Ops[1], "swdiv");
16210   }
16211 }
16212 
16213 namespace {
16214 // If \p E is not null pointer, insert address space cast to match return
16215 // type of \p E if necessary.
16216 Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF,
16217                              const CallExpr *E = nullptr) {
16218   auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr);
16219   auto *Call = CGF.Builder.CreateCall(F);
16220   Call->addRetAttr(
16221       Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
16222   Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4)));
16223   if (!E)
16224     return Call;
16225   QualType BuiltinRetType = E->getType();
16226   auto *RetTy = cast<llvm::PointerType>(CGF.ConvertType(BuiltinRetType));
16227   if (RetTy == Call->getType())
16228     return Call;
16229   return CGF.Builder.CreateAddrSpaceCast(Call, RetTy);
16230 }
16231 
16232 // \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
16233 Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
16234   const unsigned XOffset = 4;
16235   auto *DP = EmitAMDGPUDispatchPtr(CGF);
16236   // Indexing the HSA kernel_dispatch_packet struct.
16237   auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 2);
16238   auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
16239   auto *DstTy =
16240       CGF.Int16Ty->getPointerTo(GEP->getType()->getPointerAddressSpace());
16241   auto *Cast = CGF.Builder.CreateBitCast(GEP, DstTy);
16242   auto *LD = CGF.Builder.CreateLoad(
16243       Address(Cast, CGF.Int16Ty, CharUnits::fromQuantity(2)));
16244   llvm::MDBuilder MDHelper(CGF.getLLVMContext());
16245   llvm::MDNode *RNode = MDHelper.createRange(APInt(16, 1),
16246       APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
16247   LD->setMetadata(llvm::LLVMContext::MD_range, RNode);
16248   LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
16249       llvm::MDNode::get(CGF.getLLVMContext(), None));
16250   return LD;
16251 }
16252 
16253 // \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
16254 Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) {
16255   const unsigned XOffset = 12;
16256   auto *DP = EmitAMDGPUDispatchPtr(CGF);
16257   // Indexing the HSA kernel_dispatch_packet struct.
16258   auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 4);
16259   auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
16260   auto *DstTy =
16261       CGF.Int32Ty->getPointerTo(GEP->getType()->getPointerAddressSpace());
16262   auto *Cast = CGF.Builder.CreateBitCast(GEP, DstTy);
16263   auto *LD = CGF.Builder.CreateLoad(
16264       Address(Cast, CGF.Int32Ty, CharUnits::fromQuantity(4)));
16265   LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
16266                   llvm::MDNode::get(CGF.getLLVMContext(), None));
16267   return LD;
16268 }
16269 } // namespace
16270 
16271 // For processing memory ordering and memory scope arguments of various
16272 // amdgcn builtins.
16273 // \p Order takes a C++11 comptabile memory-ordering specifier and converts
16274 // it into LLVM's memory ordering specifier using atomic C ABI, and writes
16275 // to \p AO. \p Scope takes a const char * and converts it into AMDGCN
16276 // specific SyncScopeID and writes it to \p SSID.
16277 bool CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
16278                                               llvm::AtomicOrdering &AO,
16279                                               llvm::SyncScope::ID &SSID) {
16280   if (isa<llvm::ConstantInt>(Order)) {
16281     int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
16282 
16283     // Map C11/C++11 memory ordering to LLVM memory ordering
16284     assert(llvm::isValidAtomicOrderingCABI(ord));
16285     switch (static_cast<llvm::AtomicOrderingCABI>(ord)) {
16286     case llvm::AtomicOrderingCABI::acquire:
16287     case llvm::AtomicOrderingCABI::consume:
16288       AO = llvm::AtomicOrdering::Acquire;
16289       break;
16290     case llvm::AtomicOrderingCABI::release:
16291       AO = llvm::AtomicOrdering::Release;
16292       break;
16293     case llvm::AtomicOrderingCABI::acq_rel:
16294       AO = llvm::AtomicOrdering::AcquireRelease;
16295       break;
16296     case llvm::AtomicOrderingCABI::seq_cst:
16297       AO = llvm::AtomicOrdering::SequentiallyConsistent;
16298       break;
16299     case llvm::AtomicOrderingCABI::relaxed:
16300       AO = llvm::AtomicOrdering::Monotonic;
16301       break;
16302     }
16303 
16304     StringRef scp;
16305     llvm::getConstantStringInfo(Scope, scp);
16306     SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
16307     return true;
16308   }
16309   return false;
16310 }
16311 
16312 Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
16313                                               const CallExpr *E) {
16314   llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
16315   llvm::SyncScope::ID SSID;
16316   switch (BuiltinID) {
16317   case AMDGPU::BI__builtin_amdgcn_div_scale:
16318   case AMDGPU::BI__builtin_amdgcn_div_scalef: {
16319     // Translate from the intrinsics's struct return to the builtin's out
16320     // argument.
16321 
16322     Address FlagOutPtr = EmitPointerWithAlignment(E->getArg(3));
16323 
16324     llvm::Value *X = EmitScalarExpr(E->getArg(0));
16325     llvm::Value *Y = EmitScalarExpr(E->getArg(1));
16326     llvm::Value *Z = EmitScalarExpr(E->getArg(2));
16327 
16328     llvm::Function *Callee = CGM.getIntrinsic(Intrinsic::amdgcn_div_scale,
16329                                            X->getType());
16330 
16331     llvm::Value *Tmp = Builder.CreateCall(Callee, {X, Y, Z});
16332 
16333     llvm::Value *Result = Builder.CreateExtractValue(Tmp, 0);
16334     llvm::Value *Flag = Builder.CreateExtractValue(Tmp, 1);
16335 
16336     llvm::Type *RealFlagType = FlagOutPtr.getElementType();
16337 
16338     llvm::Value *FlagExt = Builder.CreateZExt(Flag, RealFlagType);
16339     Builder.CreateStore(FlagExt, FlagOutPtr);
16340     return Result;
16341   }
16342   case AMDGPU::BI__builtin_amdgcn_div_fmas:
16343   case AMDGPU::BI__builtin_amdgcn_div_fmasf: {
16344     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
16345     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
16346     llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
16347     llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
16348 
16349     llvm::Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_div_fmas,
16350                                       Src0->getType());
16351     llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Src3);
16352     return Builder.CreateCall(F, {Src0, Src1, Src2, Src3ToBool});
16353   }
16354 
16355   case AMDGPU::BI__builtin_amdgcn_ds_swizzle:
16356     return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_ds_swizzle);
16357   case AMDGPU::BI__builtin_amdgcn_mov_dpp8:
16358     return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_mov_dpp8);
16359   case AMDGPU::BI__builtin_amdgcn_mov_dpp:
16360   case AMDGPU::BI__builtin_amdgcn_update_dpp: {
16361     llvm::SmallVector<llvm::Value *, 6> Args;
16362     for (unsigned I = 0; I != E->getNumArgs(); ++I)
16363       Args.push_back(EmitScalarExpr(E->getArg(I)));
16364     assert(Args.size() == 5 || Args.size() == 6);
16365     if (Args.size() == 5)
16366       Args.insert(Args.begin(), llvm::UndefValue::get(Args[0]->getType()));
16367     Function *F =
16368         CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
16369     return Builder.CreateCall(F, Args);
16370   }
16371   case AMDGPU::BI__builtin_amdgcn_div_fixup:
16372   case AMDGPU::BI__builtin_amdgcn_div_fixupf:
16373   case AMDGPU::BI__builtin_amdgcn_div_fixuph:
16374     return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_div_fixup);
16375   case AMDGPU::BI__builtin_amdgcn_trig_preop:
16376   case AMDGPU::BI__builtin_amdgcn_trig_preopf:
16377     return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_trig_preop);
16378   case AMDGPU::BI__builtin_amdgcn_rcp:
16379   case AMDGPU::BI__builtin_amdgcn_rcpf:
16380   case AMDGPU::BI__builtin_amdgcn_rcph:
16381     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rcp);
16382   case AMDGPU::BI__builtin_amdgcn_sqrt:
16383   case AMDGPU::BI__builtin_amdgcn_sqrtf:
16384   case AMDGPU::BI__builtin_amdgcn_sqrth:
16385     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_sqrt);
16386   case AMDGPU::BI__builtin_amdgcn_rsq:
16387   case AMDGPU::BI__builtin_amdgcn_rsqf:
16388   case AMDGPU::BI__builtin_amdgcn_rsqh:
16389     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rsq);
16390   case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
16391   case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
16392     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rsq_clamp);
16393   case AMDGPU::BI__builtin_amdgcn_sinf:
16394   case AMDGPU::BI__builtin_amdgcn_sinh:
16395     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_sin);
16396   case AMDGPU::BI__builtin_amdgcn_cosf:
16397   case AMDGPU::BI__builtin_amdgcn_cosh:
16398     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_cos);
16399   case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
16400     return EmitAMDGPUDispatchPtr(*this, E);
16401   case AMDGPU::BI__builtin_amdgcn_log_clampf:
16402     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log_clamp);
16403   case AMDGPU::BI__builtin_amdgcn_ldexp:
16404   case AMDGPU::BI__builtin_amdgcn_ldexpf:
16405   case AMDGPU::BI__builtin_amdgcn_ldexph:
16406     return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_ldexp);
16407   case AMDGPU::BI__builtin_amdgcn_frexp_mant:
16408   case AMDGPU::BI__builtin_amdgcn_frexp_mantf:
16409   case AMDGPU::BI__builtin_amdgcn_frexp_manth:
16410     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_frexp_mant);
16411   case AMDGPU::BI__builtin_amdgcn_frexp_exp:
16412   case AMDGPU::BI__builtin_amdgcn_frexp_expf: {
16413     Value *Src0 = EmitScalarExpr(E->getArg(0));
16414     Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
16415                                 { Builder.getInt32Ty(), Src0->getType() });
16416     return Builder.CreateCall(F, Src0);
16417   }
16418   case AMDGPU::BI__builtin_amdgcn_frexp_exph: {
16419     Value *Src0 = EmitScalarExpr(E->getArg(0));
16420     Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
16421                                 { Builder.getInt16Ty(), Src0->getType() });
16422     return Builder.CreateCall(F, Src0);
16423   }
16424   case AMDGPU::BI__builtin_amdgcn_fract:
16425   case AMDGPU::BI__builtin_amdgcn_fractf:
16426   case AMDGPU::BI__builtin_amdgcn_fracth:
16427     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_fract);
16428   case AMDGPU::BI__builtin_amdgcn_lerp:
16429     return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_lerp);
16430   case AMDGPU::BI__builtin_amdgcn_ubfe:
16431     return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_ubfe);
16432   case AMDGPU::BI__builtin_amdgcn_sbfe:
16433     return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_sbfe);
16434   case AMDGPU::BI__builtin_amdgcn_uicmp:
16435   case AMDGPU::BI__builtin_amdgcn_uicmpl:
16436   case AMDGPU::BI__builtin_amdgcn_sicmp:
16437   case AMDGPU::BI__builtin_amdgcn_sicmpl: {
16438     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
16439     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
16440     llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
16441 
16442     // FIXME-GFX10: How should 32 bit mask be handled?
16443     Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_icmp,
16444       { Builder.getInt64Ty(), Src0->getType() });
16445     return Builder.CreateCall(F, { Src0, Src1, Src2 });
16446   }
16447   case AMDGPU::BI__builtin_amdgcn_fcmp:
16448   case AMDGPU::BI__builtin_amdgcn_fcmpf: {
16449     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
16450     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
16451     llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
16452 
16453     // FIXME-GFX10: How should 32 bit mask be handled?
16454     Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_fcmp,
16455       { Builder.getInt64Ty(), Src0->getType() });
16456     return Builder.CreateCall(F, { Src0, Src1, Src2 });
16457   }
16458   case AMDGPU::BI__builtin_amdgcn_class:
16459   case AMDGPU::BI__builtin_amdgcn_classf:
16460   case AMDGPU::BI__builtin_amdgcn_classh:
16461     return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_class);
16462   case AMDGPU::BI__builtin_amdgcn_fmed3f:
16463   case AMDGPU::BI__builtin_amdgcn_fmed3h:
16464     return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_fmed3);
16465   case AMDGPU::BI__builtin_amdgcn_ds_append:
16466   case AMDGPU::BI__builtin_amdgcn_ds_consume: {
16467     Intrinsic::ID Intrin = BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_append ?
16468       Intrinsic::amdgcn_ds_append : Intrinsic::amdgcn_ds_consume;
16469     Value *Src0 = EmitScalarExpr(E->getArg(0));
16470     Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
16471     return Builder.CreateCall(F, { Src0, Builder.getFalse() });
16472   }
16473   case AMDGPU::BI__builtin_amdgcn_ds_faddf:
16474   case AMDGPU::BI__builtin_amdgcn_ds_fminf:
16475   case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
16476     Intrinsic::ID Intrin;
16477     switch (BuiltinID) {
16478     case AMDGPU::BI__builtin_amdgcn_ds_faddf:
16479       Intrin = Intrinsic::amdgcn_ds_fadd;
16480       break;
16481     case AMDGPU::BI__builtin_amdgcn_ds_fminf:
16482       Intrin = Intrinsic::amdgcn_ds_fmin;
16483       break;
16484     case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
16485       Intrin = Intrinsic::amdgcn_ds_fmax;
16486       break;
16487     }
16488     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
16489     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
16490     llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
16491     llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
16492     llvm::Value *Src4 = EmitScalarExpr(E->getArg(4));
16493     llvm::Function *F = CGM.getIntrinsic(Intrin, { Src1->getType() });
16494     llvm::FunctionType *FTy = F->getFunctionType();
16495     llvm::Type *PTy = FTy->getParamType(0);
16496     Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
16497     return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
16498   }
16499   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
16500   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
16501   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
16502   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
16503   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
16504   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
16505   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
16506   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
16507     Intrinsic::ID IID;
16508     llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
16509     switch (BuiltinID) {
16510     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
16511       ArgTy = llvm::Type::getFloatTy(getLLVMContext());
16512       IID = Intrinsic::amdgcn_global_atomic_fadd;
16513       break;
16514     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
16515       ArgTy = llvm::FixedVectorType::get(
16516           llvm::Type::getHalfTy(getLLVMContext()), 2);
16517       IID = Intrinsic::amdgcn_global_atomic_fadd;
16518       break;
16519     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
16520       IID = Intrinsic::amdgcn_global_atomic_fadd;
16521       break;
16522     case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
16523       IID = Intrinsic::amdgcn_global_atomic_fmin;
16524       break;
16525     case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
16526       IID = Intrinsic::amdgcn_global_atomic_fmax;
16527       break;
16528     case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
16529       IID = Intrinsic::amdgcn_flat_atomic_fadd;
16530       break;
16531     case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
16532       IID = Intrinsic::amdgcn_flat_atomic_fmin;
16533       break;
16534     case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
16535       IID = Intrinsic::amdgcn_flat_atomic_fmax;
16536       break;
16537     }
16538     llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
16539     llvm::Value *Val = EmitScalarExpr(E->getArg(1));
16540     llvm::Function *F =
16541         CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
16542     return Builder.CreateCall(F, {Addr, Val});
16543   }
16544   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
16545   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32: {
16546     Intrinsic::ID IID;
16547     llvm::Type *ArgTy;
16548     switch (BuiltinID) {
16549     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
16550       ArgTy = llvm::Type::getFloatTy(getLLVMContext());
16551       IID = Intrinsic::amdgcn_ds_fadd;
16552       break;
16553     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
16554       ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
16555       IID = Intrinsic::amdgcn_ds_fadd;
16556       break;
16557     }
16558     llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
16559     llvm::Value *Val = EmitScalarExpr(E->getArg(1));
16560     llvm::Constant *ZeroI32 = llvm::ConstantInt::getIntegerValue(
16561         llvm::Type::getInt32Ty(getLLVMContext()), APInt(32, 0, true));
16562     llvm::Constant *ZeroI1 = llvm::ConstantInt::getIntegerValue(
16563         llvm::Type::getInt1Ty(getLLVMContext()), APInt(1, 0));
16564     llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
16565     return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
16566   }
16567   case AMDGPU::BI__builtin_amdgcn_read_exec: {
16568     CallInst *CI = cast<CallInst>(
16569       EmitSpecialRegisterBuiltin(*this, E, Int64Ty, Int64Ty, NormalRead, "exec"));
16570     CI->setConvergent();
16571     return CI;
16572   }
16573   case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
16574   case AMDGPU::BI__builtin_amdgcn_read_exec_hi: {
16575     StringRef RegName = BuiltinID == AMDGPU::BI__builtin_amdgcn_read_exec_lo ?
16576       "exec_lo" : "exec_hi";
16577     CallInst *CI = cast<CallInst>(
16578       EmitSpecialRegisterBuiltin(*this, E, Int32Ty, Int32Ty, NormalRead, RegName));
16579     CI->setConvergent();
16580     return CI;
16581   }
16582   case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray:
16583   case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h:
16584   case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l:
16585   case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_lh: {
16586     llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
16587     llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
16588     llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(2));
16589     llvm::Value *RayDir = EmitScalarExpr(E->getArg(3));
16590     llvm::Value *RayInverseDir = EmitScalarExpr(E->getArg(4));
16591     llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(5));
16592 
16593     // The builtins take these arguments as vec4 where the last element is
16594     // ignored. The intrinsic takes them as vec3.
16595     RayOrigin = Builder.CreateShuffleVector(RayOrigin, RayOrigin,
16596                                             ArrayRef<int>{0, 1, 2});
16597     RayDir =
16598         Builder.CreateShuffleVector(RayDir, RayDir, ArrayRef<int>{0, 1, 2});
16599     RayInverseDir = Builder.CreateShuffleVector(RayInverseDir, RayInverseDir,
16600                                                 ArrayRef<int>{0, 1, 2});
16601 
16602     Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_image_bvh_intersect_ray,
16603                                    {NodePtr->getType(), RayDir->getType()});
16604     return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir,
16605                                   RayInverseDir, TextureDescr});
16606   }
16607 
16608   // amdgcn workitem
16609   case AMDGPU::BI__builtin_amdgcn_workitem_id_x:
16610     return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_x, 0, 1024);
16611   case AMDGPU::BI__builtin_amdgcn_workitem_id_y:
16612     return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_y, 0, 1024);
16613   case AMDGPU::BI__builtin_amdgcn_workitem_id_z:
16614     return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_z, 0, 1024);
16615 
16616   // amdgcn workgroup size
16617   case AMDGPU::BI__builtin_amdgcn_workgroup_size_x:
16618     return EmitAMDGPUWorkGroupSize(*this, 0);
16619   case AMDGPU::BI__builtin_amdgcn_workgroup_size_y:
16620     return EmitAMDGPUWorkGroupSize(*this, 1);
16621   case AMDGPU::BI__builtin_amdgcn_workgroup_size_z:
16622     return EmitAMDGPUWorkGroupSize(*this, 2);
16623 
16624   // amdgcn grid size
16625   case AMDGPU::BI__builtin_amdgcn_grid_size_x:
16626     return EmitAMDGPUGridSize(*this, 0);
16627   case AMDGPU::BI__builtin_amdgcn_grid_size_y:
16628     return EmitAMDGPUGridSize(*this, 1);
16629   case AMDGPU::BI__builtin_amdgcn_grid_size_z:
16630     return EmitAMDGPUGridSize(*this, 2);
16631 
16632   // r600 intrinsics
16633   case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
16634   case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
16635     return emitUnaryBuiltin(*this, E, Intrinsic::r600_recipsqrt_ieee);
16636   case AMDGPU::BI__builtin_r600_read_tidig_x:
16637     return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_x, 0, 1024);
16638   case AMDGPU::BI__builtin_r600_read_tidig_y:
16639     return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_y, 0, 1024);
16640   case AMDGPU::BI__builtin_r600_read_tidig_z:
16641     return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_z, 0, 1024);
16642   case AMDGPU::BI__builtin_amdgcn_alignbit: {
16643     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
16644     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
16645     llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
16646     Function *F = CGM.getIntrinsic(Intrinsic::fshr, Src0->getType());
16647     return Builder.CreateCall(F, { Src0, Src1, Src2 });
16648   }
16649 
16650   case AMDGPU::BI__builtin_amdgcn_fence: {
16651     if (ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(0)),
16652                                 EmitScalarExpr(E->getArg(1)), AO, SSID))
16653       return Builder.CreateFence(AO, SSID);
16654     LLVM_FALLTHROUGH;
16655   }
16656   case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
16657   case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
16658   case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
16659   case AMDGPU::BI__builtin_amdgcn_atomic_dec64: {
16660     unsigned BuiltinAtomicOp;
16661     llvm::Type *ResultType = ConvertType(E->getType());
16662 
16663     switch (BuiltinID) {
16664     case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
16665     case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
16666       BuiltinAtomicOp = Intrinsic::amdgcn_atomic_inc;
16667       break;
16668     case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
16669     case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
16670       BuiltinAtomicOp = Intrinsic::amdgcn_atomic_dec;
16671       break;
16672     }
16673 
16674     Value *Ptr = EmitScalarExpr(E->getArg(0));
16675     Value *Val = EmitScalarExpr(E->getArg(1));
16676 
16677     llvm::Function *F =
16678         CGM.getIntrinsic(BuiltinAtomicOp, {ResultType, Ptr->getType()});
16679 
16680     if (ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
16681                                 EmitScalarExpr(E->getArg(3)), AO, SSID)) {
16682 
16683       // llvm.amdgcn.atomic.inc and llvm.amdgcn.atomic.dec expects ordering and
16684       // scope as unsigned values
16685       Value *MemOrder = Builder.getInt32(static_cast<int>(AO));
16686       Value *MemScope = Builder.getInt32(static_cast<int>(SSID));
16687 
16688       QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
16689       bool Volatile =
16690           PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
16691       Value *IsVolatile = Builder.getInt1(static_cast<bool>(Volatile));
16692 
16693       return Builder.CreateCall(F, {Ptr, Val, MemOrder, MemScope, IsVolatile});
16694     }
16695     LLVM_FALLTHROUGH;
16696   }
16697   default:
16698     return nullptr;
16699   }
16700 }
16701 
16702 /// Handle a SystemZ function in which the final argument is a pointer
16703 /// to an int that receives the post-instruction CC value.  At the LLVM level
16704 /// this is represented as a function that returns a {result, cc} pair.
16705 static Value *EmitSystemZIntrinsicWithCC(CodeGenFunction &CGF,
16706                                          unsigned IntrinsicID,
16707                                          const CallExpr *E) {
16708   unsigned NumArgs = E->getNumArgs() - 1;
16709   SmallVector<Value *, 8> Args(NumArgs);
16710   for (unsigned I = 0; I < NumArgs; ++I)
16711     Args[I] = CGF.EmitScalarExpr(E->getArg(I));
16712   Address CCPtr = CGF.EmitPointerWithAlignment(E->getArg(NumArgs));
16713   Function *F = CGF.CGM.getIntrinsic(IntrinsicID);
16714   Value *Call = CGF.Builder.CreateCall(F, Args);
16715   Value *CC = CGF.Builder.CreateExtractValue(Call, 1);
16716   CGF.Builder.CreateStore(CC, CCPtr);
16717   return CGF.Builder.CreateExtractValue(Call, 0);
16718 }
16719 
16720 Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID,
16721                                                const CallExpr *E) {
16722   switch (BuiltinID) {
16723   case SystemZ::BI__builtin_tbegin: {
16724     Value *TDB = EmitScalarExpr(E->getArg(0));
16725     Value *Control = llvm::ConstantInt::get(Int32Ty, 0xff0c);
16726     Function *F = CGM.getIntrinsic(Intrinsic::s390_tbegin);
16727     return Builder.CreateCall(F, {TDB, Control});
16728   }
16729   case SystemZ::BI__builtin_tbegin_nofloat: {
16730     Value *TDB = EmitScalarExpr(E->getArg(0));
16731     Value *Control = llvm::ConstantInt::get(Int32Ty, 0xff0c);
16732     Function *F = CGM.getIntrinsic(Intrinsic::s390_tbegin_nofloat);
16733     return Builder.CreateCall(F, {TDB, Control});
16734   }
16735   case SystemZ::BI__builtin_tbeginc: {
16736     Value *TDB = llvm::ConstantPointerNull::get(Int8PtrTy);
16737     Value *Control = llvm::ConstantInt::get(Int32Ty, 0xff08);
16738     Function *F = CGM.getIntrinsic(Intrinsic::s390_tbeginc);
16739     return Builder.CreateCall(F, {TDB, Control});
16740   }
16741   case SystemZ::BI__builtin_tabort: {
16742     Value *Data = EmitScalarExpr(E->getArg(0));
16743     Function *F = CGM.getIntrinsic(Intrinsic::s390_tabort);
16744     return Builder.CreateCall(F, Builder.CreateSExt(Data, Int64Ty, "tabort"));
16745   }
16746   case SystemZ::BI__builtin_non_tx_store: {
16747     Value *Address = EmitScalarExpr(E->getArg(0));
16748     Value *Data = EmitScalarExpr(E->getArg(1));
16749     Function *F = CGM.getIntrinsic(Intrinsic::s390_ntstg);
16750     return Builder.CreateCall(F, {Data, Address});
16751   }
16752 
16753   // Vector builtins.  Note that most vector builtins are mapped automatically
16754   // to target-specific LLVM intrinsics.  The ones handled specially here can
16755   // be represented via standard LLVM IR, which is preferable to enable common
16756   // LLVM optimizations.
16757 
16758   case SystemZ::BI__builtin_s390_vpopctb:
16759   case SystemZ::BI__builtin_s390_vpopcth:
16760   case SystemZ::BI__builtin_s390_vpopctf:
16761   case SystemZ::BI__builtin_s390_vpopctg: {
16762     llvm::Type *ResultType = ConvertType(E->getType());
16763     Value *X = EmitScalarExpr(E->getArg(0));
16764     Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ResultType);
16765     return Builder.CreateCall(F, X);
16766   }
16767 
16768   case SystemZ::BI__builtin_s390_vclzb:
16769   case SystemZ::BI__builtin_s390_vclzh:
16770   case SystemZ::BI__builtin_s390_vclzf:
16771   case SystemZ::BI__builtin_s390_vclzg: {
16772     llvm::Type *ResultType = ConvertType(E->getType());
16773     Value *X = EmitScalarExpr(E->getArg(0));
16774     Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
16775     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ResultType);
16776     return Builder.CreateCall(F, {X, Undef});
16777   }
16778 
16779   case SystemZ::BI__builtin_s390_vctzb:
16780   case SystemZ::BI__builtin_s390_vctzh:
16781   case SystemZ::BI__builtin_s390_vctzf:
16782   case SystemZ::BI__builtin_s390_vctzg: {
16783     llvm::Type *ResultType = ConvertType(E->getType());
16784     Value *X = EmitScalarExpr(E->getArg(0));
16785     Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
16786     Function *F = CGM.getIntrinsic(Intrinsic::cttz, ResultType);
16787     return Builder.CreateCall(F, {X, Undef});
16788   }
16789 
16790   case SystemZ::BI__builtin_s390_vfsqsb:
16791   case SystemZ::BI__builtin_s390_vfsqdb: {
16792     llvm::Type *ResultType = ConvertType(E->getType());
16793     Value *X = EmitScalarExpr(E->getArg(0));
16794     if (Builder.getIsFPConstrained()) {
16795       Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt, ResultType);
16796       return Builder.CreateConstrainedFPCall(F, { X });
16797     } else {
16798       Function *F = CGM.getIntrinsic(Intrinsic::sqrt, ResultType);
16799       return Builder.CreateCall(F, X);
16800     }
16801   }
16802   case SystemZ::BI__builtin_s390_vfmasb:
16803   case SystemZ::BI__builtin_s390_vfmadb: {
16804     llvm::Type *ResultType = ConvertType(E->getType());
16805     Value *X = EmitScalarExpr(E->getArg(0));
16806     Value *Y = EmitScalarExpr(E->getArg(1));
16807     Value *Z = EmitScalarExpr(E->getArg(2));
16808     if (Builder.getIsFPConstrained()) {
16809       Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, ResultType);
16810       return Builder.CreateConstrainedFPCall(F, {X, Y, Z});
16811     } else {
16812       Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
16813       return Builder.CreateCall(F, {X, Y, Z});
16814     }
16815   }
16816   case SystemZ::BI__builtin_s390_vfmssb:
16817   case SystemZ::BI__builtin_s390_vfmsdb: {
16818     llvm::Type *ResultType = ConvertType(E->getType());
16819     Value *X = EmitScalarExpr(E->getArg(0));
16820     Value *Y = EmitScalarExpr(E->getArg(1));
16821     Value *Z = EmitScalarExpr(E->getArg(2));
16822     if (Builder.getIsFPConstrained()) {
16823       Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, ResultType);
16824       return Builder.CreateConstrainedFPCall(F, {X, Y, Builder.CreateFNeg(Z, "neg")});
16825     } else {
16826       Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
16827       return Builder.CreateCall(F, {X, Y, Builder.CreateFNeg(Z, "neg")});
16828     }
16829   }
16830   case SystemZ::BI__builtin_s390_vfnmasb:
16831   case SystemZ::BI__builtin_s390_vfnmadb: {
16832     llvm::Type *ResultType = ConvertType(E->getType());
16833     Value *X = EmitScalarExpr(E->getArg(0));
16834     Value *Y = EmitScalarExpr(E->getArg(1));
16835     Value *Z = EmitScalarExpr(E->getArg(2));
16836     if (Builder.getIsFPConstrained()) {
16837       Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, ResultType);
16838       return Builder.CreateFNeg(Builder.CreateConstrainedFPCall(F, {X, Y,  Z}), "neg");
16839     } else {
16840       Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
16841       return Builder.CreateFNeg(Builder.CreateCall(F, {X, Y, Z}), "neg");
16842     }
16843   }
16844   case SystemZ::BI__builtin_s390_vfnmssb:
16845   case SystemZ::BI__builtin_s390_vfnmsdb: {
16846     llvm::Type *ResultType = ConvertType(E->getType());
16847     Value *X = EmitScalarExpr(E->getArg(0));
16848     Value *Y = EmitScalarExpr(E->getArg(1));
16849     Value *Z = EmitScalarExpr(E->getArg(2));
16850     if (Builder.getIsFPConstrained()) {
16851       Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, ResultType);
16852       Value *NegZ = Builder.CreateFNeg(Z, "sub");
16853       return Builder.CreateFNeg(Builder.CreateConstrainedFPCall(F, {X, Y, NegZ}));
16854     } else {
16855       Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
16856       Value *NegZ = Builder.CreateFNeg(Z, "neg");
16857       return Builder.CreateFNeg(Builder.CreateCall(F, {X, Y, NegZ}));
16858     }
16859   }
16860   case SystemZ::BI__builtin_s390_vflpsb:
16861   case SystemZ::BI__builtin_s390_vflpdb: {
16862     llvm::Type *ResultType = ConvertType(E->getType());
16863     Value *X = EmitScalarExpr(E->getArg(0));
16864     Function *F = CGM.getIntrinsic(Intrinsic::fabs, ResultType);
16865     return Builder.CreateCall(F, X);
16866   }
16867   case SystemZ::BI__builtin_s390_vflnsb:
16868   case SystemZ::BI__builtin_s390_vflndb: {
16869     llvm::Type *ResultType = ConvertType(E->getType());
16870     Value *X = EmitScalarExpr(E->getArg(0));
16871     Function *F = CGM.getIntrinsic(Intrinsic::fabs, ResultType);
16872     return Builder.CreateFNeg(Builder.CreateCall(F, X), "neg");
16873   }
16874   case SystemZ::BI__builtin_s390_vfisb:
16875   case SystemZ::BI__builtin_s390_vfidb: {
16876     llvm::Type *ResultType = ConvertType(E->getType());
16877     Value *X = EmitScalarExpr(E->getArg(0));
16878     // Constant-fold the M4 and M5 mask arguments.
16879     llvm::APSInt M4 = *E->getArg(1)->getIntegerConstantExpr(getContext());
16880     llvm::APSInt M5 = *E->getArg(2)->getIntegerConstantExpr(getContext());
16881     // Check whether this instance can be represented via a LLVM standard
16882     // intrinsic.  We only support some combinations of M4 and M5.
16883     Intrinsic::ID ID = Intrinsic::not_intrinsic;
16884     Intrinsic::ID CI;
16885     switch (M4.getZExtValue()) {
16886     default: break;
16887     case 0:  // IEEE-inexact exception allowed
16888       switch (M5.getZExtValue()) {
16889       default: break;
16890       case 0: ID = Intrinsic::rint;
16891               CI = Intrinsic::experimental_constrained_rint; break;
16892       }
16893       break;
16894     case 4:  // IEEE-inexact exception suppressed
16895       switch (M5.getZExtValue()) {
16896       default: break;
16897       case 0: ID = Intrinsic::nearbyint;
16898               CI = Intrinsic::experimental_constrained_nearbyint; break;
16899       case 1: ID = Intrinsic::round;
16900               CI = Intrinsic::experimental_constrained_round; break;
16901       case 5: ID = Intrinsic::trunc;
16902               CI = Intrinsic::experimental_constrained_trunc; break;
16903       case 6: ID = Intrinsic::ceil;
16904               CI = Intrinsic::experimental_constrained_ceil; break;
16905       case 7: ID = Intrinsic::floor;
16906               CI = Intrinsic::experimental_constrained_floor; break;
16907       }
16908       break;
16909     }
16910     if (ID != Intrinsic::not_intrinsic) {
16911       if (Builder.getIsFPConstrained()) {
16912         Function *F = CGM.getIntrinsic(CI, ResultType);
16913         return Builder.CreateConstrainedFPCall(F, X);
16914       } else {
16915         Function *F = CGM.getIntrinsic(ID, ResultType);
16916         return Builder.CreateCall(F, X);
16917       }
16918     }
16919     switch (BuiltinID) { // FIXME: constrained version?
16920       case SystemZ::BI__builtin_s390_vfisb: ID = Intrinsic::s390_vfisb; break;
16921       case SystemZ::BI__builtin_s390_vfidb: ID = Intrinsic::s390_vfidb; break;
16922       default: llvm_unreachable("Unknown BuiltinID");
16923     }
16924     Function *F = CGM.getIntrinsic(ID);
16925     Value *M4Value = llvm::ConstantInt::get(getLLVMContext(), M4);
16926     Value *M5Value = llvm::ConstantInt::get(getLLVMContext(), M5);
16927     return Builder.CreateCall(F, {X, M4Value, M5Value});
16928   }
16929   case SystemZ::BI__builtin_s390_vfmaxsb:
16930   case SystemZ::BI__builtin_s390_vfmaxdb: {
16931     llvm::Type *ResultType = ConvertType(E->getType());
16932     Value *X = EmitScalarExpr(E->getArg(0));
16933     Value *Y = EmitScalarExpr(E->getArg(1));
16934     // Constant-fold the M4 mask argument.
16935     llvm::APSInt M4 = *E->getArg(2)->getIntegerConstantExpr(getContext());
16936     // Check whether this instance can be represented via a LLVM standard
16937     // intrinsic.  We only support some values of M4.
16938     Intrinsic::ID ID = Intrinsic::not_intrinsic;
16939     Intrinsic::ID CI;
16940     switch (M4.getZExtValue()) {
16941     default: break;
16942     case 4: ID = Intrinsic::maxnum;
16943             CI = Intrinsic::experimental_constrained_maxnum; break;
16944     }
16945     if (ID != Intrinsic::not_intrinsic) {
16946       if (Builder.getIsFPConstrained()) {
16947         Function *F = CGM.getIntrinsic(CI, ResultType);
16948         return Builder.CreateConstrainedFPCall(F, {X, Y});
16949       } else {
16950         Function *F = CGM.getIntrinsic(ID, ResultType);
16951         return Builder.CreateCall(F, {X, Y});
16952       }
16953     }
16954     switch (BuiltinID) {
16955       case SystemZ::BI__builtin_s390_vfmaxsb: ID = Intrinsic::s390_vfmaxsb; break;
16956       case SystemZ::BI__builtin_s390_vfmaxdb: ID = Intrinsic::s390_vfmaxdb; break;
16957       default: llvm_unreachable("Unknown BuiltinID");
16958     }
16959     Function *F = CGM.getIntrinsic(ID);
16960     Value *M4Value = llvm::ConstantInt::get(getLLVMContext(), M4);
16961     return Builder.CreateCall(F, {X, Y, M4Value});
16962   }
16963   case SystemZ::BI__builtin_s390_vfminsb:
16964   case SystemZ::BI__builtin_s390_vfmindb: {
16965     llvm::Type *ResultType = ConvertType(E->getType());
16966     Value *X = EmitScalarExpr(E->getArg(0));
16967     Value *Y = EmitScalarExpr(E->getArg(1));
16968     // Constant-fold the M4 mask argument.
16969     llvm::APSInt M4 = *E->getArg(2)->getIntegerConstantExpr(getContext());
16970     // Check whether this instance can be represented via a LLVM standard
16971     // intrinsic.  We only support some values of M4.
16972     Intrinsic::ID ID = Intrinsic::not_intrinsic;
16973     Intrinsic::ID CI;
16974     switch (M4.getZExtValue()) {
16975     default: break;
16976     case 4: ID = Intrinsic::minnum;
16977             CI = Intrinsic::experimental_constrained_minnum; break;
16978     }
16979     if (ID != Intrinsic::not_intrinsic) {
16980       if (Builder.getIsFPConstrained()) {
16981         Function *F = CGM.getIntrinsic(CI, ResultType);
16982         return Builder.CreateConstrainedFPCall(F, {X, Y});
16983       } else {
16984         Function *F = CGM.getIntrinsic(ID, ResultType);
16985         return Builder.CreateCall(F, {X, Y});
16986       }
16987     }
16988     switch (BuiltinID) {
16989       case SystemZ::BI__builtin_s390_vfminsb: ID = Intrinsic::s390_vfminsb; break;
16990       case SystemZ::BI__builtin_s390_vfmindb: ID = Intrinsic::s390_vfmindb; break;
16991       default: llvm_unreachable("Unknown BuiltinID");
16992     }
16993     Function *F = CGM.getIntrinsic(ID);
16994     Value *M4Value = llvm::ConstantInt::get(getLLVMContext(), M4);
16995     return Builder.CreateCall(F, {X, Y, M4Value});
16996   }
16997 
16998   case SystemZ::BI__builtin_s390_vlbrh:
16999   case SystemZ::BI__builtin_s390_vlbrf:
17000   case SystemZ::BI__builtin_s390_vlbrg: {
17001     llvm::Type *ResultType = ConvertType(E->getType());
17002     Value *X = EmitScalarExpr(E->getArg(0));
17003     Function *F = CGM.getIntrinsic(Intrinsic::bswap, ResultType);
17004     return Builder.CreateCall(F, X);
17005   }
17006 
17007   // Vector intrinsics that output the post-instruction CC value.
17008 
17009 #define INTRINSIC_WITH_CC(NAME) \
17010     case SystemZ::BI__builtin_##NAME: \
17011       return EmitSystemZIntrinsicWithCC(*this, Intrinsic::NAME, E)
17012 
17013   INTRINSIC_WITH_CC(s390_vpkshs);
17014   INTRINSIC_WITH_CC(s390_vpksfs);
17015   INTRINSIC_WITH_CC(s390_vpksgs);
17016 
17017   INTRINSIC_WITH_CC(s390_vpklshs);
17018   INTRINSIC_WITH_CC(s390_vpklsfs);
17019   INTRINSIC_WITH_CC(s390_vpklsgs);
17020 
17021   INTRINSIC_WITH_CC(s390_vceqbs);
17022   INTRINSIC_WITH_CC(s390_vceqhs);
17023   INTRINSIC_WITH_CC(s390_vceqfs);
17024   INTRINSIC_WITH_CC(s390_vceqgs);
17025 
17026   INTRINSIC_WITH_CC(s390_vchbs);
17027   INTRINSIC_WITH_CC(s390_vchhs);
17028   INTRINSIC_WITH_CC(s390_vchfs);
17029   INTRINSIC_WITH_CC(s390_vchgs);
17030 
17031   INTRINSIC_WITH_CC(s390_vchlbs);
17032   INTRINSIC_WITH_CC(s390_vchlhs);
17033   INTRINSIC_WITH_CC(s390_vchlfs);
17034   INTRINSIC_WITH_CC(s390_vchlgs);
17035 
17036   INTRINSIC_WITH_CC(s390_vfaebs);
17037   INTRINSIC_WITH_CC(s390_vfaehs);
17038   INTRINSIC_WITH_CC(s390_vfaefs);
17039 
17040   INTRINSIC_WITH_CC(s390_vfaezbs);
17041   INTRINSIC_WITH_CC(s390_vfaezhs);
17042   INTRINSIC_WITH_CC(s390_vfaezfs);
17043 
17044   INTRINSIC_WITH_CC(s390_vfeebs);
17045   INTRINSIC_WITH_CC(s390_vfeehs);
17046   INTRINSIC_WITH_CC(s390_vfeefs);
17047 
17048   INTRINSIC_WITH_CC(s390_vfeezbs);
17049   INTRINSIC_WITH_CC(s390_vfeezhs);
17050   INTRINSIC_WITH_CC(s390_vfeezfs);
17051 
17052   INTRINSIC_WITH_CC(s390_vfenebs);
17053   INTRINSIC_WITH_CC(s390_vfenehs);
17054   INTRINSIC_WITH_CC(s390_vfenefs);
17055 
17056   INTRINSIC_WITH_CC(s390_vfenezbs);
17057   INTRINSIC_WITH_CC(s390_vfenezhs);
17058   INTRINSIC_WITH_CC(s390_vfenezfs);
17059 
17060   INTRINSIC_WITH_CC(s390_vistrbs);
17061   INTRINSIC_WITH_CC(s390_vistrhs);
17062   INTRINSIC_WITH_CC(s390_vistrfs);
17063 
17064   INTRINSIC_WITH_CC(s390_vstrcbs);
17065   INTRINSIC_WITH_CC(s390_vstrchs);
17066   INTRINSIC_WITH_CC(s390_vstrcfs);
17067 
17068   INTRINSIC_WITH_CC(s390_vstrczbs);
17069   INTRINSIC_WITH_CC(s390_vstrczhs);
17070   INTRINSIC_WITH_CC(s390_vstrczfs);
17071 
17072   INTRINSIC_WITH_CC(s390_vfcesbs);
17073   INTRINSIC_WITH_CC(s390_vfcedbs);
17074   INTRINSIC_WITH_CC(s390_vfchsbs);
17075   INTRINSIC_WITH_CC(s390_vfchdbs);
17076   INTRINSIC_WITH_CC(s390_vfchesbs);
17077   INTRINSIC_WITH_CC(s390_vfchedbs);
17078 
17079   INTRINSIC_WITH_CC(s390_vftcisb);
17080   INTRINSIC_WITH_CC(s390_vftcidb);
17081 
17082   INTRINSIC_WITH_CC(s390_vstrsb);
17083   INTRINSIC_WITH_CC(s390_vstrsh);
17084   INTRINSIC_WITH_CC(s390_vstrsf);
17085 
17086   INTRINSIC_WITH_CC(s390_vstrszb);
17087   INTRINSIC_WITH_CC(s390_vstrszh);
17088   INTRINSIC_WITH_CC(s390_vstrszf);
17089 
17090 #undef INTRINSIC_WITH_CC
17091 
17092   default:
17093     return nullptr;
17094   }
17095 }
17096 
17097 namespace {
17098 // Helper classes for mapping MMA builtins to particular LLVM intrinsic variant.
// Describes the LLVM intrinsics that implement one MMA load/store builtin.
struct NVPTXMmaLdstInfo {
  unsigned NumResults;  // Number of elements to load/store
  // Intrinsic IDs for row/col variants. 0 if particular layout is unsupported.
  unsigned IID_col; // Intrinsic ID of the 'col' layout variant, or 0.
  unsigned IID_row; // Intrinsic ID of the 'row' layout variant, or 0.
};
17105 
// MMA_INTR names the strided nvvm_wmma load/store intrinsic for the given
// geometry/operation/type and layout ('row' or 'col').
#define MMA_INTR(geom_op_type, layout) \
  Intrinsic::nvvm_wmma_##geom_op_type##_##layout##_stride
// MMA_LDST builds an NVPTXMmaLdstInfo initializer for an operation that has
// both layouts: {element count, 'col' intrinsic, 'row' intrinsic}.
#define MMA_LDST(n, geom_op_type)                                              \
  { n, MMA_INTR(geom_op_type, col), MMA_INTR(geom_op_type, row) }

// Maps an NVPTX MMA load/store builtin to the number of elements it transfers
// and to the LLVM intrinsic IDs of its 'col' and 'row' layout variants.  An
// intrinsic ID of 0 marks a layout that has no corresponding intrinsic.  Any
// BuiltinID that is not listed below reaches llvm_unreachable.
static NVPTXMmaLdstInfo getNVPTXMmaLdstInfo(unsigned BuiltinID) {
  switch (BuiltinID) {
  // FP MMA loads
  case NVPTX::BI__hmma_m16n16k16_ld_a:
    return MMA_LDST(8, m16n16k16_load_a_f16);
  case NVPTX::BI__hmma_m16n16k16_ld_b:
    return MMA_LDST(8, m16n16k16_load_b_f16);
  case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
    return MMA_LDST(4, m16n16k16_load_c_f16);
  case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
    return MMA_LDST(8, m16n16k16_load_c_f32);
  case NVPTX::BI__hmma_m32n8k16_ld_a:
    return MMA_LDST(8, m32n8k16_load_a_f16);
  case NVPTX::BI__hmma_m32n8k16_ld_b:
    return MMA_LDST(8, m32n8k16_load_b_f16);
  case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
    return MMA_LDST(4, m32n8k16_load_c_f16);
  case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
    return MMA_LDST(8, m32n8k16_load_c_f32);
  case NVPTX::BI__hmma_m8n32k16_ld_a:
    return MMA_LDST(8, m8n32k16_load_a_f16);
  case NVPTX::BI__hmma_m8n32k16_ld_b:
    return MMA_LDST(8, m8n32k16_load_b_f16);
  case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
    return MMA_LDST(4, m8n32k16_load_c_f16);
  case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
    return MMA_LDST(8, m8n32k16_load_c_f32);

  // Integer MMA loads
  case NVPTX::BI__imma_m16n16k16_ld_a_s8:
    return MMA_LDST(2, m16n16k16_load_a_s8);
  case NVPTX::BI__imma_m16n16k16_ld_a_u8:
    return MMA_LDST(2, m16n16k16_load_a_u8);
  case NVPTX::BI__imma_m16n16k16_ld_b_s8:
    return MMA_LDST(2, m16n16k16_load_b_s8);
  case NVPTX::BI__imma_m16n16k16_ld_b_u8:
    return MMA_LDST(2, m16n16k16_load_b_u8);
  case NVPTX::BI__imma_m16n16k16_ld_c:
    return MMA_LDST(8, m16n16k16_load_c_s32);
  case NVPTX::BI__imma_m32n8k16_ld_a_s8:
    return MMA_LDST(4, m32n8k16_load_a_s8);
  case NVPTX::BI__imma_m32n8k16_ld_a_u8:
    return MMA_LDST(4, m32n8k16_load_a_u8);
  case NVPTX::BI__imma_m32n8k16_ld_b_s8:
    return MMA_LDST(1, m32n8k16_load_b_s8);
  case NVPTX::BI__imma_m32n8k16_ld_b_u8:
    return MMA_LDST(1, m32n8k16_load_b_u8);
  case NVPTX::BI__imma_m32n8k16_ld_c:
    return MMA_LDST(8, m32n8k16_load_c_s32);
  case NVPTX::BI__imma_m8n32k16_ld_a_s8:
    return MMA_LDST(1, m8n32k16_load_a_s8);
  case NVPTX::BI__imma_m8n32k16_ld_a_u8:
    return MMA_LDST(1, m8n32k16_load_a_u8);
  case NVPTX::BI__imma_m8n32k16_ld_b_s8:
    return MMA_LDST(4, m8n32k16_load_b_s8);
  case NVPTX::BI__imma_m8n32k16_ld_b_u8:
    return MMA_LDST(4, m8n32k16_load_b_u8);
  case NVPTX::BI__imma_m8n32k16_ld_c:
    return MMA_LDST(8, m8n32k16_load_c_s32);

  // Sub-integer MMA loads.
  // A fragments are only available in the 'row' layout and B fragments only in
  // the 'col' layout, so the other layout's intrinsic ID is 0.
  case NVPTX::BI__imma_m8n8k32_ld_a_s4:
    return {1, 0, MMA_INTR(m8n8k32_load_a_s4, row)};
  case NVPTX::BI__imma_m8n8k32_ld_a_u4:
    return {1, 0, MMA_INTR(m8n8k32_load_a_u4, row)};
  case NVPTX::BI__imma_m8n8k32_ld_b_s4:
    return {1, MMA_INTR(m8n8k32_load_b_s4, col), 0};
  case NVPTX::BI__imma_m8n8k32_ld_b_u4:
    return {1, MMA_INTR(m8n8k32_load_b_u4, col), 0};
  case NVPTX::BI__imma_m8n8k32_ld_c:
    return MMA_LDST(2, m8n8k32_load_c_s32);
  case NVPTX::BI__bmma_m8n8k128_ld_a_b1:
    return {1, 0, MMA_INTR(m8n8k128_load_a_b1, row)};
  case NVPTX::BI__bmma_m8n8k128_ld_b_b1:
    return {1, MMA_INTR(m8n8k128_load_b_b1, col), 0};
  case NVPTX::BI__bmma_m8n8k128_ld_c:
    return MMA_LDST(2, m8n8k128_load_c_s32);

  // Double MMA loads
  case NVPTX::BI__dmma_m8n8k4_ld_a:
    return MMA_LDST(1, m8n8k4_load_a_f64);
  case NVPTX::BI__dmma_m8n8k4_ld_b:
    return MMA_LDST(1, m8n8k4_load_b_f64);
  case NVPTX::BI__dmma_m8n8k4_ld_c:
    return MMA_LDST(2, m8n8k4_load_c_f64);

  // Alternate float MMA loads
  case NVPTX::BI__mma_bf16_m16n16k16_ld_a:
    return MMA_LDST(4, m16n16k16_load_a_bf16);
  case NVPTX::BI__mma_bf16_m16n16k16_ld_b:
    return MMA_LDST(4, m16n16k16_load_b_bf16);
  case NVPTX::BI__mma_bf16_m8n32k16_ld_a:
    return MMA_LDST(2, m8n32k16_load_a_bf16);
  case NVPTX::BI__mma_bf16_m8n32k16_ld_b:
    return MMA_LDST(8, m8n32k16_load_b_bf16);
  case NVPTX::BI__mma_bf16_m32n8k16_ld_a:
    return MMA_LDST(8, m32n8k16_load_a_bf16);
  case NVPTX::BI__mma_bf16_m32n8k16_ld_b:
    return MMA_LDST(2, m32n8k16_load_b_bf16);
  case NVPTX::BI__mma_tf32_m16n16k8_ld_a:
    return MMA_LDST(4, m16n16k8_load_a_tf32);
  case NVPTX::BI__mma_tf32_m16n16k8_ld_b:
    return MMA_LDST(4, m16n16k8_load_b_tf32);
  case NVPTX::BI__mma_tf32_m16n16k8_ld_c:
    return MMA_LDST(8, m16n16k8_load_c_f32);

  // NOTE: We need to follow the inconsistent naming scheme used by NVCC.
  // Unlike PTX and LLVM IR, where stores always use fragment D, NVCC builtins
  // always use fragment C for both loads and stores.
  // FP MMA stores.
  case NVPTX::BI__hmma_m16n16k16_st_c_f16:
    return MMA_LDST(4, m16n16k16_store_d_f16);
  case NVPTX::BI__hmma_m16n16k16_st_c_f32:
    return MMA_LDST(8, m16n16k16_store_d_f32);
  case NVPTX::BI__hmma_m32n8k16_st_c_f16:
    return MMA_LDST(4, m32n8k16_store_d_f16);
  case NVPTX::BI__hmma_m32n8k16_st_c_f32:
    return MMA_LDST(8, m32n8k16_store_d_f32);
  case NVPTX::BI__hmma_m8n32k16_st_c_f16:
    return MMA_LDST(4, m8n32k16_store_d_f16);
  case NVPTX::BI__hmma_m8n32k16_st_c_f32:
    return MMA_LDST(8, m8n32k16_store_d_f32);

  // Integer and sub-integer MMA stores.
  // Another naming quirk. Unlike other MMA builtins that use PTX types in the
  // name, integer loads/stores use LLVM's i32.
  case NVPTX::BI__imma_m16n16k16_st_c_i32:
    return MMA_LDST(8, m16n16k16_store_d_s32);
  case NVPTX::BI__imma_m32n8k16_st_c_i32:
    return MMA_LDST(8, m32n8k16_store_d_s32);
  case NVPTX::BI__imma_m8n32k16_st_c_i32:
    return MMA_LDST(8, m8n32k16_store_d_s32);
  case NVPTX::BI__imma_m8n8k32_st_c_i32:
    return MMA_LDST(2, m8n8k32_store_d_s32);
  case NVPTX::BI__bmma_m8n8k128_st_c_i32:
    return MMA_LDST(2, m8n8k128_store_d_s32);

  // Double MMA store
  case NVPTX::BI__dmma_m8n8k4_st_c_f64:
    return MMA_LDST(2, m8n8k4_store_d_f64);

  // Alternate float MMA store
  case NVPTX::BI__mma_m16n16k8_st_c_f32:
    return MMA_LDST(8, m16n16k8_store_d_f32);

  default:
    llvm_unreachable("Unknown MMA builtin");
  }
}
#undef MMA_LDST
#undef MMA_INTR
17263 
17264 
// Describes one MMA builtin: the element count of each fragment and the table
// of LLVM intrinsic IDs implementing its layout/satf variants.
struct NVPTXMmaInfo {
  // Element counts for the A, B, C and D fragments, in that order.
  unsigned NumEltsA;
  unsigned NumEltsB;
  unsigned NumEltsC;
  unsigned NumEltsD;

  // Variants are ordered by layout-A/layout-B/satf, where 'row' has priority
  // over 'col' for layout. The index of non-satf variants is expected to match
  // the undocumented layout constants used by CUDA's mma.hpp.
  // Slots [0, 4) hold the non-satf variants and slots [4, 8) the satf ones; a
  // slot value of 0 means that the variant does not exist.
  std::array<unsigned, 8> Variants;

  // Returns the intrinsic ID stored for the given Layout (0..3) and Satf flag,
  // or 0 if Layout is out of range or the selected slot is empty.
  //
  // Layout must be validated before it is combined with Satf: otherwise an
  // out-of-range layout would silently alias another slot (e.g. Layout 4
  // without satf would pick the first satf variant, and Layout -1 with satf
  // would pick the last non-satf variant).
  unsigned getMMAIntrinsic(int Layout, bool Satf) const {
    constexpr int NumLayouts = 4;
    if (Layout < 0 || Layout >= NumLayouts)
      return 0;
    const unsigned Index =
        static_cast<unsigned>(Layout) + (Satf ? NumLayouts : 0);
    // Defensive: keeps the lookup in bounds should the table size ever change.
    return Index < Variants.size() ? Variants[Index] : 0;
  }
};
17283 
  // Maps an NVPTX MMA builtin to the element counts of its A, B, C and D
  // fragments and to the table of LLVM intrinsic IDs for its layout/satf
  // variants.  Table slots for variants that do not exist hold 0.  Any
  // BuiltinID that is not listed below reaches llvm_unreachable.
static NVPTXMmaInfo getNVPTXMmaInfo(unsigned BuiltinID) {
  // clang-format off
// The four non-satf variants, in row_row, row_col, col_row, col_col order.
#define MMA_VARIANTS(geom, type)                                    \
      Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type,             \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type,             \
      Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type,             \
      Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type
// The four non-satf variants followed by their _satfinite counterparts, in
// the same layout order.
#define MMA_SATF_VARIANTS(geom, type)                               \
      MMA_VARIANTS(geom, type),                                     \
      Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type##_satfinite, \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
      Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type##_satfinite, \
      Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type##_satfinite
// Sub-integer MMA only supports row.col layout.
#define MMA_VARIANTS_I4(geom, type) \
      0, \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type,             \
      0, \
      0, \
      0, \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
      0, \
      0
// b1 MMA does not support .satfinite.
// Only the row.col slot is populated, with the xor_popc intrinsic.
#define MMA_VARIANTS_B1_XOR(geom, type) \
      0, \
      Intrinsic::nvvm_wmma_##geom##_mma_xor_popc_row_col_##type,             \
      0, \
      0, \
      0, \
      0, \
      0, \
      0
// Same as MMA_VARIANTS_B1_XOR, but with the and_popc intrinsic.
#define MMA_VARIANTS_B1_AND(geom, type) \
      0, \
      Intrinsic::nvvm_wmma_##geom##_mma_and_popc_row_col_##type,             \
      0, \
      0, \
      0, \
      0, \
      0, \
      0
  // clang-format on
  switch (BuiltinID) {
  // FP MMA
  // Note that 'type' argument of MMA_SATF_VARIANTS uses D_C notation, while
  // NumEltsN of return value are ordered as A,B,C,D.
  case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m16n16k16, f16_f16)}}};
  case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m16n16k16, f32_f16)}}};
  case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m16n16k16, f16_f32)}}};
  case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, f32_f32)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m32n8k16, f16_f16)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m32n8k16, f32_f16)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m32n8k16, f16_f32)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, f32_f32)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m8n32k16, f16_f16)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m8n32k16, f32_f16)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m8n32k16, f16_f32)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, f32_f32)}}};

  // Integer MMA
  case NVPTX::BI__imma_m16n16k16_mma_s8:
    return {2, 2, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, s8)}}};
  case NVPTX::BI__imma_m16n16k16_mma_u8:
    return {2, 2, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, u8)}}};
  case NVPTX::BI__imma_m32n8k16_mma_s8:
    return {4, 1, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, s8)}}};
  case NVPTX::BI__imma_m32n8k16_mma_u8:
    return {4, 1, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, u8)}}};
  case NVPTX::BI__imma_m8n32k16_mma_s8:
    return {1, 4, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, s8)}}};
  case NVPTX::BI__imma_m8n32k16_mma_u8:
    return {1, 4, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, u8)}}};

  // Sub-integer MMA
  case NVPTX::BI__imma_m8n8k32_mma_s4:
    return {1, 1, 2, 2, {{MMA_VARIANTS_I4(m8n8k32, s4)}}};
  case NVPTX::BI__imma_m8n8k32_mma_u4:
    return {1, 1, 2, 2, {{MMA_VARIANTS_I4(m8n8k32, u4)}}};
  case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1:
    return {1, 1, 2, 2, {{MMA_VARIANTS_B1_XOR(m8n8k128, b1)}}};
  case NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1:
    return {1, 1, 2, 2, {{MMA_VARIANTS_B1_AND(m8n8k128, b1)}}};

  // The entries below use MMA_VARIANTS, which supplies only the four non-satf
  // slots; the remaining array elements are value-initialized to 0.

  // Double MMA
  case NVPTX::BI__dmma_m8n8k4_mma_f64:
    return {1, 1, 2, 2, {{MMA_VARIANTS(m8n8k4, f64)}}};

  // Alternate FP MMA
  case NVPTX::BI__mma_bf16_m16n16k16_mma_f32:
    return {4, 4, 8, 8, {{MMA_VARIANTS(m16n16k16, bf16)}}};
  case NVPTX::BI__mma_bf16_m8n32k16_mma_f32:
    return {2, 8, 8, 8, {{MMA_VARIANTS(m8n32k16, bf16)}}};
  case NVPTX::BI__mma_bf16_m32n8k16_mma_f32:
    return {8, 2, 8, 8, {{MMA_VARIANTS(m32n8k16, bf16)}}};
  case NVPTX::BI__mma_tf32_m16n16k8_mma_f32:
    return {4, 4, 8, 8, {{MMA_VARIANTS(m16n16k8, tf32)}}};
  default:
    llvm_unreachable("Unexpected builtin ID.");
  }
#undef MMA_VARIANTS
#undef MMA_SATF_VARIANTS
#undef MMA_VARIANTS_I4
#undef MMA_VARIANTS_B1_AND
#undef MMA_VARIANTS_B1_XOR
}
17404 
17405 } // namespace
17406 
17407 Value *
17408 CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, const CallExpr *E) {
17409   auto MakeLdg = [&](unsigned IntrinsicID) {
17410     Value *Ptr = EmitScalarExpr(E->getArg(0));
17411     clang::CharUnits Align =
17412         CGM.getNaturalPointeeTypeAlignment(E->getArg(0)->getType());
17413     return Builder.CreateCall(
17414         CGM.getIntrinsic(IntrinsicID, {Ptr->getType()->getPointerElementType(),
17415                                        Ptr->getType()}),
17416         {Ptr, ConstantInt::get(Builder.getInt32Ty(), Align.getQuantity())});
17417   };
17418   auto MakeScopedAtomic = [&](unsigned IntrinsicID) {
17419     Value *Ptr = EmitScalarExpr(E->getArg(0));
17420     return Builder.CreateCall(
17421         CGM.getIntrinsic(IntrinsicID, {Ptr->getType()->getPointerElementType(),
17422                                        Ptr->getType()}),
17423         {Ptr, EmitScalarExpr(E->getArg(1))});
17424   };
17425   switch (BuiltinID) {
17426   case NVPTX::BI__nvvm_atom_add_gen_i:
17427   case NVPTX::BI__nvvm_atom_add_gen_l:
17428   case NVPTX::BI__nvvm_atom_add_gen_ll:
17429     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Add, E);
17430 
17431   case NVPTX::BI__nvvm_atom_sub_gen_i:
17432   case NVPTX::BI__nvvm_atom_sub_gen_l:
17433   case NVPTX::BI__nvvm_atom_sub_gen_ll:
17434     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Sub, E);
17435 
17436   case NVPTX::BI__nvvm_atom_and_gen_i:
17437   case NVPTX::BI__nvvm_atom_and_gen_l:
17438   case NVPTX::BI__nvvm_atom_and_gen_ll:
17439     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::And, E);
17440 
17441   case NVPTX::BI__nvvm_atom_or_gen_i:
17442   case NVPTX::BI__nvvm_atom_or_gen_l:
17443   case NVPTX::BI__nvvm_atom_or_gen_ll:
17444     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Or, E);
17445 
17446   case NVPTX::BI__nvvm_atom_xor_gen_i:
17447   case NVPTX::BI__nvvm_atom_xor_gen_l:
17448   case NVPTX::BI__nvvm_atom_xor_gen_ll:
17449     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xor, E);
17450 
17451   case NVPTX::BI__nvvm_atom_xchg_gen_i:
17452   case NVPTX::BI__nvvm_atom_xchg_gen_l:
17453   case NVPTX::BI__nvvm_atom_xchg_gen_ll:
17454     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xchg, E);
17455 
17456   case NVPTX::BI__nvvm_atom_max_gen_i:
17457   case NVPTX::BI__nvvm_atom_max_gen_l:
17458   case NVPTX::BI__nvvm_atom_max_gen_ll:
17459     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Max, E);
17460 
17461   case NVPTX::BI__nvvm_atom_max_gen_ui:
17462   case NVPTX::BI__nvvm_atom_max_gen_ul:
17463   case NVPTX::BI__nvvm_atom_max_gen_ull:
17464     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMax, E);
17465 
17466   case NVPTX::BI__nvvm_atom_min_gen_i:
17467   case NVPTX::BI__nvvm_atom_min_gen_l:
17468   case NVPTX::BI__nvvm_atom_min_gen_ll:
17469     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Min, E);
17470 
17471   case NVPTX::BI__nvvm_atom_min_gen_ui:
17472   case NVPTX::BI__nvvm_atom_min_gen_ul:
17473   case NVPTX::BI__nvvm_atom_min_gen_ull:
17474     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMin, E);
17475 
17476   case NVPTX::BI__nvvm_atom_cas_gen_i:
17477   case NVPTX::BI__nvvm_atom_cas_gen_l:
17478   case NVPTX::BI__nvvm_atom_cas_gen_ll:
17479     // __nvvm_atom_cas_gen_* should return the old value rather than the
17480     // success flag.
17481     return MakeAtomicCmpXchgValue(*this, E, /*ReturnBool=*/false);
17482 
17483   case NVPTX::BI__nvvm_atom_add_gen_f:
17484   case NVPTX::BI__nvvm_atom_add_gen_d: {
17485     Value *Ptr = EmitScalarExpr(E->getArg(0));
17486     Value *Val = EmitScalarExpr(E->getArg(1));
17487     return Builder.CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, Ptr, Val,
17488                                    AtomicOrdering::SequentiallyConsistent);
17489   }
17490 
17491   case NVPTX::BI__nvvm_atom_inc_gen_ui: {
17492     Value *Ptr = EmitScalarExpr(E->getArg(0));
17493     Value *Val = EmitScalarExpr(E->getArg(1));
17494     Function *FnALI32 =
17495         CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_inc_32, Ptr->getType());
17496     return Builder.CreateCall(FnALI32, {Ptr, Val});
17497   }
17498 
17499   case NVPTX::BI__nvvm_atom_dec_gen_ui: {
17500     Value *Ptr = EmitScalarExpr(E->getArg(0));
17501     Value *Val = EmitScalarExpr(E->getArg(1));
17502     Function *FnALD32 =
17503         CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_dec_32, Ptr->getType());
17504     return Builder.CreateCall(FnALD32, {Ptr, Val});
17505   }
17506 
17507   case NVPTX::BI__nvvm_ldg_c:
17508   case NVPTX::BI__nvvm_ldg_c2:
17509   case NVPTX::BI__nvvm_ldg_c4:
17510   case NVPTX::BI__nvvm_ldg_s:
17511   case NVPTX::BI__nvvm_ldg_s2:
17512   case NVPTX::BI__nvvm_ldg_s4:
17513   case NVPTX::BI__nvvm_ldg_i:
17514   case NVPTX::BI__nvvm_ldg_i2:
17515   case NVPTX::BI__nvvm_ldg_i4:
17516   case NVPTX::BI__nvvm_ldg_l:
17517   case NVPTX::BI__nvvm_ldg_ll:
17518   case NVPTX::BI__nvvm_ldg_ll2:
17519   case NVPTX::BI__nvvm_ldg_uc:
17520   case NVPTX::BI__nvvm_ldg_uc2:
17521   case NVPTX::BI__nvvm_ldg_uc4:
17522   case NVPTX::BI__nvvm_ldg_us:
17523   case NVPTX::BI__nvvm_ldg_us2:
17524   case NVPTX::BI__nvvm_ldg_us4:
17525   case NVPTX::BI__nvvm_ldg_ui:
17526   case NVPTX::BI__nvvm_ldg_ui2:
17527   case NVPTX::BI__nvvm_ldg_ui4:
17528   case NVPTX::BI__nvvm_ldg_ul:
17529   case NVPTX::BI__nvvm_ldg_ull:
17530   case NVPTX::BI__nvvm_ldg_ull2:
17531     // PTX Interoperability section 2.2: "For a vector with an even number of
17532     // elements, its alignment is set to number of elements times the alignment
17533     // of its member: n*alignof(t)."
17534     return MakeLdg(Intrinsic::nvvm_ldg_global_i);
17535   case NVPTX::BI__nvvm_ldg_f:
17536   case NVPTX::BI__nvvm_ldg_f2:
17537   case NVPTX::BI__nvvm_ldg_f4:
17538   case NVPTX::BI__nvvm_ldg_d:
17539   case NVPTX::BI__nvvm_ldg_d2:
17540     return MakeLdg(Intrinsic::nvvm_ldg_global_f);
17541 
17542   case NVPTX::BI__nvvm_atom_cta_add_gen_i:
17543   case NVPTX::BI__nvvm_atom_cta_add_gen_l:
17544   case NVPTX::BI__nvvm_atom_cta_add_gen_ll:
17545     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_cta);
17546   case NVPTX::BI__nvvm_atom_sys_add_gen_i:
17547   case NVPTX::BI__nvvm_atom_sys_add_gen_l:
17548   case NVPTX::BI__nvvm_atom_sys_add_gen_ll:
17549     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_sys);
17550   case NVPTX::BI__nvvm_atom_cta_add_gen_f:
17551   case NVPTX::BI__nvvm_atom_cta_add_gen_d:
17552     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_cta);
17553   case NVPTX::BI__nvvm_atom_sys_add_gen_f:
17554   case NVPTX::BI__nvvm_atom_sys_add_gen_d:
17555     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_sys);
17556   case NVPTX::BI__nvvm_atom_cta_xchg_gen_i:
17557   case NVPTX::BI__nvvm_atom_cta_xchg_gen_l:
17558   case NVPTX::BI__nvvm_atom_cta_xchg_gen_ll:
17559     return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_cta);
17560   case NVPTX::BI__nvvm_atom_sys_xchg_gen_i:
17561   case NVPTX::BI__nvvm_atom_sys_xchg_gen_l:
17562   case NVPTX::BI__nvvm_atom_sys_xchg_gen_ll:
17563     return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_sys);
17564   case NVPTX::BI__nvvm_atom_cta_max_gen_i:
17565   case NVPTX::BI__nvvm_atom_cta_max_gen_ui:
17566   case NVPTX::BI__nvvm_atom_cta_max_gen_l:
17567   case NVPTX::BI__nvvm_atom_cta_max_gen_ul:
17568   case NVPTX::BI__nvvm_atom_cta_max_gen_ll:
17569   case NVPTX::BI__nvvm_atom_cta_max_gen_ull:
17570     return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_cta);
17571   case NVPTX::BI__nvvm_atom_sys_max_gen_i:
17572   case NVPTX::BI__nvvm_atom_sys_max_gen_ui:
17573   case NVPTX::BI__nvvm_atom_sys_max_gen_l:
17574   case NVPTX::BI__nvvm_atom_sys_max_gen_ul:
17575   case NVPTX::BI__nvvm_atom_sys_max_gen_ll:
17576   case NVPTX::BI__nvvm_atom_sys_max_gen_ull:
17577     return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_sys);
17578   case NVPTX::BI__nvvm_atom_cta_min_gen_i:
17579   case NVPTX::BI__nvvm_atom_cta_min_gen_ui:
17580   case NVPTX::BI__nvvm_atom_cta_min_gen_l:
17581   case NVPTX::BI__nvvm_atom_cta_min_gen_ul:
17582   case NVPTX::BI__nvvm_atom_cta_min_gen_ll:
17583   case NVPTX::BI__nvvm_atom_cta_min_gen_ull:
17584     return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_cta);
17585   case NVPTX::BI__nvvm_atom_sys_min_gen_i:
17586   case NVPTX::BI__nvvm_atom_sys_min_gen_ui:
17587   case NVPTX::BI__nvvm_atom_sys_min_gen_l:
17588   case NVPTX::BI__nvvm_atom_sys_min_gen_ul:
17589   case NVPTX::BI__nvvm_atom_sys_min_gen_ll:
17590   case NVPTX::BI__nvvm_atom_sys_min_gen_ull:
17591     return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_sys);
17592   case NVPTX::BI__nvvm_atom_cta_inc_gen_ui:
17593     return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_cta);
17594   case NVPTX::BI__nvvm_atom_cta_dec_gen_ui:
17595     return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_cta);
17596   case NVPTX::BI__nvvm_atom_sys_inc_gen_ui:
17597     return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_sys);
17598   case NVPTX::BI__nvvm_atom_sys_dec_gen_ui:
17599     return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_sys);
17600   case NVPTX::BI__nvvm_atom_cta_and_gen_i:
17601   case NVPTX::BI__nvvm_atom_cta_and_gen_l:
17602   case NVPTX::BI__nvvm_atom_cta_and_gen_ll:
17603     return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_cta);
17604   case NVPTX::BI__nvvm_atom_sys_and_gen_i:
17605   case NVPTX::BI__nvvm_atom_sys_and_gen_l:
17606   case NVPTX::BI__nvvm_atom_sys_and_gen_ll:
17607     return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_sys);
17608   case NVPTX::BI__nvvm_atom_cta_or_gen_i:
17609   case NVPTX::BI__nvvm_atom_cta_or_gen_l:
17610   case NVPTX::BI__nvvm_atom_cta_or_gen_ll:
17611     return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_cta);
17612   case NVPTX::BI__nvvm_atom_sys_or_gen_i:
17613   case NVPTX::BI__nvvm_atom_sys_or_gen_l:
17614   case NVPTX::BI__nvvm_atom_sys_or_gen_ll:
17615     return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_sys);
17616   case NVPTX::BI__nvvm_atom_cta_xor_gen_i:
17617   case NVPTX::BI__nvvm_atom_cta_xor_gen_l:
17618   case NVPTX::BI__nvvm_atom_cta_xor_gen_ll:
17619     return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_cta);
17620   case NVPTX::BI__nvvm_atom_sys_xor_gen_i:
17621   case NVPTX::BI__nvvm_atom_sys_xor_gen_l:
17622   case NVPTX::BI__nvvm_atom_sys_xor_gen_ll:
17623     return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_sys);
17624   case NVPTX::BI__nvvm_atom_cta_cas_gen_i:
17625   case NVPTX::BI__nvvm_atom_cta_cas_gen_l:
17626   case NVPTX::BI__nvvm_atom_cta_cas_gen_ll: {
17627     Value *Ptr = EmitScalarExpr(E->getArg(0));
17628     return Builder.CreateCall(
17629         CGM.getIntrinsic(
17630             Intrinsic::nvvm_atomic_cas_gen_i_cta,
17631             {Ptr->getType()->getPointerElementType(), Ptr->getType()}),
17632         {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
17633   }
17634   case NVPTX::BI__nvvm_atom_sys_cas_gen_i:
17635   case NVPTX::BI__nvvm_atom_sys_cas_gen_l:
17636   case NVPTX::BI__nvvm_atom_sys_cas_gen_ll: {
17637     Value *Ptr = EmitScalarExpr(E->getArg(0));
17638     return Builder.CreateCall(
17639         CGM.getIntrinsic(
17640             Intrinsic::nvvm_atomic_cas_gen_i_sys,
17641             {Ptr->getType()->getPointerElementType(), Ptr->getType()}),
17642         {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
17643   }
17644   case NVPTX::BI__nvvm_match_all_sync_i32p:
17645   case NVPTX::BI__nvvm_match_all_sync_i64p: {
17646     Value *Mask = EmitScalarExpr(E->getArg(0));
17647     Value *Val = EmitScalarExpr(E->getArg(1));
17648     Address PredOutPtr = EmitPointerWithAlignment(E->getArg(2));
17649     Value *ResultPair = Builder.CreateCall(
17650         CGM.getIntrinsic(BuiltinID == NVPTX::BI__nvvm_match_all_sync_i32p
17651                              ? Intrinsic::nvvm_match_all_sync_i32p
17652                              : Intrinsic::nvvm_match_all_sync_i64p),
17653         {Mask, Val});
17654     Value *Pred = Builder.CreateZExt(Builder.CreateExtractValue(ResultPair, 1),
17655                                      PredOutPtr.getElementType());
17656     Builder.CreateStore(Pred, PredOutPtr);
17657     return Builder.CreateExtractValue(ResultPair, 0);
17658   }
17659 
17660   // FP MMA loads
17661   case NVPTX::BI__hmma_m16n16k16_ld_a:
17662   case NVPTX::BI__hmma_m16n16k16_ld_b:
17663   case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
17664   case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
17665   case NVPTX::BI__hmma_m32n8k16_ld_a:
17666   case NVPTX::BI__hmma_m32n8k16_ld_b:
17667   case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
17668   case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
17669   case NVPTX::BI__hmma_m8n32k16_ld_a:
17670   case NVPTX::BI__hmma_m8n32k16_ld_b:
17671   case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
17672   case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
17673   // Integer MMA loads.
17674   case NVPTX::BI__imma_m16n16k16_ld_a_s8:
17675   case NVPTX::BI__imma_m16n16k16_ld_a_u8:
17676   case NVPTX::BI__imma_m16n16k16_ld_b_s8:
17677   case NVPTX::BI__imma_m16n16k16_ld_b_u8:
17678   case NVPTX::BI__imma_m16n16k16_ld_c:
17679   case NVPTX::BI__imma_m32n8k16_ld_a_s8:
17680   case NVPTX::BI__imma_m32n8k16_ld_a_u8:
17681   case NVPTX::BI__imma_m32n8k16_ld_b_s8:
17682   case NVPTX::BI__imma_m32n8k16_ld_b_u8:
17683   case NVPTX::BI__imma_m32n8k16_ld_c:
17684   case NVPTX::BI__imma_m8n32k16_ld_a_s8:
17685   case NVPTX::BI__imma_m8n32k16_ld_a_u8:
17686   case NVPTX::BI__imma_m8n32k16_ld_b_s8:
17687   case NVPTX::BI__imma_m8n32k16_ld_b_u8:
17688   case NVPTX::BI__imma_m8n32k16_ld_c:
17689   // Sub-integer MMA loads.
17690   case NVPTX::BI__imma_m8n8k32_ld_a_s4:
17691   case NVPTX::BI__imma_m8n8k32_ld_a_u4:
17692   case NVPTX::BI__imma_m8n8k32_ld_b_s4:
17693   case NVPTX::BI__imma_m8n8k32_ld_b_u4:
17694   case NVPTX::BI__imma_m8n8k32_ld_c:
17695   case NVPTX::BI__bmma_m8n8k128_ld_a_b1:
17696   case NVPTX::BI__bmma_m8n8k128_ld_b_b1:
17697   case NVPTX::BI__bmma_m8n8k128_ld_c:
17698   // Double MMA loads.
17699   case NVPTX::BI__dmma_m8n8k4_ld_a:
17700   case NVPTX::BI__dmma_m8n8k4_ld_b:
17701   case NVPTX::BI__dmma_m8n8k4_ld_c:
17702   // Alternate float MMA loads.
17703   case NVPTX::BI__mma_bf16_m16n16k16_ld_a:
17704   case NVPTX::BI__mma_bf16_m16n16k16_ld_b:
17705   case NVPTX::BI__mma_bf16_m8n32k16_ld_a:
17706   case NVPTX::BI__mma_bf16_m8n32k16_ld_b:
17707   case NVPTX::BI__mma_bf16_m32n8k16_ld_a:
17708   case NVPTX::BI__mma_bf16_m32n8k16_ld_b:
17709   case NVPTX::BI__mma_tf32_m16n16k8_ld_a:
17710   case NVPTX::BI__mma_tf32_m16n16k8_ld_b:
17711   case NVPTX::BI__mma_tf32_m16n16k8_ld_c: {
17712     Address Dst = EmitPointerWithAlignment(E->getArg(0));
17713     Value *Src = EmitScalarExpr(E->getArg(1));
17714     Value *Ldm = EmitScalarExpr(E->getArg(2));
17715     Optional<llvm::APSInt> isColMajorArg =
17716         E->getArg(3)->getIntegerConstantExpr(getContext());
17717     if (!isColMajorArg)
17718       return nullptr;
17719     bool isColMajor = isColMajorArg->getSExtValue();
17720     NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
17721     unsigned IID = isColMajor ? II.IID_col : II.IID_row;
17722     if (IID == 0)
17723       return nullptr;
17724 
17725     Value *Result =
17726         Builder.CreateCall(CGM.getIntrinsic(IID, Src->getType()), {Src, Ldm});
17727 
17728     // Save returned values.
17729     assert(II.NumResults);
17730     if (II.NumResults == 1) {
17731       Builder.CreateAlignedStore(Result, Dst.getPointer(),
17732                                  CharUnits::fromQuantity(4));
17733     } else {
17734       for (unsigned i = 0; i < II.NumResults; ++i) {
17735         Builder.CreateAlignedStore(
17736             Builder.CreateBitCast(Builder.CreateExtractValue(Result, i),
17737                                   Dst.getElementType()),
17738             Builder.CreateGEP(Dst.getElementType(), Dst.getPointer(),
17739                               llvm::ConstantInt::get(IntTy, i)),
17740             CharUnits::fromQuantity(4));
17741       }
17742     }
17743     return Result;
17744   }
17745 
17746   case NVPTX::BI__hmma_m16n16k16_st_c_f16:
17747   case NVPTX::BI__hmma_m16n16k16_st_c_f32:
17748   case NVPTX::BI__hmma_m32n8k16_st_c_f16:
17749   case NVPTX::BI__hmma_m32n8k16_st_c_f32:
17750   case NVPTX::BI__hmma_m8n32k16_st_c_f16:
17751   case NVPTX::BI__hmma_m8n32k16_st_c_f32:
17752   case NVPTX::BI__imma_m16n16k16_st_c_i32:
17753   case NVPTX::BI__imma_m32n8k16_st_c_i32:
17754   case NVPTX::BI__imma_m8n32k16_st_c_i32:
17755   case NVPTX::BI__imma_m8n8k32_st_c_i32:
17756   case NVPTX::BI__bmma_m8n8k128_st_c_i32:
17757   case NVPTX::BI__dmma_m8n8k4_st_c_f64:
17758   case NVPTX::BI__mma_m16n16k8_st_c_f32: {
17759     Value *Dst = EmitScalarExpr(E->getArg(0));
17760     Address Src = EmitPointerWithAlignment(E->getArg(1));
17761     Value *Ldm = EmitScalarExpr(E->getArg(2));
17762     Optional<llvm::APSInt> isColMajorArg =
17763         E->getArg(3)->getIntegerConstantExpr(getContext());
17764     if (!isColMajorArg)
17765       return nullptr;
17766     bool isColMajor = isColMajorArg->getSExtValue();
17767     NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
17768     unsigned IID = isColMajor ? II.IID_col : II.IID_row;
17769     if (IID == 0)
17770       return nullptr;
17771     Function *Intrinsic =
17772         CGM.getIntrinsic(IID, Dst->getType());
17773     llvm::Type *ParamType = Intrinsic->getFunctionType()->getParamType(1);
17774     SmallVector<Value *, 10> Values = {Dst};
17775     for (unsigned i = 0; i < II.NumResults; ++i) {
17776       Value *V = Builder.CreateAlignedLoad(
17777           Src.getElementType(),
17778           Builder.CreateGEP(Src.getElementType(), Src.getPointer(),
17779                             llvm::ConstantInt::get(IntTy, i)),
17780           CharUnits::fromQuantity(4));
17781       Values.push_back(Builder.CreateBitCast(V, ParamType));
17782     }
17783     Values.push_back(Ldm);
17784     Value *Result = Builder.CreateCall(Intrinsic, Values);
17785     return Result;
17786   }
17787 
17788   // BI__hmma_m16n16k16_mma_<Dtype><CType>(d, a, b, c, layout, satf) -->
17789   // Intrinsic::nvvm_wmma_m16n16k16_mma_sync<layout A,B><DType><CType><Satf>
17790   case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
17791   case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
17792   case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
17793   case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
17794   case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
17795   case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
17796   case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
17797   case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
17798   case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
17799   case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
17800   case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
17801   case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
17802   case NVPTX::BI__imma_m16n16k16_mma_s8:
17803   case NVPTX::BI__imma_m16n16k16_mma_u8:
17804   case NVPTX::BI__imma_m32n8k16_mma_s8:
17805   case NVPTX::BI__imma_m32n8k16_mma_u8:
17806   case NVPTX::BI__imma_m8n32k16_mma_s8:
17807   case NVPTX::BI__imma_m8n32k16_mma_u8:
17808   case NVPTX::BI__imma_m8n8k32_mma_s4:
17809   case NVPTX::BI__imma_m8n8k32_mma_u4:
17810   case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1:
17811   case NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1:
17812   case NVPTX::BI__dmma_m8n8k4_mma_f64:
17813   case NVPTX::BI__mma_bf16_m16n16k16_mma_f32:
17814   case NVPTX::BI__mma_bf16_m8n32k16_mma_f32:
17815   case NVPTX::BI__mma_bf16_m32n8k16_mma_f32:
17816   case NVPTX::BI__mma_tf32_m16n16k8_mma_f32: {
17817     Address Dst = EmitPointerWithAlignment(E->getArg(0));
17818     Address SrcA = EmitPointerWithAlignment(E->getArg(1));
17819     Address SrcB = EmitPointerWithAlignment(E->getArg(2));
17820     Address SrcC = EmitPointerWithAlignment(E->getArg(3));
17821     Optional<llvm::APSInt> LayoutArg =
17822         E->getArg(4)->getIntegerConstantExpr(getContext());
17823     if (!LayoutArg)
17824       return nullptr;
17825     int Layout = LayoutArg->getSExtValue();
17826     if (Layout < 0 || Layout > 3)
17827       return nullptr;
17828     llvm::APSInt SatfArg;
17829     if (BuiltinID == NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1 ||
17830         BuiltinID == NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1)
17831       SatfArg = 0;  // .b1 does not have satf argument.
17832     else if (Optional<llvm::APSInt> OptSatfArg =
17833                  E->getArg(5)->getIntegerConstantExpr(getContext()))
17834       SatfArg = *OptSatfArg;
17835     else
17836       return nullptr;
17837     bool Satf = SatfArg.getSExtValue();
17838     NVPTXMmaInfo MI = getNVPTXMmaInfo(BuiltinID);
17839     unsigned IID = MI.getMMAIntrinsic(Layout, Satf);
17840     if (IID == 0)  // Unsupported combination of Layout/Satf.
17841       return nullptr;
17842 
17843     SmallVector<Value *, 24> Values;
17844     Function *Intrinsic = CGM.getIntrinsic(IID);
17845     llvm::Type *AType = Intrinsic->getFunctionType()->getParamType(0);
17846     // Load A
17847     for (unsigned i = 0; i < MI.NumEltsA; ++i) {
17848       Value *V = Builder.CreateAlignedLoad(
17849           SrcA.getElementType(),
17850           Builder.CreateGEP(SrcA.getElementType(), SrcA.getPointer(),
17851                             llvm::ConstantInt::get(IntTy, i)),
17852           CharUnits::fromQuantity(4));
17853       Values.push_back(Builder.CreateBitCast(V, AType));
17854     }
17855     // Load B
17856     llvm::Type *BType = Intrinsic->getFunctionType()->getParamType(MI.NumEltsA);
17857     for (unsigned i = 0; i < MI.NumEltsB; ++i) {
17858       Value *V = Builder.CreateAlignedLoad(
17859           SrcB.getElementType(),
17860           Builder.CreateGEP(SrcB.getElementType(), SrcB.getPointer(),
17861                             llvm::ConstantInt::get(IntTy, i)),
17862           CharUnits::fromQuantity(4));
17863       Values.push_back(Builder.CreateBitCast(V, BType));
17864     }
17865     // Load C
17866     llvm::Type *CType =
17867         Intrinsic->getFunctionType()->getParamType(MI.NumEltsA + MI.NumEltsB);
17868     for (unsigned i = 0; i < MI.NumEltsC; ++i) {
17869       Value *V = Builder.CreateAlignedLoad(
17870           SrcC.getElementType(),
17871           Builder.CreateGEP(SrcC.getElementType(), SrcC.getPointer(),
17872                             llvm::ConstantInt::get(IntTy, i)),
17873           CharUnits::fromQuantity(4));
17874       Values.push_back(Builder.CreateBitCast(V, CType));
17875     }
17876     Value *Result = Builder.CreateCall(Intrinsic, Values);
17877     llvm::Type *DType = Dst.getElementType();
17878     for (unsigned i = 0; i < MI.NumEltsD; ++i)
17879       Builder.CreateAlignedStore(
17880           Builder.CreateBitCast(Builder.CreateExtractValue(Result, i), DType),
17881           Builder.CreateGEP(Dst.getElementType(), Dst.getPointer(),
17882                             llvm::ConstantInt::get(IntTy, i)),
17883           CharUnits::fromQuantity(4));
17884     return Result;
17885   }
17886   default:
17887     return nullptr;
17888   }
17889 }
17890 
17891 namespace {
17892 struct BuiltinAlignArgs {
17893   llvm::Value *Src = nullptr;
17894   llvm::Type *SrcType = nullptr;
17895   llvm::Value *Alignment = nullptr;
17896   llvm::Value *Mask = nullptr;
17897   llvm::IntegerType *IntType = nullptr;
17898 
17899   BuiltinAlignArgs(const CallExpr *E, CodeGenFunction &CGF) {
17900     QualType AstType = E->getArg(0)->getType();
17901     if (AstType->isArrayType())
17902       Src = CGF.EmitArrayToPointerDecay(E->getArg(0)).getPointer();
17903     else
17904       Src = CGF.EmitScalarExpr(E->getArg(0));
17905     SrcType = Src->getType();
17906     if (SrcType->isPointerTy()) {
17907       IntType = IntegerType::get(
17908           CGF.getLLVMContext(),
17909           CGF.CGM.getDataLayout().getIndexTypeSizeInBits(SrcType));
17910     } else {
17911       assert(SrcType->isIntegerTy());
17912       IntType = cast<llvm::IntegerType>(SrcType);
17913     }
17914     Alignment = CGF.EmitScalarExpr(E->getArg(1));
17915     Alignment = CGF.Builder.CreateZExtOrTrunc(Alignment, IntType, "alignment");
17916     auto *One = llvm::ConstantInt::get(IntType, 1);
17917     Mask = CGF.Builder.CreateSub(Alignment, One, "mask");
17918   }
17919 };
17920 } // namespace
17921 
17922 /// Generate (x & (y-1)) == 0.
17923 RValue CodeGenFunction::EmitBuiltinIsAligned(const CallExpr *E) {
17924   BuiltinAlignArgs Args(E, *this);
17925   llvm::Value *SrcAddress = Args.Src;
17926   if (Args.SrcType->isPointerTy())
17927     SrcAddress =
17928         Builder.CreateBitOrPointerCast(Args.Src, Args.IntType, "src_addr");
17929   return RValue::get(Builder.CreateICmpEQ(
17930       Builder.CreateAnd(SrcAddress, Args.Mask, "set_bits"),
17931       llvm::Constant::getNullValue(Args.IntType), "is_aligned"));
17932 }
17933 
17934 /// Generate (x & ~(y-1)) to align down or ((x+(y-1)) & ~(y-1)) to align up.
17935 /// Note: For pointer types we can avoid ptrtoint/inttoptr pairs by using the
17936 /// llvm.ptrmask instrinsic (with a GEP before in the align_up case).
17937 /// TODO: actually use ptrmask once most optimization passes know about it.
17938 RValue CodeGenFunction::EmitBuiltinAlignTo(const CallExpr *E, bool AlignUp) {
17939   BuiltinAlignArgs Args(E, *this);
17940   llvm::Value *SrcAddr = Args.Src;
17941   if (Args.Src->getType()->isPointerTy())
17942     SrcAddr = Builder.CreatePtrToInt(Args.Src, Args.IntType, "intptr");
17943   llvm::Value *SrcForMask = SrcAddr;
17944   if (AlignUp) {
17945     // When aligning up we have to first add the mask to ensure we go over the
17946     // next alignment value and then align down to the next valid multiple.
17947     // By adding the mask, we ensure that align_up on an already aligned
17948     // value will not change the value.
17949     SrcForMask = Builder.CreateAdd(SrcForMask, Args.Mask, "over_boundary");
17950   }
17951   // Invert the mask to only clear the lower bits.
17952   llvm::Value *InvertedMask = Builder.CreateNot(Args.Mask, "inverted_mask");
17953   llvm::Value *Result =
17954       Builder.CreateAnd(SrcForMask, InvertedMask, "aligned_result");
17955   if (Args.Src->getType()->isPointerTy()) {
17956     /// TODO: Use ptrmask instead of ptrtoint+gep once it is optimized well.
17957     // Result = Builder.CreateIntrinsic(
17958     //  Intrinsic::ptrmask, {Args.SrcType, SrcForMask->getType(), Args.IntType},
17959     //  {SrcForMask, NegatedMask}, nullptr, "aligned_result");
17960     Result->setName("aligned_intptr");
17961     llvm::Value *Difference = Builder.CreateSub(Result, SrcAddr, "diff");
17962     // The result must point to the same underlying allocation. This means we
17963     // can use an inbounds GEP to enable better optimization.
17964     Value *Base = EmitCastToVoidPtr(Args.Src);
17965     if (getLangOpts().isSignedOverflowDefined())
17966       Result = Builder.CreateGEP(Int8Ty, Base, Difference, "aligned_result");
17967     else
17968       Result = EmitCheckedInBoundsGEP(Int8Ty, Base, Difference,
17969                                       /*SignedIndices=*/true,
17970                                       /*isSubtraction=*/!AlignUp,
17971                                       E->getExprLoc(), "aligned_result");
17972     Result = Builder.CreatePointerCast(Result, Args.SrcType);
17973     // Emit an alignment assumption to ensure that the new alignment is
17974     // propagated to loads/stores, etc.
17975     emitAlignmentAssumption(Result, E, E->getExprLoc(), Args.Alignment);
17976   }
17977   assert(Result->getType() == Args.SrcType);
17978   return RValue::get(Result);
17979 }
17980 
17981 Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
17982                                                    const CallExpr *E) {
17983   switch (BuiltinID) {
17984   case WebAssembly::BI__builtin_wasm_memory_size: {
17985     llvm::Type *ResultType = ConvertType(E->getType());
17986     Value *I = EmitScalarExpr(E->getArg(0));
17987     Function *Callee =
17988         CGM.getIntrinsic(Intrinsic::wasm_memory_size, ResultType);
17989     return Builder.CreateCall(Callee, I);
17990   }
17991   case WebAssembly::BI__builtin_wasm_memory_grow: {
17992     llvm::Type *ResultType = ConvertType(E->getType());
17993     Value *Args[] = {EmitScalarExpr(E->getArg(0)),
17994                      EmitScalarExpr(E->getArg(1))};
17995     Function *Callee =
17996         CGM.getIntrinsic(Intrinsic::wasm_memory_grow, ResultType);
17997     return Builder.CreateCall(Callee, Args);
17998   }
17999   case WebAssembly::BI__builtin_wasm_tls_size: {
18000     llvm::Type *ResultType = ConvertType(E->getType());
18001     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_tls_size, ResultType);
18002     return Builder.CreateCall(Callee);
18003   }
18004   case WebAssembly::BI__builtin_wasm_tls_align: {
18005     llvm::Type *ResultType = ConvertType(E->getType());
18006     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_tls_align, ResultType);
18007     return Builder.CreateCall(Callee);
18008   }
18009   case WebAssembly::BI__builtin_wasm_tls_base: {
18010     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_tls_base);
18011     return Builder.CreateCall(Callee);
18012   }
18013   case WebAssembly::BI__builtin_wasm_throw: {
18014     Value *Tag = EmitScalarExpr(E->getArg(0));
18015     Value *Obj = EmitScalarExpr(E->getArg(1));
18016     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_throw);
18017     return Builder.CreateCall(Callee, {Tag, Obj});
18018   }
18019   case WebAssembly::BI__builtin_wasm_rethrow: {
18020     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_rethrow);
18021     return Builder.CreateCall(Callee);
18022   }
18023   case WebAssembly::BI__builtin_wasm_memory_atomic_wait32: {
18024     Value *Addr = EmitScalarExpr(E->getArg(0));
18025     Value *Expected = EmitScalarExpr(E->getArg(1));
18026     Value *Timeout = EmitScalarExpr(E->getArg(2));
18027     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_memory_atomic_wait32);
18028     return Builder.CreateCall(Callee, {Addr, Expected, Timeout});
18029   }
18030   case WebAssembly::BI__builtin_wasm_memory_atomic_wait64: {
18031     Value *Addr = EmitScalarExpr(E->getArg(0));
18032     Value *Expected = EmitScalarExpr(E->getArg(1));
18033     Value *Timeout = EmitScalarExpr(E->getArg(2));
18034     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_memory_atomic_wait64);
18035     return Builder.CreateCall(Callee, {Addr, Expected, Timeout});
18036   }
18037   case WebAssembly::BI__builtin_wasm_memory_atomic_notify: {
18038     Value *Addr = EmitScalarExpr(E->getArg(0));
18039     Value *Count = EmitScalarExpr(E->getArg(1));
18040     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_memory_atomic_notify);
18041     return Builder.CreateCall(Callee, {Addr, Count});
18042   }
18043   case WebAssembly::BI__builtin_wasm_trunc_s_i32_f32:
18044   case WebAssembly::BI__builtin_wasm_trunc_s_i32_f64:
18045   case WebAssembly::BI__builtin_wasm_trunc_s_i64_f32:
18046   case WebAssembly::BI__builtin_wasm_trunc_s_i64_f64: {
18047     Value *Src = EmitScalarExpr(E->getArg(0));
18048     llvm::Type *ResT = ConvertType(E->getType());
18049     Function *Callee =
18050         CGM.getIntrinsic(Intrinsic::wasm_trunc_signed, {ResT, Src->getType()});
18051     return Builder.CreateCall(Callee, {Src});
18052   }
18053   case WebAssembly::BI__builtin_wasm_trunc_u_i32_f32:
18054   case WebAssembly::BI__builtin_wasm_trunc_u_i32_f64:
18055   case WebAssembly::BI__builtin_wasm_trunc_u_i64_f32:
18056   case WebAssembly::BI__builtin_wasm_trunc_u_i64_f64: {
18057     Value *Src = EmitScalarExpr(E->getArg(0));
18058     llvm::Type *ResT = ConvertType(E->getType());
18059     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_trunc_unsigned,
18060                                         {ResT, Src->getType()});
18061     return Builder.CreateCall(Callee, {Src});
18062   }
18063   case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32_f32:
18064   case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32_f64:
18065   case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i64_f32:
18066   case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i64_f64:
18067   case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32x4_f32x4: {
18068     Value *Src = EmitScalarExpr(E->getArg(0));
18069     llvm::Type *ResT = ConvertType(E->getType());
18070     Function *Callee =
18071         CGM.getIntrinsic(Intrinsic::fptosi_sat, {ResT, Src->getType()});
18072     return Builder.CreateCall(Callee, {Src});
18073   }
18074   case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32_f32:
18075   case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32_f64:
18076   case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i64_f32:
18077   case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i64_f64:
18078   case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32x4_f32x4: {
18079     Value *Src = EmitScalarExpr(E->getArg(0));
18080     llvm::Type *ResT = ConvertType(E->getType());
18081     Function *Callee =
18082         CGM.getIntrinsic(Intrinsic::fptoui_sat, {ResT, Src->getType()});
18083     return Builder.CreateCall(Callee, {Src});
18084   }
18085   case WebAssembly::BI__builtin_wasm_min_f32:
18086   case WebAssembly::BI__builtin_wasm_min_f64:
18087   case WebAssembly::BI__builtin_wasm_min_f32x4:
18088   case WebAssembly::BI__builtin_wasm_min_f64x2: {
18089     Value *LHS = EmitScalarExpr(E->getArg(0));
18090     Value *RHS = EmitScalarExpr(E->getArg(1));
18091     Function *Callee =
18092         CGM.getIntrinsic(Intrinsic::minimum, ConvertType(E->getType()));
18093     return Builder.CreateCall(Callee, {LHS, RHS});
18094   }
18095   case WebAssembly::BI__builtin_wasm_max_f32:
18096   case WebAssembly::BI__builtin_wasm_max_f64:
18097   case WebAssembly::BI__builtin_wasm_max_f32x4:
18098   case WebAssembly::BI__builtin_wasm_max_f64x2: {
18099     Value *LHS = EmitScalarExpr(E->getArg(0));
18100     Value *RHS = EmitScalarExpr(E->getArg(1));
18101     Function *Callee =
18102         CGM.getIntrinsic(Intrinsic::maximum, ConvertType(E->getType()));
18103     return Builder.CreateCall(Callee, {LHS, RHS});
18104   }
18105   case WebAssembly::BI__builtin_wasm_pmin_f32x4:
18106   case WebAssembly::BI__builtin_wasm_pmin_f64x2: {
18107     Value *LHS = EmitScalarExpr(E->getArg(0));
18108     Value *RHS = EmitScalarExpr(E->getArg(1));
18109     Function *Callee =
18110         CGM.getIntrinsic(Intrinsic::wasm_pmin, ConvertType(E->getType()));
18111     return Builder.CreateCall(Callee, {LHS, RHS});
18112   }
18113   case WebAssembly::BI__builtin_wasm_pmax_f32x4:
18114   case WebAssembly::BI__builtin_wasm_pmax_f64x2: {
18115     Value *LHS = EmitScalarExpr(E->getArg(0));
18116     Value *RHS = EmitScalarExpr(E->getArg(1));
18117     Function *Callee =
18118         CGM.getIntrinsic(Intrinsic::wasm_pmax, ConvertType(E->getType()));
18119     return Builder.CreateCall(Callee, {LHS, RHS});
18120   }
18121   case WebAssembly::BI__builtin_wasm_ceil_f32x4:
18122   case WebAssembly::BI__builtin_wasm_floor_f32x4:
18123   case WebAssembly::BI__builtin_wasm_trunc_f32x4:
18124   case WebAssembly::BI__builtin_wasm_nearest_f32x4:
18125   case WebAssembly::BI__builtin_wasm_ceil_f64x2:
18126   case WebAssembly::BI__builtin_wasm_floor_f64x2:
18127   case WebAssembly::BI__builtin_wasm_trunc_f64x2:
18128   case WebAssembly::BI__builtin_wasm_nearest_f64x2: {
18129     unsigned IntNo;
18130     switch (BuiltinID) {
18131     case WebAssembly::BI__builtin_wasm_ceil_f32x4:
18132     case WebAssembly::BI__builtin_wasm_ceil_f64x2:
18133       IntNo = Intrinsic::ceil;
18134       break;
18135     case WebAssembly::BI__builtin_wasm_floor_f32x4:
18136     case WebAssembly::BI__builtin_wasm_floor_f64x2:
18137       IntNo = Intrinsic::floor;
18138       break;
18139     case WebAssembly::BI__builtin_wasm_trunc_f32x4:
18140     case WebAssembly::BI__builtin_wasm_trunc_f64x2:
18141       IntNo = Intrinsic::trunc;
18142       break;
18143     case WebAssembly::BI__builtin_wasm_nearest_f32x4:
18144     case WebAssembly::BI__builtin_wasm_nearest_f64x2:
18145       IntNo = Intrinsic::nearbyint;
18146       break;
18147     default:
18148       llvm_unreachable("unexpected builtin ID");
18149     }
18150     Value *Value = EmitScalarExpr(E->getArg(0));
18151     Function *Callee = CGM.getIntrinsic(IntNo, ConvertType(E->getType()));
18152     return Builder.CreateCall(Callee, Value);
18153   }
18154   case WebAssembly::BI__builtin_wasm_swizzle_i8x16: {
18155     Value *Src = EmitScalarExpr(E->getArg(0));
18156     Value *Indices = EmitScalarExpr(E->getArg(1));
18157     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_swizzle);
18158     return Builder.CreateCall(Callee, {Src, Indices});
18159   }
18160   case WebAssembly::BI__builtin_wasm_add_sat_s_i8x16:
18161   case WebAssembly::BI__builtin_wasm_add_sat_u_i8x16:
18162   case WebAssembly::BI__builtin_wasm_add_sat_s_i16x8:
18163   case WebAssembly::BI__builtin_wasm_add_sat_u_i16x8:
18164   case WebAssembly::BI__builtin_wasm_sub_sat_s_i8x16:
18165   case WebAssembly::BI__builtin_wasm_sub_sat_u_i8x16:
18166   case WebAssembly::BI__builtin_wasm_sub_sat_s_i16x8:
18167   case WebAssembly::BI__builtin_wasm_sub_sat_u_i16x8: {
18168     unsigned IntNo;
18169     switch (BuiltinID) {
18170     case WebAssembly::BI__builtin_wasm_add_sat_s_i8x16:
18171     case WebAssembly::BI__builtin_wasm_add_sat_s_i16x8:
18172       IntNo = Intrinsic::sadd_sat;
18173       break;
18174     case WebAssembly::BI__builtin_wasm_add_sat_u_i8x16:
18175     case WebAssembly::BI__builtin_wasm_add_sat_u_i16x8:
18176       IntNo = Intrinsic::uadd_sat;
18177       break;
18178     case WebAssembly::BI__builtin_wasm_sub_sat_s_i8x16:
18179     case WebAssembly::BI__builtin_wasm_sub_sat_s_i16x8:
18180       IntNo = Intrinsic::wasm_sub_sat_signed;
18181       break;
18182     case WebAssembly::BI__builtin_wasm_sub_sat_u_i8x16:
18183     case WebAssembly::BI__builtin_wasm_sub_sat_u_i16x8:
18184       IntNo = Intrinsic::wasm_sub_sat_unsigned;
18185       break;
18186     default:
18187       llvm_unreachable("unexpected builtin ID");
18188     }
18189     Value *LHS = EmitScalarExpr(E->getArg(0));
18190     Value *RHS = EmitScalarExpr(E->getArg(1));
18191     Function *Callee = CGM.getIntrinsic(IntNo, ConvertType(E->getType()));
18192     return Builder.CreateCall(Callee, {LHS, RHS});
18193   }
18194   case WebAssembly::BI__builtin_wasm_abs_i8x16:
18195   case WebAssembly::BI__builtin_wasm_abs_i16x8:
18196   case WebAssembly::BI__builtin_wasm_abs_i32x4:
18197   case WebAssembly::BI__builtin_wasm_abs_i64x2: {
18198     Value *Vec = EmitScalarExpr(E->getArg(0));
18199     Value *Neg = Builder.CreateNeg(Vec, "neg");
18200     Constant *Zero = llvm::Constant::getNullValue(Vec->getType());
18201     Value *ICmp = Builder.CreateICmpSLT(Vec, Zero, "abscond");
18202     return Builder.CreateSelect(ICmp, Neg, Vec, "abs");
18203   }
18204   case WebAssembly::BI__builtin_wasm_min_s_i8x16:
18205   case WebAssembly::BI__builtin_wasm_min_u_i8x16:
18206   case WebAssembly::BI__builtin_wasm_max_s_i8x16:
18207   case WebAssembly::BI__builtin_wasm_max_u_i8x16:
18208   case WebAssembly::BI__builtin_wasm_min_s_i16x8:
18209   case WebAssembly::BI__builtin_wasm_min_u_i16x8:
18210   case WebAssembly::BI__builtin_wasm_max_s_i16x8:
18211   case WebAssembly::BI__builtin_wasm_max_u_i16x8:
18212   case WebAssembly::BI__builtin_wasm_min_s_i32x4:
18213   case WebAssembly::BI__builtin_wasm_min_u_i32x4:
18214   case WebAssembly::BI__builtin_wasm_max_s_i32x4:
18215   case WebAssembly::BI__builtin_wasm_max_u_i32x4: {
18216     Value *LHS = EmitScalarExpr(E->getArg(0));
18217     Value *RHS = EmitScalarExpr(E->getArg(1));
18218     Value *ICmp;
18219     switch (BuiltinID) {
18220     case WebAssembly::BI__builtin_wasm_min_s_i8x16:
18221     case WebAssembly::BI__builtin_wasm_min_s_i16x8:
18222     case WebAssembly::BI__builtin_wasm_min_s_i32x4:
18223       ICmp = Builder.CreateICmpSLT(LHS, RHS);
18224       break;
18225     case WebAssembly::BI__builtin_wasm_min_u_i8x16:
18226     case WebAssembly::BI__builtin_wasm_min_u_i16x8:
18227     case WebAssembly::BI__builtin_wasm_min_u_i32x4:
18228       ICmp = Builder.CreateICmpULT(LHS, RHS);
18229       break;
18230     case WebAssembly::BI__builtin_wasm_max_s_i8x16:
18231     case WebAssembly::BI__builtin_wasm_max_s_i16x8:
18232     case WebAssembly::BI__builtin_wasm_max_s_i32x4:
18233       ICmp = Builder.CreateICmpSGT(LHS, RHS);
18234       break;
18235     case WebAssembly::BI__builtin_wasm_max_u_i8x16:
18236     case WebAssembly::BI__builtin_wasm_max_u_i16x8:
18237     case WebAssembly::BI__builtin_wasm_max_u_i32x4:
18238       ICmp = Builder.CreateICmpUGT(LHS, RHS);
18239       break;
18240     default:
18241       llvm_unreachable("unexpected builtin ID");
18242     }
18243     return Builder.CreateSelect(ICmp, LHS, RHS);
18244   }
18245   case WebAssembly::BI__builtin_wasm_avgr_u_i8x16:
18246   case WebAssembly::BI__builtin_wasm_avgr_u_i16x8: {
18247     Value *LHS = EmitScalarExpr(E->getArg(0));
18248     Value *RHS = EmitScalarExpr(E->getArg(1));
18249     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_avgr_unsigned,
18250                                         ConvertType(E->getType()));
18251     return Builder.CreateCall(Callee, {LHS, RHS});
18252   }
18253   case WebAssembly::BI__builtin_wasm_q15mulr_sat_s_i16x8: {
18254     Value *LHS = EmitScalarExpr(E->getArg(0));
18255     Value *RHS = EmitScalarExpr(E->getArg(1));
18256     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_q15mulr_sat_signed);
18257     return Builder.CreateCall(Callee, {LHS, RHS});
18258   }
18259   case WebAssembly::BI__builtin_wasm_extadd_pairwise_i8x16_s_i16x8:
18260   case WebAssembly::BI__builtin_wasm_extadd_pairwise_i8x16_u_i16x8:
18261   case WebAssembly::BI__builtin_wasm_extadd_pairwise_i16x8_s_i32x4:
18262   case WebAssembly::BI__builtin_wasm_extadd_pairwise_i16x8_u_i32x4: {
18263     Value *Vec = EmitScalarExpr(E->getArg(0));
18264     unsigned IntNo;
18265     switch (BuiltinID) {
18266     case WebAssembly::BI__builtin_wasm_extadd_pairwise_i8x16_s_i16x8:
18267     case WebAssembly::BI__builtin_wasm_extadd_pairwise_i16x8_s_i32x4:
18268       IntNo = Intrinsic::wasm_extadd_pairwise_signed;
18269       break;
18270     case WebAssembly::BI__builtin_wasm_extadd_pairwise_i8x16_u_i16x8:
18271     case WebAssembly::BI__builtin_wasm_extadd_pairwise_i16x8_u_i32x4:
18272       IntNo = Intrinsic::wasm_extadd_pairwise_unsigned;
18273       break;
18274     default:
18275       llvm_unreachable("unexptected builtin ID");
18276     }
18277 
18278     Function *Callee = CGM.getIntrinsic(IntNo, ConvertType(E->getType()));
18279     return Builder.CreateCall(Callee, Vec);
18280   }
18281   case WebAssembly::BI__builtin_wasm_bitselect: {
18282     Value *V1 = EmitScalarExpr(E->getArg(0));
18283     Value *V2 = EmitScalarExpr(E->getArg(1));
18284     Value *C = EmitScalarExpr(E->getArg(2));
18285     Function *Callee =
18286         CGM.getIntrinsic(Intrinsic::wasm_bitselect, ConvertType(E->getType()));
18287     return Builder.CreateCall(Callee, {V1, V2, C});
18288   }
18289   case WebAssembly::BI__builtin_wasm_dot_s_i32x4_i16x8: {
18290     Value *LHS = EmitScalarExpr(E->getArg(0));
18291     Value *RHS = EmitScalarExpr(E->getArg(1));
18292     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_dot);
18293     return Builder.CreateCall(Callee, {LHS, RHS});
18294   }
18295   case WebAssembly::BI__builtin_wasm_popcnt_i8x16: {
18296     Value *Vec = EmitScalarExpr(E->getArg(0));
18297     Function *Callee =
18298         CGM.getIntrinsic(Intrinsic::ctpop, ConvertType(E->getType()));
18299     return Builder.CreateCall(Callee, {Vec});
18300   }
18301   case WebAssembly::BI__builtin_wasm_any_true_v128:
18302   case WebAssembly::BI__builtin_wasm_all_true_i8x16:
18303   case WebAssembly::BI__builtin_wasm_all_true_i16x8:
18304   case WebAssembly::BI__builtin_wasm_all_true_i32x4:
18305   case WebAssembly::BI__builtin_wasm_all_true_i64x2: {
18306     unsigned IntNo;
18307     switch (BuiltinID) {
18308     case WebAssembly::BI__builtin_wasm_any_true_v128:
18309       IntNo = Intrinsic::wasm_anytrue;
18310       break;
18311     case WebAssembly::BI__builtin_wasm_all_true_i8x16:
18312     case WebAssembly::BI__builtin_wasm_all_true_i16x8:
18313     case WebAssembly::BI__builtin_wasm_all_true_i32x4:
18314     case WebAssembly::BI__builtin_wasm_all_true_i64x2:
18315       IntNo = Intrinsic::wasm_alltrue;
18316       break;
18317     default:
18318       llvm_unreachable("unexpected builtin ID");
18319     }
18320     Value *Vec = EmitScalarExpr(E->getArg(0));
18321     Function *Callee = CGM.getIntrinsic(IntNo, Vec->getType());
18322     return Builder.CreateCall(Callee, {Vec});
18323   }
18324   case WebAssembly::BI__builtin_wasm_bitmask_i8x16:
18325   case WebAssembly::BI__builtin_wasm_bitmask_i16x8:
18326   case WebAssembly::BI__builtin_wasm_bitmask_i32x4:
18327   case WebAssembly::BI__builtin_wasm_bitmask_i64x2: {
18328     Value *Vec = EmitScalarExpr(E->getArg(0));
18329     Function *Callee =
18330         CGM.getIntrinsic(Intrinsic::wasm_bitmask, Vec->getType());
18331     return Builder.CreateCall(Callee, {Vec});
18332   }
18333   case WebAssembly::BI__builtin_wasm_abs_f32x4:
18334   case WebAssembly::BI__builtin_wasm_abs_f64x2: {
18335     Value *Vec = EmitScalarExpr(E->getArg(0));
18336     Function *Callee = CGM.getIntrinsic(Intrinsic::fabs, Vec->getType());
18337     return Builder.CreateCall(Callee, {Vec});
18338   }
18339   case WebAssembly::BI__builtin_wasm_sqrt_f32x4:
18340   case WebAssembly::BI__builtin_wasm_sqrt_f64x2: {
18341     Value *Vec = EmitScalarExpr(E->getArg(0));
18342     Function *Callee = CGM.getIntrinsic(Intrinsic::sqrt, Vec->getType());
18343     return Builder.CreateCall(Callee, {Vec});
18344   }
18345   case WebAssembly::BI__builtin_wasm_narrow_s_i8x16_i16x8:
18346   case WebAssembly::BI__builtin_wasm_narrow_u_i8x16_i16x8:
18347   case WebAssembly::BI__builtin_wasm_narrow_s_i16x8_i32x4:
18348   case WebAssembly::BI__builtin_wasm_narrow_u_i16x8_i32x4: {
18349     Value *Low = EmitScalarExpr(E->getArg(0));
18350     Value *High = EmitScalarExpr(E->getArg(1));
18351     unsigned IntNo;
18352     switch (BuiltinID) {
18353     case WebAssembly::BI__builtin_wasm_narrow_s_i8x16_i16x8:
18354     case WebAssembly::BI__builtin_wasm_narrow_s_i16x8_i32x4:
18355       IntNo = Intrinsic::wasm_narrow_signed;
18356       break;
18357     case WebAssembly::BI__builtin_wasm_narrow_u_i8x16_i16x8:
18358     case WebAssembly::BI__builtin_wasm_narrow_u_i16x8_i32x4:
18359       IntNo = Intrinsic::wasm_narrow_unsigned;
18360       break;
18361     default:
18362       llvm_unreachable("unexpected builtin ID");
18363     }
18364     Function *Callee =
18365         CGM.getIntrinsic(IntNo, {ConvertType(E->getType()), Low->getType()});
18366     return Builder.CreateCall(Callee, {Low, High});
18367   }
18368   case WebAssembly::BI__builtin_wasm_trunc_sat_zero_s_f64x2_i32x4:
18369   case WebAssembly::BI__builtin_wasm_trunc_sat_zero_u_f64x2_i32x4: {
18370     Value *Vec = EmitScalarExpr(E->getArg(0));
18371     unsigned IntNo;
18372     switch (BuiltinID) {
18373     case WebAssembly::BI__builtin_wasm_trunc_sat_zero_s_f64x2_i32x4:
18374       IntNo = Intrinsic::fptosi_sat;
18375       break;
18376     case WebAssembly::BI__builtin_wasm_trunc_sat_zero_u_f64x2_i32x4:
18377       IntNo = Intrinsic::fptoui_sat;
18378       break;
18379     default:
18380       llvm_unreachable("unexpected builtin ID");
18381     }
18382     llvm::Type *SrcT = Vec->getType();
18383     llvm::Type *TruncT = SrcT->getWithNewType(Builder.getInt32Ty());
18384     Function *Callee = CGM.getIntrinsic(IntNo, {TruncT, SrcT});
18385     Value *Trunc = Builder.CreateCall(Callee, Vec);
18386     Value *Splat = Constant::getNullValue(TruncT);
18387     return Builder.CreateShuffleVector(Trunc, Splat, ArrayRef<int>{0, 1, 2, 3});
18388   }
18389   case WebAssembly::BI__builtin_wasm_shuffle_i8x16: {
18390     Value *Ops[18];
18391     size_t OpIdx = 0;
18392     Ops[OpIdx++] = EmitScalarExpr(E->getArg(0));
18393     Ops[OpIdx++] = EmitScalarExpr(E->getArg(1));
18394     while (OpIdx < 18) {
18395       Optional<llvm::APSInt> LaneConst =
18396           E->getArg(OpIdx)->getIntegerConstantExpr(getContext());
18397       assert(LaneConst && "Constant arg isn't actually constant?");
18398       Ops[OpIdx++] = llvm::ConstantInt::get(getLLVMContext(), *LaneConst);
18399     }
18400     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_shuffle);
18401     return Builder.CreateCall(Callee, Ops);
18402   }
18403   case WebAssembly::BI__builtin_wasm_fma_f32x4:
18404   case WebAssembly::BI__builtin_wasm_fms_f32x4:
18405   case WebAssembly::BI__builtin_wasm_fma_f64x2:
18406   case WebAssembly::BI__builtin_wasm_fms_f64x2: {
18407     Value *A = EmitScalarExpr(E->getArg(0));
18408     Value *B = EmitScalarExpr(E->getArg(1));
18409     Value *C = EmitScalarExpr(E->getArg(2));
18410     unsigned IntNo;
18411     switch (BuiltinID) {
18412     case WebAssembly::BI__builtin_wasm_fma_f32x4:
18413     case WebAssembly::BI__builtin_wasm_fma_f64x2:
18414       IntNo = Intrinsic::wasm_fma;
18415       break;
18416     case WebAssembly::BI__builtin_wasm_fms_f32x4:
18417     case WebAssembly::BI__builtin_wasm_fms_f64x2:
18418       IntNo = Intrinsic::wasm_fms;
18419       break;
18420     default:
18421       llvm_unreachable("unexpected builtin ID");
18422     }
18423     Function *Callee = CGM.getIntrinsic(IntNo, A->getType());
18424     return Builder.CreateCall(Callee, {A, B, C});
18425   }
18426   case WebAssembly::BI__builtin_wasm_laneselect_i8x16:
18427   case WebAssembly::BI__builtin_wasm_laneselect_i16x8:
18428   case WebAssembly::BI__builtin_wasm_laneselect_i32x4:
18429   case WebAssembly::BI__builtin_wasm_laneselect_i64x2: {
18430     Value *A = EmitScalarExpr(E->getArg(0));
18431     Value *B = EmitScalarExpr(E->getArg(1));
18432     Value *C = EmitScalarExpr(E->getArg(2));
18433     Function *Callee =
18434         CGM.getIntrinsic(Intrinsic::wasm_laneselect, A->getType());
18435     return Builder.CreateCall(Callee, {A, B, C});
18436   }
18437   case WebAssembly::BI__builtin_wasm_relaxed_swizzle_i8x16: {
18438     Value *Src = EmitScalarExpr(E->getArg(0));
18439     Value *Indices = EmitScalarExpr(E->getArg(1));
18440     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_relaxed_swizzle);
18441     return Builder.CreateCall(Callee, {Src, Indices});
18442   }
18443   case WebAssembly::BI__builtin_wasm_relaxed_min_f32x4:
18444   case WebAssembly::BI__builtin_wasm_relaxed_max_f32x4:
18445   case WebAssembly::BI__builtin_wasm_relaxed_min_f64x2:
18446   case WebAssembly::BI__builtin_wasm_relaxed_max_f64x2: {
18447     Value *LHS = EmitScalarExpr(E->getArg(0));
18448     Value *RHS = EmitScalarExpr(E->getArg(1));
18449     unsigned IntNo;
18450     switch (BuiltinID) {
18451     case WebAssembly::BI__builtin_wasm_relaxed_min_f32x4:
18452     case WebAssembly::BI__builtin_wasm_relaxed_min_f64x2:
18453       IntNo = Intrinsic::wasm_relaxed_min;
18454       break;
18455     case WebAssembly::BI__builtin_wasm_relaxed_max_f32x4:
18456     case WebAssembly::BI__builtin_wasm_relaxed_max_f64x2:
18457       IntNo = Intrinsic::wasm_relaxed_max;
18458       break;
18459     default:
18460       llvm_unreachable("unexpected builtin ID");
18461     }
18462     Function *Callee = CGM.getIntrinsic(IntNo, LHS->getType());
18463     return Builder.CreateCall(Callee, {LHS, RHS});
18464   }
18465   case WebAssembly::BI__builtin_wasm_relaxed_trunc_s_i32x4_f32x4:
18466   case WebAssembly::BI__builtin_wasm_relaxed_trunc_u_i32x4_f32x4:
18467   case WebAssembly::BI__builtin_wasm_relaxed_trunc_zero_s_i32x4_f64x2:
18468   case WebAssembly::BI__builtin_wasm_relaxed_trunc_zero_u_i32x4_f64x2: {
18469     Value *Vec = EmitScalarExpr(E->getArg(0));
18470     unsigned IntNo;
18471     switch (BuiltinID) {
18472     case WebAssembly::BI__builtin_wasm_relaxed_trunc_s_i32x4_f32x4:
18473       IntNo = Intrinsic::wasm_relaxed_trunc_signed;
18474       break;
18475     case WebAssembly::BI__builtin_wasm_relaxed_trunc_u_i32x4_f32x4:
18476       IntNo = Intrinsic::wasm_relaxed_trunc_unsigned;
18477       break;
18478     case WebAssembly::BI__builtin_wasm_relaxed_trunc_zero_s_i32x4_f64x2:
18479       IntNo = Intrinsic::wasm_relaxed_trunc_zero_signed;
18480       break;
18481     case WebAssembly::BI__builtin_wasm_relaxed_trunc_zero_u_i32x4_f64x2:
18482       IntNo = Intrinsic::wasm_relaxed_trunc_zero_unsigned;
18483       break;
18484     default:
18485       llvm_unreachable("unexpected builtin ID");
18486     }
18487     Function *Callee = CGM.getIntrinsic(IntNo);
18488     return Builder.CreateCall(Callee, {Vec});
18489   }
18490   default:
18491     return nullptr;
18492   }
18493 }
18494 
18495 static std::pair<Intrinsic::ID, unsigned>
18496 getIntrinsicForHexagonNonGCCBuiltin(unsigned BuiltinID) {
18497   struct Info {
18498     unsigned BuiltinID;
18499     Intrinsic::ID IntrinsicID;
18500     unsigned VecLen;
18501   };
18502   Info Infos[] = {
18503 #define CUSTOM_BUILTIN_MAPPING(x,s) \
18504   { Hexagon::BI__builtin_HEXAGON_##x, Intrinsic::hexagon_##x, s },
18505     CUSTOM_BUILTIN_MAPPING(L2_loadrub_pci, 0)
18506     CUSTOM_BUILTIN_MAPPING(L2_loadrb_pci, 0)
18507     CUSTOM_BUILTIN_MAPPING(L2_loadruh_pci, 0)
18508     CUSTOM_BUILTIN_MAPPING(L2_loadrh_pci, 0)
18509     CUSTOM_BUILTIN_MAPPING(L2_loadri_pci, 0)
18510     CUSTOM_BUILTIN_MAPPING(L2_loadrd_pci, 0)
18511     CUSTOM_BUILTIN_MAPPING(L2_loadrub_pcr, 0)
18512     CUSTOM_BUILTIN_MAPPING(L2_loadrb_pcr, 0)
18513     CUSTOM_BUILTIN_MAPPING(L2_loadruh_pcr, 0)
18514     CUSTOM_BUILTIN_MAPPING(L2_loadrh_pcr, 0)
18515     CUSTOM_BUILTIN_MAPPING(L2_loadri_pcr, 0)
18516     CUSTOM_BUILTIN_MAPPING(L2_loadrd_pcr, 0)
18517     CUSTOM_BUILTIN_MAPPING(S2_storerb_pci, 0)
18518     CUSTOM_BUILTIN_MAPPING(S2_storerh_pci, 0)
18519     CUSTOM_BUILTIN_MAPPING(S2_storerf_pci, 0)
18520     CUSTOM_BUILTIN_MAPPING(S2_storeri_pci, 0)
18521     CUSTOM_BUILTIN_MAPPING(S2_storerd_pci, 0)
18522     CUSTOM_BUILTIN_MAPPING(S2_storerb_pcr, 0)
18523     CUSTOM_BUILTIN_MAPPING(S2_storerh_pcr, 0)
18524     CUSTOM_BUILTIN_MAPPING(S2_storerf_pcr, 0)
18525     CUSTOM_BUILTIN_MAPPING(S2_storeri_pcr, 0)
18526     CUSTOM_BUILTIN_MAPPING(S2_storerd_pcr, 0)
18527     // Legacy builtins that take a vector in place of a vector predicate.
18528     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstoreq, 64)
18529     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorenq, 64)
18530     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorentq, 64)
18531     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorentnq, 64)
18532     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstoreq_128B, 128)
18533     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorenq_128B, 128)
18534     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorentq_128B, 128)
18535     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorentnq_128B, 128)
18536 #include "clang/Basic/BuiltinsHexagonMapCustomDep.def"
18537 #undef CUSTOM_BUILTIN_MAPPING
18538   };
18539 
18540   auto CmpInfo = [] (Info A, Info B) { return A.BuiltinID < B.BuiltinID; };
18541   static const bool SortOnce = (llvm::sort(Infos, CmpInfo), true);
18542   (void)SortOnce;
18543 
18544   const Info *F = std::lower_bound(std::begin(Infos), std::end(Infos),
18545                                    Info{BuiltinID, 0, 0}, CmpInfo);
18546   if (F == std::end(Infos) || F->BuiltinID != BuiltinID)
18547     return {Intrinsic::not_intrinsic, 0};
18548 
18549   return {F->IntrinsicID, F->VecLen};
18550 }
18551 
18552 Value *CodeGenFunction::EmitHexagonBuiltinExpr(unsigned BuiltinID,
18553                                                const CallExpr *E) {
18554   Intrinsic::ID ID;
18555   unsigned VecLen;
18556   std::tie(ID, VecLen) = getIntrinsicForHexagonNonGCCBuiltin(BuiltinID);
18557 
18558   auto MakeCircOp = [this, E](unsigned IntID, bool IsLoad) {
18559     // The base pointer is passed by address, so it needs to be loaded.
18560     Address A = EmitPointerWithAlignment(E->getArg(0));
18561     Address BP = Address(Builder.CreateBitCast(
18562         A.getPointer(), Int8PtrPtrTy), Int8PtrTy, A.getAlignment());
18563     llvm::Value *Base = Builder.CreateLoad(BP);
18564     // The treatment of both loads and stores is the same: the arguments for
18565     // the builtin are the same as the arguments for the intrinsic.
18566     // Load:
18567     //   builtin(Base, Inc, Mod, Start) -> intr(Base, Inc, Mod, Start)
18568     //   builtin(Base, Mod, Start)      -> intr(Base, Mod, Start)
18569     // Store:
18570     //   builtin(Base, Inc, Mod, Val, Start) -> intr(Base, Inc, Mod, Val, Start)
18571     //   builtin(Base, Mod, Val, Start)      -> intr(Base, Mod, Val, Start)
18572     SmallVector<llvm::Value*,5> Ops = { Base };
18573     for (unsigned i = 1, e = E->getNumArgs(); i != e; ++i)
18574       Ops.push_back(EmitScalarExpr(E->getArg(i)));
18575 
18576     llvm::Value *Result = Builder.CreateCall(CGM.getIntrinsic(IntID), Ops);
18577     // The load intrinsics generate two results (Value, NewBase), stores
18578     // generate one (NewBase). The new base address needs to be stored.
18579     llvm::Value *NewBase = IsLoad ? Builder.CreateExtractValue(Result, 1)
18580                                   : Result;
18581     llvm::Value *LV = Builder.CreateBitCast(
18582         EmitScalarExpr(E->getArg(0)), NewBase->getType()->getPointerTo());
18583     Address Dest = EmitPointerWithAlignment(E->getArg(0));
18584     llvm::Value *RetVal =
18585         Builder.CreateAlignedStore(NewBase, LV, Dest.getAlignment());
18586     if (IsLoad)
18587       RetVal = Builder.CreateExtractValue(Result, 0);
18588     return RetVal;
18589   };
18590 
18591   // Handle the conversion of bit-reverse load intrinsics to bit code.
18592   // The intrinsic call after this function only reads from memory and the
18593   // write to memory is dealt by the store instruction.
18594   auto MakeBrevLd = [this, E](unsigned IntID, llvm::Type *DestTy) {
18595     // The intrinsic generates one result, which is the new value for the base
18596     // pointer. It needs to be returned. The result of the load instruction is
18597     // passed to intrinsic by address, so the value needs to be stored.
18598     llvm::Value *BaseAddress =
18599         Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), Int8PtrTy);
18600 
18601     // Expressions like &(*pt++) will be incremented per evaluation.
18602     // EmitPointerWithAlignment and EmitScalarExpr evaluates the expression
18603     // per call.
18604     Address DestAddr = EmitPointerWithAlignment(E->getArg(1));
18605     DestAddr = Address(Builder.CreateBitCast(DestAddr.getPointer(), Int8PtrTy),
18606                        Int8Ty, DestAddr.getAlignment());
18607     llvm::Value *DestAddress = DestAddr.getPointer();
18608 
18609     // Operands are Base, Dest, Modifier.
18610     // The intrinsic format in LLVM IR is defined as
18611     // { ValueType, i8* } (i8*, i32).
18612     llvm::Value *Result = Builder.CreateCall(
18613         CGM.getIntrinsic(IntID), {BaseAddress, EmitScalarExpr(E->getArg(2))});
18614 
18615     // The value needs to be stored as the variable is passed by reference.
18616     llvm::Value *DestVal = Builder.CreateExtractValue(Result, 0);
18617 
18618     // The store needs to be truncated to fit the destination type.
18619     // While i32 and i64 are natively supported on Hexagon, i8 and i16 needs
18620     // to be handled with stores of respective destination type.
18621     DestVal = Builder.CreateTrunc(DestVal, DestTy);
18622 
18623     llvm::Value *DestForStore =
18624         Builder.CreateBitCast(DestAddress, DestVal->getType()->getPointerTo());
18625     Builder.CreateAlignedStore(DestVal, DestForStore, DestAddr.getAlignment());
18626     // The updated value of the base pointer is returned.
18627     return Builder.CreateExtractValue(Result, 1);
18628   };
18629 
18630   auto V2Q = [this, VecLen] (llvm::Value *Vec) {
18631     Intrinsic::ID ID = VecLen == 128 ? Intrinsic::hexagon_V6_vandvrt_128B
18632                                      : Intrinsic::hexagon_V6_vandvrt;
18633     return Builder.CreateCall(CGM.getIntrinsic(ID),
18634                               {Vec, Builder.getInt32(-1)});
18635   };
18636   auto Q2V = [this, VecLen] (llvm::Value *Pred) {
18637     Intrinsic::ID ID = VecLen == 128 ? Intrinsic::hexagon_V6_vandqrt_128B
18638                                      : Intrinsic::hexagon_V6_vandqrt;
18639     return Builder.CreateCall(CGM.getIntrinsic(ID),
18640                               {Pred, Builder.getInt32(-1)});
18641   };
18642 
18643   switch (BuiltinID) {
18644   // These intrinsics return a tuple {Vector, VectorPred} in LLVM IR,
18645   // and the corresponding C/C++ builtins use loads/stores to update
18646   // the predicate.
18647   case Hexagon::BI__builtin_HEXAGON_V6_vaddcarry:
18648   case Hexagon::BI__builtin_HEXAGON_V6_vaddcarry_128B:
18649   case Hexagon::BI__builtin_HEXAGON_V6_vsubcarry:
18650   case Hexagon::BI__builtin_HEXAGON_V6_vsubcarry_128B: {
18651     // Get the type from the 0-th argument.
18652     llvm::Type *VecType = ConvertType(E->getArg(0)->getType());
18653     Address PredAddr = Builder.CreateElementBitCast(
18654         EmitPointerWithAlignment(E->getArg(2)), VecType);
18655     llvm::Value *PredIn = V2Q(Builder.CreateLoad(PredAddr));
18656     llvm::Value *Result = Builder.CreateCall(CGM.getIntrinsic(ID),
18657         {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)), PredIn});
18658 
18659     llvm::Value *PredOut = Builder.CreateExtractValue(Result, 1);
18660     Builder.CreateAlignedStore(Q2V(PredOut), PredAddr.getPointer(),
18661         PredAddr.getAlignment());
18662     return Builder.CreateExtractValue(Result, 0);
18663   }
18664 
18665   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstoreq:
18666   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorenq:
18667   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorentq:
18668   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorentnq:
18669   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstoreq_128B:
18670   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorenq_128B:
18671   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorentq_128B:
18672   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorentnq_128B: {
18673     SmallVector<llvm::Value*,4> Ops;
18674     const Expr *PredOp = E->getArg(0);
18675     // There will be an implicit cast to a boolean vector. Strip it.
18676     if (auto *Cast = dyn_cast<ImplicitCastExpr>(PredOp)) {
18677       if (Cast->getCastKind() == CK_BitCast)
18678         PredOp = Cast->getSubExpr();
18679       Ops.push_back(V2Q(EmitScalarExpr(PredOp)));
18680     }
18681     for (int i = 1, e = E->getNumArgs(); i != e; ++i)
18682       Ops.push_back(EmitScalarExpr(E->getArg(i)));
18683     return Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
18684   }
18685 
18686   case Hexagon::BI__builtin_HEXAGON_L2_loadrub_pci:
18687   case Hexagon::BI__builtin_HEXAGON_L2_loadrb_pci:
18688   case Hexagon::BI__builtin_HEXAGON_L2_loadruh_pci:
18689   case Hexagon::BI__builtin_HEXAGON_L2_loadrh_pci:
18690   case Hexagon::BI__builtin_HEXAGON_L2_loadri_pci:
18691   case Hexagon::BI__builtin_HEXAGON_L2_loadrd_pci:
18692   case Hexagon::BI__builtin_HEXAGON_L2_loadrub_pcr:
18693   case Hexagon::BI__builtin_HEXAGON_L2_loadrb_pcr:
18694   case Hexagon::BI__builtin_HEXAGON_L2_loadruh_pcr:
18695   case Hexagon::BI__builtin_HEXAGON_L2_loadrh_pcr:
18696   case Hexagon::BI__builtin_HEXAGON_L2_loadri_pcr:
18697   case Hexagon::BI__builtin_HEXAGON_L2_loadrd_pcr:
18698     return MakeCircOp(ID, /*IsLoad=*/true);
18699   case Hexagon::BI__builtin_HEXAGON_S2_storerb_pci:
18700   case Hexagon::BI__builtin_HEXAGON_S2_storerh_pci:
18701   case Hexagon::BI__builtin_HEXAGON_S2_storerf_pci:
18702   case Hexagon::BI__builtin_HEXAGON_S2_storeri_pci:
18703   case Hexagon::BI__builtin_HEXAGON_S2_storerd_pci:
18704   case Hexagon::BI__builtin_HEXAGON_S2_storerb_pcr:
18705   case Hexagon::BI__builtin_HEXAGON_S2_storerh_pcr:
18706   case Hexagon::BI__builtin_HEXAGON_S2_storerf_pcr:
18707   case Hexagon::BI__builtin_HEXAGON_S2_storeri_pcr:
18708   case Hexagon::BI__builtin_HEXAGON_S2_storerd_pcr:
18709     return MakeCircOp(ID, /*IsLoad=*/false);
18710   case Hexagon::BI__builtin_brev_ldub:
18711     return MakeBrevLd(Intrinsic::hexagon_L2_loadrub_pbr, Int8Ty);
18712   case Hexagon::BI__builtin_brev_ldb:
18713     return MakeBrevLd(Intrinsic::hexagon_L2_loadrb_pbr, Int8Ty);
18714   case Hexagon::BI__builtin_brev_lduh:
18715     return MakeBrevLd(Intrinsic::hexagon_L2_loadruh_pbr, Int16Ty);
18716   case Hexagon::BI__builtin_brev_ldh:
18717     return MakeBrevLd(Intrinsic::hexagon_L2_loadrh_pbr, Int16Ty);
18718   case Hexagon::BI__builtin_brev_ldw:
18719     return MakeBrevLd(Intrinsic::hexagon_L2_loadri_pbr, Int32Ty);
18720   case Hexagon::BI__builtin_brev_ldd:
18721     return MakeBrevLd(Intrinsic::hexagon_L2_loadrd_pbr, Int64Ty);
18722   } // switch
18723 
18724   return nullptr;
18725 }
18726 
18727 Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID,
18728                                              const CallExpr *E,
18729                                              ReturnValueSlot ReturnValue) {
18730   SmallVector<Value *, 4> Ops;
18731   llvm::Type *ResultType = ConvertType(E->getType());
18732 
18733   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++)
18734     Ops.push_back(EmitScalarExpr(E->getArg(i)));
18735 
18736   Intrinsic::ID ID = Intrinsic::not_intrinsic;
18737   unsigned NF = 1;
18738   constexpr unsigned TAIL_UNDISTURBED = 0;
18739 
18740   // Required for overloaded intrinsics.
18741   llvm::SmallVector<llvm::Type *, 2> IntrinsicTypes;
18742   switch (BuiltinID) {
18743   default: llvm_unreachable("unexpected builtin ID");
18744   case RISCV::BI__builtin_riscv_orc_b_32:
18745   case RISCV::BI__builtin_riscv_orc_b_64:
18746   case RISCV::BI__builtin_riscv_clmul:
18747   case RISCV::BI__builtin_riscv_clmulh:
18748   case RISCV::BI__builtin_riscv_clmulr:
18749   case RISCV::BI__builtin_riscv_bcompress_32:
18750   case RISCV::BI__builtin_riscv_bcompress_64:
18751   case RISCV::BI__builtin_riscv_bdecompress_32:
18752   case RISCV::BI__builtin_riscv_bdecompress_64:
18753   case RISCV::BI__builtin_riscv_bfp_32:
18754   case RISCV::BI__builtin_riscv_bfp_64:
18755   case RISCV::BI__builtin_riscv_grev_32:
18756   case RISCV::BI__builtin_riscv_grev_64:
18757   case RISCV::BI__builtin_riscv_gorc_32:
18758   case RISCV::BI__builtin_riscv_gorc_64:
18759   case RISCV::BI__builtin_riscv_shfl_32:
18760   case RISCV::BI__builtin_riscv_shfl_64:
18761   case RISCV::BI__builtin_riscv_unshfl_32:
18762   case RISCV::BI__builtin_riscv_unshfl_64:
18763   case RISCV::BI__builtin_riscv_xperm_n:
18764   case RISCV::BI__builtin_riscv_xperm_b:
18765   case RISCV::BI__builtin_riscv_xperm_h:
18766   case RISCV::BI__builtin_riscv_xperm_w:
18767   case RISCV::BI__builtin_riscv_crc32_b:
18768   case RISCV::BI__builtin_riscv_crc32_h:
18769   case RISCV::BI__builtin_riscv_crc32_w:
18770   case RISCV::BI__builtin_riscv_crc32_d:
18771   case RISCV::BI__builtin_riscv_crc32c_b:
18772   case RISCV::BI__builtin_riscv_crc32c_h:
18773   case RISCV::BI__builtin_riscv_crc32c_w:
18774   case RISCV::BI__builtin_riscv_crc32c_d:
18775   case RISCV::BI__builtin_riscv_fsl_32:
18776   case RISCV::BI__builtin_riscv_fsr_32:
18777   case RISCV::BI__builtin_riscv_fsl_64:
18778   case RISCV::BI__builtin_riscv_fsr_64: {
18779     switch (BuiltinID) {
18780     default: llvm_unreachable("unexpected builtin ID");
18781     // Zbb
18782     case RISCV::BI__builtin_riscv_orc_b_32:
18783     case RISCV::BI__builtin_riscv_orc_b_64:
18784       ID = Intrinsic::riscv_orc_b;
18785       break;
18786 
18787     // Zbc
18788     case RISCV::BI__builtin_riscv_clmul:
18789       ID = Intrinsic::riscv_clmul;
18790       break;
18791     case RISCV::BI__builtin_riscv_clmulh:
18792       ID = Intrinsic::riscv_clmulh;
18793       break;
18794     case RISCV::BI__builtin_riscv_clmulr:
18795       ID = Intrinsic::riscv_clmulr;
18796       break;
18797 
18798     // Zbe
18799     case RISCV::BI__builtin_riscv_bcompress_32:
18800     case RISCV::BI__builtin_riscv_bcompress_64:
18801       ID = Intrinsic::riscv_bcompress;
18802       break;
18803     case RISCV::BI__builtin_riscv_bdecompress_32:
18804     case RISCV::BI__builtin_riscv_bdecompress_64:
18805       ID = Intrinsic::riscv_bdecompress;
18806       break;
18807 
18808     // Zbf
18809     case RISCV::BI__builtin_riscv_bfp_32:
18810     case RISCV::BI__builtin_riscv_bfp_64:
18811       ID = Intrinsic::riscv_bfp;
18812       break;
18813 
18814     // Zbp
18815     case RISCV::BI__builtin_riscv_grev_32:
18816     case RISCV::BI__builtin_riscv_grev_64:
18817       ID = Intrinsic::riscv_grev;
18818       break;
18819     case RISCV::BI__builtin_riscv_gorc_32:
18820     case RISCV::BI__builtin_riscv_gorc_64:
18821       ID = Intrinsic::riscv_gorc;
18822       break;
18823     case RISCV::BI__builtin_riscv_shfl_32:
18824     case RISCV::BI__builtin_riscv_shfl_64:
18825       ID = Intrinsic::riscv_shfl;
18826       break;
18827     case RISCV::BI__builtin_riscv_unshfl_32:
18828     case RISCV::BI__builtin_riscv_unshfl_64:
18829       ID = Intrinsic::riscv_unshfl;
18830       break;
18831     case RISCV::BI__builtin_riscv_xperm_n:
18832       ID = Intrinsic::riscv_xperm_n;
18833       break;
18834     case RISCV::BI__builtin_riscv_xperm_b:
18835       ID = Intrinsic::riscv_xperm_b;
18836       break;
18837     case RISCV::BI__builtin_riscv_xperm_h:
18838       ID = Intrinsic::riscv_xperm_h;
18839       break;
18840     case RISCV::BI__builtin_riscv_xperm_w:
18841       ID = Intrinsic::riscv_xperm_w;
18842       break;
18843 
18844     // Zbr
18845     case RISCV::BI__builtin_riscv_crc32_b:
18846       ID = Intrinsic::riscv_crc32_b;
18847       break;
18848     case RISCV::BI__builtin_riscv_crc32_h:
18849       ID = Intrinsic::riscv_crc32_h;
18850       break;
18851     case RISCV::BI__builtin_riscv_crc32_w:
18852       ID = Intrinsic::riscv_crc32_w;
18853       break;
18854     case RISCV::BI__builtin_riscv_crc32_d:
18855       ID = Intrinsic::riscv_crc32_d;
18856       break;
18857     case RISCV::BI__builtin_riscv_crc32c_b:
18858       ID = Intrinsic::riscv_crc32c_b;
18859       break;
18860     case RISCV::BI__builtin_riscv_crc32c_h:
18861       ID = Intrinsic::riscv_crc32c_h;
18862       break;
18863     case RISCV::BI__builtin_riscv_crc32c_w:
18864       ID = Intrinsic::riscv_crc32c_w;
18865       break;
18866     case RISCV::BI__builtin_riscv_crc32c_d:
18867       ID = Intrinsic::riscv_crc32c_d;
18868       break;
18869 
18870     // Zbt
18871     case RISCV::BI__builtin_riscv_fsl_32:
18872     case RISCV::BI__builtin_riscv_fsl_64:
18873       ID = Intrinsic::riscv_fsl;
18874       break;
18875     case RISCV::BI__builtin_riscv_fsr_32:
18876     case RISCV::BI__builtin_riscv_fsr_64:
18877       ID = Intrinsic::riscv_fsr;
18878       break;
18879     }
18880 
18881     IntrinsicTypes = {ResultType};
18882     break;
18883   }
18884   // Vector builtins are handled from here.
18885 #include "clang/Basic/riscv_vector_builtin_cg.inc"
18886   }
18887 
18888   assert(ID != Intrinsic::not_intrinsic);
18889 
18890   llvm::Function *F = CGM.getIntrinsic(ID, IntrinsicTypes);
18891   return Builder.CreateCall(F, Ops, "");
18892 }
18893