//===--- SemaCUDA.cpp - Semantic Analysis for CUDA constructs -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// \brief This file implements semantic analysis for CUDA constructs.
///
//===----------------------------------------------------------------------===//

#include "clang/AST/ASTContext.h"
#include "clang/AST/Decl.h"
#include "clang/AST/ExprCXX.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Sema/Lookup.h"
#include "clang/Sema/Sema.h"
#include "clang/Sema/SemaDiagnostic.h"
#include "clang/Sema/SemaInternal.h"
#include "clang/Sema/Template.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
using namespace clang;

void Sema::PushForceCUDAHostDevice() {
  assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
  ForceCUDAHostDeviceDepth++;
}

bool Sema::PopForceCUDAHostDevice() {
  assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
  if (ForceCUDAHostDeviceDepth == 0)
    return false;
  ForceCUDAHostDeviceDepth--;
  return true;
}

ExprResult Sema::ActOnCUDAExecConfigExpr(Scope *S, SourceLocation LLLLoc,
                                         MultiExprArg ExecConfig,
                                         SourceLocation GGGLoc) {
  FunctionDecl *ConfigDecl = Context.getcudaConfigureCallDecl();
  if (!ConfigDecl)
    return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
                     << "cudaConfigureCall");
  QualType ConfigQTy = ConfigDecl->getType();

  DeclRefExpr *ConfigDR = new (Context)
      DeclRefExpr(ConfigDecl, false, ConfigQTy, VK_LValue, LLLLoc);
  MarkFunctionReferenced(LLLLoc, ConfigDecl);

  return ActOnCallExpr(S, ConfigDR, LLLLoc, ExecConfig, GGGLoc, nullptr,
                       /*IsExecConfig=*/true);
}

/// IdentifyCUDATarget - Determine the CUDA compilation target for this function
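/// For example (illustrative declarations, not from this file):
///   __global__ void kernel();        // CFT_Global
///   __device__ int dev();            // CFT_Device
///   __host__ __device__ int both();  // CFT_HostDevice
///   int plain();                     // unattributed: CFT_Host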
Sema::CUDAFunctionTarget Sema::IdentifyCUDATarget(const FunctionDecl *D) {
  // Code that lives outside a function is run on the host.
  if (D == nullptr)
    return CFT_Host;

  if (D->hasAttr<CUDAInvalidTargetAttr>())
    return CFT_InvalidTarget;

  if (D->hasAttr<CUDAGlobalAttr>())
    return CFT_Global;

  if (D->hasAttr<CUDADeviceAttr>()) {
    if (D->hasAttr<CUDAHostAttr>())
      return CFT_HostDevice;
    return CFT_Device;
  } else if (D->hasAttr<CUDAHostAttr>()) {
    return CFT_Host;
  } else if (D->isImplicit()) {
    // Some implicit declarations (like intrinsic functions) are not marked.
    // Set the most lenient target on them for maximal flexibility.
    return CFT_HostDevice;
  }

  return CFT_Host;
}

// * CUDA Call preference table
//
// F - from,
// T - to
// Ph - preference in host mode
// Pd - preference in device mode
// H  - handled in (x)
// Preferences: N:native, SS:same side, HD:host-device, WS:wrong side, --:never.
//
// | F  | T  | Ph  | Pd  |  H  |
// |----+----+-----+-----+-----+
// | d  | d  | N   | N   | (c) |
// | d  | g  | --  | --  | (a) |
// | d  | h  | --  | --  | (e) |
// | d  | hd | HD  | HD  | (b) |
// | g  | d  | N   | N   | (c) |
// | g  | g  | --  | --  | (a) |
// | g  | h  | --  | --  | (e) |
// | g  | hd | HD  | HD  | (b) |
// | h  | d  | --  | --  | (e) |
// | h  | g  | N   | N   | (c) |
// | h  | h  | N   | N   | (c) |
// | h  | hd | HD  | HD  | (b) |
// | hd | d  | WS  | SS  | (d) |
// | hd | g  | SS  | --  |(d/a)|
// | hd | h  | SS  | WS  | (d) |
// | hd | hd | HD  | HD  | (b) |
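//
// For example (a sketch of the table above; the declarations are illustrative,
// not from this file): given
//   __host__ void h();  __device__ void d();  __host__ __device__ void hd();
// a call from hd() to h() is CFP_SameSide when compiling for the host and
// CFP_WrongSide when compiling for the device, while a call from hd() to d()
// is the reverse.  Wrong-side calls pass Sema but are rejected if the caller
// is ever codegen'ed for that side (see case (d) below).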

Sema::CUDAFunctionPreference
Sema::IdentifyCUDAPreference(const FunctionDecl *Caller,
                             const FunctionDecl *Callee) {
  assert(Callee && "Callee must be valid.");
  CUDAFunctionTarget CallerTarget = IdentifyCUDATarget(Caller);
  CUDAFunctionTarget CalleeTarget = IdentifyCUDATarget(Callee);

  // If one of the targets is invalid, the check always fails, no matter what
  // the other target is.
  if (CallerTarget == CFT_InvalidTarget || CalleeTarget == CFT_InvalidTarget)
    return CFP_Never;

  // (a) Can't call global from some contexts until we support CUDA's
  // dynamic parallelism.
  if (CalleeTarget == CFT_Global &&
      (CallerTarget == CFT_Global || CallerTarget == CFT_Device))
    return CFP_Never;

  // (b) Calling HostDevice is OK for everyone.
  if (CalleeTarget == CFT_HostDevice)
    return CFP_HostDevice;

  // (c) Best case scenarios
  if (CalleeTarget == CallerTarget ||
      (CallerTarget == CFT_Host && CalleeTarget == CFT_Global) ||
      (CallerTarget == CFT_Global && CalleeTarget == CFT_Device))
    return CFP_Native;

  // (d) HostDevice behavior depends on compilation mode.
  if (CallerTarget == CFT_HostDevice) {
    // It's OK to call a compilation-mode matching function from an HD one.
    if ((getLangOpts().CUDAIsDevice && CalleeTarget == CFT_Device) ||
        (!getLangOpts().CUDAIsDevice &&
         (CalleeTarget == CFT_Host || CalleeTarget == CFT_Global)))
      return CFP_SameSide;

    // Calls from HD to non-mode-matching functions (i.e., to host functions
    // when compiling in device mode or to device functions when compiling in
    // host mode) are allowed at the sema level, but eventually rejected if
    // they're ever codegened.  TODO: Reject said calls earlier.
    return CFP_WrongSide;
  }

  // (e) Calling across device/host boundary is not something you should do.
  if ((CallerTarget == CFT_Host && CalleeTarget == CFT_Device) ||
      (CallerTarget == CFT_Device && CalleeTarget == CFT_Host) ||
      (CallerTarget == CFT_Global && CalleeTarget == CFT_Host))
    return CFP_Never;

  llvm_unreachable("All cases should've been handled by now.");
}

void Sema::EraseUnwantedCUDAMatches(
    const FunctionDecl *Caller,
    SmallVectorImpl<std::pair<DeclAccessPair, FunctionDecl *>> &Matches) {
  if (Matches.size() <= 1)
    return;

  using Pair = std::pair<DeclAccessPair, FunctionDecl*>;

  // Gets the CUDA function preference for a call from Caller to Match.
  auto GetCFP = [&](const Pair &Match) {
    return IdentifyCUDAPreference(Caller, Match.second);
  };

  // Find the best call preference among the functions in Matches.
  CUDAFunctionPreference BestCFP = GetCFP(*std::max_element(
      Matches.begin(), Matches.end(),
      [&](const Pair &M1, const Pair &M2) { return GetCFP(M1) < GetCFP(M2); }));

  // Erase all functions with lower priority.
  Matches.erase(
      llvm::remove_if(
          Matches, [&](const Pair &Match) { return GetCFP(Match) < BestCFP; }),
      Matches.end());
}

/// When an implicitly-declared special member has to invoke more than one
/// base/field special member, conflicts may occur in the targets of these
/// members. For example, if one base's member is __host__ and another's is
/// __device__, it's a conflict.
/// This function figures out if the given targets \param Target1 and
/// \param Target2 conflict, and if they do not it fills in
/// \param ResolvedTarget with a target that resolves for both calls.
/// \return true if there's a conflict, false otherwise.
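///
/// For example (hypothetical classes, for illustration only):
/// \code
///   struct A { __device__ A(); };
///   struct B { __host__ B(); };
///   struct C : A, B {};  // implicit C::C() would need both a __device__ and
///                        // a __host__ call: the targets conflict.
/// \endcode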
static bool
resolveCalleeCUDATargetConflict(Sema::CUDAFunctionTarget Target1,
                                Sema::CUDAFunctionTarget Target2,
                                Sema::CUDAFunctionTarget *ResolvedTarget) {
  // Only free functions and static member functions may be global.
  assert(Target1 != Sema::CFT_Global);
  assert(Target2 != Sema::CFT_Global);

  if (Target1 == Sema::CFT_HostDevice) {
    *ResolvedTarget = Target2;
  } else if (Target2 == Sema::CFT_HostDevice) {
    *ResolvedTarget = Target1;
  } else if (Target1 != Target2) {
    return true;
  } else {
    *ResolvedTarget = Target1;
  }

  return false;
}

bool Sema::inferCUDATargetForImplicitSpecialMember(CXXRecordDecl *ClassDecl,
                                                   CXXSpecialMember CSM,
                                                   CXXMethodDecl *MemberDecl,
                                                   bool ConstRHS,
                                                   bool Diagnose) {
  llvm::Optional<CUDAFunctionTarget> InferredTarget;

  // We're going to invoke special member lookup; mark that these special
  // members are called from this one, and not from its caller.
  ContextRAII MethodContext(*this, MemberDecl);

  // Look for special members in base classes that should be invoked from here.
  // Infer the target of this member based on the ones it should call.
  // Skip direct and indirect virtual bases for abstract classes.
  llvm::SmallVector<const CXXBaseSpecifier *, 16> Bases;
  for (const auto &B : ClassDecl->bases()) {
    if (!B.isVirtual()) {
      Bases.push_back(&B);
    }
  }

  if (!ClassDecl->isAbstract()) {
    for (const auto &VB : ClassDecl->vbases()) {
      Bases.push_back(&VB);
    }
  }

  for (const auto *B : Bases) {
    const RecordType *BaseType = B->getType()->getAs<RecordType>();
    if (!BaseType) {
      continue;
    }

    CXXRecordDecl *BaseClassDecl = cast<CXXRecordDecl>(BaseType->getDecl());
    Sema::SpecialMemberOverloadResult *SMOR =
        LookupSpecialMember(BaseClassDecl, CSM,
                            /* ConstArg */ ConstRHS,
                            /* VolatileArg */ false,
                            /* RValueThis */ false,
                            /* ConstThis */ false,
                            /* VolatileThis */ false);

    if (!SMOR || !SMOR->getMethod()) {
      continue;
    }

    CUDAFunctionTarget BaseMethodTarget = IdentifyCUDATarget(SMOR->getMethod());
    if (!InferredTarget.hasValue()) {
      InferredTarget = BaseMethodTarget;
    } else {
      bool ResolutionError = resolveCalleeCUDATargetConflict(
          InferredTarget.getValue(), BaseMethodTarget,
          InferredTarget.getPointer());
      if (ResolutionError) {
        if (Diagnose) {
          Diag(ClassDecl->getLocation(),
               diag::note_implicit_member_target_infer_collision)
              << (unsigned)CSM << InferredTarget.getValue() << BaseMethodTarget;
        }
        MemberDecl->addAttr(CUDAInvalidTargetAttr::CreateImplicit(Context));
        return true;
      }
    }
  }

  // Same as for bases, but now for special members of fields.
  for (const auto *F : ClassDecl->fields()) {
    if (F->isInvalidDecl()) {
      continue;
    }

    const RecordType *FieldType =
        Context.getBaseElementType(F->getType())->getAs<RecordType>();
    if (!FieldType) {
      continue;
    }

    CXXRecordDecl *FieldRecDecl = cast<CXXRecordDecl>(FieldType->getDecl());
    Sema::SpecialMemberOverloadResult *SMOR =
        LookupSpecialMember(FieldRecDecl, CSM,
                            /* ConstArg */ ConstRHS && !F->isMutable(),
                            /* VolatileArg */ false,
                            /* RValueThis */ false,
                            /* ConstThis */ false,
                            /* VolatileThis */ false);

    if (!SMOR || !SMOR->getMethod()) {
      continue;
    }

    CUDAFunctionTarget FieldMethodTarget =
        IdentifyCUDATarget(SMOR->getMethod());
    if (!InferredTarget.hasValue()) {
      InferredTarget = FieldMethodTarget;
    } else {
      bool ResolutionError = resolveCalleeCUDATargetConflict(
          InferredTarget.getValue(), FieldMethodTarget,
          InferredTarget.getPointer());
      if (ResolutionError) {
        if (Diagnose) {
          Diag(ClassDecl->getLocation(),
               diag::note_implicit_member_target_infer_collision)
              << (unsigned)CSM << InferredTarget.getValue()
              << FieldMethodTarget;
        }
        MemberDecl->addAttr(CUDAInvalidTargetAttr::CreateImplicit(Context));
        return true;
      }
    }
  }

  if (InferredTarget.hasValue()) {
    if (InferredTarget.getValue() == CFT_Device) {
      MemberDecl->addAttr(CUDADeviceAttr::CreateImplicit(Context));
    } else if (InferredTarget.getValue() == CFT_Host) {
      MemberDecl->addAttr(CUDAHostAttr::CreateImplicit(Context));
    } else {
      MemberDecl->addAttr(CUDADeviceAttr::CreateImplicit(Context));
      MemberDecl->addAttr(CUDAHostAttr::CreateImplicit(Context));
    }
  } else {
    // If no target was inferred, mark this member as __host__ __device__;
    // it's the least restrictive option that can be invoked from any target.
    MemberDecl->addAttr(CUDADeviceAttr::CreateImplicit(Context));
    MemberDecl->addAttr(CUDAHostAttr::CreateImplicit(Context));
  }

  return false;
}

bool Sema::isEmptyCudaConstructor(SourceLocation Loc, CXXConstructorDecl *CD) {
  if (!CD->isDefined() && CD->isTemplateInstantiation())
    InstantiateFunctionDefinition(Loc, CD->getFirstDecl());

  // (E.2.3.1, CUDA 7.5) A constructor for a class type is considered
  // empty at a point in the translation unit, if it is either a
  // trivial constructor
  if (CD->isTrivial())
    return true;

  // ... or it satisfies all of the following conditions:
  // The constructor function has been defined.
  // The constructor function has no parameters,
  // and the function body is an empty compound statement.
  if (!(CD->hasTrivialBody() && CD->getNumParams() == 0))
    return false;

  // Its class has no virtual functions and no virtual base classes.
  if (CD->getParent()->isDynamicClass())
    return false;

  // The only form of initializer allowed is an empty constructor.
  // This will recursively check all base classes and member initializers
  if (!llvm::all_of(CD->inits(), [&](const CXXCtorInitializer *CI) {
        if (const CXXConstructExpr *CE =
                dyn_cast<CXXConstructExpr>(CI->getInit()))
          return isEmptyCudaConstructor(Loc, CE->getConstructor());
        return false;
      }))
    return false;

  return true;
}

bool Sema::isEmptyCudaDestructor(SourceLocation Loc, CXXDestructorDecl *DD) {
  // No destructor -> no problem.
  if (!DD)
    return true;

  if (!DD->isDefined() && DD->isTemplateInstantiation())
    InstantiateFunctionDefinition(Loc, DD->getFirstDecl());

  // (E.2.3.1, CUDA 7.5) A destructor for a class type is considered
  // empty at a point in the translation unit, if it is either a
  // trivial destructor
  if (DD->isTrivial())
    return true;

  // ... or it satisfies all of the following conditions:
  // The destructor function has been defined.
  // and the function body is an empty compound statement.
  if (!DD->hasTrivialBody())
    return false;

  const CXXRecordDecl *ClassDecl = DD->getParent();

  // Its class has no virtual functions and no virtual base classes.
  if (ClassDecl->isDynamicClass())
    return false;

  // Only empty destructors are allowed. This will recursively check
  // destructors for all base classes...
  if (!llvm::all_of(ClassDecl->bases(), [&](const CXXBaseSpecifier &BS) {
        if (CXXRecordDecl *RD = BS.getType()->getAsCXXRecordDecl())
          return isEmptyCudaDestructor(Loc, RD->getDestructor());
        return true;
      }))
    return false;

  // ... and member fields.
  if (!llvm::all_of(ClassDecl->fields(), [&](const FieldDecl *Field) {
        if (CXXRecordDecl *RD = Field->getType()
                                    ->getBaseElementTypeUnsafe()
                                    ->getAsCXXRecordDecl())
          return isEmptyCudaDestructor(Loc, RD->getDestructor());
        return true;
      }))
    return false;

  return true;
}

// With -fcuda-host-device-constexpr, an unattributed constexpr function is
// treated as implicitly __host__ __device__, unless:
//  * it is a variadic function (device-side variadic functions are not
//    allowed), or
//  * a __device__ function with this signature was already declared, in which
//    case we output an error, unless the __device__ decl is in a system
//    header, in which case we leave the constexpr function unattributed.
//
// In addition, all function decls are treated as __host__ __device__ when
// ForceCUDAHostDeviceDepth > 0 (corresponding to code within a
//   #pragma clang force_cuda_host_device_begin/end
// pair).
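//
// For example (illustrative only, assuming -fcuda-host-device-constexpr is in
// effect):
//   constexpr int square(int x) { return x * x; }  // treated as __host__ __device__
// unless a matching __device__ overload, e.g.
//   __device__ int square(int x);
// was already declared outside a system header, in which case an error is
// emitted instead.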
void Sema::maybeAddCUDAHostDeviceAttrs(Scope *S, FunctionDecl *NewD,
                                       const LookupResult &Previous) {
  assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");

  if (ForceCUDAHostDeviceDepth > 0) {
    if (!NewD->hasAttr<CUDAHostAttr>())
      NewD->addAttr(CUDAHostAttr::CreateImplicit(Context));
    if (!NewD->hasAttr<CUDADeviceAttr>())
      NewD->addAttr(CUDADeviceAttr::CreateImplicit(Context));
    return;
  }

  if (!getLangOpts().CUDAHostDeviceConstexpr || !NewD->isConstexpr() ||
      NewD->isVariadic() || NewD->hasAttr<CUDAHostAttr>() ||
      NewD->hasAttr<CUDADeviceAttr>() || NewD->hasAttr<CUDAGlobalAttr>())
    return;

  // Is D a __device__ function with the same signature as NewD, ignoring CUDA
  // attributes?
  auto IsMatchingDeviceFn = [&](NamedDecl *D) {
    if (UsingShadowDecl *Using = dyn_cast<UsingShadowDecl>(D))
      D = Using->getTargetDecl();
    FunctionDecl *OldD = D->getAsFunction();
    return OldD && OldD->hasAttr<CUDADeviceAttr>() &&
           !OldD->hasAttr<CUDAHostAttr>() &&
           !IsOverload(NewD, OldD, /* UseMemberUsingDeclRules = */ false,
                       /* ConsiderCudaAttrs = */ false);
  };
  auto It = llvm::find_if(Previous, IsMatchingDeviceFn);
  if (It != Previous.end()) {
    // We found a __device__ function with the same name and signature as NewD
    // (ignoring CUDA attrs).  This is an error unless that function is defined
    // in a system header, in which case we simply return without making NewD
    // host+device.
    NamedDecl *Match = *It;
    if (!getSourceManager().isInSystemHeader(Match->getLocation())) {
      Diag(NewD->getLocation(),
           diag::err_cuda_unattributed_constexpr_cannot_overload_device)
          << NewD->getName();
      Diag(Match->getLocation(),
           diag::note_cuda_conflicting_device_function_declared_here);
    }
    return;
  }

  NewD->addAttr(CUDAHostAttr::CreateImplicit(Context));
  NewD->addAttr(CUDADeviceAttr::CreateImplicit(Context));
}

// In CUDA, there are some constructs which may appear in semantically-valid
// code, but trigger errors if we ever generate code for the function in which
// they appear.  Essentially every construct you're not allowed to use on the
// device falls into this category, because you are allowed to use these
// constructs in a __host__ __device__ function, but only if that function is
// never codegen'ed on the device.
//
// To handle semantic checking for these constructs, we keep track of the set of
// functions we know will be emitted, either because we could tell a priori that
// they would be emitted, or because they were transitively called by a
// known-emitted function.
//
// We also keep a partial call graph of which not-known-emitted functions call
// which other not-known-emitted functions.
//
// When we see something which is illegal if the current function is emitted
// (usually by way of CUDADiagIfDeviceCode, CUDADiagIfHostCode, or
// CheckCUDACall), we first check if the current function is known-emitted.  If
// so, we immediately output the diagnostic.
//
// Otherwise, we "defer" the diagnostic.  It sits in Sema::CUDADeferredDiags
// until we discover that the function is known-emitted, at which point we take
// it out of this map and emit the diagnostic.
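//
// For example (a sketch; the function is illustrative, not from this file):
//   __host__ __device__ void hd() { throw 0; }  // 'throw' is not allowed in
//                                               // device code
// When compiling for the device, the error on the throw is deferred; it is
// emitted only if hd() later becomes known-emitted (e.g. it is called from a
// __global__ kernel), along with "called by" notes for the call chain.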

Sema::CUDADiagBuilder::CUDADiagBuilder(Kind K, SourceLocation Loc,
                                       unsigned DiagID, FunctionDecl *Fn,
                                       Sema &S)
    : S(S), Loc(Loc), DiagID(DiagID), Fn(Fn),
      ShowCallStack(K == K_ImmediateWithCallStack || K == K_Deferred) {
  switch (K) {
  case K_Nop:
    break;
  case K_Immediate:
  case K_ImmediateWithCallStack:
    ImmediateDiag.emplace(S.Diag(Loc, DiagID));
    break;
  case K_Deferred:
    assert(Fn && "Must have a function to attach the deferred diag to.");
    PartialDiag.emplace(S.PDiag(DiagID));
    break;
  }
}

// Print notes showing how we can reach FD starting from an a priori
// known-callable function.
static void EmitCallStackNotes(Sema &S, FunctionDecl *FD) {
  auto FnIt = S.CUDAKnownEmittedFns.find(FD);
  while (FnIt != S.CUDAKnownEmittedFns.end()) {
    DiagnosticBuilder Builder(
        S.Diags.Report(FnIt->second.Loc, diag::note_called_by));
    Builder << FnIt->second.FD;
    Builder.setForceEmit();

    FnIt = S.CUDAKnownEmittedFns.find(FnIt->second.FD);
  }
}

Sema::CUDADiagBuilder::~CUDADiagBuilder() {
  if (ImmediateDiag) {
    // Emit our diagnostic and, if it was a warning or error, output a callstack
    // if Fn isn't a priori known-emitted.
    bool IsWarningOrError = S.getDiagnostics().getDiagnosticLevel(
                                DiagID, Loc) >= DiagnosticsEngine::Warning;
    ImmediateDiag.reset(); // Emit the immediate diag.
    if (IsWarningOrError && ShowCallStack)
      EmitCallStackNotes(S, Fn);
  } else if (PartialDiag) {
    assert(ShowCallStack && "Must always show call stack for deferred diags.");
    S.CUDADeferredDiags[Fn].push_back({Loc, std::move(*PartialDiag)});
  }
}

// Do we know that we will eventually codegen the given function?
static bool IsKnownEmitted(Sema &S, FunctionDecl *FD) {
  // Templates are emitted when they're instantiated.
  if (FD->isDependentContext())
    return false;

  // When compiling for device, host functions are never emitted.  Similarly,
  // when compiling for host, device and global functions are never emitted.
  // (Technically, we do emit a host-side stub for global functions, but this
  // doesn't count for our purposes here.)
  Sema::CUDAFunctionTarget T = S.IdentifyCUDATarget(FD);
  if (S.getLangOpts().CUDAIsDevice && T == Sema::CFT_Host)
    return false;
  if (!S.getLangOpts().CUDAIsDevice &&
      (T == Sema::CFT_Device || T == Sema::CFT_Global))
    return false;

  // Externally-visible and similar functions are always emitted.
  if (!isDiscardableGVALinkage(S.getASTContext().GetGVALinkageForFunction(FD)))
    return true;

  // Otherwise, the function is known-emitted if it's in our set of
  // known-emitted functions.
  return S.CUDAKnownEmittedFns.count(FD) > 0;
}

Sema::CUDADiagBuilder Sema::CUDADiagIfDeviceCode(SourceLocation Loc,
                                                 unsigned DiagID) {
  assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
  CUDADiagBuilder::Kind DiagKind = [&] {
    switch (CurrentCUDATarget()) {
    case CFT_Global:
    case CFT_Device:
      return CUDADiagBuilder::K_Immediate;
    case CFT_HostDevice:
      // An HD function counts as host code if we're compiling for host, and
      // device code if we're compiling for device.  Defer any errors in device
      // mode until the function is known-emitted.
      if (getLangOpts().CUDAIsDevice) {
        return IsKnownEmitted(*this, dyn_cast<FunctionDecl>(CurContext))
                   ? CUDADiagBuilder::K_ImmediateWithCallStack
                   : CUDADiagBuilder::K_Deferred;
      }
      return CUDADiagBuilder::K_Nop;

    default:
      return CUDADiagBuilder::K_Nop;
    }
  }();
  return CUDADiagBuilder(DiagKind, Loc, DiagID,
                         dyn_cast<FunctionDecl>(CurContext), *this);
}

Sema::CUDADiagBuilder Sema::CUDADiagIfHostCode(SourceLocation Loc,
                                               unsigned DiagID) {
  assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
  CUDADiagBuilder::Kind DiagKind = [&] {
    switch (CurrentCUDATarget()) {
    case CFT_Host:
      return CUDADiagBuilder::K_Immediate;
    case CFT_HostDevice:
      // An HD function counts as host code if we're compiling for host, and
      // device code if we're compiling for device.  Defer any errors in device
      // mode until the function is known-emitted.
      if (getLangOpts().CUDAIsDevice)
        return CUDADiagBuilder::K_Nop;

      return IsKnownEmitted(*this, dyn_cast<FunctionDecl>(CurContext))
                 ? CUDADiagBuilder::K_ImmediateWithCallStack
                 : CUDADiagBuilder::K_Deferred;
    default:
      return CUDADiagBuilder::K_Nop;
    }
  }();
  return CUDADiagBuilder(DiagKind, Loc, DiagID,
                         dyn_cast<FunctionDecl>(CurContext), *this);
}

// Emit any deferred diagnostics for FD and erase them from the map in which
// they're stored.
static void EmitDeferredDiags(Sema &S, FunctionDecl *FD) {
  auto It = S.CUDADeferredDiags.find(FD);
  if (It == S.CUDADeferredDiags.end())
    return;
  bool HasWarningOrError = false;
  for (PartialDiagnosticAt &PDAt : It->second) {
    const SourceLocation &Loc = PDAt.first;
    const PartialDiagnostic &PD = PDAt.second;
    HasWarningOrError |= S.getDiagnostics().getDiagnosticLevel(
                             PD.getDiagID(), Loc) >= DiagnosticsEngine::Warning;
    DiagnosticBuilder Builder(S.Diags.Report(Loc, PD.getDiagID()));
    Builder.setForceEmit();
    PD.Emit(Builder);
  }
  S.CUDADeferredDiags.erase(It);

  // FIXME: Should this be called after every warning/error emitted in the loop
  // above, instead of just once per function?  That would be consistent with
  // how we handle immediate errors, but it also seems like a bit much.
  if (HasWarningOrError)
    EmitCallStackNotes(S, FD);
}

// Indicate that this function (and thus everything it transitively calls) will
// be codegen'ed, and emit any deferred diagnostics on this function and its
// (transitive) callees.
static void MarkKnownEmitted(Sema &S, FunctionDecl *OrigCaller,
                             FunctionDecl *OrigCallee, SourceLocation OrigLoc) {
  // Nothing to do if we already know that OrigCallee is emitted.
  if (IsKnownEmitted(S, OrigCallee)) {
    assert(!S.CUDACallGraph.count(OrigCallee));
    return;
  }

  // We've just discovered that OrigCallee is known-emitted.  Walk our call
  // graph to see what else we can now discover also must be emitted.

  struct CallInfo {
    FunctionDecl *Caller;
    FunctionDecl *Callee;
    SourceLocation Loc;
  };
  llvm::SmallVector<CallInfo, 4> Worklist = {{OrigCaller, OrigCallee, OrigLoc}};
  llvm::SmallSet<CanonicalDeclPtr<FunctionDecl>, 4> Seen;
  Seen.insert(OrigCallee);
  while (!Worklist.empty()) {
    CallInfo C = Worklist.pop_back_val();
    assert(!IsKnownEmitted(S, C.Callee) &&
           "Worklist should not contain known-emitted functions.");
    S.CUDAKnownEmittedFns[C.Callee] = {C.Caller, C.Loc};
    EmitDeferredDiags(S, C.Callee);

    // If this is a template instantiation, explore its callgraph as well:
    // Non-dependent calls are part of the template's callgraph, while dependent
    // calls are part of the instantiation's call graph.
    if (auto *Templ = C.Callee->getPrimaryTemplate()) {
      FunctionDecl *TemplFD = Templ->getAsFunction();
      if (!Seen.count(TemplFD) && !S.CUDAKnownEmittedFns.count(TemplFD)) {
        Seen.insert(TemplFD);
        Worklist.push_back(
            {/* Caller = */ C.Caller, /* Callee = */ TemplFD, C.Loc});
      }
    }

    // Add all functions called by Callee to our worklist.
    auto CGIt = S.CUDACallGraph.find(C.Callee);
    if (CGIt == S.CUDACallGraph.end())
      continue;

    for (std::pair<CanonicalDeclPtr<FunctionDecl>, SourceLocation> FDLoc :
         CGIt->second) {
      FunctionDecl *NewCallee = FDLoc.first;
      SourceLocation CallLoc = FDLoc.second;
      if (Seen.count(NewCallee) || IsKnownEmitted(S, NewCallee))
        continue;
      Seen.insert(NewCallee);
      Worklist.push_back(
          {/* Caller = */ C.Callee, /* Callee = */ NewCallee, CallLoc});
    }

    // C.Callee is now known-emitted, so we no longer need to maintain its list
    // of callees in CUDACallGraph.
    S.CUDACallGraph.erase(CGIt);
  }
}

bool Sema::CheckCUDACall(SourceLocation Loc, FunctionDecl *Callee) {
  assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
  assert(Callee && "Callee may not be null.");
  // FIXME: Is bailing out early correct here?  Should we instead assume that
  // the caller is a global initializer?
  FunctionDecl *Caller = dyn_cast<FunctionDecl>(CurContext);
  if (!Caller)
    return true;

  // If the caller is known-emitted, mark the callee as known-emitted.
  // Otherwise, mark the call in our call graph so we can traverse it later.
  bool CallerKnownEmitted = IsKnownEmitted(*this, Caller);
  if (CallerKnownEmitted)
    MarkKnownEmitted(*this, Caller, Callee, Loc);
  else {
    // If we have
    //   host fn calls kernel fn calls host+device,
    // the HD function does not get instantiated on the host.  We model this by
    // omitting the call to the kernel from the callgraph.  This ensures
    // that, when compiling for host, only HD functions actually called from the
    // host get marked as known-emitted.
    if (getLangOpts().CUDAIsDevice || IdentifyCUDATarget(Callee) != CFT_Global)
      CUDACallGraph[Caller].insert({Callee, Loc});
  }

  CUDADiagBuilder::Kind DiagKind = [&] {
    switch (IdentifyCUDAPreference(Caller, Callee)) {
    case CFP_Never:
      return CUDADiagBuilder::K_Immediate;
    case CFP_WrongSide:
      assert(Caller && "WrongSide calls require a non-null caller");
      // If we know the caller will be emitted, we know this wrong-side call
      // will be emitted, so it's an immediate error.  Otherwise, defer the
      // error until we know the caller is emitted.
      return CallerKnownEmitted ? CUDADiagBuilder::K_ImmediateWithCallStack
                                : CUDADiagBuilder::K_Deferred;
    default:
      return CUDADiagBuilder::K_Nop;
    }
  }();

  if (DiagKind == CUDADiagBuilder::K_Nop)
    return true;

  // Avoid emitting this error twice for the same location.  Using a hashtable
  // like this is unfortunate, but because we must continue parsing as normal
  // after encountering a deferred error, it's otherwise very tricky for us to
  // ensure that we only emit this deferred error once.
  if (!LocsWithCUDACallDiags.insert({Caller, Loc.getRawEncoding()}).second)
    return true;

  CUDADiagBuilder(DiagKind, Loc, diag::err_ref_bad_target, Caller, *this)
      << IdentifyCUDATarget(Callee) << Callee << IdentifyCUDATarget(Caller);
  CUDADiagBuilder(DiagKind, Callee->getLocation(), diag::note_previous_decl,
                  Caller, *this)
      << Callee;
  return DiagKind != CUDADiagBuilder::K_Immediate &&
         DiagKind != CUDADiagBuilder::K_ImmediateWithCallStack;
}

void Sema::CUDASetLambdaAttrs(CXXMethodDecl *Method) {
  assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
  if (Method->hasAttr<CUDAHostAttr>() || Method->hasAttr<CUDADeviceAttr>())
    return;
  FunctionDecl *CurFn = dyn_cast<FunctionDecl>(CurContext);
  if (!CurFn)
    return;
  CUDAFunctionTarget Target = IdentifyCUDATarget(CurFn);
  if (Target == CFT_Global || Target == CFT_Device) {
    Method->addAttr(CUDADeviceAttr::CreateImplicit(Context));
  } else if (Target == CFT_HostDevice) {
    Method->addAttr(CUDADeviceAttr::CreateImplicit(Context));
    Method->addAttr(CUDAHostAttr::CreateImplicit(Context));
  }
}