/*
 * ompt-tsan.cpp -- Archer runtime library, TSan annotations for Archer
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for details.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif

#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <inttypes.h>
#include <iostream>
#include <list>
#include <mutex>
#include <sstream>
#include <string>
#include <sys/resource.h>
#include <unistd.h>
#include <unordered_map>
#include <vector>

#if (defined __APPLE__ && defined __MACH__)
#include <dlfcn.h>
#endif

#include "omp-tools.h"
// Define an attribute that indicates that the fall-through from the previous
// case label is intentional and should not be diagnosed by a compiler.
//   Code from libcxx/include/__config
// Use a function-like macro to imply that it must be followed by a semicolon.
#if __cplusplus > 201402L && __has_cpp_attribute(fallthrough)
#define KMP_FALLTHROUGH() [[fallthrough]]
// icc cannot properly tell this attribute is absent, so force it off
#elif defined(__INTEL_COMPILER)
#define KMP_FALLTHROUGH() ((void)0)
#elif __has_cpp_attribute(clang::fallthrough)
#define KMP_FALLTHROUGH() [[clang::fallthrough]]
#elif __has_attribute(fallthrough) || __GNUC__ >= 7
#define KMP_FALLTHROUGH() __attribute__((__fallthrough__))
#else
#define KMP_FALLTHROUGH() ((void)0)
#endif

static int runOnTsan;
static int hasReductionCallback;

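/// Flags controlling Archer behavior, parsed from the ARCHER_OPTIONS
/// environment variable; tokens are space-separated key=value pairs, e.g.
/// ARCHER_OPTIONS="verbose=1 print_max_rss=1".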
class ArcherFlags {
public:
#if (LLVM_VERSION) >= 40
  int flush_shadow{0};
#endif
  int print_max_rss{0};
  int verbose{0};
  int enabled{1};
  int report_data_leak{0};
  int ignore_serial{0};

  ArcherFlags(const char *env) {
    if (env) {
      std::vector<std::string> tokens;
      std::string token;
      std::string str(env);
      std::istringstream iss(str);
      while (std::getline(iss, token, ' '))
        tokens.push_back(token);

      for (std::vector<std::string>::iterator it = tokens.begin();
           it != tokens.end(); ++it) {
#if (LLVM_VERSION) >= 40
        if (sscanf(it->c_str(), "flush_shadow=%d", &flush_shadow))
          continue;
#endif
        if (sscanf(it->c_str(), "print_max_rss=%d", &print_max_rss))
          continue;
        if (sscanf(it->c_str(), "verbose=%d", &verbose))
          continue;
        if (sscanf(it->c_str(), "report_data_leak=%d", &report_data_leak))
          continue;
        if (sscanf(it->c_str(), "enable=%d", &enabled))
          continue;
        if (sscanf(it->c_str(), "ignore_serial=%d", &ignore_serial))
          continue;
        std::cerr << "Illegal value for ARCHER_OPTIONS variable: " << token
                  << std::endl;
      }
    }
  }
};

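/// Subset of TSAN_OPTIONS relevant for this tool; currently only
/// ignore_noninstrumented_modules is parsed, so that a warning can be
/// issued when it is unset.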
class TsanFlags {
public:
  int ignore_noninstrumented_modules;

  TsanFlags(const char *env) : ignore_noninstrumented_modules(0) {
    if (env) {
      std::vector<std::string> tokens;
      std::string str(env);
      auto end = str.end();
      auto it = str.begin();
      auto is_sep = [](char c) {
        return c == ' ' || c == ',' || c == ':' || c == '\n' || c == '\t' ||
               c == '\r';
      };
      while (it != end) {
        auto next_it = std::find_if(it, end, is_sep);
        tokens.emplace_back(it, next_it);
        it = next_it;
        if (it != end) {
          ++it;
        }
      }

      for (const auto &token : tokens) {
        // We are only interested in ignore_noninstrumented_modules, to print
        // a warning if it is not set.
        if (sscanf(token.c_str(), "ignore_noninstrumented_modules=%d",
                   &ignore_noninstrumented_modules))
          continue;
      }
    }
  }
};

#if (LLVM_VERSION) >= 40
extern "C" {
int __attribute__((weak)) __archer_get_omp_status();
void __attribute__((weak)) __tsan_flush_memory() {}
}
#endif
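/// Global Archer flags, allocated in ompt_start_tool from the ARCHER_OPTIONS
/// environment variable.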
ArcherFlags *archer_flags;

#ifndef TsanHappensBefore
// Thread Sanitizer is a tool that finds races in code.
// See http://code.google.com/p/data-race-test/wiki/DynamicAnnotations .
// TSan detects these exact functions by name.
extern "C" {
#if (defined __APPLE__ && defined __MACH__)
static void (*AnnotateHappensAfter)(const char *, int, const volatile void *);
static void (*AnnotateHappensBefore)(const char *, int, const volatile void *);
static void (*AnnotateIgnoreWritesBegin)(const char *, int);
static void (*AnnotateIgnoreWritesEnd)(const char *, int);
static void (*AnnotateNewMemory)(const char *, int, const volatile void *,
                                 size_t);
static void (*__tsan_func_entry)(const void *);
static void (*__tsan_func_exit)(void);

static int RunningOnValgrind() {
  int (*fptr)();

  fptr = (int (*)())dlsym(RTLD_DEFAULT, "RunningOnValgrind");
  // If we find a RunningOnValgrind other than this function, we assume the
  // annotation functions are present in this execution and leave runOnTsan=1;
  // otherwise we set runOnTsan=0.
  if (!fptr || fptr == RunningOnValgrind)
    runOnTsan = 0;
  return 0;
}
#else
void __attribute__((weak))
AnnotateHappensAfter(const char *file, int line, const volatile void *cv) {}
void __attribute__((weak))
AnnotateHappensBefore(const char *file, int line, const volatile void *cv) {}
void __attribute__((weak))
AnnotateIgnoreWritesBegin(const char *file, int line) {}
void __attribute__((weak)) AnnotateIgnoreWritesEnd(const char *file, int line) {
}
void __attribute__((weak))
AnnotateNewMemory(const char *file, int line, const volatile void *cv,
                  size_t size) {}
int __attribute__((weak)) RunningOnValgrind() {
  runOnTsan = 0;
  return 0;
}
void __attribute__((weak)) __tsan_func_entry(const void *call_pc) {}
void __attribute__((weak)) __tsan_func_exit(void) {}
#endif
}

// This marker is used to define a happens-before arc. The race detector will
// infer an arc from the begin to the end when they share the same pointer
// argument.
#define TsanHappensBefore(cv) AnnotateHappensBefore(__FILE__, __LINE__, cv)

// This marker defines the destination of a happens-before arc.
#define TsanHappensAfter(cv) AnnotateHappensAfter(__FILE__, __LINE__, cv)

// Ignore any races on writes between here and the next TsanIgnoreWritesEnd.
#define TsanIgnoreWritesBegin() AnnotateIgnoreWritesBegin(__FILE__, __LINE__)

// Resume checking for racy writes.
#define TsanIgnoreWritesEnd() AnnotateIgnoreWritesEnd(__FILE__, __LINE__)

// We don't really delete the clock for now.
#define TsanDeleteClock(cv)

// Annotate newly allocated or freed memory; both map to AnnotateNewMemory,
// which resets TSan's knowledge of the memory range.
#define TsanNewMemory(addr, size)                                              \
  AnnotateNewMemory(__FILE__, __LINE__, addr, size)
#define TsanFreeMemory(addr, size)                                             \
  AnnotateNewMemory(__FILE__, __LINE__, addr, size)
#endif

// Function entry/exit
#define TsanFuncEntry(pc) __tsan_func_entry(pc)
#define TsanFuncExit() __tsan_func_exit()

/// Required OMPT inquiry functions.
static ompt_get_parallel_info_t ompt_get_parallel_info;
static ompt_get_thread_data_t ompt_get_thread_data;

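// A clockid is a single byte whose address serves as the synchronization
// object passed to the TSan annotation functions; the byte itself is never
// read or written.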
typedef char ompt_tsan_clockid;

static uint64_t my_next_id() {
  static uint64_t ID = 0;
  uint64_t ret = __sync_fetch_and_add(&ID, 1);
  return ret;
}

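// System page size, used as the allocation granularity of DataPool;
// initialized via getpagesize() in ompt_start_tool.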
static int pagesize{0};

// Data structure to provide a threadsafe pool of reusable objects.
// DataPool<Type of objects>
template <typename T> struct DataPool final {
  static __thread DataPool<T> *ThreadDataPool;
  std::mutex DPMutex{};

  // store unused objects
  std::vector<T *> DataPointer{};
  std::vector<T *> RemoteDataPointer{};

  // store all allocated memory to finally release
  std::list<void *> memory;

  // count remotely returned data (RemoteDataPointer.size())
  std::atomic<int> remote{0};

  // total number of data objects allocated by this pool
  int total{0};
#ifdef DEBUG_DATA
  int remoteReturn{0};
  int localReturn{0};

  int getRemote() { return remoteReturn + remote; }
  int getLocal() { return localReturn; }
#endif
  int getTotal() { return total; }
  int getMissing() {
    return total - DataPointer.size() - RemoteDataPointer.size();
  }

  // fill the pool by allocating a page of memory
  void newDatas() {
    if (remote > 0) {
      const std::lock_guard<std::mutex> lock(DPMutex);
      // DataPointer is empty, so just swap the vectors
      DataPointer.swap(RemoteDataPointer);
      remote = 0;
      return;
    }
    // calculate the size of an object including padding to cacheline size
    size_t elemSize = sizeof(T);
    size_t paddedSize = (((elemSize - 1) / 64) + 1) * 64;
    // number of padded elements to allocate
    int ndatas = pagesize / paddedSize;
    char *datas = (char *)malloc(ndatas * paddedSize);
    memory.push_back(datas);
    for (int i = 0; i < ndatas; i++) {
      DataPointer.push_back(new (datas + i * paddedSize) T(this));
    }
    total += ndatas;
  }

  // get data from the pool
  T *getData() {
    T *ret;
    if (DataPointer.empty())
      newDatas();
    ret = DataPointer.back();
    DataPointer.pop_back();
    return ret;
  }

  // accesses to the thread-local datapool don't need locks
  void returnOwnData(T *data) {
    DataPointer.emplace_back(data);
#ifdef DEBUG_DATA
    localReturn++;
#endif
  }

  // returning to a remote datapool requires the lock
  void returnData(T *data) {
    const std::lock_guard<std::mutex> lock(DPMutex);
    RemoteDataPointer.emplace_back(data);
    remote++;
#ifdef DEBUG_DATA
    remoteReturn++;
#endif
  }

  ~DataPool() {
    // We assume all objects have been returned by the time the thread
    // finishes and the destructor is called.
    if (archer_flags->report_data_leak && getMissing() != 0) {
      printf("ERROR: While freeing DataPool (%s) we are missing %i data "
             "objects.\n",
             __PRETTY_FUNCTION__, getMissing());
      exit(-3);
    }
    for (auto i : DataPointer)
      if (i)
        i->~T();
    for (auto i : RemoteDataPointer)
      if (i)
        i->~T();
    for (auto i : memory)
      if (i)
        free(i);
  }
};

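// Base class for pooled objects (CRTP): Delete() calls Reset() on the derived
// type T and returns the object to the pool that allocated it, taking the
// remote-return lock only when that pool belongs to another thread.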
template <typename T> struct DataPoolEntry {
  DataPool<T> *owner;

  static T *New() { return DataPool<T>::ThreadDataPool->getData(); }

  void Delete() {
    static_cast<T *>(this)->Reset();
    if (owner == DataPool<T>::ThreadDataPool)
      owner->returnOwnData(static_cast<T *>(this));
    else
      owner->returnData(static_cast<T *>(this));
  }

  DataPoolEntry(DataPool<T> *dp) : owner(dp) {}
};
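
// Usage sketch (cf. DependencyData below): a pooled type T derives from
// DataPoolEntry<T>, provides Reset() and a constructor taking a DataPool<T> *,
// and instances are obtained with T::New() and released with Delete().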

struct DependencyData;
typedef DataPool<DependencyData> DependencyDataPool;
template <>
__thread DependencyDataPool *DependencyDataPool::ThreadDataPool = nullptr;

/// Data structure to store additional information for task dependency.
struct DependencyData final : DataPoolEntry<DependencyData> {
  ompt_tsan_clockid in;
  ompt_tsan_clockid out;
  ompt_tsan_clockid inoutset;
  void *GetInPtr() { return &in; }
  void *GetOutPtr() { return &out; }
  void *GetInoutsetPtr() { return &inoutset; }

  void Reset() {}

  static DependencyData *New() { return DataPoolEntry<DependencyData>::New(); }

  DependencyData(DataPool<DependencyData> *dp)
      : DataPoolEntry<DependencyData>(dp) {}
};

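// Each dependence variable provides three clock addresses, one per class of
// access. AnnotateBegin makes a task synchronize after the classes it
// conflicts with; AnnotateEnd publishes on the task's own class: writers
// (out/inout/mutexinoutset) conflict with everything, readers (in) only with
// writers and inoutset tasks, and inoutset tasks with readers and writers.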
struct TaskDependency {
  void *inPtr;
  void *outPtr;
  void *inoutsetPtr;
  ompt_dependence_type_t type;
  TaskDependency(DependencyData *depData, ompt_dependence_type_t type)
      : inPtr(depData->GetInPtr()), outPtr(depData->GetOutPtr()),
        inoutsetPtr(depData->GetInoutsetPtr()), type(type) {}
  void AnnotateBegin() {
    if (type == ompt_dependence_type_out ||
        type == ompt_dependence_type_inout ||
        type == ompt_dependence_type_mutexinoutset) {
      TsanHappensAfter(inPtr);
      TsanHappensAfter(outPtr);
      TsanHappensAfter(inoutsetPtr);
    } else if (type == ompt_dependence_type_in) {
      TsanHappensAfter(outPtr);
      TsanHappensAfter(inoutsetPtr);
    } else if (type == ompt_dependence_type_inoutset) {
      TsanHappensAfter(inPtr);
      TsanHappensAfter(outPtr);
    }
  }
  void AnnotateEnd() {
    if (type == ompt_dependence_type_out ||
        type == ompt_dependence_type_inout ||
        type == ompt_dependence_type_mutexinoutset) {
      TsanHappensBefore(outPtr);
    } else if (type == ompt_dependence_type_in) {
      TsanHappensBefore(inPtr);
    } else if (type == ompt_dependence_type_inoutset) {
      TsanHappensBefore(inoutsetPtr);
    }
  }
};

struct ParallelData;
typedef DataPool<ParallelData> ParallelDataPool;
template <>
__thread ParallelDataPool *ParallelDataPool::ThreadDataPool = nullptr;

/// Data structure to store additional information for parallel regions.
struct ParallelData final : DataPoolEntry<ParallelData> {

  // Parallel fork is just another barrier; use Barrier[1].

  /// Two addresses for relationships with barriers.
  ompt_tsan_clockid Barrier[2];

  const void *codePtr;

  void *GetParallelPtr() { return &(Barrier[1]); }

  void *GetBarrierPtr(unsigned Index) { return &(Barrier[Index]); }

  ParallelData *Init(const void *codeptr) {
    codePtr = codeptr;
    return this;
  }

  void Reset() {}

  static ParallelData *New(const void *codeptr) {
    return DataPoolEntry<ParallelData>::New()->Init(codeptr);
  }

  ParallelData(DataPool<ParallelData> *dp) : DataPoolEntry<ParallelData>(dp) {}
};

static inline ParallelData *ToParallelData(ompt_data_t *parallel_data) {
  return reinterpret_cast<ParallelData *>(parallel_data->ptr);
}

struct Taskgroup;
typedef DataPool<Taskgroup> TaskgroupPool;
template <> __thread TaskgroupPool *TaskgroupPool::ThreadDataPool = nullptr;

/// Data structure to support stacking of taskgroups and allow synchronization.
struct Taskgroup final : DataPoolEntry<Taskgroup> {
  /// Its address is used for relationships of the taskgroup's task set.
  ompt_tsan_clockid Ptr;

  /// Reference to the parent taskgroup.
  Taskgroup *Parent;

  void *GetPtr() { return &Ptr; }

  Taskgroup *Init(Taskgroup *parent) {
    Parent = parent;
    return this;
  }

  void Reset() {}

  static Taskgroup *New(Taskgroup *Parent) {
    return DataPoolEntry<Taskgroup>::New()->Init(Parent);
  }

  Taskgroup(DataPool<Taskgroup> *dp) : DataPoolEntry<Taskgroup>(dp) {}
};

struct TaskData;
typedef DataPool<TaskData> TaskDataPool;
template <> __thread TaskDataPool *TaskDataPool::ThreadDataPool = nullptr;

/// Data structure to store additional information for tasks.
struct TaskData final : DataPoolEntry<TaskData> {
  /// Its address is used for relationships of this task.
  ompt_tsan_clockid Task{0};

  /// Child tasks use its address to declare a relationship to a taskwait in
  /// this task.
  ompt_tsan_clockid Taskwait{0};

  /// Whether this task is currently executing a barrier.
  bool InBarrier{false};

  /// Bitmask of ompt_task_* flags describing the type of this task.
  int TaskType{0};

  /// Count of the task's execution phases.
  int execution{0};

  /// Index of which barrier to use next.
  char BarrierIndex{0};

  /// Count how often this structure has been put into child tasks + 1.
  std::atomic_int RefCount{1};

  /// Reference to the parent that created this task.
  TaskData *Parent{nullptr};

  /// Reference to the implicit task in the stack above this task.
  TaskData *ImplicitTask{nullptr};

  /// Reference to the team of this task.
  ParallelData *Team{nullptr};

  /// Reference to the current taskgroup that this task either belongs to or
  /// that it just created.
  Taskgroup *TaskGroup{nullptr};

  /// Dependency information for this task.
  TaskDependency *Dependencies{nullptr};

  /// Number of dependency entries.
  unsigned DependencyCount{0};

  // The dependency map stores DependencyData objects representing the
  // dependency variables used in the sibling tasks created from this task.
  // We expect a rare need for the dependency map, so allocate it on demand.
  std::unordered_map<void *, DependencyData *> *DependencyMap{nullptr};

#ifdef DEBUG
  int freed{0};
#endif

  bool isIncluded() { return TaskType & ompt_task_undeferred; }
  bool isUntied() { return TaskType & ompt_task_untied; }
  bool isFinal() { return TaskType & ompt_task_final; }
  bool isMergable() { return TaskType & ompt_task_mergeable; }
  bool isMerged() { return TaskType & ompt_task_merged; }

  bool isExplicit() { return TaskType & ompt_task_explicit; }
  bool isImplicit() { return TaskType & ompt_task_implicit; }
  bool isInitial() { return TaskType & ompt_task_initial; }
  bool isTarget() { return TaskType & ompt_task_target; }

  void *GetTaskPtr() { return &Task; }

  void *GetTaskwaitPtr() { return &Taskwait; }

  TaskData *Init(TaskData *parent, int taskType) {
    TaskType = taskType;
    Parent = parent;
    if (Parent != nullptr) {
      // Only dereference the parent after the null check.
      Team = Parent->Team;
      Parent->RefCount++;
      // Copy over pointer to taskgroup. This task may set up its own stack
      // but for now belongs to its parent's taskgroup.
      TaskGroup = Parent->TaskGroup;
    }
    return this;
  }

  TaskData *Init(ParallelData *team, int taskType) {
    TaskType = taskType;
    execution = 1;
    ImplicitTask = this;
    Team = team;
    return this;
  }

  void Reset() {
    InBarrier = false;
    TaskType = 0;
    execution = 0;
    BarrierIndex = 0;
    RefCount = 1;
    Parent = nullptr;
    ImplicitTask = nullptr;
    Team = nullptr;
    TaskGroup = nullptr;
    if (DependencyMap) {
      for (auto i : *DependencyMap)
        i.second->Delete();
      delete DependencyMap;
    }
    DependencyMap = nullptr;
    if (Dependencies)
      free(Dependencies);
    Dependencies = nullptr;
    DependencyCount = 0;
#ifdef DEBUG
    freed = 0;
#endif
  }

  static TaskData *New(TaskData *parent, int taskType) {
    return DataPoolEntry<TaskData>::New()->Init(parent, taskType);
  }

  static TaskData *New(ParallelData *team, int taskType) {
    return DataPoolEntry<TaskData>::New()->Init(team, taskType);
  }

  TaskData(DataPool<TaskData> *dp) : DataPoolEntry<TaskData>(dp) {}
};

static inline TaskData *ToTaskData(ompt_data_t *task_data) {
  return reinterpret_cast<TaskData *>(task_data->ptr);
}

/// Store a mutex for each wait_id to resolve race condition with callbacks.
std::unordered_map<ompt_wait_id_t, std::mutex> Locks;
std::mutex LocksMutex;

static void ompt_tsan_thread_begin(ompt_thread_t thread_type,
                                   ompt_data_t *thread_data) {
  ParallelDataPool::ThreadDataPool = new ParallelDataPool;
  TsanNewMemory(ParallelDataPool::ThreadDataPool,
                sizeof(ParallelDataPool::ThreadDataPool));
  TaskgroupPool::ThreadDataPool = new TaskgroupPool;
  TsanNewMemory(TaskgroupPool::ThreadDataPool,
                sizeof(TaskgroupPool::ThreadDataPool));
  TaskDataPool::ThreadDataPool = new TaskDataPool;
  TsanNewMemory(TaskDataPool::ThreadDataPool,
                sizeof(TaskDataPool::ThreadDataPool));
  DependencyDataPool::ThreadDataPool = new DependencyDataPool;
  TsanNewMemory(DependencyDataPool::ThreadDataPool,
                sizeof(DependencyDataPool::ThreadDataPool));
  thread_data->value = my_next_id();
}

static void ompt_tsan_thread_end(ompt_data_t *thread_data) {
  TsanIgnoreWritesBegin();
  delete ParallelDataPool::ThreadDataPool;
  delete TaskgroupPool::ThreadDataPool;
  delete TaskDataPool::ThreadDataPool;
  delete DependencyDataPool::ThreadDataPool;
  TsanIgnoreWritesEnd();
}

/// OMPT event callbacks for handling parallel regions.

static void ompt_tsan_parallel_begin(ompt_data_t *parent_task_data,
                                     const ompt_frame_t *parent_task_frame,
                                     ompt_data_t *parallel_data,
                                     uint32_t requested_team_size, int flag,
                                     const void *codeptr_ra) {
  ParallelData *Data = ParallelData::New(codeptr_ra);
  parallel_data->ptr = Data;

  TsanHappensBefore(Data->GetParallelPtr());
  if (archer_flags->ignore_serial && ToTaskData(parent_task_data)->isInitial())
    TsanIgnoreWritesEnd();
}

static void ompt_tsan_parallel_end(ompt_data_t *parallel_data,
                                   ompt_data_t *task_data, int flag,
                                   const void *codeptr_ra) {
  if (archer_flags->ignore_serial && ToTaskData(task_data)->isInitial())
    TsanIgnoreWritesBegin();
  ParallelData *Data = ToParallelData(parallel_data);
  TsanHappensAfter(Data->GetBarrierPtr(0));
  TsanHappensAfter(Data->GetBarrierPtr(1));

  Data->Delete();

#if (LLVM_VERSION >= 40)
  if (&__archer_get_omp_status) {
    if (__archer_get_omp_status() == 0 && archer_flags->flush_shadow)
      __tsan_flush_memory();
  }
#endif
}

static void ompt_tsan_implicit_task(ompt_scope_endpoint_t endpoint,
                                    ompt_data_t *parallel_data,
                                    ompt_data_t *task_data,
                                    unsigned int team_size,
                                    unsigned int thread_num, int type) {
  switch (endpoint) {
  case ompt_scope_begin:
    if (type & ompt_task_initial) {
      parallel_data->ptr = ParallelData::New(nullptr);
    }
    task_data->ptr = TaskData::New(ToParallelData(parallel_data), type);
    TsanHappensAfter(ToParallelData(parallel_data)->GetParallelPtr());
    TsanFuncEntry(ToParallelData(parallel_data)->codePtr);
    break;
  case ompt_scope_end: {
    TaskData *Data = ToTaskData(task_data);
#ifdef DEBUG
    assert(Data->freed == 0 && "Implicit task end should only be called once!");
    Data->freed = 1;
#endif
    assert(Data->RefCount == 1 &&
           "All tasks should have finished at the implicit barrier!");
    if (type & ompt_task_initial) {
      Data->Team->Delete();
    }
    Data->Delete();
    TsanFuncExit();
    break;
  }
  case ompt_scope_beginend:
    // Should not occur according to OpenMP 5.1
    // Tested in OMPT tests
    break;
  }
}

static void ompt_tsan_sync_region(ompt_sync_region_t kind,
                                  ompt_scope_endpoint_t endpoint,
                                  ompt_data_t *parallel_data,
                                  ompt_data_t *task_data,
                                  const void *codeptr_ra) {
  TaskData *Data = ToTaskData(task_data);
  switch (endpoint) {
  case ompt_scope_begin:
  case ompt_scope_beginend:
    TsanFuncEntry(codeptr_ra);
    switch (kind) {
    case ompt_sync_region_barrier_implementation:
    case ompt_sync_region_barrier_implicit:
    case ompt_sync_region_barrier_explicit:
    case ompt_sync_region_barrier_implicit_parallel:
    case ompt_sync_region_barrier_implicit_workshare:
    case ompt_sync_region_barrier_teams:
    case ompt_sync_region_barrier: {
      char BarrierIndex = Data->BarrierIndex;
      TsanHappensBefore(Data->Team->GetBarrierPtr(BarrierIndex));

      if (hasReductionCallback < ompt_set_always) {
        // We ignore writes inside the barrier. These would either occur during
        // 1. reductions performed by the runtime, which are guaranteed to be
        //    race-free.
        // 2. execution of another task.
        // For the latter case we will re-enable tracking in task_switch.
        Data->InBarrier = true;
        TsanIgnoreWritesBegin();
      }

      break;
    }

    case ompt_sync_region_taskwait:
      break;

    case ompt_sync_region_taskgroup:
      Data->TaskGroup = Taskgroup::New(Data->TaskGroup);
      break;

    case ompt_sync_region_reduction:
      // should never be reached
      break;
    }
    if (endpoint == ompt_scope_begin)
      break;
    KMP_FALLTHROUGH();
  case ompt_scope_end:
    TsanFuncExit();
    switch (kind) {
    case ompt_sync_region_barrier_implementation:
    case ompt_sync_region_barrier_implicit:
    case ompt_sync_region_barrier_explicit:
    case ompt_sync_region_barrier_implicit_parallel:
    case ompt_sync_region_barrier_implicit_workshare:
    case ompt_sync_region_barrier_teams:
    case ompt_sync_region_barrier: {
      if (hasReductionCallback < ompt_set_always) {
        // We want to track writes after the barrier again.
        Data->InBarrier = false;
        TsanIgnoreWritesEnd();
      }

      char BarrierIndex = Data->BarrierIndex;
      // Barrier will end after it has been entered by all threads.
      if (parallel_data)
        TsanHappensAfter(Data->Team->GetBarrierPtr(BarrierIndex));

      // It is not guaranteed that all threads have exited this barrier before
      // we enter the next one. So we will use a different address.
      // We are however guaranteed that this current barrier is finished
      // by the time we exit the next one. So we can then reuse the first
      // address.
      Data->BarrierIndex = (BarrierIndex + 1) % 2;
      break;
    }

    case ompt_sync_region_taskwait: {
      if (Data->execution > 1)
        TsanHappensAfter(Data->GetTaskwaitPtr());
      break;
    }

    case ompt_sync_region_taskgroup: {
      assert(Data->TaskGroup != nullptr &&
             "Should have at least one taskgroup!");

      TsanHappensAfter(Data->TaskGroup->GetPtr());

      // Delete this allocated taskgroup; all descendant tasks are finished by
      // now.
      Taskgroup *Parent = Data->TaskGroup->Parent;
      Data->TaskGroup->Delete();
      Data->TaskGroup = Parent;
      break;
    }

    case ompt_sync_region_reduction:
      // Should not occur according to OpenMP 5.1
      // Tested in OMPT tests
      break;
    }
    break;
  }
}

static void ompt_tsan_reduction(ompt_sync_region_t kind,
                                ompt_scope_endpoint_t endpoint,
                                ompt_data_t *parallel_data,
                                ompt_data_t *task_data,
                                const void *codeptr_ra) {
  switch (endpoint) {
  case ompt_scope_begin:
    switch (kind) {
    case ompt_sync_region_reduction:
      TsanIgnoreWritesBegin();
      break;
    default:
      break;
    }
    break;
  case ompt_scope_end:
    switch (kind) {
    case ompt_sync_region_reduction:
      TsanIgnoreWritesEnd();
      break;
    default:
      break;
    }
    break;
  case ompt_scope_beginend:
    // Should not occur according to OpenMP 5.1
    // Tested in OMPT tests
    // Would have no implications for data race detection
    break;
  }
}

/// OMPT event callbacks for handling tasks.

static void ompt_tsan_task_create(
    ompt_data_t *parent_task_data,    /* id of parent task            */
    const ompt_frame_t *parent_frame, /* frame data for parent task   */
    ompt_data_t *new_task_data,       /* id of created task           */
    int type, int has_dependences,
    const void *codeptr_ra) /* pointer to outlined function */
{
  TaskData *Data;
  assert(new_task_data->ptr == NULL &&
         "Task data should be initialized to NULL");
  if (type & ompt_task_initial) {
    ompt_data_t *parallel_data;
    int team_size = 1;
    ompt_get_parallel_info(0, &parallel_data, &team_size);
    ParallelData *PData = ParallelData::New(nullptr);
    parallel_data->ptr = PData;

    Data = TaskData::New(PData, type);
    new_task_data->ptr = Data;
  } else if (type & ompt_task_undeferred) {
    Data = TaskData::New(ToTaskData(parent_task_data), type);
    new_task_data->ptr = Data;
  } else if (type & ompt_task_explicit || type & ompt_task_target) {
    Data = TaskData::New(ToTaskData(parent_task_data), type);
    new_task_data->ptr = Data;

    // Use the newly created address. We cannot use a single address from the
    // parent because that would declare wrong relationships with other
    // sibling tasks that may be created before this task is started!
    TsanHappensBefore(Data->GetTaskPtr());
    ToTaskData(parent_task_data)->execution++;
  }
}

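// Drop one reference to the task; once the count reaches zero, release the
// task and recurse up the parent chain, since every child holds a reference
// on its parent.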
static void freeTask(TaskData *task) {
  while (task != nullptr && --task->RefCount == 0) {
    TaskData *Parent = task->Parent;
    task->Delete();
    task = Parent;
  }
}

static void releaseDependencies(TaskData *task) {
  for (unsigned i = 0; i < task->DependencyCount; i++) {
    task->Dependencies[i].AnnotateEnd();
  }
}

static void acquireDependencies(TaskData *task) {
  for (unsigned i = 0; i < task->DependencyCount; i++) {
    task->Dependencies[i].AnnotateBegin();
  }
}

static void ompt_tsan_task_schedule(ompt_data_t *first_task_data,
                                    ompt_task_status_t prior_task_status,
                                    ompt_data_t *second_task_data) {

  //
  //  The necessary action depends on prior_task_status:
  //
  //    ompt_task_early_fulfill = 5,
  //     -> ignored
  //
  //    ompt_task_late_fulfill  = 6,
  //     -> first completed, first freed, second ignored
  //
  //    ompt_task_complete      = 1,
  //    ompt_task_cancel        = 3,
  //     -> first completed, first freed, second starts
  //
  //    ompt_task_detach        = 4,
  //    ompt_task_yield         = 2,
  //    ompt_task_switch        = 7
  //     -> first suspended, second starts
  //

  if (prior_task_status == ompt_task_early_fulfill)
    return;

  TaskData *FromTask = ToTaskData(first_task_data);

  // Legacy handling for missing reduction callback
  if (hasReductionCallback < ompt_set_always && FromTask->InBarrier) {
    // We want to ignore writes in the runtime code during barriers,
    // but not when executing tasks with user code!
    TsanIgnoreWritesEnd();
  }

  // The late fulfill happens after the detached task finished execution
  if (prior_task_status == ompt_task_late_fulfill)
    TsanHappensAfter(FromTask->GetTaskPtr());

  // task completed execution
  if (prior_task_status == ompt_task_complete ||
      prior_task_status == ompt_task_cancel ||
      prior_task_status == ompt_task_late_fulfill) {
    // Included tasks are executed sequentially, no need to track
    // synchronization
    if (!FromTask->isIncluded()) {
      // Task will finish before a barrier in the surrounding parallel region
      // ...
      ParallelData *PData = FromTask->Team;
      TsanHappensBefore(
          PData->GetBarrierPtr(FromTask->ImplicitTask->BarrierIndex));

      // ... and before an eventual taskwait by the parent thread.
      TsanHappensBefore(FromTask->Parent->GetTaskwaitPtr());

      if (FromTask->TaskGroup != nullptr) {
        // This task is part of a taskgroup, so it will finish before the
        // corresponding taskgroup_end.
        TsanHappensBefore(FromTask->TaskGroup->GetPtr());
      }
    }

    // release dependencies
    releaseDependencies(FromTask);
    // free the previously running task
    freeTask(FromTask);
  }

  // For late fulfill of detached task, there is no task to schedule to
  if (prior_task_status == ompt_task_late_fulfill) {
    return;
  }

  TaskData *ToTask = ToTaskData(second_task_data);
  // Legacy handling for missing reduction callback
  if (hasReductionCallback < ompt_set_always && ToTask->InBarrier) {
    // We re-enter runtime code which currently performs a barrier.
    TsanIgnoreWritesBegin();
  }

  // task suspended
  if (prior_task_status == ompt_task_switch ||
      prior_task_status == ompt_task_yield ||
      prior_task_status == ompt_task_detach) {
    // Task may be resumed at a later point in time.
    TsanHappensBefore(FromTask->GetTaskPtr());
    ToTask->ImplicitTask = FromTask->ImplicitTask;
    assert(ToTask->ImplicitTask != NULL &&
           "A task belongs to a team and has an implicit task on the stack");
  }

  // Handle dependencies on first execution of the task
  if (ToTask->execution == 0) {
    ToTask->execution++;
    acquireDependencies(ToTask);
  }
  // 1. Task will begin execution after it has been created.
  // 2. Task will resume after it has been switched away.
  TsanHappensAfter(ToTask->GetTaskPtr());
}

static void ompt_tsan_dependences(ompt_data_t *task_data,
                                  const ompt_dependence_t *deps, int ndeps) {
  if (ndeps > 0) {
    // Copy the data to use it in task_switch and task_end.
    TaskData *Data = ToTaskData(task_data);
    if (!Data->Parent) {
      // Return since doacross dependences are not supported yet.
      return;
    }
    if (!Data->Parent->DependencyMap)
      Data->Parent->DependencyMap =
          new std::unordered_map<void *, DependencyData *>();
    Data->Dependencies =
        (TaskDependency *)malloc(sizeof(TaskDependency) * ndeps);
    Data->DependencyCount = ndeps;
    for (int i = 0; i < ndeps; i++) {
      auto ret = Data->Parent->DependencyMap->insert(
          std::make_pair(deps[i].variable.ptr, nullptr));
      if (ret.second) {
        ret.first->second = DependencyData::New();
      }
      new ((void *)(Data->Dependencies + i))
          TaskDependency(ret.first->second, deps[i].dependence_type);
    }

    // This callback is executed before this task is first started.
    TsanHappensBefore(Data->GetTaskPtr());
  }
}

/// OMPT event callbacks for handling locking.
static void ompt_tsan_mutex_acquired(ompt_mutex_t kind, ompt_wait_id_t wait_id,
                                     const void *codeptr_ra) {

  // Acquire our own lock to make sure that
  // 1. the previous release has finished.
  // 2. the next acquire doesn't start before we have finished our release.
  LocksMutex.lock();
  std::mutex &Lock = Locks[wait_id];
  LocksMutex.unlock();

  Lock.lock();
  TsanHappensAfter(&Lock);
}

static void ompt_tsan_mutex_released(ompt_mutex_t kind, ompt_wait_id_t wait_id,
                                     const void *codeptr_ra) {
  LocksMutex.lock();
  std::mutex &Lock = Locks[wait_id];
  LocksMutex.unlock();
  TsanHappensBefore(&Lock);

  Lock.unlock();
}

// callback, signature, variable to store result, required support level
#define SET_OPTIONAL_CALLBACK_T(event, type, result, level)                    \
  do {                                                                         \
    ompt_callback_##type##_t tsan_##event = &ompt_tsan_##event;                \
    result = ompt_set_callback(ompt_callback_##event,                          \
                               (ompt_callback_t)tsan_##event);                 \
    if (result < level)                                                        \
      printf("Registered callback '" #event "' is not supported at " #level    \
             " (%i)\n",                                                        \
             result);                                                          \
  } while (0)

#define SET_CALLBACK_T(event, type)                                            \
  do {                                                                         \
    int res;                                                                   \
    SET_OPTIONAL_CALLBACK_T(event, type, res, ompt_set_always);                \
  } while (0)

#define SET_CALLBACK(event) SET_CALLBACK_T(event, event)
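
// For example, SET_CALLBACK(parallel_begin) registers ompt_tsan_parallel_begin
// for the ompt_callback_parallel_begin event and warns if the runtime does not
// guarantee to always dispatch it.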

static int ompt_tsan_initialize(ompt_function_lookup_t lookup, int device_num,
                                ompt_data_t *tool_data) {
  const char *options = getenv("TSAN_OPTIONS");
  TsanFlags tsan_flags(options);

  ompt_set_callback_t ompt_set_callback =
      (ompt_set_callback_t)lookup("ompt_set_callback");
  if (ompt_set_callback == NULL) {
    std::cerr << "Could not set callback, exiting..." << std::endl;
    std::exit(1);
  }
  ompt_get_parallel_info =
      (ompt_get_parallel_info_t)lookup("ompt_get_parallel_info");
  ompt_get_thread_data = (ompt_get_thread_data_t)lookup("ompt_get_thread_data");

  if (ompt_get_parallel_info == NULL) {
    fprintf(stderr, "Could not get inquiry function 'ompt_get_parallel_info', "
                    "exiting...\n");
    exit(1);
  }

#if (defined __APPLE__ && defined __MACH__)
#define findTsanFunction(f, fSig)                                              \
  do {                                                                         \
    if (NULL == (f = fSig dlsym(RTLD_DEFAULT, #f)))                            \
      printf("Unable to find TSan function " #f ".\n");                        \
  } while (0)

  findTsanFunction(AnnotateHappensAfter,
                   (void (*)(const char *, int, const volatile void *)));
  findTsanFunction(AnnotateHappensBefore,
                   (void (*)(const char *, int, const volatile void *)));
  findTsanFunction(AnnotateIgnoreWritesBegin, (void (*)(const char *, int)));
  findTsanFunction(AnnotateIgnoreWritesEnd, (void (*)(const char *, int)));
  findTsanFunction(
      AnnotateNewMemory,
      (void (*)(const char *, int, const volatile void *, size_t)));
  findTsanFunction(__tsan_func_entry, (void (*)(const void *)));
  findTsanFunction(__tsan_func_exit, (void (*)(void)));
#endif

  SET_CALLBACK(thread_begin);
  SET_CALLBACK(thread_end);
  SET_CALLBACK(parallel_begin);
  SET_CALLBACK(implicit_task);
  SET_CALLBACK(sync_region);
  SET_CALLBACK(parallel_end);

  SET_CALLBACK(task_create);
  SET_CALLBACK(task_schedule);
  SET_CALLBACK(dependences);

  SET_CALLBACK_T(mutex_acquired, mutex);
  SET_CALLBACK_T(mutex_released, mutex);
  SET_OPTIONAL_CALLBACK_T(reduction, sync_region, hasReductionCallback,
                          ompt_set_never);

  if (!tsan_flags.ignore_noninstrumented_modules)
    fprintf(stderr,
            "Warning: please export "
            "TSAN_OPTIONS='ignore_noninstrumented_modules=1' "
            "to avoid false positive reports from the OpenMP runtime!\n");
  if (archer_flags->ignore_serial)
    TsanIgnoreWritesBegin();

  return 1; // success
}

static void ompt_tsan_finalize(ompt_data_t *tool_data) {
  if (archer_flags->ignore_serial)
    TsanIgnoreWritesEnd();
  if (archer_flags->print_max_rss) {
    struct rusage end;
    getrusage(RUSAGE_SELF, &end);
    printf("MAX RSS[KBytes] during execution: %ld\n", end.ru_maxrss);
  }

  if (archer_flags)
    delete archer_flags;
}

extern "C" ompt_start_tool_result_t *
ompt_start_tool(unsigned int omp_version, const char *runtime_version) {
  const char *options = getenv("ARCHER_OPTIONS");
  archer_flags = new ArcherFlags(options);
  if (!archer_flags->enabled) {
    if (archer_flags->verbose)
      std::cout << "Archer disabled, stopping operation" << std::endl;
    delete archer_flags;
    return NULL;
  }

  pagesize = getpagesize();

  static ompt_start_tool_result_t ompt_start_tool_result = {
      &ompt_tsan_initialize, &ompt_tsan_finalize, {0}};

  // The OMPT start-up code uses dlopen with RTLD_LAZY. Therefore, we cannot
  // rely on dlopen to fail if TSan is missing, but would get a runtime error
  // for the first TSan call. We use RunningOnValgrind to detect whether
  // an implementation of the Annotation interface is available in the
  // execution or disable the tool (by returning NULL).

  runOnTsan = 1;
  RunningOnValgrind();
  if (!runOnTsan) // if we are not running on TSan, give a different tool the
                  // chance to be loaded
  {
    if (archer_flags->verbose)
      std::cout << "Archer detected OpenMP application without TSan; "
                   "stopping operation"
                << std::endl;
    delete archer_flags;
    return NULL;
  }

  if (archer_flags->verbose)
    std::cout << "Archer detected OpenMP application with TSan, supplying "
                 "OpenMP synchronization semantics"
              << std::endl;
  return &ompt_start_tool_result;
}