1 #ifndef KMP_STATS_H
2 #define KMP_STATS_H
3 
4 /** @file kmp_stats.h
5  * Functions for collecting statistics.
6  */
7 
8 //===----------------------------------------------------------------------===//
9 //
10 //                     The LLVM Compiler Infrastructure
11 //
12 // This file is dual licensed under the MIT and the University of Illinois Open
13 // Source Licenses. See LICENSE.txt for details.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "kmp_config.h"
18 #include "kmp_debug.h"
19 
20 #if KMP_STATS_ENABLED
21 /* Statistics accumulator.
22    Accumulates number of samples and computes min, max, mean, standard deviation
23    on the fly.
24 
25    Online variance calculation algorithm from
26    http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
27  */
28 
29 #include "kmp_stats_timing.h"
30 #include <limits>
31 #include <math.h>
32 #include <new> // placement new
33 #include <stdint.h>
34 #include <string>
35 #include <vector>
36 
37 /* Enable developer statistics here if you want them. They are more detailed
38    than is useful for application characterisation and are intended for the
39    runtime library developer. */
40 #define KMP_DEVELOPER_STATS 0
41 
42 /* Enable/Disable histogram output */
43 #define KMP_STATS_HIST 0
44 
45 /*!
46  * @ingroup STATS_GATHERING
47  * \brief flags to describe the statistic (timer or counter)
48  *
49  */
enum stats_flags_e {
  //! do not show a TOTAL_aggregation for this statistic
  noTotal = 0x01,
  //! statistic is valid only for master
  onlyInMaster = 0x02,
  //! statistic doesn't need units printed next to it
  noUnits = 0x04,
  //! statistic is valid only for non-master threads
  notInMaster = 0x08,
  //! statistic can be logged on the event timeline when KMP_STATS_EVENTS is
  //! on (valid only for timers)
  logEvent = 0x10
};
58 
59 /*!
60  * @ingroup STATS_GATHERING
61  * \brief the states which a thread can be in
62  *
63  */
// Enumerators are numbered consecutively starting at zero; the explicit
// values below only spell out what the compiler would assign anyway.
enum stats_state_e {
  IDLE = 0,
  SERIAL_REGION = 1,
  FORK_JOIN_BARRIER = 2,
  PLAIN_BARRIER = 3,
  TASKWAIT = 4,
  TASKYIELD = 5,
  TASKGROUP = 6,
  IMPLICIT_TASK = 7,
  EXPLICIT_TASK = 8
};
75 
76 /*!
77  * \brief Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h
78  *
79  * @param macro a user defined macro that takes three arguments -
80  * macro(COUNTER_NAME, flags, arg)
81  * @param arg a user defined argument to send to the user defined macro
82  *
83  * \details A counter counts the occurrence of some event. Each thread
84  * accumulates its own count, at the end of execution the counts are aggregated
85  * treating each thread as a separate measurement. (Unless onlyInMaster is set,
86  * in which case there's only a single measurement). The min,mean,max are
87  * therefore the values for the threads. Adding the counter here and then
88  * putting a KMP_BLOCK_COUNTER(name) at the point you want to count is all you
89  * need to do. All of the tables and printing is generated from this macro.
90  * Format is "macro(name, flags, arg)"
91  *
92  * @ingroup STATS_GATHERING
93  */
94 // clang-format off
// X-macro list of counters: invokes macro(name, flags, arg) once per counter,
// in the order written here. The order matters to anything generated from
// the list (for example enumerator values), so append rather than reorder.
#define KMP_FOREACH_COUNTER(macro, arg)                                        \
  macro(OMP_PARALLEL,                                                          \
        stats_flags_e::onlyInMaster | stats_flags_e::noTotal, arg)             \
  macro(OMP_NESTED_PARALLEL, 0, arg)                                           \
  macro(OMP_LOOP_STATIC, 0, arg)                                               \
  macro(OMP_LOOP_STATIC_STEAL, 0, arg)                                         \
  macro(OMP_LOOP_DYNAMIC, 0, arg)                                              \
  macro(OMP_DISTRIBUTE, 0, arg)                                                \
  macro(OMP_BARRIER, 0, arg)                                                   \
  macro(OMP_CRITICAL, 0, arg)                                                  \
  macro(OMP_SINGLE, 0, arg)                                                    \
  macro(OMP_MASTER, 0, arg)                                                    \
  macro(OMP_TEAMS, 0, arg)                                                     \
  macro(OMP_set_lock, 0, arg)                                                  \
  macro(OMP_test_lock, 0, arg)                                                 \
  macro(REDUCE_wait, 0, arg)                                                   \
  macro(REDUCE_nowait, 0, arg)                                                 \
  macro(OMP_TASKYIELD, 0, arg)                                                 \
  macro(OMP_TASKLOOP, 0, arg)                                                  \
  macro(TASK_executed, 0, arg)                                                 \
  macro(TASK_cancelled, 0, arg)                                                \
  macro(TASK_stolen, 0, arg)
116 // clang-format on
117 
118 /*!
119  * \brief Add new timers under KMP_FOREACH_TIMER() macro in kmp_stats.h
120  *
121  * @param macro a user defined macro that takes three arguments -
122  * macro(TIMER_NAME, flags, arg)
123  * @param arg a user defined argument to send to the user defined macro
124  *
125  * \details A timer collects multiple samples of some count in each thread and
126  * then finally aggregates all of the samples from all of the threads. For most
127  * timers the printing code also provides an aggregation over the thread totals.
128  * These are printed as TOTAL_foo. The count is normally a time (in ticks),
129  * hence the name "timer". (But can be any value, so we use this for "number of
130  * arguments passed to fork" as well). For timers the threads are not
131  * significant, it's the individual observations that count, so the statistics
132  * are at that level. Format is "macro(name, flags, arg)"
133  *
 * @ingroup STATS_GATHERING
135  */
136 // clang-format off
// X-macro list of timers: invokes macro(name, flags, arg) once per timer, in
// the order written here, and then forwards to KMP_FOREACH_DEVELOPER_TIMER so
// that any developer timers follow the user-visible ones. The order matters
// to anything generated from the list (for example enumerator values).
#define KMP_FOREACH_TIMER(macro, arg)                                          \
  macro(OMP_worker_thread_life, stats_flags_e::logEvent, arg)                  \
  macro(OMP_parallel, stats_flags_e::logEvent, arg)                            \
  macro(OMP_parallel_overhead, stats_flags_e::logEvent, arg)                   \
  macro(OMP_loop_static, 0, arg)                                               \
  macro(OMP_loop_static_scheduling, 0, arg)                                    \
  macro(OMP_loop_dynamic, 0, arg)                                              \
  macro(OMP_loop_dynamic_scheduling, 0, arg)                                   \
  macro(OMP_critical, 0, arg)                                                  \
  macro(OMP_critical_wait, 0, arg)                                             \
  macro(OMP_single, 0, arg)                                                    \
  macro(OMP_master, 0, arg)                                                    \
  macro(OMP_task_immediate, 0, arg)                                            \
  macro(OMP_task_taskwait, 0, arg)                                             \
  macro(OMP_task_taskyield, 0, arg)                                            \
  macro(OMP_task_taskgroup, 0, arg)                                            \
  macro(OMP_task_join_bar, 0, arg)                                             \
  macro(OMP_task_plain_bar, 0, arg)                                            \
  macro(OMP_taskloop_scheduling, 0, arg)                                       \
  macro(OMP_plain_barrier, stats_flags_e::logEvent, arg)                       \
  macro(OMP_idle, stats_flags_e::logEvent, arg)                                \
  macro(OMP_fork_barrier, stats_flags_e::logEvent, arg)                        \
  macro(OMP_join_barrier, stats_flags_e::logEvent, arg)                        \
  macro(OMP_serial, stats_flags_e::logEvent, arg)                              \
  macro(OMP_set_numthreads,                                                    \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_PARALLEL_args,                                                     \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_loop_static_iterations,                                            \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_loop_dynamic_iterations,                                           \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
170 // clang-format on
171 
172 // OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
173 //                           initializing OpenMP or being created by a master)
174 //                           until the thread is destroyed
175 // OMP_parallel           -- Time thread spends executing work directly
176 //                           within a #pragma omp parallel
177 // OMP_parallel_overhead  -- Time thread spends setting up a parallel region
178 // OMP_loop_static        -- Time thread spends executing loop iterations from
179 //                           a statically scheduled loop
180 // OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations
181 //                               from a statically scheduled loop
182 // OMP_loop_dynamic       -- Time thread spends executing loop iterations from
183 //                           a dynamically scheduled loop
184 // OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations
185 //                                from a dynamically scheduled loop
186 // OMP_critical           -- Time thread spends executing critical section
// OMP_critical_wait      -- Time thread spends waiting to enter
//                           a critical section
189 // OMP_single             -- Time spent executing a "single" region
190 // OMP_master             -- Time spent executing a "master" region
191 // OMP_task_immediate     -- Time spent executing non-deferred tasks
192 // OMP_task_taskwait      -- Time spent executing tasks inside a taskwait
193 //                           construct
194 // OMP_task_taskyield     -- Time spent executing tasks inside a taskyield
195 //                           construct
// OMP_task_taskgroup     -- Time spent executing tasks inside a taskgroup
197 //                           construct
198 // OMP_task_join_bar      -- Time spent executing tasks inside a join barrier
199 // OMP_task_plain_bar     -- Time spent executing tasks inside a barrier
200 //                           construct
201 // OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop
202 //                            construct
203 // OMP_plain_barrier      -- Time spent in a #pragma omp barrier construct or
204 //                           inside implicit barrier at end of worksharing
205 //                           construct
206 // OMP_idle               -- Time worker threads spend waiting for next
207 //                           parallel region
// OMP_fork_barrier       -- Time spent in the fork barrier surrounding a
//                           parallel region
// OMP_join_barrier       -- Time spent in the join barrier surrounding a
//                           parallel region
212 // OMP_serial             -- Time thread zero spends executing serial code
213 // OMP_set_numthreads     -- Values passed to omp_set_num_threads
214 // OMP_PARALLEL_args      -- Number of arguments passed to a parallel region
215 // OMP_loop_static_iterations -- Number of iterations thread is assigned for
216 //                               statically scheduled loops
217 // OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for
218 //                                dynamically scheduled loops
219 
#if (KMP_DEVELOPER_STATS)
// Timers which are of interest to runtime library developers, not end users.
// These have to be explicitly enabled (KMP_DEVELOPER_STATS non-zero) in
// addition to the other stats. When KMP_DEVELOPER_STATS is zero the macro
// expands to nothing, so no developer timers are generated.
//
// The format of each entry is the same as for KMP_FOREACH_TIMER:
// macro(name, flags, arg).

// Descriptions of entries that appear in the list below:
// KMP_end_split_barrier  -- time in __kmp_end_split_barrier
// KMP_setup_icv_copy     -- time in __kmp_setup_icv_copy
// KMP_linear_gather      -- time in __kmp_linear_barrier_gather
// KMP_linear_release     -- time in __kmp_linear_barrier_release
// KMP_tree_gather        -- time in __kmp_tree_barrier_gather
// KMP_tree_release       -- time in __kmp_tree_barrier_release
// KMP_hyper_gather       -- time in __kmp_hyper_barrier_gather
// KMP_hyper_release      -- time in __kmp_hyper_barrier_release
//
// NOTE(review): this comment and the list have drifted apart.
//  * Described here previously but with no entry in the list below:
//    KMP_fork_barrier, KMP_join_barrier, KMP_barrier, KMP_icv_copy.
//  * Present in the list below but not described: KMP_fork_call,
//    KMP_join_call, KMP_hier_gather, KMP_hier_release, USER_resume,
//    USER_suspend, KMP_allocate_team, USER_icv_copy,
//    FOR_static_steal_stolen, FOR_static_steal_chunks.
//    The last two carry noUnits | noTotal, so they are presumably counts
//    rather than times -- TODO confirm against their use sites.
// clang-format off
#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)                                \
  macro(KMP_fork_call, 0, arg)                                                 \
  macro(KMP_join_call, 0, arg)                                                 \
  macro(KMP_end_split_barrier, 0, arg)                                         \
  macro(KMP_hier_gather, 0, arg)                                               \
  macro(KMP_hier_release, 0, arg)                                              \
  macro(KMP_hyper_gather, 0, arg)                                              \
  macro(KMP_hyper_release, 0, arg)                                             \
  macro(KMP_linear_gather, 0, arg)                                             \
  macro(KMP_linear_release, 0, arg)                                            \
  macro(KMP_tree_gather, 0, arg)                                               \
  macro(KMP_tree_release, 0, arg)                                              \
  macro(USER_resume, 0, arg)                                                   \
  macro(USER_suspend, 0, arg)                                                  \
  macro(KMP_allocate_team, 0, arg)                                             \
  macro(KMP_setup_icv_copy, 0, arg)                                            \
  macro(USER_icv_copy, 0, arg)                                                 \
  macro (FOR_static_steal_stolen,                                              \
         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
  macro (FOR_static_steal_chunks,                                              \
         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)
#else
// Developer statistics disabled: contribute no timers.
#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
#endif
261 // clang-format on
262 
263 /*!
264  * \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
265  *
266  * @param macro a user defined macro that takes three arguments -
267  * macro(TIMER_NAME, flags, arg)
268  * @param arg a user defined argument to send to the user defined macro
269  *
270  * \warning YOU MUST HAVE THE SAME NAMED TIMER UNDER KMP_FOREACH_TIMER() OR ELSE
271  * BAD THINGS WILL HAPPEN!
272  *
273  * \details Explicit timers are ones where we need to allocate a timer itself
274  * (as well as the accumulated timing statistics). We allocate these on a
275  * per-thread basis, and explicitly start and stop them. Block timers just
276  * allocate the timer itself on the stack, and use the destructor to notice
277  * block exit; they don't need to be defined here. The name here should be the
278  * same as that of a timer above.
279  *
280  * @ingroup STATS_GATHERING
281 */
282 #define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg)
283 
284 #define ENUMERATE(name, ignore, prefix) prefix##name,
285 enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST };
286 
287 enum explicit_timer_e {
288   KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST
289 };
290 
291 enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST };
292 #undef ENUMERATE
293 
294 /*
295  * A logarithmic histogram. It accumulates the number of values in each power of
296  * ten bin.  So 1<=x<10, 10<=x<100, ...
297  * Mostly useful where we have some big outliers and want to see information
298  * about them.
299  */
300 class logHistogram {
301   enum {
302     numBins = 31, /* Number of powers of 10. If this changes you need to change
303                    * the initializer for binMax */
304 
305     /*
306      * If you want to use this to analyse values that may be less than 1, (for
307      * instance times in s), then the logOffset gives you negative powers.
308      * In our case here, we're just looking at times in ticks, or counts, so we
309      * can never see values with magnitude < 1 (other than zero), so we can set
310      * it to 0.  As above change the initializer if you change this.
311      */
312     logOffset = 0
313   };
314   uint32_t KMP_ALIGN_CACHE zeroCount;
315   struct {
316     uint32_t count;
317     double total;
318   } bins[numBins];
319 
320   static double binMax[numBins];
321 
322 #ifdef KMP_DEBUG
323   uint64_t _total;
324 
check()325   void check() const {
326     uint64_t t = zeroCount;
327     for (int i = 0; i < numBins; i++)
328       t += bins[i].count;
329     KMP_DEBUG_ASSERT(t == _total);
330   }
331 #else
check()332   void check() const {}
333 #endif
334 
335 public:
logHistogram()336   logHistogram() { reset(); }
337 
logHistogram(logHistogram const & o)338   logHistogram(logHistogram const &o) {
339     for (int i = 0; i < numBins; i++)
340       bins[i] = o.bins[i];
341 #ifdef KMP_DEBUG
342     _total = o._total;
343 #endif
344   }
345 
reset()346   void reset() {
347     zeroCount = 0;
348     for (int i = 0; i < numBins; i++) {
349       bins[i].count = 0;
350       bins[i].total = 0;
351     }
352 
353 #ifdef KMP_DEBUG
354     _total = 0;
355 #endif
356   }
count(int b)357   uint32_t count(int b) const { return bins[b + logOffset].count; }
total(int b)358   double total(int b) const { return bins[b + logOffset].total; }
359   static uint32_t findBin(double sample);
360 
361   logHistogram &operator+=(logHistogram const &o) {
362     zeroCount += o.zeroCount;
363     for (int i = 0; i < numBins; i++) {
364       bins[i].count += o.bins[i].count;
365       bins[i].total += o.bins[i].total;
366     }
367 #ifdef KMP_DEBUG
368     _total += o._total;
369     check();
370 #endif
371 
372     return *this;
373   }
374 
375   void addSample(double sample);
376   int minBin() const;
377   int maxBin() const;
378 
379   std::string format(char) const;
380 };
381 
382 class statistic {
383   double KMP_ALIGN_CACHE minVal;
384   double maxVal;
385   double meanVal;
386   double m2;
387   uint64_t sampleCount;
388   double offset;
389   bool collectingHist;
390   logHistogram hist;
391 
392 public:
393   statistic(bool doHist = bool(KMP_STATS_HIST)) {
394     reset();
395     collectingHist = doHist;
396   }
statistic(statistic const & o)397   statistic(statistic const &o)
398       : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2),
399         sampleCount(o.sampleCount), offset(o.offset),
400         collectingHist(o.collectingHist), hist(o.hist) {}
statistic(double minv,double maxv,double meanv,uint64_t sc,double sd)401   statistic(double minv, double maxv, double meanv, uint64_t sc, double sd)
402       : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc),
403         sampleCount(sc), offset(0.0), collectingHist(false) {}
haveHist()404   bool haveHist() const { return collectingHist; }
getMin()405   double getMin() const { return minVal; }
getMean()406   double getMean() const { return meanVal; }
getMax()407   double getMax() const { return maxVal; }
getCount()408   uint64_t getCount() const { return sampleCount; }
getSD()409   double getSD() const { return sqrt(m2 / sampleCount); }
getTotal()410   double getTotal() const { return sampleCount * meanVal; }
getHist()411   logHistogram const *getHist() const { return &hist; }
setOffset(double d)412   void setOffset(double d) { offset = d; }
413 
reset()414   void reset() {
415     minVal = std::numeric_limits<double>::max();
416     maxVal = -minVal;
417     meanVal = 0.0;
418     m2 = 0.0;
419     sampleCount = 0;
420     offset = 0.0;
421     hist.reset();
422   }
423   void addSample(double sample);
424   void scale(double factor);
scaleDown(double f)425   void scaleDown(double f) { scale(1. / f); }
forceCount(uint64_t count)426   void forceCount(uint64_t count) { sampleCount = count; }
427   statistic &operator+=(statistic const &other);
428 
429   std::string format(char unit, bool total = false) const;
formatHist(char unit)430   std::string formatHist(char unit) const { return hist.format(unit); }
431 };
432 
// Static description of one statistic (timer or counter).
struct statInfo {
  const char *name; // name of the statistic
  uint32_t flags; // bitwise OR of stats_flags_e values
};
437 
438 class timeStat : public statistic {
439   static statInfo timerInfo[];
440 
441 public:
timeStat()442   timeStat() : statistic() {}
name(timer_e e)443   static const char *name(timer_e e) { return timerInfo[e].name; }
noTotal(timer_e e)444   static bool noTotal(timer_e e) {
445     return timerInfo[e].flags & stats_flags_e::noTotal;
446   }
masterOnly(timer_e e)447   static bool masterOnly(timer_e e) {
448     return timerInfo[e].flags & stats_flags_e::onlyInMaster;
449   }
workerOnly(timer_e e)450   static bool workerOnly(timer_e e) {
451     return timerInfo[e].flags & stats_flags_e::notInMaster;
452   }
noUnits(timer_e e)453   static bool noUnits(timer_e e) {
454     return timerInfo[e].flags & stats_flags_e::noUnits;
455   }
logEvent(timer_e e)456   static bool logEvent(timer_e e) {
457     return timerInfo[e].flags & stats_flags_e::logEvent;
458   }
clearEventFlags()459   static void clearEventFlags() {
460     for (int i = 0; i < TIMER_LAST; i++) {
461       timerInfo[i].flags &= (~(stats_flags_e::logEvent));
462     }
463   }
464 };
465 
// Where we need to start and stop the timer explicitly, this version can be
// used. Since these timers normally aren't nicely scoped, and so don't have a
// good place to live on the stack of the thread, they're more work to use.
469 class explicitTimer {
470   timeStat *stat;
471   timer_e timerEnumValue;
472   tsc_tick_count startTime;
473   tsc_tick_count pauseStartTime;
474   tsc_tick_count::tsc_interval_t totalPauseTime;
475 
476 public:
explicitTimer(timeStat * s,timer_e te)477   explicitTimer(timeStat *s, timer_e te)
478       : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0),
479         totalPauseTime() {}
480 
481   // void setStat(timeStat *s) { stat = s; }
482   void start(tsc_tick_count tick);
pause(tsc_tick_count tick)483   void pause(tsc_tick_count tick) { pauseStartTime = tick; }
resume(tsc_tick_count tick)484   void resume(tsc_tick_count tick) {
485     totalPauseTime += (tick - pauseStartTime);
486   }
487   void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr);
reset()488   void reset() {
489     startTime = 0;
490     pauseStartTime = 0;
491     totalPauseTime = 0;
492   }
get_type()493   timer_e get_type() const { return timerEnumValue; }
494 };
495 
// Where you need to partition a thread's clock ticks into separate states
497 // e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and
498 // DOING_NOTHING would render these conditions:
499 // time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive
500 // No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice
501 // versa
// Holds a stack of explicitTimer objects. All member functions are defined
// outside this header, so the notes below are inferred from names only.
class partitionedTimers {
private:
  // Timers currently on the stack (presumably the innermost is at the back --
  // TODO confirm against the implementation).
  std::vector<explicitTimer> timer_stack;

public:
  partitionedTimers();
  // NOTE(review): presumably installs the initial timer -- confirm.
  void init(explicitTimer timer);
  // NOTE(review): presumably replaces the current timer with `timer` --
  // confirm.
  void exchange(explicitTimer timer);
  // Called by blockPartitionedTimer on construction.
  void push(explicitTimer timer);
  // Called by blockPartitionedTimer on destruction.
  void pop();
  // NOTE(review): presumably unwinds whatever is left on the stack -- confirm.
  void windup();
};
514 
// Special wrapper around the partitioned timers to aid timing code blocks.
// It avoids the need to have an explicit end; leaving the scope suffices.
517 class blockPartitionedTimer {
518   partitionedTimers *part_timers;
519 
520 public:
blockPartitionedTimer(partitionedTimers * pt,explicitTimer timer)521   blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer)
522       : part_timers(pt) {
523     part_timers->push(timer);
524   }
~blockPartitionedTimer()525   ~blockPartitionedTimer() { part_timers->pop(); }
526 };
527 
// Special wrapper around the thread state to aid in keeping state in code
// blocks. It avoids the need to have an explicit end; leaving the scope
// suffices.
531 class blockThreadState {
532   stats_state_e *state_pointer;
533   stats_state_e old_state;
534 
535 public:
blockThreadState(stats_state_e * thread_state_pointer,stats_state_e new_state)536   blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state)
537       : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) {
538     *state_pointer = new_state;
539   }
~blockThreadState()540   ~blockThreadState() { *state_pointer = old_state; }
541 };
542 
543 // If all you want is a count, then you can use this...
544 // The individual per-thread counts will be aggregated into a statistic at
545 // program exit.
546 class counter {
547   uint64_t value;
548   static const statInfo counterInfo[];
549 
550 public:
counter()551   counter() : value(0) {}
increment()552   void increment() { value++; }
getValue()553   uint64_t getValue() const { return value; }
reset()554   void reset() { value = 0; }
name(counter_e e)555   static const char *name(counter_e e) { return counterInfo[e].name; }
masterOnly(counter_e e)556   static bool masterOnly(counter_e e) {
557     return counterInfo[e].flags & stats_flags_e::onlyInMaster;
558   }
559 };
560 
561 /* ****************************************************************
562     Class to implement an event
563 
564     There are four components to an event: start time, stop time
565     nest_level, and timer_name.
566     The start and stop time should be obvious (recorded in clock ticks).
567     The nest_level relates to the bar width in the timeline graph.
568     The timer_name is used to determine which timer event triggered this event.
569 
570     the interface to this class is through four read-only operations:
571     1) getStart()     -- returns the start time as 64 bit integer
572     2) getStop()      -- returns the stop time as 64 bit integer
573     3) getNestLevel() -- returns the nest level of the event
574     4) getTimerName() -- returns the timer name that triggered event
575 
576     *MORE ON NEST_LEVEL*
577     The nest level is used in the bar graph that represents the timeline.
    Its main purpose is for showing how events are nested inside each other.
579     For example, say events, A, B, and C are recorded.  If the timeline
580     looks like this:
581 
582 Begin -------------------------------------------------------------> Time
583          |    |          |        |          |              |
584          A    B          C        C          B              A
585        start start     start     end        end            end
586 
587        Then A, B, C will have a nest level of 1, 2, 3 respectively.
588        These values are then used to calculate the barwidth so you can
589        see that inside A, B has occurred, and inside B, C has occurred.
590        Currently, this is shown with A's bar width being larger than B's
591        bar width, and B's bar width being larger than C's bar width.
592 
593 **************************************************************** */
594 class kmp_stats_event {
595   uint64_t start;
596   uint64_t stop;
597   int nest_level;
598   timer_e timer_name;
599 
600 public:
kmp_stats_event()601   kmp_stats_event()
602       : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {}
kmp_stats_event(uint64_t strt,uint64_t stp,int nst,timer_e nme)603   kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme)
604       : start(strt), stop(stp), nest_level(nst), timer_name(nme) {}
getStart()605   inline uint64_t getStart() const { return start; }
getStop()606   inline uint64_t getStop() const { return stop; }
getNestLevel()607   inline int getNestLevel() const { return nest_level; }
getTimerName()608   inline timer_e getTimerName() const { return timer_name; }
609 };
610 
611 /* ****************************************************************
612     Class to implement a dynamically expandable array of events
613 
614     ---------------------------------------------------------
615     | event 1 | event 2 | event 3 | event 4 | ... | event N |
616     ---------------------------------------------------------
617 
    An event is pushed onto the back of this array at every
    explicitTimer->stop() call.  The event records the start time, stop
    time, nest level (related to the bar width), and the timer's name.
621 
622     The event vector starts at size INIT_SIZE and grows (doubles in size)
623     if needed.  An implication of this behavior is that log(N)
624     reallocations are needed (where N is number of events).  If you want
625     to avoid reallocations, then set INIT_SIZE to a large value.
626 
627     the interface to this class is through six operations:
628     1) reset() -- sets the internal_size back to 0 but does not deallocate any
629        memory
630     2) size()  -- returns the number of valid elements in the vector
631     3) push_back(start, stop, nest, timer_name) -- pushes an event onto
632        the back of the array
633     4) deallocate() -- frees all memory associated with the vector
634     5) sort() -- sorts the vector by start time
635     6) operator[index] or at(index) -- returns event reference at that index
636 **************************************************************** */
637 class kmp_stats_event_vector {
638   kmp_stats_event *events;
639   int internal_size;
640   int allocated_size;
641   static const int INIT_SIZE = 1024;
642 
643 public:
kmp_stats_event_vector()644   kmp_stats_event_vector() {
645     events =
646         (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE);
647     internal_size = 0;
648     allocated_size = INIT_SIZE;
649   }
~kmp_stats_event_vector()650   ~kmp_stats_event_vector() {}
reset()651   inline void reset() { internal_size = 0; }
size()652   inline int size() const { return internal_size; }
push_back(uint64_t start_time,uint64_t stop_time,int nest_level,timer_e name)653   void push_back(uint64_t start_time, uint64_t stop_time, int nest_level,
654                  timer_e name) {
655     int i;
656     if (internal_size == allocated_size) {
657       kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate(
658           sizeof(kmp_stats_event) * allocated_size * 2);
659       for (i = 0; i < internal_size; i++)
660         tmp[i] = events[i];
661       __kmp_free(events);
662       events = tmp;
663       allocated_size *= 2;
664     }
665     events[internal_size] =
666         kmp_stats_event(start_time, stop_time, nest_level, name);
667     internal_size++;
668     return;
669   }
670   void deallocate();
671   void sort();
672   const kmp_stats_event &operator[](int index) const { return events[index]; }
673   kmp_stats_event &operator[](int index) { return events[index]; }
at(int index)674   const kmp_stats_event &at(int index) const { return events[index]; }
at(int index)675   kmp_stats_event &at(int index) { return events[index]; }
676 };
677 
678 /* ****************************************************************
679     Class to implement a doubly-linked, circular, statistics list
680 
681     |---| ---> |---| ---> |---| ---> |---| ---> ... next
682     |   |      |   |      |   |      |   |
683     |---| <--- |---| <--- |---| <--- |---| <--- ... prev
684     Sentinel   first      second     third
685     Node       node       node       node
686 
687     The Sentinel Node is the user handle on the list.
688     The first node corresponds to thread 0's statistics.
689     The second node corresponds to thread 1's statistics and so on...
690 
    Each node has _timers and _counters arrays, a thread_life_timer and a
    _partitionedTimers member to hold that thread's statistics. The
    thread_life_timer points to the correct _timers entry and updates its
    statistics at every stop() call. Its pointer is set up in the constructor.
    Each node also has an event vector to hold that thread's timing events.
    The event vector expands as necessary and records the start-stop times
    for each timer.
697 
698     The nestLevel variable is for plotting events and is related
699     to the bar width in the timeline graph.
700 
701     Every thread will have a thread local pointer to its node in
702     the list.  The sentinel node is used by the master thread to
703     store "dummy" statistics before __kmp_create_worker() is called.
704 **************************************************************** */
705 class kmp_stats_list {
706   int gtid;
707   timeStat _timers[TIMER_LAST + 1];
708   counter _counters[COUNTER_LAST + 1];
709   explicitTimer thread_life_timer;
710   partitionedTimers _partitionedTimers;
711   int _nestLevel; // one per thread
712   kmp_stats_event_vector _event_vector;
713   kmp_stats_list *next;
714   kmp_stats_list *prev;
715   stats_state_e state;
716   int thread_is_idle_flag;
717 
718 public:
kmp_stats_list()719   kmp_stats_list()
720       : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life],
721                           TIMER_OMP_worker_thread_life),
722         _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
723         thread_is_idle_flag(0) {}
~kmp_stats_list()724   ~kmp_stats_list() {}
getTimer(timer_e idx)725   inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; }
getCounter(counter_e idx)726   inline counter *getCounter(counter_e idx) { return &_counters[idx]; }
getPartitionedTimers()727   inline partitionedTimers *getPartitionedTimers() {
728     return &_partitionedTimers;
729   }
getTimers()730   inline timeStat *getTimers() { return _timers; }
getCounters()731   inline counter *getCounters() { return _counters; }
getEventVector()732   inline kmp_stats_event_vector &getEventVector() { return _event_vector; }
startLife()733   inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); }
endLife()734   inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); }
resetEventVector()735   inline void resetEventVector() { _event_vector.reset(); }
incrementNestValue()736   inline void incrementNestValue() { _nestLevel++; }
getNestValue()737   inline int getNestValue() { return _nestLevel; }
decrementNestValue()738   inline void decrementNestValue() { _nestLevel--; }
getGtid()739   inline int getGtid() const { return gtid; }
setGtid(int newgtid)740   inline void setGtid(int newgtid) { gtid = newgtid; }
setState(stats_state_e newstate)741   inline void setState(stats_state_e newstate) { state = newstate; }
getState()742   inline stats_state_e getState() const { return state; }
getStatePointer()743   inline stats_state_e *getStatePointer() { return &state; }
isIdle()744   inline bool isIdle() { return thread_is_idle_flag == 1; }
setIdleFlag()745   inline void setIdleFlag() { thread_is_idle_flag = 1; }
resetIdleFlag()746   inline void resetIdleFlag() { thread_is_idle_flag = 0; }
747   kmp_stats_list *push_back(int gtid); // returns newly created list node
push_event(uint64_t start_time,uint64_t stop_time,int nest_level,timer_e name)748   inline void push_event(uint64_t start_time, uint64_t stop_time,
749                          int nest_level, timer_e name) {
750     _event_vector.push_back(start_time, stop_time, nest_level, name);
751   }
752   void deallocate();
753   class iterator;
754   kmp_stats_list::iterator begin();
755   kmp_stats_list::iterator end();
756   int size();
757   class iterator {
758     kmp_stats_list *ptr;
759     friend kmp_stats_list::iterator kmp_stats_list::begin();
760     friend kmp_stats_list::iterator kmp_stats_list::end();
761 
762   public:
763     iterator();
764     ~iterator();
765     iterator operator++();
766     iterator operator++(int dummy);
767     iterator operator--();
768     iterator operator--(int dummy);
769     bool operator!=(const iterator &rhs);
770     bool operator==(const iterator &rhs);
771     kmp_stats_list *operator*() const; // dereference operator
772   };
773 };
774 
775 /* ****************************************************************
776    Class to encapsulate all output functions and the environment variables
777 
778    This module holds filenames for various outputs (normal stats, events, plot
779    file), as well as coloring information for the plot file.
780 
781    The filenames and flags variables are read from environment variables.
782    These are read once by the constructor of the global variable
783    __kmp_stats_output which calls init().
784 
785    During this init() call, event flags for the timeStat::timerInfo[] global
786    array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes).
787 
   The only public interface function is outputStats(heading).  This function
   should print out everything it needs to, either to files or stderr,
   depending on the environment variables described below.
791 
792    ENVIRONMENT VARIABLES:
793    KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this
794                      file, otherwise, print to stderr
795    KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to
796                         either KMP_STATS_FILE or stderr
797    KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename,
798                           otherwise, the plot file is sent to "events.plt"
799    KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log
800                        events
801    KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file,
802                             otherwise, output is sent to "events.dat"
803 **************************************************************** */
804 class kmp_stats_output_module {
805 
806 public:
807   struct rgb_color {
808     float r;
809     float g;
810     float b;
811   };
812 
813 private:
814   std::string outputFileName;
815   static const char *eventsFileName;
816   static const char *plotFileName;
817   static int printPerThreadFlag;
818   static int printPerThreadEventsFlag;
819   static const rgb_color globalColorArray[];
820   static rgb_color timerColorInfo[];
821 
822   void init();
823   static void setupEventColors();
824   static void printPloticusFile();
825   static void printHeaderInfo(FILE *statsOut);
826   static void printTimerStats(FILE *statsOut, statistic const *theStats,
827                               statistic const *totalStats);
828   static void printCounterStats(FILE *statsOut, statistic const *theStats);
829   static void printCounters(FILE *statsOut, counter const *theCounters);
830   static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents,
831                           int gtid);
getEventColor(timer_e e)832   static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; }
833   static void windupExplicitTimers();
eventPrintingEnabled()834   bool eventPrintingEnabled() const { return printPerThreadEventsFlag; }
835 
836 public:
kmp_stats_output_module()837   kmp_stats_output_module() { init(); }
838   void outputStats(const char *heading);
839 };
840 
// Functions and global objects of the statistics module. They are only
// declared here, with C linkage when compiled as C++; the definitions live
// outside this header.
#ifdef __cplusplus
extern "C" {
#endif
void __kmp_stats_init();
void __kmp_stats_fini();
// Target of the KMP_RESET_STATS() macro.
void __kmp_reset_stats();
// Target of the KMP_OUTPUT_STATS(heading_string) macro.
void __kmp_output_stats(const char *);
void __kmp_accumulate_stats_at_exit(void);
// Thread-local pointer to the executing thread's stats node within the list;
// every KMP_* statistics macro in this header goes through it.
extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr;
// Head of the stats list.
extern kmp_stats_list *__kmp_stats_list;
// Lock for __kmp_stats_list.
extern kmp_tas_lock_t __kmp_stats_lock;
// Reference start time.
extern tsc_tick_count __kmp_stats_start_time;
// Interface to output.
extern kmp_stats_output_module __kmp_stats_output;

#ifdef __cplusplus
}
#endif
863 
864 // Simple, standard interfaces that drop out completely if stats aren't enabled
865 
/*!
 * \brief Adds value to specified timer (name).
 *
 * @param name timer name as specified under the KMP_FOREACH_TIMER() macro
 * @param value double precision sample value to add to statistics for the timer
 *
 * \details Use the KMP_COUNT_VALUE(name, value) macro to add a particular
 * value to a timer's statistics. The sample is added to the executing
 * thread's TIMER_<name> entry through addSample().
 *
 * @ingroup STATS_GATHERING
*/
#define KMP_COUNT_VALUE(name, value)                                           \
  __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample(value)
879 
/*!
 * \brief Increments specified counter (name).
 *
 * @param name counter name as specified under the KMP_FOREACH_COUNTER() macro
 *
 * \details Use the KMP_COUNT_BLOCK(name) macro to increment a statistics
 * counter for the executing thread.
 *
 * @ingroup STATS_GATHERING
*/
#define KMP_COUNT_BLOCK(name)                                                  \
  __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()
892 
/*!
 * \brief Outputs the current thread statistics and reset them.
 *
 * @param heading_string heading put above the final stats output
 *
 * \details Explicitly stops all timers and outputs all stats. The environment
 * variable `KMP_STATS_FILE=filename` can be used to output the stats to a
 * file instead of stderr. The environment variable `KMP_STATS_THREADS` (set
 * to "on") can be used to also output thread specific stats; when it is not
 * specified in the environment, thread specific stats won't be printed. See
 * the ENVIRONMENT VARIABLES list in the kmp_stats_output_module description
 * for the full set of variables. It should be noted that all statistics are
 * reset when this macro is called.
 *
 * @ingroup STATS_GATHERING
*/
#define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string)
911 
/*!
 * \brief Initializes the partitioned timers to begin with name.
 *
 * @param name timer which you want this thread to begin with
 *
 * \details Calls init() on the executing thread's partitioned timers with an
 * explicitTimer built from that thread's TIMER_<name> entry.
 *
 * @ingroup STATS_GATHERING
*/
#define KMP_INIT_PARTITIONED_TIMERS(name)                                      \
  __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer(          \
      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
922 
/*!
 * \brief Declares a blockPartitionedTimer for timer (name) in the current
 * scope.
 *
 * @param name timer name; TIMER_<name> must be a valid timer_e value
 *
 * \details The macro expands to the declaration of a local object with the
 * fixed name __PBLOCKTIME__, so it can be used at most once per scope.
 * NOTE(review): presumably the object times the enclosing block from its
 * construction to its destruction -- confirm against blockPartitionedTimer.
 *
 * @ingroup STATS_GATHERING
*/
#define KMP_TIME_PARTITIONED_BLOCK(name)                                       \
  blockPartitionedTimer __PBLOCKTIME__(                                        \
      __kmp_stats_thread_ptr->getPartitionedTimers(),                          \
      explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name),            \
                    TIMER_##name))
928 
/*!
 * \brief Calls push() on the executing thread's partitioned timers with an
 * explicitTimer built from that thread's TIMER_<name> entry.
 *
 * @param name timer name; TIMER_<name> must be a valid timer_e value
 *
 * @ingroup STATS_GATHERING
*/
#define KMP_PUSH_PARTITIONED_TIMER(name)                                       \
  __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer(          \
      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
932 
/*!
 * \brief Calls pop() on the executing thread's partitioned timers.
 *
 * @ingroup STATS_GATHERING
*/
#define KMP_POP_PARTITIONED_TIMER()                                            \
  __kmp_stats_thread_ptr->getPartitionedTimers()->pop()
935 
/*!
 * \brief Calls exchange() on the executing thread's partitioned timers with
 * an explicitTimer built from that thread's TIMER_<name> entry.
 *
 * @param name timer name; TIMER_<name> must be a valid timer_e value
 *
 * @ingroup STATS_GATHERING
*/
#define KMP_EXCHANGE_PARTITIONED_TIMER(name)                                   \
  __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer(      \
      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
939 
/*!
 * \brief Sets the state stored in the executing thread's stats node.
 *
 * @param state_name new state, passed unchanged to setState()
 *
 * @ingroup STATS_GATHERING
*/
#define KMP_SET_THREAD_STATE(state_name)                                       \
  __kmp_stats_thread_ptr->setState(state_name)
942 
/*!
 * \brief Evaluates to the state stored in the executing thread's stats node.
 *
 * @ingroup STATS_GATHERING
*/
#define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState()
944 
/*!
 * \brief Declares a blockThreadState for the executing thread in the current
 * scope.
 *
 * @param state_name state handed to the blockThreadState constructor
 *
 * \details The macro expands to the declaration of a local object with the
 * fixed name __BTHREADSTATE__, built from the address of the thread's state
 * and state_name, so it can be used at most once per scope.
 * NOTE(review): presumably the previous state is restored when the object
 * goes out of scope -- confirm against blockThreadState.
 *
 * @ingroup STATS_GATHERING
*/
#define KMP_SET_THREAD_STATE_BLOCK(state_name)                                 \
  blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \
                                    state_name)
948 
/*!
 * \brief Resets all stats (counters to 0, timers to 0 elapsed ticks).
 *
 * \details Reset all stats for all threads. The macro simply calls
 * __kmp_reset_stats().
 *
 * @ingroup STATS_GATHERING
*/
#define KMP_RESET_STATS() __kmp_reset_stats()
957 
/* Developer-level statistics macros. When KMP_DEVELOPER_STATS is non-zero
   each KMP_*_DEVELOPER_* macro forwards to the regular macro of the same
   kind; otherwise it expands to the no-op expression ((void)0).
   NOTE(review): KMP_TIME_BLOCK, KMP_START_EXPLICIT_TIMER and
   KMP_STOP_EXPLICIT_TIMER are not defined in the nearby stats-enabled macro
   section -- confirm they exist before turning KMP_DEVELOPER_STATS on. */
#if (KMP_DEVELOPER_STATS)
#define KMP_TIME_DEVELOPER_BLOCK(n) KMP_TIME_BLOCK(n)
#define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v)
#define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n)
#define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) KMP_START_EXPLICIT_TIMER(n)
#define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) KMP_STOP_EXPLICIT_TIMER(n)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n)
#else
// Null definitions
#define KMP_TIME_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
#define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
#endif
974 
975 #else // KMP_STATS_ENABLED
976 
// Null definitions: with statistics disabled every stats macro expands to the
// no-op expression ((void)0), so call sites need no #if guards of their own.
#define KMP_TIME_BLOCK(n) ((void)0)
#define KMP_COUNT_VALUE(n, v) ((void)0)
#define KMP_COUNT_BLOCK(n) ((void)0)
#define KMP_START_EXPLICIT_TIMER(n) ((void)0)
#define KMP_STOP_EXPLICIT_TIMER(n) ((void)0)

#define KMP_OUTPUT_STATS(heading_string) ((void)0)
#define KMP_RESET_STATS() ((void)0)

#define KMP_TIME_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
#define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
#define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0)
#define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
#define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0)
#define KMP_POP_PARTITIONED_TIMER() ((void)0)
// The stats-enabled branch defines this macro as well; without a null
// definition a call site fails to build when KMP_STATS_ENABLED is 0.
#define KMP_EXCHANGE_PARTITIONED_TIMER(name) ((void)0)
#define KMP_SET_THREAD_STATE(state_name) ((void)0)
#define KMP_GET_THREAD_STATE() ((void)0)
#define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0)
1000 #endif // KMP_STATS_ENABLED
1001 
1002 #endif // KMP_STATS_H
1003