1 /*
2  * kmp_affinity.h -- header for affinity management
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #ifndef KMP_AFFINITY_H
16 #define KMP_AFFINITY_H
17 
18 #include "kmp_os.h"
19 #include "kmp.h"
20 
21 #if KMP_AFFINITY_SUPPORTED
22 #if KMP_USE_HWLOC
23 class KMPHwlocAffinity: public KMPAffinity {
24 public:
25     class Mask : public KMPAffinity::Mask {
26         hwloc_cpuset_t mask;
27     public:
28         Mask() { mask = hwloc_bitmap_alloc(); this->zero(); }
29         ~Mask() { hwloc_bitmap_free(mask); }
30         void set(int i) override { hwloc_bitmap_set(mask, i); }
31         bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
32         void clear(int i) override { hwloc_bitmap_clr(mask, i); }
33         void zero() override { hwloc_bitmap_zero(mask); }
34         void copy(const KMPAffinity::Mask* src) override {
35             const Mask* convert = static_cast<const Mask*>(src);
36             hwloc_bitmap_copy(mask, convert->mask);
37         }
38         void bitwise_and(const KMPAffinity::Mask* rhs) override {
39             const Mask* convert = static_cast<const Mask*>(rhs);
40             hwloc_bitmap_and(mask, mask, convert->mask);
41         }
42         void bitwise_or(const KMPAffinity::Mask * rhs) override {
43             const Mask* convert = static_cast<const Mask*>(rhs);
44             hwloc_bitmap_or(mask, mask, convert->mask);
45         }
46         void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
47         int begin() const override { return hwloc_bitmap_first(mask); }
48         int end() const override { return -1; }
49         int next(int previous) const override { return hwloc_bitmap_next(mask, previous); }
50         int get_system_affinity(bool abort_on_error) override {
51             KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
52               "Illegal get affinity operation when not capable");
53             int retval = hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
54             if (retval >= 0) {
55                 return 0;
56             }
57             int error = errno;
58             if (abort_on_error) {
59                 __kmp_msg(kmp_ms_fatal, KMP_MSG( FatalSysError ), KMP_ERR( error ), __kmp_msg_null);
60             }
61             return error;
62         }
63         int set_system_affinity(bool abort_on_error) const override {
64             KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
65               "Illegal get affinity operation when not capable");
66             int retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
67             if (retval >= 0) {
68                 return 0;
69             }
70             int error = errno;
71             if (abort_on_error) {
72                 __kmp_msg(kmp_ms_fatal, KMP_MSG( FatalSysError ), KMP_ERR( error ), __kmp_msg_null);
73             }
74             return error;
75         }
76         int get_proc_group() const override {
77             int i;
78             int group = -1;
79 # if KMP_OS_WINDOWS
80             if (__kmp_num_proc_groups == 1) {
81                 return 1;
82             }
83             for (i = 0; i < __kmp_num_proc_groups; i++) {
84                 // On windows, the long type is always 32 bits
85                 unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i*2);
86                 unsigned long second_32_bits = hwloc_bitmap_to_ith_ulong(mask, i*2+1);
87                 if (first_32_bits == 0 && second_32_bits == 0) {
88                     continue;
89                 }
90                 if (group >= 0) {
91                     return -1;
92                 }
93                 group = i;
94             }
95 # endif /* KMP_OS_WINDOWS */
96             return group;
97         }
98     };
99     void determine_capable(const char* var) override {
100         const hwloc_topology_support* topology_support;
101         if(__kmp_hwloc_topology == NULL) {
102             if(hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
103                 __kmp_hwloc_error = TRUE;
104                 if(__kmp_affinity_verbose)
105                     KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
106             }
107             if(hwloc_topology_load(__kmp_hwloc_topology) < 0) {
108                 __kmp_hwloc_error = TRUE;
109                 if(__kmp_affinity_verbose)
110                     KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
111             }
112         }
113         topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
114         // Is the system capable of setting/getting this thread's affinity?
115         // also, is topology discovery possible? (pu indicates ability to discover processing units)
116         // and finally, were there no errors when calling any hwloc_* API functions?
117         if(topology_support && topology_support->cpubind->set_thisthread_cpubind &&
118            topology_support->cpubind->get_thisthread_cpubind &&
119            topology_support->discovery->pu &&
120            !__kmp_hwloc_error)
121         {
122             // enables affinity according to KMP_AFFINITY_CAPABLE() macro
123             KMP_AFFINITY_ENABLE(TRUE);
124         } else {
125             // indicate that hwloc didn't work and disable affinity
126             __kmp_hwloc_error = TRUE;
127             KMP_AFFINITY_DISABLE();
128         }
129     }
130     void bind_thread(int which) override {
131         KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
132           "Illegal set affinity operation when not capable");
133         KMPAffinity::Mask *mask;
134         KMP_CPU_ALLOC_ON_STACK(mask);
135         KMP_CPU_ZERO(mask);
136         KMP_CPU_SET(which, mask);
137         __kmp_set_system_affinity(mask, TRUE);
138         KMP_CPU_FREE_FROM_STACK(mask);
139     }
140     KMPAffinity::Mask* allocate_mask() override { return new Mask();  }
141     void deallocate_mask(KMPAffinity::Mask* m) override { delete m; }
142     KMPAffinity::Mask* allocate_mask_array(int num) override { return new Mask[num]; }
143     void deallocate_mask_array(KMPAffinity::Mask* array) override {
144         Mask* hwloc_array = static_cast<Mask*>(array);
145         delete[] hwloc_array;
146     }
147     KMPAffinity::Mask* index_mask_array(KMPAffinity::Mask* array, int index) override {
148         Mask* hwloc_array = static_cast<Mask*>(array);
149         return &(hwloc_array[index]);
150     }
151     api_type get_api_type() const override { return HWLOC; }
152 };
153 #endif /* KMP_USE_HWLOC */
154 
155 #if KMP_OS_LINUX
156 /*
157  * On some of the older OS's that we build on, these constants aren't present
158  * in <asm/unistd.h> #included from <sys.syscall.h>.  They must be the same on
159  * all systems of the same arch where they are defined, and they cannot change.
160  * stone forever.
161  */
162 #include <sys/syscall.h>
163 # if KMP_ARCH_X86 || KMP_ARCH_ARM
164 #  ifndef __NR_sched_setaffinity
165 #   define __NR_sched_setaffinity  241
166 #  elif __NR_sched_setaffinity != 241
167 #   error Wrong code for setaffinity system call.
168 #  endif /* __NR_sched_setaffinity */
169 #  ifndef __NR_sched_getaffinity
170 #   define __NR_sched_getaffinity  242
171 #  elif __NR_sched_getaffinity != 242
172 #   error Wrong code for getaffinity system call.
173 #  endif /* __NR_sched_getaffinity */
174 # elif KMP_ARCH_AARCH64
175 #  ifndef __NR_sched_setaffinity
176 #   define __NR_sched_setaffinity  122
177 #  elif __NR_sched_setaffinity != 122
178 #   error Wrong code for setaffinity system call.
179 #  endif /* __NR_sched_setaffinity */
180 #  ifndef __NR_sched_getaffinity
181 #   define __NR_sched_getaffinity  123
182 #  elif __NR_sched_getaffinity != 123
183 #   error Wrong code for getaffinity system call.
184 #  endif /* __NR_sched_getaffinity */
185 # elif KMP_ARCH_X86_64
186 #  ifndef __NR_sched_setaffinity
187 #   define __NR_sched_setaffinity  203
188 #  elif __NR_sched_setaffinity != 203
189 #   error Wrong code for setaffinity system call.
190 #  endif /* __NR_sched_setaffinity */
191 #  ifndef __NR_sched_getaffinity
192 #   define __NR_sched_getaffinity  204
193 #  elif __NR_sched_getaffinity != 204
194 #   error Wrong code for getaffinity system call.
195 #  endif /* __NR_sched_getaffinity */
196 # elif KMP_ARCH_PPC64
197 #  ifndef __NR_sched_setaffinity
198 #   define __NR_sched_setaffinity  222
199 #  elif __NR_sched_setaffinity != 222
200 #   error Wrong code for setaffinity system call.
201 #  endif /* __NR_sched_setaffinity */
202 #  ifndef __NR_sched_getaffinity
203 #   define __NR_sched_getaffinity  223
204 #  elif __NR_sched_getaffinity != 223
205 #   error Wrong code for getaffinity system call.
206 #  endif /* __NR_sched_getaffinity */
207 #  elif KMP_ARCH_MIPS
208 #   ifndef __NR_sched_setaffinity
209 #    define __NR_sched_setaffinity  4239
210 #   elif __NR_sched_setaffinity != 4239
211 #    error Wrong code for setaffinity system call.
212 #   endif /* __NR_sched_setaffinity */
213 #   ifndef __NR_sched_getaffinity
214 #    define __NR_sched_getaffinity  4240
215 #   elif __NR_sched_getaffinity != 4240
216 #    error Wrong code for getaffinity system call.
217 #   endif /* __NR_sched_getaffinity */
218 #  elif KMP_ARCH_MIPS64
219 #   ifndef __NR_sched_setaffinity
220 #    define __NR_sched_setaffinity  5195
221 #   elif __NR_sched_setaffinity != 5195
222 #    error Wrong code for setaffinity system call.
223 #   endif /* __NR_sched_setaffinity */
224 #   ifndef __NR_sched_getaffinity
225 #    define __NR_sched_getaffinity  5196
226 #   elif __NR_sched_getaffinity != 5196
227 #    error Wrong code for getaffinity system call.
228 #   endif /* __NR_sched_getaffinity */
229 #  error Unknown or unsupported architecture
230 # endif /* KMP_ARCH_* */
231 class KMPNativeAffinity : public KMPAffinity {
232     class Mask : public KMPAffinity::Mask {
233         typedef unsigned char mask_t;
234         static const int BITS_PER_MASK_T = sizeof(mask_t)*CHAR_BIT;
235     public:
236         mask_t* mask;
237         Mask() { mask = (mask_t*)__kmp_allocate(__kmp_affin_mask_size); }
238         ~Mask() { if (mask) __kmp_free(mask); }
239         void set(int i) override { mask[i/BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T)); }
240         bool is_set(int i) const override { return (mask[i/BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T))); }
241         void clear(int i) override { mask[i/BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T)); }
242         void zero() override {
243             for (size_t i=0; i<__kmp_affin_mask_size; ++i)
244                 mask[i] = 0;
245         }
246         void copy(const KMPAffinity::Mask* src) override {
247             const Mask * convert = static_cast<const Mask*>(src);
248             for (size_t i=0; i<__kmp_affin_mask_size; ++i)
249                 mask[i] = convert->mask[i];
250         }
251         void bitwise_and(const KMPAffinity::Mask* rhs) override {
252             const Mask * convert = static_cast<const Mask*>(rhs);
253             for (size_t i=0; i<__kmp_affin_mask_size; ++i)
254                 mask[i] &= convert->mask[i];
255         }
256         void bitwise_or(const KMPAffinity::Mask* rhs) override {
257             const Mask * convert = static_cast<const Mask*>(rhs);
258             for (size_t i=0; i<__kmp_affin_mask_size; ++i)
259                 mask[i] |= convert->mask[i];
260         }
261         void bitwise_not() override {
262             for (size_t i=0; i<__kmp_affin_mask_size; ++i)
263                 mask[i] = ~(mask[i]);
264         }
265         int begin() const override {
266             int retval = 0;
267             while (retval < end() && !is_set(retval))
268                 ++retval;
269             return retval;
270         }
271         int end() const override { return __kmp_affin_mask_size*BITS_PER_MASK_T; }
272         int next(int previous) const override {
273             int retval = previous+1;
274             while (retval < end() && !is_set(retval))
275                 ++retval;
276             return retval;
277         }
278         int get_system_affinity(bool abort_on_error) override {
279             KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
280               "Illegal get affinity operation when not capable");
281             int retval = syscall( __NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask );
282             if (retval >= 0) {
283                 return 0;
284             }
285             int error = errno;
286             if (abort_on_error) {
287                 __kmp_msg(kmp_ms_fatal, KMP_MSG( FatalSysError ), KMP_ERR( error ), __kmp_msg_null);
288             }
289             return error;
290         }
291         int set_system_affinity(bool abort_on_error) const override {
292             KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
293               "Illegal get affinity operation when not capable");
294             int retval = syscall( __NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask );
295             if (retval >= 0) {
296                 return 0;
297             }
298             int error = errno;
299             if (abort_on_error) {
300                 __kmp_msg(kmp_ms_fatal, KMP_MSG( FatalSysError ), KMP_ERR( error ), __kmp_msg_null);
301             }
302             return error;
303         }
304     };
305     void determine_capable(const char* env_var) override {
306         __kmp_affinity_determine_capable(env_var);
307     }
308     void bind_thread(int which) override {
309         __kmp_affinity_bind_thread(which);
310     }
311     KMPAffinity::Mask* allocate_mask() override {
312         KMPNativeAffinity::Mask* retval = new Mask();
313         return retval;
314     }
315     void deallocate_mask(KMPAffinity::Mask* m) override {
316         KMPNativeAffinity::Mask* native_mask = static_cast<KMPNativeAffinity::Mask*>(m);
317         delete m;
318     }
319     KMPAffinity::Mask* allocate_mask_array(int num) override { return new Mask[num]; }
320     void deallocate_mask_array(KMPAffinity::Mask* array) override {
321         Mask* linux_array = static_cast<Mask*>(array);
322         delete[] linux_array;
323     }
324     KMPAffinity::Mask* index_mask_array(KMPAffinity::Mask* array, int index) override {
325         Mask* linux_array = static_cast<Mask*>(array);
326         return &(linux_array[index]);
327     }
328     api_type get_api_type() const override { return NATIVE_OS; }
329 };
330 #endif /* KMP_OS_LINUX */
331 
#if KMP_OS_WINDOWS
// KMPAffinity implementation for Windows.  The mask holds one ULONG_PTR per
// processor group; group-aware system calls are used when the machine has
// more than one processor group.
class KMPNativeAffinity : public KMPAffinity {
    class Mask : public KMPAffinity::Mask {
        typedef ULONG_PTR mask_t;
        static const int BITS_PER_MASK_T = sizeof(mask_t)*CHAR_BIT;
        mask_t* mask;
    public:
        Mask() { mask = (mask_t*)__kmp_allocate(sizeof(mask_t)*__kmp_num_proc_groups); }
        ~Mask() { if (mask) __kmp_free(mask); }
        void set(int i) override { mask[i/BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T)); }
        bool is_set(int i) const override { return (mask[i/BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T))); }
        void clear(int i) override { mask[i/BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T)); }
        void zero() override {
            for (int i=0; i<__kmp_num_proc_groups; ++i)
                mask[i] = 0;
        }
        void copy(const KMPAffinity::Mask* src) override {
            const Mask * convert = static_cast<const Mask*>(src);
            for (int i=0; i<__kmp_num_proc_groups; ++i)
                mask[i] = convert->mask[i];
        }
        void bitwise_and(const KMPAffinity::Mask* rhs) override {
            const Mask * convert = static_cast<const Mask*>(rhs);
            for (int i=0; i<__kmp_num_proc_groups; ++i)
                mask[i] &= convert->mask[i];
        }
        void bitwise_or(const KMPAffinity::Mask* rhs) override {
            const Mask * convert = static_cast<const Mask*>(rhs);
            for (int i=0; i<__kmp_num_proc_groups; ++i)
                mask[i] |= convert->mask[i];
        }
        void bitwise_not() override {
            for (int i=0; i<__kmp_num_proc_groups; ++i)
                mask[i] = ~(mask[i]);
        }
        // Iteration over set bits; end() is one past the last representable
        // bit, and begin()/next() scan linearly for the next set bit.
        int begin() const override {
            int retval = 0;
            while (retval < end() && !is_set(retval))
                ++retval;
            return retval;
        }
        int end() const override { return __kmp_num_proc_groups*BITS_PER_MASK_T; }
        int next(int previous) const override {
            int retval = previous+1;
            while (retval < end() && !is_set(retval))
                ++retval;
            return retval;
        }
        // Bind the calling thread to the CPUs in this mask.  With multiple
        // processor groups the mask must lie within a single group.
        // Returns 0 on success, -1 for an invalid mask, or GetLastError().
        int set_system_affinity(bool abort_on_error) const override {
            if (__kmp_num_proc_groups > 1) {
                // Check for a valid mask.
                GROUP_AFFINITY ga;
                int group = get_proc_group();
                if (group < 0) {
                    if (abort_on_error) {
                        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
                    }
                    return -1;
                }
                // Transform the bit vector into a GROUP_AFFINITY struct
                // and make the system call to set affinity.
                ga.Group = group;
                ga.Mask = mask[group];
                ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;

                KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
                if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
                    DWORD error = GetLastError();
                    if (abort_on_error) {
                        __kmp_msg(kmp_ms_fatal, KMP_MSG( CantSetThreadAffMask ),
                                  KMP_ERR( error ), __kmp_msg_null);
                    }
                    return error;
                }
            } else {
                if (!SetThreadAffinityMask( GetCurrentThread(), *mask )) {
                    DWORD error = GetLastError();
                    if (abort_on_error) {
                        __kmp_msg(kmp_ms_fatal, KMP_MSG( CantSetThreadAffMask ),
                                  KMP_ERR( error ), __kmp_msg_null);
                    }
                    return error;
                }
            }
            return 0;
        }
        // Read the calling thread's current affinity into this mask.
        // Returns 0 on success, -1 for an unusable result, or GetLastError().
        int get_system_affinity(bool abort_on_error) override {
            if (__kmp_num_proc_groups > 1) {
                this->zero();
                GROUP_AFFINITY ga;
                KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
                if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
                    DWORD error = GetLastError();
                    if (abort_on_error) {
                        __kmp_msg(kmp_ms_fatal, KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
                                  KMP_ERR(error), __kmp_msg_null);
                    }
                    return error;
                }
                // Valid group indices are [0, __kmp_num_proc_groups).  The
                // old check used '>' and allowed ga.Group to index one past
                // the end of the mask array below.
                if ((ga.Group < 0) || (ga.Group >= __kmp_num_proc_groups) || (ga.Mask == 0)) {
                    return -1;
                }
                mask[ga.Group] = ga.Mask;
            } else {
                // Single group: recover the thread mask by setting the
                // process mask, reading back the previous thread mask from
                // the return value, then restoring it.
                mask_t newMask, sysMask, retval;
                if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
                    DWORD error = GetLastError();
                    if (abort_on_error) {
                        __kmp_msg(kmp_ms_fatal, KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
                                  KMP_ERR(error), __kmp_msg_null);
                    }
                    return error;
                }
                retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
                if (! retval) {
                    DWORD error = GetLastError();
                    if (abort_on_error) {
                        __kmp_msg(kmp_ms_fatal, KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                                  KMP_ERR(error), __kmp_msg_null);
                    }
                    return error;
                }
                newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
                if (! newMask) {
                    DWORD error = GetLastError();
                    if (abort_on_error) {
                        __kmp_msg(kmp_ms_fatal, KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                                  KMP_ERR(error), __kmp_msg_null);
                    }
                }
                *mask = retval;
            }
            return 0;
        }
        // Return the single processor group covered by this mask, -1 if it
        // spans multiple groups, or 1 on a single-group system.
        int get_proc_group() const override {
            int group = -1;
            if (__kmp_num_proc_groups == 1) {
                return 1;
            }
            for (int i = 0; i < __kmp_num_proc_groups; i++) {
                if (mask[i] == 0)
                    continue;
                if (group >= 0)
                    return -1;
                group = i;
            }
            return group;
        }
    };
    void determine_capable(const char* env_var) override {
        __kmp_affinity_determine_capable(env_var);
    }
    void bind_thread(int which) override {
        __kmp_affinity_bind_thread(which);
    }
    KMPAffinity::Mask* allocate_mask() override { return new Mask();  }
    void deallocate_mask(KMPAffinity::Mask* m) override { delete m; }
    KMPAffinity::Mask* allocate_mask_array(int num) override { return new Mask[num]; }
    void deallocate_mask_array(KMPAffinity::Mask* array) override {
        Mask* windows_array = static_cast<Mask*>(array);
        delete[] windows_array;
    }
    KMPAffinity::Mask* index_mask_array(KMPAffinity::Mask* array, int index) override {
        Mask* windows_array = static_cast<Mask*>(array);
        return &(windows_array[index]);
    }
    api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_WINDOWS */
501 #endif /* KMP_AFFINITY_SUPPORTED */
502 
503 class Address {
504 public:
505     static const unsigned maxDepth = 32;
506     unsigned labels[maxDepth];
507     unsigned childNums[maxDepth];
508     unsigned depth;
509     unsigned leader;
510     Address(unsigned _depth)
511       : depth(_depth), leader(FALSE) {
512     }
513     Address &operator=(const Address &b) {
514         depth = b.depth;
515         for (unsigned i = 0; i < depth; i++) {
516             labels[i] = b.labels[i];
517             childNums[i] = b.childNums[i];
518         }
519         leader = FALSE;
520         return *this;
521     }
522     bool operator==(const Address &b) const {
523         if (depth != b.depth)
524             return false;
525         for (unsigned i = 0; i < depth; i++)
526             if(labels[i] != b.labels[i])
527                 return false;
528         return true;
529     }
530     bool isClose(const Address &b, int level) const {
531         if (depth != b.depth)
532             return false;
533         if ((unsigned)level >= depth)
534             return true;
535         for (unsigned i = 0; i < (depth - level); i++)
536             if(labels[i] != b.labels[i])
537                 return false;
538         return true;
539     }
540     bool operator!=(const Address &b) const {
541         return !operator==(b);
542     }
543     void print() const {
544         unsigned i;
545         printf("Depth: %u --- ", depth);
546         for(i=0;i<depth;i++) {
547             printf("%u ", labels[i]);
548         }
549     }
550 };
551 
552 class AddrUnsPair {
553 public:
554     Address first;
555     unsigned second;
556     AddrUnsPair(Address _first, unsigned _second)
557       : first(_first), second(_second) {
558     }
559     AddrUnsPair &operator=(const AddrUnsPair &b)
560     {
561         first = b.first;
562         second = b.second;
563         return *this;
564     }
565     void print() const {
566         printf("first = "); first.print();
567         printf(" --- second = %u", second);
568     }
569     bool operator==(const AddrUnsPair &b) const {
570         if(first != b.first) return false;
571         if(second != b.second) return false;
572         return true;
573     }
574     bool operator!=(const AddrUnsPair &b) const {
575         return !operator==(b);
576     }
577 };
578 
579 
580 static int
581 __kmp_affinity_cmp_Address_labels(const void *a, const void *b)
582 {
583     const Address *aa = (const Address *)&(((AddrUnsPair *)a)
584       ->first);
585     const Address *bb = (const Address *)&(((AddrUnsPair *)b)
586       ->first);
587     unsigned depth = aa->depth;
588     unsigned i;
589     KMP_DEBUG_ASSERT(depth == bb->depth);
590     for (i  = 0; i < depth; i++) {
591         if (aa->labels[i] < bb->labels[i]) return -1;
592         if (aa->labels[i] > bb->labels[i]) return 1;
593     }
594     return 0;
595 }
596 
597 
/** A structure for holding machine-specific hierarchy info to be computed once at init.
    This structure represents a mapping of threads to the actual machine hierarchy, or to
    our best guess at what the hierarchy might be, for the purpose of performing an
    efficient barrier.  In the worst case, when there is no machine hierarchy information,
    it produces a tree suitable for a barrier, similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
    /** Good default values for number of leaves and branching factor, given no affinity information.
	Behaves a bit like hyper barrier. */
    static const kmp_uint32 maxLeaves=4;
    static const kmp_uint32 minBranch=4;
    /** Number of levels in the hierarchy. Typical levels are threads/core, cores/package
	or socket, packages/node, nodes/machine, etc.  We don't want to get specific with
	nomenclature.  When the machine is oversubscribed we add levels to duplicate the
	hierarchy, doubling the thread capacity of the hierarchy each time we add a level. */
    kmp_uint32 maxLevels;

    /** This is specifically the depth of the machine configuration hierarchy, in terms of the
        number of levels along the longest path from root to any leaf. It corresponds to the
        number of entries in numPerLevel if we exclude all but one trailing 1. */
    kmp_uint32 depth;
    // Thread count the current hierarchy was sized for; resize() grows the
    // hierarchy when nproc exceeds this.
    kmp_uint32 base_num_threads;
    enum init_status { initialized=0, not_initialized=1, initializing=2 };
    volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized, 2=initialization in progress
    volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

    /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a
        node at level i has. For example, if we have a machine with 4 packages, 4 cores/package
        and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}. All empty levels are set to 1. */
    kmp_uint32 *numPerLevel;
    // skipPerLevel[i] = product of numPerLevel[0..i-1]: the number of leaves
    // under one node at level i.  Shares a single allocation with numPerLevel.
    kmp_uint32 *skipPerLevel;

    // Derive numPerLevel from a table of thread addresses: for each hierarchy
    // level (walked leaf-first), the fan-out is the maximum childNum seen at
    // that level plus one.
    void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
        int hier_depth = adr2os[0].first.depth;
        int level = 0;
        for (int i=hier_depth-1; i>=0; --i) {
            int max = -1;
            for (int j=0; j<num_addrs; ++j) {
                int next = adr2os[j].first.childNums[i];
                if (next > max) max = next;
            }
            numPerLevel[level] = max+1;
            ++level;
        }
    }

    hierarchy_info() : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

    // Release the level arrays.  numPerLevel owns the allocation (skipPerLevel
    // points into it); only valid after init() has completed.
    void fini() { if (!uninitialized && numPerLevel) __kmp_free(numPerLevel); }

    // One-time initialization of the hierarchy from an (optionally NULL)
    // address table.  Safe to call from multiple threads: exactly one caller
    // wins the CAS below and builds the hierarchy; the rest spin until done.
    void init(AddrUnsPair *adr2os, int num_addrs)
    {
        // Race to transition not_initialized -> initializing; the loser waits.
        kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&uninitialized, not_initialized, initializing);
        if (bool_result == 0) { // Wait for initialization
            while (TCR_1(uninitialized) != initialized) KMP_CPU_PAUSE();
            return;
        }
        KMP_DEBUG_ASSERT(bool_result==1);

        /* Added explicit initialization of the data fields here to prevent usage of dirty value
           observed when static library is re-initialized multiple times (e.g. when
           non-OpenMP thread repeatedly launches/joins thread that uses OpenMP). */
        depth = 1;
        resizing = 0;
        maxLevels = 7;
        // Single allocation: first maxLevels entries are numPerLevel, the
        // second maxLevels entries are skipPerLevel.
        numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
        skipPerLevel = &(numPerLevel[maxLevels]);
        for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
            numPerLevel[i] = 1;
            skipPerLevel[i] = 1;
        }

        // Sort table by physical ID
        if (adr2os) {
            qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
            deriveLevels(adr2os, num_addrs);
        }
        else {
            // No affinity info: default two-level tree of maxLeaves-wide
            // leaf groups (rounded up).
            numPerLevel[0] = maxLeaves;
            numPerLevel[1] = num_addrs/maxLeaves;
            if (num_addrs%maxLeaves) numPerLevel[1]++;
        }

        base_num_threads = num_addrs;
        for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
            if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
                depth++;

        // Narrow overly wide levels: repeatedly halve a level's fan-out
        // (rounding up) and double the level above, so no level exceeds
        // 'branch' children (and level 0 never exceeds maxLeaves).
        kmp_uint32 branch = minBranch;
        if (numPerLevel[0] == 1) branch = num_addrs/maxLeaves;
        if (branch<minBranch) branch=minBranch;
        for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
            while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>maxLeaves)) { // max 4 on level 0!
                if (numPerLevel[d] & 1) numPerLevel[d]++;
                numPerLevel[d] = numPerLevel[d] >> 1;
                if (numPerLevel[d+1] == 1) depth++;
                numPerLevel[d+1] = numPerLevel[d+1] << 1;
            }
            if(numPerLevel[0] == 1) {
                branch = branch >> 1;
                if (branch<4) branch = minBranch;
            }
        }

        // skipPerLevel[i] is the cumulative product of fan-outs below level i.
        for (kmp_uint32 i=1; i<depth; ++i)
            skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];
        // Fill in hierarchy in the case of oversubscription
        for (kmp_uint32 i=depth; i<maxLevels; ++i)
            skipPerLevel[i] = 2*skipPerLevel[i-1];

        // Publish completion; waiters spin on this flag above.
        uninitialized = initialized; // One writer

    }

    // Resize the hierarchy if nproc changes to something larger than before
    void resize(kmp_uint32 nproc)
    {
        // Race for the resizing flag; if another thread is resizing, wait and
        // accept its result when it already covers nproc.
        kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
        while (bool_result == 0) { // someone else is trying to resize
            KMP_CPU_PAUSE();
            if (nproc <= base_num_threads)  // happy with other thread's resize
                return;
            else // try to resize
                bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
        }
        KMP_DEBUG_ASSERT(bool_result!=0);
        if (nproc <= base_num_threads) return; // happy with other thread's resize

        // Calculate new maxLevels
        kmp_uint32 old_sz = skipPerLevel[depth-1];
        kmp_uint32 incs = 0, old_maxLevels = maxLevels;
        // First see if old maxLevels is enough to contain new size
        for (kmp_uint32 i=depth; i<maxLevels && nproc>old_sz; ++i) {
            skipPerLevel[i] = 2*skipPerLevel[i-1];
            numPerLevel[i-1] *= 2;
            old_sz *= 2;
            depth++;
        }
        if (nproc > old_sz) { // Not enough space, need to expand hierarchy
            // Each extra level doubles capacity; count how many are needed.
            while (nproc > old_sz) {
                old_sz *=2;
                incs++;
                depth++;
            }
            maxLevels += incs;

            // Resize arrays
            kmp_uint32 *old_numPerLevel = numPerLevel;
            kmp_uint32 *old_skipPerLevel = skipPerLevel;
            numPerLevel = skipPerLevel = NULL;
            numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
            skipPerLevel = &(numPerLevel[maxLevels]);

            // Copy old elements from old arrays
            for (kmp_uint32 i=0; i<old_maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
                numPerLevel[i] = old_numPerLevel[i];
                skipPerLevel[i] = old_skipPerLevel[i];
            }

            // Init new elements in arrays to 1
            for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
                numPerLevel[i] = 1;
                skipPerLevel[i] = 1;
            }

            // Free old arrays
            __kmp_free(old_numPerLevel);
        }

        // Fill in oversubscription levels of hierarchy
        for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i)
            skipPerLevel[i] = 2*skipPerLevel[i-1];

        base_num_threads = nproc;
        // Release the resizing flag; one writer.
        resizing = 0; // One writer

    }
};
776 #endif // KMP_AFFINITY_H
777