1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_io.h"
19 #include "kmp_str.h"
20 #include "kmp_wrapper_getpid.h"
21 
22 #if KMP_AFFINITY_SUPPORTED
23 
24 //
25 // Print the affinity mask to the character array in a pretty format.
26 //
27 char *
28 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
29 {
30     KMP_ASSERT(buf_len >= 40);
31     char *scan = buf;
32     char *end = buf + buf_len - 1;
33 
34     //
35     // Find first element / check for empty set.
36     //
37     size_t i;
38     for (i = 0; i < KMP_CPU_SETSIZE; i++) {
39         if (KMP_CPU_ISSET(i, mask)) {
40             break;
41         }
42     }
43     if (i == KMP_CPU_SETSIZE) {
44         KMP_SNPRINTF(scan, buf_len, "{<empty>}");
45         while (*scan != '\0') scan++;
46         KMP_ASSERT(scan <= end);
47         return buf;
48     }
49 
50     KMP_SNPRINTF(scan, buf_len, "{%ld", (long)i);
51     while (*scan != '\0') scan++;
52     i++;
53     for (; i < KMP_CPU_SETSIZE; i++) {
54         if (! KMP_CPU_ISSET(i, mask)) {
55             continue;
56         }
57 
58         //
59         // Check for buffer overflow.  A string of the form ",<n>" will have
60         // at most 10 characters, plus we want to leave room to print ",...}"
61         // if the set is too large to print for a total of 15 characters.
62         // We already left room for '\0' in setting end.
63         //
64         if (end - scan < 15) {
65            break;
66         }
67         KMP_SNPRINTF(scan, buf_len, ",%-ld", (long)i);
68         while (*scan != '\0') scan++;
69     }
70     if (i < KMP_CPU_SETSIZE) {
71         KMP_SNPRINTF(scan, buf_len,  ",...");
72         while (*scan != '\0') scan++;
73     }
74     KMP_SNPRINTF(scan, buf_len, "}");
75     while (*scan != '\0') scan++;
76     KMP_ASSERT(scan <= end);
77     return buf;
78 }
79 
80 
81 void
82 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
83 {
84     KMP_CPU_ZERO(mask);
85 
86 # if KMP_GROUP_AFFINITY
87 
88     if (__kmp_num_proc_groups > 1) {
89         int group;
90         KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
91         for (group = 0; group < __kmp_num_proc_groups; group++) {
92             int i;
93             int num = __kmp_GetActiveProcessorCount(group);
94             for (i = 0; i < num; i++) {
95                 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
96             }
97         }
98     }
99     else
100 
101 # endif /* KMP_GROUP_AFFINITY */
102 
103     {
104         int proc;
105         for (proc = 0; proc < __kmp_xproc; proc++) {
106             KMP_CPU_SET(proc, mask);
107         }
108     }
109 }
110 
111 
112 //
113 // In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
114 // functions.
115 //
116 // The icc codegen emits sections with extremely long names, of the form
117 // ".gnu.linkonce.<mangled_name>".  There seems to have been a linker bug
118 // introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
119 // some sort of memory corruption or table overflow that is triggered by
120 // these long strings.  I checked the latest version of the linker -
121 // GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
122 // fixed.
123 //
124 // Unfortunately, my attempts to reproduce it in a smaller example have
125 // failed - I'm not sure what the prospects are of getting it fixed
126 // properly - but we need a reproducer smaller than all of libomp.
127 //
128 // Work around the problem by avoiding inline constructors in such builds.
129 // We do this for all platforms, not just Linux* OS - non-inline functions are
// more debuggable and provide better coverage than inline functions.
131 // Use inline functions in shipping libs, for performance.
132 //
133 
134 # if !defined(KMP_DEBUG) && !defined(COVER)
135 
136 class Address {
137 public:
138     static const unsigned maxDepth = 32;
139     unsigned labels[maxDepth];
140     unsigned childNums[maxDepth];
141     unsigned depth;
142     unsigned leader;
143     Address(unsigned _depth)
144       : depth(_depth), leader(FALSE) {
145     }
146     Address &operator=(const Address &b) {
147         depth = b.depth;
148         for (unsigned i = 0; i < depth; i++) {
149             labels[i] = b.labels[i];
150             childNums[i] = b.childNums[i];
151         }
152         leader = FALSE;
153         return *this;
154     }
155     bool operator==(const Address &b) const {
156         if (depth != b.depth)
157             return false;
158         for (unsigned i = 0; i < depth; i++)
159             if(labels[i] != b.labels[i])
160                 return false;
161         return true;
162     }
163     bool isClose(const Address &b, int level) const {
164         if (depth != b.depth)
165             return false;
166         if ((unsigned)level >= depth)
167             return true;
168         for (unsigned i = 0; i < (depth - level); i++)
169             if(labels[i] != b.labels[i])
170                 return false;
171         return true;
172     }
173     bool operator!=(const Address &b) const {
174         return !operator==(b);
175     }
176 };
177 
178 class AddrUnsPair {
179 public:
180     Address first;
181     unsigned second;
182     AddrUnsPair(Address _first, unsigned _second)
183       : first(_first), second(_second) {
184     }
185     AddrUnsPair &operator=(const AddrUnsPair &b)
186     {
187         first = b.first;
188         second = b.second;
189         return *this;
190     }
191 };
192 
193 # else
194 
195 class Address {
196 public:
197     static const unsigned maxDepth = 32;
198     unsigned labels[maxDepth];
199     unsigned childNums[maxDepth];
200     unsigned depth;
201     unsigned leader;
202     Address(unsigned _depth);
203     Address &operator=(const Address &b);
204     bool operator==(const Address &b) const;
205     bool isClose(const Address &b, int level) const;
206     bool operator!=(const Address &b) const;
207 };
208 
209 Address::Address(unsigned _depth)
210 {
211     depth = _depth;
212     leader = FALSE;
213 }
214 
215 Address &Address::operator=(const Address &b) {
216     depth = b.depth;
217     for (unsigned i = 0; i < depth; i++) {
218         labels[i] = b.labels[i];
219         childNums[i] = b.childNums[i];
220     }
221     leader = FALSE;
222     return *this;
223 }
224 
225 bool Address::operator==(const Address &b) const {
226     if (depth != b.depth)
227         return false;
228     for (unsigned i = 0; i < depth; i++)
229         if(labels[i] != b.labels[i])
230             return false;
231     return true;
232 }
233 
234 bool Address::isClose(const Address &b, int level) const {
235     if (depth != b.depth)
236         return false;
237     if ((unsigned)level >= depth)
238         return true;
239     for (unsigned i = 0; i < (depth - level); i++)
240         if(labels[i] != b.labels[i])
241             return false;
242     return true;
243 }
244 
245 bool Address::operator!=(const Address &b) const {
246     return !operator==(b);
247 }
248 
249 class AddrUnsPair {
250 public:
251     Address first;
252     unsigned second;
253     AddrUnsPair(Address _first, unsigned _second);
254     AddrUnsPair &operator=(const AddrUnsPair &b);
255 };
256 
257 AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
258   : first(_first), second(_second)
259 {
260 }
261 
262 AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
263 {
264     first = b.first;
265     second = b.second;
266     return *this;
267 }
268 
269 # endif /* !defined(KMP_DEBUG) && !defined(COVER) */
270 
271 
272 static int
273 __kmp_affinity_cmp_Address_labels(const void *a, const void *b)
274 {
275     const Address *aa = (const Address *)&(((AddrUnsPair *)a)
276       ->first);
277     const Address *bb = (const Address *)&(((AddrUnsPair *)b)
278       ->first);
279     unsigned depth = aa->depth;
280     unsigned i;
281     KMP_DEBUG_ASSERT(depth == bb->depth);
282     for (i  = 0; i < depth; i++) {
283         if (aa->labels[i] < bb->labels[i]) return -1;
284         if (aa->labels[i] > bb->labels[i]) return 1;
285     }
286     return 0;
287 }
288 
289 
290 static int
291 __kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
292 {
293     const Address *aa = (const Address *)&(((AddrUnsPair *)a)
294       ->first);
295     const Address *bb = (const Address *)&(((AddrUnsPair *)b)
296       ->first);
297     unsigned depth = aa->depth;
298     unsigned i;
299     KMP_DEBUG_ASSERT(depth == bb->depth);
300     KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
301     KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
302     for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
303         int j = depth - i - 1;
304         if (aa->childNums[j] < bb->childNums[j]) return -1;
305         if (aa->childNums[j] > bb->childNums[j]) return 1;
306     }
307     for (; i < depth; i++) {
308         int j = i - __kmp_affinity_compact;
309         if (aa->childNums[j] < bb->childNums[j]) return -1;
310         if (aa->childNums[j] > bb->childNums[j]) return 1;
311     }
312     return 0;
313 }
314 
315 /** A structure for holding machine-specific hierarchy info to be computed once at init.
316     This structure represents a mapping of threads to the actual machine hierarchy, or to
317     our best guess at what the hierarchy might be, for the purpose of performing an
318     efficient barrier.  In the worst case, when there is no machine hierarchy information,
319     it produces a tree suitable for a barrier, similar to the tree used in the hyper barrier. */
320 class hierarchy_info {
321 public:
322     /** Number of levels in the hierarchy. Typical levels are threads/core, cores/package
323     or socket, packages/node, nodes/machine, etc.  We don't want to get specific with
324     nomenclature.  When the machine is oversubscribed we add levels to duplicate the
325     hierarchy, doubling the thread capacity of the hierarchy each time we add a level. */
326     kmp_uint32 maxLevels;
327 
328     /** This is specifically the depth of the machine configuration hierarchy, in terms of the
329         number of levels along the longest path from root to any leaf. It corresponds to the
330         number of entries in numPerLevel if we exclude all but one trailing 1. */
331     kmp_uint32 depth;
332     kmp_uint32 base_num_threads;
333     volatile kmp_int8 uninitialized; // 0=initialized, 1=uninitialized, 2=initialization in progress
334     volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
335 
336     /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a
337         node at level i has. For example, if we have a machine with 4 packages, 4 cores/package
338         and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}. All empty levels are set to 1. */
339     kmp_uint32 *numPerLevel;
340     kmp_uint32 *skipPerLevel;
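
    // A note on skipPerLevel (illustrative): skipPerLevel[i] is the number of leaves
    // covered by one node at level i, i.e. the running product of numPerLevel[0..i-1].
    // For the numPerLevel = {2, 4, 4, 1, ...} example above, skipPerLevel becomes
    // {1, 2, 8, 32, ...}; entries above depth are filled in by doubling (see init()
    // and resize() below).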
341 
342     void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
343         int hier_depth = adr2os[0].first.depth;
344         int level = 0;
345         for (int i=hier_depth-1; i>=0; --i) {
346             int max = -1;
347             for (int j=0; j<num_addrs; ++j) {
348                 int next = adr2os[j].first.childNums[i];
349                 if (next > max) max = next;
350             }
351             numPerLevel[level] = max+1;
352             ++level;
353         }
354     }
355 
356     hierarchy_info() : maxLevels(7), depth(1), uninitialized(1), resizing(0) {}
357 
358     // TO FIX: This destructor causes a segfault in the library at shutdown.
359     //~hierarchy_info() { if (!uninitialized && numPerLevel) __kmp_free(numPerLevel); }
360 
361     void init(AddrUnsPair *adr2os, int num_addrs)
362     {
363         kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&uninitialized, 1, 2);
364         if (bool_result == 0) { // Wait for initialization
365             while (TCR_1(uninitialized) != 0) KMP_CPU_PAUSE();
366             return;
367         }
368         KMP_DEBUG_ASSERT(bool_result==1);
369 
        /* Added explicit initialization of the data fields here to prevent use of dirty
           values observed when the static library is re-initialized multiple times (e.g. when
           a non-OpenMP thread repeatedly launches/joins a thread that uses OpenMP). */
373         depth = 1;
374         resizing = 0;
375         maxLevels = 7;
376         numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
377         skipPerLevel = &(numPerLevel[maxLevels]);
378         for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
379             numPerLevel[i] = 1;
380             skipPerLevel[i] = 1;
381         }
382 
383         // Sort table by physical ID
384         if (adr2os) {
385             qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
386             deriveLevels(adr2os, num_addrs);
387         }
388         else {
389             numPerLevel[0] = 4;
390             numPerLevel[1] = num_addrs/4;
391             if (num_addrs%4) numPerLevel[1]++;
392         }
393 
394         base_num_threads = num_addrs;
395         for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
396             if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
397                 depth++;
398 
399         kmp_uint32 branch = 4;
400         if (numPerLevel[0] == 1) branch = num_addrs/4;
401         if (branch<4) branch=4;
402         for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
403             while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
404                 if (numPerLevel[d] & 1) numPerLevel[d]++;
405                 numPerLevel[d] = numPerLevel[d] >> 1;
406                 if (numPerLevel[d+1] == 1) depth++;
407                 numPerLevel[d+1] = numPerLevel[d+1] << 1;
408             }
409             if(numPerLevel[0] == 1) {
410                 branch = branch >> 1;
411                 if (branch<4) branch = 4;
412             }
413         }
414 
415         for (kmp_uint32 i=1; i<depth; ++i)
416             skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];
417         // Fill in hierarchy in the case of oversubscription
418         for (kmp_uint32 i=depth; i<maxLevels; ++i)
419             skipPerLevel[i] = 2*skipPerLevel[i-1];
420 
421         uninitialized = 0; // One writer
422 
423     }
424 
425     void resize(kmp_uint32 nproc)
426     {
427         kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
428         if (bool_result == 0) { // Someone else is resizing
429             while (TCR_1(resizing) != 0) KMP_CPU_PAUSE();
430             return;
431         }
432         KMP_DEBUG_ASSERT(bool_result!=0);
433         KMP_DEBUG_ASSERT(nproc > base_num_threads);
434 
435         // Calculate new max_levels
436         kmp_uint32 old_sz = skipPerLevel[depth-1];
437         kmp_uint32 incs = 0, old_maxLevels= maxLevels;
438         while (nproc > old_sz) {
439             old_sz *=2;
440             incs++;
441         }
442         maxLevels += incs;
443 
444         // Resize arrays
445         kmp_uint32 *old_numPerLevel = numPerLevel;
446         kmp_uint32 *old_skipPerLevel = skipPerLevel;
447         numPerLevel = skipPerLevel = NULL;
448         numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
449         skipPerLevel = &(numPerLevel[maxLevels]);
450 
451         // Copy old elements from old arrays
452         for (kmp_uint32 i=0; i<old_maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
453             numPerLevel[i] = old_numPerLevel[i];
454             skipPerLevel[i] = old_skipPerLevel[i];
455         }
456 
457         // Init new elements in arrays to 1
458         for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
459             numPerLevel[i] = 1;
460             skipPerLevel[i] = 1;
461         }
462 
463         // Free old arrays
464         __kmp_free(old_numPerLevel);
465 
466         // Fill in oversubscription levels of hierarchy
467         for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i)
468             skipPerLevel[i] = 2*skipPerLevel[i-1];
469 
470         base_num_threads = nproc;
471         resizing = 0; // One writer
472 
473     }
474 };
475 
476 static hierarchy_info machine_hierarchy;
477 
478 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
479     kmp_uint32 depth;
    // machine_hierarchy may still be uninitialized here, e.g. when affinity is available
    // but set to "none"; initialize it on first use of the hierarchical barrier.
481     if (TCR_1(machine_hierarchy.uninitialized))
482         machine_hierarchy.init(NULL, nproc);
483     // Adjust the hierarchy in case num threads exceeds original
484     if (nproc > machine_hierarchy.base_num_threads)
485         machine_hierarchy.resize(nproc);
486 
487     depth = machine_hierarchy.depth;
488     KMP_DEBUG_ASSERT(depth > 0);
489     // The loop below adjusts the depth in the case of a resize
490     while (nproc > machine_hierarchy.skipPerLevel[depth-1])
491         depth++;
492 
493     thr_bar->depth = depth;
494     thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
495     thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
496 }
497 
498 //
499 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
500 // called to renumber the labels from [0..n] and place them into the child_num
501 // vector of the address object.  This is done in case the labels used for
502 // the children at one node of the hierarchy differ from those used for
503 // another node at the same level.  Example:  suppose the machine has 2 nodes
504 // with 2 packages each.  The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604.  If we try to sort the table
506 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
507 // because we are paying attention to the labels themselves, not the ordinal
508 // child numbers.  By using the child numbers in the sort, the result is
509 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
510 //
511 static void
512 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
513   int numAddrs)
514 {
515     KMP_DEBUG_ASSERT(numAddrs > 0);
516     int depth = address2os->first.depth;
517     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
518     unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
519       * sizeof(unsigned));
520     int labCt;
521     for (labCt = 0; labCt < depth; labCt++) {
522         address2os[0].first.childNums[labCt] = counts[labCt] = 0;
523         lastLabel[labCt] = address2os[0].first.labels[labCt];
524     }
525     int i;
526     for (i = 1; i < numAddrs; i++) {
527         for (labCt = 0; labCt < depth; labCt++) {
528             if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
529                 int labCt2;
530                 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
531                     counts[labCt2] = 0;
532                     lastLabel[labCt2] = address2os[i].first.labels[labCt2];
533                 }
534                 counts[labCt]++;
535                 lastLabel[labCt] = address2os[i].first.labels[labCt];
536                 break;
537             }
538         }
539         for (labCt = 0; labCt < depth; labCt++) {
540             address2os[i].first.childNums[labCt] = counts[labCt];
541         }
542         for (; labCt < (int)Address::maxDepth; labCt++) {
543             address2os[i].first.childNums[labCt] = 0;
544         }
545     }
546 }
547 
548 
549 //
550 // All of the __kmp_affinity_create_*_map() routines should set
551 // __kmp_affinity_masks to a vector of affinity mask objects of length
552 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
553 // return the number of levels in the machine topology tree (zero if
554 // __kmp_affinity_type == affinity_none).
555 //
556 // All of the __kmp_affinity_create_*_map() routines should set *fullMask
// to the affinity mask for the initialization thread.  They need to save and
// restore the mask anyway, and the mask may be needed later, so saving it here
// is just an optimization to avoid calling __kmp_get_system_affinity() again.
560 //
561 static kmp_affin_mask_t *fullMask = NULL;
562 
563 kmp_affin_mask_t *
564 __kmp_affinity_get_fullMask() { return fullMask; }
565 
566 
567 static int nCoresPerPkg, nPackages;
568 static int __kmp_nThreadsPerCore;
569 #ifndef KMP_DFLT_NTH_CORES
570 static int __kmp_ncores;
571 #endif
572 
573 //
574 // __kmp_affinity_uniform_topology() doesn't work when called from
575 // places which support arbitrarily many levels in the machine topology
576 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// and __kmp_affinity_create_x2apicid_map().
578 //
579 inline static bool
580 __kmp_affinity_uniform_topology()
581 {
582     return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
583 }
584 
585 
586 //
587 // Print out the detailed machine topology map, i.e. the physical locations
588 // of each OS proc.
589 //
590 static void
591 __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
592   int pkgLevel, int coreLevel, int threadLevel)
593 {
594     int proc;
595 
596     KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
597     for (proc = 0; proc < len; proc++) {
598         int level;
599         kmp_str_buf_t buf;
600         __kmp_str_buf_init(&buf);
601         for (level = 0; level < depth; level++) {
602             if (level == threadLevel) {
603                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
604             }
605             else if (level == coreLevel) {
606                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
607             }
608             else if (level == pkgLevel) {
609                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
610             }
611             else if (level > pkgLevel) {
612                 __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
613                   level - pkgLevel - 1);
614             }
615             else {
616                 __kmp_str_buf_print(&buf, "L%d ", level);
617             }
618             __kmp_str_buf_print(&buf, "%d ",
619               address2os[proc].first.labels[level]);
620         }
621         KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
622           buf.str);
623         __kmp_str_buf_free(&buf);
624     }
625 }
626 
627 
628 //
629 // If we don't know how to retrieve the machine's processor topology, or
630 // encounter an error in doing so, this routine is called to form a "flat"
631 // mapping of os thread id's <-> processor id's.
632 //
633 static int
634 __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
635   kmp_i18n_id_t *const msg_id)
636 {
637     *address2os = NULL;
638     *msg_id = kmp_i18n_null;
639 
640     //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ncores, as well as
643     // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
644     //
645     if (! KMP_AFFINITY_CAPABLE()) {
646         KMP_ASSERT(__kmp_affinity_type == affinity_none);
647         __kmp_ncores = nPackages = __kmp_xproc;
648         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
649         if (__kmp_affinity_verbose) {
650             KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
651             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
652             KMP_INFORM(Uniform, "KMP_AFFINITY");
653             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
654               __kmp_nThreadsPerCore, __kmp_ncores);
655         }
656         return 0;
657     }
658 
659     //
660     // When affinity is off, this routine will still be called to set
661     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
662     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
663     //  correctly, and return now if affinity is not enabled.
664     //
665     __kmp_ncores = nPackages = __kmp_avail_proc;
666     __kmp_nThreadsPerCore = nCoresPerPkg = 1;
667     if (__kmp_affinity_verbose) {
668         char buf[KMP_AFFIN_MASK_PRINT_LEN];
669         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
670 
671         KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
672         if (__kmp_affinity_respect_mask) {
673             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
674         } else {
675             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
676         }
677         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
678         KMP_INFORM(Uniform, "KMP_AFFINITY");
679         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
680           __kmp_nThreadsPerCore, __kmp_ncores);
681     }
682     if (__kmp_affinity_type == affinity_none) {
683         return 0;
684     }
685 
686     //
    // Construct the data structure to be returned.
688     //
689     *address2os = (AddrUnsPair*)
690       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
691     int avail_ct = 0;
692     unsigned int i;
693     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
694         //
695         // Skip this proc if it is not included in the machine model.
696         //
697         if (! KMP_CPU_ISSET(i, fullMask)) {
698             continue;
699         }
700 
701         Address addr(1);
702         addr.labels[0] = i;
703         (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
704     }
705     if (__kmp_affinity_verbose) {
706         KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
707     }
708 
709     if (__kmp_affinity_gran_levels < 0) {
710         //
711         // Only the package level is modeled in the machine topology map,
712         // so the #levels of granularity is either 0 or 1.
713         //
714         if (__kmp_affinity_gran > affinity_gran_package) {
715             __kmp_affinity_gran_levels = 1;
716         }
717         else {
718             __kmp_affinity_gran_levels = 0;
719         }
720     }
721     return 1;
722 }
723 
724 
725 # if KMP_GROUP_AFFINITY
726 
727 //
728 // If multiple Windows* OS processor groups exist, we can create a 2-level
729 // topology map with the groups at level 0 and the individual procs at
730 // level 1.
731 //
732 // This facilitates letting the threads float among all procs in a group,
733 // if granularity=group (the default when there are multiple groups).
734 //
735 static int
736 __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
737   kmp_i18n_id_t *const msg_id)
738 {
739     *address2os = NULL;
740     *msg_id = kmp_i18n_null;
741 
742     //
743     // If we don't have multiple processor groups, return now.
744     // The flat mapping will be used.
745     //
746     if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
747         // FIXME set *msg_id
748         return -1;
749     }
750 
751     //
    // Construct the data structure to be returned.
753     //
754     *address2os = (AddrUnsPair*)
755       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
756     int avail_ct = 0;
757     int i;
758     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
759         //
760         // Skip this proc if it is not included in the machine model.
761         //
762         if (! KMP_CPU_ISSET(i, fullMask)) {
763             continue;
764         }
765 
766         Address addr(2);
767         addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
768         addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
769         (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
770 
771         if (__kmp_affinity_verbose) {
772             KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
773               addr.labels[1]);
774         }
775     }
776 
777     if (__kmp_affinity_gran_levels < 0) {
778         if (__kmp_affinity_gran == affinity_gran_group) {
779             __kmp_affinity_gran_levels = 1;
780         }
781         else if ((__kmp_affinity_gran == affinity_gran_fine)
782           || (__kmp_affinity_gran == affinity_gran_thread)) {
783             __kmp_affinity_gran_levels = 0;
784         }
785         else {
786             const char *gran_str = NULL;
787             if (__kmp_affinity_gran == affinity_gran_core) {
788                 gran_str = "core";
789             }
790             else if (__kmp_affinity_gran == affinity_gran_package) {
791                 gran_str = "package";
792             }
793             else if (__kmp_affinity_gran == affinity_gran_node) {
794                 gran_str = "node";
795             }
796             else {
797                 KMP_ASSERT(0);
798             }
799 
            // Warning: can't use affinity granularity "<gran_str>" with the group
            // topology method; using "thread" granularity instead.
801             __kmp_affinity_gran_levels = 0;
802         }
803     }
804     return 2;
805 }
806 
807 # endif /* KMP_GROUP_AFFINITY */
808 
809 
810 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
811 
812 static int
813 __kmp_cpuid_mask_width(int count) {
814     int r = 0;
815 
816     while((1<<r) < count)
817         ++r;
818     return r;
819 }
820 
821 
822 class apicThreadInfo {
823 public:
824     unsigned osId;              // param to __kmp_affinity_bind_thread
825     unsigned apicId;            // from cpuid after binding
826     unsigned maxCoresPerPkg;    //      ""
827     unsigned maxThreadsPerPkg;  //      ""
828     unsigned pkgId;             // inferred from above values
829     unsigned coreId;            //      ""
830     unsigned threadId;          //      ""
831 };
832 
833 
834 static int
835 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
836 {
837     const apicThreadInfo *aa = (const apicThreadInfo *)a;
838     const apicThreadInfo *bb = (const apicThreadInfo *)b;
839     if (aa->osId < bb->osId) return -1;
840     if (aa->osId > bb->osId) return 1;
841     return 0;
842 }
843 
844 
845 static int
846 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
847 {
848     const apicThreadInfo *aa = (const apicThreadInfo *)a;
849     const apicThreadInfo *bb = (const apicThreadInfo *)b;
850     if (aa->pkgId < bb->pkgId) return -1;
851     if (aa->pkgId > bb->pkgId) return 1;
852     if (aa->coreId < bb->coreId) return -1;
853     if (aa->coreId > bb->coreId) return 1;
854     if (aa->threadId < bb->threadId) return -1;
855     if (aa->threadId > bb->threadId) return 1;
856     return 0;
857 }
858 
859 
860 //
861 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available OS procs, binding the
// current thread to each one in turn, and then retrieving the Apic Id for
// each thread context using the cpuid instruction.
865 //
866 static int
867 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
868   kmp_i18n_id_t *const msg_id)
869 {
870     kmp_cpuid buf;
871     int rc;
872     *address2os = NULL;
873     *msg_id = kmp_i18n_null;
874 
875     //
876     // Check if cpuid leaf 4 is supported.
877     //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 4) {
        *msg_id = kmp_i18n_str_NoLeaf4Support;
        return -1;
    }
883 
884     //
885     // The algorithm used starts by setting the affinity to each available
886     // thread and retrieving info from the cpuid instruction, so if we are
887     // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
889     // the defaults that we calculated from issuing cpuid without binding
890     // to each proc.
891     //
892     if (! KMP_AFFINITY_CAPABLE()) {
893         //
894         // Hack to try and infer the machine topology using only the data
895         // available from cpuid on the current thread, and __kmp_xproc.
896         //
897         KMP_ASSERT(__kmp_affinity_type == affinity_none);
898 
899         //
900         // Get an upper bound on the number of threads per package using
901         // cpuid(1).
902         //
        // On some OS/chip combinations where HT is supported by the chip
904         // but is disabled, this value will be 2 on a single core chip.
905         // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
906         //
907         __kmp_x86_cpuid(1, 0, &buf);
908         int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
909         if (maxThreadsPerPkg == 0) {
910             maxThreadsPerPkg = 1;
911         }
912 
913         //
914         // The num cores per pkg comes from cpuid(4).
915         // 1 must be added to the encoded value.
916         //
        // The author of cpu_count.cpp treated this as only an upper bound
918         // on the number of cores, but I haven't seen any cases where it
919         // was greater than the actual number of cores, so we will treat
920         // it as exact in this block of code.
921         //
922         // First, we need to check if cpuid(4) is supported on this chip.
923         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
924         // has the value n or greater.
925         //
926         __kmp_x86_cpuid(0, 0, &buf);
927         if (buf.eax >= 4) {
928             __kmp_x86_cpuid(4, 0, &buf);
929             nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
930         }
931         else {
932             nCoresPerPkg = 1;
933         }
934 
935         //
936         // There is no way to reliably tell if HT is enabled without issuing
        // the cpuid instruction from every thread and correlating the cpuid
938         // info, so if the machine is not affinity capable, we assume that HT
939         // is off.  We have seen quite a few machines where maxThreadsPerPkg
940         // is 2, yet the machine does not support HT.
941         //
942         // - Older OSes are usually found on machines with older chips, which
943         //   do not support HT.
944         //
945         // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly set
        //   to 0) is greater than the penalty for mistakenly identifying
948         //   a machine as being 1 thread/core when it is really HT enabled
949         //   (which results in blocktime being incorrectly set to a positive
950         //   value).
951         //
952         __kmp_ncores = __kmp_xproc;
953         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
954         __kmp_nThreadsPerCore = 1;
955         if (__kmp_affinity_verbose) {
956             KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
957             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
958             if (__kmp_affinity_uniform_topology()) {
959                 KMP_INFORM(Uniform, "KMP_AFFINITY");
960             } else {
961                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
962             }
963             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
964               __kmp_nThreadsPerCore, __kmp_ncores);
965         }
966         return 0;
967     }
968 
969     //
970     //
971     // From here on, we can assume that it is safe to call
972     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
973     // even if __kmp_affinity_type = affinity_none.
974     //
975 
976     //
977     // Save the affinity mask for the current thread.
978     //
979     kmp_affin_mask_t *oldMask;
980     KMP_CPU_ALLOC(oldMask);
981     KMP_ASSERT(oldMask != NULL);
982     __kmp_get_system_affinity(oldMask, TRUE);
983 
984     //
985     // Run through each of the available contexts, binding the current thread
986     // to it, and obtaining the pertinent information using the cpuid instr.
987     //
988     // The relevant information is:
989     //
990     // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //    has a unique Apic Id, which is of the form pkg# : core# : thread#.
992     //
993     // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
994     //    value of this field determines the width of the core# + thread#
995     //    fields in the Apic Id.  It is also an upper bound on the number
996     //    of threads per package, but it has been verified that situations
    //    happen where it is not exact.  In particular, on certain OS/chip
998     //    combinations where Intel(R) Hyper-Threading Technology is supported
999     //    by the chip but has
1000     //    been disabled, the value of this field will be 2 (for a single core
1001     //    chip).  On other OS/chip combinations supporting
1002     //    Intel(R) Hyper-Threading Technology, the value of
1003     //    this field will be 1 when Intel(R) Hyper-Threading Technology is
1004     //    disabled and 2 when it is enabled.
1005     //
1006     // Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4).  The
1007     //    value of this field (+1) determines the width of the core# field in
1008     //    the Apic Id.  The comments in "cpucount.cpp" say that this value is
1009     //    an upper bound, but the IA-32 architecture manual says that it is
1010     //    exactly the number of cores per package, and I haven't seen any
1011     //    case where it wasn't.
1012     //
1013     // From this information, deduce the package Id, core Id, and thread Id,
1014     // and set the corresponding fields in the apicThreadInfo struct.
1015     //
1016     unsigned i;
1017     apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
1018       __kmp_avail_proc * sizeof(apicThreadInfo));
1019     unsigned nApics = 0;
1020     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
1021         //
1022         // Skip this proc if it is not included in the machine model.
1023         //
1024         if (! KMP_CPU_ISSET(i, fullMask)) {
1025             continue;
1026         }
1027         KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
1028 
1029         __kmp_affinity_bind_thread(i);
1030         threadInfo[nApics].osId = i;
1031 
1032         //
1033         // The apic id and max threads per pkg come from cpuid(1).
1034         //
1035         __kmp_x86_cpuid(1, 0, &buf);
        // CPUID.1:EDX bit 9 indicates that the local APIC is present.
        if (((buf.edx >> 9) & 1) == 0) {
1037             __kmp_set_system_affinity(oldMask, TRUE);
1038             __kmp_free(threadInfo);
1039             KMP_CPU_FREE(oldMask);
1040             *msg_id = kmp_i18n_str_ApicNotPresent;
1041             return -1;
1042         }
1043         threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
1044         threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
1045         if (threadInfo[nApics].maxThreadsPerPkg == 0) {
1046             threadInfo[nApics].maxThreadsPerPkg = 1;
1047         }
1048 
1049         //
1050         // Max cores per pkg comes from cpuid(4).
1051         // 1 must be added to the encoded value.
1052         //
1053         // First, we need to check if cpuid(4) is supported on this chip.
1054         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
1055         // has the value n or greater.
1056         //
1057         __kmp_x86_cpuid(0, 0, &buf);
1058         if (buf.eax >= 4) {
1059             __kmp_x86_cpuid(4, 0, &buf);
1060             threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1061         }
1062         else {
1063             threadInfo[nApics].maxCoresPerPkg = 1;
1064         }
1065 
1066         //
1067         // Infer the pkgId / coreId / threadId using only the info
1068         // obtained locally.
1069         //
1070         int widthCT = __kmp_cpuid_mask_width(
1071           threadInfo[nApics].maxThreadsPerPkg);
1072         threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
1073 
1074         int widthC = __kmp_cpuid_mask_width(
1075           threadInfo[nApics].maxCoresPerPkg);
1076         int widthT = widthCT - widthC;
1077         if (widthT < 0) {
1078             //
1079             // I've never seen this one happen, but I suppose it could, if
1080             // the cpuid instruction on a chip was really screwed up.
1081             // Make sure to restore the affinity mask before the tail call.
1082             //
1083             __kmp_set_system_affinity(oldMask, TRUE);
1084             __kmp_free(threadInfo);
1085             KMP_CPU_FREE(oldMask);
1086             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1087             return -1;
1088         }
1089 
1090         int maskC = (1 << widthC) - 1;
1091         threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
1092           &maskC;
1093 
1094         int maskT = (1 << widthT) - 1;
1095         threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT;
1096 
1097         nApics++;
1098     }
1099 
1100     //
1101     // We've collected all the info we need.
1102     // Restore the old affinity mask for this thread.
1103     //
1104     __kmp_set_system_affinity(oldMask, TRUE);
1105 
1106     //
1107     // If there's only one thread context to bind to, form an Address object
1108     // with depth 1 and return immediately (or, if affinity is off, set
1109     // address2os to NULL and return).
1110     //
1111     // If it is configured to omit the package level when there is only a
1112     // single package, the logic at the end of this routine won't work if
1113     // there is only a single thread - it would try to form an Address
1114     // object with depth 0.
1115     //
1116     KMP_ASSERT(nApics > 0);
1117     if (nApics == 1) {
1118         __kmp_ncores = nPackages = 1;
1119         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1120         if (__kmp_affinity_verbose) {
1121             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1122             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1123 
1124             KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1125             if (__kmp_affinity_respect_mask) {
1126                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1127             } else {
1128                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1129             }
1130             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1131             KMP_INFORM(Uniform, "KMP_AFFINITY");
1132             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1133               __kmp_nThreadsPerCore, __kmp_ncores);
1134         }
1135 
1136         if (__kmp_affinity_type == affinity_none) {
1137             __kmp_free(threadInfo);
1138             KMP_CPU_FREE(oldMask);
1139             return 0;
1140         }
1141 
1142         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
1143         Address addr(1);
1144         addr.labels[0] = threadInfo[0].pkgId;
1145         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
1146 
1147         if (__kmp_affinity_gran_levels < 0) {
1148             __kmp_affinity_gran_levels = 0;
1149         }
1150 
1151         if (__kmp_affinity_verbose) {
1152             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1153         }
1154 
1155         __kmp_free(threadInfo);
1156         KMP_CPU_FREE(oldMask);
1157         return 1;
1158     }
1159 
1160     //
1161     // Sort the threadInfo table by physical Id.
1162     //
1163     qsort(threadInfo, nApics, sizeof(*threadInfo),
1164       __kmp_affinity_cmp_apicThreadInfo_phys_id);
1165 
1166     //
1167     // The table is now sorted by pkgId / coreId / threadId, but we really
1168     // don't know the radix of any of the fields.  pkgId's may be sparsely
1169     // assigned among the chips on a system.  Although coreId's are usually
1170     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1171     // [0..threadsPerCore-1], we don't want to make any such assumptions.
1172     //
1173     // For that matter, we don't know what coresPerPkg and threadsPerCore
1174     // (or the total # packages) are at this point - we want to determine
1175     // that now.  We only have an upper bound on the first two figures.
1176     //
1177     // We also perform a consistency check at this point: the values returned
1178     // by the cpuid instruction for any thread bound to a given package had
1179     // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1180     //
1181     nPackages = 1;
1182     nCoresPerPkg = 1;
1183     __kmp_nThreadsPerCore = 1;
1184     unsigned nCores = 1;
1185 
1186     unsigned pkgCt = 1;                         // to determine radii
1187     unsigned lastPkgId = threadInfo[0].pkgId;
1188     unsigned coreCt = 1;
1189     unsigned lastCoreId = threadInfo[0].coreId;
1190     unsigned threadCt = 1;
1191     unsigned lastThreadId = threadInfo[0].threadId;
1192 
1193                                                 // intra-pkg consist checks
1194     unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1195     unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1196 
1197     for (i = 1; i < nApics; i++) {
1198         if (threadInfo[i].pkgId != lastPkgId) {
1199             nCores++;
1200             pkgCt++;
1201             lastPkgId = threadInfo[i].pkgId;
1202             if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1203             coreCt = 1;
1204             lastCoreId = threadInfo[i].coreId;
1205             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1206             threadCt = 1;
1207             lastThreadId = threadInfo[i].threadId;
1208 
1209             //
1210             // This is a different package, so go on to the next iteration
1211             // without doing any consistency checks.  Reset the consistency
1212             // check vars, though.
1213             //
1214             prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1215             prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1216             continue;
1217         }
1218 
1219         if (threadInfo[i].coreId != lastCoreId) {
1220             nCores++;
1221             coreCt++;
1222             lastCoreId = threadInfo[i].coreId;
1223             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1224             threadCt = 1;
1225             lastThreadId = threadInfo[i].threadId;
1226         }
1227         else if (threadInfo[i].threadId != lastThreadId) {
1228             threadCt++;
1229             lastThreadId = threadInfo[i].threadId;
1230         }
1231         else {
1232             __kmp_free(threadInfo);
1233             KMP_CPU_FREE(oldMask);
1234             *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1235             return -1;
1236         }
1237 
1238         //
1239         // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
        // fields agree between all the threads bound to a given package.
1241         //
1242         if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
1243           || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1244             __kmp_free(threadInfo);
1245             KMP_CPU_FREE(oldMask);
1246             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1247             return -1;
1248         }
1249     }
1250     nPackages = pkgCt;
1251     if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1252     if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1253 
1254     //
1255     // When affinity is off, this routine will still be called to set
1256     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1257     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1258     // correctly, and return now if affinity is not enabled.
1259     //
1260     __kmp_ncores = nCores;
1261     if (__kmp_affinity_verbose) {
1262         char buf[KMP_AFFIN_MASK_PRINT_LEN];
1263         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1264 
1265         KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1266         if (__kmp_affinity_respect_mask) {
1267             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1268         } else {
1269             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1270         }
1271         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1272         if (__kmp_affinity_uniform_topology()) {
1273             KMP_INFORM(Uniform, "KMP_AFFINITY");
1274         } else {
1275             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1276         }
1277         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1278           __kmp_nThreadsPerCore, __kmp_ncores);
1279 
1280     }
1281 
1282     if (__kmp_affinity_type == affinity_none) {
1283         __kmp_free(threadInfo);
1284         KMP_CPU_FREE(oldMask);
1285         return 0;
1286     }
1287 
1288     //
1289     // Now that we've determined the number of packages, the number of cores
1290     // per package, and the number of threads per core, we can construct the
1291     // data structure that is to be returned.
1292     //
1293     int pkgLevel = 0;
1294     int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1295     int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1296     unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
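
    //
    // For example (illustrative): on a machine with multiple cores per package and
    // 2 threads per core, pkgLevel = 0, coreLevel = 1, threadLevel = 2 and depth = 3;
    // on a single-core part without HT, coreLevel and threadLevel are both -1 and
    // depth = 1.
    //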
1297 
1298     KMP_ASSERT(depth > 0);
1299     *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1300 
1301     for (i = 0; i < nApics; ++i) {
1302         Address addr(depth);
1303         unsigned os = threadInfo[i].osId;
1304         int d = 0;
1305 
1306         if (pkgLevel >= 0) {
1307             addr.labels[d++] = threadInfo[i].pkgId;
1308         }
1309         if (coreLevel >= 0) {
1310             addr.labels[d++] = threadInfo[i].coreId;
1311         }
1312         if (threadLevel >= 0) {
1313             addr.labels[d++] = threadInfo[i].threadId;
1314         }
1315         (*address2os)[i] = AddrUnsPair(addr, os);
1316     }
1317 
1318     if (__kmp_affinity_gran_levels < 0) {
1319         //
1320         // Set the granularity level based on what levels are modeled
1321         // in the machine topology map.
1322         //
1323         __kmp_affinity_gran_levels = 0;
1324         if ((threadLevel >= 0)
1325           && (__kmp_affinity_gran > affinity_gran_thread)) {
1326             __kmp_affinity_gran_levels++;
1327         }
1328         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1329             __kmp_affinity_gran_levels++;
1330         }
1331         if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1332             __kmp_affinity_gran_levels++;
1333         }
1334     }
1335 
1336     if (__kmp_affinity_verbose) {
1337         __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1338           coreLevel, threadLevel);
1339     }
1340 
1341     __kmp_free(threadInfo);
1342     KMP_CPU_FREE(oldMask);
1343     return depth;
1344 }
1345 
1346 
1347 //
1348 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1349 // architectures support a newer interface for specifying the x2APIC Ids,
1350 // based on cpuid leaf 11.
1351 //
1352 static int
1353 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1354   kmp_i18n_id_t *const msg_id)
1355 {
1356     kmp_cpuid buf;
1357 
1358     *address2os = NULL;
1359     *msg_id = kmp_i18n_null;
1360 
1361     //
1362     // Check to see if cpuid leaf 11 is supported.
1363     //
1364     __kmp_x86_cpuid(0, 0, &buf);
1365     if (buf.eax < 11) {
1366         *msg_id = kmp_i18n_str_NoLeaf11Support;
1367         return -1;
1368     }
1369     __kmp_x86_cpuid(11, 0, &buf);
1370     if (buf.ebx == 0) {
1371         *msg_id = kmp_i18n_str_NoLeaf11Support;
1372         return -1;
1373     }
1374 
1375     //
1376     // Find the number of levels in the machine topology.  While we're at it,
1377     // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.  We will
1378     // try to get more accurate values later by explicitly counting them,
1379     // but get reasonable defaults now, in case we return early.
1380     //
1381     int level;
1382     int threadLevel = -1;
1383     int coreLevel = -1;
1384     int pkgLevel = -1;
1385     __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1386 
1387     for (level = 0;; level++) {
1388         if (level > 31) {
1389             //
1390             // FIXME: Hack for DPD200163180
1391             //
1392             // If level is big then something went wrong -> exiting
1393             //
1394             // There could actually be 32 valid levels in the machine topology,
1395             // but so far, the only machine we have seen which does not exit
1396             // this loop before iteration 32 has fubar x2APIC settings.
1397             //
1398             // For now, just reject this case based upon loop trip count.
1399             //
1400             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1401             return -1;
1402         }
1403         __kmp_x86_cpuid(11, level, &buf);
1404         if (buf.ebx == 0) {
1405             if (pkgLevel < 0) {
1406                 //
1407                 // Will infer nPackages from __kmp_xproc
1408                 //
1409                 pkgLevel = level;
1410                 level++;
1411             }
1412             break;
1413         }
1414         int kind = (buf.ecx >> 8) & 0xff;
1415         if (kind == 1) {
1416             //
1417             // SMT level
1418             //
1419             threadLevel = level;
1420             coreLevel = -1;
1421             pkgLevel = -1;
1422             __kmp_nThreadsPerCore = buf.ebx & 0xff;
1423             if (__kmp_nThreadsPerCore == 0) {
1424                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1425                 return -1;
1426             }
1427         }
1428         else if (kind == 2) {
1429             //
1430             // core level
1431             //
1432             coreLevel = level;
1433             pkgLevel = -1;
1434             nCoresPerPkg = buf.ebx & 0xff;
1435             if (nCoresPerPkg == 0) {
1436                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1437                 return -1;
1438             }
1439         }
1440         else {
1441             if (level <= 0) {
1442                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1443                 return -1;
1444             }
1445             if (pkgLevel >= 0) {
1446                 continue;
1447             }
1448             pkgLevel = level;
1449             nPackages = buf.ebx & 0xff;
1450             if (nPackages == 0) {
1451                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1452                 return -1;
1453             }
1454         }
1455     }
1456     int depth = level;
1457 
1458     //
1459     // In the above loop, "level" was counted from the finest level (usually
1460     // thread) to the coarsest.  The caller expects that we will place the
1461     // labels in (*address2os)[].first.labels[] in the inverse order, so
1462     // we need to invert the vars saying which level means what.
1463     //
1464     if (threadLevel >= 0) {
1465         threadLevel = depth - threadLevel - 1;
1466     }
1467     if (coreLevel >= 0) {
1468         coreLevel = depth - coreLevel - 1;
1469     }
1470     KMP_DEBUG_ASSERT(pkgLevel >= 0);
1471     pkgLevel = depth - pkgLevel - 1;
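
    //
    // For example (illustrative): if the leaf 11 enumeration above found
    // threadLevel = 0, coreLevel = 1 and pkgLevel = 2 with depth = 3, then after
    // the inversion threadLevel = 2, coreLevel = 1 and pkgLevel = 0, matching the
    // outermost-first label order used in (*address2os)[].first.labels[].
    //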
1472 
1473     //
1474     // The algorithm used starts by setting the affinity to each available
1475     // thread and retrieving info from the cpuid instruction, so if we are
1476     // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
1478     // the defaults that we calculated from issuing cpuid without binding
1479     // to each proc.
1480     //
1481     if (! KMP_AFFINITY_CAPABLE())
1482     {
1483         //
1484         // Hack to try and infer the machine topology using only the data
1485         // available from cpuid on the current thread, and __kmp_xproc.
1486         //
1487         KMP_ASSERT(__kmp_affinity_type == affinity_none);
1488 
1489         __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1490         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
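        //
        // For example (hypothetical values): with __kmp_xproc = 16, 1 thread
        // per core and 4 cores per package reported by cpuid, this yields
        // __kmp_ncores = 16 and nPackages = 4 (the division rounds up).
        //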
1491         if (__kmp_affinity_verbose) {
1492             KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1493             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1494             if (__kmp_affinity_uniform_topology()) {
1495                 KMP_INFORM(Uniform, "KMP_AFFINITY");
1496             } else {
1497                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1498             }
1499             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1500               __kmp_nThreadsPerCore, __kmp_ncores);
1501         }
1502         return 0;
1503     }
1504 
1505     //
1507     // From here on, we can assume that it is safe to call
1508     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1509     // even if __kmp_affinity_type = affinity_none.
1510     //
1511 
1512     //
1513     // Save the affinity mask for the current thread.
1514     //
1515     kmp_affin_mask_t *oldMask;
1516     KMP_CPU_ALLOC(oldMask);
1517     __kmp_get_system_affinity(oldMask, TRUE);
1518 
1519     //
1520     // Allocate the data structure to be returned.
1521     //
1522     AddrUnsPair *retval = (AddrUnsPair *)
1523       __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1524 
1525     //
1526     // Run through each of the available contexts, binding the current thread
1527     // to it, and obtaining the pertinent information using the cpuid instr.
1528     //
1529     unsigned int proc;
1530     int nApics = 0;
1531     for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
1532         //
1533         // Skip this proc if it is not included in the machine model.
1534         //
1535         if (! KMP_CPU_ISSET(proc, fullMask)) {
1536             continue;
1537         }
1538         KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1539 
1540         __kmp_affinity_bind_thread(proc);
1541 
1542         //
1543         // Extract the labels for each level in the machine topology map
1544         // from the Apic ID.
1545         //
1546         Address addr(depth);
1547         int prev_shift = 0;
1548 
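        //
        // Illustrative example (hypothetical shift values): if the SMT level
        // reports shift = 1 and the core level reports shift = 4, the thread
        // label is (apicId & 0x1), the core label is (apicId & 0xf) >> 1, and
        // the package label is apicId >> 4.
        //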
1549         for (level = 0; level < depth; level++) {
1550             __kmp_x86_cpuid(11, level, &buf);
1551             unsigned apicId = buf.edx;
1552             if (buf.ebx == 0) {
1553                 if (level != depth - 1) {
1554                     KMP_CPU_FREE(oldMask);
1555                     *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1556                     return -1;
1557                 }
1558                 addr.labels[depth - level - 1] = apicId >> prev_shift;
1559                 level++;
1560                 break;
1561             }
1562             int shift = buf.eax & 0x1f;
1563             int mask = (1 << shift) - 1;
1564             addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1565             prev_shift = shift;
1566         }
1567         if (level != depth) {
1568             KMP_CPU_FREE(oldMask);
1569             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1570             return -1;
1571         }
1572 
1573         retval[nApics] = AddrUnsPair(addr, proc);
1574         nApics++;
1575     }
1576 
1577     //
1578     // We've collected all the info we need.
1579     // Restore the old affinity mask for this thread.
1580     //
1581     __kmp_set_system_affinity(oldMask, TRUE);
1582 
1583     //
1584     // If there's only one thread context to bind to, return now.
1585     //
1586     KMP_ASSERT(nApics > 0);
1587     if (nApics == 1) {
1588         __kmp_ncores = nPackages = 1;
1589         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1590         if (__kmp_affinity_verbose) {
1591             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1592             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1593 
1594             KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1595             if (__kmp_affinity_respect_mask) {
1596                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1597             } else {
1598                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1599             }
1600             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1601             KMP_INFORM(Uniform, "KMP_AFFINITY");
1602             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1603               __kmp_nThreadsPerCore, __kmp_ncores);
1604         }
1605 
1606         if (__kmp_affinity_type == affinity_none) {
1607             __kmp_free(retval);
1608             KMP_CPU_FREE(oldMask);
1609             return 0;
1610         }
1611 
1612         //
1613         // Form an Address object which only includes the package level.
1614         //
1615         Address addr(1);
1616         addr.labels[0] = retval[0].first.labels[pkgLevel];
1617         retval[0].first = addr;
1618 
1619         if (__kmp_affinity_gran_levels < 0) {
1620             __kmp_affinity_gran_levels = 0;
1621         }
1622 
1623         if (__kmp_affinity_verbose) {
1624             __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1625         }
1626 
1627         *address2os = retval;
1628         KMP_CPU_FREE(oldMask);
1629         return 1;
1630     }
1631 
1632     //
1633     // Sort the table by physical Id.
1634     //
1635     qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1636 
1637     //
1638     // Find the radix at each of the levels.
1639     //
1640     unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1641     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1642     unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1643     unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1644     for (level = 0; level < depth; level++) {
1645         totals[level] = 1;
1646         maxCt[level] = 1;
1647         counts[level] = 1;
1648         last[level] = retval[0].first.labels[level];
1649     }
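    //
    // Working arrays, one entry per level: totals[] counts the distinct
    // subtrees seen so far at each level, counts[] counts children under the
    // current parent, maxCt[] records the largest counts[] value observed,
    // and last[] holds the previous proc's label so transitions can be
    // detected.
    //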
1650 
1651     //
1652     // From here on, the iteration variable "level" runs from the finest
1653     // level to the coarsest, i.e. we iterate forward through
1654     // (*address2os)[].first.labels[] - in the previous loops, we iterated
1655     // backwards.
1656     //
1657     for (proc = 1; (int)proc < nApics; proc++) {
1658         int level;
1659         for (level = 0; level < depth; level++) {
1660             if (retval[proc].first.labels[level] != last[level]) {
1661                 int j;
1662                 for (j = level + 1; j < depth; j++) {
1663                     totals[j]++;
1664                     counts[j] = 1;
1665                     // Resetting maxCt[j] here causes incorrect topology information to be
1666                     // printed when the maximum count for some level (maxCt[level]) is seen
1667                     // earlier than a smaller count while walking the array.
1668                     // For example, if pkg0 has 4 cores and pkg1 has 2 cores, then maxCt[1]
1669                     // ends up as 2, whereas it should be 4.
1670                     // TODO!!! Check whether it is safe to leave this line commented out.
1671                     //maxCt[j] = 1;
1672                     last[j] = retval[proc].first.labels[j];
1673                 }
1674                 totals[level]++;
1675                 counts[level]++;
1676                 if (counts[level] > maxCt[level]) {
1677                     maxCt[level] = counts[level];
1678                 }
1679                 last[level] = retval[proc].first.labels[level];
1680                 break;
1681             }
1682             else if (level == depth - 1) {
1683                 __kmp_free(last);
1684                 __kmp_free(maxCt);
1685                 __kmp_free(counts);
1686                 __kmp_free(totals);
1687                 __kmp_free(retval);
1688                 KMP_CPU_FREE(oldMask);
1689                 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1690                 return -1;
1691             }
1692         }
1693     }
1694 
1695     //
1696     // When affinity is off, this routine will still be called to set
1697     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1698     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1699     // correctly, and return if affinity is not enabled.
1700     //
1701     if (threadLevel >= 0) {
1702         __kmp_nThreadsPerCore = maxCt[threadLevel];
1703     }
1704     else {
1705         __kmp_nThreadsPerCore = 1;
1706     }
1707     nPackages = totals[pkgLevel];
1708 
1709     if (coreLevel >= 0) {
1710         __kmp_ncores = totals[coreLevel];
1711         nCoresPerPkg = maxCt[coreLevel];
1712     }
1713     else {
1714         __kmp_ncores = nPackages;
1715         nCoresPerPkg = 1;
1716     }
1717 
1718     //
1719     // Check to see if the machine topology is uniform
1720     //
1721     unsigned prod = maxCt[0];
1722     for (level = 1; level < depth; level++) {
1723        prod *= maxCt[level];
1724     }
1725     bool uniform = (prod == totals[level - 1]);
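    //
    // For example (hypothetical machine): with maxCt = {2, 4, 2} (packages,
    // cores per package, threads per core), prod = 16; the topology is
    // uniform only if the total leaf count, totals[depth - 1], is also 16.
    //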
1726 
1727     //
1728     // Print the machine topology summary.
1729     //
1730     if (__kmp_affinity_verbose) {
1731         char mask[KMP_AFFIN_MASK_PRINT_LEN];
1732         __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1733 
1734         KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1735         if (__kmp_affinity_respect_mask) {
1736             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1737         } else {
1738             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1739         }
1740         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1741         if (uniform) {
1742             KMP_INFORM(Uniform, "KMP_AFFINITY");
1743         } else {
1744             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1745         }
1746 
1747         kmp_str_buf_t buf;
1748         __kmp_str_buf_init(&buf);
1749 
1750         __kmp_str_buf_print(&buf, "%d", totals[0]);
1751         for (level = 1; level <= pkgLevel; level++) {
1752             __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1753         }
1754         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1755           __kmp_nThreadsPerCore, __kmp_ncores);
1756 
1757         __kmp_str_buf_free(&buf);
1758     }
1759 
1760     if (__kmp_affinity_type == affinity_none) {
1761         __kmp_free(last);
1762         __kmp_free(maxCt);
1763         __kmp_free(counts);
1764         __kmp_free(totals);
1765         __kmp_free(retval);
1766         KMP_CPU_FREE(oldMask);
1767         return 0;
1768     }
1769 
1770     //
1771     // Find any levels with radix 1, and remove them from the map
1772     // (except for the package level).
1773     //
1774     int new_depth = 0;
1775     for (level = 0; level < depth; level++) {
1776         if ((maxCt[level] == 1) && (level != pkgLevel)) {
1777            continue;
1778         }
1779         new_depth++;
1780     }
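    //
    // For example (hypothetical machine): with depth = 3 and
    // maxCt = {2, 8, 1} (one thread per core), the thread level is dropped
    // and new_depth = 2; only the package and core levels remain.
    //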
1781 
1782     //
1783     // If we are removing any levels, allocate a new vector to return,
1784     // and copy the relevant information to it.
1785     //
1786     if (new_depth != depth) {
1787         AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1788           sizeof(AddrUnsPair) * nApics);
1789         for (proc = 0; (int)proc < nApics; proc++) {
1790             Address addr(new_depth);
1791             new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1792         }
1793         int new_level = 0;
1794         for (level = 0; level < depth; level++) {
1795             if ((maxCt[level] == 1) && (level != pkgLevel)) {
1796                if (level == threadLevel) {
1797                    threadLevel = -1;
1798                }
1799                else if ((threadLevel >= 0) && (level < threadLevel)) {
1800                    threadLevel--;
1801                }
1802                if (level == coreLevel) {
1803                    coreLevel = -1;
1804                }
1805                else if ((coreLevel >= 0) && (level < coreLevel)) {
1806                    coreLevel--;
1807                }
1808                if (level < pkgLevel) {
1809                    pkgLevel--;
1810                }
1811                continue;
1812             }
1813             for (proc = 0; (int)proc < nApics; proc++) {
1814                 new_retval[proc].first.labels[new_level]
1815                   = retval[proc].first.labels[level];
1816             }
1817             new_level++;
1818         }
1819 
1820         __kmp_free(retval);
1821         retval = new_retval;
1822         depth = new_depth;
1823     }
1824 
1825     if (__kmp_affinity_gran_levels < 0) {
1826         //
1827         // Set the granularity level based on what levels are modeled
1828         // in the machine topology map.
1829         //
1830         __kmp_affinity_gran_levels = 0;
1831         if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1832             __kmp_affinity_gran_levels++;
1833         }
1834         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1835             __kmp_affinity_gran_levels++;
1836         }
1837         if (__kmp_affinity_gran > affinity_gran_package) {
1838             __kmp_affinity_gran_levels++;
1839         }
1840     }
1841 
1842     if (__kmp_affinity_verbose) {
1843         __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1844           coreLevel, threadLevel);
1845     }
1846 
1847     __kmp_free(last);
1848     __kmp_free(maxCt);
1849     __kmp_free(counts);
1850     __kmp_free(totals);
1851     KMP_CPU_FREE(oldMask);
1852     *address2os = retval;
1853     return depth;
1854 }
1855 
1856 
1857 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1858 
1859 
1860 #define osIdIndex       0
1861 #define threadIdIndex   1
1862 #define coreIdIndex     2
1863 #define pkgIdIndex      3
1864 #define nodeIdIndex     4
1865 
1866 typedef unsigned *ProcCpuInfo;
1867 static unsigned maxIndex = pkgIdIndex;
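
//
// Each /proc/cpuinfo record below is stored as an array of unsigned values
// indexed by the constants above; e.g. rec[osIdIndex] holds the "processor"
// field and rec[pkgIdIndex] holds the "physical id" field.  node_<n> fields,
// if present, are stored at rec[nodeIdIndex + n], and maxIndex grows to
// accommodate them.
//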
1868 
1869 
1870 static int
1871 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1872 {
1873     const unsigned *aa = (const unsigned *)a;
1874     const unsigned *bb = (const unsigned *)b;
1875     if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1876     if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1877     return 0;
1878 }
1879 
1880 
1881 static int
1882 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1883 {
1884     unsigned i;
1885     const unsigned *aa = *((const unsigned **)a);
1886     const unsigned *bb = *((const unsigned **)b);
1887     for (i = maxIndex; ; i--) {
1888         if (aa[i] < bb[i]) return -1;
1889         if (aa[i] > bb[i]) return 1;
1890         if (i == osIdIndex) break;
1891     }
1892     return 0;
1893 }
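
//
// Note: this comparator orders records by the most significant index first
// (node_<n> levels if present, then pkgIdIndex, coreIdIndex, threadIdIndex,
// and finally osIdIndex), which produces the pkg / core / thread ordering
// that the radix scan in __kmp_affinity_create_cpuinfo_map relies on.
//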
1894 
1895 
1896 //
1897 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1898 // affinity map.
1899 //
1900 static int
1901 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1902   kmp_i18n_id_t *const msg_id, FILE *f)
1903 {
1904     *address2os = NULL;
1905     *msg_id = kmp_i18n_null;
1906 
1907     //
1908     // Scan the file once, counting the number of "processor" (osId) fields
1909     // and finding the highest value of <n> for any node_<n> field.
1910     //
1911     char buf[256];
1912     unsigned num_records = 0;
1913     while (! feof(f)) {
1914         buf[sizeof(buf) - 1] = 1;
1915         if (! fgets(buf, sizeof(buf), f)) {
1916             //
1917             // Read error, presumably because of EOF.
1918             //
1919             break;
1920         }
1921 
1922         char s1[] = "processor";
1923         if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1924             num_records++;
1925             continue;
1926         }
1927 
1928         //
1929         // FIXME - this will match "node_<n> <garbage>"
1930         //
1931         unsigned level;
1932         if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
1933             if (nodeIdIndex + level >= maxIndex) {
1934                 maxIndex = nodeIdIndex + level;
1935             }
1936             continue;
1937         }
1938     }
1939 
1940     //
1941     // Check for empty file / no valid processor records, or too many.
1942     // The number of records can't exceed the number of valid bits in the
1943     // affinity mask.
1944     //
1945     if (num_records == 0) {
1946         *line = 0;
1947         *msg_id = kmp_i18n_str_NoProcRecords;
1948         return -1;
1949     }
1950     if (num_records > (unsigned)__kmp_xproc) {
1951         *line = 0;
1952         *msg_id = kmp_i18n_str_TooManyProcRecords;
1953         return -1;
1954     }
1955 
1956     //
1957     // Set the file pointer back to the beginning, so that we can scan the
1958     // file again, this time performing a full parse of the data.
1959     // Allocate a vector of ProcCpuInfo objects, where we will place the data.
1960     // Adding an extra element at the end allows us to remove a lot of extra
1961     // checks for termination conditions.
1962     //
1963     if (fseek(f, 0, SEEK_SET) != 0) {
1964         *line = 0;
1965         *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1966         return -1;
1967     }
1968 
1969     //
1970     // Allocate the array of records to store the proc info in.  The dummy
1971     // element at the end makes the logic in filling them out easier to code.
1972     //
1973     unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1974       * sizeof(unsigned *));
1975     unsigned i;
1976     for (i = 0; i <= num_records; i++) {
1977         threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1978           * sizeof(unsigned));
1979     }
1980 
1981 #define CLEANUP_THREAD_INFO \
1982     for (i = 0; i <= num_records; i++) {                                \
1983         __kmp_free(threadInfo[i]);                                      \
1984     }                                                                   \
1985     __kmp_free(threadInfo);
1986 
1987     //
1988     // A value of UINT_MAX means that we didn't find the field
1989     //
1990     unsigned __index;
1991 
1992 #define INIT_PROC_INFO(p) \
1993     for (__index = 0; __index <= maxIndex; __index++) {                 \
1994         (p)[__index] = UINT_MAX;                                        \
1995     }
1996 
1997     for (i = 0; i <= num_records; i++) {
1998         INIT_PROC_INFO(threadInfo[i]);
1999     }
2000 
2001     unsigned num_avail = 0;
2002     *line = 0;
2003     while (! feof(f)) {
2004         //
2005         // Create an inner scoping level, so that all the goto targets at the
2006         // end of the loop appear in an outer scoping level.  This avoids
2007         // warnings about jumping past an initialization to a target in the
2008         // same block.
2009         //
2010         {
2011             buf[sizeof(buf) - 1] = 1;
2012             bool long_line = false;
2013             if (! fgets(buf, sizeof(buf), f)) {
2014                 //
2015                 // Read error, presumably because of EOF.
2016                 //
2017                 // If there is valid data in threadInfo[num_avail], then fake
2018                 // a blank line to ensure that the last address gets parsed.
2019                 //
2020                 bool valid = false;
2021                 for (i = 0; i <= maxIndex; i++) {
2022                     if (threadInfo[num_avail][i] != UINT_MAX) {
2023                         valid = true;
2024                     }
2025                 }
2026                 if (! valid) {
2027                     break;
2028                 }
2029                 buf[0] = 0;
2030             } else if (!buf[sizeof(buf) - 1]) {
2031                 //
2032                 // The line is longer than the buffer.  Set a flag, but don't
2033                 // emit an error if we were going to ignore the line anyway.
2034                 //
2035                 long_line = true;
2036 
2037 #define CHECK_LINE \
2038     if (long_line) {                                                    \
2039         CLEANUP_THREAD_INFO;                                            \
2040         *msg_id = kmp_i18n_str_LongLineCpuinfo;                         \
2041         return -1;                                                      \
2042     }
2043             }
2044             (*line)++;
2045 
2046             char s1[] = "processor";
2047             if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2048                 CHECK_LINE;
2049                 char *p = strchr(buf + sizeof(s1) - 1, ':');
2050                 unsigned val;
2051                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2052                 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
2053                 threadInfo[num_avail][osIdIndex] = val;
2054 #if KMP_OS_LINUX && USE_SYSFS_INFO
2055                 char path[256];
2056                 KMP_SNPRINTF(path, sizeof(path),
2057                     "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
2058                     threadInfo[num_avail][osIdIndex]);
2059                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
2060 
2061                 KMP_SNPRINTF(path, sizeof(path),
2062                     "/sys/devices/system/cpu/cpu%u/topology/core_id",
2063                     threadInfo[num_avail][osIdIndex]);
2064                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
2065                 continue;
2066 #else
2067             }
2068             char s2[] = "physical id";
2069             if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
2070                 CHECK_LINE;
2071                 char *p = strchr(buf + sizeof(s2) - 1, ':');
2072                 unsigned val;
2073                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2074                 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
2075                 threadInfo[num_avail][pkgIdIndex] = val;
2076                 continue;
2077             }
2078             char s3[] = "core id";
2079             if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2080                 CHECK_LINE;
2081                 char *p = strchr(buf + sizeof(s3) - 1, ':');
2082                 unsigned val;
2083                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2084                 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
2085                 threadInfo[num_avail][coreIdIndex] = val;
2086                 continue;
2087 #endif // KMP_OS_LINUX && USE_SYSFS_INFO
2088             }
2089             char s4[] = "thread id";
2090             if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2091                 CHECK_LINE;
2092                 char *p = strchr(buf + sizeof(s4) - 1, ':');
2093                 unsigned val;
2094                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2095                 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
2096                 threadInfo[num_avail][threadIdIndex] = val;
2097                 continue;
2098             }
2099             unsigned level;
2100             if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
2101                 CHECK_LINE;
2102                 char *p = strchr(buf + sizeof(s4) - 1, ':');
2103                 unsigned val;
2104                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2105                 KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2106                 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
2107                 threadInfo[num_avail][nodeIdIndex + level] = val;
2108                 continue;
2109             }
2110 
2111             //
2112             // We didn't recognize the leading token on the line.
2113             // There are lots of leading tokens that we don't recognize -
2114             // if the line isn't empty, go on to the next line.
2115             //
2116             if ((*buf != 0) && (*buf != '\n')) {
2117                 //
2118                 // If the line is longer than the buffer, read characters
2119                 // until we find a newline.
2120                 //
2121                 if (long_line) {
2122                     int ch;
2123                     while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
2124                 }
2125                 continue;
2126             }
2127 
2128             //
2129             // A newline has signalled the end of the processor record.
2130             // Check that there aren't too many procs specified.
2131             //
2132             if ((int)num_avail == __kmp_xproc) {
2133                 CLEANUP_THREAD_INFO;
2134                 *msg_id = kmp_i18n_str_TooManyEntries;
2135                 return -1;
2136             }
2137 
2138             //
2139             // Check for missing fields.  The osId field must be present, and
2140             // we currently require that the physical id field is specified too.
2141             //
2142             if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2143                 CLEANUP_THREAD_INFO;
2144                 *msg_id = kmp_i18n_str_MissingProcField;
2145                 return -1;
2146             }
2147             if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2148                 CLEANUP_THREAD_INFO;
2149                 *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2150                 return -1;
2151             }
2152 
2153             //
2154             // Skip this proc if it is not included in the machine model.
2155             //
2156             if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
2157                 INIT_PROC_INFO(threadInfo[num_avail]);
2158                 continue;
2159             }
2160 
2161             //
2162             // We have a successful parse of this proc's info.
2163             // Increment the counter, and prepare for the next proc.
2164             //
2165             num_avail++;
2166             KMP_ASSERT(num_avail <= num_records);
2167             INIT_PROC_INFO(threadInfo[num_avail]);
2168         }
2169         continue;
2170 
2171         no_val:
2172         CLEANUP_THREAD_INFO;
2173         *msg_id = kmp_i18n_str_MissingValCpuinfo;
2174         return -1;
2175 
2176         dup_field:
2177         CLEANUP_THREAD_INFO;
2178         *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2179         return -1;
2180     }
2181     *line = 0;
2182 
2183 # if KMP_MIC && REDUCE_TEAM_SIZE
2184     unsigned teamSize = 0;
2185 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2186 
2187     // check for num_records == __kmp_xproc ???
2188 
2189     //
2190     // If there's only one thread context to bind to, form an Address object
2191     // with depth 1 and return immediately (or, if affinity is off, set
2192     // address2os to NULL and return).
2193     //
2194     // If it is configured to omit the package level when there is only a
2195     // single package, the logic at the end of this routine won't work if
2196     // there is only a single thread - it would try to form an Address
2197     // object with depth 0.
2198     //
2199     KMP_ASSERT(num_avail > 0);
2200     KMP_ASSERT(num_avail <= num_records);
2201     if (num_avail == 1) {
2202         __kmp_ncores = 1;
2203         __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2204         if (__kmp_affinity_verbose) {
2205             if (! KMP_AFFINITY_CAPABLE()) {
2206                 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2207                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2208                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2209             }
2210             else {
2211                 char buf[KMP_AFFIN_MASK_PRINT_LEN];
2212                 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2213                   fullMask);
2214                 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2215                 if (__kmp_affinity_respect_mask) {
2216                     KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2217                 } else {
2218                     KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2219                 }
2220                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2221                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2222             }
2223             int index;
2224             kmp_str_buf_t buf;
2225             __kmp_str_buf_init(&buf);
2226             __kmp_str_buf_print(&buf, "1");
2227             for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2228                 __kmp_str_buf_print(&buf, " x 1");
2229             }
2230             KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2231             __kmp_str_buf_free(&buf);
2232         }
2233 
2234         if (__kmp_affinity_type == affinity_none) {
2235             CLEANUP_THREAD_INFO;
2236             return 0;
2237         }
2238 
2239         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2240         Address addr(1);
2241         addr.labels[0] = threadInfo[0][pkgIdIndex];
2242         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2243 
2244         if (__kmp_affinity_gran_levels < 0) {
2245             __kmp_affinity_gran_levels = 0;
2246         }
2247 
2248         if (__kmp_affinity_verbose) {
2249             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2250         }
2251 
2252         CLEANUP_THREAD_INFO;
2253         return 1;
2254     }
2255 
2256     //
2257     // Sort the threadInfo table by physical Id.
2258     //
2259     qsort(threadInfo, num_avail, sizeof(*threadInfo),
2260       __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2261 
2262     //
2263     // The table is now sorted by pkgId / coreId / threadId, but we really
2264     // don't know the radix of any of the fields.  pkgId's may be sparsely
2265     // assigned among the chips on a system.  Although coreId's are usually
2266     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2267     // [0..threadsPerCore-1], we don't want to make any such assumptions.
2268     //
2269     // For that matter, we don't know what coresPerPkg and threadsPerCore
2270     // (or the total # packages) are at this point - we want to determine
2271     // that now.  We only have an upper bound on the first two figures.
2272     //
2273     unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2274       * sizeof(unsigned));
2275     unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2276       * sizeof(unsigned));
2277     unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2278       * sizeof(unsigned));
2279     unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2280       * sizeof(unsigned));
2281 
2282     bool assign_thread_ids = false;
2283     unsigned threadIdCt;
2284     unsigned index;
2285 
2286     restart_radix_check:
2287     threadIdCt = 0;
2288 
2289     //
2290     // Initialize the counter arrays with data from threadInfo[0].
2291     //
2292     if (assign_thread_ids) {
2293         if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2294             threadInfo[0][threadIdIndex] = threadIdCt++;
2295         }
2296         else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2297             threadIdCt = threadInfo[0][threadIdIndex] + 1;
2298         }
2299     }
2300     for (index = 0; index <= maxIndex; index++) {
2301         counts[index] = 1;
2302         maxCt[index] = 1;
2303         totals[index] = 1;
2304         lastId[index] = threadInfo[0][index];
2305     }
2306 
2307     //
2308     // Run through the rest of the OS procs.
2309     //
2310     for (i = 1; i < num_avail; i++) {
2311         //
2312         // Find the most significant index whose id differs
2313         // from the id for the previous OS proc.
2314         //
2315         for (index = maxIndex; index >= threadIdIndex; index--) {
2316             if (assign_thread_ids && (index == threadIdIndex)) {
2317                 //
2318                 // Auto-assign the thread id field if it wasn't specified.
2319                 //
2320                 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2321                     threadInfo[i][threadIdIndex] = threadIdCt++;
2322                 }
2323 
2324                 //
2325                 // Apparently the thread id field was specified for some
2326                 // entries and not others.  Start the thread id counter
2327                 // off at the next higher thread id.
2328                 //
2329                 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2330                     threadIdCt = threadInfo[i][threadIdIndex] + 1;
2331                 }
2332             }
2333             if (threadInfo[i][index] != lastId[index]) {
2334                 //
2335                 // Run through all indices which are less significant,
2336                 // and reset the counts to 1.
2337                 //
2338                 // At all levels up to and including index, we need to
2339                 // increment the totals and record the last id.
2340                 //
2341                 unsigned index2;
2342                 for (index2 = threadIdIndex; index2 < index; index2++) {
2343                     totals[index2]++;
2344                     if (counts[index2] > maxCt[index2]) {
2345                         maxCt[index2] = counts[index2];
2346                     }
2347                     counts[index2] = 1;
2348                     lastId[index2] = threadInfo[i][index2];
2349                 }
2350                 counts[index]++;
2351                 totals[index]++;
2352                 lastId[index] = threadInfo[i][index];
2353 
2354                 if (assign_thread_ids && (index > threadIdIndex)) {
2355 
2356 # if KMP_MIC && REDUCE_TEAM_SIZE
2357                     //
2358                     // The default team size is the total #threads in the machine
2359                     // minus 1 thread for every core that has 3 or more threads.
2360                     //
2361                     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2362 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2363 
2364                     //
2365                     // Restart the thread counter, as we are on a new core.
2366                     //
2367                     threadIdCt = 0;
2368 
2369                     //
2370                     // Auto-assign the thread id field if it wasn't specified.
2371                     //
2372                     if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2373                         threadInfo[i][threadIdIndex] = threadIdCt++;
2374                     }
2375 
2376                     //
2377                     // Apparently the thread id field was specified for some
2378                     // entries and not others.  Start the thread id counter
2379                     // off at the next higher thread id.
2380                     //
2381                     else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2382                         threadIdCt = threadInfo[i][threadIdIndex] + 1;
2383                     }
2384                 }
2385                 break;
2386             }
2387         }
2388         if (index < threadIdIndex) {
2389             //
2390             // If thread ids were specified, it is an error if they are not
2391             // unique.  Also, check that we haven't already restarted the
2392             // loop (to be safe - shouldn't need to).
2393             //
2394             if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2395               || assign_thread_ids) {
2396                 __kmp_free(lastId);
2397                 __kmp_free(totals);
2398                 __kmp_free(maxCt);
2399                 __kmp_free(counts);
2400                 CLEANUP_THREAD_INFO;
2401                 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2402                 return -1;
2403             }
2404 
2405             //
2406             // If the thread ids were not specified and we see entries
2407             // that are duplicates, start the loop over and
2408             // assign the thread ids manually.
2409             //
2410             assign_thread_ids = true;
2411             goto restart_radix_check;
2412         }
2413     }
2414 
2415 # if KMP_MIC && REDUCE_TEAM_SIZE
2416     //
2417     // The default team size is the total #threads in the machine
2418     // minus 1 thread for every core that has 3 or more threads.
2419     //
2420     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2421 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2422 
2423     for (index = threadIdIndex; index <= maxIndex; index++) {
2424         if (counts[index] > maxCt[index]) {
2425             maxCt[index] = counts[index];
2426         }
2427     }
2428 
2429     __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2430     nCoresPerPkg = maxCt[coreIdIndex];
2431     nPackages = totals[pkgIdIndex];
2432 
2433     //
2434     // Check to see if the machine topology is uniform
2435     //
2436     unsigned prod = totals[maxIndex];
2437     for (index = threadIdIndex; index < maxIndex; index++) {
2438        prod *= maxCt[index];
2439     }
2440     bool uniform = (prod == totals[threadIdIndex]);
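    //
    // For example (hypothetical machine, assuming no node_<n> levels so that
    // maxIndex == pkgIdIndex): with totals[pkgIdIndex] = 2 packages,
    // maxCt[coreIdIndex] = 4 and maxCt[threadIdIndex] = 2, prod = 16; the
    // topology is uniform only if totals[threadIdIndex], the number of OS
    // procs actually seen, is also 16.
    //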
2441 
2442     //
2443     // When affinity is off, this routine will still be called to set
2444     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
2445     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
2446     // correctly, and return now if affinity is not enabled.
2447     //
2448     __kmp_ncores = totals[coreIdIndex];
2449 
2450     if (__kmp_affinity_verbose) {
2451         if (! KMP_AFFINITY_CAPABLE()) {
2452             KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2453             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2454             if (uniform) {
2455                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2456             } else {
2457                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
2458             }
2459         }
2460         else {
2461             char buf[KMP_AFFIN_MASK_PRINT_LEN];
2462             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
2463             KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2464             if (__kmp_affinity_respect_mask) {
2465                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2466             } else {
2467                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2468             }
2469             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2470             if (uniform) {
2471                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2472             } else {
2473                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
2474             }
2475         }
2476         kmp_str_buf_t buf;
2477         __kmp_str_buf_init(&buf);
2478 
2479         __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2480         for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2481             __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2482         }
2483         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str,  maxCt[coreIdIndex],
2484           maxCt[threadIdIndex], __kmp_ncores);
2485 
2486         __kmp_str_buf_free(&buf);
2487     }
2488 
2489 # if KMP_MIC && REDUCE_TEAM_SIZE
2490     //
2491     // Set the default team size.
2492     //
2493     if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2494         __kmp_dflt_team_nth = teamSize;
2495         KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2496           __kmp_dflt_team_nth));
2497     }
2498 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2499 
2500     if (__kmp_affinity_type == affinity_none) {
2501         __kmp_free(lastId);
2502         __kmp_free(totals);
2503         __kmp_free(maxCt);
2504         __kmp_free(counts);
2505         CLEANUP_THREAD_INFO;
2506         return 0;
2507     }
2508 
2509     //
2510     // Count the number of levels which have more nodes at that level than
2511     // at the parent's level (with there being an implicit root node of
2512     // the top level).  This is equivalent to saying that there is at least
2513     // one node at this level which has a sibling.  These levels are in the
2514     // map, and the package level is always in the map.
2515     //
2516     bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2517     int level = 0;
2518     for (index = threadIdIndex; index < maxIndex; index++) {
2519         KMP_ASSERT(totals[index] >= totals[index + 1]);
2520         inMap[index] = (totals[index] > totals[index + 1]);
2521     }
2522     inMap[maxIndex] = (totals[maxIndex] > 1);
2523     inMap[pkgIdIndex] = true;
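    //
    // For example (hypothetical machine, assuming maxIndex == pkgIdIndex):
    // with totals = {16, 8, 2} at the thread, core and package indices,
    // inMap is true for the thread level (16 > 8), the core level (8 > 2)
    // and the package level (always kept), giving a map of depth 3.
    //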
2524 
2525     int depth = 0;
2526     for (index = threadIdIndex; index <= maxIndex; index++) {
2527         if (inMap[index]) {
2528             depth++;
2529         }
2530     }
2531     KMP_ASSERT(depth > 0);
2532 
2533     //
2534     // Construct the data structure that is to be returned.
2535     //
2536     *address2os = (AddrUnsPair*)
2537       __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2538     int pkgLevel = -1;
2539     int coreLevel = -1;
2540     int threadLevel = -1;
2541 
2542     for (i = 0; i < num_avail; ++i) {
2543         Address addr(depth);
2544         unsigned os = threadInfo[i][osIdIndex];
2545         int src_index;
2546         int dst_index = 0;
2547 
2548         for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2549             if (! inMap[src_index]) {
2550                 continue;
2551             }
2552             addr.labels[dst_index] = threadInfo[i][src_index];
2553             if (src_index == pkgIdIndex) {
2554                 pkgLevel = dst_index;
2555             }
2556             else if (src_index == coreIdIndex) {
2557                 coreLevel = dst_index;
2558             }
2559             else if (src_index == threadIdIndex) {
2560                 threadLevel = dst_index;
2561             }
2562             dst_index++;
2563         }
2564         (*address2os)[i] = AddrUnsPair(addr, os);
2565     }
2566 
2567     if (__kmp_affinity_gran_levels < 0) {
2568         //
2569         // Set the granularity level based on what levels are modeled
2570         // in the machine topology map.
2571         //
2572         unsigned src_index;
2573         __kmp_affinity_gran_levels = 0;
2574         for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2575             if (! inMap[src_index]) {
2576                 continue;
2577             }
2578             switch (src_index) {
2579                 case threadIdIndex:
2580                 if (__kmp_affinity_gran > affinity_gran_thread) {
2581                     __kmp_affinity_gran_levels++;
2582                 }
2583 
2584                 break;
2585                 case coreIdIndex:
2586                 if (__kmp_affinity_gran > affinity_gran_core) {
2587                     __kmp_affinity_gran_levels++;
2588                 }
2589                 break;
2590 
2591                 case pkgIdIndex:
2592                 if (__kmp_affinity_gran > affinity_gran_package) {
2593                     __kmp_affinity_gran_levels++;
2594                 }
2595                 break;
2596             }
2597         }
2598     }
2599 
2600     if (__kmp_affinity_verbose) {
2601         __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2602           coreLevel, threadLevel);
2603     }
2604 
2605     __kmp_free(inMap);
2606     __kmp_free(lastId);
2607     __kmp_free(totals);
2608     __kmp_free(maxCt);
2609     __kmp_free(counts);
2610     CLEANUP_THREAD_INFO;
2611     return depth;
2612 }
2613 
2614 
2615 //
2616 // Create and return a table of affinity masks, indexed by OS thread ID.
2617 // This routine handles OR'ing together all the affinity masks of threads
2618 // that are sufficiently close, if granularity > fine.
2619 //
2620 static kmp_affin_mask_t *
2621 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2622   AddrUnsPair *address2os, unsigned numAddrs)
2623 {
2624     //
2625     // First form a table of affinity masks in order of OS thread id.
2626     //
2627     unsigned depth;
2628     unsigned maxOsId;
2629     unsigned i;
2630 
2631     KMP_ASSERT(numAddrs > 0);
2632     depth = address2os[0].first.depth;
2633 
2634     maxOsId = 0;
2635     for (i = 0; i < numAddrs; i++) {
2636         unsigned osId = address2os[i].second;
2637         if (osId > maxOsId) {
2638             maxOsId = osId;
2639         }
2640     }
2641     kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
2642       (maxOsId + 1) * __kmp_affin_mask_size);
2643 
2644     //
2645     // Sort the address2os table according to physical order.  Doing so
2646     // will put all threads on the same core/package/node in consecutive
2647     // locations.
2648     //
2649     qsort(address2os, numAddrs, sizeof(*address2os),
2650       __kmp_affinity_cmp_Address_labels);
2651 
2652     KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2653     if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2654         KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY",  __kmp_affinity_gran_levels);
2655     }
2656     if (__kmp_affinity_gran_levels >= (int)depth) {
2657         if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2658           && (__kmp_affinity_type != affinity_none))) {
2659             KMP_WARNING(AffThreadsMayMigrate);
2660         }
2661     }
2662 
2663     //
2664     // Run through the table, forming the masks for all threads on each
2665     // core.  Threads on the same core will have identical "Address"
2666     // objects, not considering the last level, which must be the thread
2667     // id.  All threads on a core will appear consecutively.
2668     //
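    // For example (hypothetical labels): with granularity=core
    // (__kmp_affinity_gran_levels == 1), two SMT threads at
    // {pkg 0, core 3, thread 0} and {pkg 0, core 3, thread 1} compare equal
    // once the last level is ignored, so both OS procs are OR'ed into the
    // same mask.
    //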
2669     unsigned unique = 0;
2670     unsigned j = 0;                             // index of 1st thread on core
2671     unsigned leader = 0;
2672     Address *leaderAddr = &(address2os[0].first);
2673     kmp_affin_mask_t *sum
2674       = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
2675     KMP_CPU_ZERO(sum);
2676     KMP_CPU_SET(address2os[0].second, sum);
2677     for (i = 1; i < numAddrs; i++) {
2678         //
2679         // If this thread is sufficiently close to the leader (within the
2680         // granularity setting), then set the bit for this os thread in the
2681         // affinity mask for this group, and go on to the next thread.
2682         //
2683         if (leaderAddr->isClose(address2os[i].first,
2684           __kmp_affinity_gran_levels)) {
2685             KMP_CPU_SET(address2os[i].second, sum);
2686             continue;
2687         }
2688 
2689         //
2690         // For every thread in this group, copy the mask to the thread's
2691         // entry in the osId2Mask table.  Mark the first address as a
2692         // leader.
2693         //
2694         for (; j < i; j++) {
2695             unsigned osId = address2os[j].second;
2696             KMP_DEBUG_ASSERT(osId <= maxOsId);
2697             kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2698             KMP_CPU_COPY(mask, sum);
2699             address2os[j].first.leader = (j == leader);
2700         }
2701         unique++;
2702 
2703         //
2704         // Start a new mask.
2705         //
2706         leader = i;
2707         leaderAddr = &(address2os[i].first);
2708         KMP_CPU_ZERO(sum);
2709         KMP_CPU_SET(address2os[i].second, sum);
2710     }
2711 
2712     //
2713     // For every thread in last group, copy the mask to the thread's
2714     // entry in the osId2Mask table.
2715     //
2716     for (; j < i; j++) {
2717         unsigned osId = address2os[j].second;
2718         KMP_DEBUG_ASSERT(osId <= maxOsId);
2719         kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2720         KMP_CPU_COPY(mask, sum);
2721         address2os[j].first.leader = (j == leader);
2722     }
2723     unique++;
2724 
2725     *maxIndex = maxOsId;
2726     *numUnique = unique;
2727     return osId2Mask;
2728 }
2729 
2730 
2731 //
2732 // Stuff for the affinity proclist parsers.  It's easier to declare these vars
2733 // as file-static than to try and pass them through the calling sequence of
2734 // the recursive-descent OMP_PLACES parser.
2735 //
2736 static kmp_affin_mask_t *newMasks;
2737 static int numNewMasks;
2738 static int nextNewMask;
2739 
2740 #define ADD_MASK(_mask) \
2741     {                                                                   \
2742         if (nextNewMask >= numNewMasks) {                               \
2743             numNewMasks *= 2;                                           \
2744             newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
2745               numNewMasks * __kmp_affin_mask_size);                     \
2746         }                                                               \
2747         KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));    \
2748         nextNewMask++;                                                  \
2749     }
2750 
2751 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2752     {                                                                   \
2753         if (((_osId) > _maxOsId) ||                                     \
2754           (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
2755             if (__kmp_affinity_verbose || (__kmp_affinity_warnings      \
2756               && (__kmp_affinity_type != affinity_none))) {             \
2757                 KMP_WARNING(AffIgnoreInvalidProcID, _osId);             \
2758             }                                                           \
2759         }                                                               \
2760         else {                                                          \
2761             ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));               \
2762         }                                                               \
2763     }
2764 
2765 
2766 //
2767 // Re-parse the proclist (for the explicit affinity type), and form the list
2768 // of affinity newMasks indexed by gtid.
2769 //
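// For example (illustrative): a proclist of "3,0-2:2,{4,5}" would produce the
// masks {3}, {0}, {2} and {4,5}, in that order.
//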
2770 static void
2771 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2772   unsigned int *out_numMasks, const char *proclist,
2773   kmp_affin_mask_t *osId2Mask, int maxOsId)
2774 {
2775     const char *scan = proclist;
2776     const char *next = proclist;
2777 
2778     //
2779     // We use malloc() for the temporary mask vector,
2780     // so that we can use realloc() to extend it.
2781     //
2782     numNewMasks = 2;
2783     newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2784       * __kmp_affin_mask_size);
2785     nextNewMask = 0;
2786     kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
2787       __kmp_affin_mask_size);
2788     int setSize = 0;
2789 
2790     for (;;) {
2791         int start, end, stride;
2792 
2793         SKIP_WS(scan);
2794         next = scan;
2795         if (*next == '\0') {
2796             break;
2797         }
2798 
2799         if (*next == '{') {
2800             int num;
2801             setSize = 0;
2802             next++;     // skip '{'
2803             SKIP_WS(next);
2804             scan = next;
2805 
2806             //
2807             // Read the first integer in the set.
2808             //
2809             KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2810               "bad proclist");
2811             SKIP_DIGITS(next);
2812             num = __kmp_str_to_int(scan, *next);
2813             KMP_ASSERT2(num >= 0, "bad explicit proc list");
2814 
2815             //
2816             // Copy the mask for that osId to the sum (union) mask.
2817             //
2818             if ((num > maxOsId) ||
2819               (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2820                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2821                   && (__kmp_affinity_type != affinity_none))) {
2822                     KMP_WARNING(AffIgnoreInvalidProcID, num);
2823                 }
2824                 KMP_CPU_ZERO(sumMask);
2825             }
2826             else {
2827                 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2828                 setSize = 1;
2829             }
2830 
2831             for (;;) {
2832                 //
2833                 // Check for end of set.
2834                 //
2835                 SKIP_WS(next);
2836                 if (*next == '}') {
2837                     next++;     // skip '}'
2838                     break;
2839                 }
2840 
2841                 //
2842                 // Skip optional comma.
2843                 //
2844                 if (*next == ',') {
2845                     next++;
2846                 }
2847                 SKIP_WS(next);
2848 
2849                 //
2850                 // Read the next integer in the set.
2851                 //
2852                 scan = next;
2853                 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2854                   "bad explicit proc list");
2855 
2856                 SKIP_DIGITS(next);
2857                 num = __kmp_str_to_int(scan, *next);
2858                 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2859 
2860                 //
2861                 // Add the mask for that osId to the sum mask.
2862                 //
2863                 if ((num > maxOsId) ||
2864                   (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2865                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2866                       && (__kmp_affinity_type != affinity_none))) {
2867                         KMP_WARNING(AffIgnoreInvalidProcID, num);
2868                     }
2869                 }
2870                 else {
2871                     KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2872                     setSize++;
2873                 }
2874             }
2875             if (setSize > 0) {
2876                 ADD_MASK(sumMask);
2877             }
2878 
2879             SKIP_WS(next);
2880             if (*next == ',') {
2881                 next++;
2882             }
2883             scan = next;
2884             continue;
2885         }
2886 
2887         //
2888         // Read the first integer.
2889         //
2890         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2891         SKIP_DIGITS(next);
2892         start = __kmp_str_to_int(scan, *next);
2893         KMP_ASSERT2(start >= 0, "bad explicit proc list");
2894         SKIP_WS(next);
2895 
2896         //
2897         // If this isn't a range, then add a mask to the list and go on.
2898         //
2899         if (*next != '-') {
2900             ADD_MASK_OSID(start, osId2Mask, maxOsId);
2901 
2902             //
2903             // Skip optional comma.
2904             //
2905             if (*next == ',') {
2906                 next++;
2907             }
2908             scan = next;
2909             continue;
2910         }
2911 
2912         //
2913         // This is a range.  Skip over the '-' and read in the 2nd int.
2914         //
2915         next++;         // skip '-'
2916         SKIP_WS(next);
2917         scan = next;
2918         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2919         SKIP_DIGITS(next);
2920         end = __kmp_str_to_int(scan, *next);
2921         KMP_ASSERT2(end >= 0, "bad explicit proc list");
2922 
2923         //
2924         // Check for a stride parameter
2925         //
2926         stride = 1;
2927         SKIP_WS(next);
2928         if (*next == ':') {
2929             //
2930             // A stride is specified.  Skip over the ':' and read the 3rd int.
2931             //
2932             int sign = +1;
2933             next++;         // skip ':'
2934             SKIP_WS(next);
2935             scan = next;
2936             if (*next == '-') {
2937                 sign = -1;
2938                 next++;
2939                 SKIP_WS(next);
2940                 scan = next;
2941             }
2942             KMP_ASSERT2((*next >=  '0') && (*next <= '9'),
2943               "bad explicit proc list");
2944             SKIP_DIGITS(next);
2945             stride = __kmp_str_to_int(scan, *next);
2946             KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2947             stride *= sign;
2948         }
2949 
2950         //
2951         // Do some range checks.
2952         //
2953         KMP_ASSERT2(stride != 0, "bad explicit proc list");
2954         if (stride > 0) {
2955             KMP_ASSERT2(start <= end, "bad explicit proc list");
2956         }
2957         else {
2958             KMP_ASSERT2(start >= end, "bad explicit proc list");
2959         }
2960         KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2961 
2962         //
2963         // Add the mask for each OS proc # to the list.
2964         //
2965         if (stride > 0) {
2966             do {
2967                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2968                 start += stride;
2969             } while (start <= end);
2970         }
2971         else {
2972             do {
2973                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2974                 start += stride;
2975             } while (start >= end);
2976         }
2977 
2978         //
2979         // Skip optional comma.
2980         //
2981         SKIP_WS(next);
2982         if (*next == ',') {
2983             next++;
2984         }
2985         scan = next;
2986     }
2987 
2988     *out_numMasks = nextNewMask;
2989     if (nextNewMask == 0) {
2990         *out_masks = NULL;
2991         KMP_INTERNAL_FREE(newMasks);
2992         return;
2993     }
2994     *out_masks
2995       = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
2996     KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
2997     __kmp_free(sumMask);
2998     KMP_INTERNAL_FREE(newMasks);
2999 }
3000 
3001 
3002 # if OMP_40_ENABLED
3003 
3004 /*-----------------------------------------------------------------------------
3005 
3006 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
3007 places.  Again, here is the grammar:
3008 
3009 place_list := place
3010 place_list := place , place_list
3011 place := num
3012 place := place : num
3013 place := place : num : signed
3014 place := { subplace_list }
3015 place := ! place                  // (lowest priority)
3016 subplace_list := subplace
3017 subplace_list := subplace , subplace_list
3018 subplace := num
3019 subplace := num : num
3020 subplace := num : num : signed
3021 signed := num
3022 signed := + signed
3023 signed := - signed
3024 
3025 -----------------------------------------------------------------------------*/
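//
// Illustrative examples only (place strings assumed, derived from the grammar
// above rather than taken from this file):
//
//   {0,1,2,3},{4,5,6,7}    two explicit places of four OS procs each
//   {0:4},{4:4}            the same two places in <start>:<count> form
//   {0:4}:4:4              one four-proc place replicated 4 times with stride 4
//   !{0}                   the complement of a place (every proc except 0)
//
// Invalid or unavailable OS proc ids are warned about and skipped (see the
// maxOsId checks below) rather than treated as parse errors.
//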
3026 
3027 static void
3028 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
3029   int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3030 {
3031     const char *next;
3032 
3033     for (;;) {
3034         int start, count, stride, i;
3035 
3036         //
3037         // Read in the starting proc id
3038         //
3039         SKIP_WS(*scan);
3040         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3041           "bad explicit places list");
3042         next = *scan;
3043         SKIP_DIGITS(next);
3044         start = __kmp_str_to_int(*scan, *next);
3045         KMP_ASSERT(start >= 0);
3046         *scan = next;
3047 
3048         //
3049         // valid follow sets are ',' ':' and '}'
3050         //
3051         SKIP_WS(*scan);
3052         if (**scan == '}' || **scan == ',') {
3053             if ((start > maxOsId) ||
3054               (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3055                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3056                   && (__kmp_affinity_type != affinity_none))) {
3057                     KMP_WARNING(AffIgnoreInvalidProcID, start);
3058                 }
3059             }
3060             else {
3061                 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3062                 (*setSize)++;
3063             }
3064             if (**scan == '}') {
3065                 break;
3066             }
3067             (*scan)++;  // skip ','
3068             continue;
3069         }
3070         KMP_ASSERT2(**scan == ':', "bad explicit places list");
3071         (*scan)++;      // skip ':'
3072 
3073         //
3074         // Read count parameter
3075         //
3076         SKIP_WS(*scan);
3077         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3078           "bad explicit places list");
3079         next = *scan;
3080         SKIP_DIGITS(next);
3081         count = __kmp_str_to_int(*scan, *next);
3082         KMP_ASSERT(count >= 0);
3083         *scan = next;
3084 
3085         //
3086         // valid follow sets are ',' ':' and '}'
3087         //
3088         SKIP_WS(*scan);
3089         if (**scan == '}' || **scan == ',') {
3090             for (i = 0; i < count; i++) {
3091                 if ((start > maxOsId) ||
3092                   (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3093                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3094                       && (__kmp_affinity_type != affinity_none))) {
3095                         KMP_WARNING(AffIgnoreInvalidProcID, start);
3096                     }
3097                     break;  // don't proliferate warnings for large count
3098                 }
3099                 else {
3100                     KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3101                     start++;
3102                     (*setSize)++;
3103                 }
3104             }
3105             if (**scan == '}') {
3106                 break;
3107             }
3108             (*scan)++;  // skip ','
3109             continue;
3110         }
3111         KMP_ASSERT2(**scan == ':', "bad explicit places list");
3112         (*scan)++;      // skip ':'
3113 
3114         //
3115         // Read stride parameter
3116         //
3117         int sign = +1;
3118         for (;;) {
3119             SKIP_WS(*scan);
3120             if (**scan == '+') {
3121                 (*scan)++; // skip '+'
3122                 continue;
3123             }
3124             if (**scan == '-') {
3125                 sign *= -1;
3126                 (*scan)++; // skip '-'
3127                 continue;
3128             }
3129             break;
3130         }
3131         SKIP_WS(*scan);
3132         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3133           "bad explicit places list");
3134         next = *scan;
3135         SKIP_DIGITS(next);
3136         stride = __kmp_str_to_int(*scan, *next);
3137         KMP_ASSERT(stride >= 0);
3138         *scan = next;
3139         stride *= sign;
3140 
3141         //
3142         // valid follow sets are ',' and '}'
3143         //
3144         SKIP_WS(*scan);
3145         if (**scan == '}' || **scan == ',') {
3146             for (i = 0; i < count; i++) {
3147                 if ((start > maxOsId) ||
3148                   (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3149                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3150                       && (__kmp_affinity_type != affinity_none))) {
3151                         KMP_WARNING(AffIgnoreInvalidProcID, start);
3152                     }
3153                     break;  // don't proliferate warnings for large count
3154                 }
3155                 else {
3156                     KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3157                     start += stride;
3158                     (*setSize)++;
3159                 }
3160             }
3161             if (**scan == '}') {
3162                 break;
3163             }
3164             (*scan)++;  // skip ','
3165             continue;
3166         }
3167 
3168         KMP_ASSERT2(0, "bad explicit places list");
3169     }
3170 }
3171 
3172 
3173 static void
3174 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3175   int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3176 {
3177     const char *next;
3178 
3179     //
3180     // valid follow sets are '{' '!' and num
3181     //
3182     SKIP_WS(*scan);
3183     if (**scan == '{') {
3184         (*scan)++;      // skip '{'
3185         __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
3186           setSize);
3187         KMP_ASSERT2(**scan == '}', "bad explicit places list");
3188         (*scan)++;      // skip '}'
3189     }
3190     else if (**scan == '!') {
3191         (*scan)++;      // skip '!' before recursing, otherwise the same '!' is seen again
3192         __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3193         KMP_CPU_COMPLEMENT(tempMask);
3194     }
3195     else if ((**scan >= '0') && (**scan <= '9')) {
3196         next = *scan;
3197         SKIP_DIGITS(next);
3198         int num = __kmp_str_to_int(*scan, *next);
3199         KMP_ASSERT(num >= 0);
3200         if ((num > maxOsId) ||
3201           (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3202             if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3203               && (__kmp_affinity_type != affinity_none))) {
3204                 KMP_WARNING(AffIgnoreInvalidProcID, num);
3205             }
3206         }
3207         else {
3208             KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3209             (*setSize)++;
3210         }
3211         *scan = next;  // skip num
3212     }
3213     else {
3214         KMP_ASSERT2(0, "bad explicit places list");
3215     }
3216 }
3217 
3218 
3219 //static void
3220 void
3221 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3222   unsigned int *out_numMasks, const char *placelist,
3223   kmp_affin_mask_t *osId2Mask, int maxOsId)
3224 {
3225     const char *scan = placelist;
3226     const char *next = placelist;
3227 
3228     numNewMasks = 2;
3229     newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
3230       * __kmp_affin_mask_size);
3231     nextNewMask = 0;
3232 
3233     kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
3234       __kmp_affin_mask_size);
3235     KMP_CPU_ZERO(tempMask);
3236     int setSize = 0;
3237 
3238     for (;;) {
3239         __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3240 
3241         //
3242         // valid follow sets are ',' ':' and EOL
3243         //
3244         SKIP_WS(scan);
3245         if (*scan == '\0' || *scan == ',') {
3246             if (setSize > 0) {
3247                 ADD_MASK(tempMask);
3248             }
3249             KMP_CPU_ZERO(tempMask);
3250             setSize = 0;
3251             if (*scan == '\0') {
3252                 break;
3253             }
3254             scan++;     // skip ','
3255             continue;
3256         }
3257 
3258         KMP_ASSERT2(*scan == ':', "bad explicit places list");
3259         scan++;         // skip ':'
3260 
3261         //
3262         // Read count parameter
3263         //
3264         SKIP_WS(scan);
3265         KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3266           "bad explicit places list");
3267         next = scan;
3268         SKIP_DIGITS(next);
3269         int count = __kmp_str_to_int(scan, *next);
3270         KMP_ASSERT(count >= 0);
3271         scan = next;
3272 
3273         //
3274         // valid follow sets are ',' ':' and EOL
3275         //
3276         SKIP_WS(scan);
3277         int stride;
3278         if (*scan == '\0' || *scan == ',') {
3279             stride = +1;
3280         }
3281         else {
3282             KMP_ASSERT2(*scan == ':', "bad explicit places list");
3283             scan++;         // skip ':'
3284 
3285             //
3286             // Read stride parameter
3287             //
3288             int sign = +1;
3289             for (;;) {
3290                 SKIP_WS(scan);
3291                 if (*scan == '+') {
3292                     scan++; // skip '+'
3293                     continue;
3294                 }
3295                 if (*scan == '-') {
3296                     sign *= -1;
3297                     scan++; // skip '-'
3298                     continue;
3299                 }
3300                 break;
3301             }
3302             SKIP_WS(scan);
3303             KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3304               "bad explicit places list");
3305             next = scan;
3306             SKIP_DIGITS(next);
3307             stride = __kmp_str_to_int(scan, *next);
3308             KMP_DEBUG_ASSERT(stride >= 0);
3309             scan = next;
3310             stride *= sign;
3311         }
3312 
3313         if (stride > 0) {
3314             int i;
3315             for (i = 0; i < count; i++) {
3316                 int j;
3317                 if (setSize == 0) {
3318                     break;
3319                 }
3320                 ADD_MASK(tempMask);
3321                 setSize = 0;
3322                 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
3323                     if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3324                         KMP_CPU_CLR(j, tempMask);
3325                     }
3326                     else if ((j > maxOsId) ||
3327                       (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3328                         if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3329                           && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
3330                             KMP_WARNING(AffIgnoreInvalidProcID, j);
3331                         }
3332                         KMP_CPU_CLR(j, tempMask);
3333                     }
3334                     else {
3335                         KMP_CPU_SET(j, tempMask);
3336                         setSize++;
3337                     }
3338                 }
3339                 for (; j >= 0; j--) {
3340                     KMP_CPU_CLR(j, tempMask);
3341                 }
3342             }
3343         }
3344         else {
3345             int i;
3346             for (i = 0; i < count; i++) {
3347                 int j;
3348                 if (setSize == 0) {
3349                     break;
3350                 }
3351                 ADD_MASK(tempMask);
3352                 setSize = 0;
3353                 for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride;
3354                   j++) {
3355                     if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3356                         KMP_CPU_CLR(j, tempMask);
3357                     }
3358                     else if ((j > maxOsId) ||
3359                       (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3360                         if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3361                           && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
3362                             KMP_WARNING(AffIgnoreInvalidProcID, j);
3363                         }
3364                         KMP_CPU_CLR(j, tempMask);
3365                     }
3366                     else {
3367                         KMP_CPU_SET(j, tempMask);
3368                         setSize++;
3369                     }
3370                 }
3371                 for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
3372                     KMP_CPU_CLR(j, tempMask);
3373                 }
3374             }
3375         }
3376         KMP_CPU_ZERO(tempMask);
3377         setSize = 0;
3378 
3379         //
3380         // valid follow sets are ',' and EOL
3381         //
3382         SKIP_WS(scan);
3383         if (*scan == '\0') {
3384             break;
3385         }
3386         if (*scan == ',') {
3387             scan++;     // skip ','
3388             continue;
3389         }
3390 
3391         KMP_ASSERT2(0, "bad explicit places list");
3392     }
3393 
3394     *out_numMasks = nextNewMask;
3395     if (nextNewMask == 0) {
3396         *out_masks = NULL;
3397         KMP_INTERNAL_FREE(newMasks);
3398         return;
3399     }
3400     *out_masks
3401       = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
3402     KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
3403     __kmp_free(tempMask);
3404     KMP_INTERNAL_FREE(newMasks);
3405 }
3406 
3407 # endif /* OMP_40_ENABLED */
3408 
3409 #undef ADD_MASK
3410 #undef ADD_MASK_OSID
3411 
3412 static void
3413 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3414 {
3415     if ( __kmp_place_num_cores == 0 ) {
3416         if ( __kmp_place_num_threads_per_core == 0 ) {
3417             return;   // no cores limiting actions requested, exit
3418         }
3419         __kmp_place_num_cores = nCoresPerPkg;   // use all available cores
3420     }
3421     if ( !__kmp_affinity_uniform_topology() ) {
3422         KMP_WARNING( AffThrPlaceNonUniform );
3423         return; // don't support non-uniform topology
3424     }
3425     if ( depth != 3 ) {
3426         KMP_WARNING( AffThrPlaceNonThreeLevel );
3427         return; // don't support not-3-level topology
3428     }
3429     if ( __kmp_place_num_threads_per_core == 0 ) {
3430         __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore;  // use all HW contexts
3431     }
3432     if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
3433         KMP_WARNING( AffThrPlaceManyCores );
3434         return;
3435     }
3436 
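    //
    // Worked example (numbers assumed, not taken from this code): on a uniform
    // machine with nPackages = 2, nCoresPerPkg = 8 and __kmp_nThreadsPerCore = 2,
    // a request for __kmp_place_num_cores = 4 and
    // __kmp_place_num_threads_per_core = 1 keeps 2 * 4 * 1 = 8 of the original
    // 2 * 8 * 2 = 32 address2os entries.
    //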
3437     AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3438                             nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3439     int i, j, k, n_old = 0, n_new = 0;
3440     for ( i = 0; i < nPackages; ++i ) {
3441         for ( j = 0; j < nCoresPerPkg; ++j ) {
3442             if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) {
3443                 n_old += __kmp_nThreadsPerCore;   // skip not-requested core
3444             } else {
3445                 for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) {
3446                     if ( k < __kmp_place_num_threads_per_core ) {
3447                         newAddr[n_new] = (*pAddr)[n_old];   // copy requested core's data to new location
3448                         n_new++;
3449                     }
3450                     n_old++;
3451                 }
3452             }
3453         }
3454     }
3455     nCoresPerPkg = __kmp_place_num_cores;                     // correct nCoresPerPkg
3456     __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3457     __kmp_avail_proc = n_new;                                 // correct avail_proc
3458     __kmp_ncores = nPackages * __kmp_place_num_cores;         // correct ncores
3459 
3460     __kmp_free( *pAddr );
3461     *pAddr = newAddr;      // replace old topology with new one
3462 }
3463 
3464 
3465 static AddrUnsPair *address2os = NULL;
3466 static int           * procarr = NULL;
3467 static int     __kmp_aff_depth = 0;
3468 
3469 static void
3470 __kmp_aux_affinity_initialize(void)
3471 {
3472     if (__kmp_affinity_masks != NULL) {
3473         KMP_ASSERT(fullMask != NULL);
3474         return;
3475     }
3476 
3477     //
3478     // Create the "full" mask - this defines all of the processors that we
3479     // consider to be in the machine model.  If respect is set, then it is
3480     // the initialization thread's affinity mask.  Otherwise, it is all
3481     // processors that we know about on the machine.
3482     //
3483     if (fullMask == NULL) {
3484         fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
3485     }
3486     if (KMP_AFFINITY_CAPABLE()) {
3487         if (__kmp_affinity_respect_mask) {
3488             __kmp_get_system_affinity(fullMask, TRUE);
3489 
3490             //
3491             // Count the number of available processors.
3492             //
3493             unsigned i;
3494             __kmp_avail_proc = 0;
3495             for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
3496                 if (! KMP_CPU_ISSET(i, fullMask)) {
3497                     continue;
3498                 }
3499                 __kmp_avail_proc++;
3500             }
3501             if (__kmp_avail_proc > __kmp_xproc) {
3502                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3503                   && (__kmp_affinity_type != affinity_none))) {
3504                     KMP_WARNING(ErrorInitializeAffinity);
3505                 }
3506                 __kmp_affinity_type = affinity_none;
3507                 KMP_AFFINITY_DISABLE();
3508                 return;
3509             }
3510         }
3511         else {
3512             __kmp_affinity_entire_machine_mask(fullMask);
3513             __kmp_avail_proc = __kmp_xproc;
3514         }
3515     }
3516 
3517     int depth = -1;
3518     kmp_i18n_id_t msg_id = kmp_i18n_null;
3519 
3520     //
3521     // For backward compatibility, setting KMP_CPUINFO_FILE =>
3522     // KMP_TOPOLOGY_METHOD=cpuinfo
3523     //
3524     if ((__kmp_cpuinfo_file != NULL) &&
3525       (__kmp_affinity_top_method == affinity_top_method_all)) {
3526         __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3527     }
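    //
    // For example (illustrative): with KMP_CPUINFO_FILE set to a saved copy of
    // /proc/cpuinfo and no explicit KMP_TOPOLOGY_METHOD, the cpuinfo branch
    // below reads the topology from that file instead of probing the hardware.
    //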
3528 
3529     if (__kmp_affinity_top_method == affinity_top_method_all) {
3530         //
3531         // In the default code path, errors are not fatal - we just try using
3532         // another method.  We only emit a warning message if affinity is on,
3533         // or the verbose flag is set, and the nowarnings flag was not set.
3534         //
3535         const char *file_name = NULL;
3536         int line = 0;
3537 
3538 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3539 
3540         if (__kmp_affinity_verbose) {
3541             KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3542         }
3543 
3544         file_name = NULL;
3545         depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3546         if (depth == 0) {
3547             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3548             KMP_ASSERT(address2os == NULL);
3549             return;
3550         }
3551 
3552         if (depth < 0) {
3553             if (__kmp_affinity_verbose) {
3554                 if (msg_id != kmp_i18n_null) {
3555                     KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3556                       KMP_I18N_STR(DecodingLegacyAPIC));
3557                 }
3558                 else {
3559                     KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
3560                 }
3561             }
3562 
3563             file_name = NULL;
3564             depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3565             if (depth == 0) {
3566                 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3567                 KMP_ASSERT(address2os == NULL);
3568                 return;
3569             }
3570         }
3571 
3572 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3573 
3574 # if KMP_OS_LINUX
3575 
3576         if (depth < 0) {
3577             if (__kmp_affinity_verbose) {
3578                 if (msg_id != kmp_i18n_null) {
3579                     KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3580                 }
3581                 else {
3582                     KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3583                 }
3584             }
3585 
3586             FILE *f = fopen("/proc/cpuinfo", "r");
3587             if (f == NULL) {
3588                 msg_id = kmp_i18n_str_CantOpenCpuinfo;
3589             }
3590             else {
3591                 file_name = "/proc/cpuinfo";
3592                 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3593                 fclose(f);
3594                 if (depth == 0) {
3595                     KMP_ASSERT(__kmp_affinity_type == affinity_none);
3596                     KMP_ASSERT(address2os == NULL);
3597                     return;
3598                 }
3599             }
3600         }
3601 
3602 # endif /* KMP_OS_LINUX */
3603 
3604 # if KMP_GROUP_AFFINITY
3605 
3606         if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
3607             if (__kmp_affinity_verbose) {
3608                 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3609             }
3610 
3611             depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3612             KMP_ASSERT(depth != 0);
3613         }
3614 
3615 # endif /* KMP_GROUP_AFFINITY */
3616 
3617         if (depth < 0) {
3618             if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
3619                 if (file_name == NULL) {
3620                     KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
3621                 }
3622                 else if (line == 0) {
3623                     KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
3624                 }
3625                 else {
3626                     KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
3627                 }
3628             }
3629             // FIXME - print msg if msg_id = kmp_i18n_null ???
3630 
3631             file_name = "";
3632             depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3633             if (depth == 0) {
3634                 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3635                 KMP_ASSERT(address2os == NULL);
3636                 return;
3637             }
3638             KMP_ASSERT(depth > 0);
3639             KMP_ASSERT(address2os != NULL);
3640         }
3641     }
3642 
3643     //
3644     // If the user has specified that a particular topology discovery method
3645     // is to be used, then we abort if that method fails.  The exception is
3646     // group affinity, which might have been implicitly set.
3647     //
3648 
3649 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3650 
3651     else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3652         if (__kmp_affinity_verbose) {
3653             KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3654               KMP_I18N_STR(Decodingx2APIC));
3655         }
3656 
3657         depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3658         if (depth == 0) {
3659             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3660             KMP_ASSERT(address2os == NULL);
3661             return;
3662         }
3663         if (depth < 0) {
3664             KMP_ASSERT(msg_id != kmp_i18n_null);
3665             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3666         }
3667     }
3668     else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3669         if (__kmp_affinity_verbose) {
3670             KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3671               KMP_I18N_STR(DecodingLegacyAPIC));
3672         }
3673 
3674         depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3675         if (depth == 0) {
3676             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3677             KMP_ASSERT(address2os == NULL);
3678             return;
3679         }
3680         if (depth < 0) {
3681             KMP_ASSERT(msg_id != kmp_i18n_null);
3682             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3683         }
3684     }
3685 
3686 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3687 
3688     else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3689         const char *filename;
3690         if (__kmp_cpuinfo_file != NULL) {
3691             filename = __kmp_cpuinfo_file;
3692         }
3693         else {
3694             filename = "/proc/cpuinfo";
3695         }
3696 
3697         if (__kmp_affinity_verbose) {
3698             KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
3699         }
3700 
3701         FILE *f = fopen(filename, "r");
3702         if (f == NULL) {
3703             int code = errno;
3704             if (__kmp_cpuinfo_file != NULL) {
3705                 __kmp_msg(
3706                     kmp_ms_fatal,
3707                     KMP_MSG(CantOpenFileForReading, filename),
3708                     KMP_ERR(code),
3709                     KMP_HNT(NameComesFrom_CPUINFO_FILE),
3710                     __kmp_msg_null
3711                 );
3712             }
3713             else {
3714                 __kmp_msg(
3715                     kmp_ms_fatal,
3716                     KMP_MSG(CantOpenFileForReading, filename),
3717                     KMP_ERR(code),
3718                     __kmp_msg_null
3719                 );
3720             }
3721         }
3722         int line = 0;
3723         depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3724         fclose(f);
3725         if (depth < 0) {
3726             KMP_ASSERT(msg_id != kmp_i18n_null);
3727             if (line > 0) {
3728                 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
3729             }
3730             else {
3731                 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3732             }
3733         }
3734         if (__kmp_affinity_type == affinity_none) {
3735             KMP_ASSERT(depth == 0);
3736             KMP_ASSERT(address2os == NULL);
3737             return;
3738         }
3739     }
3740 
3741 # if KMP_GROUP_AFFINITY
3742 
3743     else if (__kmp_affinity_top_method == affinity_top_method_group) {
3744         if (__kmp_affinity_verbose) {
3745             KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3746         }
3747 
3748         depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3749         KMP_ASSERT(depth != 0);
3750         if (depth < 0) {
3751             KMP_ASSERT(msg_id != kmp_i18n_null);
3752             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3753         }
3754     }
3755 
3756 # endif /* KMP_GROUP_AFFINITY */
3757 
3758     else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3759         if (__kmp_affinity_verbose) {
3760             KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
3761         }
3762 
3763         depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3764         if (depth == 0) {
3765             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3766             KMP_ASSERT(address2os == NULL);
3767             return;
3768         }
3769         // should not fail
3770         KMP_ASSERT(depth > 0);
3771         KMP_ASSERT(address2os != NULL);
3772     }
3773 
3774     if (address2os == NULL) {
3775         if (KMP_AFFINITY_CAPABLE()
3776           && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3777           && (__kmp_affinity_type != affinity_none)))) {
3778             KMP_WARNING(ErrorInitializeAffinity);
3779         }
3780         __kmp_affinity_type = affinity_none;
3781         KMP_AFFINITY_DISABLE();
3782         return;
3783     }
3784 
3785     __kmp_apply_thread_places(&address2os, depth);
3786 
3787     //
3788     // Create the table of masks, indexed by thread Id.
3789     //
3790     unsigned maxIndex;
3791     unsigned numUnique;
3792     kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
3793       address2os, __kmp_avail_proc);
3794     if (__kmp_affinity_gran_levels == 0) {
3795         KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
3796     }
3797 
3798     //
3799     // Set the childNums vector in all Address objects.  This must be done
3800     // before we can sort using __kmp_affinity_cmp_Address_child_num(),
3801     // which takes into account the setting of __kmp_affinity_compact.
3802     //
3803     __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
3804 
3805     switch (__kmp_affinity_type) {
3806 
3807         case affinity_explicit:
3808         KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3809 # if OMP_40_ENABLED
3810         if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3811 # endif
3812         {
3813             __kmp_affinity_process_proclist(&__kmp_affinity_masks,
3814               &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3815               maxIndex);
3816         }
3817 # if OMP_40_ENABLED
3818         else {
3819             __kmp_affinity_process_placelist(&__kmp_affinity_masks,
3820               &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3821               maxIndex);
3822         }
3823 # endif
3824         if (__kmp_affinity_num_masks == 0) {
3825             if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3826               && (__kmp_affinity_type != affinity_none))) {
3827                 KMP_WARNING(AffNoValidProcID);
3828             }
3829             __kmp_affinity_type = affinity_none;
3830             return;
3831         }
3832         break;
3833 
3834         //
3835         // The other affinity types rely on sorting the Addresses according
3836         // to some permutation of the machine topology tree.  Set
3837         // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
3838         // then jump to a common code fragment to do the sort and create
3839         // the array of affinity masks.
3840         //
3841 
3842         case affinity_logical:
3843         __kmp_affinity_compact = 0;
3844         if (__kmp_affinity_offset) {
3845             __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3846               % __kmp_avail_proc;
3847         }
3848         goto sortAddresses;
3849 
3850         case affinity_physical:
3851         if (__kmp_nThreadsPerCore > 1) {
3852             __kmp_affinity_compact = 1;
3853             if (__kmp_affinity_compact >= depth) {
3854                 __kmp_affinity_compact = 0;
3855             }
3856         } else {
3857             __kmp_affinity_compact = 0;
3858         }
3859         if (__kmp_affinity_offset) {
3860             __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3861               % __kmp_avail_proc;
3862         }
3863         goto sortAddresses;
3864 
3865         case affinity_scatter:
3866         if (__kmp_affinity_compact >= depth) {
3867             __kmp_affinity_compact = 0;
3868         }
3869         else {
3870             __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
3871         }
3872         goto sortAddresses;
3873 
3874         case affinity_compact:
3875         if (__kmp_affinity_compact >= depth) {
3876             __kmp_affinity_compact = depth - 1;
3877         }
3878         goto sortAddresses;
3879 
3880         case affinity_balanced:
3881         // Balanced works only for the case of a single package
3882         if( nPackages > 1 ) {
3883             if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
3884                 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
3885             }
3886             __kmp_affinity_type = affinity_none;
3887             return;
3888         } else if( __kmp_affinity_uniform_topology() ) {
3889             break;
3890         } else { // Non-uniform topology
3891 
3892             // Save the depth for further usage
3893             __kmp_aff_depth = depth;
3894 
3895             // Number of hyper threads per core in HT machine
3896             int nth_per_core = __kmp_nThreadsPerCore;
3897 
3898             int core_level;
3899             if( nth_per_core > 1 ) {
3900                 core_level = depth - 2;
3901             } else {
3902                 core_level = depth - 1;
3903             }
3904             int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
3905             int nproc = nth_per_core * ncores;
3906 
3907             procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
3908             for( int i = 0; i < nproc; i++ ) {
3909                 procarr[ i ] = -1;
3910             }
3911 
3912             for( int i = 0; i < __kmp_avail_proc; i++ ) {
3913                 int proc = address2os[ i ].second;
3914                 // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
3915                 // If there is only one thread per core then depth == 2: level 0 - package,
3916                 // level 1 - core.
3917                 int level = depth - 1;
3918 
3919                 // __kmp_nth_per_core == 1
3920                 int thread = 0;
3921                 int core = address2os[ i ].first.labels[ level ];
3922                 // If the thread level exists, that is, we have more than one thread context per core
3923                 if( nth_per_core > 1 ) {
3924                     thread = address2os[ i ].first.labels[ level ] % nth_per_core;
3925                     core = address2os[ i ].first.labels[ level - 1 ];
3926                 }
3927                 procarr[ core * nth_per_core + thread ] = proc;
3928             }
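            //
            // Sketch of the resulting procarr layout (values assumed): with
            // nth_per_core = 2 and ncores = 4, procarr has 8 slots indexed by
            // core * 2 + thread; a core with no available procs keeps -1 in
            // both of its slots, which __kmp_balanced_affinity later skips.
            //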
3929 
3930             break;
3931         }
3932 
3933         sortAddresses:
3934         //
3935         // Allocate the gtid->affinity mask table.
3936         //
3937         if (__kmp_affinity_dups) {
3938             __kmp_affinity_num_masks = __kmp_avail_proc;
3939         }
3940         else {
3941             __kmp_affinity_num_masks = numUnique;
3942         }
3943 
3944 # if OMP_40_ENABLED
3945         if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
3946           && ( __kmp_affinity_num_places > 0 )
3947           && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
3948             __kmp_affinity_num_masks = __kmp_affinity_num_places;
3949         }
3950 # endif
3951 
3952         __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
3953           __kmp_affinity_num_masks * __kmp_affin_mask_size);
3954 
3955         //
3956         // Sort the address2os table according to the current setting of
3957         // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
3958         //
3959         qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
3960           __kmp_affinity_cmp_Address_child_num);
3961         {
3962             int i;
3963             unsigned j;
3964             for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
3965                 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
3966                     continue;
3967                 }
3968                 unsigned osId = address2os[i].second;
3969                 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
3970                 kmp_affin_mask_t *dest
3971                   = KMP_CPU_INDEX(__kmp_affinity_masks, j);
3972                 KMP_ASSERT(KMP_CPU_ISSET(osId, src));
3973                 KMP_CPU_COPY(dest, src);
3974                 if (++j >= __kmp_affinity_num_masks) {
3975                     break;
3976                 }
3977             }
3978             KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
3979         }
3980         break;
3981 
3982         default:
3983         KMP_ASSERT2(0, "Unexpected affinity setting");
3984     }
3985 
3986     __kmp_free(osId2Mask);
3987     machine_hierarchy.init(address2os, __kmp_avail_proc);
3988 }
3989 
3990 
3991 void
3992 __kmp_affinity_initialize(void)
3993 {
3994     //
3995     // Much of the code above was written assuming that if a machine was not
3996     // affinity capable, then __kmp_affinity_type == affinity_none.  We now
3997     // explicitly represent this as __kmp_affinity_type == affinity_disabled.
3998     //
3999     // There are too many checks for __kmp_affinity_type == affinity_none
4000     // in this code.  Instead of trying to change them all, check if
4001     // __kmp_affinity_type == affinity_disabled, and if so, slam it with
4002     // affinity_none, call the real initialization routine, then restore
4003     // __kmp_affinity_type to affinity_disabled.
4004     //
4005     int disabled = (__kmp_affinity_type == affinity_disabled);
4006     if (! KMP_AFFINITY_CAPABLE()) {
4007         KMP_ASSERT(disabled);
4008     }
4009     if (disabled) {
4010         __kmp_affinity_type = affinity_none;
4011     }
4012     __kmp_aux_affinity_initialize();
4013     if (disabled) {
4014         __kmp_affinity_type = affinity_disabled;
4015     }
4016 }
4017 
4018 
4019 void
4020 __kmp_affinity_uninitialize(void)
4021 {
4022     if (__kmp_affinity_masks != NULL) {
4023         __kmp_free(__kmp_affinity_masks);
4024         __kmp_affinity_masks = NULL;
4025     }
4026     if (fullMask != NULL) {
4027         KMP_CPU_FREE(fullMask);
4028         fullMask = NULL;
4029     }
4030     __kmp_affinity_num_masks = 0;
4031 # if OMP_40_ENABLED
4032     __kmp_affinity_num_places = 0;
4033 # endif
4034     if (__kmp_affinity_proclist != NULL) {
4035         __kmp_free(__kmp_affinity_proclist);
4036         __kmp_affinity_proclist = NULL;
4037     }
4038     if( address2os != NULL ) {
4039         __kmp_free( address2os );
4040         address2os = NULL;
4041     }
4042     if( procarr != NULL ) {
4043         __kmp_free( procarr );
4044         procarr = NULL;
4045     }
4046 }
4047 
4048 
4049 void
4050 __kmp_affinity_set_init_mask(int gtid, int isa_root)
4051 {
4052     if (! KMP_AFFINITY_CAPABLE()) {
4053         return;
4054     }
4055 
4056     kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4057     if (th->th.th_affin_mask == NULL) {
4058         KMP_CPU_ALLOC(th->th.th_affin_mask);
4059     }
4060     else {
4061         KMP_CPU_ZERO(th->th.th_affin_mask);
4062     }
4063 
4064     //
4065     // Copy the thread mask to the kmp_info_t structure.
4066     // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
4067     // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
4068     // is set, then the full mask is the same as the mask of the initialization
4069     // thread.
4070     //
4071     kmp_affin_mask_t *mask;
4072     int i;
4073 
4074 # if OMP_40_ENABLED
4075     if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
4076 # endif
4077     {
4078         if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)
4079           ) {
4080 # if KMP_GROUP_AFFINITY
4081             if (__kmp_num_proc_groups > 1) {
4082                 return;
4083             }
4084 # endif
4085             KMP_ASSERT(fullMask != NULL);
4086             i = KMP_PLACE_ALL;
4087             mask = fullMask;
4088         }
4089         else {
4090             KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4091             i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4092             mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4093         }
4094     }
4095 # if OMP_40_ENABLED
4096     else {
4097         if ((! isa_root)
4098           || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
4099 #  if KMP_GROUP_AFFINITY
4100             if (__kmp_num_proc_groups > 1) {
4101                 return;
4102             }
4103 #  endif
4104             KMP_ASSERT(fullMask != NULL);
4105             i = KMP_PLACE_ALL;
4106             mask = fullMask;
4107         }
4108         else {
4109             //
4110             // int i = some hash function or just a counter that doesn't
4111             // always start at 0.  Use gtid for now.
4112             //
4113             KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4114             i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4115             mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4116         }
4117     }
4118 # endif
4119 
4120 # if OMP_40_ENABLED
4121     th->th.th_current_place = i;
4122     if (isa_root) {
4123         th->th.th_new_place = i;
4124         th->th.th_first_place = 0;
4125         th->th.th_last_place = __kmp_affinity_num_masks - 1;
4126     }
4127 
4128     if (i == KMP_PLACE_ALL) {
4129         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4130           gtid));
4131     }
4132     else {
4133         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4134           gtid, i));
4135     }
4136 # else
4137     if (i == -1) {
4138         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
4139           gtid));
4140     }
4141     else {
4142         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
4143           gtid, i));
4144     }
4145 # endif /* OMP_40_ENABLED */
4146 
4147     KMP_CPU_COPY(th->th.th_affin_mask, mask);
4148 
4149     if (__kmp_affinity_verbose) {
4150         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4151         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4152           th->th.th_affin_mask);
4153         KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
4154           buf);
4155     }
4156 
4157 # if KMP_OS_WINDOWS
4158     //
4159     // On Windows* OS, the process affinity mask might have changed.
4160     // If the user didn't request affinity and this call fails,
4161     // just continue silently.  See CQ171393.
4162     //
4163     if ( __kmp_affinity_type == affinity_none ) {
4164         __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4165     }
4166     else
4167 # endif
4168     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4169 }
4170 
4171 
4172 # if OMP_40_ENABLED
4173 
4174 void
4175 __kmp_affinity_set_place(int gtid)
4176 {
4177     int retval;
4178 
4179     if (! KMP_AFFINITY_CAPABLE()) {
4180         return;
4181     }
4182 
4183     kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4184 
4185     KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
4186       gtid, th->th.th_new_place, th->th.th_current_place));
4187 
4188     //
4189     // Check that the new place is within this thread's partition.
4190     //
4191     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4192     KMP_ASSERT(th->th.th_new_place >= 0);
4193     KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4194     if (th->th.th_first_place <= th->th.th_last_place) {
4195         KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
4196          && (th->th.th_new_place <= th->th.th_last_place));
4197     }
4198     else {
4199         KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
4200          || (th->th.th_new_place >= th->th.th_last_place));
4201     }
4202 
4203     //
4204     // Copy the thread mask to the kmp_info_t structure,
4205     // and set this thread's affinity.
4206     //
4207     kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
4208       th->th.th_new_place);
4209     KMP_CPU_COPY(th->th.th_affin_mask, mask);
4210     th->th.th_current_place = th->th.th_new_place;
4211 
4212     if (__kmp_affinity_verbose) {
4213         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4214         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4215           th->th.th_affin_mask);
4216         KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4217           gtid, buf);
4218     }
4219     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4220 }
4221 
4222 # endif /* OMP_40_ENABLED */
4223 
4224 
4225 int
4226 __kmp_aux_set_affinity(void **mask)
4227 {
4228     int gtid;
4229     kmp_info_t *th;
4230     int retval;
4231 
4232     if (! KMP_AFFINITY_CAPABLE()) {
4233         return -1;
4234     }
4235 
4236     gtid = __kmp_entry_gtid();
4237     KA_TRACE(1000, ;{
4238         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4239         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4240           (kmp_affin_mask_t *)(*mask));
4241         __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4242           gtid, buf);
4243     });
4244 
4245     if (__kmp_env_consistency_check) {
4246         if ((mask == NULL) || (*mask == NULL)) {
4247             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4248         }
4249         else {
4250             unsigned proc;
4251             int num_procs = 0;
4252 
4253             for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
4254                 if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4255                     continue;
4256                 }
4257                 num_procs++;
4258                 if (! KMP_CPU_ISSET(proc, fullMask)) {
4259                     KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4260                     break;
4261                 }
4262             }
4263             if (num_procs == 0) {
4264                 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4265             }
4266 
4267 # if KMP_GROUP_AFFINITY
4268             if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4269                 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4270             }
4271 # endif /* KMP_GROUP_AFFINITY */
4272 
4273         }
4274     }
4275 
4276     th = __kmp_threads[gtid];
4277     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4278     retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4279     if (retval == 0) {
4280         KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4281     }
4282 
4283 # if OMP_40_ENABLED
4284     th->th.th_current_place = KMP_PLACE_UNDEFINED;
4285     th->th.th_new_place = KMP_PLACE_UNDEFINED;
4286     th->th.th_first_place = 0;
4287     th->th.th_last_place = __kmp_affinity_num_masks - 1;
4288 
4289     //
4290     // Turn off 4.0 affinity for the current thread at this parallel level.
4291     //
4292     th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
4293 # endif
4294 
4295     return retval;
4296 }
4297 
4298 
4299 int
4300 __kmp_aux_get_affinity(void **mask)
4301 {
4302     int gtid;
4303     int retval;
4304     kmp_info_t *th;
4305 
4306     if (! KMP_AFFINITY_CAPABLE()) {
4307         return -1;
4308     }
4309 
4310     gtid = __kmp_entry_gtid();
4311     th = __kmp_threads[gtid];
4312     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4313 
4314     KA_TRACE(1000, ;{
4315         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4316         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4317           th->th.th_affin_mask);
4318         __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
4319     });
4320 
4321     if (__kmp_env_consistency_check) {
4322         if ((mask == NULL) || (*mask == NULL)) {
4323             KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4324         }
4325     }
4326 
4327 # if !KMP_OS_WINDOWS
4328 
4329     retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4330     KA_TRACE(1000, ;{
4331         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4332         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4333           (kmp_affin_mask_t *)(*mask));
4334         __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
4335     });
4336     return retval;
4337 
4338 # else
4339 
4340     KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4341     return 0;
4342 
4343 # endif /* KMP_OS_WINDOWS */
4344 
4345 }
4346 
4347 int
4348 __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
4349 {
4350     int retval;
4351 
4352     if (! KMP_AFFINITY_CAPABLE()) {
4353         return -1;
4354     }
4355 
4356     KA_TRACE(1000, ;{
4357         int gtid = __kmp_entry_gtid();
4358         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4359         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4360           (kmp_affin_mask_t *)(*mask));
4361         __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4362           proc, gtid, buf);
4363     });
4364 
4365     if (__kmp_env_consistency_check) {
4366         if ((mask == NULL) || (*mask == NULL)) {
4367             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4368         }
4369     }
4370 
4371     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4372         return -1;
4373     }
4374     if (! KMP_CPU_ISSET(proc, fullMask)) {
4375         return -2;
4376     }
4377 
4378     KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4379     return 0;
4380 }
4381 
4382 
4383 int
4384 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4385 {
4386     int retval;
4387 
4388     if (! KMP_AFFINITY_CAPABLE()) {
4389         return -1;
4390     }
4391 
4392     KA_TRACE(1000, ;{
4393         int gtid = __kmp_entry_gtid();
4394         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4395         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4396           (kmp_affin_mask_t *)(*mask));
4397         __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4398           proc, gtid, buf);
4399     });
4400 
4401     if (__kmp_env_consistency_check) {
4402         if ((mask == NULL) || (*mask == NULL)) {
4403             KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4404         }
4405     }
4406 
4407     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4408         return -1;
4409     }
4410     if (! KMP_CPU_ISSET(proc, fullMask)) {
4411         return -2;
4412     }
4413 
4414     KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4415     return 0;
4416 }
4417 
4418 
4419 int
4420 __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4421 {
4422     int retval;
4423 
4424     if (! KMP_AFFINITY_CAPABLE()) {
4425         return -1;
4426     }
4427 
4428     KA_TRACE(1000, ;{
4429         int gtid = __kmp_entry_gtid();
4430         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4431         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4432           (kmp_affin_mask_t *)(*mask));
4433         __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4434           proc, gtid, buf);
4435     });
4436 
4437     if (__kmp_env_consistency_check) {
4438         if ((mask == NULL) || (*mask == NULL)) {
4439             KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
4440         }
4441     }
4442 
4443     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4444         return 0;
4445     }
4446     if (! KMP_CPU_ISSET(proc, fullMask)) {
4447         return 0;
4448     }
4449 
4450     return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4451 }
4452 
4453 
4454 // Dynamic affinity settings - Affinity balanced
4455 void __kmp_balanced_affinity( int tid, int nthreads )
4456 {
4457     if( __kmp_affinity_uniform_topology() ) {
4458         int coreID;
4459         int threadID;
4460         // Number of hyper threads per core in HT machine
4461         int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4462         // Number of cores
4463         int ncores = __kmp_ncores;
4464         // How many threads will be bound to each core
4465         int chunk = nthreads / ncores;
4466         // How many cores will have an additional thread bound to them - the "big" cores
4467         int big_cores = nthreads % ncores;
4468         // Number of threads on the big cores
4469         int big_nth = ( chunk + 1 ) * big_cores;
4470         if( tid < big_nth ) {
4471             coreID = tid / (chunk + 1 );
4472             threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
4473         } else { //tid >= big_nth
4474             coreID = ( tid - big_cores ) / chunk;
4475             threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
4476         }
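        //
        // Worked example (numbers assumed): nthreads = 10 on ncores = 4 gives
        // chunk = 2, big_cores = 2 and big_nth = 6, so threads 0-2 land on
        // core 0, 3-5 on core 1, 6-7 on core 2 and 8-9 on core 3.
        //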
4477 
4478         KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4479           "Illegal set affinity operation when not capable");
4480 
4481         kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
4482         KMP_CPU_ZERO(mask);
4483 
4484         // Granularity == thread
4485         if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4486             int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
4487             KMP_CPU_SET( osID, mask);
4488         } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4489             for( int i = 0; i < __kmp_nth_per_core; i++ ) {
4490                 int osID;
4491                 osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
4492                 KMP_CPU_SET( osID, mask);
4493             }
4494         }
4495         if (__kmp_affinity_verbose) {
4496             char buf[KMP_AFFIN_MASK_PRINT_LEN];
4497             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4498             KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4499               tid, buf);
4500         }
4501         __kmp_set_system_affinity( mask, TRUE );
4502     } else { // Non-uniform topology
4503 
4504         kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
4505         KMP_CPU_ZERO(mask);
4506 
        // Number of hyper-threads per core on an HT machine
4508         int nth_per_core = __kmp_nThreadsPerCore;
4509         int core_level;
4510         if( nth_per_core > 1 ) {
4511             core_level = __kmp_aff_depth - 2;
4512         } else {
4513             core_level = __kmp_aff_depth - 1;
4514         }
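        // The address2os labels go from the outermost topology level down to
        // the innermost; with hyper-threading the innermost level is the
        // thread context, so the core level sits one level above it.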
4515 
        // Number of cores - maximum value; it does not count trailing cores with 0 processors
4517         int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
4518 
        // As a performance optimization, handle the special case nthreads == __kmp_avail_proc separately
4520         if( nthreads == __kmp_avail_proc ) {
4521             if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4522                 int osID = address2os[ tid ].second;
4523                 KMP_CPU_SET( osID, mask);
4524             } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4525                 int coreID = address2os[ tid ].first.labels[ core_level ];
                // Count the osIDs found for the current core; there can be at most nth_per_core of them.
                // Since address2os is sorted, we can break as soon as cnt == nth_per_core.
4528                 int cnt = 0;
4529                 for( int i = 0; i < __kmp_avail_proc; i++ ) {
4530                     int osID = address2os[ i ].second;
4531                     int core = address2os[ i ].first.labels[ core_level ];
4532                     if( core == coreID ) {
4533                         KMP_CPU_SET( osID, mask);
4534                         cnt++;
4535                         if( cnt == nth_per_core ) {
4536                             break;
4537                         }
4538                     }
4539                 }
4540             }
4541         } else if( nthreads <= __kmp_ncores ) {
4542 
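            // Each thread gets a core of its own: walk the cores that have
            // at least one available proc and bind thread tid to the tid-th
            // such core - all of its procs for core granularity, or only the
            // first available proc for thread granularity.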
4543             int core = 0;
4544             for( int i = 0; i < ncores; i++ ) {
                // Check whether this core has any available procs in procarr[]
4546                 int in_mask = 0;
4547                 for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
4549                         in_mask = 1;
4550                         break;
4551                     }
4552                 }
4553                 if( in_mask ) {
4554                     if( tid == core ) {
4555                         for( int j = 0; j < nth_per_core; j++ ) {
4556                             int osID = procarr[ i * nth_per_core + j ];
4557                             if( osID != -1 ) {
4558                                 KMP_CPU_SET( osID, mask );
4559                                 // For granularity=thread it is enough to set the first available osID for this core
4560                                 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4561                                     break;
4562                                 }
4563                             }
4564                         }
4565                         break;
4566                     } else {
4567                         core++;
4568                     }
4569                 }
4570             }
4571 
4572         } else { // nthreads > __kmp_ncores
4573 
4574             // Array to save the number of processors at each core
4575             int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores);
            // Array to save the number of cores with exactly x available processors
4577             int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
            // Array to save the number of cores with at least x available processors
4579             int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
4580 
4581             for( int i = 0; i <= nth_per_core; i++ ) {
4582                 ncores_with_x_procs[ i ] = 0;
4583                 ncores_with_x_to_max_procs[ i ] = 0;
4584             }
4585 
4586             for( int i = 0; i < ncores; i++ ) {
4587                 int cnt = 0;
4588                 for( int j = 0; j < nth_per_core; j++ ) {
4589                     if( procarr[ i * nth_per_core + j ] != -1 ) {
4590                         cnt++;
4591                     }
4592                 }
4593                 nproc_at_core[ i ] = cnt;
4594                 ncores_with_x_procs[ cnt ]++;
4595             }
4596 
4597             for( int i = 0; i <= nth_per_core; i++ ) {
4598                 for( int j = i; j <= nth_per_core; j++ ) {
4599                     ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4600                 }
4601             }
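            // ncores_with_x_to_max_procs[ x ] now holds the number of cores
            // with at least x available procs.  Illustrative example
            // (hypothetical counts): with nth_per_core = 2 and per-core
            // counts { 2, 1, 2 }, ncores_with_x_procs = { 0, 1, 2 } and
            // ncores_with_x_to_max_procs = { 3, 3, 2 }.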
4602 
4603             // Max number of processors
4604             int nproc = nth_per_core * ncores;
            // An array to keep the number of threads assigned to each hardware context
4606             int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4607             for( int i = 0; i < nproc; i++ ) {
4608                 newarr[ i ] = 0;
4609             }
4610 
4611             int nth = nthreads;
4612             int flag = 0;
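            // Distribute the threads over the available hardware contexts.
            // On the first pass (flag == 0) every context receives at most
            // one thread: sweep j = 1 gives one thread to each core with at
            // least one available proc, sweep j = 2 a second thread to each
            // core with at least two procs, and so on.  If threads remain
            // after every context holds one, later passes (flag == 1) keep
            // cycling over the cores in the same order and stack the extra
            // threads onto already occupied contexts.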
4613             while( nth > 0 ) {
4614                 for( int j = 1; j <= nth_per_core; j++ ) {
4615                     int cnt = ncores_with_x_to_max_procs[ j ];
4616                     for( int i = 0; i < ncores; i++ ) {
4617                         // Skip the core with 0 processors
4618                         if( nproc_at_core[ i ] == 0 ) {
4619                             continue;
4620                         }
4621                         for( int k = 0; k < nth_per_core; k++ ) {
4622                             if( procarr[ i * nth_per_core + k ] != -1 ) {
4623                                 if( newarr[ i * nth_per_core + k ] == 0 ) {
4624                                     newarr[ i * nth_per_core + k ] = 1;
4625                                     cnt--;
4626                                     nth--;
4627                                     break;
4628                                 } else {
4629                                     if( flag != 0 ) {
4630                                         newarr[ i * nth_per_core + k ] ++;
4631                                         cnt--;
4632                                         nth--;
4633                                         break;
4634                                     }
4635                                 }
4636                             }
4637                         }
4638                         if( cnt == 0 || nth == 0 ) {
4639                             break;
4640                         }
4641                     }
4642                     if( nth == 0 ) {
4643                         break;
4644                     }
4645                 }
4646                 flag = 1;
4647             }
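            // newarr[ ctx ] now holds how many threads were assigned to each
            // hardware context.  Accumulate these counts; the first context
            // whose running sum exceeds tid is the one this thread binds to.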
4648             int sum = 0;
4649             for( int i = 0; i < nproc; i++ ) {
4650                 sum += newarr[ i ];
4651                 if( sum > tid ) {
4652                     // Granularity == thread
4653                     if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4654                         int osID = procarr[ i ];
4655                         KMP_CPU_SET( osID, mask);
4656                     } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4657                         int coreID = i / nth_per_core;
4658                         for( int ii = 0; ii < nth_per_core; ii++ ) {
4659                             int osID = procarr[ coreID * nth_per_core + ii ];
4660                             if( osID != -1 ) {
4661                                 KMP_CPU_SET( osID, mask);
4662                             }
4663                         }
4664                     }
4665                     break;
4666                 }
4667             }
4668             __kmp_free( newarr );
4669         }
4670 
4671         if (__kmp_affinity_verbose) {
4672             char buf[KMP_AFFIN_MASK_PRINT_LEN];
4673             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4674             KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4675               tid, buf);
4676         }
4677         __kmp_set_system_affinity( mask, TRUE );
4678     }
4679 }
4680 
4681 #else
4682     // affinity not supported
4683 
4684 static const kmp_uint32 noaff_maxLevels=7;
4685 kmp_uint32 noaff_skipPerLevel[noaff_maxLevels];
4686 kmp_uint32 noaff_depth;
4687 kmp_uint8 noaff_leaf_kids;
4688 kmp_int8 noaff_uninitialized=1;
4689 
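// Build a small synthetic machine hierarchy to drive the hierarchical
// barrier when affinity information is unavailable.  The CAS below makes
// sure only one thread constructs the tables; late arrivals spin until the
// initializer publishes noaff_uninitialized = 0.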
4690 void noaff_init(int nprocs)
4691 {
4692     kmp_int8 result = KMP_COMPARE_AND_STORE_ACQ8(&noaff_uninitialized, 1, 2);
4693     if (result == 0) return; // Already initialized
4694     else if (result == 2) { // Someone else is initializing
4695         while (TCR_1(noaff_uninitialized) != 0) KMP_CPU_PAUSE();
4696         return;
4697     }
4698     KMP_DEBUG_ASSERT(result==1);
4699 
4700     kmp_uint32 numPerLevel[noaff_maxLevels];
4701     noaff_depth = 1;
4702     for (kmp_uint32 i=0; i<noaff_maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
4703         numPerLevel[i] = 1;
4704         noaff_skipPerLevel[i] = 1;
4705     }
4706 
4707     numPerLevel[0] = 4;
4708     numPerLevel[1] = nprocs/4;
4709     if (nprocs%4) numPerLevel[1]++;
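    // Default shape: leaf nodes of (at most) 4 threads with ceil(nprocs / 4)
    // nodes above them; the loops below derive the depth and rebalance any
    // level that grew wider than the branching factor.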
4710 
4711     for (int i=noaff_maxLevels-1; i>=0; --i) // count non-empty levels to get depth
4712         if (numPerLevel[i] != 1 || noaff_depth > 1) // only count one top-level '1'
4713             noaff_depth++;
4714 
4715     kmp_uint32 branch = 4;
4716     if (numPerLevel[0] == 1) branch = nprocs/4;
4717     if (branch<4) branch=4;
4718     for (kmp_uint32 d=0; d<noaff_depth-1; ++d) { // optimize hierarchy width
4719         while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
4720             if (numPerLevel[d] & 1) numPerLevel[d]++;
4721             numPerLevel[d] = numPerLevel[d] >> 1;
4722             if (numPerLevel[d+1] == 1) noaff_depth++;
4723             numPerLevel[d+1] = numPerLevel[d+1] << 1;
4724         }
4725         if(numPerLevel[0] == 1) {
4726             branch = branch >> 1;
4727             if (branch<4) branch = 4;
4728         }
4729     }
4730 
4731     for (kmp_uint32 i=1; i<noaff_depth; ++i)
4732         noaff_skipPerLevel[i] = numPerLevel[i-1] * noaff_skipPerLevel[i-1];
4733     // Fill in hierarchy in the case of oversubscription
4734     for (kmp_uint32 i=noaff_depth; i<noaff_maxLevels; ++i)
4735         noaff_skipPerLevel[i] = 2*noaff_skipPerLevel[i-1];
4736     noaff_leaf_kids = (kmp_uint8)numPerLevel[0]-1;
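    // Illustrative trace (hypothetical nprocs): with nprocs = 16 this yields
    // numPerLevel = { 4, 4, 1, ... }, noaff_depth = 3,
    // noaff_skipPerLevel = { 1, 4, 16, 32, 64, 128, 256 } and
    // noaff_leaf_kids = 3.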
4737     noaff_uninitialized = 0; // One writer
4739 }
4740 
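// Export the fallback hierarchy into a thread's barrier state: its depth,
// the number of children under each leaf, and a pointer to the shared
// skip-per-level table.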
4741 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
4742     if (noaff_uninitialized)
4743         noaff_init(nproc);
4744 
4745     thr_bar->depth = noaff_depth;
4746     thr_bar->base_leaf_kids = noaff_leaf_kids;
4747     thr_bar->skip_per_level = noaff_skipPerLevel;
4748 }
4749 
4750 #endif // KMP_AFFINITY_SUPPORTED
4751