1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_io.h"
19 #include "kmp_str.h"
20 #include "kmp_wrapper_getpid.h"
21 
22 #if KMP_AFFINITY_SUPPORTED
23 
24 //
25 // Print the affinity mask to the character array in a pretty format.
26 //
27 char *
28 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
29 {
30     KMP_ASSERT(buf_len >= 40);
31     char *scan = buf;
32     char *end = buf + buf_len - 1;
33 
34     //
35     // Find first element / check for empty set.
36     //
37     size_t i;
38     for (i = 0; i < KMP_CPU_SETSIZE; i++) {
39         if (KMP_CPU_ISSET(i, mask)) {
40             break;
41         }
42     }
43     if (i == KMP_CPU_SETSIZE) {
44         KMP_SNPRINTF(scan, buf_len, "{<empty>}");
45         while (*scan != '\0') scan++;
46         KMP_ASSERT(scan <= end);
47         return buf;
48     }
49 
50     KMP_SNPRINTF(scan, buf_len, "{%ld", (long)i);
51     while (*scan != '\0') scan++;
52     i++;
53     for (; i < KMP_CPU_SETSIZE; i++) {
54         if (! KMP_CPU_ISSET(i, mask)) {
55             continue;
56         }
57 
58         //
59         // Check for buffer overflow.  A string of the form ",<n>" will have
60         // at most 10 characters, plus we want to leave room to print ",...}"
61         // if the set is too large to print for a total of 15 characters.
62         // We already left room for '\0' in setting end.
63         //
64         if (end - scan < 15) {
65            break;
66         }
67         KMP_SNPRINTF(scan, buf_len, ",%-ld", (long)i);
68         while (*scan != '\0') scan++;
69     }
70     if (i < KMP_CPU_SETSIZE) {
71         KMP_SNPRINTF(scan, buf_len,  ",...");
72         while (*scan != '\0') scan++;
73     }
74     KMP_SNPRINTF(scan, buf_len, "}");
75     while (*scan != '\0') scan++;
76     KMP_ASSERT(scan <= end);
77     return buf;
78 }
79 
80 
//
// Set 'mask' to cover every processor on the machine.  When multiple
// Windows* OS processor groups exist, each group occupies its own
// (CHAR_BIT * sizeof(DWORD_PTR))-bit slot in the mask; otherwise the
// first __kmp_xproc bits are set.
//
void
__kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
{
    KMP_CPU_ZERO(mask);

# if KMP_GROUP_AFFINITY

    if (__kmp_num_proc_groups > 1) {
        int group;
        KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
        for (group = 0; group < __kmp_num_proc_groups; group++) {
            int i;
            int num = __kmp_GetActiveProcessorCount(group);
            for (i = 0; i < num; i++) {
                // Proc ids are group-relative; scale by the per-group width.
                KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
            }
        }
    }
    else

# endif /* KMP_GROUP_AFFINITY */

    {
        int proc;
        for (proc = 0; proc < __kmp_xproc; proc++) {
            KMP_CPU_SET(proc, mask);
        }
    }
}
110 
111 
112 //
113 // In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
114 // functions.
115 //
116 // The icc codegen emits sections with extremely long names, of the form
117 // ".gnu.linkonce.<mangled_name>".  There seems to have been a linker bug
118 // introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
119 // some sort of memory corruption or table overflow that is triggered by
120 // these long strings.  I checked the latest version of the linker -
121 // GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
122 // fixed.
123 //
124 // Unfortunately, my attempts to reproduce it in a smaller example have
125 // failed - I'm not sure what the prospects are of getting it fixed
126 // properly - but we need a reproducer smaller than all of libomp.
127 //
128 // Work around the problem by avoiding inline constructors in such builds.
129 // We do this for all platforms, not just Linux* OS - non-inline functions are
// more debuggable and provide better coverage than inline functions.
131 // Use inline functions in shipping libs, for performance.
132 //
133 
134 # if !defined(KMP_DEBUG) && !defined(COVER)
135 
136 class Address {
137 public:
138     static const unsigned maxDepth = 32;
139     unsigned labels[maxDepth];
140     unsigned childNums[maxDepth];
141     unsigned depth;
142     unsigned leader;
143     Address(unsigned _depth)
144       : depth(_depth), leader(FALSE) {
145     }
146     Address &operator=(const Address &b) {
147         depth = b.depth;
148         for (unsigned i = 0; i < depth; i++) {
149             labels[i] = b.labels[i];
150             childNums[i] = b.childNums[i];
151         }
152         leader = FALSE;
153         return *this;
154     }
155     bool operator==(const Address &b) const {
156         if (depth != b.depth)
157             return false;
158         for (unsigned i = 0; i < depth; i++)
159             if(labels[i] != b.labels[i])
160                 return false;
161         return true;
162     }
163     bool isClose(const Address &b, int level) const {
164         if (depth != b.depth)
165             return false;
166         if ((unsigned)level >= depth)
167             return true;
168         for (unsigned i = 0; i < (depth - level); i++)
169             if(labels[i] != b.labels[i])
170                 return false;
171         return true;
172     }
173     bool operator!=(const Address &b) const {
174         return !operator==(b);
175     }
176 };
177 
// Pairs a topology Address ("first") with an OS processor id ("second").
class AddrUnsPair {
public:
    Address first;
    unsigned second;
    AddrUnsPair(Address _first, unsigned _second)
      : first(_first), second(_second) {
    }
    AddrUnsPair &operator=(const AddrUnsPair &b)
    {
        first = b.first;
        second = b.second;
        return *this;
    }
};
192 
193 # else
194 
//
// Out-of-line variant of Address used for debug/cover builds; see the
// comment above about the icc/GNU ld long-section-name workaround.
//
class Address {
public:
    static const unsigned maxDepth = 32;    // max hierarchy levels supported
    unsigned labels[maxDepth];              // physical id at each level, outermost first
    unsigned childNums[maxDepth];           // ordinal child numbers (see __kmp_affinity_assign_child_nums)
    unsigned depth;                         // number of valid entries in labels/childNums
    unsigned leader;
    Address(unsigned _depth);
    Address &operator=(const Address &b);
    bool operator==(const Address &b) const;
    bool isClose(const Address &b, int level) const;
    bool operator!=(const Address &b) const;
};
208 
// Construct an address with the given depth; labels/childNums are left
// uninitialized and leader defaults to FALSE.
Address::Address(unsigned _depth)
{
    depth = _depth;
    leader = FALSE;
}
214 
215 Address &Address::operator=(const Address &b) {
216     depth = b.depth;
217     for (unsigned i = 0; i < depth; i++) {
218         labels[i] = b.labels[i];
219         childNums[i] = b.childNums[i];
220     }
221     leader = FALSE;
222     return *this;
223 }
224 
225 bool Address::operator==(const Address &b) const {
226     if (depth != b.depth)
227         return false;
228     for (unsigned i = 0; i < depth; i++)
229         if(labels[i] != b.labels[i])
230             return false;
231     return true;
232 }
233 
// Returns true when the two addresses agree on all but the innermost
// 'level' levels, i.e. they are "close" in the topology at the given
// granularity.  Unequal depths are never close; a level covering the
// whole depth makes any two same-depth addresses close.
bool Address::isClose(const Address &b, int level) const {
    if (depth != b.depth)
        return false;
    if ((unsigned)level >= depth)
        return true;
    for (unsigned i = 0; i < (depth - level); i++)
        if(labels[i] != b.labels[i])
            return false;
    return true;
}
244 
245 bool Address::operator!=(const Address &b) const {
246     return !operator==(b);
247 }
248 
//
// Out-of-line variant of AddrUnsPair (debug/cover builds): pairs a
// topology Address ("first") with an OS processor id ("second").
//
class AddrUnsPair {
public:
    Address first;
    unsigned second;
    AddrUnsPair(Address _first, unsigned _second);
    AddrUnsPair &operator=(const AddrUnsPair &b);
};
256 
// Construct from an address and the corresponding OS proc id.
AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
  : first(_first), second(_second)
{
}
261 
262 AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
263 {
264     first = b.first;
265     second = b.second;
266     return *this;
267 }
268 
269 # endif /* !defined(KMP_DEBUG) && !defined(COVER) */
270 
271 
272 static int
273 __kmp_affinity_cmp_Address_labels(const void *a, const void *b)
274 {
275     const Address *aa = (const Address *)&(((AddrUnsPair *)a)
276       ->first);
277     const Address *bb = (const Address *)&(((AddrUnsPair *)b)
278       ->first);
279     unsigned depth = aa->depth;
280     unsigned i;
281     KMP_DEBUG_ASSERT(depth == bb->depth);
282     for (i  = 0; i < depth; i++) {
283         if (aa->labels[i] < bb->labels[i]) return -1;
284         if (aa->labels[i] > bb->labels[i]) return 1;
285     }
286     return 0;
287 }
288 
289 
//
// qsort() comparator used for "compact" affinity: compares the innermost
// __kmp_affinity_compact levels first (in innermost-to-outer order), and
// only then the remaining outer levels.  Comparison uses the ordinal
// child numbers, not raw labels — see __kmp_affinity_assign_child_nums.
//
static int
__kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)
      ->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)
      ->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
    KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
    // The innermost __kmp_affinity_compact levels are most significant.
    for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
        int j = depth - i - 1;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    // Then compare the remaining outer levels, outermost first.
    for (; i < depth; i++) {
        int j = i - __kmp_affinity_compact;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    return 0;
}
314 
315 /** A structure for holding machine-specific hierarchy info to be computed once at init.
316     This structure represents a mapping of threads to the actual machine hierarchy, or to
317     our best guess at what the hierarchy might be, for the purpose of performing an
318     efficient barrier.  In the worst case, when there is no machine hierarchy information,
319     it produces a tree suitable for a barrier, similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
    /** Number of levels in the hierarchy. Typical levels are threads/core, cores/package
    or socket, packages/node, nodes/machine, etc.  We don't want to get specific with
    nomenclature.  When the machine is oversubscribed we add levels to duplicate the
    hierarchy, doubling the thread capacity of the hierarchy each time we add a level. */
    kmp_uint32 maxLevels;

    /** This is specifically the depth of the machine configuration hierarchy, in terms of the
        number of levels along the longest path from root to any leaf. It corresponds to the
        number of entries in numPerLevel if we exclude all but one trailing 1. */
    kmp_uint32 depth;
    kmp_uint32 base_num_threads;    // number of threads the hierarchy was last sized for
    volatile kmp_int8 uninitialized; // 0=initialized, 1=uninitialized, 2=initialization in progress
    volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

    /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a
        node at level i has. For example, if we have a machine with 4 packages, 4 cores/package
        and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}. All empty levels are set to 1. */
    kmp_uint32 *numPerLevel;
    kmp_uint32 *skipPerLevel;   // cumulative products of numPerLevel; aliases into the same allocation

    // Derive numPerLevel from the sorted address table: for each hierarchy
    // level (innermost first), record max child number seen, plus one.
    void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
        int hier_depth = adr2os[0].first.depth;
        int level = 0;
        for (int i=hier_depth-1; i>=0; --i) {
            int max = -1;
            for (int j=0; j<num_addrs; ++j) {
                int next = adr2os[j].first.childNums[i];
                if (next > max) max = next;
            }
            numPerLevel[level] = max+1;
            ++level;
        }
    }

    hierarchy_info() : maxLevels(7), depth(1), uninitialized(1), resizing(0) {}

    // TO FIX: This destructor causes a segfault in the library at shutdown.
    //~hierarchy_info() { if (!uninitialized && numPerLevel) __kmp_free(numPerLevel); }

    // One-time initialization, guarded by a CAS on 'uninitialized' so
    // exactly one thread builds the hierarchy; others spin until done.
    // adr2os may be NULL, in which case a default 4-ary guess is used.
    void init(AddrUnsPair *adr2os, int num_addrs)
    {
        kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&uninitialized, 1, 2);
        if (bool_result == 0) { // Wait for initialization
            while (TCR_1(uninitialized) != 0) KMP_CPU_PAUSE();
            return;
        }
        KMP_DEBUG_ASSERT(bool_result==1);

        /* Added explicit initialization of the data fields here to prevent usage of dirty value
           observed when static library is re-initialized multiple times (e.g. when
           non-OpenMP thread repeatedly launches/joins thread that uses OpenMP). */
        depth = 1;
        resizing = 0;
        maxLevels = 7;
        // Single allocation holds both arrays; skipPerLevel is the back half.
        numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
        skipPerLevel = &(numPerLevel[maxLevels]);
        for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
            numPerLevel[i] = 1;
            skipPerLevel[i] = 1;
        }

        // Sort table by physical ID
        if (adr2os) {
            qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
            deriveLevels(adr2os, num_addrs);
        }
        else {
            // No topology info: guess a 2-level tree, 4 leaves per parent.
            numPerLevel[0] = 4;
            numPerLevel[1] = num_addrs/4;
            if (num_addrs%4) numPerLevel[1]++;
        }

        base_num_threads = num_addrs;
        for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
            if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
                depth++;

        kmp_uint32 branch = 4;
        if (numPerLevel[0] == 1) branch = num_addrs/4;
        if (branch<4) branch=4;
        for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
            while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
                if (numPerLevel[d] & 1) numPerLevel[d]++;
                numPerLevel[d] = numPerLevel[d] >> 1;
                if (numPerLevel[d+1] == 1) depth++;
                numPerLevel[d+1] = numPerLevel[d+1] << 1;
            }
            if(numPerLevel[0] == 1) {
                branch = branch >> 1;
                if (branch<4) branch = 4;
            }
        }

        // skipPerLevel[i] = subtree size below level i (running product).
        for (kmp_uint32 i=1; i<depth; ++i)
            skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];
        // Fill in hierarchy in the case of oversubscription
        for (kmp_uint32 i=depth; i<maxLevels; ++i)
            skipPerLevel[i] = 2*skipPerLevel[i-1];

        uninitialized = 0; // One writer

    }

    // Grow maxLevels (doubling capacity per added level) so the hierarchy
    // can cover nproc threads.  Guarded by a CAS on 'resizing'; losers
    // spin until the winner finishes.
    void resize(kmp_uint32 nproc)
    {
        kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
        if (bool_result == 0) { // Someone else is resizing
            while (TCR_1(resizing) != 0) KMP_CPU_PAUSE();
            return;
        }
        KMP_DEBUG_ASSERT(bool_result!=0);
        KMP_DEBUG_ASSERT(nproc > base_num_threads);

        // Calculate new max_levels
        kmp_uint32 old_sz = skipPerLevel[depth-1];
        kmp_uint32 incs = 0, old_maxLevels= maxLevels;
        while (nproc > old_sz) {
            old_sz *=2;
            incs++;
        }
        maxLevels += incs;

        // Resize arrays
        kmp_uint32 *old_numPerLevel = numPerLevel;
        kmp_uint32 *old_skipPerLevel = skipPerLevel;
        numPerLevel = skipPerLevel = NULL;
        numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
        skipPerLevel = &(numPerLevel[maxLevels]);

        // Copy old elements from old arrays
        for (kmp_uint32 i=0; i<old_maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
            numPerLevel[i] = old_numPerLevel[i];
            skipPerLevel[i] = old_skipPerLevel[i];
        }

        // Init new elements in arrays to 1
        for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
            numPerLevel[i] = 1;
            skipPerLevel[i] = 1;
        }

        // Free old arrays (both lived in the one old_numPerLevel allocation)
        __kmp_free(old_numPerLevel);

        // Fill in oversubscription levels of hierarchy
        for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i)
            skipPerLevel[i] = 2*skipPerLevel[i-1];

        base_num_threads = nproc;
        resizing = 0; // One writer

    }
};
475 
476 static hierarchy_info machine_hierarchy;
477 
// Fill in the barrier-state fields of thr_bar (depth, base_leaf_kids,
// skip_per_level) from the shared machine_hierarchy, initializing or
// resizing it as needed to cover nproc threads.
void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    kmp_uint32 depth;
    // The test below is true if affinity is available, but set to "none". Need to init on first use of hierarchical barrier.
    if (TCR_1(machine_hierarchy.uninitialized))
        machine_hierarchy.init(NULL, nproc);
    // Adjust the hierarchy in case num threads exceeds original
    if (nproc > machine_hierarchy.base_num_threads)
        machine_hierarchy.resize(nproc);

    depth = machine_hierarchy.depth;
    KMP_DEBUG_ASSERT(depth > 0);
    // The loop below adjusts the depth in the case of a resize
    while (nproc > machine_hierarchy.skipPerLevel[depth-1])
        depth++;

    thr_bar->depth = depth;
    thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
    thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}
497 
498 //
499 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
500 // called to renumber the labels from [0..n] and place them into the child_num
501 // vector of the address object.  This is done in case the labels used for
502 // the children at one node of the hierarchy differ from those used for
503 // another node at the same level.  Example:  suppose the machine has 2 nodes
504 // with 2 packages each.  The first node contains packages 601 and 602, and
505 // second node contains packages 603 and 604.  If we try to sort the table
506 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
507 // because we are paying attention to the labels themselves, not the ordinal
508 // child numbers.  By using the child numbers in the sort, the result is
509 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
510 //
511 static void
512 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
513   int numAddrs)
514 {
515     KMP_DEBUG_ASSERT(numAddrs > 0);
516     int depth = address2os->first.depth;
517     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
518     unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
519       * sizeof(unsigned));
520     int labCt;
521     for (labCt = 0; labCt < depth; labCt++) {
522         address2os[0].first.childNums[labCt] = counts[labCt] = 0;
523         lastLabel[labCt] = address2os[0].first.labels[labCt];
524     }
525     int i;
526     for (i = 1; i < numAddrs; i++) {
527         for (labCt = 0; labCt < depth; labCt++) {
528             if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
529                 int labCt2;
530                 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
531                     counts[labCt2] = 0;
532                     lastLabel[labCt2] = address2os[i].first.labels[labCt2];
533                 }
534                 counts[labCt]++;
535                 lastLabel[labCt] = address2os[i].first.labels[labCt];
536                 break;
537             }
538         }
539         for (labCt = 0; labCt < depth; labCt++) {
540             address2os[i].first.childNums[labCt] = counts[labCt];
541         }
542         for (; labCt < (int)Address::maxDepth; labCt++) {
543             address2os[i].first.childNums[labCt] = 0;
544         }
545     }
546 }
547 
548 
549 //
550 // All of the __kmp_affinity_create_*_map() routines should set
551 // __kmp_affinity_masks to a vector of affinity mask objects of length
552 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
553 // return the number of levels in the machine topology tree (zero if
554 // __kmp_affinity_type == affinity_none).
555 //
556 // All of the __kmp_affinity_create_*_map() routines should set *fullMask
557 // to the affinity mask for the initialization thread.  They need to save and
558 // restore the mask, and it could be needed later, so saving it is just an
559 // optimization to avoid calling kmp_get_system_affinity() again.
560 //
// Affinity mask of the initialization thread, saved by the
// __kmp_affinity_create_*_map() routines (see comment above).
static kmp_affin_mask_t *fullMask = NULL;

// Accessor for the saved initialization-thread mask.
kmp_affin_mask_t *
__kmp_affinity_get_fullMask() { return fullMask; }
565 
566 
// Counts describing the modeled machine topology, filled in by the
// __kmp_affinity_create_*_map() routines.
static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;    // defined elsewhere when KMP_DFLT_NTH_CORES is set
#endif
572 
573 //
574 // __kmp_affinity_uniform_topology() doesn't work when called from
575 // places which support arbitrarily many levels in the machine topology
576 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
577 // __kmp_affinity_create_x2apicid_map().
578 //
579 inline static bool
580 __kmp_affinity_uniform_topology()
581 {
582     return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
583 }
584 
585 
586 //
587 // Print out the detailed machine topology map, i.e. the physical locations
588 // of each OS proc.
589 //
// Emit one KMP_INFORM line per OS proc describing its physical location.
// Each level's label is tagged Thread/Core/Package when it matches one of
// the supplied level indices, Node_<n> for levels above the package, and
// L<n> otherwise.
static void
__kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
  int pkgLevel, int coreLevel, int threadLevel)
{
    int proc;

    KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
    for (proc = 0; proc < len; proc++) {
        int level;
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        for (level = 0; level < depth; level++) {
            if (level == threadLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
            }
            else if (level == coreLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
            }
            else if (level == pkgLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
            }
            else if (level > pkgLevel) {
                __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                  level - pkgLevel - 1);
            }
            else {
                __kmp_str_buf_print(&buf, "L%d ", level);
            }
            __kmp_str_buf_print(&buf, "%d ",
              address2os[proc].first.labels[level]);
        }
        KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
          buf.str);
        __kmp_str_buf_free(&buf);
    }
}
626 
627 
628 //
629 // If we don't know how to retrieve the machine's processor topology, or
630 // encounter an error in doing so, this routine is called to form a "flat"
631 // mapping of os thread id's <-> processor id's.
632 //
// Build a flat (single-level) topology map: one Address per proc in
// fullMask, labeled with its OS proc id.  Returns the topology depth
// (1), or 0 when affinity is off/unavailable (only the global counts
// are set in that case).  On success *address2os receives the table.
static int
__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // called to set __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(__kmp_affinity_type == affinity_none);
        // Model each proc as its own package: 1 core/pkg, 1 thread/core.
        __kmp_ncores = nPackages = __kmp_xproc;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    //  correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nPackages = __kmp_avail_proc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);

        KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        KMP_INFORM(Uniform, "KMP_AFFINITY");
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }
    if (__kmp_affinity_type == affinity_none) {
        return 0;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    unsigned int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        // Depth-1 address: the single label is the OS proc id itself.
        Address addr(1);
        addr.labels[0] = i;
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
    }
    if (__kmp_affinity_verbose) {
        KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Only the package level is modeled in the machine topology map,
        // so the #levels of granularity is either 0 or 1.
        //
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels = 1;
        }
        else {
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 1;
}
723 
724 
725 # if KMP_GROUP_AFFINITY
726 
727 //
728 // If multiple Windows* OS processor groups exist, we can create a 2-level
729 // topology map with the groups at level 0 and the individual procs at
730 // level 1.
731 //
732 // This facilitates letting the threads float among all procs in a group,
733 // if granularity=group (the default when there are multiple groups).
734 //
// Build a 2-level topology map from Windows* OS processor groups: group
// index at level 0, group-relative proc at level 1.  Returns 2 on
// success, or -1 when there is only one group (the flat map is used
// instead).
static int
__kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // If we don't have multiple processor groups, return now.
    // The flat mapping will be used.
    //
    if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
        // FIXME set *msg_id
        return -1;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        // Split the proc id into (group, proc-within-group).
        Address addr(2);
        addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
        addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
              addr.labels[1]);
        }
    }

    if (__kmp_affinity_gran_levels < 0) {
        if (__kmp_affinity_gran == affinity_gran_group) {
            __kmp_affinity_gran_levels = 1;
        }
        else if ((__kmp_affinity_gran == affinity_gran_fine)
          || (__kmp_affinity_gran == affinity_gran_thread)) {
            __kmp_affinity_gran_levels = 0;
        }
        else {
            // NOTE(review): gran_str is computed but never used below; the
            // warning described in the comment is apparently no longer (or
            // not yet) emitted -- presumably a KMP_WARNING call was dropped.
            const char *gran_str = NULL;
            if (__kmp_affinity_gran == affinity_gran_core) {
                gran_str = "core";
            }
            else if (__kmp_affinity_gran == affinity_gran_package) {
                gran_str = "package";
            }
            else if (__kmp_affinity_gran == affinity_gran_node) {
                gran_str = "node";
            }
            else {
                KMP_ASSERT(0);
            }

            // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 2;
}
806 
807 # endif /* KMP_GROUP_AFFINITY */
808 
809 
810 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
811 
//
// Number of bits needed to distinguish 'count' values, i.e.
// ceil(log2(count)); returns 0 for count <= 1.
//
static int
__kmp_cpuid_mask_width(int count) {
    int width = 0;
    while ((1 << width) < count) {
        width++;
    }
    return width;
}
820 
821 
// Per-OS-thread record gathered while probing x86 topology with cpuid:
// raw values read after binding to the thread, plus the ids decoded
// from the APIC id.
class apicThreadInfo {
public:
    unsigned osId;              // param to __kmp_affinity_bind_thread
    unsigned apicId;            // from cpuid after binding
    unsigned maxCoresPerPkg;    //      ""
    unsigned maxThreadsPerPkg;  //      ""
    unsigned pkgId;             // inferred from above values
    unsigned coreId;            //      ""
    unsigned threadId;          //      ""
};
832 
833 
834 static int
835 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
836 {
837     const apicThreadInfo *aa = (const apicThreadInfo *)a;
838     const apicThreadInfo *bb = (const apicThreadInfo *)b;
839     if (aa->osId < bb->osId) return -1;
840     if (aa->osId > bb->osId) return 1;
841     return 0;
842 }
843 
844 
845 static int
846 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
847 {
848     const apicThreadInfo *aa = (const apicThreadInfo *)a;
849     const apicThreadInfo *bb = (const apicThreadInfo *)b;
850     if (aa->pkgId < bb->pkgId) return -1;
851     if (aa->pkgId > bb->pkgId) return 1;
852     if (aa->coreId < bb->coreId) return -1;
853     if (aa->coreId > bb->coreId) return 1;
854     if (aa->threadId < bb->threadId) return -1;
855     if (aa->threadId > bb->threadId) return 1;
856     return 0;
857 }
858 
859 
860 //
861 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
862 // an algorithm which cycles through the available os threads, setting
863 // the current thread's affinity mask to that thread, and then retrieves
864 // the Apic Id for each thread context using the cpuid instruction.
865 //
866 static int
867 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
868   kmp_i18n_id_t *const msg_id)
869 {
870     kmp_cpuid buf;
871     int rc;
872     *address2os = NULL;
873     *msg_id = kmp_i18n_null;
874 
875     //
876     // Check if cpuid leaf 4 is supported.
877     //
878         __kmp_x86_cpuid(0, 0, &buf);
879         if (buf.eax < 4) {
880             *msg_id = kmp_i18n_str_NoLeaf4Support;
881             return -1;
882         }
883 
884     //
885     // The algorithm used starts by setting the affinity to each available
886     // thread and retrieving info from the cpuid instruction, so if we are
887     // not capable of calling __kmp_get_system_affinity() and
888     // _kmp_get_system_affinity(), then we need to do something else - use
889     // the defaults that we calculated from issuing cpuid without binding
890     // to each proc.
891     //
892     if (! KMP_AFFINITY_CAPABLE()) {
893         //
894         // Hack to try and infer the machine topology using only the data
895         // available from cpuid on the current thread, and __kmp_xproc.
896         //
897         KMP_ASSERT(__kmp_affinity_type == affinity_none);
898 
899         //
900         // Get an upper bound on the number of threads per package using
901         // cpuid(1).
902         //
903         // On some OS/chps combinations where HT is supported by the chip
904         // but is disabled, this value will be 2 on a single core chip.
905         // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
906         //
907         __kmp_x86_cpuid(1, 0, &buf);
908         int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
909         if (maxThreadsPerPkg == 0) {
910             maxThreadsPerPkg = 1;
911         }
912 
913         //
914         // The num cores per pkg comes from cpuid(4).
915         // 1 must be added to the encoded value.
916         //
917         // The author of cpu_count.cpp treated this only an upper bound
918         // on the number of cores, but I haven't seen any cases where it
919         // was greater than the actual number of cores, so we will treat
920         // it as exact in this block of code.
921         //
922         // First, we need to check if cpuid(4) is supported on this chip.
923         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
924         // has the value n or greater.
925         //
926         __kmp_x86_cpuid(0, 0, &buf);
927         if (buf.eax >= 4) {
928             __kmp_x86_cpuid(4, 0, &buf);
929             nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
930         }
931         else {
932             nCoresPerPkg = 1;
933         }
934 
935         //
936         // There is no way to reliably tell if HT is enabled without issuing
937         // the cpuid instruction from every thread, can correlating the cpuid
938         // info, so if the machine is not affinity capable, we assume that HT
939         // is off.  We have seen quite a few machines where maxThreadsPerPkg
940         // is 2, yet the machine does not support HT.
941         //
942         // - Older OSes are usually found on machines with older chips, which
943         //   do not support HT.
944         //
945         // - The performance penalty for mistakenly identifying a machine as
946         //   HT when it isn't (which results in blocktime being incorrecly set
947         //   to 0) is greater than the penalty when for mistakenly identifying
948         //   a machine as being 1 thread/core when it is really HT enabled
949         //   (which results in blocktime being incorrectly set to a positive
950         //   value).
951         //
952         __kmp_ncores = __kmp_xproc;
953         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
954         __kmp_nThreadsPerCore = 1;
955         if (__kmp_affinity_verbose) {
956             KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
957             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
958             if (__kmp_affinity_uniform_topology()) {
959                 KMP_INFORM(Uniform, "KMP_AFFINITY");
960             } else {
961                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
962             }
963             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
964               __kmp_nThreadsPerCore, __kmp_ncores);
965         }
966         return 0;
967     }
968 
969     //
970     //
971     // From here on, we can assume that it is safe to call
972     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
973     // even if __kmp_affinity_type = affinity_none.
974     //
975 
976     //
977     // Save the affinity mask for the current thread.
978     //
979     kmp_affin_mask_t *oldMask;
980     KMP_CPU_ALLOC(oldMask);
981     KMP_ASSERT(oldMask != NULL);
982     __kmp_get_system_affinity(oldMask, TRUE);
983 
984     //
985     // Run through each of the available contexts, binding the current thread
986     // to it, and obtaining the pertinent information using the cpuid instr.
987     //
988     // The relevant information is:
989     //
990     // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
991     //    has a uniqie Apic Id, which is of the form pkg# : core# : thread#.
992     //
993     // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
994     //    value of this field determines the width of the core# + thread#
995     //    fields in the Apic Id.  It is also an upper bound on the number
996     //    of threads per package, but it has been verified that situations
997     //    happen were it is not exact.  In particular, on certain OS/chip
998     //    combinations where Intel(R) Hyper-Threading Technology is supported
999     //    by the chip but has
1000     //    been disabled, the value of this field will be 2 (for a single core
1001     //    chip).  On other OS/chip combinations supporting
1002     //    Intel(R) Hyper-Threading Technology, the value of
1003     //    this field will be 1 when Intel(R) Hyper-Threading Technology is
1004     //    disabled and 2 when it is enabled.
1005     //
1006     // Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4).  The
1007     //    value of this field (+1) determines the width of the core# field in
1008     //    the Apic Id.  The comments in "cpucount.cpp" say that this value is
1009     //    an upper bound, but the IA-32 architecture manual says that it is
1010     //    exactly the number of cores per package, and I haven't seen any
1011     //    case where it wasn't.
1012     //
1013     // From this information, deduce the package Id, core Id, and thread Id,
1014     // and set the corresponding fields in the apicThreadInfo struct.
1015     //
1016     unsigned i;
1017     apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
1018       __kmp_avail_proc * sizeof(apicThreadInfo));
1019     unsigned nApics = 0;
1020     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
1021         //
1022         // Skip this proc if it is not included in the machine model.
1023         //
1024         if (! KMP_CPU_ISSET(i, fullMask)) {
1025             continue;
1026         }
1027         KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
1028 
1029         __kmp_affinity_bind_thread(i);
1030         threadInfo[nApics].osId = i;
1031 
1032         //
1033         // The apic id and max threads per pkg come from cpuid(1).
1034         //
1035         __kmp_x86_cpuid(1, 0, &buf);
1036         if (! (buf.edx >> 9) & 1) {
1037             __kmp_set_system_affinity(oldMask, TRUE);
1038             __kmp_free(threadInfo);
1039             KMP_CPU_FREE(oldMask);
1040             *msg_id = kmp_i18n_str_ApicNotPresent;
1041             return -1;
1042         }
1043         threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
1044         threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
1045         if (threadInfo[nApics].maxThreadsPerPkg == 0) {
1046             threadInfo[nApics].maxThreadsPerPkg = 1;
1047         }
1048 
1049         //
1050         // Max cores per pkg comes from cpuid(4).
1051         // 1 must be added to the encoded value.
1052         //
1053         // First, we need to check if cpuid(4) is supported on this chip.
1054         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
1055         // has the value n or greater.
1056         //
1057         __kmp_x86_cpuid(0, 0, &buf);
1058         if (buf.eax >= 4) {
1059             __kmp_x86_cpuid(4, 0, &buf);
1060             threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1061         }
1062         else {
1063             threadInfo[nApics].maxCoresPerPkg = 1;
1064         }
1065 
1066         //
1067         // Infer the pkgId / coreId / threadId using only the info
1068         // obtained locally.
1069         //
1070         int widthCT = __kmp_cpuid_mask_width(
1071           threadInfo[nApics].maxThreadsPerPkg);
1072         threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
1073 
1074         int widthC = __kmp_cpuid_mask_width(
1075           threadInfo[nApics].maxCoresPerPkg);
1076         int widthT = widthCT - widthC;
1077         if (widthT < 0) {
1078             //
1079             // I've never seen this one happen, but I suppose it could, if
1080             // the cpuid instruction on a chip was really screwed up.
1081             // Make sure to restore the affinity mask before the tail call.
1082             //
1083             __kmp_set_system_affinity(oldMask, TRUE);
1084             __kmp_free(threadInfo);
1085             KMP_CPU_FREE(oldMask);
1086             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1087             return -1;
1088         }
1089 
1090         int maskC = (1 << widthC) - 1;
1091         threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
1092           &maskC;
1093 
1094         int maskT = (1 << widthT) - 1;
1095         threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT;
1096 
1097         nApics++;
1098     }
1099 
1100     //
1101     // We've collected all the info we need.
1102     // Restore the old affinity mask for this thread.
1103     //
1104     __kmp_set_system_affinity(oldMask, TRUE);
1105 
1106     //
1107     // If there's only one thread context to bind to, form an Address object
1108     // with depth 1 and return immediately (or, if affinity is off, set
1109     // address2os to NULL and return).
1110     //
1111     // If it is configured to omit the package level when there is only a
1112     // single package, the logic at the end of this routine won't work if
1113     // there is only a single thread - it would try to form an Address
1114     // object with depth 0.
1115     //
1116     KMP_ASSERT(nApics > 0);
1117     if (nApics == 1) {
1118         __kmp_ncores = nPackages = 1;
1119         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1120         if (__kmp_affinity_verbose) {
1121             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1122             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1123 
1124             KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1125             if (__kmp_affinity_respect_mask) {
1126                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1127             } else {
1128                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1129             }
1130             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1131             KMP_INFORM(Uniform, "KMP_AFFINITY");
1132             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1133               __kmp_nThreadsPerCore, __kmp_ncores);
1134         }
1135 
1136         if (__kmp_affinity_type == affinity_none) {
1137             __kmp_free(threadInfo);
1138             KMP_CPU_FREE(oldMask);
1139             return 0;
1140         }
1141 
1142         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
1143         Address addr(1);
1144         addr.labels[0] = threadInfo[0].pkgId;
1145         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
1146 
1147         if (__kmp_affinity_gran_levels < 0) {
1148             __kmp_affinity_gran_levels = 0;
1149         }
1150 
1151         if (__kmp_affinity_verbose) {
1152             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1153         }
1154 
1155         __kmp_free(threadInfo);
1156         KMP_CPU_FREE(oldMask);
1157         return 1;
1158     }
1159 
1160     //
1161     // Sort the threadInfo table by physical Id.
1162     //
1163     qsort(threadInfo, nApics, sizeof(*threadInfo),
1164       __kmp_affinity_cmp_apicThreadInfo_phys_id);
1165 
1166     //
1167     // The table is now sorted by pkgId / coreId / threadId, but we really
1168     // don't know the radix of any of the fields.  pkgId's may be sparsely
1169     // assigned among the chips on a system.  Although coreId's are usually
1170     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1171     // [0..threadsPerCore-1], we don't want to make any such assumptions.
1172     //
1173     // For that matter, we don't know what coresPerPkg and threadsPerCore
1174     // (or the total # packages) are at this point - we want to determine
1175     // that now.  We only have an upper bound on the first two figures.
1176     //
1177     // We also perform a consistency check at this point: the values returned
1178     // by the cpuid instruction for any thread bound to a given package had
1179     // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1180     //
1181     nPackages = 1;
1182     nCoresPerPkg = 1;
1183     __kmp_nThreadsPerCore = 1;
1184     unsigned nCores = 1;
1185 
1186     unsigned pkgCt = 1;                         // to determine radii
1187     unsigned lastPkgId = threadInfo[0].pkgId;
1188     unsigned coreCt = 1;
1189     unsigned lastCoreId = threadInfo[0].coreId;
1190     unsigned threadCt = 1;
1191     unsigned lastThreadId = threadInfo[0].threadId;
1192 
1193                                                 // intra-pkg consist checks
1194     unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1195     unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1196 
1197     for (i = 1; i < nApics; i++) {
1198         if (threadInfo[i].pkgId != lastPkgId) {
1199             nCores++;
1200             pkgCt++;
1201             lastPkgId = threadInfo[i].pkgId;
1202             if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1203             coreCt = 1;
1204             lastCoreId = threadInfo[i].coreId;
1205             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1206             threadCt = 1;
1207             lastThreadId = threadInfo[i].threadId;
1208 
1209             //
1210             // This is a different package, so go on to the next iteration
1211             // without doing any consistency checks.  Reset the consistency
1212             // check vars, though.
1213             //
1214             prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1215             prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1216             continue;
1217         }
1218 
1219         if (threadInfo[i].coreId != lastCoreId) {
1220             nCores++;
1221             coreCt++;
1222             lastCoreId = threadInfo[i].coreId;
1223             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1224             threadCt = 1;
1225             lastThreadId = threadInfo[i].threadId;
1226         }
1227         else if (threadInfo[i].threadId != lastThreadId) {
1228             threadCt++;
1229             lastThreadId = threadInfo[i].threadId;
1230         }
1231         else {
1232             __kmp_free(threadInfo);
1233             KMP_CPU_FREE(oldMask);
1234             *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1235             return -1;
1236         }
1237 
1238         //
1239         // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
1240         // fields agree between all the threads bounds to a given package.
1241         //
1242         if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
1243           || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1244             __kmp_free(threadInfo);
1245             KMP_CPU_FREE(oldMask);
1246             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1247             return -1;
1248         }
1249     }
1250     nPackages = pkgCt;
1251     if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1252     if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1253 
1254     //
1255     // When affinity is off, this routine will still be called to set
1256     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1257     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1258     // correctly, and return now if affinity is not enabled.
1259     //
1260     __kmp_ncores = nCores;
1261     if (__kmp_affinity_verbose) {
1262         char buf[KMP_AFFIN_MASK_PRINT_LEN];
1263         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1264 
1265         KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1266         if (__kmp_affinity_respect_mask) {
1267             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1268         } else {
1269             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1270         }
1271         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1272         if (__kmp_affinity_uniform_topology()) {
1273             KMP_INFORM(Uniform, "KMP_AFFINITY");
1274         } else {
1275             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1276         }
1277         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1278           __kmp_nThreadsPerCore, __kmp_ncores);
1279 
1280     }
1281 
1282     if (__kmp_affinity_type == affinity_none) {
1283         __kmp_free(threadInfo);
1284         KMP_CPU_FREE(oldMask);
1285         return 0;
1286     }
1287 
1288     //
1289     // Now that we've determined the number of packages, the number of cores
1290     // per package, and the number of threads per core, we can construct the
1291     // data structure that is to be returned.
1292     //
1293     int pkgLevel = 0;
1294     int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1295     int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1296     unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
1297 
1298     KMP_ASSERT(depth > 0);
1299     *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1300 
1301     for (i = 0; i < nApics; ++i) {
1302         Address addr(depth);
1303         unsigned os = threadInfo[i].osId;
1304         int d = 0;
1305 
1306         if (pkgLevel >= 0) {
1307             addr.labels[d++] = threadInfo[i].pkgId;
1308         }
1309         if (coreLevel >= 0) {
1310             addr.labels[d++] = threadInfo[i].coreId;
1311         }
1312         if (threadLevel >= 0) {
1313             addr.labels[d++] = threadInfo[i].threadId;
1314         }
1315         (*address2os)[i] = AddrUnsPair(addr, os);
1316     }
1317 
1318     if (__kmp_affinity_gran_levels < 0) {
1319         //
1320         // Set the granularity level based on what levels are modeled
1321         // in the machine topology map.
1322         //
1323         __kmp_affinity_gran_levels = 0;
1324         if ((threadLevel >= 0)
1325           && (__kmp_affinity_gran > affinity_gran_thread)) {
1326             __kmp_affinity_gran_levels++;
1327         }
1328         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1329             __kmp_affinity_gran_levels++;
1330         }
1331         if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1332             __kmp_affinity_gran_levels++;
1333         }
1334     }
1335 
1336     if (__kmp_affinity_verbose) {
1337         __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1338           coreLevel, threadLevel);
1339     }
1340 
1341     __kmp_free(threadInfo);
1342     KMP_CPU_FREE(oldMask);
1343     return depth;
1344 }
1345 
1346 
1347 //
1348 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1349 // architectures support a newer interface for specifying the x2APIC Ids,
1350 // based on cpuid leaf 11.
1351 //
1352 static int
1353 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1354   kmp_i18n_id_t *const msg_id)
1355 {
1356     kmp_cpuid buf;
1357 
1358     *address2os = NULL;
1359     *msg_id = kmp_i18n_null;
1360 
1361     //
1362     // Check to see if cpuid leaf 11 is supported.
1363     //
1364     __kmp_x86_cpuid(0, 0, &buf);
1365     if (buf.eax < 11) {
1366         *msg_id = kmp_i18n_str_NoLeaf11Support;
1367         return -1;
1368     }
1369     __kmp_x86_cpuid(11, 0, &buf);
1370     if (buf.ebx == 0) {
1371         *msg_id = kmp_i18n_str_NoLeaf11Support;
1372         return -1;
1373     }
1374 
1375     //
1376     // Find the number of levels in the machine topology.  While we're at it,
1377     // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.  We will
1378     // try to get more accurate values later by explicitly counting them,
1379     // but get reasonable defaults now, in case we return early.
1380     //
1381     int level;
1382     int threadLevel = -1;
1383     int coreLevel = -1;
1384     int pkgLevel = -1;
1385     __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1386 
1387     for (level = 0;; level++) {
1388         if (level > 31) {
1389             //
1390             // FIXME: Hack for DPD200163180
1391             //
1392             // If level is big then something went wrong -> exiting
1393             //
1394             // There could actually be 32 valid levels in the machine topology,
1395             // but so far, the only machine we have seen which does not exit
1396             // this loop before iteration 32 has fubar x2APIC settings.
1397             //
1398             // For now, just reject this case based upon loop trip count.
1399             //
1400             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1401             return -1;
1402         }
1403         __kmp_x86_cpuid(11, level, &buf);
1404         if (buf.ebx == 0) {
1405             if (pkgLevel < 0) {
1406                 //
1407                 // Will infer nPackages from __kmp_xproc
1408                 //
1409                 pkgLevel = level;
1410                 level++;
1411             }
1412             break;
1413         }
1414         int kind = (buf.ecx >> 8) & 0xff;
1415         if (kind == 1) {
1416             //
1417             // SMT level
1418             //
1419             threadLevel = level;
1420             coreLevel = -1;
1421             pkgLevel = -1;
1422             __kmp_nThreadsPerCore = buf.ebx & 0xff;
1423             if (__kmp_nThreadsPerCore == 0) {
1424                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1425                 return -1;
1426             }
1427         }
1428         else if (kind == 2) {
1429             //
1430             // core level
1431             //
1432             coreLevel = level;
1433             pkgLevel = -1;
1434             nCoresPerPkg = buf.ebx & 0xff;
1435             if (nCoresPerPkg == 0) {
1436                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1437                 return -1;
1438             }
1439         }
1440         else {
1441             if (level <= 0) {
1442                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1443                 return -1;
1444             }
1445             if (pkgLevel >= 0) {
1446                 continue;
1447             }
1448             pkgLevel = level;
1449             nPackages = buf.ebx & 0xff;
1450             if (nPackages == 0) {
1451                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1452                 return -1;
1453             }
1454         }
1455     }
1456     int depth = level;
1457 
1458     //
1459     // In the above loop, "level" was counted from the finest level (usually
1460     // thread) to the coarsest.  The caller expects that we will place the
1461     // labels in (*address2os)[].first.labels[] in the inverse order, so
1462     // we need to invert the vars saying which level means what.
1463     //
1464     if (threadLevel >= 0) {
1465         threadLevel = depth - threadLevel - 1;
1466     }
1467     if (coreLevel >= 0) {
1468         coreLevel = depth - coreLevel - 1;
1469     }
1470     KMP_DEBUG_ASSERT(pkgLevel >= 0);
1471     pkgLevel = depth - pkgLevel - 1;
1472 
1473     //
1474     // The algorithm used starts by setting the affinity to each available
1475     // thread and retrieving info from the cpuid instruction, so if we are
1476     // not capable of calling __kmp_get_system_affinity() and
1477     // _kmp_get_system_affinity(), then we need to do something else - use
1478     // the defaults that we calculated from issuing cpuid without binding
1479     // to each proc.
1480     //
1481     if (! KMP_AFFINITY_CAPABLE())
1482     {
1483         //
1484         // Hack to try and infer the machine topology using only the data
1485         // available from cpuid on the current thread, and __kmp_xproc.
1486         //
1487         KMP_ASSERT(__kmp_affinity_type == affinity_none);
1488 
1489         __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1490         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1491         if (__kmp_affinity_verbose) {
1492             KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1493             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1494             if (__kmp_affinity_uniform_topology()) {
1495                 KMP_INFORM(Uniform, "KMP_AFFINITY");
1496             } else {
1497                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1498             }
1499             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1500               __kmp_nThreadsPerCore, __kmp_ncores);
1501         }
1502         return 0;
1503     }
1504 
1505     //
1506     //
1507     // From here on, we can assume that it is safe to call
1508     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1509     // even if __kmp_affinity_type = affinity_none.
1510     //
1511 
1512     //
1513     // Save the affinity mask for the current thread.
1514     //
1515     kmp_affin_mask_t *oldMask;
1516     KMP_CPU_ALLOC(oldMask);
1517     __kmp_get_system_affinity(oldMask, TRUE);
1518 
1519     //
1520     // Allocate the data structure to be returned.
1521     //
1522     AddrUnsPair *retval = (AddrUnsPair *)
1523       __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1524 
1525     //
1526     // Run through each of the available contexts, binding the current thread
1527     // to it, and obtaining the pertinent information using the cpuid instr.
1528     //
1529     unsigned int proc;
1530     int nApics = 0;
1531     for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
1532         //
1533         // Skip this proc if it is not included in the machine model.
1534         //
1535         if (! KMP_CPU_ISSET(proc, fullMask)) {
1536             continue;
1537         }
1538         KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1539 
1540         __kmp_affinity_bind_thread(proc);
1541 
1542         //
1543         // Extrach the labels for each level in the machine topology map
1544         // from the Apic ID.
1545         //
1546         Address addr(depth);
1547         int prev_shift = 0;
1548 
1549         for (level = 0; level < depth; level++) {
1550             __kmp_x86_cpuid(11, level, &buf);
1551             unsigned apicId = buf.edx;
1552             if (buf.ebx == 0) {
1553                 if (level != depth - 1) {
1554                     KMP_CPU_FREE(oldMask);
1555                     *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1556                     return -1;
1557                 }
1558                 addr.labels[depth - level - 1] = apicId >> prev_shift;
1559                 level++;
1560                 break;
1561             }
1562             int shift = buf.eax & 0x1f;
1563             int mask = (1 << shift) - 1;
1564             addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1565             prev_shift = shift;
1566         }
1567         if (level != depth) {
1568             KMP_CPU_FREE(oldMask);
1569             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1570             return -1;
1571         }
1572 
1573         retval[nApics] = AddrUnsPair(addr, proc);
1574         nApics++;
1575     }
1576 
1577     //
1578     // We've collected all the info we need.
1579     // Restore the old affinity mask for this thread.
1580     //
1581     __kmp_set_system_affinity(oldMask, TRUE);
1582 
1583     //
1584     // If there's only one thread context to bind to, return now.
1585     //
1586     KMP_ASSERT(nApics > 0);
1587     if (nApics == 1) {
1588         __kmp_ncores = nPackages = 1;
1589         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1590         if (__kmp_affinity_verbose) {
1591             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1592             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1593 
1594             KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1595             if (__kmp_affinity_respect_mask) {
1596                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1597             } else {
1598                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1599             }
1600             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1601             KMP_INFORM(Uniform, "KMP_AFFINITY");
1602             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1603               __kmp_nThreadsPerCore, __kmp_ncores);
1604         }
1605 
1606         if (__kmp_affinity_type == affinity_none) {
1607             __kmp_free(retval);
1608             KMP_CPU_FREE(oldMask);
1609             return 0;
1610         }
1611 
1612         //
1613         // Form an Address object which only includes the package level.
1614         //
1615         Address addr(1);
1616         addr.labels[0] = retval[0].first.labels[pkgLevel];
1617         retval[0].first = addr;
1618 
1619         if (__kmp_affinity_gran_levels < 0) {
1620             __kmp_affinity_gran_levels = 0;
1621         }
1622 
1623         if (__kmp_affinity_verbose) {
1624             __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1625         }
1626 
1627         *address2os = retval;
1628         KMP_CPU_FREE(oldMask);
1629         return 1;
1630     }
1631 
1632     //
1633     // Sort the table by physical Id.
1634     //
1635     qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1636 
1637     //
1638     // Find the radix at each of the levels.
1639     //
1640     unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1641     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1642     unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1643     unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1644     for (level = 0; level < depth; level++) {
1645         totals[level] = 1;
1646         maxCt[level] = 1;
1647         counts[level] = 1;
1648         last[level] = retval[0].first.labels[level];
1649     }
1650 
1651     //
1652     // From here on, the iteration variable "level" runs from the finest
1653     // level to the coarsest, i.e. we iterate forward through
1654     // (*address2os)[].first.labels[] - in the previous loops, we iterated
1655     // backwards.
1656     //
1657     for (proc = 1; (int)proc < nApics; proc++) {
1658         int level;
1659         for (level = 0; level < depth; level++) {
1660             if (retval[proc].first.labels[level] != last[level]) {
1661                 int j;
1662                 for (j = level + 1; j < depth; j++) {
1663                     totals[j]++;
1664                     counts[j] = 1;
1665                     // The line below causes printing incorrect topology information
1666                     // in case the max value for some level (maxCt[level]) is encountered earlier than
1667                     // some less value while going through the array.
1668                     // For example, let pkg0 has 4 cores and pkg1 has 2 cores. Then maxCt[1] == 2
1669                     // whereas it must be 4.
1670                     // TODO!!! Check if it can be commented safely
1671                     //maxCt[j] = 1;
1672                     last[j] = retval[proc].first.labels[j];
1673                 }
1674                 totals[level]++;
1675                 counts[level]++;
1676                 if (counts[level] > maxCt[level]) {
1677                     maxCt[level] = counts[level];
1678                 }
1679                 last[level] = retval[proc].first.labels[level];
1680                 break;
1681             }
1682             else if (level == depth - 1) {
1683                 __kmp_free(last);
1684                 __kmp_free(maxCt);
1685                 __kmp_free(counts);
1686                 __kmp_free(totals);
1687                 __kmp_free(retval);
1688                 KMP_CPU_FREE(oldMask);
1689                 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1690                 return -1;
1691             }
1692         }
1693     }
1694 
1695     //
1696     // When affinity is off, this routine will still be called to set
1697     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1698     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1699     // correctly, and return if affinity is not enabled.
1700     //
1701     if (threadLevel >= 0) {
1702         __kmp_nThreadsPerCore = maxCt[threadLevel];
1703     }
1704     else {
1705         __kmp_nThreadsPerCore = 1;
1706     }
1707     nPackages = totals[pkgLevel];
1708 
1709     if (coreLevel >= 0) {
1710         __kmp_ncores = totals[coreLevel];
1711         nCoresPerPkg = maxCt[coreLevel];
1712     }
1713     else {
1714         __kmp_ncores = nPackages;
1715         nCoresPerPkg = 1;
1716     }
1717 
1718     //
1719     // Check to see if the machine topology is uniform
1720     //
1721     unsigned prod = maxCt[0];
1722     for (level = 1; level < depth; level++) {
1723        prod *= maxCt[level];
1724     }
1725     bool uniform = (prod == totals[level - 1]);
1726 
1727     //
1728     // Print the machine topology summary.
1729     //
1730     if (__kmp_affinity_verbose) {
1731         char mask[KMP_AFFIN_MASK_PRINT_LEN];
1732         __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1733 
1734         KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1735         if (__kmp_affinity_respect_mask) {
1736             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1737         } else {
1738             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1739         }
1740         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1741         if (uniform) {
1742             KMP_INFORM(Uniform, "KMP_AFFINITY");
1743         } else {
1744             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1745         }
1746 
1747         kmp_str_buf_t buf;
1748         __kmp_str_buf_init(&buf);
1749 
1750         __kmp_str_buf_print(&buf, "%d", totals[0]);
1751         for (level = 1; level <= pkgLevel; level++) {
1752             __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1753         }
1754         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1755           __kmp_nThreadsPerCore, __kmp_ncores);
1756 
1757         __kmp_str_buf_free(&buf);
1758     }
1759 
1760     if (__kmp_affinity_type == affinity_none) {
1761         __kmp_free(last);
1762         __kmp_free(maxCt);
1763         __kmp_free(counts);
1764         __kmp_free(totals);
1765         __kmp_free(retval);
1766         KMP_CPU_FREE(oldMask);
1767         return 0;
1768     }
1769 
1770     //
    // Find any levels with radix 1, and remove them from the map
1772     // (except for the package level).
1773     //
1774     int new_depth = 0;
1775     for (level = 0; level < depth; level++) {
1776         if ((maxCt[level] == 1) && (level != pkgLevel)) {
1777            continue;
1778         }
1779         new_depth++;
1780     }
1781 
1782     //
1783     // If we are removing any levels, allocate a new vector to return,
1784     // and copy the relevant information to it.
1785     //
1786     if (new_depth != depth) {
1787         AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1788           sizeof(AddrUnsPair) * nApics);
1789         for (proc = 0; (int)proc < nApics; proc++) {
1790             Address addr(new_depth);
1791             new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1792         }
1793         int new_level = 0;
1794         int newPkgLevel = -1;
1795         int newCoreLevel = -1;
1796         int newThreadLevel = -1;
1797         int i;
1798         for (level = 0; level < depth; level++) {
1799             if ((maxCt[level] == 1)
1800               && (level != pkgLevel)) {
1801                 //
1802                 // Remove this level. Never remove the package level
1803                 //
1804                 continue;
1805             }
1806             if (level == pkgLevel) {
1807                 newPkgLevel = level;
1808             }
1809             if (level == coreLevel) {
1810                 newCoreLevel = level;
1811             }
1812             if (level == threadLevel) {
1813                 newThreadLevel = level;
1814             }
1815             for (proc = 0; (int)proc < nApics; proc++) {
1816                 new_retval[proc].first.labels[new_level]
1817                   = retval[proc].first.labels[level];
1818             }
1819             new_level++;
1820         }
1821 
1822         __kmp_free(retval);
1823         retval = new_retval;
1824         depth = new_depth;
1825         pkgLevel = newPkgLevel;
1826         coreLevel = newCoreLevel;
1827         threadLevel = newThreadLevel;
1828     }
1829 
1830     if (__kmp_affinity_gran_levels < 0) {
1831         //
1832         // Set the granularity level based on what levels are modeled
1833         // in the machine topology map.
1834         //
1835         __kmp_affinity_gran_levels = 0;
1836         if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1837             __kmp_affinity_gran_levels++;
1838         }
1839         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1840             __kmp_affinity_gran_levels++;
1841         }
1842         if (__kmp_affinity_gran > affinity_gran_package) {
1843             __kmp_affinity_gran_levels++;
1844         }
1845     }
1846 
1847     if (__kmp_affinity_verbose) {
1848         __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1849           coreLevel, threadLevel);
1850     }
1851 
1852     __kmp_free(last);
1853     __kmp_free(maxCt);
1854     __kmp_free(counts);
1855     __kmp_free(totals);
1856     KMP_CPU_FREE(oldMask);
1857     *address2os = retval;
1858     return depth;
1859 }
1860 
1861 
1862 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1863 
1864 
1865 #define osIdIndex       0
1866 #define threadIdIndex   1
1867 #define coreIdIndex     2
1868 #define pkgIdIndex      3
1869 #define nodeIdIndex     4
1870 
1871 typedef unsigned *ProcCpuInfo;
1872 static unsigned maxIndex = pkgIdIndex;
1873 
1874 
1875 static int
1876 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1877 {
1878     const unsigned *aa = (const unsigned *)a;
1879     const unsigned *bb = (const unsigned *)b;
1880     if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1881     if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1882     return 0;
1883 };
1884 
1885 
1886 static int
1887 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1888 {
1889     unsigned i;
1890     const unsigned *aa = *((const unsigned **)a);
1891     const unsigned *bb = *((const unsigned **)b);
1892     for (i = maxIndex; ; i--) {
1893         if (aa[i] < bb[i]) return -1;
1894         if (aa[i] > bb[i]) return 1;
1895         if (i == osIdIndex) break;
1896     }
1897     return 0;
1898 }
1899 
1900 
1901 //
1902 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1903 // affinity map.
1904 //
1905 static int
1906 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1907   kmp_i18n_id_t *const msg_id, FILE *f)
1908 {
1909     *address2os = NULL;
1910     *msg_id = kmp_i18n_null;
1911 
1912     //
1913     // Scan of the file, and count the number of "processor" (osId) fields,
1914     // and find the highest value of <n> for a node_<n> field.
1915     //
1916     char buf[256];
1917     unsigned num_records = 0;
1918     while (! feof(f)) {
1919         buf[sizeof(buf) - 1] = 1;
1920         if (! fgets(buf, sizeof(buf), f)) {
1921             //
1922             // Read errors presumably because of EOF
1923             //
1924             break;
1925         }
1926 
1927         char s1[] = "processor";
1928         if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1929             num_records++;
1930             continue;
1931         }
1932 
1933         //
1934         // FIXME - this will match "node_<n> <garbage>"
1935         //
1936         unsigned level;
1937         if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
1938             if (nodeIdIndex + level >= maxIndex) {
1939                 maxIndex = nodeIdIndex + level;
1940             }
1941             continue;
1942         }
1943     }
1944 
1945     //
1946     // Check for empty file / no valid processor records, or too many.
1947     // The number of records can't exceed the number of valid bits in the
1948     // affinity mask.
1949     //
1950     if (num_records == 0) {
1951         *line = 0;
1952         *msg_id = kmp_i18n_str_NoProcRecords;
1953         return -1;
1954     }
1955     if (num_records > (unsigned)__kmp_xproc) {
1956         *line = 0;
1957         *msg_id = kmp_i18n_str_TooManyProcRecords;
1958         return -1;
1959     }
1960 
1961     //
    // Set the file pointer back to the beginning, so that we can scan the
1963     // file again, this time performing a full parse of the data.
1964     // Allocate a vector of ProcCpuInfo object, where we will place the data.
1965     // Adding an extra element at the end allows us to remove a lot of extra
1966     // checks for termination conditions.
1967     //
1968     if (fseek(f, 0, SEEK_SET) != 0) {
1969         *line = 0;
1970         *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1971         return -1;
1972     }
1973 
1974     //
1975     // Allocate the array of records to store the proc info in.  The dummy
1976     // element at the end makes the logic in filling them out easier to code.
1977     //
1978     unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1979       * sizeof(unsigned *));
1980     unsigned i;
1981     for (i = 0; i <= num_records; i++) {
1982         threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1983           * sizeof(unsigned));
1984     }
1985 
1986 #define CLEANUP_THREAD_INFO \
1987     for (i = 0; i <= num_records; i++) {                                \
1988         __kmp_free(threadInfo[i]);                                      \
1989     }                                                                   \
1990     __kmp_free(threadInfo);
1991 
1992     //
1993     // A value of UINT_MAX means that we didn't find the field
1994     //
1995     unsigned __index;
1996 
1997 #define INIT_PROC_INFO(p) \
1998     for (__index = 0; __index <= maxIndex; __index++) {                 \
1999         (p)[__index] = UINT_MAX;                                        \
2000     }
2001 
2002     for (i = 0; i <= num_records; i++) {
2003         INIT_PROC_INFO(threadInfo[i]);
2004     }
2005 
2006     unsigned num_avail = 0;
2007     *line = 0;
2008     while (! feof(f)) {
2009         //
2010         // Create an inner scoping level, so that all the goto targets at the
2011         // end of the loop appear in an outer scoping level.  This avoids
2012         // warnings about jumping past an initialization to a target in the
2013         // same block.
2014         //
2015         {
2016             buf[sizeof(buf) - 1] = 1;
2017             bool long_line = false;
2018             if (! fgets(buf, sizeof(buf), f)) {
2019                 //
2020                 // Read errors presumably because of EOF
2021                 //
                // If there is valid data in threadInfo[num_avail], then fake
                // a blank line to ensure that the last address gets parsed.
2024                 //
2025                 bool valid = false;
2026                 for (i = 0; i <= maxIndex; i++) {
2027                     if (threadInfo[num_avail][i] != UINT_MAX) {
2028                         valid = true;
2029                     }
2030                 }
2031                 if (! valid) {
2032                     break;
2033                 }
2034                 buf[0] = 0;
2035             } else if (!buf[sizeof(buf) - 1]) {
2036                 //
2037                 // The line is longer than the buffer.  Set a flag and don't
2038                 // emit an error if we were going to ignore the line, anyway.
2039                 //
2040                 long_line = true;
2041 
2042 #define CHECK_LINE \
2043     if (long_line) {                                                    \
2044         CLEANUP_THREAD_INFO;                                            \
2045         *msg_id = kmp_i18n_str_LongLineCpuinfo;                         \
2046         return -1;                                                      \
2047     }
2048             }
2049             (*line)++;
2050 
2051             char s1[] = "processor";
2052             if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2053                 CHECK_LINE;
2054                 char *p = strchr(buf + sizeof(s1) - 1, ':');
2055                 unsigned val;
2056                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2057                 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
2058                 threadInfo[num_avail][osIdIndex] = val;
2059 #if KMP_OS_LINUX && USE_SYSFS_INFO
2060                 char path[256];
2061                 KMP_SNPRINTF(path, sizeof(path),
2062                     "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
2063                     threadInfo[num_avail][osIdIndex]);
2064                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
2065 
2066                 KMP_SNPRINTF(path, sizeof(path),
2067                     "/sys/devices/system/cpu/cpu%u/topology/core_id",
2068                     threadInfo[num_avail][osIdIndex]);
2069                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
2070                 continue;
2071 #else
2072             }
2073             char s2[] = "physical id";
2074             if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
2075                 CHECK_LINE;
2076                 char *p = strchr(buf + sizeof(s2) - 1, ':');
2077                 unsigned val;
2078                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2079                 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
2080                 threadInfo[num_avail][pkgIdIndex] = val;
2081                 continue;
2082             }
2083             char s3[] = "core id";
2084             if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2085                 CHECK_LINE;
2086                 char *p = strchr(buf + sizeof(s3) - 1, ':');
2087                 unsigned val;
2088                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2089                 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
2090                 threadInfo[num_avail][coreIdIndex] = val;
2091                 continue;
2092 #endif // KMP_OS_LINUX && USE_SYSFS_INFO
2093             }
2094             char s4[] = "thread id";
2095             if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2096                 CHECK_LINE;
2097                 char *p = strchr(buf + sizeof(s4) - 1, ':');
2098                 unsigned val;
2099                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2100                 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
2101                 threadInfo[num_avail][threadIdIndex] = val;
2102                 continue;
2103             }
2104             unsigned level;
2105             if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
2106                 CHECK_LINE;
2107                 char *p = strchr(buf + sizeof(s4) - 1, ':');
2108                 unsigned val;
2109                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2110                 KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2111                 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
2112                 threadInfo[num_avail][nodeIdIndex + level] = val;
2113                 continue;
2114             }
2115 
2116             //
2117             // We didn't recognize the leading token on the line.
2118             // There are lots of leading tokens that we don't recognize -
2119             // if the line isn't empty, go on to the next line.
2120             //
2121             if ((*buf != 0) && (*buf != '\n')) {
2122                 //
2123                 // If the line is longer than the buffer, read characters
2124                 // until we find a newline.
2125                 //
2126                 if (long_line) {
2127                     int ch;
2128                     while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
2129                 }
2130                 continue;
2131             }
2132 
2133             //
2134             // A newline has signalled the end of the processor record.
2135             // Check that there aren't too many procs specified.
2136             //
2137             if ((int)num_avail == __kmp_xproc) {
2138                 CLEANUP_THREAD_INFO;
2139                 *msg_id = kmp_i18n_str_TooManyEntries;
2140                 return -1;
2141             }
2142 
2143             //
2144             // Check for missing fields.  The osId field must be there, and we
2145             // currently require that the physical id field is specified, also.
2146             //
2147             if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2148                 CLEANUP_THREAD_INFO;
2149                 *msg_id = kmp_i18n_str_MissingProcField;
2150                 return -1;
2151             }
2152             if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2153                 CLEANUP_THREAD_INFO;
2154                 *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2155                 return -1;
2156             }
2157 
2158             //
2159             // Skip this proc if it is not included in the machine model.
2160             //
2161             if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
2162                 INIT_PROC_INFO(threadInfo[num_avail]);
2163                 continue;
2164             }
2165 
2166             //
2167             // We have a successful parse of this proc's info.
2168             // Increment the counter, and prepare for the next proc.
2169             //
2170             num_avail++;
2171             KMP_ASSERT(num_avail <= num_records);
2172             INIT_PROC_INFO(threadInfo[num_avail]);
2173         }
2174         continue;
2175 
2176         no_val:
2177         CLEANUP_THREAD_INFO;
2178         *msg_id = kmp_i18n_str_MissingValCpuinfo;
2179         return -1;
2180 
2181         dup_field:
2182         CLEANUP_THREAD_INFO;
2183         *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2184         return -1;
2185     }
2186     *line = 0;
2187 
2188 # if KMP_MIC && REDUCE_TEAM_SIZE
2189     unsigned teamSize = 0;
2190 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2191 
2192     // check for num_records == __kmp_xproc ???
2193 
2194     //
2195     // If there's only one thread context to bind to, form an Address object
2196     // with depth 1 and return immediately (or, if affinity is off, set
2197     // address2os to NULL and return).
2198     //
2199     // If it is configured to omit the package level when there is only a
2200     // single package, the logic at the end of this routine won't work if
2201     // there is only a single thread - it would try to form an Address
2202     // object with depth 0.
2203     //
2204     KMP_ASSERT(num_avail > 0);
2205     KMP_ASSERT(num_avail <= num_records);
2206     if (num_avail == 1) {
2207         __kmp_ncores = 1;
2208         __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2209         if (__kmp_affinity_verbose) {
2210             if (! KMP_AFFINITY_CAPABLE()) {
2211                 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2212                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2213                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2214             }
2215             else {
2216                 char buf[KMP_AFFIN_MASK_PRINT_LEN];
2217                 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2218                   fullMask);
2219                 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2220                 if (__kmp_affinity_respect_mask) {
2221                     KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2222                 } else {
2223                     KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2224                 }
2225                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2226                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2227             }
2228             int index;
2229             kmp_str_buf_t buf;
2230             __kmp_str_buf_init(&buf);
2231             __kmp_str_buf_print(&buf, "1");
2232             for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2233                 __kmp_str_buf_print(&buf, " x 1");
2234             }
2235             KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2236             __kmp_str_buf_free(&buf);
2237         }
2238 
2239         if (__kmp_affinity_type == affinity_none) {
2240             CLEANUP_THREAD_INFO;
2241             return 0;
2242         }
2243 
2244         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2245         Address addr(1);
2246         addr.labels[0] = threadInfo[0][pkgIdIndex];
2247         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2248 
2249         if (__kmp_affinity_gran_levels < 0) {
2250             __kmp_affinity_gran_levels = 0;
2251         }
2252 
2253         if (__kmp_affinity_verbose) {
2254             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2255         }
2256 
2257         CLEANUP_THREAD_INFO;
2258         return 1;
2259     }
2260 
2261     //
2262     // Sort the threadInfo table by physical Id.
2263     //
2264     qsort(threadInfo, num_avail, sizeof(*threadInfo),
2265       __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2266 
2267     //
2268     // The table is now sorted by pkgId / coreId / threadId, but we really
2269     // don't know the radix of any of the fields.  pkgId's may be sparsely
2270     // assigned among the chips on a system.  Although coreId's are usually
2271     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2272     // [0..threadsPerCore-1], we don't want to make any such assumptions.
2273     //
2274     // For that matter, we don't know what coresPerPkg and threadsPerCore
2275     // (or the total # packages) are at this point - we want to determine
2276     // that now.  We only have an upper bound on the first two figures.
2277     //
2278     unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2279       * sizeof(unsigned));
2280     unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2281       * sizeof(unsigned));
2282     unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2283       * sizeof(unsigned));
2284     unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2285       * sizeof(unsigned));
2286 
2287     bool assign_thread_ids = false;
2288     unsigned threadIdCt;
2289     unsigned index;
2290 
2291     restart_radix_check:
2292     threadIdCt = 0;
2293 
2294     //
2295     // Initialize the counter arrays with data from threadInfo[0].
2296     //
2297     if (assign_thread_ids) {
2298         if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2299             threadInfo[0][threadIdIndex] = threadIdCt++;
2300         }
2301         else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2302             threadIdCt = threadInfo[0][threadIdIndex] + 1;
2303         }
2304     }
2305     for (index = 0; index <= maxIndex; index++) {
2306         counts[index] = 1;
2307         maxCt[index] = 1;
2308         totals[index] = 1;
2309         lastId[index] = threadInfo[0][index];;
2310     }
2311 
2312     //
2313     // Run through the rest of the OS procs.
2314     //
2315     for (i = 1; i < num_avail; i++) {
2316         //
2317         // Find the most significant index whose id differs
2318         // from the id for the previous OS proc.
2319         //
2320         for (index = maxIndex; index >= threadIdIndex; index--) {
2321             if (assign_thread_ids && (index == threadIdIndex)) {
2322                 //
2323                 // Auto-assign the thread id field if it wasn't specified.
2324                 //
2325                 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2326                     threadInfo[i][threadIdIndex] = threadIdCt++;
2327                 }
2328 
2329                 //
                // Apparently the thread id field was specified for some
2331                 // entries and not others.  Start the thread id counter
2332                 // off at the next higher thread id.
2333                 //
2334                 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2335                     threadIdCt = threadInfo[i][threadIdIndex] + 1;
2336                 }
2337             }
2338             if (threadInfo[i][index] != lastId[index]) {
2339                 //
2340                 // Run through all indices which are less significant,
2341                 // and reset the counts to 1.
2342                 //
2343                 // At all levels up to and including index, we need to
2344                 // increment the totals and record the last id.
2345                 //
2346                 unsigned index2;
2347                 for (index2 = threadIdIndex; index2 < index; index2++) {
2348                     totals[index2]++;
2349                     if (counts[index2] > maxCt[index2]) {
2350                         maxCt[index2] = counts[index2];
2351                     }
2352                     counts[index2] = 1;
2353                     lastId[index2] = threadInfo[i][index2];
2354                 }
2355                 counts[index]++;
2356                 totals[index]++;
2357                 lastId[index] = threadInfo[i][index];
2358 
2359                 if (assign_thread_ids && (index > threadIdIndex)) {
2360 
2361 # if KMP_MIC && REDUCE_TEAM_SIZE
2362                     //
2363                     // The default team size is the total #threads in the machine
2364                     // minus 1 thread for every core that has 3 or more threads.
2365                     //
2366                     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2367 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2368 
2369                     //
2370                     // Restart the thread counter, as we are on a new core.
2371                     //
2372                     threadIdCt = 0;
2373 
2374                     //
2375                     // Auto-assign the thread id field if it wasn't specified.
2376                     //
2377                     if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2378                         threadInfo[i][threadIdIndex] = threadIdCt++;
2379                     }
2380 
2381                     //
                    // Apparently the thread id field was specified for some
2383                     // entries and not others.  Start the thread id counter
2384                     // off at the next higher thread id.
2385                     //
2386                     else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2387                         threadIdCt = threadInfo[i][threadIdIndex] + 1;
2388                     }
2389                 }
2390                 break;
2391             }
2392         }
2393         if (index < threadIdIndex) {
2394             //
2395             // If thread ids were specified, it is an error if they are not
            // unique.  Also, check that we haven't already restarted the
2397             // loop (to be safe - shouldn't need to).
2398             //
2399             if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2400               || assign_thread_ids) {
2401                 __kmp_free(lastId);
2402                 __kmp_free(totals);
2403                 __kmp_free(maxCt);
2404                 __kmp_free(counts);
2405                 CLEANUP_THREAD_INFO;
2406                 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2407                 return -1;
2408             }
2409 
2410             //
            // If the thread ids were not specified and we see entries
            // that are duplicates, start the loop over and
2413             // assign the thread ids manually.
2414             //
2415             assign_thread_ids = true;
2416             goto restart_radix_check;
2417         }
2418     }
2419 
2420 # if KMP_MIC && REDUCE_TEAM_SIZE
2421     //
2422     // The default team size is the total #threads in the machine
2423     // minus 1 thread for every core that has 3 or more threads.
2424     //
2425     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2426 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2427 
2428     for (index = threadIdIndex; index <= maxIndex; index++) {
2429         if (counts[index] > maxCt[index]) {
2430             maxCt[index] = counts[index];
2431         }
2432     }
2433 
2434     __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2435     nCoresPerPkg = maxCt[coreIdIndex];
2436     nPackages = totals[pkgIdIndex];
2437 
2438     //
2439     // Check to see if the machine topology is uniform
2440     //
2441     unsigned prod = totals[maxIndex];
2442     for (index = threadIdIndex; index < maxIndex; index++) {
2443        prod *= maxCt[index];
2444     }
2445     bool uniform = (prod == totals[threadIdIndex]);
2446 
2447     //
2448     // When affinity is off, this routine will still be called to set
2449     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
2450     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
2451     // correctly, and return now if affinity is not enabled.
2452     //
2453     __kmp_ncores = totals[coreIdIndex];
2454 
2455     if (__kmp_affinity_verbose) {
2456         if (! KMP_AFFINITY_CAPABLE()) {
2457                 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2458                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2459                 if (uniform) {
2460                     KMP_INFORM(Uniform, "KMP_AFFINITY");
2461                 } else {
2462                     KMP_INFORM(NonUniform, "KMP_AFFINITY");
2463                 }
2464         }
2465         else {
2466             char buf[KMP_AFFIN_MASK_PRINT_LEN];
2467             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
2468                 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2469                 if (__kmp_affinity_respect_mask) {
2470                     KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2471                 } else {
2472                     KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2473                 }
2474                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2475                 if (uniform) {
2476                     KMP_INFORM(Uniform, "KMP_AFFINITY");
2477                 } else {
2478                     KMP_INFORM(NonUniform, "KMP_AFFINITY");
2479                 }
2480         }
2481         kmp_str_buf_t buf;
2482         __kmp_str_buf_init(&buf);
2483 
2484         __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2485         for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2486             __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2487         }
2488         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str,  maxCt[coreIdIndex],
2489           maxCt[threadIdIndex], __kmp_ncores);
2490 
2491         __kmp_str_buf_free(&buf);
2492     }
2493 
2494 # if KMP_MIC && REDUCE_TEAM_SIZE
2495     //
2496     // Set the default team size.
2497     //
2498     if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2499         __kmp_dflt_team_nth = teamSize;
2500         KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2501           __kmp_dflt_team_nth));
2502     }
2503 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2504 
2505     if (__kmp_affinity_type == affinity_none) {
2506         __kmp_free(lastId);
2507         __kmp_free(totals);
2508         __kmp_free(maxCt);
2509         __kmp_free(counts);
2510         CLEANUP_THREAD_INFO;
2511         return 0;
2512     }
2513 
2514     //
2515     // Count the number of levels which have more nodes at that level than
2516     // at the parent's level (with there being an implicit root node of
2517     // the top level).  This is equivalent to saying that there is at least
2518     // one node at this level which has a sibling.  These levels are in the
2519     // map, and the package level is always in the map.
2520     //
2521     bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2522     int level = 0;
2523     for (index = threadIdIndex; index < maxIndex; index++) {
2524         KMP_ASSERT(totals[index] >= totals[index + 1]);
2525         inMap[index] = (totals[index] > totals[index + 1]);
2526     }
2527     inMap[maxIndex] = (totals[maxIndex] > 1);
2528     inMap[pkgIdIndex] = true;
2529 
2530     int depth = 0;
2531     for (index = threadIdIndex; index <= maxIndex; index++) {
2532         if (inMap[index]) {
2533             depth++;
2534         }
2535     }
2536     KMP_ASSERT(depth > 0);
2537 
2538     //
2539     // Construct the data structure that is to be returned.
2540     //
2541     *address2os = (AddrUnsPair*)
2542       __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2543     int pkgLevel = -1;
2544     int coreLevel = -1;
2545     int threadLevel = -1;
2546 
2547     for (i = 0; i < num_avail; ++i) {
2548         Address addr(depth);
2549         unsigned os = threadInfo[i][osIdIndex];
2550         int src_index;
2551         int dst_index = 0;
2552 
2553         for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2554             if (! inMap[src_index]) {
2555                 continue;
2556             }
2557             addr.labels[dst_index] = threadInfo[i][src_index];
2558             if (src_index == pkgIdIndex) {
2559                 pkgLevel = dst_index;
2560             }
2561             else if (src_index == coreIdIndex) {
2562                 coreLevel = dst_index;
2563             }
2564             else if (src_index == threadIdIndex) {
2565                 threadLevel = dst_index;
2566             }
2567             dst_index++;
2568         }
2569         (*address2os)[i] = AddrUnsPair(addr, os);
2570     }
2571 
2572     if (__kmp_affinity_gran_levels < 0) {
2573         //
2574         // Set the granularity level based on what levels are modeled
2575         // in the machine topology map.
2576         //
2577         unsigned src_index;
2578         __kmp_affinity_gran_levels = 0;
2579         for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2580             if (! inMap[src_index]) {
2581                 continue;
2582             }
2583             switch (src_index) {
2584                 case threadIdIndex:
2585                 if (__kmp_affinity_gran > affinity_gran_thread) {
2586                     __kmp_affinity_gran_levels++;
2587                 }
2588 
2589                 break;
2590                 case coreIdIndex:
2591                 if (__kmp_affinity_gran > affinity_gran_core) {
2592                     __kmp_affinity_gran_levels++;
2593                 }
2594                 break;
2595 
2596                 case pkgIdIndex:
2597                 if (__kmp_affinity_gran > affinity_gran_package) {
2598                     __kmp_affinity_gran_levels++;
2599                 }
2600                 break;
2601             }
2602         }
2603     }
2604 
2605     if (__kmp_affinity_verbose) {
2606         __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2607           coreLevel, threadLevel);
2608     }
2609 
2610     __kmp_free(inMap);
2611     __kmp_free(lastId);
2612     __kmp_free(totals);
2613     __kmp_free(maxCt);
2614     __kmp_free(counts);
2615     CLEANUP_THREAD_INFO;
2616     return depth;
2617 }
2618 
2619 
2620 //
2621 // Create and return a table of affinity masks, indexed by OS thread ID.
2622 // This routine handles OR'ing together all the affinity masks of threads
2623 // that are sufficiently close, if granularity > fine.
2624 //
// On return:
//   *maxIndex  - largest OS proc id seen in address2os (upper bound for
//                indexing the returned table via KMP_CPU_INDEX).
//   *numUnique - number of distinct masks (granularity groups) formed.
// Returns a table of (maxOsId + 1) masks allocated with __kmp_allocate;
// the caller owns it.  Note: address2os is sorted in place by physical
// (topology) order as a side effect.
static kmp_affin_mask_t *
__kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
  AddrUnsPair *address2os, unsigned numAddrs)
{
    //
    // First form a table of affinity masks in order of OS thread id.
    //
    unsigned depth;
    unsigned maxOsId;
    unsigned i;

    KMP_ASSERT(numAddrs > 0);
    depth = address2os[0].first.depth;

    // Find the largest OS proc id so we know how big a table to allocate.
    maxOsId = 0;
    for (i = 0; i < numAddrs; i++) {
        unsigned osId = address2os[i].second;
        if (osId > maxOsId) {
            maxOsId = osId;
        }
    }
    kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
      (maxOsId + 1) * __kmp_affin_mask_size);

    //
    // Sort the address2os table according to physical order.  Doing so
    // will put all threads on the same core/package/node in consecutive
    // locations.
    //
    qsort(address2os, numAddrs, sizeof(*address2os),
      __kmp_affinity_cmp_Address_labels);

    KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
    if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
        KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY",  __kmp_affinity_gran_levels);
    }
    // If the granularity is at least as coarse as the full topology depth,
    // every mask covers the whole machine, so warn that threads may migrate.
    if (__kmp_affinity_gran_levels >= (int)depth) {
        if (__kmp_affinity_verbose || (__kmp_affinity_warnings
          && (__kmp_affinity_type != affinity_none))) {
            KMP_WARNING(AffThreadsMayMigrate);
        }
    }

    //
    // Run through the table, forming the masks for all threads on each
    // core.  Threads on the same core will have identical "Address"
    // objects, not considering the last level, which must be the thread
    // id.  All threads on a core will appear consecutively.
    //
    unsigned unique = 0;
    unsigned j = 0;                             // index of 1st thread on core
    unsigned leader = 0;
    Address *leaderAddr = &(address2os[0].first);
    // "sum" accumulates the union of OS proc bits for the current group;
    // stack-allocated since its lifetime is this function only.
    kmp_affin_mask_t *sum
      = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
    KMP_CPU_ZERO(sum);
    KMP_CPU_SET(address2os[0].second, sum);
    for (i = 1; i < numAddrs; i++) {
        //
        // If this thread is sufficiently close to the leader (within the
        // granularity setting), then set the bit for this os thread in the
        // affinity mask for this group, and go on to the next thread.
        //
        if (leaderAddr->isClose(address2os[i].first,
          __kmp_affinity_gran_levels)) {
            KMP_CPU_SET(address2os[i].second, sum);
            continue;
        }

        //
        // For every thread in this group, copy the mask to the thread's
        // entry in the osId2Mask table.  Mark the first address as a
        // leader.  Note: j is intentionally NOT reset here; it always
        // trails at the first thread of the group being flushed.
        //
        for (; j < i; j++) {
            unsigned osId = address2os[j].second;
            KMP_DEBUG_ASSERT(osId <= maxOsId);
            kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
            KMP_CPU_COPY(mask, sum);
            address2os[j].first.leader = (j == leader);
        }
        unique++;

        //
        // Start a new mask.
        //
        leader = i;
        leaderAddr = &(address2os[i].first);
        KMP_CPU_ZERO(sum);
        KMP_CPU_SET(address2os[i].second, sum);
    }

    //
    // For every thread in last group, copy the mask to the thread's
    // entry in the osId2Mask table.
    //
    for (; j < i; j++) {
        unsigned osId = address2os[j].second;
        KMP_DEBUG_ASSERT(osId <= maxOsId);
        kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
        KMP_CPU_COPY(mask, sum);
        address2os[j].first.leader = (j == leader);
    }
    unique++;

    *maxIndex = maxOsId;
    *numUnique = unique;
    return osId2Mask;
}
2734 
2735 
2736 //
2737 // Stuff for the affinity proclist parsers.  It's easier to declare these vars
2738 // as file-static than to try and pass them through the calling sequence of
2739 // the recursive-descent OMP_PLACES parser.
2740 //
2741 static kmp_affin_mask_t *newMasks;
2742 static int numNewMasks;
2743 static int nextNewMask;
2744 
2745 #define ADD_MASK(_mask) \
2746     {                                                                   \
2747         if (nextNewMask >= numNewMasks) {                               \
2748             numNewMasks *= 2;                                           \
2749             newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
2750               numNewMasks * __kmp_affin_mask_size);                     \
2751         }                                                               \
2752         KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));    \
2753         nextNewMask++;                                                  \
2754     }
2755 
2756 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2757     {                                                                   \
2758         if (((_osId) > _maxOsId) ||                                     \
2759           (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
2760             if (__kmp_affinity_verbose || (__kmp_affinity_warnings      \
2761               && (__kmp_affinity_type != affinity_none))) {             \
2762                 KMP_WARNING(AffIgnoreInvalidProcID, _osId);             \
2763             }                                                           \
2764         }                                                               \
2765         else {                                                          \
2766             ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));               \
2767         }                                                               \
2768     }
2769 
2770 
2771 //
2772 // Re-parse the proclist (for the explicit affinity type), and form the list
2773 // of affinity newMasks indexed by gtid.
2774 //
2775 static void
2776 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2777   unsigned int *out_numMasks, const char *proclist,
2778   kmp_affin_mask_t *osId2Mask, int maxOsId)
2779 {
2780     const char *scan = proclist;
2781     const char *next = proclist;
2782 
2783     //
2784     // We use malloc() for the temporary mask vector,
2785     // so that we can use realloc() to extend it.
2786     //
2787     numNewMasks = 2;
2788     newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2789       * __kmp_affin_mask_size);
2790     nextNewMask = 0;
2791     kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
2792       __kmp_affin_mask_size);
2793     int setSize = 0;
2794 
2795     for (;;) {
2796         int start, end, stride;
2797 
2798         SKIP_WS(scan);
2799         next = scan;
2800         if (*next == '\0') {
2801             break;
2802         }
2803 
2804         if (*next == '{') {
2805             int num;
2806             setSize = 0;
2807             next++;     // skip '{'
2808             SKIP_WS(next);
2809             scan = next;
2810 
2811             //
2812             // Read the first integer in the set.
2813             //
2814             KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2815               "bad proclist");
2816             SKIP_DIGITS(next);
2817             num = __kmp_str_to_int(scan, *next);
2818             KMP_ASSERT2(num >= 0, "bad explicit proc list");
2819 
2820             //
2821             // Copy the mask for that osId to the sum (union) mask.
2822             //
2823             if ((num > maxOsId) ||
2824               (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2825                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2826                   && (__kmp_affinity_type != affinity_none))) {
2827                     KMP_WARNING(AffIgnoreInvalidProcID, num);
2828                 }
2829                 KMP_CPU_ZERO(sumMask);
2830             }
2831             else {
2832                 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2833                 setSize = 1;
2834             }
2835 
2836             for (;;) {
2837                 //
2838                 // Check for end of set.
2839                 //
2840                 SKIP_WS(next);
2841                 if (*next == '}') {
2842                     next++;     // skip '}'
2843                     break;
2844                 }
2845 
2846                 //
2847                 // Skip optional comma.
2848                 //
2849                 if (*next == ',') {
2850                     next++;
2851                 }
2852                 SKIP_WS(next);
2853 
2854                 //
2855                 // Read the next integer in the set.
2856                 //
2857                 scan = next;
2858                 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2859                   "bad explicit proc list");
2860 
2861                 SKIP_DIGITS(next);
2862                 num = __kmp_str_to_int(scan, *next);
2863                 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2864 
2865                 //
2866                 // Add the mask for that osId to the sum mask.
2867                 //
2868                 if ((num > maxOsId) ||
2869                   (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2870                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2871                       && (__kmp_affinity_type != affinity_none))) {
2872                         KMP_WARNING(AffIgnoreInvalidProcID, num);
2873                     }
2874                 }
2875                 else {
2876                     KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2877                     setSize++;
2878                 }
2879             }
2880             if (setSize > 0) {
2881                 ADD_MASK(sumMask);
2882             }
2883 
2884             SKIP_WS(next);
2885             if (*next == ',') {
2886                 next++;
2887             }
2888             scan = next;
2889             continue;
2890         }
2891 
2892         //
2893         // Read the first integer.
2894         //
2895         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2896         SKIP_DIGITS(next);
2897         start = __kmp_str_to_int(scan, *next);
2898         KMP_ASSERT2(start >= 0, "bad explicit proc list");
2899         SKIP_WS(next);
2900 
2901         //
2902         // If this isn't a range, then add a mask to the list and go on.
2903         //
2904         if (*next != '-') {
2905             ADD_MASK_OSID(start, osId2Mask, maxOsId);
2906 
2907             //
2908             // Skip optional comma.
2909             //
2910             if (*next == ',') {
2911                 next++;
2912             }
2913             scan = next;
2914             continue;
2915         }
2916 
2917         //
2918         // This is a range.  Skip over the '-' and read in the 2nd int.
2919         //
2920         next++;         // skip '-'
2921         SKIP_WS(next);
2922         scan = next;
2923         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2924         SKIP_DIGITS(next);
2925         end = __kmp_str_to_int(scan, *next);
2926         KMP_ASSERT2(end >= 0, "bad explicit proc list");
2927 
2928         //
2929         // Check for a stride parameter
2930         //
2931         stride = 1;
2932         SKIP_WS(next);
2933         if (*next == ':') {
2934             //
2935             // A stride is specified.  Skip over the ':" and read the 3rd int.
2936             //
2937             int sign = +1;
2938             next++;         // skip ':'
2939             SKIP_WS(next);
2940             scan = next;
2941             if (*next == '-') {
2942                 sign = -1;
2943                 next++;
2944                 SKIP_WS(next);
2945                 scan = next;
2946             }
2947             KMP_ASSERT2((*next >=  '0') && (*next <= '9'),
2948               "bad explicit proc list");
2949             SKIP_DIGITS(next);
2950             stride = __kmp_str_to_int(scan, *next);
2951             KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2952             stride *= sign;
2953         }
2954 
2955         //
2956         // Do some range checks.
2957         //
2958         KMP_ASSERT2(stride != 0, "bad explicit proc list");
2959         if (stride > 0) {
2960             KMP_ASSERT2(start <= end, "bad explicit proc list");
2961         }
2962         else {
2963             KMP_ASSERT2(start >= end, "bad explicit proc list");
2964         }
2965         KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2966 
2967         //
2968         // Add the mask for each OS proc # to the list.
2969         //
2970         if (stride > 0) {
2971             do {
2972                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2973                 start += stride;
2974             } while (start <= end);
2975         }
2976         else {
2977             do {
2978                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2979                 start += stride;
2980             } while (start >= end);
2981         }
2982 
2983         //
2984         // Skip optional comma.
2985         //
2986         SKIP_WS(next);
2987         if (*next == ',') {
2988             next++;
2989         }
2990         scan = next;
2991     }
2992 
2993     *out_numMasks = nextNewMask;
2994     if (nextNewMask == 0) {
2995         *out_masks = NULL;
2996         KMP_INTERNAL_FREE(newMasks);
2997         return;
2998     }
2999     *out_masks
3000       = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
3001     KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
3002     __kmp_free(sumMask);
3003     KMP_INTERNAL_FREE(newMasks);
3004 }
3005 
3006 
3007 # if OMP_40_ENABLED
3008 
3009 /*-----------------------------------------------------------------------------
3010 
3011 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
places.  Again, here is the grammar:
3013 
3014 place_list := place
3015 place_list := place , place_list
3016 place := num
3017 place := place : num
3018 place := place : num : signed
place := { subplace_list }
3020 place := ! place                  // (lowest priority)
3021 subplace_list := subplace
3022 subplace_list := subplace , subplace_list
3023 subplace := num
3024 subplace := num : num
3025 subplace := num : num : signed
3026 signed := num
3027 signed := + signed
3028 signed := - signed
3029 
3030 -----------------------------------------------------------------------------*/
3031 
// Parse the subplace list inside "{...}" of an OMP_PLACES place.
// Grammar handled here: subplace := num | num : num | num : num : signed,
// with subplaces separated by commas.  Each selected OS proc id's mask is
// OR'ed into tempMask and counted in *setSize.
//
// *scan is advanced as input is consumed; on return it points at the
// closing '}' (which the caller consumes).
static void
__kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
{
    const char *next;

    for (;;) {
        int start, count, stride, i;

        //
        // Read in the starting proc id
        //
        SKIP_WS(*scan);
        KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
          "bad explicit places list");
        next = *scan;
        SKIP_DIGITS(next);
        start = __kmp_str_to_int(*scan, *next);
        KMP_ASSERT(start >= 0);
        *scan = next;

        //
        // valid follow sets are ',' ':' and '}'
        //
        SKIP_WS(*scan);
        if (**scan == '}' || **scan == ',') {
            // Simple "num" subplace: add just this one proc id.
            if ((start > maxOsId) ||
              (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
                if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                  && (__kmp_affinity_type != affinity_none))) {
                    KMP_WARNING(AffIgnoreInvalidProcID, start);
                }
            }
            else {
                KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
                (*setSize)++;
            }
            if (**scan == '}') {
                break;
            }
            (*scan)++;  // skip ','
            continue;
        }
        KMP_ASSERT2(**scan == ':', "bad explicit places list");
        (*scan)++;      // skip ':'

        //
        // Read count parameter
        //
        SKIP_WS(*scan);
        KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
          "bad explicit places list");
        next = *scan;
        SKIP_DIGITS(next);
        count = __kmp_str_to_int(*scan, *next);
        KMP_ASSERT(count >= 0);
        *scan = next;

        //
        // valid follow sets are ',' ':' and '}'
        //
        SKIP_WS(*scan);
        if (**scan == '}' || **scan == ',') {
            // "num : count" subplace: add count consecutive proc ids.
            for (i = 0; i < count; i++) {
                if ((start > maxOsId) ||
                  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
                    if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                      && (__kmp_affinity_type != affinity_none))) {
                        KMP_WARNING(AffIgnoreInvalidProcID, start);
                    }
                    break;  // don't proliferate warnings for large count
                }
                else {
                    KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
                    start++;
                    (*setSize)++;
                }
            }
            if (**scan == '}') {
                break;
            }
            (*scan)++;  // skip ','
            continue;
        }
        KMP_ASSERT2(**scan == ':', "bad explicit places list");
        (*scan)++;      // skip ':'

        //
        // Read stride parameter
        //
        // Consume any run of '+'/'-' signs preceding the number; each '-'
        // flips the sign (grammar: signed := num | + signed | - signed).
        int sign = +1;
        for (;;) {
            SKIP_WS(*scan);
            if (**scan == '+') {
                (*scan)++; // skip '+'
                continue;
            }
            if (**scan == '-') {
                sign *= -1;
                (*scan)++; // skip '-'
                continue;
            }
            break;
        }
        SKIP_WS(*scan);
        KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
          "bad explicit places list");
        next = *scan;
        SKIP_DIGITS(next);
        stride = __kmp_str_to_int(*scan, *next);
        KMP_ASSERT(stride >= 0);
        *scan = next;
        stride *= sign;

        //
        // valid follow sets are ',' and '}'
        //
        SKIP_WS(*scan);
        if (**scan == '}' || **scan == ',') {
            // "num : count : stride" subplace: add count ids, stride apart.
            // NOTE(review): with a negative stride, start can drop below 0
            // before count is exhausted; (start > maxOsId) does not catch
            // that, so KMP_CPU_ISSET would be probed with a negative id --
            // verify callers constrain count/stride, or add a start < 0 check.
            for (i = 0; i < count; i++) {
                if ((start > maxOsId) ||
                  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
                    if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                      && (__kmp_affinity_type != affinity_none))) {
                        KMP_WARNING(AffIgnoreInvalidProcID, start);
                    }
                    break;  // don't proliferate warnings for large count
                }
                else {
                    KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
                    start += stride;
                    (*setSize)++;
                }
            }
            if (**scan == '}') {
                break;
            }
            (*scan)++;  // skip ','
            continue;
        }

        KMP_ASSERT2(0, "bad explicit places list");
    }
}
3176 
3177 
3178 static void
3179 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3180   int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3181 {
3182     const char *next;
3183 
3184     //
3185     // valid follow sets are '{' '!' and num
3186     //
3187     SKIP_WS(*scan);
3188     if (**scan == '{') {
3189         (*scan)++;      // skip '{'
3190         __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
3191           setSize);
3192         KMP_ASSERT2(**scan == '}', "bad explicit places list");
3193         (*scan)++;      // skip '}'
3194     }
3195     else if (**scan == '!') {
3196         __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3197         KMP_CPU_COMPLEMENT(tempMask);
3198         (*scan)++;      // skip '!'
3199     }
3200     else if ((**scan >= '0') && (**scan <= '9')) {
3201         next = *scan;
3202         SKIP_DIGITS(next);
3203         int num = __kmp_str_to_int(*scan, *next);
3204         KMP_ASSERT(num >= 0);
3205         if ((num > maxOsId) ||
3206           (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3207             if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3208               && (__kmp_affinity_type != affinity_none))) {
3209                 KMP_WARNING(AffIgnoreInvalidProcID, num);
3210             }
3211         }
3212         else {
3213             KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3214             (*setSize)++;
3215         }
3216         *scan = next;  // skip num
3217     }
3218     else {
3219         KMP_ASSERT2(0, "bad explicit places list");
3220     }
3221 }
3222 
3223 
3224 //static void
3225 void
3226 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3227   unsigned int *out_numMasks, const char *placelist,
3228   kmp_affin_mask_t *osId2Mask, int maxOsId)
3229 {
3230     const char *scan = placelist;
3231     const char *next = placelist;
3232 
3233     numNewMasks = 2;
3234     newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
3235       * __kmp_affin_mask_size);
3236     nextNewMask = 0;
3237 
3238     kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
3239       __kmp_affin_mask_size);
3240     KMP_CPU_ZERO(tempMask);
3241     int setSize = 0;
3242 
3243     for (;;) {
3244         __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3245 
3246         //
3247         // valid follow sets are ',' ':' and EOL
3248         //
3249         SKIP_WS(scan);
3250         if (*scan == '\0' || *scan == ',') {
3251             if (setSize > 0) {
3252                 ADD_MASK(tempMask);
3253             }
3254             KMP_CPU_ZERO(tempMask);
3255             setSize = 0;
3256             if (*scan == '\0') {
3257                 break;
3258             }
3259             scan++;     // skip ','
3260             continue;
3261         }
3262 
3263         KMP_ASSERT2(*scan == ':', "bad explicit places list");
3264         scan++;         // skip ':'
3265 
3266         //
3267         // Read count parameter
3268         //
3269         SKIP_WS(scan);
3270         KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3271           "bad explicit places list");
3272         next = scan;
3273         SKIP_DIGITS(next);
3274         int count = __kmp_str_to_int(scan, *next);
3275         KMP_ASSERT(count >= 0);
3276         scan = next;
3277 
3278         //
3279         // valid follow sets are ',' ':' and EOL
3280         //
3281         SKIP_WS(scan);
3282         int stride;
3283         if (*scan == '\0' || *scan == ',') {
3284             stride = +1;
3285         }
3286         else {
3287             KMP_ASSERT2(*scan == ':', "bad explicit places list");
3288             scan++;         // skip ':'
3289 
3290             //
3291             // Read stride parameter
3292             //
3293             int sign = +1;
3294             for (;;) {
3295                 SKIP_WS(scan);
3296                 if (*scan == '+') {
3297                     scan++; // skip '+'
3298                     continue;
3299                 }
3300                 if (*scan == '-') {
3301                     sign *= -1;
3302                     scan++; // skip '-'
3303                     continue;
3304                 }
3305                 break;
3306             }
3307             SKIP_WS(scan);
3308             KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3309               "bad explicit places list");
3310             next = scan;
3311             SKIP_DIGITS(next);
3312             stride = __kmp_str_to_int(scan, *next);
3313             KMP_DEBUG_ASSERT(stride >= 0);
3314             scan = next;
3315             stride *= sign;
3316         }
3317 
3318         if (stride > 0) {
3319             int i;
3320             for (i = 0; i < count; i++) {
3321                 int j;
3322                 if (setSize == 0) {
3323                     break;
3324                 }
3325                 ADD_MASK(tempMask);
3326                 setSize = 0;
3327                 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
3328                     if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3329                         KMP_CPU_CLR(j, tempMask);
3330                     }
3331                     else if ((j > maxOsId) ||
3332                       (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3333                         if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3334                           && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
3335                             KMP_WARNING(AffIgnoreInvalidProcID, j);
3336                         }
3337                         KMP_CPU_CLR(j, tempMask);
3338                     }
3339                     else {
3340                         KMP_CPU_SET(j, tempMask);
3341                         setSize++;
3342                     }
3343                 }
3344                 for (; j >= 0; j--) {
3345                     KMP_CPU_CLR(j, tempMask);
3346                 }
3347             }
3348         }
3349         else {
3350             int i;
3351             for (i = 0; i < count; i++) {
3352                 int j;
3353                 if (setSize == 0) {
3354                     break;
3355                 }
3356                 ADD_MASK(tempMask);
3357                 setSize = 0;
3358                 for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride;
3359                   j++) {
3360                     if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3361                         KMP_CPU_CLR(j, tempMask);
3362                     }
3363                     else if ((j > maxOsId) ||
3364                       (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3365                         if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3366                           && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
3367                             KMP_WARNING(AffIgnoreInvalidProcID, j);
3368                         }
3369                         KMP_CPU_CLR(j, tempMask);
3370                     }
3371                     else {
3372                         KMP_CPU_SET(j, tempMask);
3373                         setSize++;
3374                     }
3375                 }
3376                 for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
3377                     KMP_CPU_CLR(j, tempMask);
3378                 }
3379             }
3380         }
3381         KMP_CPU_ZERO(tempMask);
3382         setSize = 0;
3383 
3384         //
3385         // valid follow sets are ',' and EOL
3386         //
3387         SKIP_WS(scan);
3388         if (*scan == '\0') {
3389             break;
3390         }
3391         if (*scan == ',') {
3392             scan++;     // skip ','
3393             continue;
3394         }
3395 
3396         KMP_ASSERT2(0, "bad explicit places list");
3397     }
3398 
3399     *out_numMasks = nextNewMask;
3400     if (nextNewMask == 0) {
3401         *out_masks = NULL;
3402         KMP_INTERNAL_FREE(newMasks);
3403         return;
3404     }
3405     *out_masks
3406       = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
3407     KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
3408     __kmp_free(tempMask);
3409     KMP_INTERNAL_FREE(newMasks);
3410 }
3411 
3412 # endif /* OMP_40_ENABLED */
3413 
3414 #undef ADD_MASK
3415 #undef ADD_MASK_OSID
3416 
3417 static void
3418 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3419 {
3420     if ( __kmp_place_num_cores == 0 ) {
3421         if ( __kmp_place_num_threads_per_core == 0 ) {
3422             return;   // no cores limiting actions requested, exit
3423         }
3424         __kmp_place_num_cores = nCoresPerPkg;   // use all available cores
3425     }
3426     if ( !__kmp_affinity_uniform_topology() ) {
3427         KMP_WARNING( AffThrPlaceNonUniform );
3428         return; // don't support non-uniform topology
3429     }
3430     if ( depth != 3 ) {
3431         KMP_WARNING( AffThrPlaceNonThreeLevel );
3432         return; // don't support not-3-level topology
3433     }
3434     if ( __kmp_place_num_threads_per_core == 0 ) {
3435         __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore;  // use all HW contexts
3436     }
3437     if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
3438         KMP_WARNING( AffThrPlaceManyCores );
3439         return;
3440     }
3441 
3442     AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3443                             nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3444     int i, j, k, n_old = 0, n_new = 0;
3445     for ( i = 0; i < nPackages; ++i ) {
3446         for ( j = 0; j < nCoresPerPkg; ++j ) {
3447             if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) {
3448                 n_old += __kmp_nThreadsPerCore;   // skip not-requested core
3449             } else {
3450                 for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) {
3451                     if ( k < __kmp_place_num_threads_per_core ) {
3452                         newAddr[n_new] = (*pAddr)[n_old];   // copy requested core' data to new location
3453                         n_new++;
3454                     }
3455                     n_old++;
3456                 }
3457             }
3458         }
3459     }
3460     nCoresPerPkg = __kmp_place_num_cores;                     // correct nCoresPerPkg
3461     __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3462     __kmp_avail_proc = n_new;                                 // correct avail_proc
3463     __kmp_ncores = nPackages * __kmp_place_num_cores;         // correct ncores
3464 
3465     __kmp_free( *pAddr );
3466     *pAddr = newAddr;      // replace old topology with new one
3467 }
3468 
3469 
// Machine topology map: one (hierarchical address, os proc id) pair per
// available processor, filled in by whichever discovery method succeeds.
static AddrUnsPair *address2os = NULL;
// For the "balanced" affinity type on non-uniform topologies: maps the
// slot (core * nth_per_core + thread) to an os proc id, -1 if unused.
static int           * procarr = NULL;
// Depth of the topology map, saved for later use by the balanced type.
static int     __kmp_aff_depth = 0;
3473 
//
// Build the machine topology map (address2os), the osId -> mask table, and
// the gtid -> affinity mask table (__kmp_affinity_masks), according to the
// selected topology discovery method and affinity type.  On unrecoverable
// discovery failure, affinity is turned off (affinity_none) and, where
// appropriate, disabled entirely.
//
static void
__kmp_aux_affinity_initialize(void)
{
    //
    // Already initialized - nothing to do.
    //
    if (__kmp_affinity_masks != NULL) {
        KMP_ASSERT(fullMask != NULL);
        return;
    }

    //
    // Create the "full" mask - this defines all of the processors that we
    // consider to be in the machine model.  If respect is set, then it is
    // the initialization thread's affinity mask.  Otherwise, it is all
    // processors that we know about on the machine.
    //
    if (fullMask == NULL) {
        fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
    }
    if (KMP_AFFINITY_CAPABLE()) {
        if (__kmp_affinity_respect_mask) {
            __kmp_get_system_affinity(fullMask, TRUE);

            //
            // Count the number of available processors.
            //
            unsigned i;
            __kmp_avail_proc = 0;
            for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
                if (! KMP_CPU_ISSET(i, fullMask)) {
                    continue;
                }
                __kmp_avail_proc++;
            }
            // More procs set in the mask than the OS reports means the mask
            // cannot be trusted - disable affinity altogether.
            if (__kmp_avail_proc > __kmp_xproc) {
                if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                  && (__kmp_affinity_type != affinity_none))) {
                    KMP_WARNING(ErrorInitializeAffinity);
                }
                __kmp_affinity_type = affinity_none;
                KMP_AFFINITY_DISABLE();
                return;
            }
        }
        else {
            __kmp_affinity_entire_machine_mask(fullMask);
            __kmp_avail_proc = __kmp_xproc;
        }
    }

    // depth of the topology tree produced by the discovery method:
    //   > 0  - success, < 0 - method failed (msg_id says why),
    //   == 0 - topology not built because affinity is off.
    int depth = -1;
    kmp_i18n_id_t msg_id = kmp_i18n_null;

    //
    // For backward compatibility, setting KMP_CPUINFO_FILE =>
    // KMP_TOPOLOGY_METHOD=cpuinfo
    //
    if ((__kmp_cpuinfo_file != NULL) &&
      (__kmp_affinity_top_method == affinity_top_method_all)) {
        __kmp_affinity_top_method = affinity_top_method_cpuinfo;
    }

    if (__kmp_affinity_top_method == affinity_top_method_all) {
        //
        // In the default code path, errors are not fatal - we just try using
        // another method.  We only emit a warning message if affinity is on,
        // or the verbose flag is set, and the nowarnings flag was not set.
        //
        const char *file_name = NULL;
        int line = 0;

# if KMP_ARCH_X86 || KMP_ARCH_X86_64

        // First choice on x86: the x2APIC (CPUID leaf 11) method.
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
        }

        file_name = NULL;
        depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }

        // x2APIC failed - fall back to the legacy APIC id method.
        if (depth < 0) {
            if (__kmp_affinity_verbose) {
                if (msg_id != kmp_i18n_null) {
                    KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
                      KMP_I18N_STR(DecodingLegacyAPIC));
                }
                else {
                    KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
                }
            }

            file_name = NULL;
            depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
            if (depth == 0) {
                KMP_ASSERT(__kmp_affinity_type == affinity_none);
                KMP_ASSERT(address2os == NULL);
                return;
            }
        }

# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

# if KMP_OS_LINUX

        // Next fallback on Linux: parse /proc/cpuinfo.
        if (depth < 0) {
            if (__kmp_affinity_verbose) {
                if (msg_id != kmp_i18n_null) {
                    KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
                }
                else {
                    KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
                }
            }

            FILE *f = fopen("/proc/cpuinfo", "r");
            if (f == NULL) {
                msg_id = kmp_i18n_str_CantOpenCpuinfo;
            }
            else {
                file_name = "/proc/cpuinfo";
                depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
                fclose(f);
                if (depth == 0) {
                    KMP_ASSERT(__kmp_affinity_type == affinity_none);
                    KMP_ASSERT(address2os == NULL);
                    return;
                }
            }
        }

# endif /* KMP_OS_LINUX */

# if KMP_GROUP_AFFINITY

        // On Windows with multiple processor groups, try the group map.
        if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
            if (__kmp_affinity_verbose) {
                KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
            }

            depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
            KMP_ASSERT(depth != 0);
        }

# endif /* KMP_GROUP_AFFINITY */

        // Last resort: a flat map (each OS proc is its own "package").
        if (depth < 0) {
            if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
                if (file_name == NULL) {
                    KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
                }
                else if (line == 0) {
                    KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
                }
                else {
                    KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
                }
            }
            // FIXME - print msg if msg_id = kmp_i18n_null ???

            file_name = "";
            depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
            if (depth == 0) {
                KMP_ASSERT(__kmp_affinity_type == affinity_none);
                KMP_ASSERT(address2os == NULL);
                return;
            }
            KMP_ASSERT(depth > 0);
            KMP_ASSERT(address2os != NULL);
        }
    }

    //
    // If the user has specified that a particular topology discovery method
    // is to be used, then we abort if that method fails.  The exception is
    // group affinity, which might have been implicitly set.
    //

# if KMP_ARCH_X86 || KMP_ARCH_X86_64

    else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
              KMP_I18N_STR(Decodingx2APIC));
        }

        depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }
        if (depth < 0) {
            // Explicitly requested method failed - fatal.
            KMP_ASSERT(msg_id != kmp_i18n_null);
            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
        }
    }
    else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
              KMP_I18N_STR(DecodingLegacyAPIC));
        }

        depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }
        if (depth < 0) {
            // Explicitly requested method failed - fatal.
            KMP_ASSERT(msg_id != kmp_i18n_null);
            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
        }
    }

# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

    else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
        const char *filename;
        if (__kmp_cpuinfo_file != NULL) {
            filename = __kmp_cpuinfo_file;
        }
        else {
            filename = "/proc/cpuinfo";
        }

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
        }

        FILE *f = fopen(filename, "r");
        if (f == NULL) {
            // Both messages below are fatal; execution does not continue
            // past this branch with f == NULL.
            int code = errno;
            if (__kmp_cpuinfo_file != NULL) {
                __kmp_msg(
                    kmp_ms_fatal,
                    KMP_MSG(CantOpenFileForReading, filename),
                    KMP_ERR(code),
                    KMP_HNT(NameComesFrom_CPUINFO_FILE),
                    __kmp_msg_null
                );
            }
            else {
                __kmp_msg(
                    kmp_ms_fatal,
                    KMP_MSG(CantOpenFileForReading, filename),
                    KMP_ERR(code),
                    __kmp_msg_null
                );
            }
        }
        int line = 0;
        depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
        fclose(f);
        if (depth < 0) {
            // Explicitly requested method failed - fatal, with the file/line
            // where parsing broke down when known.
            KMP_ASSERT(msg_id != kmp_i18n_null);
            if (line > 0) {
                KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
            }
            else {
                KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
            }
        }
        if (__kmp_affinity_type == affinity_none) {
            KMP_ASSERT(depth == 0);
            KMP_ASSERT(address2os == NULL);
            return;
        }
    }

# if KMP_GROUP_AFFINITY

    else if (__kmp_affinity_top_method == affinity_top_method_group) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
        }

        depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
        KMP_ASSERT(depth != 0);
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
        }
    }

# endif /* KMP_GROUP_AFFINITY */

    else if (__kmp_affinity_top_method == affinity_top_method_flat) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
        }

        depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }
        // should not fail
        KMP_ASSERT(depth > 0);
        KMP_ASSERT(address2os != NULL);
    }

    // No method produced a topology map - give up on affinity.
    if (address2os == NULL) {
        if (KMP_AFFINITY_CAPABLE()
          && (__kmp_affinity_verbose || (__kmp_affinity_warnings
          && (__kmp_affinity_type != affinity_none)))) {
            KMP_WARNING(ErrorInitializeAffinity);
        }
        __kmp_affinity_type = affinity_none;
        KMP_AFFINITY_DISABLE();
        return;
    }

    // Apply KMP_PLACE_THREADS limits (no-op if none were requested).
    __kmp_apply_thread_places(&address2os, depth);

    //
    // Create the table of masks, indexed by thread Id.
    //
    unsigned maxIndex;
    unsigned numUnique;
    kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
      address2os, __kmp_avail_proc);
    if (__kmp_affinity_gran_levels == 0) {
        KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
    }

    //
    // Set the childNums vector in all Address objects.  This must be done
    // before we can sort using __kmp_affinity_cmp_Address_child_num(),
    // which takes into account the setting of __kmp_affinity_compact.
    //
    __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);

    switch (__kmp_affinity_type) {

        case affinity_explicit:
        KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
# if OMP_40_ENABLED
        if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
        {
            // KMP_AFFINITY=proclist syntax.
            __kmp_affinity_process_proclist(&__kmp_affinity_masks,
              &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
              maxIndex);
        }
# if OMP_40_ENABLED
        else {
            // OMP_PLACES syntax.
            __kmp_affinity_process_placelist(&__kmp_affinity_masks,
              &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
              maxIndex);
        }
# endif
        if (__kmp_affinity_num_masks == 0) {
            if (__kmp_affinity_verbose || (__kmp_affinity_warnings
              && (__kmp_affinity_type != affinity_none))) {
                KMP_WARNING(AffNoValidProcID);
            }
            __kmp_affinity_type = affinity_none;
            return;
        }
        break;

        //
        // The other affinity types rely on sorting the Addresses according
        // to some permutation of the machine topology tree.  Set
        // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
        // then jump to a common code fragment to do the sort and create
        // the array of affinity masks.
        //

        case affinity_logical:
        __kmp_affinity_compact = 0;
        if (__kmp_affinity_offset) {
            __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
              % __kmp_avail_proc;
        }
        goto sortAddresses;

        case affinity_physical:
        if (__kmp_nThreadsPerCore > 1) {
            __kmp_affinity_compact = 1;
            if (__kmp_affinity_compact >= depth) {
                __kmp_affinity_compact = 0;
            }
        } else {
            __kmp_affinity_compact = 0;
        }
        if (__kmp_affinity_offset) {
            __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
              % __kmp_avail_proc;
        }
        goto sortAddresses;

        case affinity_scatter:
        // scatter is compact with the sort key order reversed.
        if (__kmp_affinity_compact >= depth) {
            __kmp_affinity_compact = 0;
        }
        else {
            __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
        }
        goto sortAddresses;

        case affinity_compact:
        if (__kmp_affinity_compact >= depth) {
            __kmp_affinity_compact = depth - 1;
        }
        goto sortAddresses;

        case affinity_balanced:
        // Balanced works only for the case of a single package
        if( nPackages > 1 ) {
            if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
                KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
            }
            __kmp_affinity_type = affinity_none;
            return;
        } else if( __kmp_affinity_uniform_topology() ) {
            break;
        } else { // Non-uniform topology

            // Save the depth for further usage
            __kmp_aff_depth = depth;

            // Number of hyper threads per core in HT machine
            int nth_per_core = __kmp_nThreadsPerCore;

            // Level of the core labels in the address: depth-2 when a
            // thread level exists below it, depth-1 otherwise.
            int core_level;
            if( nth_per_core > 1 ) {
                core_level = depth - 2;
            } else {
                core_level = depth - 1;
            }
            int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
            int nproc = nth_per_core * ncores;

            // Initialize every slot to -1 ("no os proc here").
            procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                procarr[ i ] = -1;
            }

            for( int i = 0; i < __kmp_avail_proc; i++ ) {
                int proc = address2os[ i ].second;
                // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
                // If there is only one thread per core then depth == 2: level 0 - package,
                // level 1 - core.
                int level = depth - 1;

                // __kmp_nth_per_core == 1
                int thread = 0;
                int core = address2os[ i ].first.labels[ level ];
                // If the thread level exists, that is we have more than one thread context per core
                if( nth_per_core > 1 ) {
                    thread = address2os[ i ].first.labels[ level ] % nth_per_core;
                    core = address2os[ i ].first.labels[ level - 1 ];
                }
                procarr[ core * nth_per_core + thread ] = proc;
            }

            break;
        }

        sortAddresses:
        //
        // Allocate the gtid->affinity mask table.
        //
        if (__kmp_affinity_dups) {
            __kmp_affinity_num_masks = __kmp_avail_proc;
        }
        else {
            __kmp_affinity_num_masks = numUnique;
        }

# if OMP_40_ENABLED
        // Honor OMP_PLACES=...(count) style limits on the number of places.
        if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
          && ( __kmp_affinity_num_places > 0 )
          && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
            __kmp_affinity_num_masks = __kmp_affinity_num_places;
        }
# endif

        __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
          __kmp_affinity_num_masks * __kmp_affin_mask_size);

        //
        // Sort the address2os table according to the current setting of
        // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
        //
        qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
          __kmp_affinity_cmp_Address_child_num);
        {
            int i;
            unsigned j;
            for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
                // Unless dups are allowed, only the "leader" entry of each
                // equivalence class contributes a mask.
                if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
                    continue;
                }
                unsigned osId = address2os[i].second;
                kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
                kmp_affin_mask_t *dest
                  = KMP_CPU_INDEX(__kmp_affinity_masks, j);
                KMP_ASSERT(KMP_CPU_ISSET(osId, src));
                KMP_CPU_COPY(dest, src);
                if (++j >= __kmp_affinity_num_masks) {
                    break;
                }
            }
            KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
        }
        break;

        default:
        KMP_ASSERT2(0, "Unexpected affinity setting");
    }

    __kmp_free(osId2Mask);
    machine_hierarchy.init(address2os, __kmp_avail_proc);
}
3994 
3995 
3996 void
3997 __kmp_affinity_initialize(void)
3998 {
3999     //
4000     // Much of the code above was written assumming that if a machine was not
4001     // affinity capable, then __kmp_affinity_type == affinity_none.  We now
4002     // explicitly represent this as __kmp_affinity_type == affinity_disabled.
4003     //
4004     // There are too many checks for __kmp_affinity_type == affinity_none
4005     // in this code.  Instead of trying to change them all, check if
4006     // __kmp_affinity_type == affinity_disabled, and if so, slam it with
4007     // affinity_none, call the real initialization routine, then restore
4008     // __kmp_affinity_type to affinity_disabled.
4009     //
4010     int disabled = (__kmp_affinity_type == affinity_disabled);
4011     if (! KMP_AFFINITY_CAPABLE()) {
4012         KMP_ASSERT(disabled);
4013     }
4014     if (disabled) {
4015         __kmp_affinity_type = affinity_none;
4016     }
4017     __kmp_aux_affinity_initialize();
4018     if (disabled) {
4019         __kmp_affinity_type = affinity_disabled;
4020     }
4021 }
4022 
4023 
4024 void
4025 __kmp_affinity_uninitialize(void)
4026 {
4027     if (__kmp_affinity_masks != NULL) {
4028         __kmp_free(__kmp_affinity_masks);
4029         __kmp_affinity_masks = NULL;
4030     }
4031     if (fullMask != NULL) {
4032         KMP_CPU_FREE(fullMask);
4033         fullMask = NULL;
4034     }
4035     __kmp_affinity_num_masks = 0;
4036 # if OMP_40_ENABLED
4037     __kmp_affinity_num_places = 0;
4038 # endif
4039     if (__kmp_affinity_proclist != NULL) {
4040         __kmp_free(__kmp_affinity_proclist);
4041         __kmp_affinity_proclist = NULL;
4042     }
4043     if( address2os != NULL ) {
4044         __kmp_free( address2os );
4045         address2os = NULL;
4046     }
4047     if( procarr != NULL ) {
4048         __kmp_free( procarr );
4049         procarr = NULL;
4050     }
4051 }
4052 
4053 
//
// Compute and install the initial affinity mask for thread gtid.  Selects
// either the "full" mask (all modeled procs) or one entry of
// __kmp_affinity_masks, records the chosen place in the thread descriptor,
// and sets the system affinity.  isa_root is nonzero for a root thread,
// whose place partition fields are also initialized here.
//
void
__kmp_affinity_set_init_mask(int gtid, int isa_root)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
    if (th->th.th_affin_mask == NULL) {
        KMP_CPU_ALLOC(th->th.th_affin_mask);
    }
    else {
        KMP_CPU_ZERO(th->th.th_affin_mask);
    }

    //
    // Copy the thread mask to the kmp_info_t structure.
    // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
    // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
    // is set, then the full mask is the same as the mask of the initialization
    // thread.
    //
    kmp_affin_mask_t *mask;
    // i is either KMP_PLACE_ALL (full mask) or an index into
    // __kmp_affinity_masks; both branches below set it before use.
    int i;

# if OMP_40_ENABLED
    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
    {
        // KMP_AFFINITY-style binding (or OMP_40 disabled).
        if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)
          ) {
# if KMP_GROUP_AFFINITY
            // With multiple Windows processor groups there is no single
            // full mask to bind to - leave the thread unbound.
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            // Round-robin the threads over the available masks.
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# if OMP_40_ENABLED
    else {
        // OMP_PROC_BIND-style binding.
        if ((! isa_root)
          || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
#  if KMP_GROUP_AFFINITY
            if (__kmp_num_proc_groups > 1) {
                return;
            }
#  endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            //
            // int i = some hash function or just a counter that doesn't
            // always start at 0.  Use gtid for now.
            //
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# endif

# if OMP_40_ENABLED
    th->th.th_current_place = i;
    if (isa_root) {
        // A root thread's place partition initially spans all places.
        th->th.th_new_place = i;
        th->th.th_first_place = 0;
        th->th.th_last_place = __kmp_affinity_num_masks - 1;
    }

    if (i == KMP_PLACE_ALL) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
          gtid, i));
    }
# else
    // NOTE(review): this branch compares against -1 where the OMP_40 branch
    // uses KMP_PLACE_ALL - presumably KMP_PLACE_ALL == -1; confirm in kmp.h.
    if (i == -1) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
          gtid, i));
    }
# endif /* OMP_40_ENABLED */

    KMP_CPU_COPY(th->th.th_affin_mask, mask);

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
          buf);
    }

# if KMP_OS_WINDOWS
    //
    // On Windows* OS, the process affinity mask might have changed.
    // If the user didn't request affinity and this call fails,
    // just continue silently.  See CQ171393.
    //
    if ( __kmp_affinity_type == affinity_none ) {
        __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
    }
    else
# endif
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}
4175 
4176 
4177 # if OMP_40_ENABLED
4178 
4179 void
4180 __kmp_affinity_set_place(int gtid)
4181 {
4182     int retval;
4183 
4184     if (! KMP_AFFINITY_CAPABLE()) {
4185         return;
4186     }
4187 
4188     kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4189 
4190     KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
4191       gtid, th->th.th_new_place, th->th.th_current_place));
4192 
4193     //
4194     // Check that the new place is within this thread's partition.
4195     //
4196     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4197     KMP_ASSERT(th->th.th_new_place >= 0);
4198     KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4199     if (th->th.th_first_place <= th->th.th_last_place) {
4200         KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
4201          && (th->th.th_new_place <= th->th.th_last_place));
4202     }
4203     else {
4204         KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
4205          || (th->th.th_new_place >= th->th.th_last_place));
4206     }
4207 
4208     //
4209     // Copy the thread mask to the kmp_info_t strucuture,
4210     // and set this thread's affinity.
4211     //
4212     kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
4213       th->th.th_new_place);
4214     KMP_CPU_COPY(th->th.th_affin_mask, mask);
4215     th->th.th_current_place = th->th.th_new_place;
4216 
4217     if (__kmp_affinity_verbose) {
4218         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4219         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4220           th->th.th_affin_mask);
4221         KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4222           gtid, buf);
4223     }
4224     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4225 }
4226 
4227 # endif /* OMP_40_ENABLED */
4228 
4229 
4230 int
4231 __kmp_aux_set_affinity(void **mask)
4232 {
4233     int gtid;
4234     kmp_info_t *th;
4235     int retval;
4236 
4237     if (! KMP_AFFINITY_CAPABLE()) {
4238         return -1;
4239     }
4240 
4241     gtid = __kmp_entry_gtid();
4242     KA_TRACE(1000, ;{
4243         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4244         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4245           (kmp_affin_mask_t *)(*mask));
4246         __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4247           gtid, buf);
4248     });
4249 
4250     if (__kmp_env_consistency_check) {
4251         if ((mask == NULL) || (*mask == NULL)) {
4252             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4253         }
4254         else {
4255             unsigned proc;
4256             int num_procs = 0;
4257 
4258             for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
4259                 if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4260                     continue;
4261                 }
4262                 num_procs++;
4263                 if (! KMP_CPU_ISSET(proc, fullMask)) {
4264                     KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4265                     break;
4266                 }
4267             }
4268             if (num_procs == 0) {
4269                 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4270             }
4271 
4272 # if KMP_GROUP_AFFINITY
4273             if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4274                 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4275             }
4276 # endif /* KMP_GROUP_AFFINITY */
4277 
4278         }
4279     }
4280 
4281     th = __kmp_threads[gtid];
4282     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4283     retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4284     if (retval == 0) {
4285         KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4286     }
4287 
4288 # if OMP_40_ENABLED
4289     th->th.th_current_place = KMP_PLACE_UNDEFINED;
4290     th->th.th_new_place = KMP_PLACE_UNDEFINED;
4291     th->th.th_first_place = 0;
4292     th->th.th_last_place = __kmp_affinity_num_masks - 1;
4293 
4294     //
4295     // Turn off 4.0 affinity for the current tread at this parallel level.
4296     //
4297     th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
4298 # endif
4299 
4300     return retval;
4301 }
4302 
4303 
4304 int
4305 __kmp_aux_get_affinity(void **mask)
4306 {
4307     int gtid;
4308     int retval;
4309     kmp_info_t *th;
4310 
4311     if (! KMP_AFFINITY_CAPABLE()) {
4312         return -1;
4313     }
4314 
4315     gtid = __kmp_entry_gtid();
4316     th = __kmp_threads[gtid];
4317     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4318 
4319     KA_TRACE(1000, ;{
4320         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4321         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4322           th->th.th_affin_mask);
4323         __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
4324     });
4325 
4326     if (__kmp_env_consistency_check) {
4327         if ((mask == NULL) || (*mask == NULL)) {
4328             KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4329         }
4330     }
4331 
4332 # if !KMP_OS_WINDOWS
4333 
4334     retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4335     KA_TRACE(1000, ;{
4336         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4337         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4338           (kmp_affin_mask_t *)(*mask));
4339         __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
4340     });
4341     return retval;
4342 
4343 # else
4344 
4345     KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4346     return 0;
4347 
4348 # endif /* KMP_OS_WINDOWS */
4349 
4350 }
4351 
4352 int
4353 __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
4354 {
4355     int retval;
4356 
4357     if (! KMP_AFFINITY_CAPABLE()) {
4358         return -1;
4359     }
4360 
4361     KA_TRACE(1000, ;{
4362         int gtid = __kmp_entry_gtid();
4363         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4364         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4365           (kmp_affin_mask_t *)(*mask));
4366         __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4367           proc, gtid, buf);
4368     });
4369 
4370     if (__kmp_env_consistency_check) {
4371         if ((mask == NULL) || (*mask == NULL)) {
4372             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4373         }
4374     }
4375 
4376     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4377         return -1;
4378     }
4379     if (! KMP_CPU_ISSET(proc, fullMask)) {
4380         return -2;
4381     }
4382 
4383     KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4384     return 0;
4385 }
4386 
4387 
4388 int
4389 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4390 {
4391     int retval;
4392 
4393     if (! KMP_AFFINITY_CAPABLE()) {
4394         return -1;
4395     }
4396 
4397     KA_TRACE(1000, ;{
4398         int gtid = __kmp_entry_gtid();
4399         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4400         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4401           (kmp_affin_mask_t *)(*mask));
4402         __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4403           proc, gtid, buf);
4404     });
4405 
4406     if (__kmp_env_consistency_check) {
4407         if ((mask == NULL) || (*mask == NULL)) {
4408             KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4409         }
4410     }
4411 
4412     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4413         return -1;
4414     }
4415     if (! KMP_CPU_ISSET(proc, fullMask)) {
4416         return -2;
4417     }
4418 
4419     KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4420     return 0;
4421 }
4422 
4423 
4424 int
4425 __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4426 {
4427     int retval;
4428 
4429     if (! KMP_AFFINITY_CAPABLE()) {
4430         return -1;
4431     }
4432 
4433     KA_TRACE(1000, ;{
4434         int gtid = __kmp_entry_gtid();
4435         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4436         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4437           (kmp_affin_mask_t *)(*mask));
4438         __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4439           proc, gtid, buf);
4440     });
4441 
4442     if (__kmp_env_consistency_check) {
4443         if ((mask == NULL) || (*mask == NULL)) {
4444             KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
4445         }
4446     }
4447 
4448     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4449         return 0;
4450     }
4451     if (! KMP_CPU_ISSET(proc, fullMask)) {
4452         return 0;
4453     }
4454 
4455     return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4456 }
4457 
4458 
// Dynamic affinity settings - Affinity balanced
// Bind thread 'tid' (of 'nthreads' total) so threads are spread evenly
// across physical cores, using additional hyper-thread contexts only as
// cores fill up.
void __kmp_balanced_affinity( int tid, int nthreads )
{
    if( __kmp_affinity_uniform_topology() ) {
        int coreID;
        int threadID;
        // Number of hyper threads per core in HT machine
        int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
        // Number of cores
        int ncores = __kmp_ncores;
        // How many threads will be bound to each core
        int chunk = nthreads / ncores;
        // How many cores will have an additional thread bound to it - "big cores"
        int big_cores = nthreads % ncores;
        // Number of threads on the big cores
        int big_nth = ( chunk + 1 ) * big_cores;
        if( tid < big_nth ) {
            coreID = tid / (chunk + 1 );
            threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
        } else { //tid >= big_nth
            // (tid - big_cores) / chunk == big_cores + (tid - big_nth) / chunk,
            // i.e. the core index counting past the "big" cores.
            coreID = ( tid - big_cores ) / chunk;
            threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
        }

        KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
          "Illegal set affinity operation when not capable");

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Granularity == thread
        if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
            int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
            KMP_CPU_SET( osID, mask);
        } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
            // Allow the thread to float over every context of its core.
            for( int i = 0; i < __kmp_nth_per_core; i++ ) {
                int osID;
                osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
                KMP_CPU_SET( osID, mask);
            }
        }
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    } else { // Non-uniform topology

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Number of hyper threads per core in HT machine
        int nth_per_core = __kmp_nThreadsPerCore;
        int core_level;
        // With HT the core label sits one level above the innermost level;
        // otherwise the innermost level is the core level.
        if( nth_per_core > 1 ) {
            core_level = __kmp_aff_depth - 2;
        } else {
            core_level = __kmp_aff_depth - 1;
        }

        // Number of cores - maximum value; it does not count trail cores with 0 processors
        int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;

        // For performance gain consider the special case nthreads == __kmp_avail_proc
        if( nthreads == __kmp_avail_proc ) {
            if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                int osID = address2os[ tid ].second;
                KMP_CPU_SET( osID, mask);
            } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                int coreID = address2os[ tid ].first.labels[ core_level ];
                // We'll count found osIDs for the current core; they can be not more than nth_per_core;
                // since the address2os is sorted we can break when cnt==nth_per_core
                int cnt = 0;
                for( int i = 0; i < __kmp_avail_proc; i++ ) {
                    int osID = address2os[ i ].second;
                    int core = address2os[ i ].first.labels[ core_level ];
                    if( core == coreID ) {
                        KMP_CPU_SET( osID, mask);
                        cnt++;
                        if( cnt == nth_per_core ) {
                            break;
                        }
                    }
                }
            }
        } else if( nthreads <= __kmp_ncores ) {
            // At most one thread per core: bind tid to the tid-th core that
            // has at least one available processor in procarr[].
            int core = 0;
            for( int i = 0; i < ncores; i++ ) {
                // Check if this core from procarr[] is in the mask
                int in_mask = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != - 1 ) {
                        in_mask = 1;
                        break;
                    }
                }
                if( in_mask ) {
                    if( tid == core ) {
                        for( int j = 0; j < nth_per_core; j++ ) {
                            int osID = procarr[ i * nth_per_core + j ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask );
                                // For granularity=thread it is enough to set the first available osID for this core
                                if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                                    break;
                                }
                            }
                        }
                        break;
                    } else {
                        core++;
                    }
                }
            }

        } else { // nthreads > __kmp_ncores
            // Oversubscription: distribute threads over the available
            // contexts as evenly as possible, then locate tid's context.

            // Array to save the number of processors at each core
            int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores);
            // Array to save the number of cores with "x" available processors;
            int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
            // Array to save the number of cores with # procs from x to nth_per_core
            int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));

            for( int i = 0; i <= nth_per_core; i++ ) {
                ncores_with_x_procs[ i ] = 0;
                ncores_with_x_to_max_procs[ i ] = 0;
            }

            // Count the available processors on each core.
            for( int i = 0; i < ncores; i++ ) {
                int cnt = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        cnt++;
                    }
                }
                nproc_at_core[ i ] = cnt;
                ncores_with_x_procs[ cnt ]++;
            }

            // ncores_with_x_to_max_procs[i] = number of cores with >= i procs.
            for( int i = 0; i <= nth_per_core; i++ ) {
                for( int j = i; j <= nth_per_core; j++ ) {
                    ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
                }
            }

            // Max number of processors
            int nproc = nth_per_core * ncores;
            // An array to keep number of threads per each context
            int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                newarr[ i ] = 0;
            }

            // Distribute nthreads over the contexts: the first sweep
            // (flag == 0) gives each available context at most one thread;
            // subsequent sweeps stack extra threads on used contexts.
            int nth = nthreads;
            int flag = 0;
            while( nth > 0 ) {
                for( int j = 1; j <= nth_per_core; j++ ) {
                    int cnt = ncores_with_x_to_max_procs[ j ];
                    for( int i = 0; i < ncores; i++ ) {
                        // Skip the core with 0 processors
                        if( nproc_at_core[ i ] == 0 ) {
                            continue;
                        }
                        for( int k = 0; k < nth_per_core; k++ ) {
                            if( procarr[ i * nth_per_core + k ] != -1 ) {
                                if( newarr[ i * nth_per_core + k ] == 0 ) {
                                    newarr[ i * nth_per_core + k ] = 1;
                                    cnt--;
                                    nth--;
                                    break;
                                } else {
                                    if( flag != 0 ) {
                                        newarr[ i * nth_per_core + k ] ++;
                                        cnt--;
                                        nth--;
                                        break;
                                    }
                                }
                            }
                        }
                        if( cnt == 0 || nth == 0 ) {
                            break;
                        }
                    }
                    if( nth == 0 ) {
                        break;
                    }
                }
                flag = 1;
            }
            // Find tid's context: the first context where the running sum of
            // per-context thread counts exceeds tid.
            int sum = 0;
            for( int i = 0; i < nproc; i++ ) {
                sum += newarr[ i ];
                if( sum > tid ) {
                    // Granularity == thread
                    if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                        int osID = procarr[ i ];
                        KMP_CPU_SET( osID, mask);
                    } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                        int coreID = i / nth_per_core;
                        for( int ii = 0; ii < nth_per_core; ii++ ) {
                            int osID = procarr[ coreID * nth_per_core + ii ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask);
                            }
                        }
                    }
                    break;
                }
            }
            __kmp_free( newarr );
        }

        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    }
}
4685 
4686 #else
4687     // affinity not supported
4688 
// Fallback synthetic "machine hierarchy" used when affinity is not
// supported: built once by noaff_init() from the processor count alone.
static const kmp_uint32 noaff_maxLevels=7;   // max levels in the synthetic hierarchy
kmp_uint32 noaff_skipPerLevel[noaff_maxLevels];  // threads spanned by one node at each level
kmp_uint32 noaff_depth;        // number of levels actually used
kmp_uint8 noaff_leaf_kids;     // fan-out of a leaf node, minus one
kmp_int8 noaff_uninitialized=1;  // 1 = not built, 2 = build in progress, 0 = ready
4694 
// One-time construction of the synthetic hierarchy for 'nprocs' processors.
// Thread-safe: exactly one caller wins the CAS and builds the tables while
// concurrent callers spin until noaff_uninitialized drops to 0.
void noaff_init(int nprocs)
{
    // CAS 1 -> 2 claims the build; 0 means already done, 2 means another
    // thread is building.
    kmp_int8 result = KMP_COMPARE_AND_STORE_ACQ8(&noaff_uninitialized, 1, 2);
    if (result == 0) return; // Already initialized
    else if (result == 2) { // Someone else is initializing
        while (TCR_1(noaff_uninitialized) != 0) KMP_CPU_PAUSE();
        return;
    }
    KMP_DEBUG_ASSERT(result==1);

    kmp_uint32 numPerLevel[noaff_maxLevels];
    noaff_depth = 1;
    for (kmp_uint32 i=0; i<noaff_maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
        numPerLevel[i] = 1;
        noaff_skipPerLevel[i] = 1;
    }

    // Start from a flat two-level shape: groups of 4, ceil(nprocs/4) groups.
    numPerLevel[0] = 4;
    numPerLevel[1] = nprocs/4;
    if (nprocs%4) numPerLevel[1]++;

    for (int i=noaff_maxLevels-1; i>=0; --i) // count non-empty levels to get depth
        if (numPerLevel[i] != 1 || noaff_depth > 1) // only count one top-level '1'
            noaff_depth++;

    kmp_uint32 branch = 4;
    if (numPerLevel[0] == 1) branch = nprocs/4;
    if (branch<4) branch=4;
    // Narrow any overly wide level by halving it and doubling the level
    // above, keeping the branching factor near 'branch' (min 4, max 4 at
    // level 0), growing depth as new levels come into use.
    for (kmp_uint32 d=0; d<noaff_depth-1; ++d) { // optimize hierarchy width
        while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
            if (numPerLevel[d] & 1) numPerLevel[d]++;
            numPerLevel[d] = numPerLevel[d] >> 1;
            if (numPerLevel[d+1] == 1) noaff_depth++;
            numPerLevel[d+1] = numPerLevel[d+1] << 1;
        }
        if(numPerLevel[0] == 1) {
            branch = branch >> 1;
            if (branch<4) branch = 4;
        }
    }

    // skipPerLevel[i] = product of fan-outs below level i, i.e. how many
    // threads one node at level i spans.
    for (kmp_uint32 i=1; i<noaff_depth; ++i)
        noaff_skipPerLevel[i] = numPerLevel[i-1] * noaff_skipPerLevel[i-1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i=noaff_depth; i<noaff_maxLevels; ++i)
        noaff_skipPerLevel[i] = 2*noaff_skipPerLevel[i-1];
    noaff_leaf_kids = (kmp_uint8)numPerLevel[0]-1;
    noaff_uninitialized = 0; // One writer

}
4745 
4746 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
4747     if (noaff_uninitialized)
4748         noaff_init(nproc);
4749 
4750     thr_bar->depth = noaff_depth;
4751     thr_bar->base_leaf_kids = noaff_leaf_kids;
4752     thr_bar->skip_per_level = noaff_skipPerLevel;
4753 }
4754 
4755 #endif // KMP_AFFINITY_SUPPORTED
4756