1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_io.h"
19 #include "kmp_str.h"
20 #include "kmp_wrapper_getpid.h"
21 
22 #if KMP_AFFINITY_SUPPORTED
23 
24 //
25 // Print the affinity mask to the character array in a pretty format.
26 //
27 char *
28 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
29 {
30     KMP_ASSERT(buf_len >= 40);
31     char *scan = buf;
32     char *end = buf + buf_len - 1;
33 
34     //
35     // Find first element / check for empty set.
36     //
37     size_t i;
38     for (i = 0; i < KMP_CPU_SETSIZE; i++) {
39         if (KMP_CPU_ISSET(i, mask)) {
40             break;
41         }
42     }
43     if (i == KMP_CPU_SETSIZE) {
44         sprintf(scan, "{<empty>}");
45         while (*scan != '\0') scan++;
46         KMP_ASSERT(scan <= end);
47         return buf;
48     }
49 
50     sprintf(scan, "{%ld", (long)i);
51     while (*scan != '\0') scan++;
52     i++;
53     for (; i < KMP_CPU_SETSIZE; i++) {
54         if (! KMP_CPU_ISSET(i, mask)) {
55             continue;
56         }
57 
58         //
59         // Check for buffer overflow.  A string of the form ",<n>" will have
60         // at most 10 characters, plus we want to leave room to print ",...}"
61         // if the set is too large to print for a total of 15 characters.
62         // We already left room for '\0' in setting end.
63         //
64         if (end - scan < 15) {
65            break;
66         }
67         sprintf(scan, ",%-ld", (long)i);
68         while (*scan != '\0') scan++;
69     }
70     if (i < KMP_CPU_SETSIZE) {
71         sprintf(scan, ",...");
72         while (*scan != '\0') scan++;
73     }
74     sprintf(scan, "}");
75     while (*scan != '\0') scan++;
76     KMP_ASSERT(scan <= end);
77     return buf;
78 }
79 
80 
81 void
82 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
83 {
84     KMP_CPU_ZERO(mask);
85 
86 # if KMP_GROUP_AFFINITY
87 
88     if (__kmp_num_proc_groups > 1) {
89         int group;
90         KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
91         for (group = 0; group < __kmp_num_proc_groups; group++) {
92             int i;
93             int num = __kmp_GetActiveProcessorCount(group);
94             for (i = 0; i < num; i++) {
95                 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
96             }
97         }
98     }
99     else
100 
101 # endif /* KMP_GROUP_AFFINITY */
102 
103     {
104         int proc;
105         for (proc = 0; proc < __kmp_xproc; proc++) {
106             KMP_CPU_SET(proc, mask);
107         }
108     }
109 }
110 
111 
112 //
113 // In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
114 // functions.
115 //
116 // The icc codegen emits sections with extremely long names, of the form
117 // ".gnu.linkonce.<mangled_name>".  There seems to have been a linker bug
118 // introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
119 // some sort of memory corruption or table overflow that is triggered by
120 // these long strings.  I checked the latest version of the linker -
121 // GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
122 // fixed.
123 //
124 // Unfortunately, my attempts to reproduce it in a smaller example have
125 // failed - I'm not sure what the prospects are of getting it fixed
126 // properly - but we need a reproducer smaller than all of libiomp.
127 //
128 // Work around the problem by avoiding inline constructors in such builds.
129 // We do this for all platforms, not just Linux* OS - non-inline functions are
130 // more debuggable and provide better coverage into than inline functions.
131 // Use inline functions in shipping libs, for performance.
132 //
133 
134 # if !defined(KMP_DEBUG) && !defined(COVER)
135 
136 class Address {
137 public:
138     static const unsigned maxDepth = 32;
139     unsigned labels[maxDepth];
140     unsigned childNums[maxDepth];
141     unsigned depth;
142     unsigned leader;
143     Address(unsigned _depth)
144       : depth(_depth), leader(FALSE) {
145     }
146     Address &operator=(const Address &b) {
147         depth = b.depth;
148         for (unsigned i = 0; i < depth; i++) {
149             labels[i] = b.labels[i];
150             childNums[i] = b.childNums[i];
151         }
152         leader = FALSE;
153         return *this;
154     }
155     bool operator==(const Address &b) const {
156         if (depth != b.depth)
157             return false;
158         for (unsigned i = 0; i < depth; i++)
159             if(labels[i] != b.labels[i])
160                 return false;
161         return true;
162     }
163     bool isClose(const Address &b, int level) const {
164         if (depth != b.depth)
165             return false;
166         if ((unsigned)level >= depth)
167             return true;
168         for (unsigned i = 0; i < (depth - level); i++)
169             if(labels[i] != b.labels[i])
170                 return false;
171         return true;
172     }
173     bool operator!=(const Address &b) const {
174         return !operator==(b);
175     }
176 };
177 
178 class AddrUnsPair {
179 public:
180     Address first;
181     unsigned second;
182     AddrUnsPair(Address _first, unsigned _second)
183       : first(_first), second(_second) {
184     }
185     AddrUnsPair &operator=(const AddrUnsPair &b)
186     {
187         first = b.first;
188         second = b.second;
189         return *this;
190     }
191 };
192 
193 # else
194 
195 class Address {
196 public:
197     static const unsigned maxDepth = 32;
198     unsigned labels[maxDepth];
199     unsigned childNums[maxDepth];
200     unsigned depth;
201     unsigned leader;
202     Address(unsigned _depth);
203     Address &operator=(const Address &b);
204     bool operator==(const Address &b) const;
205     bool isClose(const Address &b, int level) const;
206     bool operator!=(const Address &b) const;
207 };
208 
209 Address::Address(unsigned _depth)
210 {
211     depth = _depth;
212     leader = FALSE;
213 }
214 
215 Address &Address::operator=(const Address &b) {
216     depth = b.depth;
217     for (unsigned i = 0; i < depth; i++) {
218         labels[i] = b.labels[i];
219         childNums[i] = b.childNums[i];
220     }
221     leader = FALSE;
222     return *this;
223 }
224 
225 bool Address::operator==(const Address &b) const {
226     if (depth != b.depth)
227         return false;
228     for (unsigned i = 0; i < depth; i++)
229         if(labels[i] != b.labels[i])
230             return false;
231     return true;
232 }
233 
234 bool Address::isClose(const Address &b, int level) const {
235     if (depth != b.depth)
236         return false;
237     if ((unsigned)level >= depth)
238         return true;
239     for (unsigned i = 0; i < (depth - level); i++)
240         if(labels[i] != b.labels[i])
241             return false;
242     return true;
243 }
244 
245 bool Address::operator!=(const Address &b) const {
246     return !operator==(b);
247 }
248 
249 class AddrUnsPair {
250 public:
251     Address first;
252     unsigned second;
253     AddrUnsPair(Address _first, unsigned _second);
254     AddrUnsPair &operator=(const AddrUnsPair &b);
255 };
256 
257 AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
258   : first(_first), second(_second)
259 {
260 }
261 
262 AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
263 {
264     first = b.first;
265     second = b.second;
266     return *this;
267 }
268 
269 # endif /* !defined(KMP_DEBUG) && !defined(COVER) */
270 
271 
272 static int
273 __kmp_affinity_cmp_Address_labels(const void *a, const void *b)
274 {
275     const Address *aa = (const Address *)&(((AddrUnsPair *)a)
276       ->first);
277     const Address *bb = (const Address *)&(((AddrUnsPair *)b)
278       ->first);
279     unsigned depth = aa->depth;
280     unsigned i;
281     KMP_DEBUG_ASSERT(depth == bb->depth);
282     for (i  = 0; i < depth; i++) {
283         if (aa->labels[i] < bb->labels[i]) return -1;
284         if (aa->labels[i] > bb->labels[i]) return 1;
285     }
286     return 0;
287 }
288 
289 
290 static int
291 __kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
292 {
293     const Address *aa = (const Address *)&(((AddrUnsPair *)a)
294       ->first);
295     const Address *bb = (const Address *)&(((AddrUnsPair *)b)
296       ->first);
297     unsigned depth = aa->depth;
298     unsigned i;
299     KMP_DEBUG_ASSERT(depth == bb->depth);
300     KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
301     KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
302     for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
303         int j = depth - i - 1;
304         if (aa->childNums[j] < bb->childNums[j]) return -1;
305         if (aa->childNums[j] > bb->childNums[j]) return 1;
306     }
307     for (; i < depth; i++) {
308         int j = i - __kmp_affinity_compact;
309         if (aa->childNums[j] < bb->childNums[j]) return -1;
310         if (aa->childNums[j] > bb->childNums[j]) return 1;
311     }
312     return 0;
313 }
314 
315 /** A structure for holding machine-specific hierarchy info to be computed once at init. */
316 class hierarchy_info {
317 public:
318     /** Typical levels are threads/core, cores/package or socket, packages/node, nodes/machine,
319         etc.  We don't want to get specific with nomenclature */
320     static const kmp_uint32 maxLevels=7;
321 
322     /** This is specifically the depth of the machine configuration hierarchy, in terms of the
323         number of levels along the longest path from root to any leaf. It corresponds to the
324         number of entries in numPerLevel if we exclude all but one trailing 1. */
325     kmp_uint32 depth;
326     kmp_uint32 base_depth;
327     kmp_uint32 base_num_threads;
328     bool uninitialized;
329 
330     /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a
331         node at level i has. For example, if we have a machine with 4 packages, 4 cores/package
332         and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}. All empty levels are set to 1. */
333     kmp_uint32 numPerLevel[maxLevels];
334     kmp_uint32 skipPerLevel[maxLevels];
335 
336     void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
337         int hier_depth = adr2os[0].first.depth;
338         int level = 0;
339         for (int i=hier_depth-1; i>=0; --i) {
340             int max = -1;
341             for (int j=0; j<num_addrs; ++j) {
342                 int next = adr2os[j].first.childNums[i];
343                 if (next > max) max = next;
344             }
345             numPerLevel[level] = max+1;
346             ++level;
347         }
348     }
349 
350     hierarchy_info() : depth(1), uninitialized(true) {}
351     void init(AddrUnsPair *adr2os, int num_addrs)
352     {
353         uninitialized = false;
354         for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
355             numPerLevel[i] = 1;
356             skipPerLevel[i] = 1;
357         }
358 
359         // Sort table by physical ID
360         if (adr2os) {
361             qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
362             deriveLevels(adr2os, num_addrs);
363         }
364         else {
365             numPerLevel[0] = 4;
366             numPerLevel[1] = num_addrs/4;
367             if (num_addrs%4) numPerLevel[1]++;
368         }
369 
370         base_num_threads = num_addrs;
371         for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
372             if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
373                 depth++;
374 
375         kmp_uint32 branch = 4;
376         if (numPerLevel[0] == 1) branch = num_addrs/4;
377         if (branch<4) branch=4;
378         for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
379             while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
380                 if (numPerLevel[d] & 1) numPerLevel[d]++;
381                 numPerLevel[d] = numPerLevel[d] >> 1;
382                 if (numPerLevel[d+1] == 1) depth++;
383                 numPerLevel[d+1] = numPerLevel[d+1] << 1;
384             }
385             if(numPerLevel[0] == 1) {
386                 branch = branch >> 1;
387                 if (branch<4) branch = 4;
388             }
389         }
390 
391         for (kmp_uint32 i=1; i<depth; ++i)
392             skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];
393 
394         base_depth = depth;
395     }
396 };
397 
398 static hierarchy_info machine_hierarchy;
399 
400 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
401     if (machine_hierarchy.uninitialized)
402         machine_hierarchy.init(NULL, nproc);
403 
404     if (nproc <= machine_hierarchy.base_num_threads)
405         machine_hierarchy.depth = machine_hierarchy.base_depth;
406     KMP_DEBUG_ASSERT(machine_hierarchy.depth > 0);
407     while (nproc > machine_hierarchy.skipPerLevel[machine_hierarchy.depth-1]) {
408         machine_hierarchy.depth++;
409         machine_hierarchy.skipPerLevel[machine_hierarchy.depth-1] = 2*machine_hierarchy.skipPerLevel[machine_hierarchy.depth-2];
410     }
411     thr_bar->depth = machine_hierarchy.depth;
412     thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
413     thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
414 }
415 
416 //
417 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
418 // called to renumber the labels from [0..n] and place them into the child_num
419 // vector of the address object.  This is done in case the labels used for
420 // the children at one node of the hierarchy differ from those used for
421 // another node at the same level.  Example:  suppose the machine has 2 nodes
422 // with 2 packages each.  The first node contains packages 601 and 602, and
423 // second node contains packages 603 and 604.  If we try to sort the table
424 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
425 // because we are paying attention to the labels themselves, not the ordinal
426 // child numbers.  By using the child numbers in the sort, the result is
427 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
428 //
429 static void
430 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
431   int numAddrs)
432 {
433     KMP_DEBUG_ASSERT(numAddrs > 0);
434     int depth = address2os->first.depth;
435     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
436     unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
437       * sizeof(unsigned));
438     int labCt;
439     for (labCt = 0; labCt < depth; labCt++) {
440         address2os[0].first.childNums[labCt] = counts[labCt] = 0;
441         lastLabel[labCt] = address2os[0].first.labels[labCt];
442     }
443     int i;
444     for (i = 1; i < numAddrs; i++) {
445         for (labCt = 0; labCt < depth; labCt++) {
446             if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
447                 int labCt2;
448                 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
449                     counts[labCt2] = 0;
450                     lastLabel[labCt2] = address2os[i].first.labels[labCt2];
451                 }
452                 counts[labCt]++;
453                 lastLabel[labCt] = address2os[i].first.labels[labCt];
454                 break;
455             }
456         }
457         for (labCt = 0; labCt < depth; labCt++) {
458             address2os[i].first.childNums[labCt] = counts[labCt];
459         }
460         for (; labCt < (int)Address::maxDepth; labCt++) {
461             address2os[i].first.childNums[labCt] = 0;
462         }
463     }
464 }
465 
466 
467 //
468 // All of the __kmp_affinity_create_*_map() routines should set
469 // __kmp_affinity_masks to a vector of affinity mask objects of length
470 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
471 // return the number of levels in the machine topology tree (zero if
472 // __kmp_affinity_type == affinity_none).
473 //
474 // All of the __kmp_affinity_create_*_map() routines should set *fullMask
475 // to the affinity mask for the initialization thread.  They need to save and
476 // restore the mask, and it could be needed later, so saving it is just an
477 // optimization to avoid calling kmp_get_system_affinity() again.
478 //
479 static kmp_affin_mask_t *fullMask = NULL;
480 
481 kmp_affin_mask_t *
482 __kmp_affinity_get_fullMask() { return fullMask; }
483 
484 
485 static int nCoresPerPkg, nPackages;
486 static int __kmp_nThreadsPerCore;
487 #ifndef KMP_DFLT_NTH_CORES
488 static int __kmp_ncores;
489 #endif
490 
491 //
492 // __kmp_affinity_uniform_topology() doesn't work when called from
493 // places which support arbitrarily many levels in the machine topology
494 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
495 // __kmp_affinity_create_x2apicid_map().
496 //
497 inline static bool
498 __kmp_affinity_uniform_topology()
499 {
500     return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
501 }
502 
503 
504 //
505 // Print out the detailed machine topology map, i.e. the physical locations
506 // of each OS proc.
507 //
508 static void
509 __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
510   int pkgLevel, int coreLevel, int threadLevel)
511 {
512     int proc;
513 
514     KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
515     for (proc = 0; proc < len; proc++) {
516         int level;
517         kmp_str_buf_t buf;
518         __kmp_str_buf_init(&buf);
519         for (level = 0; level < depth; level++) {
520             if (level == threadLevel) {
521                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
522             }
523             else if (level == coreLevel) {
524                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
525             }
526             else if (level == pkgLevel) {
527                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
528             }
529             else if (level > pkgLevel) {
530                 __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
531                   level - pkgLevel - 1);
532             }
533             else {
534                 __kmp_str_buf_print(&buf, "L%d ", level);
535             }
536             __kmp_str_buf_print(&buf, "%d ",
537               address2os[proc].first.labels[level]);
538         }
539         KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
540           buf.str);
541         __kmp_str_buf_free(&buf);
542     }
543 }
544 
545 
546 //
547 // If we don't know how to retrieve the machine's processor topology, or
548 // encounter an error in doing so, this routine is called to form a "flat"
549 // mapping of os thread id's <-> processor id's.
550 //
551 static int
552 __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
553   kmp_i18n_id_t *const msg_id)
554 {
555     *address2os = NULL;
556     *msg_id = kmp_i18n_null;
557 
558     //
559     // Even if __kmp_affinity_type == affinity_none, this routine might still
560     // called to set __kmp_ncores, as well as
561     // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
562     //
563     if (! KMP_AFFINITY_CAPABLE()) {
564         KMP_ASSERT(__kmp_affinity_type == affinity_none);
565         __kmp_ncores = nPackages = __kmp_xproc;
566         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
567         if (__kmp_affinity_verbose) {
568             KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
569             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
570             KMP_INFORM(Uniform, "KMP_AFFINITY");
571             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
572               __kmp_nThreadsPerCore, __kmp_ncores);
573         }
574         return 0;
575     }
576 
577     //
578     // When affinity is off, this routine will still be called to set
579     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
580     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
581     //  correctly, and return now if affinity is not enabled.
582     //
583     __kmp_ncores = nPackages = __kmp_avail_proc;
584     __kmp_nThreadsPerCore = nCoresPerPkg = 1;
585     if (__kmp_affinity_verbose) {
586         char buf[KMP_AFFIN_MASK_PRINT_LEN];
587         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
588 
589         KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
590         if (__kmp_affinity_respect_mask) {
591             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
592         } else {
593             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
594         }
595         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
596         KMP_INFORM(Uniform, "KMP_AFFINITY");
597         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
598           __kmp_nThreadsPerCore, __kmp_ncores);
599     }
600     if (__kmp_affinity_type == affinity_none) {
601         return 0;
602     }
603 
604     //
605     // Contruct the data structure to be returned.
606     //
607     *address2os = (AddrUnsPair*)
608       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
609     int avail_ct = 0;
610     unsigned int i;
611     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
612         //
613         // Skip this proc if it is not included in the machine model.
614         //
615         if (! KMP_CPU_ISSET(i, fullMask)) {
616             continue;
617         }
618 
619         Address addr(1);
620         addr.labels[0] = i;
621         (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
622     }
623     if (__kmp_affinity_verbose) {
624         KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
625     }
626 
627     if (__kmp_affinity_gran_levels < 0) {
628         //
629         // Only the package level is modeled in the machine topology map,
630         // so the #levels of granularity is either 0 or 1.
631         //
632         if (__kmp_affinity_gran > affinity_gran_package) {
633             __kmp_affinity_gran_levels = 1;
634         }
635         else {
636             __kmp_affinity_gran_levels = 0;
637         }
638     }
639     return 1;
640 }
641 
642 
643 # if KMP_GROUP_AFFINITY
644 
645 //
646 // If multiple Windows* OS processor groups exist, we can create a 2-level
647 // topology map with the groups at level 0 and the individual procs at
648 // level 1.
649 //
650 // This facilitates letting the threads float among all procs in a group,
651 // if granularity=group (the default when there are multiple groups).
652 //
653 static int
654 __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
655   kmp_i18n_id_t *const msg_id)
656 {
657     *address2os = NULL;
658     *msg_id = kmp_i18n_null;
659 
660     //
661     // If we don't have multiple processor groups, return now.
662     // The flat mapping will be used.
663     //
664     if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
665         // FIXME set *msg_id
666         return -1;
667     }
668 
669     //
670     // Contruct the data structure to be returned.
671     //
672     *address2os = (AddrUnsPair*)
673       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
674     int avail_ct = 0;
675     int i;
676     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
677         //
678         // Skip this proc if it is not included in the machine model.
679         //
680         if (! KMP_CPU_ISSET(i, fullMask)) {
681             continue;
682         }
683 
684         Address addr(2);
685         addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
686         addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
687         (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
688 
689         if (__kmp_affinity_verbose) {
690             KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
691               addr.labels[1]);
692         }
693     }
694 
695     if (__kmp_affinity_gran_levels < 0) {
696         if (__kmp_affinity_gran == affinity_gran_group) {
697             __kmp_affinity_gran_levels = 1;
698         }
699         else if ((__kmp_affinity_gran == affinity_gran_fine)
700           || (__kmp_affinity_gran == affinity_gran_thread)) {
701             __kmp_affinity_gran_levels = 0;
702         }
703         else {
704             const char *gran_str = NULL;
705             if (__kmp_affinity_gran == affinity_gran_core) {
706                 gran_str = "core";
707             }
708             else if (__kmp_affinity_gran == affinity_gran_package) {
709                 gran_str = "package";
710             }
711             else if (__kmp_affinity_gran == affinity_gran_node) {
712                 gran_str = "node";
713             }
714             else {
715                 KMP_ASSERT(0);
716             }
717 
718             // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
719             __kmp_affinity_gran_levels = 0;
720         }
721     }
722     return 2;
723 }
724 
725 # endif /* KMP_GROUP_AFFINITY */
726 
727 
728 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
729 
730 static int
731 __kmp_cpuid_mask_width(int count) {
732     int r = 0;
733 
734     while((1<<r) < count)
735         ++r;
736     return r;
737 }
738 
739 
740 class apicThreadInfo {
741 public:
742     unsigned osId;              // param to __kmp_affinity_bind_thread
743     unsigned apicId;            // from cpuid after binding
744     unsigned maxCoresPerPkg;    //      ""
745     unsigned maxThreadsPerPkg;  //      ""
746     unsigned pkgId;             // inferred from above values
747     unsigned coreId;            //      ""
748     unsigned threadId;          //      ""
749 };
750 
751 
752 static int
753 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
754 {
755     const apicThreadInfo *aa = (const apicThreadInfo *)a;
756     const apicThreadInfo *bb = (const apicThreadInfo *)b;
757     if (aa->osId < bb->osId) return -1;
758     if (aa->osId > bb->osId) return 1;
759     return 0;
760 }
761 
762 
763 static int
764 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
765 {
766     const apicThreadInfo *aa = (const apicThreadInfo *)a;
767     const apicThreadInfo *bb = (const apicThreadInfo *)b;
768     if (aa->pkgId < bb->pkgId) return -1;
769     if (aa->pkgId > bb->pkgId) return 1;
770     if (aa->coreId < bb->coreId) return -1;
771     if (aa->coreId > bb->coreId) return 1;
772     if (aa->threadId < bb->threadId) return -1;
773     if (aa->threadId > bb->threadId) return 1;
774     return 0;
775 }
776 
777 
778 //
779 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
780 // an algorithm which cycles through the available os threads, setting
781 // the current thread's affinity mask to that thread, and then retrieves
782 // the Apic Id for each thread context using the cpuid instruction.
783 //
784 static int
785 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
786   kmp_i18n_id_t *const msg_id)
787 {
788     kmp_cpuid buf;
789     int rc;
790     *address2os = NULL;
791     *msg_id = kmp_i18n_null;
792 
793     //
794     // Check if cpuid leaf 4 is supported.
795     //
796         __kmp_x86_cpuid(0, 0, &buf);
797         if (buf.eax < 4) {
798             *msg_id = kmp_i18n_str_NoLeaf4Support;
799             return -1;
800         }
801 
802     //
803     // The algorithm used starts by setting the affinity to each available
804     // thread and retrieving info from the cpuid instruction, so if we are
805     // not capable of calling __kmp_get_system_affinity() and
806     // _kmp_get_system_affinity(), then we need to do something else - use
807     // the defaults that we calculated from issuing cpuid without binding
808     // to each proc.
809     //
810     if (! KMP_AFFINITY_CAPABLE()) {
811         //
812         // Hack to try and infer the machine topology using only the data
813         // available from cpuid on the current thread, and __kmp_xproc.
814         //
815         KMP_ASSERT(__kmp_affinity_type == affinity_none);
816 
817         //
818         // Get an upper bound on the number of threads per package using
819         // cpuid(1).
820         //
821         // On some OS/chps combinations where HT is supported by the chip
822         // but is disabled, this value will be 2 on a single core chip.
823         // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
824         //
825         __kmp_x86_cpuid(1, 0, &buf);
826         int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
827         if (maxThreadsPerPkg == 0) {
828             maxThreadsPerPkg = 1;
829         }
830 
831         //
832         // The num cores per pkg comes from cpuid(4).
833         // 1 must be added to the encoded value.
834         //
835         // The author of cpu_count.cpp treated this only an upper bound
836         // on the number of cores, but I haven't seen any cases where it
837         // was greater than the actual number of cores, so we will treat
838         // it as exact in this block of code.
839         //
840         // First, we need to check if cpuid(4) is supported on this chip.
841         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
842         // has the value n or greater.
843         //
844         __kmp_x86_cpuid(0, 0, &buf);
845         if (buf.eax >= 4) {
846             __kmp_x86_cpuid(4, 0, &buf);
847             nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
848         }
849         else {
850             nCoresPerPkg = 1;
851         }
852 
853         //
854         // There is no way to reliably tell if HT is enabled without issuing
855         // the cpuid instruction from every thread, can correlating the cpuid
856         // info, so if the machine is not affinity capable, we assume that HT
857         // is off.  We have seen quite a few machines where maxThreadsPerPkg
858         // is 2, yet the machine does not support HT.
859         //
860         // - Older OSes are usually found on machines with older chips, which
861         //   do not support HT.
862         //
863         // - The performance penalty for mistakenly identifying a machine as
864         //   HT when it isn't (which results in blocktime being incorrecly set
865         //   to 0) is greater than the penalty when for mistakenly identifying
866         //   a machine as being 1 thread/core when it is really HT enabled
867         //   (which results in blocktime being incorrectly set to a positive
868         //   value).
869         //
870         __kmp_ncores = __kmp_xproc;
871         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
872         __kmp_nThreadsPerCore = 1;
873         if (__kmp_affinity_verbose) {
874             KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
875             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
876             if (__kmp_affinity_uniform_topology()) {
877                 KMP_INFORM(Uniform, "KMP_AFFINITY");
878             } else {
879                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
880             }
881             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
882               __kmp_nThreadsPerCore, __kmp_ncores);
883         }
884         return 0;
885     }
886 
887     //
888     //
889     // From here on, we can assume that it is safe to call
890     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
891     // even if __kmp_affinity_type = affinity_none.
892     //
893 
894     //
895     // Save the affinity mask for the current thread.
896     //
897     kmp_affin_mask_t *oldMask;
898     KMP_CPU_ALLOC(oldMask);
899     KMP_ASSERT(oldMask != NULL);
900     __kmp_get_system_affinity(oldMask, TRUE);
901 
902     //
903     // Run through each of the available contexts, binding the current thread
904     // to it, and obtaining the pertinent information using the cpuid instr.
905     //
906     // The relevant information is:
907     //
908     // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
909     //    has a uniqie Apic Id, which is of the form pkg# : core# : thread#.
910     //
911     // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
912     //    value of this field determines the width of the core# + thread#
913     //    fields in the Apic Id.  It is also an upper bound on the number
914     //    of threads per package, but it has been verified that situations
915     //    happen were it is not exact.  In particular, on certain OS/chip
916     //    combinations where Intel(R) Hyper-Threading Technology is supported
917     //    by the chip but has
918     //    been disabled, the value of this field will be 2 (for a single core
919     //    chip).  On other OS/chip combinations supporting
920     //    Intel(R) Hyper-Threading Technology, the value of
921     //    this field will be 1 when Intel(R) Hyper-Threading Technology is
922     //    disabled and 2 when it is enabled.
923     //
924     // Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4).  The
925     //    value of this field (+1) determines the width of the core# field in
926     //    the Apic Id.  The comments in "cpucount.cpp" say that this value is
927     //    an upper bound, but the IA-32 architecture manual says that it is
928     //    exactly the number of cores per package, and I haven't seen any
929     //    case where it wasn't.
930     //
931     // From this information, deduce the package Id, core Id, and thread Id,
932     // and set the corresponding fields in the apicThreadInfo struct.
933     //
934     unsigned i;
935     apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
936       __kmp_avail_proc * sizeof(apicThreadInfo));
937     unsigned nApics = 0;
938     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
939         //
940         // Skip this proc if it is not included in the machine model.
941         //
942         if (! KMP_CPU_ISSET(i, fullMask)) {
943             continue;
944         }
945         KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
946 
947         __kmp_affinity_bind_thread(i);
948         threadInfo[nApics].osId = i;
949 
950         //
951         // The apic id and max threads per pkg come from cpuid(1).
952         //
953         __kmp_x86_cpuid(1, 0, &buf);
954         if (! (buf.edx >> 9) & 1) {
955             __kmp_set_system_affinity(oldMask, TRUE);
956             __kmp_free(threadInfo);
957             KMP_CPU_FREE(oldMask);
958             *msg_id = kmp_i18n_str_ApicNotPresent;
959             return -1;
960         }
961         threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
962         threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
963         if (threadInfo[nApics].maxThreadsPerPkg == 0) {
964             threadInfo[nApics].maxThreadsPerPkg = 1;
965         }
966 
967         //
968         // Max cores per pkg comes from cpuid(4).
969         // 1 must be added to the encoded value.
970         //
971         // First, we need to check if cpuid(4) is supported on this chip.
972         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
973         // has the value n or greater.
974         //
975         __kmp_x86_cpuid(0, 0, &buf);
976         if (buf.eax >= 4) {
977             __kmp_x86_cpuid(4, 0, &buf);
978             threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
979         }
980         else {
981             threadInfo[nApics].maxCoresPerPkg = 1;
982         }
983 
984         //
985         // Infer the pkgId / coreId / threadId using only the info
986         // obtained locally.
987         //
988         int widthCT = __kmp_cpuid_mask_width(
989           threadInfo[nApics].maxThreadsPerPkg);
990         threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
991 
992         int widthC = __kmp_cpuid_mask_width(
993           threadInfo[nApics].maxCoresPerPkg);
994         int widthT = widthCT - widthC;
995         if (widthT < 0) {
996             //
997             // I've never seen this one happen, but I suppose it could, if
998             // the cpuid instruction on a chip was really screwed up.
999             // Make sure to restore the affinity mask before the tail call.
1000             //
1001             __kmp_set_system_affinity(oldMask, TRUE);
1002             __kmp_free(threadInfo);
1003             KMP_CPU_FREE(oldMask);
1004             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1005             return -1;
1006         }
1007 
1008         int maskC = (1 << widthC) - 1;
1009         threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
1010           &maskC;
1011 
1012         int maskT = (1 << widthT) - 1;
1013         threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT;
1014 
1015         nApics++;
1016     }
1017 
1018     //
1019     // We've collected all the info we need.
1020     // Restore the old affinity mask for this thread.
1021     //
1022     __kmp_set_system_affinity(oldMask, TRUE);
1023 
1024     //
1025     // If there's only one thread context to bind to, form an Address object
1026     // with depth 1 and return immediately (or, if affinity is off, set
1027     // address2os to NULL and return).
1028     //
1029     // If it is configured to omit the package level when there is only a
1030     // single package, the logic at the end of this routine won't work if
1031     // there is only a single thread - it would try to form an Address
1032     // object with depth 0.
1033     //
1034     KMP_ASSERT(nApics > 0);
1035     if (nApics == 1) {
1036         __kmp_ncores = nPackages = 1;
1037         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1038         if (__kmp_affinity_verbose) {
1039             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1040             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1041 
1042             KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1043             if (__kmp_affinity_respect_mask) {
1044                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1045             } else {
1046                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1047             }
1048             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1049             KMP_INFORM(Uniform, "KMP_AFFINITY");
1050             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1051               __kmp_nThreadsPerCore, __kmp_ncores);
1052         }
1053 
1054         if (__kmp_affinity_type == affinity_none) {
1055             __kmp_free(threadInfo);
1056             KMP_CPU_FREE(oldMask);
1057             return 0;
1058         }
1059 
1060         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
1061         Address addr(1);
1062         addr.labels[0] = threadInfo[0].pkgId;
1063         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
1064 
1065         if (__kmp_affinity_gran_levels < 0) {
1066             __kmp_affinity_gran_levels = 0;
1067         }
1068 
1069         if (__kmp_affinity_verbose) {
1070             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1071         }
1072 
1073         __kmp_free(threadInfo);
1074         KMP_CPU_FREE(oldMask);
1075         return 1;
1076     }
1077 
1078     //
1079     // Sort the threadInfo table by physical Id.
1080     //
1081     qsort(threadInfo, nApics, sizeof(*threadInfo),
1082       __kmp_affinity_cmp_apicThreadInfo_phys_id);
1083 
1084     //
1085     // The table is now sorted by pkgId / coreId / threadId, but we really
1086     // don't know the radix of any of the fields.  pkgId's may be sparsely
1087     // assigned among the chips on a system.  Although coreId's are usually
1088     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1089     // [0..threadsPerCore-1], we don't want to make any such assumptions.
1090     //
1091     // For that matter, we don't know what coresPerPkg and threadsPerCore
1092     // (or the total # packages) are at this point - we want to determine
1093     // that now.  We only have an upper bound on the first two figures.
1094     //
1095     // We also perform a consistency check at this point: the values returned
1096     // by the cpuid instruction for any thread bound to a given package had
1097     // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1098     //
1099     nPackages = 1;
1100     nCoresPerPkg = 1;
1101     __kmp_nThreadsPerCore = 1;
1102     unsigned nCores = 1;
1103 
1104     unsigned pkgCt = 1;                         // to determine radii
1105     unsigned lastPkgId = threadInfo[0].pkgId;
1106     unsigned coreCt = 1;
1107     unsigned lastCoreId = threadInfo[0].coreId;
1108     unsigned threadCt = 1;
1109     unsigned lastThreadId = threadInfo[0].threadId;
1110 
1111                                                 // intra-pkg consist checks
1112     unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1113     unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1114 
1115     for (i = 1; i < nApics; i++) {
1116         if (threadInfo[i].pkgId != lastPkgId) {
1117             nCores++;
1118             pkgCt++;
1119             lastPkgId = threadInfo[i].pkgId;
1120             if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1121             coreCt = 1;
1122             lastCoreId = threadInfo[i].coreId;
1123             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1124             threadCt = 1;
1125             lastThreadId = threadInfo[i].threadId;
1126 
1127             //
1128             // This is a different package, so go on to the next iteration
1129             // without doing any consistency checks.  Reset the consistency
1130             // check vars, though.
1131             //
1132             prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1133             prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1134             continue;
1135         }
1136 
1137         if (threadInfo[i].coreId != lastCoreId) {
1138             nCores++;
1139             coreCt++;
1140             lastCoreId = threadInfo[i].coreId;
1141             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1142             threadCt = 1;
1143             lastThreadId = threadInfo[i].threadId;
1144         }
1145         else if (threadInfo[i].threadId != lastThreadId) {
1146             threadCt++;
1147             lastThreadId = threadInfo[i].threadId;
1148         }
1149         else {
1150             __kmp_free(threadInfo);
1151             KMP_CPU_FREE(oldMask);
1152             *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1153             return -1;
1154         }
1155 
1156         //
1157         // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
1158         // fields agree between all the threads bounds to a given package.
1159         //
1160         if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
1161           || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1162             __kmp_free(threadInfo);
1163             KMP_CPU_FREE(oldMask);
1164             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1165             return -1;
1166         }
1167     }
1168     nPackages = pkgCt;
1169     if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1170     if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1171 
1172     //
1173     // When affinity is off, this routine will still be called to set
1174     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1175     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1176     // correctly, and return now if affinity is not enabled.
1177     //
1178     __kmp_ncores = nCores;
1179     if (__kmp_affinity_verbose) {
1180         char buf[KMP_AFFIN_MASK_PRINT_LEN];
1181         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1182 
1183         KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1184         if (__kmp_affinity_respect_mask) {
1185             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1186         } else {
1187             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1188         }
1189         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1190         if (__kmp_affinity_uniform_topology()) {
1191             KMP_INFORM(Uniform, "KMP_AFFINITY");
1192         } else {
1193             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1194         }
1195         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1196           __kmp_nThreadsPerCore, __kmp_ncores);
1197 
1198     }
1199 
1200     if (__kmp_affinity_type == affinity_none) {
1201         __kmp_free(threadInfo);
1202         KMP_CPU_FREE(oldMask);
1203         return 0;
1204     }
1205 
1206     //
1207     // Now that we've determined the number of packages, the number of cores
1208     // per package, and the number of threads per core, we can construct the
1209     // data structure that is to be returned.
1210     //
1211     int pkgLevel = 0;
1212     int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1213     int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1214     unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
1215 
1216     KMP_ASSERT(depth > 0);
1217     *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1218 
1219     for (i = 0; i < nApics; ++i) {
1220         Address addr(depth);
1221         unsigned os = threadInfo[i].osId;
1222         int d = 0;
1223 
1224         if (pkgLevel >= 0) {
1225             addr.labels[d++] = threadInfo[i].pkgId;
1226         }
1227         if (coreLevel >= 0) {
1228             addr.labels[d++] = threadInfo[i].coreId;
1229         }
1230         if (threadLevel >= 0) {
1231             addr.labels[d++] = threadInfo[i].threadId;
1232         }
1233         (*address2os)[i] = AddrUnsPair(addr, os);
1234     }
1235 
1236     if (__kmp_affinity_gran_levels < 0) {
1237         //
1238         // Set the granularity level based on what levels are modeled
1239         // in the machine topology map.
1240         //
1241         __kmp_affinity_gran_levels = 0;
1242         if ((threadLevel >= 0)
1243           && (__kmp_affinity_gran > affinity_gran_thread)) {
1244             __kmp_affinity_gran_levels++;
1245         }
1246         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1247             __kmp_affinity_gran_levels++;
1248         }
1249         if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1250             __kmp_affinity_gran_levels++;
1251         }
1252     }
1253 
1254     if (__kmp_affinity_verbose) {
1255         __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1256           coreLevel, threadLevel);
1257     }
1258 
1259     __kmp_free(threadInfo);
1260     KMP_CPU_FREE(oldMask);
1261     return depth;
1262 }
1263 
1264 
1265 //
1266 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1267 // architectures support a newer interface for specifying the x2APIC Ids,
1268 // based on cpuid leaf 11.
1269 //
1270 static int
1271 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1272   kmp_i18n_id_t *const msg_id)
1273 {
1274     kmp_cpuid buf;
1275 
1276     *address2os = NULL;
1277     *msg_id = kmp_i18n_null;
1278 
1279     //
1280     // Check to see if cpuid leaf 11 is supported.
1281     //
1282     __kmp_x86_cpuid(0, 0, &buf);
1283     if (buf.eax < 11) {
1284         *msg_id = kmp_i18n_str_NoLeaf11Support;
1285         return -1;
1286     }
1287     __kmp_x86_cpuid(11, 0, &buf);
1288     if (buf.ebx == 0) {
1289         *msg_id = kmp_i18n_str_NoLeaf11Support;
1290         return -1;
1291     }
1292 
1293     //
1294     // Find the number of levels in the machine topology.  While we're at it,
1295     // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.  We will
1296     // try to get more accurate values later by explicitly counting them,
1297     // but get reasonable defaults now, in case we return early.
1298     //
1299     int level;
1300     int threadLevel = -1;
1301     int coreLevel = -1;
1302     int pkgLevel = -1;
1303     __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1304 
1305     for (level = 0;; level++) {
1306         if (level > 31) {
1307             //
1308             // FIXME: Hack for DPD200163180
1309             //
1310             // If level is big then something went wrong -> exiting
1311             //
1312             // There could actually be 32 valid levels in the machine topology,
1313             // but so far, the only machine we have seen which does not exit
1314             // this loop before iteration 32 has fubar x2APIC settings.
1315             //
1316             // For now, just reject this case based upon loop trip count.
1317             //
1318             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1319             return -1;
1320         }
1321         __kmp_x86_cpuid(11, level, &buf);
1322         if (buf.ebx == 0) {
1323             if (pkgLevel < 0) {
1324                 //
1325                 // Will infer nPackages from __kmp_xproc
1326                 //
1327                 pkgLevel = level;
1328                 level++;
1329             }
1330             break;
1331         }
1332         int kind = (buf.ecx >> 8) & 0xff;
1333         if (kind == 1) {
1334             //
1335             // SMT level
1336             //
1337             threadLevel = level;
1338             coreLevel = -1;
1339             pkgLevel = -1;
1340             __kmp_nThreadsPerCore = buf.ebx & 0xff;
1341             if (__kmp_nThreadsPerCore == 0) {
1342                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1343                 return -1;
1344             }
1345         }
1346         else if (kind == 2) {
1347             //
1348             // core level
1349             //
1350             coreLevel = level;
1351             pkgLevel = -1;
1352             nCoresPerPkg = buf.ebx & 0xff;
1353             if (nCoresPerPkg == 0) {
1354                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1355                 return -1;
1356             }
1357         }
1358         else {
1359             if (level <= 0) {
1360                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1361                 return -1;
1362             }
1363             if (pkgLevel >= 0) {
1364                 continue;
1365             }
1366             pkgLevel = level;
1367             nPackages = buf.ebx & 0xff;
1368             if (nPackages == 0) {
1369                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1370                 return -1;
1371             }
1372         }
1373     }
1374     int depth = level;
1375 
1376     //
1377     // In the above loop, "level" was counted from the finest level (usually
1378     // thread) to the coarsest.  The caller expects that we will place the
1379     // labels in (*address2os)[].first.labels[] in the inverse order, so
1380     // we need to invert the vars saying which level means what.
1381     //
1382     if (threadLevel >= 0) {
1383         threadLevel = depth - threadLevel - 1;
1384     }
1385     if (coreLevel >= 0) {
1386         coreLevel = depth - coreLevel - 1;
1387     }
1388     KMP_DEBUG_ASSERT(pkgLevel >= 0);
1389     pkgLevel = depth - pkgLevel - 1;
1390 
1391     //
1392     // The algorithm used starts by setting the affinity to each available
1393     // thread and retrieving info from the cpuid instruction, so if we are
1394     // not capable of calling __kmp_get_system_affinity() and
1395     // _kmp_get_system_affinity(), then we need to do something else - use
1396     // the defaults that we calculated from issuing cpuid without binding
1397     // to each proc.
1398     //
1399     if (! KMP_AFFINITY_CAPABLE())
1400     {
1401         //
1402         // Hack to try and infer the machine topology using only the data
1403         // available from cpuid on the current thread, and __kmp_xproc.
1404         //
1405         KMP_ASSERT(__kmp_affinity_type == affinity_none);
1406 
1407         __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1408         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1409         if (__kmp_affinity_verbose) {
1410             KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1411             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1412             if (__kmp_affinity_uniform_topology()) {
1413                 KMP_INFORM(Uniform, "KMP_AFFINITY");
1414             } else {
1415                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1416             }
1417             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1418               __kmp_nThreadsPerCore, __kmp_ncores);
1419         }
1420         return 0;
1421     }
1422 
1423     //
1424     //
1425     // From here on, we can assume that it is safe to call
1426     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1427     // even if __kmp_affinity_type = affinity_none.
1428     //
1429 
1430     //
1431     // Save the affinity mask for the current thread.
1432     //
1433     kmp_affin_mask_t *oldMask;
1434     KMP_CPU_ALLOC(oldMask);
1435     __kmp_get_system_affinity(oldMask, TRUE);
1436 
1437     //
1438     // Allocate the data structure to be returned.
1439     //
1440     AddrUnsPair *retval = (AddrUnsPair *)
1441       __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1442 
1443     //
1444     // Run through each of the available contexts, binding the current thread
1445     // to it, and obtaining the pertinent information using the cpuid instr.
1446     //
1447     unsigned int proc;
1448     int nApics = 0;
1449     for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
1450         //
1451         // Skip this proc if it is not included in the machine model.
1452         //
1453         if (! KMP_CPU_ISSET(proc, fullMask)) {
1454             continue;
1455         }
1456         KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1457 
1458         __kmp_affinity_bind_thread(proc);
1459 
1460         //
1461         // Extrach the labels for each level in the machine topology map
1462         // from the Apic ID.
1463         //
1464         Address addr(depth);
1465         int prev_shift = 0;
1466 
1467         for (level = 0; level < depth; level++) {
1468             __kmp_x86_cpuid(11, level, &buf);
1469             unsigned apicId = buf.edx;
1470             if (buf.ebx == 0) {
1471                 if (level != depth - 1) {
1472                     KMP_CPU_FREE(oldMask);
1473                     *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1474                     return -1;
1475                 }
1476                 addr.labels[depth - level - 1] = apicId >> prev_shift;
1477                 level++;
1478                 break;
1479             }
1480             int shift = buf.eax & 0x1f;
1481             int mask = (1 << shift) - 1;
1482             addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1483             prev_shift = shift;
1484         }
1485         if (level != depth) {
1486             KMP_CPU_FREE(oldMask);
1487             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1488             return -1;
1489         }
1490 
1491         retval[nApics] = AddrUnsPair(addr, proc);
1492         nApics++;
1493     }
1494 
1495     //
1496     // We've collected all the info we need.
1497     // Restore the old affinity mask for this thread.
1498     //
1499     __kmp_set_system_affinity(oldMask, TRUE);
1500 
1501     //
1502     // If there's only one thread context to bind to, return now.
1503     //
1504     KMP_ASSERT(nApics > 0);
1505     if (nApics == 1) {
1506         __kmp_ncores = nPackages = 1;
1507         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1508         if (__kmp_affinity_verbose) {
1509             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1510             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1511 
1512             KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1513             if (__kmp_affinity_respect_mask) {
1514                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1515             } else {
1516                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1517             }
1518             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1519             KMP_INFORM(Uniform, "KMP_AFFINITY");
1520             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1521               __kmp_nThreadsPerCore, __kmp_ncores);
1522         }
1523 
1524         if (__kmp_affinity_type == affinity_none) {
1525             __kmp_free(retval);
1526             KMP_CPU_FREE(oldMask);
1527             return 0;
1528         }
1529 
1530         //
1531         // Form an Address object which only includes the package level.
1532         //
1533         Address addr(1);
1534         addr.labels[0] = retval[0].first.labels[pkgLevel];
1535         retval[0].first = addr;
1536 
1537         if (__kmp_affinity_gran_levels < 0) {
1538             __kmp_affinity_gran_levels = 0;
1539         }
1540 
1541         if (__kmp_affinity_verbose) {
1542             __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1543         }
1544 
1545         *address2os = retval;
1546         KMP_CPU_FREE(oldMask);
1547         return 1;
1548     }
1549 
1550     //
1551     // Sort the table by physical Id.
1552     //
1553     qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1554 
1555     //
1556     // Find the radix at each of the levels.
1557     //
1558     unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1559     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1560     unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1561     unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1562     for (level = 0; level < depth; level++) {
1563         totals[level] = 1;
1564         maxCt[level] = 1;
1565         counts[level] = 1;
1566         last[level] = retval[0].first.labels[level];
1567     }
1568 
1569     //
1570     // From here on, the iteration variable "level" runs from the finest
1571     // level to the coarsest, i.e. we iterate forward through
1572     // (*address2os)[].first.labels[] - in the previous loops, we iterated
1573     // backwards.
1574     //
1575     for (proc = 1; (int)proc < nApics; proc++) {
1576         int level;
1577         for (level = 0; level < depth; level++) {
1578             if (retval[proc].first.labels[level] != last[level]) {
1579                 int j;
1580                 for (j = level + 1; j < depth; j++) {
1581                     totals[j]++;
1582                     counts[j] = 1;
1583                     // The line below causes printing incorrect topology information
1584                     // in case the max value for some level (maxCt[level]) is encountered earlier than
1585                     // some less value while going through the array.
1586                     // For example, let pkg0 has 4 cores and pkg1 has 2 cores. Then maxCt[1] == 2
1587                     // whereas it must be 4.
1588                     // TODO!!! Check if it can be commented safely
1589                     //maxCt[j] = 1;
1590                     last[j] = retval[proc].first.labels[j];
1591                 }
1592                 totals[level]++;
1593                 counts[level]++;
1594                 if (counts[level] > maxCt[level]) {
1595                     maxCt[level] = counts[level];
1596                 }
1597                 last[level] = retval[proc].first.labels[level];
1598                 break;
1599             }
1600             else if (level == depth - 1) {
1601                 __kmp_free(last);
1602                 __kmp_free(maxCt);
1603                 __kmp_free(counts);
1604                 __kmp_free(totals);
1605                 __kmp_free(retval);
1606                 KMP_CPU_FREE(oldMask);
1607                 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1608                 return -1;
1609             }
1610         }
1611     }
1612 
1613     //
1614     // When affinity is off, this routine will still be called to set
1615     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1616     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1617     // correctly, and return if affinity is not enabled.
1618     //
1619     if (threadLevel >= 0) {
1620         __kmp_nThreadsPerCore = maxCt[threadLevel];
1621     }
1622     else {
1623         __kmp_nThreadsPerCore = 1;
1624     }
1625     nPackages = totals[pkgLevel];
1626 
1627     if (coreLevel >= 0) {
1628         __kmp_ncores = totals[coreLevel];
1629         nCoresPerPkg = maxCt[coreLevel];
1630     }
1631     else {
1632         __kmp_ncores = nPackages;
1633         nCoresPerPkg = 1;
1634     }
1635 
1636     //
1637     // Check to see if the machine topology is uniform
1638     //
1639     unsigned prod = maxCt[0];
1640     for (level = 1; level < depth; level++) {
1641        prod *= maxCt[level];
1642     }
1643     bool uniform = (prod == totals[level - 1]);
1644 
1645     //
1646     // Print the machine topology summary.
1647     //
1648     if (__kmp_affinity_verbose) {
1649         char mask[KMP_AFFIN_MASK_PRINT_LEN];
1650         __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1651 
1652         KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1653         if (__kmp_affinity_respect_mask) {
1654             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1655         } else {
1656             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1657         }
1658         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1659         if (uniform) {
1660             KMP_INFORM(Uniform, "KMP_AFFINITY");
1661         } else {
1662             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1663         }
1664 
1665         kmp_str_buf_t buf;
1666         __kmp_str_buf_init(&buf);
1667 
1668         __kmp_str_buf_print(&buf, "%d", totals[0]);
1669         for (level = 1; level <= pkgLevel; level++) {
1670             __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1671         }
1672         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1673           __kmp_nThreadsPerCore, __kmp_ncores);
1674 
1675         __kmp_str_buf_free(&buf);
1676     }
1677 
1678     if (__kmp_affinity_type == affinity_none) {
1679         __kmp_free(last);
1680         __kmp_free(maxCt);
1681         __kmp_free(counts);
1682         __kmp_free(totals);
1683         __kmp_free(retval);
1684         KMP_CPU_FREE(oldMask);
1685         return 0;
1686     }
1687 
1688     //
1689     // Find any levels with radiix 1, and remove them from the map
1690     // (except for the package level).
1691     //
1692     int new_depth = 0;
1693     for (level = 0; level < depth; level++) {
1694         if ((maxCt[level] == 1) && (level != pkgLevel)) {
1695            continue;
1696         }
1697         new_depth++;
1698     }
1699 
1700     //
1701     // If we are removing any levels, allocate a new vector to return,
1702     // and copy the relevant information to it.
1703     //
1704     if (new_depth != depth) {
1705         AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1706           sizeof(AddrUnsPair) * nApics);
1707         for (proc = 0; (int)proc < nApics; proc++) {
1708             Address addr(new_depth);
1709             new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1710         }
1711         int new_level = 0;
1712         for (level = 0; level < depth; level++) {
1713             if ((maxCt[level] == 1) && (level != pkgLevel)) {
1714                if (level == threadLevel) {
1715                    threadLevel = -1;
1716                }
1717                else if ((threadLevel >= 0) && (level < threadLevel)) {
1718                    threadLevel--;
1719                }
1720                if (level == coreLevel) {
1721                    coreLevel = -1;
1722                }
1723                else if ((coreLevel >= 0) && (level < coreLevel)) {
1724                    coreLevel--;
1725                }
1726                if (level < pkgLevel) {
1727                    pkgLevel--;
1728                }
1729                continue;
1730             }
1731             for (proc = 0; (int)proc < nApics; proc++) {
1732                 new_retval[proc].first.labels[new_level]
1733                   = retval[proc].first.labels[level];
1734             }
1735             new_level++;
1736         }
1737 
1738         __kmp_free(retval);
1739         retval = new_retval;
1740         depth = new_depth;
1741     }
1742 
1743     if (__kmp_affinity_gran_levels < 0) {
1744         //
1745         // Set the granularity level based on what levels are modeled
1746         // in the machine topology map.
1747         //
1748         __kmp_affinity_gran_levels = 0;
1749         if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1750             __kmp_affinity_gran_levels++;
1751         }
1752         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1753             __kmp_affinity_gran_levels++;
1754         }
1755         if (__kmp_affinity_gran > affinity_gran_package) {
1756             __kmp_affinity_gran_levels++;
1757         }
1758     }
1759 
1760     if (__kmp_affinity_verbose) {
1761         __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1762           coreLevel, threadLevel);
1763     }
1764 
1765     __kmp_free(last);
1766     __kmp_free(maxCt);
1767     __kmp_free(counts);
1768     __kmp_free(totals);
1769     KMP_CPU_FREE(oldMask);
1770     *address2os = retval;
1771     return depth;
1772 }
1773 
1774 
1775 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1776 
1777 
1778 #define osIdIndex       0
1779 #define threadIdIndex   1
1780 #define coreIdIndex     2
1781 #define pkgIdIndex      3
1782 #define nodeIdIndex     4
1783 
1784 typedef unsigned *ProcCpuInfo;
1785 static unsigned maxIndex = pkgIdIndex;
1786 
1787 
1788 static int
1789 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1790 {
1791     const unsigned *aa = (const unsigned *)a;
1792     const unsigned *bb = (const unsigned *)b;
1793     if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1794     if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1795     return 0;
1796 };
1797 
1798 
1799 static int
1800 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1801 {
1802     unsigned i;
1803     const unsigned *aa = *((const unsigned **)a);
1804     const unsigned *bb = *((const unsigned **)b);
1805     for (i = maxIndex; ; i--) {
1806         if (aa[i] < bb[i]) return -1;
1807         if (aa[i] > bb[i]) return 1;
1808         if (i == osIdIndex) break;
1809     }
1810     return 0;
1811 }
1812 
1813 
1814 //
1815 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1816 // affinity map.
1817 //
1818 static int
1819 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1820   kmp_i18n_id_t *const msg_id, FILE *f)
1821 {
1822     *address2os = NULL;
1823     *msg_id = kmp_i18n_null;
1824 
1825     //
1826     // Scan of the file, and count the number of "processor" (osId) fields,
1827     // and find the highest value of <n> for a node_<n> field.
1828     //
1829     char buf[256];
1830     unsigned num_records = 0;
1831     while (! feof(f)) {
1832         buf[sizeof(buf) - 1] = 1;
1833         if (! fgets(buf, sizeof(buf), f)) {
1834             //
1835             // Read errors presumably because of EOF
1836             //
1837             break;
1838         }
1839 
1840         char s1[] = "processor";
1841         if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1842             num_records++;
1843             continue;
1844         }
1845 
1846         //
1847         // FIXME - this will match "node_<n> <garbage>"
1848         //
1849         unsigned level;
1850         if (sscanf(buf, "node_%d id", &level) == 1) {
1851             if (nodeIdIndex + level >= maxIndex) {
1852                 maxIndex = nodeIdIndex + level;
1853             }
1854             continue;
1855         }
1856     }
1857 
1858     //
1859     // Check for empty file / no valid processor records, or too many.
1860     // The number of records can't exceed the number of valid bits in the
1861     // affinity mask.
1862     //
1863     if (num_records == 0) {
1864         *line = 0;
1865         *msg_id = kmp_i18n_str_NoProcRecords;
1866         return -1;
1867     }
1868     if (num_records > (unsigned)__kmp_xproc) {
1869         *line = 0;
1870         *msg_id = kmp_i18n_str_TooManyProcRecords;
1871         return -1;
1872     }
1873 
1874     //
1875     // Set the file pointer back to the begginning, so that we can scan the
1876     // file again, this time performing a full parse of the data.
1877     // Allocate a vector of ProcCpuInfo object, where we will place the data.
1878     // Adding an extra element at the end allows us to remove a lot of extra
1879     // checks for termination conditions.
1880     //
1881     if (fseek(f, 0, SEEK_SET) != 0) {
1882         *line = 0;
1883         *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1884         return -1;
1885     }
1886 
1887     //
1888     // Allocate the array of records to store the proc info in.  The dummy
1889     // element at the end makes the logic in filling them out easier to code.
1890     //
1891     unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1892       * sizeof(unsigned *));
1893     unsigned i;
1894     for (i = 0; i <= num_records; i++) {
1895         threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1896           * sizeof(unsigned));
1897     }
1898 
1899 #define CLEANUP_THREAD_INFO \
1900     for (i = 0; i <= num_records; i++) {                                \
1901         __kmp_free(threadInfo[i]);                                      \
1902     }                                                                   \
1903     __kmp_free(threadInfo);
1904 
1905     //
1906     // A value of UINT_MAX means that we didn't find the field
1907     //
1908     unsigned __index;
1909 
1910 #define INIT_PROC_INFO(p) \
1911     for (__index = 0; __index <= maxIndex; __index++) {                 \
1912         (p)[__index] = UINT_MAX;                                        \
1913     }
1914 
1915     for (i = 0; i <= num_records; i++) {
1916         INIT_PROC_INFO(threadInfo[i]);
1917     }
1918 
1919     unsigned num_avail = 0;
1920     *line = 0;
1921     while (! feof(f)) {
1922         //
1923         // Create an inner scoping level, so that all the goto targets at the
1924         // end of the loop appear in an outer scoping level.  This avoids
1925         // warnings about jumping past an initialization to a target in the
1926         // same block.
1927         //
1928         {
1929             buf[sizeof(buf) - 1] = 1;
1930             bool long_line = false;
1931             if (! fgets(buf, sizeof(buf), f)) {
1932                 //
1933                 // Read errors presumably because of EOF
1934                 //
1935                 // If there is valid data in threadInfo[num_avail], then fake
1936                 // a blank line in ensure that the last address gets parsed.
1937                 //
1938                 bool valid = false;
1939                 for (i = 0; i <= maxIndex; i++) {
1940                     if (threadInfo[num_avail][i] != UINT_MAX) {
1941                         valid = true;
1942                     }
1943                 }
1944                 if (! valid) {
1945                     break;
1946                 }
1947                 buf[0] = 0;
1948             } else if (!buf[sizeof(buf) - 1]) {
1949                 //
1950                 // The line is longer than the buffer.  Set a flag and don't
1951                 // emit an error if we were going to ignore the line, anyway.
1952                 //
1953                 long_line = true;
1954 
1955 #define CHECK_LINE \
1956     if (long_line) {                                                    \
1957         CLEANUP_THREAD_INFO;                                            \
1958         *msg_id = kmp_i18n_str_LongLineCpuinfo;                         \
1959         return -1;                                                      \
1960     }
1961             }
1962             (*line)++;
1963 
1964             char s1[] = "processor";
1965             if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1966                 CHECK_LINE;
1967                 char *p = strchr(buf + sizeof(s1) - 1, ':');
1968                 unsigned val;
1969                 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1970                 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
1971                 threadInfo[num_avail][osIdIndex] = val;
1972 #if KMP_OS_LINUX && USE_SYSFS_INFO
1973                 char path[256];
1974                 snprintf(path, sizeof(path),
1975                     "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
1976                     threadInfo[num_avail][osIdIndex]);
1977                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
1978 
1979                 snprintf(path, sizeof(path),
1980                     "/sys/devices/system/cpu/cpu%u/topology/core_id",
1981                     threadInfo[num_avail][osIdIndex]);
1982                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
1983                 continue;
1984 #else
1985             }
1986             char s2[] = "physical id";
1987             if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
1988                 CHECK_LINE;
1989                 char *p = strchr(buf + sizeof(s2) - 1, ':');
1990                 unsigned val;
1991                 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1992                 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
1993                 threadInfo[num_avail][pkgIdIndex] = val;
1994                 continue;
1995             }
1996             char s3[] = "core id";
1997             if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
1998                 CHECK_LINE;
1999                 char *p = strchr(buf + sizeof(s3) - 1, ':');
2000                 unsigned val;
2001                 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2002                 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
2003                 threadInfo[num_avail][coreIdIndex] = val;
2004                 continue;
2005 #endif // KMP_OS_LINUX && USE_SYSFS_INFO
2006             }
2007             char s4[] = "thread id";
2008             if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2009                 CHECK_LINE;
2010                 char *p = strchr(buf + sizeof(s4) - 1, ':');
2011                 unsigned val;
2012                 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2013                 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
2014                 threadInfo[num_avail][threadIdIndex] = val;
2015                 continue;
2016             }
2017             unsigned level;
2018             if (sscanf(buf, "node_%d id", &level) == 1) {
2019                 CHECK_LINE;
2020                 char *p = strchr(buf + sizeof(s4) - 1, ':');
2021                 unsigned val;
2022                 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2023                 KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2024                 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
2025                 threadInfo[num_avail][nodeIdIndex + level] = val;
2026                 continue;
2027             }
2028 
2029             //
2030             // We didn't recognize the leading token on the line.
2031             // There are lots of leading tokens that we don't recognize -
2032             // if the line isn't empty, go on to the next line.
2033             //
2034             if ((*buf != 0) && (*buf != '\n')) {
2035                 //
2036                 // If the line is longer than the buffer, read characters
2037                 // until we find a newline.
2038                 //
2039                 if (long_line) {
2040                     int ch;
2041                     while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
2042                 }
2043                 continue;
2044             }
2045 
2046             //
2047             // A newline has signalled the end of the processor record.
2048             // Check that there aren't too many procs specified.
2049             //
2050             if ((int)num_avail == __kmp_xproc) {
2051                 CLEANUP_THREAD_INFO;
2052                 *msg_id = kmp_i18n_str_TooManyEntries;
2053                 return -1;
2054             }
2055 
2056             //
2057             // Check for missing fields.  The osId field must be there, and we
2058             // currently require that the physical id field is specified, also.
2059             //
2060             if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2061                 CLEANUP_THREAD_INFO;
2062                 *msg_id = kmp_i18n_str_MissingProcField;
2063                 return -1;
2064             }
2065             if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2066                 CLEANUP_THREAD_INFO;
2067                 *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2068                 return -1;
2069             }
2070 
2071             //
2072             // Skip this proc if it is not included in the machine model.
2073             //
2074             if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
2075                 INIT_PROC_INFO(threadInfo[num_avail]);
2076                 continue;
2077             }
2078 
2079             //
2080             // We have a successful parse of this proc's info.
2081             // Increment the counter, and prepare for the next proc.
2082             //
2083             num_avail++;
2084             KMP_ASSERT(num_avail <= num_records);
2085             INIT_PROC_INFO(threadInfo[num_avail]);
2086         }
2087         continue;
2088 
2089         no_val:
2090         CLEANUP_THREAD_INFO;
2091         *msg_id = kmp_i18n_str_MissingValCpuinfo;
2092         return -1;
2093 
2094         dup_field:
2095         CLEANUP_THREAD_INFO;
2096         *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2097         return -1;
2098     }
2099     *line = 0;
2100 
2101 # if KMP_MIC && REDUCE_TEAM_SIZE
2102     unsigned teamSize = 0;
2103 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2104 
2105     // check for num_records == __kmp_xproc ???
2106 
2107     //
2108     // If there's only one thread context to bind to, form an Address object
2109     // with depth 1 and return immediately (or, if affinity is off, set
2110     // address2os to NULL and return).
2111     //
2112     // If it is configured to omit the package level when there is only a
2113     // single package, the logic at the end of this routine won't work if
2114     // there is only a single thread - it would try to form an Address
2115     // object with depth 0.
2116     //
2117     KMP_ASSERT(num_avail > 0);
2118     KMP_ASSERT(num_avail <= num_records);
2119     if (num_avail == 1) {
2120         __kmp_ncores = 1;
2121         __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2122         if (__kmp_affinity_verbose) {
2123             if (! KMP_AFFINITY_CAPABLE()) {
2124                 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2125                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2126                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2127             }
2128             else {
2129                 char buf[KMP_AFFIN_MASK_PRINT_LEN];
2130                 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2131                   fullMask);
2132                 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2133                 if (__kmp_affinity_respect_mask) {
2134                     KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2135                 } else {
2136                     KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2137                 }
2138                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2139                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2140             }
2141             int index;
2142             kmp_str_buf_t buf;
2143             __kmp_str_buf_init(&buf);
2144             __kmp_str_buf_print(&buf, "1");
2145             for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2146                 __kmp_str_buf_print(&buf, " x 1");
2147             }
2148             KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2149             __kmp_str_buf_free(&buf);
2150         }
2151 
2152         if (__kmp_affinity_type == affinity_none) {
2153             CLEANUP_THREAD_INFO;
2154             return 0;
2155         }
2156 
2157         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2158         Address addr(1);
2159         addr.labels[0] = threadInfo[0][pkgIdIndex];
2160         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2161 
2162         if (__kmp_affinity_gran_levels < 0) {
2163             __kmp_affinity_gran_levels = 0;
2164         }
2165 
2166         if (__kmp_affinity_verbose) {
2167             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2168         }
2169 
2170         CLEANUP_THREAD_INFO;
2171         return 1;
2172     }
2173 
2174     //
2175     // Sort the threadInfo table by physical Id.
2176     //
2177     qsort(threadInfo, num_avail, sizeof(*threadInfo),
2178       __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2179 
2180     //
2181     // The table is now sorted by pkgId / coreId / threadId, but we really
2182     // don't know the radix of any of the fields.  pkgId's may be sparsely
2183     // assigned among the chips on a system.  Although coreId's are usually
2184     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2185     // [0..threadsPerCore-1], we don't want to make any such assumptions.
2186     //
2187     // For that matter, we don't know what coresPerPkg and threadsPerCore
2188     // (or the total # packages) are at this point - we want to determine
2189     // that now.  We only have an upper bound on the first two figures.
2190     //
2191     unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2192       * sizeof(unsigned));
2193     unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2194       * sizeof(unsigned));
2195     unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2196       * sizeof(unsigned));
2197     unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2198       * sizeof(unsigned));
2199 
2200     bool assign_thread_ids = false;
2201     unsigned threadIdCt;
2202     unsigned index;
2203 
2204     restart_radix_check:
2205     threadIdCt = 0;
2206 
2207     //
2208     // Initialize the counter arrays with data from threadInfo[0].
2209     //
2210     if (assign_thread_ids) {
2211         if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2212             threadInfo[0][threadIdIndex] = threadIdCt++;
2213         }
2214         else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2215             threadIdCt = threadInfo[0][threadIdIndex] + 1;
2216         }
2217     }
2218     for (index = 0; index <= maxIndex; index++) {
2219         counts[index] = 1;
2220         maxCt[index] = 1;
2221         totals[index] = 1;
2222         lastId[index] = threadInfo[0][index];;
2223     }
2224 
2225     //
2226     // Run through the rest of the OS procs.
2227     //
2228     for (i = 1; i < num_avail; i++) {
2229         //
2230         // Find the most significant index whose id differs
2231         // from the id for the previous OS proc.
2232         //
2233         for (index = maxIndex; index >= threadIdIndex; index--) {
2234             if (assign_thread_ids && (index == threadIdIndex)) {
2235                 //
2236                 // Auto-assign the thread id field if it wasn't specified.
2237                 //
2238                 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2239                     threadInfo[i][threadIdIndex] = threadIdCt++;
2240                 }
2241 
2242                 //
2243                 // Aparrently the thread id field was specified for some
2244                 // entries and not others.  Start the thread id counter
2245                 // off at the next higher thread id.
2246                 //
2247                 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2248                     threadIdCt = threadInfo[i][threadIdIndex] + 1;
2249                 }
2250             }
2251             if (threadInfo[i][index] != lastId[index]) {
2252                 //
2253                 // Run through all indices which are less significant,
2254                 // and reset the counts to 1.
2255                 //
2256                 // At all levels up to and including index, we need to
2257                 // increment the totals and record the last id.
2258                 //
2259                 unsigned index2;
2260                 for (index2 = threadIdIndex; index2 < index; index2++) {
2261                     totals[index2]++;
2262                     if (counts[index2] > maxCt[index2]) {
2263                         maxCt[index2] = counts[index2];
2264                     }
2265                     counts[index2] = 1;
2266                     lastId[index2] = threadInfo[i][index2];
2267                 }
2268                 counts[index]++;
2269                 totals[index]++;
2270                 lastId[index] = threadInfo[i][index];
2271 
2272                 if (assign_thread_ids && (index > threadIdIndex)) {
2273 
2274 # if KMP_MIC && REDUCE_TEAM_SIZE
2275                     //
2276                     // The default team size is the total #threads in the machine
2277                     // minus 1 thread for every core that has 3 or more threads.
2278                     //
2279                     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2280 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2281 
2282                     //
2283                     // Restart the thread counter, as we are on a new core.
2284                     //
2285                     threadIdCt = 0;
2286 
2287                     //
2288                     // Auto-assign the thread id field if it wasn't specified.
2289                     //
2290                     if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2291                         threadInfo[i][threadIdIndex] = threadIdCt++;
2292                     }
2293 
2294                     //
2295                     // Aparrently the thread id field was specified for some
2296                     // entries and not others.  Start the thread id counter
2297                     // off at the next higher thread id.
2298                     //
2299                     else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2300                         threadIdCt = threadInfo[i][threadIdIndex] + 1;
2301                     }
2302                 }
2303                 break;
2304             }
2305         }
2306         if (index < threadIdIndex) {
2307             //
2308             // If thread ids were specified, it is an error if they are not
2309             // unique.  Also, check that we waven't already restarted the
2310             // loop (to be safe - shouldn't need to).
2311             //
2312             if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2313               || assign_thread_ids) {
2314                 __kmp_free(lastId);
2315                 __kmp_free(totals);
2316                 __kmp_free(maxCt);
2317                 __kmp_free(counts);
2318                 CLEANUP_THREAD_INFO;
2319                 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2320                 return -1;
2321             }
2322 
2323             //
2324             // If the thread ids were not specified and we see entries
2325             // entries that are duplicates, start the loop over and
2326             // assign the thread ids manually.
2327             //
2328             assign_thread_ids = true;
2329             goto restart_radix_check;
2330         }
2331     }
2332 
2333 # if KMP_MIC && REDUCE_TEAM_SIZE
2334     //
2335     // The default team size is the total #threads in the machine
2336     // minus 1 thread for every core that has 3 or more threads.
2337     //
2338     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2339 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2340 
2341     for (index = threadIdIndex; index <= maxIndex; index++) {
2342         if (counts[index] > maxCt[index]) {
2343             maxCt[index] = counts[index];
2344         }
2345     }
2346 
2347     __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2348     nCoresPerPkg = maxCt[coreIdIndex];
2349     nPackages = totals[pkgIdIndex];
2350 
2351     //
2352     // Check to see if the machine topology is uniform
2353     //
2354     unsigned prod = totals[maxIndex];
2355     for (index = threadIdIndex; index < maxIndex; index++) {
2356        prod *= maxCt[index];
2357     }
2358     bool uniform = (prod == totals[threadIdIndex]);
2359 
2360     //
2361     // When affinity is off, this routine will still be called to set
2362     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
2363     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
2364     // correctly, and return now if affinity is not enabled.
2365     //
2366     __kmp_ncores = totals[coreIdIndex];
2367 
2368     if (__kmp_affinity_verbose) {
2369         if (! KMP_AFFINITY_CAPABLE()) {
2370                 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2371                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2372                 if (uniform) {
2373                     KMP_INFORM(Uniform, "KMP_AFFINITY");
2374                 } else {
2375                     KMP_INFORM(NonUniform, "KMP_AFFINITY");
2376                 }
2377         }
2378         else {
2379             char buf[KMP_AFFIN_MASK_PRINT_LEN];
2380             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
2381                 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2382                 if (__kmp_affinity_respect_mask) {
2383                     KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2384                 } else {
2385                     KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2386                 }
2387                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2388                 if (uniform) {
2389                     KMP_INFORM(Uniform, "KMP_AFFINITY");
2390                 } else {
2391                     KMP_INFORM(NonUniform, "KMP_AFFINITY");
2392                 }
2393         }
2394         kmp_str_buf_t buf;
2395         __kmp_str_buf_init(&buf);
2396 
2397         __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2398         for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2399             __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2400         }
2401         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str,  maxCt[coreIdIndex],
2402           maxCt[threadIdIndex], __kmp_ncores);
2403 
2404         __kmp_str_buf_free(&buf);
2405     }
2406 
2407 # if KMP_MIC && REDUCE_TEAM_SIZE
2408     //
2409     // Set the default team size.
2410     //
2411     if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2412         __kmp_dflt_team_nth = teamSize;
2413         KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2414           __kmp_dflt_team_nth));
2415     }
2416 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2417 
2418     if (__kmp_affinity_type == affinity_none) {
2419         __kmp_free(lastId);
2420         __kmp_free(totals);
2421         __kmp_free(maxCt);
2422         __kmp_free(counts);
2423         CLEANUP_THREAD_INFO;
2424         return 0;
2425     }
2426 
2427     //
2428     // Count the number of levels which have more nodes at that level than
2429     // at the parent's level (with there being an implicit root node of
2430     // the top level).  This is equivalent to saying that there is at least
2431     // one node at this level which has a sibling.  These levels are in the
2432     // map, and the package level is always in the map.
2433     //
2434     bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2435     int level = 0;
2436     for (index = threadIdIndex; index < maxIndex; index++) {
2437         KMP_ASSERT(totals[index] >= totals[index + 1]);
2438         inMap[index] = (totals[index] > totals[index + 1]);
2439     }
2440     inMap[maxIndex] = (totals[maxIndex] > 1);
2441     inMap[pkgIdIndex] = true;
2442 
2443     int depth = 0;
2444     for (index = threadIdIndex; index <= maxIndex; index++) {
2445         if (inMap[index]) {
2446             depth++;
2447         }
2448     }
2449     KMP_ASSERT(depth > 0);
2450 
2451     //
2452     // Construct the data structure that is to be returned.
2453     //
2454     *address2os = (AddrUnsPair*)
2455       __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2456     int pkgLevel = -1;
2457     int coreLevel = -1;
2458     int threadLevel = -1;
2459 
2460     for (i = 0; i < num_avail; ++i) {
2461         Address addr(depth);
2462         unsigned os = threadInfo[i][osIdIndex];
2463         int src_index;
2464         int dst_index = 0;
2465 
2466         for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2467             if (! inMap[src_index]) {
2468                 continue;
2469             }
2470             addr.labels[dst_index] = threadInfo[i][src_index];
2471             if (src_index == pkgIdIndex) {
2472                 pkgLevel = dst_index;
2473             }
2474             else if (src_index == coreIdIndex) {
2475                 coreLevel = dst_index;
2476             }
2477             else if (src_index == threadIdIndex) {
2478                 threadLevel = dst_index;
2479             }
2480             dst_index++;
2481         }
2482         (*address2os)[i] = AddrUnsPair(addr, os);
2483     }
2484 
2485     if (__kmp_affinity_gran_levels < 0) {
2486         //
2487         // Set the granularity level based on what levels are modeled
2488         // in the machine topology map.
2489         //
2490         unsigned src_index;
2491         __kmp_affinity_gran_levels = 0;
2492         for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2493             if (! inMap[src_index]) {
2494                 continue;
2495             }
2496             switch (src_index) {
2497                 case threadIdIndex:
2498                 if (__kmp_affinity_gran > affinity_gran_thread) {
2499                     __kmp_affinity_gran_levels++;
2500                 }
2501 
2502                 break;
2503                 case coreIdIndex:
2504                 if (__kmp_affinity_gran > affinity_gran_core) {
2505                     __kmp_affinity_gran_levels++;
2506                 }
2507                 break;
2508 
2509                 case pkgIdIndex:
2510                 if (__kmp_affinity_gran > affinity_gran_package) {
2511                     __kmp_affinity_gran_levels++;
2512                 }
2513                 break;
2514             }
2515         }
2516     }
2517 
2518     if (__kmp_affinity_verbose) {
2519         __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2520           coreLevel, threadLevel);
2521     }
2522 
2523     __kmp_free(inMap);
2524     __kmp_free(lastId);
2525     __kmp_free(totals);
2526     __kmp_free(maxCt);
2527     __kmp_free(counts);
2528     CLEANUP_THREAD_INFO;
2529     return depth;
2530 }
2531 
2532 
2533 //
2534 // Create and return a table of affinity masks, indexed by OS thread ID.
2535 // This routine handles OR'ing together all the affinity masks of threads
2536 // that are sufficiently close, if granularity > fine.
2537 //
2538 static kmp_affin_mask_t *
2539 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2540   AddrUnsPair *address2os, unsigned numAddrs)
2541 {
2542     //
2543     // First form a table of affinity masks in order of OS thread id.
2544     //
2545     unsigned depth;
2546     unsigned maxOsId;
2547     unsigned i;
2548 
2549     KMP_ASSERT(numAddrs > 0);
2550     depth = address2os[0].first.depth;
2551 
2552     maxOsId = 0;
2553     for (i = 0; i < numAddrs; i++) {
2554         unsigned osId = address2os[i].second;
2555         if (osId > maxOsId) {
2556             maxOsId = osId;
2557         }
2558     }
2559     kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
2560       (maxOsId + 1) * __kmp_affin_mask_size);
2561 
2562     //
2563     // Sort the address2os table according to physical order.  Doing so
2564     // will put all threads on the same core/package/node in consecutive
2565     // locations.
2566     //
2567     qsort(address2os, numAddrs, sizeof(*address2os),
2568       __kmp_affinity_cmp_Address_labels);
2569 
2570     KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2571     if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2572         KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY",  __kmp_affinity_gran_levels);
2573     }
2574     if (__kmp_affinity_gran_levels >= (int)depth) {
2575         if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2576           && (__kmp_affinity_type != affinity_none))) {
2577             KMP_WARNING(AffThreadsMayMigrate);
2578         }
2579     }
2580 
2581     //
2582     // Run through the table, forming the masks for all threads on each
2583     // core.  Threads on the same core will have identical "Address"
2584     // objects, not considering the last level, which must be the thread
2585     // id.  All threads on a core will appear consecutively.
2586     //
2587     unsigned unique = 0;
2588     unsigned j = 0;                             // index of 1st thread on core
2589     unsigned leader = 0;
2590     Address *leaderAddr = &(address2os[0].first);
2591     kmp_affin_mask_t *sum
2592       = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
2593     KMP_CPU_ZERO(sum);
2594     KMP_CPU_SET(address2os[0].second, sum);
2595     for (i = 1; i < numAddrs; i++) {
2596         //
2597         // If this thread is sufficiently close to the leader (within the
2598         // granularity setting), then set the bit for this os thread in the
2599         // affinity mask for this group, and go on to the next thread.
2600         //
2601         if (leaderAddr->isClose(address2os[i].first,
2602           __kmp_affinity_gran_levels)) {
2603             KMP_CPU_SET(address2os[i].second, sum);
2604             continue;
2605         }
2606 
2607         //
2608         // For every thread in this group, copy the mask to the thread's
2609         // entry in the osId2Mask table.  Mark the first address as a
2610         // leader.
2611         //
2612         for (; j < i; j++) {
2613             unsigned osId = address2os[j].second;
2614             KMP_DEBUG_ASSERT(osId <= maxOsId);
2615             kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2616             KMP_CPU_COPY(mask, sum);
2617             address2os[j].first.leader = (j == leader);
2618         }
2619         unique++;
2620 
2621         //
2622         // Start a new mask.
2623         //
2624         leader = i;
2625         leaderAddr = &(address2os[i].first);
2626         KMP_CPU_ZERO(sum);
2627         KMP_CPU_SET(address2os[i].second, sum);
2628     }
2629 
2630     //
2631     // For every thread in last group, copy the mask to the thread's
2632     // entry in the osId2Mask table.
2633     //
2634     for (; j < i; j++) {
2635         unsigned osId = address2os[j].second;
2636         KMP_DEBUG_ASSERT(osId <= maxOsId);
2637         kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2638         KMP_CPU_COPY(mask, sum);
2639         address2os[j].first.leader = (j == leader);
2640     }
2641     unique++;
2642 
2643     *maxIndex = maxOsId;
2644     *numUnique = unique;
2645     return osId2Mask;
2646 }
2647 
2648 
2649 //
2650 // Stuff for the affinity proclist parsers.  It's easier to declare these vars
2651 // as file-static than to try and pass them through the calling sequence of
2652 // the recursive-descent OMP_PLACES parser.
2653 //
2654 static kmp_affin_mask_t *newMasks;
2655 static int numNewMasks;
2656 static int nextNewMask;
2657 
2658 #define ADD_MASK(_mask) \
2659     {                                                                   \
2660         if (nextNewMask >= numNewMasks) {                               \
2661             numNewMasks *= 2;                                           \
2662             newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
2663               numNewMasks * __kmp_affin_mask_size);                     \
2664         }                                                               \
2665         KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));    \
2666         nextNewMask++;                                                  \
2667     }
2668 
2669 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2670     {                                                                   \
2671         if (((_osId) > _maxOsId) ||                                     \
2672           (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
2673             if (__kmp_affinity_verbose || (__kmp_affinity_warnings      \
2674               && (__kmp_affinity_type != affinity_none))) {             \
2675                 KMP_WARNING(AffIgnoreInvalidProcID, _osId);             \
2676             }                                                           \
2677         }                                                               \
2678         else {                                                          \
2679             ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));               \
2680         }                                                               \
2681     }
2682 
2683 
2684 //
2685 // Re-parse the proclist (for the explicit affinity type), and form the list
2686 // of affinity newMasks indexed by gtid.
2687 //
2688 static void
2689 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2690   unsigned int *out_numMasks, const char *proclist,
2691   kmp_affin_mask_t *osId2Mask, int maxOsId)
2692 {
2693     const char *scan = proclist;
2694     const char *next = proclist;
2695 
2696     //
2697     // We use malloc() for the temporary mask vector,
2698     // so that we can use realloc() to extend it.
2699     //
2700     numNewMasks = 2;
2701     newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2702       * __kmp_affin_mask_size);
2703     nextNewMask = 0;
2704     kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
2705       __kmp_affin_mask_size);
2706     int setSize = 0;
2707 
2708     for (;;) {
2709         int start, end, stride;
2710 
2711         SKIP_WS(scan);
2712         next = scan;
2713         if (*next == '\0') {
2714             break;
2715         }
2716 
2717         if (*next == '{') {
2718             int num;
2719             setSize = 0;
2720             next++;     // skip '{'
2721             SKIP_WS(next);
2722             scan = next;
2723 
2724             //
2725             // Read the first integer in the set.
2726             //
2727             KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2728               "bad proclist");
2729             SKIP_DIGITS(next);
2730             num = __kmp_str_to_int(scan, *next);
2731             KMP_ASSERT2(num >= 0, "bad explicit proc list");
2732 
2733             //
2734             // Copy the mask for that osId to the sum (union) mask.
2735             //
2736             if ((num > maxOsId) ||
2737               (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2738                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2739                   && (__kmp_affinity_type != affinity_none))) {
2740                     KMP_WARNING(AffIgnoreInvalidProcID, num);
2741                 }
2742                 KMP_CPU_ZERO(sumMask);
2743             }
2744             else {
2745                 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2746                 setSize = 1;
2747             }
2748 
2749             for (;;) {
2750                 //
2751                 // Check for end of set.
2752                 //
2753                 SKIP_WS(next);
2754                 if (*next == '}') {
2755                     next++;     // skip '}'
2756                     break;
2757                 }
2758 
2759                 //
2760                 // Skip optional comma.
2761                 //
2762                 if (*next == ',') {
2763                     next++;
2764                 }
2765                 SKIP_WS(next);
2766 
2767                 //
2768                 // Read the next integer in the set.
2769                 //
2770                 scan = next;
2771                 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2772                   "bad explicit proc list");
2773 
2774                 SKIP_DIGITS(next);
2775                 num = __kmp_str_to_int(scan, *next);
2776                 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2777 
2778                 //
2779                 // Add the mask for that osId to the sum mask.
2780                 //
2781                 if ((num > maxOsId) ||
2782                   (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2783                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2784                       && (__kmp_affinity_type != affinity_none))) {
2785                         KMP_WARNING(AffIgnoreInvalidProcID, num);
2786                     }
2787                 }
2788                 else {
2789                     KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2790                     setSize++;
2791                 }
2792             }
2793             if (setSize > 0) {
2794                 ADD_MASK(sumMask);
2795             }
2796 
2797             SKIP_WS(next);
2798             if (*next == ',') {
2799                 next++;
2800             }
2801             scan = next;
2802             continue;
2803         }
2804 
2805         //
2806         // Read the first integer.
2807         //
2808         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2809         SKIP_DIGITS(next);
2810         start = __kmp_str_to_int(scan, *next);
2811         KMP_ASSERT2(start >= 0, "bad explicit proc list");
2812         SKIP_WS(next);
2813 
2814         //
2815         // If this isn't a range, then add a mask to the list and go on.
2816         //
2817         if (*next != '-') {
2818             ADD_MASK_OSID(start, osId2Mask, maxOsId);
2819 
2820             //
2821             // Skip optional comma.
2822             //
2823             if (*next == ',') {
2824                 next++;
2825             }
2826             scan = next;
2827             continue;
2828         }
2829 
2830         //
2831         // This is a range.  Skip over the '-' and read in the 2nd int.
2832         //
2833         next++;         // skip '-'
2834         SKIP_WS(next);
2835         scan = next;
2836         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2837         SKIP_DIGITS(next);
2838         end = __kmp_str_to_int(scan, *next);
2839         KMP_ASSERT2(end >= 0, "bad explicit proc list");
2840 
2841         //
2842         // Check for a stride parameter
2843         //
2844         stride = 1;
2845         SKIP_WS(next);
2846         if (*next == ':') {
2847             //
2848             // A stride is specified.  Skip over the ':" and read the 3rd int.
2849             //
2850             int sign = +1;
2851             next++;         // skip ':'
2852             SKIP_WS(next);
2853             scan = next;
2854             if (*next == '-') {
2855                 sign = -1;
2856                 next++;
2857                 SKIP_WS(next);
2858                 scan = next;
2859             }
2860             KMP_ASSERT2((*next >=  '0') && (*next <= '9'),
2861               "bad explicit proc list");
2862             SKIP_DIGITS(next);
2863             stride = __kmp_str_to_int(scan, *next);
2864             KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2865             stride *= sign;
2866         }
2867 
2868         //
2869         // Do some range checks.
2870         //
2871         KMP_ASSERT2(stride != 0, "bad explicit proc list");
2872         if (stride > 0) {
2873             KMP_ASSERT2(start <= end, "bad explicit proc list");
2874         }
2875         else {
2876             KMP_ASSERT2(start >= end, "bad explicit proc list");
2877         }
2878         KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2879 
2880         //
2881         // Add the mask for each OS proc # to the list.
2882         //
2883         if (stride > 0) {
2884             do {
2885                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2886                 start += stride;
2887             } while (start <= end);
2888         }
2889         else {
2890             do {
2891                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2892                 start += stride;
2893             } while (start >= end);
2894         }
2895 
2896         //
2897         // Skip optional comma.
2898         //
2899         SKIP_WS(next);
2900         if (*next == ',') {
2901             next++;
2902         }
2903         scan = next;
2904     }
2905 
2906     *out_numMasks = nextNewMask;
2907     if (nextNewMask == 0) {
2908         *out_masks = NULL;
2909         KMP_INTERNAL_FREE(newMasks);
2910         return;
2911     }
2912     *out_masks
2913       = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
2914     memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
2915     __kmp_free(sumMask);
2916     KMP_INTERNAL_FREE(newMasks);
2917 }
2918 
2919 
2920 # if OMP_40_ENABLED
2921 
2922 /*-----------------------------------------------------------------------------
2923 
2924 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
2925 places.  Again, Here is the grammar:
2926 
2927 place_list := place
2928 place_list := place , place_list
2929 place := num
2930 place := place : num
2931 place := place : num : signed
2932 place := { subplacelist }
2933 place := ! place                  // (lowest priority)
2934 subplace_list := subplace
2935 subplace_list := subplace , subplace_list
2936 subplace := num
2937 subplace := num : num
2938 subplace := num : num : signed
2939 signed := num
2940 signed := + signed
2941 signed := - signed
2942 
2943 -----------------------------------------------------------------------------*/
2944 
2945 static void
2946 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
2947   int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
2948 {
2949     const char *next;
2950 
2951     for (;;) {
2952         int start, count, stride, i;
2953 
2954         //
2955         // Read in the starting proc id
2956         //
2957         SKIP_WS(*scan);
2958         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2959           "bad explicit places list");
2960         next = *scan;
2961         SKIP_DIGITS(next);
2962         start = __kmp_str_to_int(*scan, *next);
2963         KMP_ASSERT(start >= 0);
2964         *scan = next;
2965 
2966         //
2967         // valid follow sets are ',' ':' and '}'
2968         //
2969         SKIP_WS(*scan);
2970         if (**scan == '}' || **scan == ',') {
2971             if ((start > maxOsId) ||
2972               (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2973                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2974                   && (__kmp_affinity_type != affinity_none))) {
2975                     KMP_WARNING(AffIgnoreInvalidProcID, start);
2976                 }
2977             }
2978             else {
2979                 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2980                 (*setSize)++;
2981             }
2982             if (**scan == '}') {
2983                 break;
2984             }
2985             (*scan)++;  // skip ','
2986             continue;
2987         }
2988         KMP_ASSERT2(**scan == ':', "bad explicit places list");
2989         (*scan)++;      // skip ':'
2990 
2991         //
2992         // Read count parameter
2993         //
2994         SKIP_WS(*scan);
2995         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2996           "bad explicit places list");
2997         next = *scan;
2998         SKIP_DIGITS(next);
2999         count = __kmp_str_to_int(*scan, *next);
3000         KMP_ASSERT(count >= 0);
3001         *scan = next;
3002 
3003         //
3004         // valid follow sets are ',' ':' and '}'
3005         //
3006         SKIP_WS(*scan);
3007         if (**scan == '}' || **scan == ',') {
3008             for (i = 0; i < count; i++) {
3009                 if ((start > maxOsId) ||
3010                   (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3011                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3012                       && (__kmp_affinity_type != affinity_none))) {
3013                         KMP_WARNING(AffIgnoreInvalidProcID, start);
3014                     }
3015                     break;  // don't proliferate warnings for large count
3016                 }
3017                 else {
3018                     KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3019                     start++;
3020                     (*setSize)++;
3021                 }
3022             }
3023             if (**scan == '}') {
3024                 break;
3025             }
3026             (*scan)++;  // skip ','
3027             continue;
3028         }
3029         KMP_ASSERT2(**scan == ':', "bad explicit places list");
3030         (*scan)++;      // skip ':'
3031 
3032         //
3033         // Read stride parameter
3034         //
3035         int sign = +1;
3036         for (;;) {
3037             SKIP_WS(*scan);
3038             if (**scan == '+') {
3039                 (*scan)++; // skip '+'
3040                 continue;
3041             }
3042             if (**scan == '-') {
3043                 sign *= -1;
3044                 (*scan)++; // skip '-'
3045                 continue;
3046             }
3047             break;
3048         }
3049         SKIP_WS(*scan);
3050         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3051           "bad explicit places list");
3052         next = *scan;
3053         SKIP_DIGITS(next);
3054         stride = __kmp_str_to_int(*scan, *next);
3055         KMP_ASSERT(stride >= 0);
3056         *scan = next;
3057         stride *= sign;
3058 
3059         //
3060         // valid follow sets are ',' and '}'
3061         //
3062         SKIP_WS(*scan);
3063         if (**scan == '}' || **scan == ',') {
3064             for (i = 0; i < count; i++) {
3065                 if ((start > maxOsId) ||
3066                   (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3067                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3068                       && (__kmp_affinity_type != affinity_none))) {
3069                         KMP_WARNING(AffIgnoreInvalidProcID, start);
3070                     }
3071                     break;  // don't proliferate warnings for large count
3072                 }
3073                 else {
3074                     KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3075                     start += stride;
3076                     (*setSize)++;
3077                 }
3078             }
3079             if (**scan == '}') {
3080                 break;
3081             }
3082             (*scan)++;  // skip ','
3083             continue;
3084         }
3085 
3086         KMP_ASSERT2(0, "bad explicit places list");
3087     }
3088 }
3089 
3090 
3091 static void
3092 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3093   int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3094 {
3095     const char *next;
3096 
3097     //
3098     // valid follow sets are '{' '!' and num
3099     //
3100     SKIP_WS(*scan);
3101     if (**scan == '{') {
3102         (*scan)++;      // skip '{'
3103         __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
3104           setSize);
3105         KMP_ASSERT2(**scan == '}', "bad explicit places list");
3106         (*scan)++;      // skip '}'
3107     }
3108     else if (**scan == '!') {
3109         __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3110         KMP_CPU_COMPLEMENT(tempMask);
3111         (*scan)++;      // skip '!'
3112     }
3113     else if ((**scan >= '0') && (**scan <= '9')) {
3114         next = *scan;
3115         SKIP_DIGITS(next);
3116         int num = __kmp_str_to_int(*scan, *next);
3117         KMP_ASSERT(num >= 0);
3118         if ((num > maxOsId) ||
3119           (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3120             if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3121               && (__kmp_affinity_type != affinity_none))) {
3122                 KMP_WARNING(AffIgnoreInvalidProcID, num);
3123             }
3124         }
3125         else {
3126             KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3127             (*setSize)++;
3128         }
3129         *scan = next;  // skip num
3130     }
3131     else {
3132         KMP_ASSERT2(0, "bad explicit places list");
3133     }
3134 }
3135 
3136 
3137 //static void
3138 void
3139 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3140   unsigned int *out_numMasks, const char *placelist,
3141   kmp_affin_mask_t *osId2Mask, int maxOsId)
3142 {
3143     const char *scan = placelist;
3144     const char *next = placelist;
3145 
3146     numNewMasks = 2;
3147     newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
3148       * __kmp_affin_mask_size);
3149     nextNewMask = 0;
3150 
3151     kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
3152       __kmp_affin_mask_size);
3153     KMP_CPU_ZERO(tempMask);
3154     int setSize = 0;
3155 
3156     for (;;) {
3157         __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3158 
3159         //
3160         // valid follow sets are ',' ':' and EOL
3161         //
3162         SKIP_WS(scan);
3163         if (*scan == '\0' || *scan == ',') {
3164             if (setSize > 0) {
3165                 ADD_MASK(tempMask);
3166             }
3167             KMP_CPU_ZERO(tempMask);
3168             setSize = 0;
3169             if (*scan == '\0') {
3170                 break;
3171             }
3172             scan++;     // skip ','
3173             continue;
3174         }
3175 
3176         KMP_ASSERT2(*scan == ':', "bad explicit places list");
3177         scan++;         // skip ':'
3178 
3179         //
3180         // Read count parameter
3181         //
3182         SKIP_WS(scan);
3183         KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3184           "bad explicit places list");
3185         next = scan;
3186         SKIP_DIGITS(next);
3187         int count = __kmp_str_to_int(scan, *next);
3188         KMP_ASSERT(count >= 0);
3189         scan = next;
3190 
3191         //
3192         // valid follow sets are ',' ':' and EOL
3193         //
3194         SKIP_WS(scan);
3195         int stride;
3196         if (*scan == '\0' || *scan == ',') {
3197             stride = +1;
3198         }
3199         else {
3200             KMP_ASSERT2(*scan == ':', "bad explicit places list");
3201             scan++;         // skip ':'
3202 
3203             //
3204             // Read stride parameter
3205             //
3206             int sign = +1;
3207             for (;;) {
3208                 SKIP_WS(scan);
3209                 if (*scan == '+') {
3210                     scan++; // skip '+'
3211                     continue;
3212                 }
3213                 if (*scan == '-') {
3214                     sign *= -1;
3215                     scan++; // skip '-'
3216                     continue;
3217                 }
3218                 break;
3219             }
3220             SKIP_WS(scan);
3221             KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3222               "bad explicit places list");
3223             next = scan;
3224             SKIP_DIGITS(next);
3225             stride = __kmp_str_to_int(scan, *next);
3226             KMP_DEBUG_ASSERT(stride >= 0);
3227             scan = next;
3228             stride *= sign;
3229         }
3230 
3231         if (stride > 0) {
3232             int i;
3233             for (i = 0; i < count; i++) {
3234                 int j;
3235                 if (setSize == 0) {
3236                     break;
3237                 }
3238                 ADD_MASK(tempMask);
3239                 setSize = 0;
3240                 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
3241                     if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3242                         KMP_CPU_CLR(j, tempMask);
3243                     }
3244                     else if ((j > maxOsId) ||
3245                       (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3246                         if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3247                           && (__kmp_affinity_type != affinity_none))) {
3248                             KMP_WARNING(AffIgnoreInvalidProcID, j);
3249                         }
3250                         KMP_CPU_CLR(j, tempMask);
3251                     }
3252                     else {
3253                         KMP_CPU_SET(j, tempMask);
3254                         setSize++;
3255                     }
3256                 }
3257                 for (; j >= 0; j--) {
3258                     KMP_CPU_CLR(j, tempMask);
3259                 }
3260             }
3261         }
3262         else {
3263             int i;
3264             for (i = 0; i < count; i++) {
3265                 int j;
3266                 if (setSize == 0) {
3267                     break;
3268                 }
3269                 ADD_MASK(tempMask);
3270                 setSize = 0;
3271                 for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride;
3272                   j++) {
3273                     if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3274                         KMP_CPU_CLR(j, tempMask);
3275                     }
3276                     else if ((j > maxOsId) ||
3277                       (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3278                         if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3279                           && (__kmp_affinity_type != affinity_none))) {
3280                             KMP_WARNING(AffIgnoreInvalidProcID, j);
3281                         }
3282                         KMP_CPU_CLR(j, tempMask);
3283                     }
3284                     else {
3285                         KMP_CPU_SET(j, tempMask);
3286                         setSize++;
3287                     }
3288                 }
3289                 for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
3290                     KMP_CPU_CLR(j, tempMask);
3291                 }
3292             }
3293         }
3294         KMP_CPU_ZERO(tempMask);
3295         setSize = 0;
3296 
3297         //
3298         // valid follow sets are ',' and EOL
3299         //
3300         SKIP_WS(scan);
3301         if (*scan == '\0') {
3302             break;
3303         }
3304         if (*scan == ',') {
3305             scan++;     // skip ','
3306             continue;
3307         }
3308 
3309         KMP_ASSERT2(0, "bad explicit places list");
3310     }
3311 
3312     *out_numMasks = nextNewMask;
3313     if (nextNewMask == 0) {
3314         *out_masks = NULL;
3315         KMP_INTERNAL_FREE(newMasks);
3316         return;
3317     }
3318     *out_masks
3319       = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
3320     memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
3321     __kmp_free(tempMask);
3322     KMP_INTERNAL_FREE(newMasks);
3323 }
3324 
3325 # endif /* OMP_40_ENABLED */
3326 
3327 #undef ADD_MASK
3328 #undef ADD_MASK_OSID
3329 
3330 static void
3331 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3332 {
3333     if ( __kmp_place_num_cores == 0 ) {
3334         if ( __kmp_place_num_threads_per_core == 0 ) {
3335             return;   // no cores limiting actions requested, exit
3336         }
3337         __kmp_place_num_cores = nCoresPerPkg;   // use all available cores
3338     }
3339     if ( !__kmp_affinity_uniform_topology() ) {
3340         KMP_WARNING( AffThrPlaceNonUniform );
3341         return; // don't support non-uniform topology
3342     }
3343     if ( depth != 3 ) {
3344         KMP_WARNING( AffThrPlaceNonThreeLevel );
3345         return; // don't support not-3-level topology
3346     }
3347     if ( __kmp_place_num_threads_per_core == 0 ) {
3348         __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore;  // use all HW contexts
3349     }
3350     if ( __kmp_place_core_offset + __kmp_place_num_cores > (unsigned int)nCoresPerPkg ) {
3351         KMP_WARNING( AffThrPlaceManyCores );
3352         return;
3353     }
3354 
3355     AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3356                             nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3357     int i, j, k, n_old = 0, n_new = 0;
3358     for ( i = 0; i < nPackages; ++i ) {
3359         for ( j = 0; j < nCoresPerPkg; ++j ) {
3360             if ( (unsigned int)j < __kmp_place_core_offset || (unsigned int)j >= __kmp_place_core_offset + __kmp_place_num_cores ) {
3361                 n_old += __kmp_nThreadsPerCore;   // skip not-requested core
3362             } else {
3363                 for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) {
3364                     if ( (unsigned int)k < __kmp_place_num_threads_per_core ) {
3365                         newAddr[n_new] = (*pAddr)[n_old];   // copy requested core' data to new location
3366                         n_new++;
3367                     }
3368                     n_old++;
3369                 }
3370             }
3371         }
3372     }
3373     nCoresPerPkg = __kmp_place_num_cores;                     // correct nCoresPerPkg
3374     __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3375     __kmp_avail_proc = n_new;                                 // correct avail_proc
3376     __kmp_ncores = nPackages * __kmp_place_num_cores;         // correct ncores
3377 
3378     __kmp_free( *pAddr );
3379     *pAddr = newAddr;      // replace old topology with new one
3380 }
3381 
3382 
3383 static AddrUnsPair *address2os = NULL;
3384 static int           * procarr = NULL;
3385 static int     __kmp_aff_depth = 0;
3386 
3387 static void
3388 __kmp_aux_affinity_initialize(void)
3389 {
3390     if (__kmp_affinity_masks != NULL) {
3391         KMP_ASSERT(fullMask != NULL);
3392         return;
3393     }
3394 
3395     //
3396     // Create the "full" mask - this defines all of the processors that we
3397     // consider to be in the machine model.  If respect is set, then it is
3398     // the initialization thread's affinity mask.  Otherwise, it is all
3399     // processors that we know about on the machine.
3400     //
3401     if (fullMask == NULL) {
3402         fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
3403     }
3404     if (KMP_AFFINITY_CAPABLE()) {
3405         if (__kmp_affinity_respect_mask) {
3406             __kmp_get_system_affinity(fullMask, TRUE);
3407 
3408             //
3409             // Count the number of available processors.
3410             //
3411             unsigned i;
3412             __kmp_avail_proc = 0;
3413             for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
3414                 if (! KMP_CPU_ISSET(i, fullMask)) {
3415                     continue;
3416                 }
3417                 __kmp_avail_proc++;
3418             }
3419             if (__kmp_avail_proc > __kmp_xproc) {
3420                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3421                   && (__kmp_affinity_type != affinity_none))) {
3422                     KMP_WARNING(ErrorInitializeAffinity);
3423                 }
3424                 __kmp_affinity_type = affinity_none;
3425                 __kmp_affin_mask_size = 0;
3426                 return;
3427             }
3428         }
3429         else {
3430             __kmp_affinity_entire_machine_mask(fullMask);
3431             __kmp_avail_proc = __kmp_xproc;
3432         }
3433     }
3434 
3435     int depth = -1;
3436     kmp_i18n_id_t msg_id = kmp_i18n_null;
3437 
3438     //
3439     // For backward compatibility, setting KMP_CPUINFO_FILE =>
3440     // KMP_TOPOLOGY_METHOD=cpuinfo
3441     //
3442     if ((__kmp_cpuinfo_file != NULL) &&
3443       (__kmp_affinity_top_method == affinity_top_method_all)) {
3444         __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3445     }
3446 
3447     if (__kmp_affinity_top_method == affinity_top_method_all) {
3448         //
3449         // In the default code path, errors are not fatal - we just try using
3450         // another method.  We only emit a warning message if affinity is on,
3451         // or the verbose flag is set, an the nowarnings flag was not set.
3452         //
3453         const char *file_name = NULL;
3454         int line = 0;
3455 
3456 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3457 
3458         if (__kmp_affinity_verbose) {
3459             KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3460         }
3461 
3462         file_name = NULL;
3463         depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3464         if (depth == 0) {
3465             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3466             KMP_ASSERT(address2os == NULL);
3467             return;
3468         }
3469 
3470         if (depth < 0) {
3471             if (__kmp_affinity_verbose) {
3472                 if (msg_id != kmp_i18n_null) {
3473                     KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3474                       KMP_I18N_STR(DecodingLegacyAPIC));
3475                 }
3476                 else {
3477                     KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
3478                 }
3479             }
3480 
3481             file_name = NULL;
3482             depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3483             if (depth == 0) {
3484                 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3485                 KMP_ASSERT(address2os == NULL);
3486                 return;
3487             }
3488         }
3489 
3490 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3491 
3492 # if KMP_OS_LINUX
3493 
3494         if (depth < 0) {
3495             if (__kmp_affinity_verbose) {
3496                 if (msg_id != kmp_i18n_null) {
3497                     KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3498                 }
3499                 else {
3500                     KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3501                 }
3502             }
3503 
3504             FILE *f = fopen("/proc/cpuinfo", "r");
3505             if (f == NULL) {
3506                 msg_id = kmp_i18n_str_CantOpenCpuinfo;
3507             }
3508             else {
3509                 file_name = "/proc/cpuinfo";
3510                 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3511                 fclose(f);
3512                 if (depth == 0) {
3513                     KMP_ASSERT(__kmp_affinity_type == affinity_none);
3514                     KMP_ASSERT(address2os == NULL);
3515                     return;
3516                 }
3517             }
3518         }
3519 
3520 # endif /* KMP_OS_LINUX */
3521 
3522 # if KMP_GROUP_AFFINITY
3523 
3524         if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
3525             if (__kmp_affinity_verbose) {
3526                 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3527             }
3528 
3529             depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3530             KMP_ASSERT(depth != 0);
3531         }
3532 
3533 # endif /* KMP_GROUP_AFFINITY */
3534 
3535         if (depth < 0) {
3536             if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
3537                 if (file_name == NULL) {
3538                     KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
3539                 }
3540                 else if (line == 0) {
3541                     KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
3542                 }
3543                 else {
3544                     KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
3545                 }
3546             }
3547             // FIXME - print msg if msg_id = kmp_i18n_null ???
3548 
3549             file_name = "";
3550             depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3551             if (depth == 0) {
3552                 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3553                 KMP_ASSERT(address2os == NULL);
3554                 return;
3555             }
3556             KMP_ASSERT(depth > 0);
3557             KMP_ASSERT(address2os != NULL);
3558         }
3559     }
3560 
3561     //
3562     // If the user has specified that a paricular topology discovery method
3563     // is to be used, then we abort if that method fails.  The exception is
3564     // group affinity, which might have been implicitly set.
3565     //
3566 
3567 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3568 
3569     else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3570         if (__kmp_affinity_verbose) {
3571             KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3572               KMP_I18N_STR(Decodingx2APIC));
3573         }
3574 
3575         depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3576         if (depth == 0) {
3577             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3578             KMP_ASSERT(address2os == NULL);
3579             return;
3580         }
3581         if (depth < 0) {
3582             KMP_ASSERT(msg_id != kmp_i18n_null);
3583             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3584         }
3585     }
3586     else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3587         if (__kmp_affinity_verbose) {
3588             KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3589               KMP_I18N_STR(DecodingLegacyAPIC));
3590         }
3591 
3592         depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3593         if (depth == 0) {
3594             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3595             KMP_ASSERT(address2os == NULL);
3596             return;
3597         }
3598         if (depth < 0) {
3599             KMP_ASSERT(msg_id != kmp_i18n_null);
3600             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3601         }
3602     }
3603 
3604 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3605 
3606     else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3607         const char *filename;
3608         if (__kmp_cpuinfo_file != NULL) {
3609             filename = __kmp_cpuinfo_file;
3610         }
3611         else {
3612             filename = "/proc/cpuinfo";
3613         }
3614 
3615         if (__kmp_affinity_verbose) {
3616             KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
3617         }
3618 
3619         FILE *f = fopen(filename, "r");
3620         if (f == NULL) {
3621             int code = errno;
3622             if (__kmp_cpuinfo_file != NULL) {
3623                 __kmp_msg(
3624                     kmp_ms_fatal,
3625                     KMP_MSG(CantOpenFileForReading, filename),
3626                     KMP_ERR(code),
3627                     KMP_HNT(NameComesFrom_CPUINFO_FILE),
3628                     __kmp_msg_null
3629                 );
3630             }
3631             else {
3632                 __kmp_msg(
3633                     kmp_ms_fatal,
3634                     KMP_MSG(CantOpenFileForReading, filename),
3635                     KMP_ERR(code),
3636                     __kmp_msg_null
3637                 );
3638             }
3639         }
3640         int line = 0;
3641         depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3642         fclose(f);
3643         if (depth < 0) {
3644             KMP_ASSERT(msg_id != kmp_i18n_null);
3645             if (line > 0) {
3646                 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
3647             }
3648             else {
3649                 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3650             }
3651         }
3652         if (__kmp_affinity_type == affinity_none) {
3653             KMP_ASSERT(depth == 0);
3654             KMP_ASSERT(address2os == NULL);
3655             return;
3656         }
3657     }
3658 
3659 # if KMP_GROUP_AFFINITY
3660 
3661     else if (__kmp_affinity_top_method == affinity_top_method_group) {
3662         if (__kmp_affinity_verbose) {
3663             KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3664         }
3665 
3666         depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3667         KMP_ASSERT(depth != 0);
3668         if (depth < 0) {
3669             KMP_ASSERT(msg_id != kmp_i18n_null);
3670             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3671         }
3672     }
3673 
3674 # endif /* KMP_GROUP_AFFINITY */
3675 
3676     else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3677         if (__kmp_affinity_verbose) {
3678             KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
3679         }
3680 
3681         depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3682         if (depth == 0) {
3683             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3684             KMP_ASSERT(address2os == NULL);
3685             return;
3686         }
3687         // should not fail
3688         KMP_ASSERT(depth > 0);
3689         KMP_ASSERT(address2os != NULL);
3690     }
3691 
3692     if (address2os == NULL) {
3693         if (KMP_AFFINITY_CAPABLE()
3694           && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3695           && (__kmp_affinity_type != affinity_none)))) {
3696             KMP_WARNING(ErrorInitializeAffinity);
3697         }
3698         __kmp_affinity_type = affinity_none;
3699         __kmp_affin_mask_size = 0;
3700         return;
3701     }
3702 
3703     __kmp_apply_thread_places(&address2os, depth);
3704 
3705     //
3706     // Create the table of masks, indexed by thread Id.
3707     //
3708     unsigned maxIndex;
3709     unsigned numUnique;
3710     kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
3711       address2os, __kmp_avail_proc);
3712     if (__kmp_affinity_gran_levels == 0) {
3713         KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
3714     }
3715 
3716     //
3717     // Set the childNums vector in all Address objects.  This must be done
3718     // before we can sort using __kmp_affinity_cmp_Address_child_num(),
3719     // which takes into account the setting of __kmp_affinity_compact.
3720     //
3721     __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
3722 
3723     switch (__kmp_affinity_type) {
3724 
3725         case affinity_explicit:
3726         KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3727 # if OMP_40_ENABLED
3728         if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3729 # endif
3730         {
3731             __kmp_affinity_process_proclist(&__kmp_affinity_masks,
3732               &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3733               maxIndex);
3734         }
3735 # if OMP_40_ENABLED
3736         else {
3737             __kmp_affinity_process_placelist(&__kmp_affinity_masks,
3738               &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3739               maxIndex);
3740         }
3741 # endif
3742         if (__kmp_affinity_num_masks == 0) {
3743             if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3744               && (__kmp_affinity_type != affinity_none))) {
3745                 KMP_WARNING(AffNoValidProcID);
3746             }
3747             __kmp_affinity_type = affinity_none;
3748             return;
3749         }
3750         break;
3751 
3752         //
3753         // The other affinity types rely on sorting the Addresses according
3754         // to some permutation of the machine topology tree.  Set
3755         // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
3756         // then jump to a common code fragment to do the sort and create
3757         // the array of affinity masks.
3758         //
3759 
3760         case affinity_logical:
3761         __kmp_affinity_compact = 0;
3762         if (__kmp_affinity_offset) {
3763             __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3764               % __kmp_avail_proc;
3765         }
3766         goto sortAddresses;
3767 
3768         case affinity_physical:
3769         if (__kmp_nThreadsPerCore > 1) {
3770             __kmp_affinity_compact = 1;
3771             if (__kmp_affinity_compact >= depth) {
3772                 __kmp_affinity_compact = 0;
3773             }
3774         } else {
3775             __kmp_affinity_compact = 0;
3776         }
3777         if (__kmp_affinity_offset) {
3778             __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3779               % __kmp_avail_proc;
3780         }
3781         goto sortAddresses;
3782 
3783         case affinity_scatter:
3784         if (__kmp_affinity_compact >= depth) {
3785             __kmp_affinity_compact = 0;
3786         }
3787         else {
3788             __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
3789         }
3790         goto sortAddresses;
3791 
3792         case affinity_compact:
3793         if (__kmp_affinity_compact >= depth) {
3794             __kmp_affinity_compact = depth - 1;
3795         }
3796         goto sortAddresses;
3797 
3798         case affinity_balanced:
3799         // Balanced works only for the case of a single package
3800         if( nPackages > 1 ) {
3801             if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
3802                 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
3803             }
3804             __kmp_affinity_type = affinity_none;
3805             return;
3806         } else if( __kmp_affinity_uniform_topology() ) {
3807             break;
3808         } else { // Non-uniform topology
3809 
3810             // Save the depth for further usage
3811             __kmp_aff_depth = depth;
3812 
3813             // Number of hyper threads per core in HT machine
3814             int nth_per_core = __kmp_nThreadsPerCore;
3815 
3816             int core_level;
3817             if( nth_per_core > 1 ) {
3818                 core_level = depth - 2;
3819             } else {
3820                 core_level = depth - 1;
3821             }
3822             int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
3823             int nproc = nth_per_core * ncores;
3824 
3825             procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
3826             for( int i = 0; i < nproc; i++ ) {
3827                 procarr[ i ] = -1;
3828             }
3829 
3830             for( int i = 0; i < __kmp_avail_proc; i++ ) {
3831                 int proc = address2os[ i ].second;
3832                 // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
3833                 // If there is only one thread per core then depth == 2: level 0 - package,
3834                 // level 1 - core.
3835                 int level = depth - 1;
3836 
3837                 // __kmp_nth_per_core == 1
3838                 int thread = 0;
3839                 int core = address2os[ i ].first.labels[ level ];
3840                 // If the thread level exists, that is we have more than one thread context per core
3841                 if( nth_per_core > 1 ) {
3842                     thread = address2os[ i ].first.labels[ level ] % nth_per_core;
3843                     core = address2os[ i ].first.labels[ level - 1 ];
3844                 }
3845                 procarr[ core * nth_per_core + thread ] = proc;
3846             }
3847 
3848             break;
3849         }
3850 
3851         sortAddresses:
3852         //
3853         // Allocate the gtid->affinity mask table.
3854         //
3855         if (__kmp_affinity_dups) {
3856             __kmp_affinity_num_masks = __kmp_avail_proc;
3857         }
3858         else {
3859             __kmp_affinity_num_masks = numUnique;
3860         }
3861 
3862 # if OMP_40_ENABLED
3863         if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
3864           && ( __kmp_affinity_num_places > 0 )
3865           && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
3866             __kmp_affinity_num_masks = __kmp_affinity_num_places;
3867         }
3868 # endif
3869 
3870         __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
3871           __kmp_affinity_num_masks * __kmp_affin_mask_size);
3872 
3873         //
3874         // Sort the address2os table according to the current setting of
3875         // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
3876         //
3877         qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
3878           __kmp_affinity_cmp_Address_child_num);
3879         {
3880             int i;
3881             unsigned j;
3882             for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
3883                 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
3884                     continue;
3885                 }
3886                 unsigned osId = address2os[i].second;
3887                 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
3888                 kmp_affin_mask_t *dest
3889                   = KMP_CPU_INDEX(__kmp_affinity_masks, j);
3890                 KMP_ASSERT(KMP_CPU_ISSET(osId, src));
3891                 KMP_CPU_COPY(dest, src);
3892                 if (++j >= __kmp_affinity_num_masks) {
3893                     break;
3894                 }
3895             }
3896             KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
3897         }
3898         break;
3899 
3900         default:
3901         KMP_ASSERT2(0, "Unexpected affinity setting");
3902     }
3903 
3904     __kmp_free(osId2Mask);
3905     machine_hierarchy.init(address2os, __kmp_avail_proc);
3906 }
3907 
3908 
3909 void
3910 __kmp_affinity_initialize(void)
3911 {
3912     //
3913     // Much of the code above was written assumming that if a machine was not
3914     // affinity capable, then __kmp_affinity_type == affinity_none.  We now
3915     // explicitly represent this as __kmp_affinity_type == affinity_disabled.
3916     //
3917     // There are too many checks for __kmp_affinity_type == affinity_none
3918     // in this code.  Instead of trying to change them all, check if
3919     // __kmp_affinity_type == affinity_disabled, and if so, slam it with
3920     // affinity_none, call the real initialization routine, then restore
3921     // __kmp_affinity_type to affinity_disabled.
3922     //
3923     int disabled = (__kmp_affinity_type == affinity_disabled);
3924     if (! KMP_AFFINITY_CAPABLE()) {
3925         KMP_ASSERT(disabled);
3926     }
3927     if (disabled) {
3928         __kmp_affinity_type = affinity_none;
3929     }
3930     __kmp_aux_affinity_initialize();
3931     if (disabled) {
3932         __kmp_affinity_type = affinity_disabled;
3933     }
3934 }
3935 
3936 
3937 void
3938 __kmp_affinity_uninitialize(void)
3939 {
3940     if (__kmp_affinity_masks != NULL) {
3941         __kmp_free(__kmp_affinity_masks);
3942         __kmp_affinity_masks = NULL;
3943     }
3944     if (fullMask != NULL) {
3945         KMP_CPU_FREE(fullMask);
3946         fullMask = NULL;
3947     }
3948     __kmp_affinity_num_masks = 0;
3949 # if OMP_40_ENABLED
3950     __kmp_affinity_num_places = 0;
3951 # endif
3952     if (__kmp_affinity_proclist != NULL) {
3953         __kmp_free(__kmp_affinity_proclist);
3954         __kmp_affinity_proclist = NULL;
3955     }
3956     if( address2os != NULL ) {
3957         __kmp_free( address2os );
3958         address2os = NULL;
3959     }
3960     if( procarr != NULL ) {
3961         __kmp_free( procarr );
3962         procarr = NULL;
3963     }
3964 }
3965 
3966 
3967 void
3968 __kmp_affinity_set_init_mask(int gtid, int isa_root)
3969 {
3970     if (! KMP_AFFINITY_CAPABLE()) {
3971         return;
3972     }
3973 
3974     kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
3975     if (th->th.th_affin_mask == NULL) {
3976         KMP_CPU_ALLOC(th->th.th_affin_mask);
3977     }
3978     else {
3979         KMP_CPU_ZERO(th->th.th_affin_mask);
3980     }
3981 
3982     //
3983     // Copy the thread mask to the kmp_info_t strucuture.
3984     // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
3985     // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
3986     // is set, then the full mask is the same as the mask of the initialization
3987     // thread.
3988     //
3989     kmp_affin_mask_t *mask;
3990     int i;
3991 
3992 # if OMP_40_ENABLED
3993     if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3994 # endif
3995     {
3996         if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)
3997           ) {
3998 # if KMP_GROUP_AFFINITY
3999             if (__kmp_num_proc_groups > 1) {
4000                 return;
4001             }
4002 # endif
4003             KMP_ASSERT(fullMask != NULL);
4004             i = KMP_PLACE_ALL;
4005             mask = fullMask;
4006         }
4007         else {
4008             KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4009             i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4010             mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4011         }
4012     }
4013 # if OMP_40_ENABLED
4014     else {
4015         if ((! isa_root)
4016           || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
4017 #  if KMP_GROUP_AFFINITY
4018             if (__kmp_num_proc_groups > 1) {
4019                 return;
4020             }
4021 #  endif
4022             KMP_ASSERT(fullMask != NULL);
4023             i = KMP_PLACE_ALL;
4024             mask = fullMask;
4025         }
4026         else {
4027             //
4028             // int i = some hash function or just a counter that doesn't
4029             // always start at 0.  Use gtid for now.
4030             //
4031             KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4032             i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4033             mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4034         }
4035     }
4036 # endif
4037 
4038 # if OMP_40_ENABLED
4039     th->th.th_current_place = i;
4040     if (isa_root) {
4041         th->th.th_new_place = i;
4042         th->th.th_first_place = 0;
4043         th->th.th_last_place = __kmp_affinity_num_masks - 1;
4044     }
4045 
4046     if (i == KMP_PLACE_ALL) {
4047         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4048           gtid));
4049     }
4050     else {
4051         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4052           gtid, i));
4053     }
4054 # else
4055     if (i == -1) {
4056         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
4057           gtid));
4058     }
4059     else {
4060         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
4061           gtid, i));
4062     }
4063 # endif /* OMP_40_ENABLED */
4064 
4065     KMP_CPU_COPY(th->th.th_affin_mask, mask);
4066 
4067     if (__kmp_affinity_verbose) {
4068         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4069         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4070           th->th.th_affin_mask);
4071         KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
4072           buf);
4073     }
4074 
4075 # if KMP_OS_WINDOWS
4076     //
4077     // On Windows* OS, the process affinity mask might have changed.
4078     // If the user didn't request affinity and this call fails,
4079     // just continue silently.  See CQ171393.
4080     //
4081     if ( __kmp_affinity_type == affinity_none ) {
4082         __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4083     }
4084     else
4085 # endif
4086     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4087 }
4088 
4089 
4090 # if OMP_40_ENABLED
4091 
4092 void
4093 __kmp_affinity_set_place(int gtid)
4094 {
4095     int retval;
4096 
4097     if (! KMP_AFFINITY_CAPABLE()) {
4098         return;
4099     }
4100 
4101     kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4102 
4103     KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
4104       gtid, th->th.th_new_place, th->th.th_current_place));
4105 
4106     //
4107     // Check that the new place is within this thread's partition.
4108     //
4109     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4110     KMP_ASSERT(th->th.th_new_place >= 0);
4111     KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4112     if (th->th.th_first_place <= th->th.th_last_place) {
4113         KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
4114          && (th->th.th_new_place <= th->th.th_last_place));
4115     }
4116     else {
4117         KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
4118          || (th->th.th_new_place >= th->th.th_last_place));
4119     }
4120 
4121     //
4122     // Copy the thread mask to the kmp_info_t strucuture,
4123     // and set this thread's affinity.
4124     //
4125     kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
4126       th->th.th_new_place);
4127     KMP_CPU_COPY(th->th.th_affin_mask, mask);
4128     th->th.th_current_place = th->th.th_new_place;
4129 
4130     if (__kmp_affinity_verbose) {
4131         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4132         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4133           th->th.th_affin_mask);
4134         KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4135           gtid, buf);
4136     }
4137     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4138 }
4139 
4140 # endif /* OMP_40_ENABLED */
4141 
4142 
4143 int
4144 __kmp_aux_set_affinity(void **mask)
4145 {
4146     int gtid;
4147     kmp_info_t *th;
4148     int retval;
4149 
4150     if (! KMP_AFFINITY_CAPABLE()) {
4151         return -1;
4152     }
4153 
4154     gtid = __kmp_entry_gtid();
4155     KA_TRACE(1000, ;{
4156         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4157         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4158           (kmp_affin_mask_t *)(*mask));
4159         __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4160           gtid, buf);
4161     });
4162 
4163     if (__kmp_env_consistency_check) {
4164         if ((mask == NULL) || (*mask == NULL)) {
4165             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4166         }
4167         else {
4168             unsigned proc;
4169             int num_procs = 0;
4170 
4171             for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
4172                 if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4173                     continue;
4174                 }
4175                 num_procs++;
4176                 if (! KMP_CPU_ISSET(proc, fullMask)) {
4177                     KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4178                     break;
4179                 }
4180             }
4181             if (num_procs == 0) {
4182                 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4183             }
4184 
4185 # if KMP_GROUP_AFFINITY
4186             if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4187                 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4188             }
4189 # endif /* KMP_GROUP_AFFINITY */
4190 
4191         }
4192     }
4193 
4194     th = __kmp_threads[gtid];
4195     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4196     retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4197     if (retval == 0) {
4198         KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4199     }
4200 
4201 # if OMP_40_ENABLED
4202     th->th.th_current_place = KMP_PLACE_UNDEFINED;
4203     th->th.th_new_place = KMP_PLACE_UNDEFINED;
4204     th->th.th_first_place = 0;
4205     th->th.th_last_place = __kmp_affinity_num_masks - 1;
4206 
4207     //
4208     // Turn off 4.0 affinity for the current tread at this parallel level.
4209     //
4210     th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
4211 # endif
4212 
4213     return retval;
4214 }
4215 
4216 
4217 int
4218 __kmp_aux_get_affinity(void **mask)
4219 {
4220     int gtid;
4221     int retval;
4222     kmp_info_t *th;
4223 
4224     if (! KMP_AFFINITY_CAPABLE()) {
4225         return -1;
4226     }
4227 
4228     gtid = __kmp_entry_gtid();
4229     th = __kmp_threads[gtid];
4230     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4231 
4232     KA_TRACE(1000, ;{
4233         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4234         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4235           th->th.th_affin_mask);
4236         __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
4237     });
4238 
4239     if (__kmp_env_consistency_check) {
4240         if ((mask == NULL) || (*mask == NULL)) {
4241             KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4242         }
4243     }
4244 
4245 # if !KMP_OS_WINDOWS
4246 
4247     retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4248     KA_TRACE(1000, ;{
4249         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4250         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4251           (kmp_affin_mask_t *)(*mask));
4252         __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
4253     });
4254     return retval;
4255 
4256 # else
4257 
4258     KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4259     return 0;
4260 
4261 # endif /* KMP_OS_WINDOWS */
4262 
4263 }
4264 
4265 int
4266 __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
4267 {
4268     int retval;
4269 
4270     if (! KMP_AFFINITY_CAPABLE()) {
4271         return -1;
4272     }
4273 
4274     KA_TRACE(1000, ;{
4275         int gtid = __kmp_entry_gtid();
4276         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4277         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4278           (kmp_affin_mask_t *)(*mask));
4279         __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4280           proc, gtid, buf);
4281     });
4282 
4283     if (__kmp_env_consistency_check) {
4284         if ((mask == NULL) || (*mask == NULL)) {
4285             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4286         }
4287     }
4288 
4289     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4290         return -1;
4291     }
4292     if (! KMP_CPU_ISSET(proc, fullMask)) {
4293         return -2;
4294     }
4295 
4296     KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4297     return 0;
4298 }
4299 
4300 
4301 int
4302 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4303 {
4304     int retval;
4305 
4306     if (! KMP_AFFINITY_CAPABLE()) {
4307         return -1;
4308     }
4309 
4310     KA_TRACE(1000, ;{
4311         int gtid = __kmp_entry_gtid();
4312         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4313         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4314           (kmp_affin_mask_t *)(*mask));
4315         __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4316           proc, gtid, buf);
4317     });
4318 
4319     if (__kmp_env_consistency_check) {
4320         if ((mask == NULL) || (*mask == NULL)) {
4321             KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4322         }
4323     }
4324 
4325     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4326         return -1;
4327     }
4328     if (! KMP_CPU_ISSET(proc, fullMask)) {
4329         return -2;
4330     }
4331 
4332     KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4333     return 0;
4334 }
4335 
4336 
4337 int
4338 __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4339 {
4340     int retval;
4341 
4342     if (! KMP_AFFINITY_CAPABLE()) {
4343         return -1;
4344     }
4345 
4346     KA_TRACE(1000, ;{
4347         int gtid = __kmp_entry_gtid();
4348         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4349         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4350           (kmp_affin_mask_t *)(*mask));
4351         __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4352           proc, gtid, buf);
4353     });
4354 
4355     if (__kmp_env_consistency_check) {
4356         if ((mask == NULL) || (*mask == NULL)) {
4357             KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
4358         }
4359     }
4360 
4361     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4362         return 0;
4363     }
4364     if (! KMP_CPU_ISSET(proc, fullMask)) {
4365         return 0;
4366     }
4367 
4368     return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4369 }
4370 
4371 
4372 // Dynamic affinity settings - Affinity balanced
4373 void __kmp_balanced_affinity( int tid, int nthreads )
4374 {
4375     if( __kmp_affinity_uniform_topology() ) {
4376         int coreID;
4377         int threadID;
4378         // Number of hyper threads per core in HT machine
4379         int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4380         // Number of cores
4381         int ncores = __kmp_ncores;
4382         // How many threads will be bound to each core
4383         int chunk = nthreads / ncores;
4384         // How many cores will have an additional thread bound to it - "big cores"
4385         int big_cores = nthreads % ncores;
4386         // Number of threads on the big cores
4387         int big_nth = ( chunk + 1 ) * big_cores;
4388         if( tid < big_nth ) {
4389             coreID = tid / (chunk + 1 );
4390             threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
4391         } else { //tid >= big_nth
4392             coreID = ( tid - big_cores ) / chunk;
4393             threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
4394         }
4395 
4396         KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4397           "Illegal set affinity operation when not capable");
4398 
4399         kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
4400         KMP_CPU_ZERO(mask);
4401 
4402         // Granularity == thread
4403         if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4404             int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
4405             KMP_CPU_SET( osID, mask);
4406         } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4407             for( int i = 0; i < __kmp_nth_per_core; i++ ) {
4408                 int osID;
4409                 osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
4410                 KMP_CPU_SET( osID, mask);
4411             }
4412         }
4413         if (__kmp_affinity_verbose) {
4414             char buf[KMP_AFFIN_MASK_PRINT_LEN];
4415             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4416             KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4417               tid, buf);
4418         }
4419         __kmp_set_system_affinity( mask, TRUE );
4420     } else { // Non-uniform topology
4421 
4422         kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
4423         KMP_CPU_ZERO(mask);
4424 
4425         // Number of hyper threads per core in HT machine
4426         int nth_per_core = __kmp_nThreadsPerCore;
4427         int core_level;
4428         if( nth_per_core > 1 ) {
4429             core_level = __kmp_aff_depth - 2;
4430         } else {
4431             core_level = __kmp_aff_depth - 1;
4432         }
4433 
4434         // Number of cores - maximum value; it does not count trail cores with 0 processors
4435         int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
4436 
4437         // For performance gain consider the special case nthreads == __kmp_avail_proc
4438         if( nthreads == __kmp_avail_proc ) {
4439             if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4440                 int osID = address2os[ tid ].second;
4441                 KMP_CPU_SET( osID, mask);
4442             } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4443                 int coreID = address2os[ tid ].first.labels[ core_level ];
4444                 // We'll count found osIDs for the current core; they can be not more than nth_per_core;
4445                 // since the address2os is sortied we can break when cnt==nth_per_core
4446                 int cnt = 0;
4447                 for( int i = 0; i < __kmp_avail_proc; i++ ) {
4448                     int osID = address2os[ i ].second;
4449                     int core = address2os[ i ].first.labels[ core_level ];
4450                     if( core == coreID ) {
4451                         KMP_CPU_SET( osID, mask);
4452                         cnt++;
4453                         if( cnt == nth_per_core ) {
4454                             break;
4455                         }
4456                     }
4457                 }
4458             }
4459         } else if( nthreads <= __kmp_ncores ) {
4460 
4461             int core = 0;
4462             for( int i = 0; i < ncores; i++ ) {
4463                 // Check if this core from procarr[] is in the mask
4464                 int in_mask = 0;
4465                 for( int j = 0; j < nth_per_core; j++ ) {
4466                     if( procarr[ i * nth_per_core + j ] != - 1 ) {
4467                         in_mask = 1;
4468                         break;
4469                     }
4470                 }
4471                 if( in_mask ) {
4472                     if( tid == core ) {
4473                         for( int j = 0; j < nth_per_core; j++ ) {
4474                             int osID = procarr[ i * nth_per_core + j ];
4475                             if( osID != -1 ) {
4476                                 KMP_CPU_SET( osID, mask );
4477                                 // For granularity=thread it is enough to set the first available osID for this core
4478                                 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4479                                     break;
4480                                 }
4481                             }
4482                         }
4483                         break;
4484                     } else {
4485                         core++;
4486                     }
4487                 }
4488             }
4489 
4490         } else { // nthreads > __kmp_ncores
4491 
4492             // Array to save the number of processors at each core
4493             int nproc_at_core[ ncores ];
4494             // Array to save the number of cores with "x" available processors;
4495             int ncores_with_x_procs[ nth_per_core + 1 ];
4496             // Array to save the number of cores with # procs from x to nth_per_core
4497             int ncores_with_x_to_max_procs[ nth_per_core + 1 ];
4498 
4499             for( int i = 0; i <= nth_per_core; i++ ) {
4500                 ncores_with_x_procs[ i ] = 0;
4501                 ncores_with_x_to_max_procs[ i ] = 0;
4502             }
4503 
4504             for( int i = 0; i < ncores; i++ ) {
4505                 int cnt = 0;
4506                 for( int j = 0; j < nth_per_core; j++ ) {
4507                     if( procarr[ i * nth_per_core + j ] != -1 ) {
4508                         cnt++;
4509                     }
4510                 }
4511                 nproc_at_core[ i ] = cnt;
4512                 ncores_with_x_procs[ cnt ]++;
4513             }
4514 
4515             for( int i = 0; i <= nth_per_core; i++ ) {
4516                 for( int j = i; j <= nth_per_core; j++ ) {
4517                     ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4518                 }
4519             }
4520 
4521             // Max number of processors
4522             int nproc = nth_per_core * ncores;
4523             // An array to keep number of threads per each context
4524             int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4525             for( int i = 0; i < nproc; i++ ) {
4526                 newarr[ i ] = 0;
4527             }
4528 
4529             int nth = nthreads;
4530             int flag = 0;
4531             while( nth > 0 ) {
4532                 for( int j = 1; j <= nth_per_core; j++ ) {
4533                     int cnt = ncores_with_x_to_max_procs[ j ];
4534                     for( int i = 0; i < ncores; i++ ) {
4535                         // Skip the core with 0 processors
4536                         if( nproc_at_core[ i ] == 0 ) {
4537                             continue;
4538                         }
4539                         for( int k = 0; k < nth_per_core; k++ ) {
4540                             if( procarr[ i * nth_per_core + k ] != -1 ) {
4541                                 if( newarr[ i * nth_per_core + k ] == 0 ) {
4542                                     newarr[ i * nth_per_core + k ] = 1;
4543                                     cnt--;
4544                                     nth--;
4545                                     break;
4546                                 } else {
4547                                     if( flag != 0 ) {
4548                                         newarr[ i * nth_per_core + k ] ++;
4549                                         cnt--;
4550                                         nth--;
4551                                         break;
4552                                     }
4553                                 }
4554                             }
4555                         }
4556                         if( cnt == 0 || nth == 0 ) {
4557                             break;
4558                         }
4559                     }
4560                     if( nth == 0 ) {
4561                         break;
4562                     }
4563                 }
4564                 flag = 1;
4565             }
4566             int sum = 0;
4567             for( int i = 0; i < nproc; i++ ) {
4568                 sum += newarr[ i ];
4569                 if( sum > tid ) {
4570                     // Granularity == thread
4571                     if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4572                         int osID = procarr[ i ];
4573                         KMP_CPU_SET( osID, mask);
4574                     } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4575                         int coreID = i / nth_per_core;
4576                         for( int ii = 0; ii < nth_per_core; ii++ ) {
4577                             int osID = procarr[ coreID * nth_per_core + ii ];
4578                             if( osID != -1 ) {
4579                                 KMP_CPU_SET( osID, mask);
4580                             }
4581                         }
4582                     }
4583                     break;
4584                 }
4585             }
4586             __kmp_free( newarr );
4587         }
4588 
4589         if (__kmp_affinity_verbose) {
4590             char buf[KMP_AFFIN_MASK_PRINT_LEN];
4591             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4592             KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4593               tid, buf);
4594         }
4595         __kmp_set_system_affinity( mask, TRUE );
4596     }
4597 }
4598 
4599 #else
4600     // affinity not supported
4601 
4602 kmp_uint32 mac_skipPerLevel[7];
4603 kmp_uint32 mac_depth;
4604 kmp_uint8 mac_leaf_kids;
4605 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
4606     static int first = 1;
4607     if (first) {
4608         const kmp_uint32 maxLevels = 7;
4609         kmp_uint32 numPerLevel[maxLevels];
4610 
4611         for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
4612             numPerLevel[i] = 1;
4613             mac_skipPerLevel[i] = 1;
4614         }
4615 
4616         mac_depth = 2;
4617         numPerLevel[0] = nproc;
4618 
4619         kmp_uint32 branch = 4;
4620         if (numPerLevel[0] == 1) branch = nproc/4;
4621         if (branch<4) branch=4;
4622         for (kmp_uint32 d=0; d<mac_depth-1; ++d) { // optimize hierarchy width
4623             while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
4624                 if (numPerLevel[d] & 1) numPerLevel[d]++;
4625                 numPerLevel[d] = numPerLevel[d] >> 1;
4626                 if (numPerLevel[d+1] == 1) mac_depth++;
4627                 numPerLevel[d+1] = numPerLevel[d+1] << 1;
4628             }
4629             if(numPerLevel[0] == 1) {
4630                 branch = branch >> 1;
4631                 if (branch<4) branch = 4;
4632             }
4633         }
4634 
4635         for (kmp_uint32 i=1; i<mac_depth; ++i)
4636             mac_skipPerLevel[i] = numPerLevel[i-1] * mac_skipPerLevel[i-1];
4637         mac_leaf_kids = (kmp_uint8)numPerLevel[0]-1;
4638         first=0;
4639     }
4640     thr_bar->depth = mac_depth;
4641     thr_bar->base_leaf_kids = mac_leaf_kids;
4642     thr_bar->skip_per_level = mac_skipPerLevel;
4643 }
4644 
4645 #endif // KMP_AFFINITY_SUPPORTED
4646