1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_io.h"
19 #include "kmp_str.h"
20 #include "kmp_wrapper_getpid.h"
21 
22 #if KMP_AFFINITY_SUPPORTED
23 
24 //
25 // Print the affinity mask to the character array in a pretty format.
26 //
27 char *
28 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
29 {
30     KMP_ASSERT(buf_len >= 40);
31     char *scan = buf;
32     char *end = buf + buf_len - 1;
33 
34     //
35     // Find first element / check for empty set.
36     //
37     size_t i;
38     for (i = 0; i < KMP_CPU_SETSIZE; i++) {
39         if (KMP_CPU_ISSET(i, mask)) {
40             break;
41         }
42     }
43     if (i == KMP_CPU_SETSIZE) {
44         KMP_SNPRINTF(scan, buf_len, "{<empty>}");
45         while (*scan != '\0') scan++;
46         KMP_ASSERT(scan <= end);
47         return buf;
48     }
49 
50     KMP_SNPRINTF(scan, buf_len, "{%ld", (long)i);
51     while (*scan != '\0') scan++;
52     i++;
53     for (; i < KMP_CPU_SETSIZE; i++) {
54         if (! KMP_CPU_ISSET(i, mask)) {
55             continue;
56         }
57 
58         //
59         // Check for buffer overflow.  A string of the form ",<n>" will have
60         // at most 10 characters, plus we want to leave room to print ",...}"
61         // if the set is too large to print for a total of 15 characters.
62         // We already left room for '\0' in setting end.
63         //
64         if (end - scan < 15) {
65            break;
66         }
67         KMP_SNPRINTF(scan, buf_len, ",%-ld", (long)i);
68         while (*scan != '\0') scan++;
69     }
70     if (i < KMP_CPU_SETSIZE) {
71         KMP_SNPRINTF(scan, buf_len,  ",...");
72         while (*scan != '\0') scan++;
73     }
74     KMP_SNPRINTF(scan, buf_len, "}");
75     while (*scan != '\0') scan++;
76     KMP_ASSERT(scan <= end);
77     return buf;
78 }
79 
80 
//
// Set every processor on the machine in 'mask'.
//
void
__kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
{
    KMP_CPU_ZERO(mask);

# if KMP_GROUP_AFFINITY

    // With multiple Windows* OS processor groups, each group occupies a
    // fixed-width span of bits (one DWORD_PTR worth), so set the bits for
    // the active processors of each group at the group's bit offset.
    if (__kmp_num_proc_groups > 1) {
        int group;
        KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
        for (group = 0; group < __kmp_num_proc_groups; group++) {
            int i;
            int num = __kmp_GetActiveProcessorCount(group);
            for (i = 0; i < num; i++) {
                KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
            }
        }
    }
    else

# endif /* KMP_GROUP_AFFINITY */

    // Single group (or non-group platform): procs are numbered 0..__kmp_xproc-1.
    {
        int proc;
        for (proc = 0; proc < __kmp_xproc; proc++) {
            KMP_CPU_SET(proc, mask);
        }
    }
}
110 
111 
112 //
113 // In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
114 // functions.
115 //
116 // The icc codegen emits sections with extremely long names, of the form
117 // ".gnu.linkonce.<mangled_name>".  There seems to have been a linker bug
118 // introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
119 // some sort of memory corruption or table overflow that is triggered by
120 // these long strings.  I checked the latest version of the linker -
121 // GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
122 // fixed.
123 //
124 // Unfortunately, my attempts to reproduce it in a smaller example have
125 // failed - I'm not sure what the prospects are of getting it fixed
126 // properly - but we need a reproducer smaller than all of libiomp.
127 //
128 // Work around the problem by avoiding inline constructors in such builds.
129 // We do this for all platforms, not just Linux* OS - non-inline functions are
// more debuggable and provide better coverage than inline functions.
131 // Use inline functions in shipping libs, for performance.
132 //
133 
134 # if !defined(KMP_DEBUG) && !defined(COVER)
135 
136 class Address {
137 public:
138     static const unsigned maxDepth = 32;
139     unsigned labels[maxDepth];
140     unsigned childNums[maxDepth];
141     unsigned depth;
142     unsigned leader;
143     Address(unsigned _depth)
144       : depth(_depth), leader(FALSE) {
145     }
146     Address &operator=(const Address &b) {
147         depth = b.depth;
148         for (unsigned i = 0; i < depth; i++) {
149             labels[i] = b.labels[i];
150             childNums[i] = b.childNums[i];
151         }
152         leader = FALSE;
153         return *this;
154     }
155     bool operator==(const Address &b) const {
156         if (depth != b.depth)
157             return false;
158         for (unsigned i = 0; i < depth; i++)
159             if(labels[i] != b.labels[i])
160                 return false;
161         return true;
162     }
163     bool isClose(const Address &b, int level) const {
164         if (depth != b.depth)
165             return false;
166         if ((unsigned)level >= depth)
167             return true;
168         for (unsigned i = 0; i < (depth - level); i++)
169             if(labels[i] != b.labels[i])
170                 return false;
171         return true;
172     }
173     bool operator!=(const Address &b) const {
174         return !operator==(b);
175     }
176 };
177 
// Pairs a topology address with its OS processor id.
class AddrUnsPair {
public:
    Address first;      // physical location in the topology
    unsigned second;    // OS processor id
    AddrUnsPair(Address _first, unsigned _second)
      : first(_first), second(_second) {
    }
    AddrUnsPair &operator=(const AddrUnsPair &b)
    {
        first = b.first;
        second = b.second;
        return *this;
    }
};
192 
193 # else
194 
// Out-of-line variant of the topology address class, used in debug/cover
// builds (see the linker-bug note earlier in this file).  Must stay
// behaviorally identical to the inline version above.
class Address {
public:
    static const unsigned maxDepth = 32;
    unsigned labels[maxDepth];      // physical id at each topology level
    unsigned childNums[maxDepth];   // ordinal child number at each level
    unsigned depth;                 // number of levels actually in use
    unsigned leader;
    Address(unsigned _depth);
    Address &operator=(const Address &b);
    bool operator==(const Address &b) const;
    bool isClose(const Address &b, int level) const;
    bool operator!=(const Address &b) const;
};
208 
209 Address::Address(unsigned _depth)
210 {
211     depth = _depth;
212     leader = FALSE;
213 }
214 
215 Address &Address::operator=(const Address &b) {
216     depth = b.depth;
217     for (unsigned i = 0; i < depth; i++) {
218         labels[i] = b.labels[i];
219         childNums[i] = b.childNums[i];
220     }
221     leader = FALSE;
222     return *this;
223 }
224 
225 bool Address::operator==(const Address &b) const {
226     if (depth != b.depth)
227         return false;
228     for (unsigned i = 0; i < depth; i++)
229         if(labels[i] != b.labels[i])
230             return false;
231     return true;
232 }
233 
234 bool Address::isClose(const Address &b, int level) const {
235     if (depth != b.depth)
236         return false;
237     if ((unsigned)level >= depth)
238         return true;
239     for (unsigned i = 0; i < (depth - level); i++)
240         if(labels[i] != b.labels[i])
241             return false;
242     return true;
243 }
244 
245 bool Address::operator!=(const Address &b) const {
246     return !operator==(b);
247 }
248 
// Out-of-line variant of AddrUnsPair for debug/cover builds: pairs a
// topology address with its OS processor id.
class AddrUnsPair {
public:
    Address first;      // physical location in the topology
    unsigned second;    // OS processor id
    AddrUnsPair(Address _first, unsigned _second);
    AddrUnsPair &operator=(const AddrUnsPair &b);
};
256 
// Pair a topology address with the corresponding OS processor id.
AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
  : first(_first), second(_second)
{
}
261 
262 AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
263 {
264     first = b.first;
265     second = b.second;
266     return *this;
267 }
268 
269 # endif /* !defined(KMP_DEBUG) && !defined(COVER) */
270 
271 
272 static int
273 __kmp_affinity_cmp_Address_labels(const void *a, const void *b)
274 {
275     const Address *aa = (const Address *)&(((AddrUnsPair *)a)
276       ->first);
277     const Address *bb = (const Address *)&(((AddrUnsPair *)b)
278       ->first);
279     unsigned depth = aa->depth;
280     unsigned i;
281     KMP_DEBUG_ASSERT(depth == bb->depth);
282     for (i  = 0; i < depth; i++) {
283         if (aa->labels[i] < bb->labels[i]) return -1;
284         if (aa->labels[i] > bb->labels[i]) return 1;
285     }
286     return 0;
287 }
288 
289 
//
// qsort() comparator used for KMP_AFFINITY=compact/scatter ordering.
// Compares child numbers (not raw labels) in a permuted level order
// controlled by __kmp_affinity_compact: the innermost 'compact' levels
// are compared first (innermost-first), then the remaining levels
// outermost-first.
//
static int
__kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)
      ->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)
      ->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
    KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
    // First pass: the 'compact' innermost levels, starting from the leaf
    // level (index depth-1) and moving outward.
    for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
        int j = depth - i - 1;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    // Second pass: the remaining levels, outermost (index 0) first.
    for (; i < depth; i++) {
        int j = i - __kmp_affinity_compact;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    return 0;
}
314 
315 /** A structure for holding machine-specific hierarchy info to be computed once at init. */
316 class hierarchy_info {
317 public:
318     /** Typical levels are threads/core, cores/package or socket, packages/node, nodes/machine,
319         etc.  We don't want to get specific with nomenclature */
320     static const kmp_uint32 maxLevels=7;
321 
322     /** This is specifically the depth of the machine configuration hierarchy, in terms of the
323         number of levels along the longest path from root to any leaf. It corresponds to the
324         number of entries in numPerLevel if we exclude all but one trailing 1. */
325     kmp_uint32 depth;
326     kmp_uint32 base_num_threads;
327     bool uninitialized;
328 
329     /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a
330         node at level i has. For example, if we have a machine with 4 packages, 4 cores/package
331         and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}. All empty levels are set to 1. */
332     kmp_uint32 numPerLevel[maxLevels];
333     kmp_uint32 skipPerLevel[maxLevels];
334 
335     void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
336         int hier_depth = adr2os[0].first.depth;
337         int level = 0;
338         for (int i=hier_depth-1; i>=0; --i) {
339             int max = -1;
340             for (int j=0; j<num_addrs; ++j) {
341                 int next = adr2os[j].first.childNums[i];
342                 if (next > max) max = next;
343             }
344             numPerLevel[level] = max+1;
345             ++level;
346         }
347     }
348 
349     hierarchy_info() : depth(1), uninitialized(true) {}
350     void init(AddrUnsPair *adr2os, int num_addrs)
351     {
352         /* Added explicit initialization of the depth here to prevent usage of dirty value
353            observed when static library is re-initialized multiple times (e.g. when
354            non-OpenMP thread repeatedly launches/joins thread that uses OpenMP). */
355         depth = 1;
356         uninitialized = false;
357         for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
358             numPerLevel[i] = 1;
359             skipPerLevel[i] = 1;
360         }
361 
362         // Sort table by physical ID
363         if (adr2os) {
364             qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
365             deriveLevels(adr2os, num_addrs);
366         }
367         else {
368             numPerLevel[0] = 4;
369             numPerLevel[1] = num_addrs/4;
370             if (num_addrs%4) numPerLevel[1]++;
371         }
372 
373         base_num_threads = num_addrs;
374         for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
375             if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
376                 depth++;
377 
378         kmp_uint32 branch = 4;
379         if (numPerLevel[0] == 1) branch = num_addrs/4;
380         if (branch<4) branch=4;
381         for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
382             while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
383                 if (numPerLevel[d] & 1) numPerLevel[d]++;
384                 numPerLevel[d] = numPerLevel[d] >> 1;
385                 if (numPerLevel[d+1] == 1) depth++;
386                 numPerLevel[d+1] = numPerLevel[d+1] << 1;
387             }
388             if(numPerLevel[0] == 1) {
389                 branch = branch >> 1;
390                 if (branch<4) branch = 4;
391             }
392         }
393 
394         for (kmp_uint32 i=1; i<depth; ++i)
395             skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];
396 
397     }
398 };
399 
// Singleton machine hierarchy, lazily initialized by __kmp_get_hierarchy().
static hierarchy_info machine_hierarchy;
401 
// Fill in the hierarchy fields of a thread's barrier state, lazily
// initializing the machine hierarchy on first use and extending it (by
// doubling the topmost stride) until it covers nproc threads.
void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    kmp_uint32 depth;
    if (machine_hierarchy.uninitialized)
        machine_hierarchy.init(NULL, nproc);

    depth = machine_hierarchy.depth;
    KMP_DEBUG_ASSERT(depth > 0);
    // NOTE(review): this loop can grow depth past hierarchy_info::maxLevels,
    // which would index skipPerLevel[] out of bounds -- confirm that nproc
    // is always covered within maxLevels levels, or bound the loop.
    while (nproc > machine_hierarchy.skipPerLevel[depth-1]) {
        depth++;
        machine_hierarchy.skipPerLevel[depth-1] = 2*machine_hierarchy.skipPerLevel[depth-2];
    }
    thr_bar->depth = depth;
    thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
    thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}
417 
418 //
419 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
420 // called to renumber the labels from [0..n] and place them into the child_num
421 // vector of the address object.  This is done in case the labels used for
422 // the children at one node of the hierarchy differ from those used for
423 // another node at the same level.  Example:  suppose the machine has 2 nodes
424 // with 2 packages each.  The first node contains packages 601 and 602, and
425 // second node contains packages 603 and 604.  If we try to sort the table
426 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
427 // because we are paying attention to the labels themselves, not the ordinal
428 // child numbers.  By using the child numbers in the sort, the result is
429 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
430 //
431 static void
432 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
433   int numAddrs)
434 {
435     KMP_DEBUG_ASSERT(numAddrs > 0);
436     int depth = address2os->first.depth;
437     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
438     unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
439       * sizeof(unsigned));
440     int labCt;
441     for (labCt = 0; labCt < depth; labCt++) {
442         address2os[0].first.childNums[labCt] = counts[labCt] = 0;
443         lastLabel[labCt] = address2os[0].first.labels[labCt];
444     }
445     int i;
446     for (i = 1; i < numAddrs; i++) {
447         for (labCt = 0; labCt < depth; labCt++) {
448             if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
449                 int labCt2;
450                 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
451                     counts[labCt2] = 0;
452                     lastLabel[labCt2] = address2os[i].first.labels[labCt2];
453                 }
454                 counts[labCt]++;
455                 lastLabel[labCt] = address2os[i].first.labels[labCt];
456                 break;
457             }
458         }
459         for (labCt = 0; labCt < depth; labCt++) {
460             address2os[i].first.childNums[labCt] = counts[labCt];
461         }
462         for (; labCt < (int)Address::maxDepth; labCt++) {
463             address2os[i].first.childNums[labCt] = 0;
464         }
465     }
466 }
467 
468 
469 //
470 // All of the __kmp_affinity_create_*_map() routines should set
471 // __kmp_affinity_masks to a vector of affinity mask objects of length
472 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
473 // return the number of levels in the machine topology tree (zero if
474 // __kmp_affinity_type == affinity_none).
475 //
476 // All of the __kmp_affinity_create_*_map() routines should set *fullMask
477 // to the affinity mask for the initialization thread.  They need to save and
478 // restore the mask, and it could be needed later, so saving it is just an
479 // optimization to avoid calling kmp_get_system_affinity() again.
480 //
481 static kmp_affin_mask_t *fullMask = NULL;
482 
483 kmp_affin_mask_t *
484 __kmp_affinity_get_fullMask() { return fullMask; }
485 
486 
// Cached topology counts, filled in by the __kmp_affinity_create_*_map()
// routines below.
static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif
492 
493 //
494 // __kmp_affinity_uniform_topology() doesn't work when called from
495 // places which support arbitrarily many levels in the machine topology
496 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
497 // __kmp_affinity_create_x2apicid_map().
498 //
499 inline static bool
500 __kmp_affinity_uniform_topology()
501 {
502     return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
503 }
504 
505 
506 //
507 // Print out the detailed machine topology map, i.e. the physical locations
508 // of each OS proc.
509 //
510 static void
511 __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
512   int pkgLevel, int coreLevel, int threadLevel)
513 {
514     int proc;
515 
516     KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
517     for (proc = 0; proc < len; proc++) {
518         int level;
519         kmp_str_buf_t buf;
520         __kmp_str_buf_init(&buf);
521         for (level = 0; level < depth; level++) {
522             if (level == threadLevel) {
523                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
524             }
525             else if (level == coreLevel) {
526                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
527             }
528             else if (level == pkgLevel) {
529                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
530             }
531             else if (level > pkgLevel) {
532                 __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
533                   level - pkgLevel - 1);
534             }
535             else {
536                 __kmp_str_buf_print(&buf, "L%d ", level);
537             }
538             __kmp_str_buf_print(&buf, "%d ",
539               address2os[proc].first.labels[level]);
540         }
541         KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
542           buf.str);
543         __kmp_str_buf_free(&buf);
544     }
545 }
546 
547 
548 //
549 // If we don't know how to retrieve the machine's processor topology, or
550 // encounter an error in doing so, this routine is called to form a "flat"
551 // mapping of os thread id's <-> processor id's.
552 //
//
// Build a depth-1 ("flat") topology map whose single level label is the OS
// proc id.  Returns 0 (no table built) when affinity is off or unavailable,
// otherwise 1 (the map depth) with *address2os set to an __kmp_allocate'd
// table of __kmp_avail_proc entries.
//
static int
__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(__kmp_affinity_type == affinity_none);
        __kmp_ncores = nPackages = __kmp_xproc;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    //  correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nPackages = __kmp_avail_proc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);

        KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        KMP_INFORM(Uniform, "KMP_AFFINITY");
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }
    if (__kmp_affinity_type == affinity_none) {
        return 0;
    }

    //
    // Construct the data structure to be returned: one depth-1 address per
    // proc present in fullMask, labeled with the OS proc id.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    unsigned int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(1);
        addr.labels[0] = i;
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
    }
    if (__kmp_affinity_verbose) {
        KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Only the package level is modeled in the machine topology map,
        // so the #levels of granularity is either 0 or 1.
        //
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels = 1;
        }
        else {
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 1;
}
643 
644 
645 # if KMP_GROUP_AFFINITY
646 
647 //
648 // If multiple Windows* OS processor groups exist, we can create a 2-level
649 // topology map with the groups at level 0 and the individual procs at
650 // level 1.
651 //
652 // This facilitates letting the threads float among all procs in a group,
653 // if granularity=group (the default when there are multiple groups).
654 //
//
// Build a depth-2 topology map with the Windows* OS processor group at
// level 0 and the proc-within-group at level 1.  Returns -1 (caller falls
// back to another method) when not affinity capable or when the full mask
// fits in a single group; otherwise returns 2, the map depth.
//
static int
__kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // If we don't have multiple processor groups, return now.
    // The flat mapping will be used.
    //
    if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
        // FIXME set *msg_id
        return -1;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        // Group id and within-group index: each group spans one DWORD_PTR
        // worth of bits in the mask.
        Address addr(2);
        addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
        addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
              addr.labels[1]);
        }
    }

    if (__kmp_affinity_gran_levels < 0) {
        if (__kmp_affinity_gran == affinity_gran_group) {
            __kmp_affinity_gran_levels = 1;
        }
        else if ((__kmp_affinity_gran == affinity_gran_fine)
          || (__kmp_affinity_gran == affinity_gran_thread)) {
            __kmp_affinity_gran_levels = 0;
        }
        else {
            const char *gran_str = NULL;
            if (__kmp_affinity_gran == affinity_gran_core) {
                gran_str = "core";
            }
            else if (__kmp_affinity_gran == affinity_gran_package) {
                gran_str = "package";
            }
            else if (__kmp_affinity_gran == affinity_gran_node) {
                gran_str = "node";
            }
            else {
                KMP_ASSERT(0);
            }

            // NOTE(review): gran_str is computed but never used -- the
            // warning described below is not actually issued here; confirm
            // whether it should be emitted.
            // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 2;
}
726 
727 # endif /* KMP_GROUP_AFFINITY */
728 
729 
730 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
731 
//
// Return the number of bits needed to represent 'count' distinct values:
// the smallest r such that (1 << r) >= count.
//
static int
__kmp_cpuid_mask_width(int count) {
    int width = 0;
    while ((1 << width) < count) {
        width++;
    }
    return width;
}
740 
741 
//
// Per-OS-proc record gathered by binding the current thread to each
// processor in turn and issuing cpuid; sorted later to deduce the topology.
//
class apicThreadInfo {
public:
    unsigned osId;              // param to __kmp_affinity_bind_thread
    unsigned apicId;            // from cpuid after binding
    unsigned maxCoresPerPkg;    //      ""
    unsigned maxThreadsPerPkg;  //      ""
    unsigned pkgId;             // inferred from above values
    unsigned coreId;            //      ""
    unsigned threadId;          //      ""
};
752 
753 
754 static int
755 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
756 {
757     const apicThreadInfo *aa = (const apicThreadInfo *)a;
758     const apicThreadInfo *bb = (const apicThreadInfo *)b;
759     if (aa->osId < bb->osId) return -1;
760     if (aa->osId > bb->osId) return 1;
761     return 0;
762 }
763 
764 
765 static int
766 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
767 {
768     const apicThreadInfo *aa = (const apicThreadInfo *)a;
769     const apicThreadInfo *bb = (const apicThreadInfo *)b;
770     if (aa->pkgId < bb->pkgId) return -1;
771     if (aa->pkgId > bb->pkgId) return 1;
772     if (aa->coreId < bb->coreId) return -1;
773     if (aa->coreId > bb->coreId) return 1;
774     if (aa->threadId < bb->threadId) return -1;
775     if (aa->threadId > bb->threadId) return 1;
776     return 0;
777 }
778 
779 
780 //
781 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
782 // an algorithm which cycles through the available os threads, setting
783 // the current thread's affinity mask to that thread, and then retrieves
784 // the Apic Id for each thread context using the cpuid instruction.
785 //
786 static int
787 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
788   kmp_i18n_id_t *const msg_id)
789 {
790     kmp_cpuid buf;
791     int rc;
792     *address2os = NULL;
793     *msg_id = kmp_i18n_null;
794 
795     //
796     // Check if cpuid leaf 4 is supported.
797     //
798         __kmp_x86_cpuid(0, 0, &buf);
799         if (buf.eax < 4) {
800             *msg_id = kmp_i18n_str_NoLeaf4Support;
801             return -1;
802         }
803 
804     //
805     // The algorithm used starts by setting the affinity to each available
806     // thread and retrieving info from the cpuid instruction, so if we are
807     // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
809     // the defaults that we calculated from issuing cpuid without binding
810     // to each proc.
811     //
812     if (! KMP_AFFINITY_CAPABLE()) {
813         //
814         // Hack to try and infer the machine topology using only the data
815         // available from cpuid on the current thread, and __kmp_xproc.
816         //
817         KMP_ASSERT(__kmp_affinity_type == affinity_none);
818 
819         //
820         // Get an upper bound on the number of threads per package using
821         // cpuid(1).
822         //
        // On some OS/chip combinations where HT is supported by the chip
824         // but is disabled, this value will be 2 on a single core chip.
825         // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
826         //
827         __kmp_x86_cpuid(1, 0, &buf);
828         int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
829         if (maxThreadsPerPkg == 0) {
830             maxThreadsPerPkg = 1;
831         }
832 
833         //
834         // The num cores per pkg comes from cpuid(4).
835         // 1 must be added to the encoded value.
836         //
837         // The author of cpu_count.cpp treated this only an upper bound
838         // on the number of cores, but I haven't seen any cases where it
839         // was greater than the actual number of cores, so we will treat
840         // it as exact in this block of code.
841         //
842         // First, we need to check if cpuid(4) is supported on this chip.
843         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
844         // has the value n or greater.
845         //
846         __kmp_x86_cpuid(0, 0, &buf);
847         if (buf.eax >= 4) {
848             __kmp_x86_cpuid(4, 0, &buf);
849             nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
850         }
851         else {
852             nCoresPerPkg = 1;
853         }
854 
855         //
856         // There is no way to reliably tell if HT is enabled without issuing
    // the cpuid instruction from every thread, and correlating the cpuid
858         // info, so if the machine is not affinity capable, we assume that HT
859         // is off.  We have seen quite a few machines where maxThreadsPerPkg
860         // is 2, yet the machine does not support HT.
861         //
862         // - Older OSes are usually found on machines with older chips, which
863         //   do not support HT.
864         //
865         // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly set
        //   to 0) is greater than the penalty for mistakenly identifying
868         //   a machine as being 1 thread/core when it is really HT enabled
869         //   (which results in blocktime being incorrectly set to a positive
870         //   value).
871         //
872         __kmp_ncores = __kmp_xproc;
873         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
874         __kmp_nThreadsPerCore = 1;
875         if (__kmp_affinity_verbose) {
876             KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
877             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
878             if (__kmp_affinity_uniform_topology()) {
879                 KMP_INFORM(Uniform, "KMP_AFFINITY");
880             } else {
881                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
882             }
883             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
884               __kmp_nThreadsPerCore, __kmp_ncores);
885         }
886         return 0;
887     }
888 
889     //
890     //
891     // From here on, we can assume that it is safe to call
892     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
893     // even if __kmp_affinity_type = affinity_none.
894     //
895 
896     //
897     // Save the affinity mask for the current thread.
898     //
899     kmp_affin_mask_t *oldMask;
900     KMP_CPU_ALLOC(oldMask);
901     KMP_ASSERT(oldMask != NULL);
902     __kmp_get_system_affinity(oldMask, TRUE);
903 
904     //
905     // Run through each of the available contexts, binding the current thread
906     // to it, and obtaining the pertinent information using the cpuid instr.
907     //
908     // The relevant information is:
909     //
910     // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //    has a unique Apic Id, which is of the form pkg# : core# : thread#.
912     //
913     // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
914     //    value of this field determines the width of the core# + thread#
915     //    fields in the Apic Id.  It is also an upper bound on the number
916     //    of threads per package, but it has been verified that situations
    //    happen where it is not exact.  In particular, on certain OS/chip
918     //    combinations where Intel(R) Hyper-Threading Technology is supported
919     //    by the chip but has
920     //    been disabled, the value of this field will be 2 (for a single core
921     //    chip).  On other OS/chip combinations supporting
922     //    Intel(R) Hyper-Threading Technology, the value of
923     //    this field will be 1 when Intel(R) Hyper-Threading Technology is
924     //    disabled and 2 when it is enabled.
925     //
926     // Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4).  The
927     //    value of this field (+1) determines the width of the core# field in
928     //    the Apic Id.  The comments in "cpucount.cpp" say that this value is
929     //    an upper bound, but the IA-32 architecture manual says that it is
930     //    exactly the number of cores per package, and I haven't seen any
931     //    case where it wasn't.
932     //
933     // From this information, deduce the package Id, core Id, and thread Id,
934     // and set the corresponding fields in the apicThreadInfo struct.
935     //
936     unsigned i;
937     apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
938       __kmp_avail_proc * sizeof(apicThreadInfo));
939     unsigned nApics = 0;
940     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
941         //
942         // Skip this proc if it is not included in the machine model.
943         //
944         if (! KMP_CPU_ISSET(i, fullMask)) {
945             continue;
946         }
947         KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
948 
949         __kmp_affinity_bind_thread(i);
950         threadInfo[nApics].osId = i;
951 
952         //
953         // The apic id and max threads per pkg come from cpuid(1).
954         //
955         __kmp_x86_cpuid(1, 0, &buf);
956         if (! (buf.edx >> 9) & 1) {
957             __kmp_set_system_affinity(oldMask, TRUE);
958             __kmp_free(threadInfo);
959             KMP_CPU_FREE(oldMask);
960             *msg_id = kmp_i18n_str_ApicNotPresent;
961             return -1;
962         }
963         threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
964         threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
965         if (threadInfo[nApics].maxThreadsPerPkg == 0) {
966             threadInfo[nApics].maxThreadsPerPkg = 1;
967         }
968 
969         //
970         // Max cores per pkg comes from cpuid(4).
971         // 1 must be added to the encoded value.
972         //
973         // First, we need to check if cpuid(4) is supported on this chip.
974         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
975         // has the value n or greater.
976         //
977         __kmp_x86_cpuid(0, 0, &buf);
978         if (buf.eax >= 4) {
979             __kmp_x86_cpuid(4, 0, &buf);
980             threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
981         }
982         else {
983             threadInfo[nApics].maxCoresPerPkg = 1;
984         }
985 
986         //
987         // Infer the pkgId / coreId / threadId using only the info
988         // obtained locally.
989         //
990         int widthCT = __kmp_cpuid_mask_width(
991           threadInfo[nApics].maxThreadsPerPkg);
992         threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
993 
994         int widthC = __kmp_cpuid_mask_width(
995           threadInfo[nApics].maxCoresPerPkg);
996         int widthT = widthCT - widthC;
997         if (widthT < 0) {
998             //
999             // I've never seen this one happen, but I suppose it could, if
1000             // the cpuid instruction on a chip was really screwed up.
1001             // Make sure to restore the affinity mask before the tail call.
1002             //
1003             __kmp_set_system_affinity(oldMask, TRUE);
1004             __kmp_free(threadInfo);
1005             KMP_CPU_FREE(oldMask);
1006             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1007             return -1;
1008         }
1009 
1010         int maskC = (1 << widthC) - 1;
1011         threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
1012           &maskC;
1013 
1014         int maskT = (1 << widthT) - 1;
1015         threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT;
1016 
1017         nApics++;
1018     }
1019 
1020     //
1021     // We've collected all the info we need.
1022     // Restore the old affinity mask for this thread.
1023     //
1024     __kmp_set_system_affinity(oldMask, TRUE);
1025 
1026     //
1027     // If there's only one thread context to bind to, form an Address object
1028     // with depth 1 and return immediately (or, if affinity is off, set
1029     // address2os to NULL and return).
1030     //
1031     // If it is configured to omit the package level when there is only a
1032     // single package, the logic at the end of this routine won't work if
1033     // there is only a single thread - it would try to form an Address
1034     // object with depth 0.
1035     //
1036     KMP_ASSERT(nApics > 0);
1037     if (nApics == 1) {
1038         __kmp_ncores = nPackages = 1;
1039         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1040         if (__kmp_affinity_verbose) {
1041             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1042             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1043 
1044             KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1045             if (__kmp_affinity_respect_mask) {
1046                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1047             } else {
1048                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1049             }
1050             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1051             KMP_INFORM(Uniform, "KMP_AFFINITY");
1052             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1053               __kmp_nThreadsPerCore, __kmp_ncores);
1054         }
1055 
1056         if (__kmp_affinity_type == affinity_none) {
1057             __kmp_free(threadInfo);
1058             KMP_CPU_FREE(oldMask);
1059             return 0;
1060         }
1061 
1062         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
1063         Address addr(1);
1064         addr.labels[0] = threadInfo[0].pkgId;
1065         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
1066 
1067         if (__kmp_affinity_gran_levels < 0) {
1068             __kmp_affinity_gran_levels = 0;
1069         }
1070 
1071         if (__kmp_affinity_verbose) {
1072             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1073         }
1074 
1075         __kmp_free(threadInfo);
1076         KMP_CPU_FREE(oldMask);
1077         return 1;
1078     }
1079 
1080     //
1081     // Sort the threadInfo table by physical Id.
1082     //
1083     qsort(threadInfo, nApics, sizeof(*threadInfo),
1084       __kmp_affinity_cmp_apicThreadInfo_phys_id);
1085 
1086     //
1087     // The table is now sorted by pkgId / coreId / threadId, but we really
1088     // don't know the radix of any of the fields.  pkgId's may be sparsely
1089     // assigned among the chips on a system.  Although coreId's are usually
1090     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1091     // [0..threadsPerCore-1], we don't want to make any such assumptions.
1092     //
1093     // For that matter, we don't know what coresPerPkg and threadsPerCore
1094     // (or the total # packages) are at this point - we want to determine
1095     // that now.  We only have an upper bound on the first two figures.
1096     //
1097     // We also perform a consistency check at this point: the values returned
1098     // by the cpuid instruction for any thread bound to a given package had
1099     // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1100     //
1101     nPackages = 1;
1102     nCoresPerPkg = 1;
1103     __kmp_nThreadsPerCore = 1;
1104     unsigned nCores = 1;
1105 
1106     unsigned pkgCt = 1;                         // to determine radii
1107     unsigned lastPkgId = threadInfo[0].pkgId;
1108     unsigned coreCt = 1;
1109     unsigned lastCoreId = threadInfo[0].coreId;
1110     unsigned threadCt = 1;
1111     unsigned lastThreadId = threadInfo[0].threadId;
1112 
1113                                                 // intra-pkg consist checks
1114     unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1115     unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1116 
1117     for (i = 1; i < nApics; i++) {
1118         if (threadInfo[i].pkgId != lastPkgId) {
1119             nCores++;
1120             pkgCt++;
1121             lastPkgId = threadInfo[i].pkgId;
1122             if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1123             coreCt = 1;
1124             lastCoreId = threadInfo[i].coreId;
1125             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1126             threadCt = 1;
1127             lastThreadId = threadInfo[i].threadId;
1128 
1129             //
1130             // This is a different package, so go on to the next iteration
1131             // without doing any consistency checks.  Reset the consistency
1132             // check vars, though.
1133             //
1134             prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1135             prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1136             continue;
1137         }
1138 
1139         if (threadInfo[i].coreId != lastCoreId) {
1140             nCores++;
1141             coreCt++;
1142             lastCoreId = threadInfo[i].coreId;
1143             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1144             threadCt = 1;
1145             lastThreadId = threadInfo[i].threadId;
1146         }
1147         else if (threadInfo[i].threadId != lastThreadId) {
1148             threadCt++;
1149             lastThreadId = threadInfo[i].threadId;
1150         }
1151         else {
1152             __kmp_free(threadInfo);
1153             KMP_CPU_FREE(oldMask);
1154             *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1155             return -1;
1156         }
1157 
1158         //
1159         // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
        // fields agree between all the threads bound to a given package.
1161         //
1162         if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
1163           || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1164             __kmp_free(threadInfo);
1165             KMP_CPU_FREE(oldMask);
1166             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1167             return -1;
1168         }
1169     }
1170     nPackages = pkgCt;
1171     if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1172     if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1173 
1174     //
1175     // When affinity is off, this routine will still be called to set
1176     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1177     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1178     // correctly, and return now if affinity is not enabled.
1179     //
1180     __kmp_ncores = nCores;
1181     if (__kmp_affinity_verbose) {
1182         char buf[KMP_AFFIN_MASK_PRINT_LEN];
1183         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1184 
1185         KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1186         if (__kmp_affinity_respect_mask) {
1187             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1188         } else {
1189             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1190         }
1191         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1192         if (__kmp_affinity_uniform_topology()) {
1193             KMP_INFORM(Uniform, "KMP_AFFINITY");
1194         } else {
1195             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1196         }
1197         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1198           __kmp_nThreadsPerCore, __kmp_ncores);
1199 
1200     }
1201 
1202     if (__kmp_affinity_type == affinity_none) {
1203         __kmp_free(threadInfo);
1204         KMP_CPU_FREE(oldMask);
1205         return 0;
1206     }
1207 
1208     //
1209     // Now that we've determined the number of packages, the number of cores
1210     // per package, and the number of threads per core, we can construct the
1211     // data structure that is to be returned.
1212     //
1213     int pkgLevel = 0;
1214     int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1215     int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1216     unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
1217 
1218     KMP_ASSERT(depth > 0);
1219     *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1220 
1221     for (i = 0; i < nApics; ++i) {
1222         Address addr(depth);
1223         unsigned os = threadInfo[i].osId;
1224         int d = 0;
1225 
1226         if (pkgLevel >= 0) {
1227             addr.labels[d++] = threadInfo[i].pkgId;
1228         }
1229         if (coreLevel >= 0) {
1230             addr.labels[d++] = threadInfo[i].coreId;
1231         }
1232         if (threadLevel >= 0) {
1233             addr.labels[d++] = threadInfo[i].threadId;
1234         }
1235         (*address2os)[i] = AddrUnsPair(addr, os);
1236     }
1237 
1238     if (__kmp_affinity_gran_levels < 0) {
1239         //
1240         // Set the granularity level based on what levels are modeled
1241         // in the machine topology map.
1242         //
1243         __kmp_affinity_gran_levels = 0;
1244         if ((threadLevel >= 0)
1245           && (__kmp_affinity_gran > affinity_gran_thread)) {
1246             __kmp_affinity_gran_levels++;
1247         }
1248         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1249             __kmp_affinity_gran_levels++;
1250         }
1251         if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1252             __kmp_affinity_gran_levels++;
1253         }
1254     }
1255 
1256     if (__kmp_affinity_verbose) {
1257         __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1258           coreLevel, threadLevel);
1259     }
1260 
1261     __kmp_free(threadInfo);
1262     KMP_CPU_FREE(oldMask);
1263     return depth;
1264 }
1265 
1266 
1267 //
1268 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1269 // architectures support a newer interface for specifying the x2APIC Ids,
1270 // based on cpuid leaf 11.
1271 //
1272 static int
1273 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1274   kmp_i18n_id_t *const msg_id)
1275 {
1276     kmp_cpuid buf;
1277 
1278     *address2os = NULL;
1279     *msg_id = kmp_i18n_null;
1280 
1281     //
1282     // Check to see if cpuid leaf 11 is supported.
1283     //
1284     __kmp_x86_cpuid(0, 0, &buf);
1285     if (buf.eax < 11) {
1286         *msg_id = kmp_i18n_str_NoLeaf11Support;
1287         return -1;
1288     }
1289     __kmp_x86_cpuid(11, 0, &buf);
1290     if (buf.ebx == 0) {
1291         *msg_id = kmp_i18n_str_NoLeaf11Support;
1292         return -1;
1293     }
1294 
1295     //
1296     // Find the number of levels in the machine topology.  While we're at it,
1297     // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.  We will
1298     // try to get more accurate values later by explicitly counting them,
1299     // but get reasonable defaults now, in case we return early.
1300     //
1301     int level;
1302     int threadLevel = -1;
1303     int coreLevel = -1;
1304     int pkgLevel = -1;
1305     __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1306 
1307     for (level = 0;; level++) {
1308         if (level > 31) {
1309             //
1310             // FIXME: Hack for DPD200163180
1311             //
1312             // If level is big then something went wrong -> exiting
1313             //
1314             // There could actually be 32 valid levels in the machine topology,
1315             // but so far, the only machine we have seen which does not exit
1316             // this loop before iteration 32 has fubar x2APIC settings.
1317             //
1318             // For now, just reject this case based upon loop trip count.
1319             //
1320             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1321             return -1;
1322         }
1323         __kmp_x86_cpuid(11, level, &buf);
1324         if (buf.ebx == 0) {
1325             if (pkgLevel < 0) {
1326                 //
1327                 // Will infer nPackages from __kmp_xproc
1328                 //
1329                 pkgLevel = level;
1330                 level++;
1331             }
1332             break;
1333         }
1334         int kind = (buf.ecx >> 8) & 0xff;
1335         if (kind == 1) {
1336             //
1337             // SMT level
1338             //
1339             threadLevel = level;
1340             coreLevel = -1;
1341             pkgLevel = -1;
1342             __kmp_nThreadsPerCore = buf.ebx & 0xff;
1343             if (__kmp_nThreadsPerCore == 0) {
1344                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1345                 return -1;
1346             }
1347         }
1348         else if (kind == 2) {
1349             //
1350             // core level
1351             //
1352             coreLevel = level;
1353             pkgLevel = -1;
1354             nCoresPerPkg = buf.ebx & 0xff;
1355             if (nCoresPerPkg == 0) {
1356                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1357                 return -1;
1358             }
1359         }
1360         else {
1361             if (level <= 0) {
1362                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1363                 return -1;
1364             }
1365             if (pkgLevel >= 0) {
1366                 continue;
1367             }
1368             pkgLevel = level;
1369             nPackages = buf.ebx & 0xff;
1370             if (nPackages == 0) {
1371                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1372                 return -1;
1373             }
1374         }
1375     }
1376     int depth = level;
1377 
1378     //
1379     // In the above loop, "level" was counted from the finest level (usually
1380     // thread) to the coarsest.  The caller expects that we will place the
1381     // labels in (*address2os)[].first.labels[] in the inverse order, so
1382     // we need to invert the vars saying which level means what.
1383     //
1384     if (threadLevel >= 0) {
1385         threadLevel = depth - threadLevel - 1;
1386     }
1387     if (coreLevel >= 0) {
1388         coreLevel = depth - coreLevel - 1;
1389     }
1390     KMP_DEBUG_ASSERT(pkgLevel >= 0);
1391     pkgLevel = depth - pkgLevel - 1;
1392 
1393     //
1394     // The algorithm used starts by setting the affinity to each available
1395     // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
1398     // the defaults that we calculated from issuing cpuid without binding
1399     // to each proc.
1400     //
1401     if (! KMP_AFFINITY_CAPABLE())
1402     {
1403         //
1404         // Hack to try and infer the machine topology using only the data
1405         // available from cpuid on the current thread, and __kmp_xproc.
1406         //
1407         KMP_ASSERT(__kmp_affinity_type == affinity_none);
1408 
1409         __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1410         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1411         if (__kmp_affinity_verbose) {
1412             KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1413             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1414             if (__kmp_affinity_uniform_topology()) {
1415                 KMP_INFORM(Uniform, "KMP_AFFINITY");
1416             } else {
1417                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1418             }
1419             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1420               __kmp_nThreadsPerCore, __kmp_ncores);
1421         }
1422         return 0;
1423     }
1424 
1425     //
1426     //
1427     // From here on, we can assume that it is safe to call
1428     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1429     // even if __kmp_affinity_type = affinity_none.
1430     //
1431 
1432     //
1433     // Save the affinity mask for the current thread.
1434     //
1435     kmp_affin_mask_t *oldMask;
1436     KMP_CPU_ALLOC(oldMask);
1437     __kmp_get_system_affinity(oldMask, TRUE);
1438 
1439     //
1440     // Allocate the data structure to be returned.
1441     //
1442     AddrUnsPair *retval = (AddrUnsPair *)
1443       __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1444 
1445     //
1446     // Run through each of the available contexts, binding the current thread
1447     // to it, and obtaining the pertinent information using the cpuid instr.
1448     //
1449     unsigned int proc;
1450     int nApics = 0;
1451     for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
1452         //
1453         // Skip this proc if it is not included in the machine model.
1454         //
1455         if (! KMP_CPU_ISSET(proc, fullMask)) {
1456             continue;
1457         }
1458         KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1459 
1460         __kmp_affinity_bind_thread(proc);
1461 
1462         //
        // Extract the labels for each level in the machine topology map
1464         // from the Apic ID.
1465         //
1466         Address addr(depth);
1467         int prev_shift = 0;
1468 
1469         for (level = 0; level < depth; level++) {
1470             __kmp_x86_cpuid(11, level, &buf);
1471             unsigned apicId = buf.edx;
1472             if (buf.ebx == 0) {
1473                 if (level != depth - 1) {
1474                     KMP_CPU_FREE(oldMask);
1475                     *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1476                     return -1;
1477                 }
1478                 addr.labels[depth - level - 1] = apicId >> prev_shift;
1479                 level++;
1480                 break;
1481             }
1482             int shift = buf.eax & 0x1f;
1483             int mask = (1 << shift) - 1;
1484             addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1485             prev_shift = shift;
1486         }
1487         if (level != depth) {
1488             KMP_CPU_FREE(oldMask);
1489             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1490             return -1;
1491         }
1492 
1493         retval[nApics] = AddrUnsPair(addr, proc);
1494         nApics++;
1495     }
1496 
1497     //
1498     // We've collected all the info we need.
1499     // Restore the old affinity mask for this thread.
1500     //
1501     __kmp_set_system_affinity(oldMask, TRUE);
1502 
1503     //
1504     // If there's only one thread context to bind to, return now.
1505     //
1506     KMP_ASSERT(nApics > 0);
1507     if (nApics == 1) {
1508         __kmp_ncores = nPackages = 1;
1509         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1510         if (__kmp_affinity_verbose) {
1511             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1512             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1513 
1514             KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1515             if (__kmp_affinity_respect_mask) {
1516                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1517             } else {
1518                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1519             }
1520             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1521             KMP_INFORM(Uniform, "KMP_AFFINITY");
1522             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1523               __kmp_nThreadsPerCore, __kmp_ncores);
1524         }
1525 
1526         if (__kmp_affinity_type == affinity_none) {
1527             __kmp_free(retval);
1528             KMP_CPU_FREE(oldMask);
1529             return 0;
1530         }
1531 
1532         //
1533         // Form an Address object which only includes the package level.
1534         //
1535         Address addr(1);
1536         addr.labels[0] = retval[0].first.labels[pkgLevel];
1537         retval[0].first = addr;
1538 
1539         if (__kmp_affinity_gran_levels < 0) {
1540             __kmp_affinity_gran_levels = 0;
1541         }
1542 
1543         if (__kmp_affinity_verbose) {
1544             __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1545         }
1546 
1547         *address2os = retval;
1548         KMP_CPU_FREE(oldMask);
1549         return 1;
1550     }
1551 
1552     //
1553     // Sort the table by physical Id.
1554     //
1555     qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1556 
1557     //
1558     // Find the radix at each of the levels.
1559     //
1560     unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1561     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1562     unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1563     unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1564     for (level = 0; level < depth; level++) {
1565         totals[level] = 1;
1566         maxCt[level] = 1;
1567         counts[level] = 1;
1568         last[level] = retval[0].first.labels[level];
1569     }
1570 
1571     //
1572     // From here on, the iteration variable "level" runs from the finest
1573     // level to the coarsest, i.e. we iterate forward through
1574     // (*address2os)[].first.labels[] - in the previous loops, we iterated
1575     // backwards.
1576     //
1577     for (proc = 1; (int)proc < nApics; proc++) {
1578         int level;
1579         for (level = 0; level < depth; level++) {
1580             if (retval[proc].first.labels[level] != last[level]) {
1581                 int j;
1582                 for (j = level + 1; j < depth; j++) {
1583                     totals[j]++;
1584                     counts[j] = 1;
                    // The (commented-out) line below would cause incorrect
                    // topology information to be printed whenever the maximum
                    // count for a level (maxCt[level]) is encountered before a
                    // smaller count later in the array.
                    // For example, if pkg0 has 4 cores and pkg1 has 2 cores,
                    // then maxCt[1] would end up as 2, whereas it must be 4.
1590                     // TODO!!! Check if it can be commented safely
1591                     //maxCt[j] = 1;
1592                     last[j] = retval[proc].first.labels[j];
1593                 }
1594                 totals[level]++;
1595                 counts[level]++;
1596                 if (counts[level] > maxCt[level]) {
1597                     maxCt[level] = counts[level];
1598                 }
1599                 last[level] = retval[proc].first.labels[level];
1600                 break;
1601             }
1602             else if (level == depth - 1) {
1603                 __kmp_free(last);
1604                 __kmp_free(maxCt);
1605                 __kmp_free(counts);
1606                 __kmp_free(totals);
1607                 __kmp_free(retval);
1608                 KMP_CPU_FREE(oldMask);
1609                 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1610                 return -1;
1611             }
1612         }
1613     }
1614 
1615     //
1616     // When affinity is off, this routine will still be called to set
1617     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1618     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1619     // correctly, and return if affinity is not enabled.
1620     //
1621     if (threadLevel >= 0) {
1622         __kmp_nThreadsPerCore = maxCt[threadLevel];
1623     }
1624     else {
1625         __kmp_nThreadsPerCore = 1;
1626     }
1627     nPackages = totals[pkgLevel];
1628 
1629     if (coreLevel >= 0) {
1630         __kmp_ncores = totals[coreLevel];
1631         nCoresPerPkg = maxCt[coreLevel];
1632     }
1633     else {
1634         __kmp_ncores = nPackages;
1635         nCoresPerPkg = 1;
1636     }
1637 
1638     //
1639     // Check to see if the machine topology is uniform
1640     //
1641     unsigned prod = maxCt[0];
1642     for (level = 1; level < depth; level++) {
1643        prod *= maxCt[level];
1644     }
1645     bool uniform = (prod == totals[level - 1]);
1646 
1647     //
1648     // Print the machine topology summary.
1649     //
1650     if (__kmp_affinity_verbose) {
1651         char mask[KMP_AFFIN_MASK_PRINT_LEN];
1652         __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1653 
1654         KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1655         if (__kmp_affinity_respect_mask) {
1656             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1657         } else {
1658             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1659         }
1660         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1661         if (uniform) {
1662             KMP_INFORM(Uniform, "KMP_AFFINITY");
1663         } else {
1664             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1665         }
1666 
1667         kmp_str_buf_t buf;
1668         __kmp_str_buf_init(&buf);
1669 
1670         __kmp_str_buf_print(&buf, "%d", totals[0]);
1671         for (level = 1; level <= pkgLevel; level++) {
1672             __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1673         }
1674         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1675           __kmp_nThreadsPerCore, __kmp_ncores);
1676 
1677         __kmp_str_buf_free(&buf);
1678     }
1679 
1680     if (__kmp_affinity_type == affinity_none) {
1681         __kmp_free(last);
1682         __kmp_free(maxCt);
1683         __kmp_free(counts);
1684         __kmp_free(totals);
1685         __kmp_free(retval);
1686         KMP_CPU_FREE(oldMask);
1687         return 0;
1688     }
1689 
1690     //
    // Find any levels with radix 1, and remove them from the map
1692     // (except for the package level).
1693     //
1694     int new_depth = 0;
1695     for (level = 0; level < depth; level++) {
1696         if ((maxCt[level] == 1) && (level != pkgLevel)) {
1697            continue;
1698         }
1699         new_depth++;
1700     }
1701 
1702     //
1703     // If we are removing any levels, allocate a new vector to return,
1704     // and copy the relevant information to it.
1705     //
1706     if (new_depth != depth) {
1707         AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1708           sizeof(AddrUnsPair) * nApics);
1709         for (proc = 0; (int)proc < nApics; proc++) {
1710             Address addr(new_depth);
1711             new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1712         }
1713         int new_level = 0;
1714         for (level = 0; level < depth; level++) {
1715             if ((maxCt[level] == 1) && (level != pkgLevel)) {
1716                if (level == threadLevel) {
1717                    threadLevel = -1;
1718                }
1719                else if ((threadLevel >= 0) && (level < threadLevel)) {
1720                    threadLevel--;
1721                }
1722                if (level == coreLevel) {
1723                    coreLevel = -1;
1724                }
1725                else if ((coreLevel >= 0) && (level < coreLevel)) {
1726                    coreLevel--;
1727                }
1728                if (level < pkgLevel) {
1729                    pkgLevel--;
1730                }
1731                continue;
1732             }
1733             for (proc = 0; (int)proc < nApics; proc++) {
1734                 new_retval[proc].first.labels[new_level]
1735                   = retval[proc].first.labels[level];
1736             }
1737             new_level++;
1738         }
1739 
1740         __kmp_free(retval);
1741         retval = new_retval;
1742         depth = new_depth;
1743     }
1744 
1745     if (__kmp_affinity_gran_levels < 0) {
1746         //
1747         // Set the granularity level based on what levels are modeled
1748         // in the machine topology map.
1749         //
1750         __kmp_affinity_gran_levels = 0;
1751         if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1752             __kmp_affinity_gran_levels++;
1753         }
1754         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1755             __kmp_affinity_gran_levels++;
1756         }
1757         if (__kmp_affinity_gran > affinity_gran_package) {
1758             __kmp_affinity_gran_levels++;
1759         }
1760     }
1761 
1762     if (__kmp_affinity_verbose) {
1763         __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1764           coreLevel, threadLevel);
1765     }
1766 
1767     __kmp_free(last);
1768     __kmp_free(maxCt);
1769     __kmp_free(counts);
1770     __kmp_free(totals);
1771     KMP_CPU_FREE(oldMask);
1772     *address2os = retval;
1773     return depth;
1774 }
1775 
1776 
1777 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1778 
1779 
//
// Field indices into a /proc/cpuinfo record (each record is an array of
// "unsigned", one slot per field).  node_<n> fields, if present, are
// stored at nodeIdIndex + n.
//
#define osIdIndex       0
#define threadIdIndex   1
#define coreIdIndex     2
#define pkgIdIndex      3
#define nodeIdIndex     4

typedef unsigned *ProcCpuInfo;

// Highest valid field index; grows beyond pkgIdIndex if node_<n> fields
// are discovered while scanning the input file.
static unsigned maxIndex = pkgIdIndex;


//
// qsort() comparator: order cpuinfo records by OS processor id, ascending.
//
// NOTE(review): unlike __kmp_affinity_cmp_ProcCpuInfo_phys_id below, this
// treats its arguments as pointing at the records themselves rather than
// at "unsigned *" elements -- confirm against the call sites before using
// it to sort an array of record pointers.
//
static int
__kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
{
    const unsigned *aa = (const unsigned *)a;
    const unsigned *bb = (const unsigned *)b;
    if (aa[osIdIndex] < bb[osIdIndex]) return -1;
    if (aa[osIdIndex] > bb[osIdIndex]) return 1;
    return 0;
}
1799 
1800 
1801 static int
1802 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1803 {
1804     unsigned i;
1805     const unsigned *aa = *((const unsigned **)a);
1806     const unsigned *bb = *((const unsigned **)b);
1807     for (i = maxIndex; ; i--) {
1808         if (aa[i] < bb[i]) return -1;
1809         if (aa[i] > bb[i]) return 1;
1810         if (i == osIdIndex) break;
1811     }
1812     return 0;
1813 }
1814 
1815 
//
// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain
// the affinity map.
//
// On success, stores a newly allocated table of (Address, os id) pairs in
// *address2os and returns the depth of the resulting topology map.
// Returns 0 when affinity is disabled (affinity_none), or -1 on error,
// in which case *msg_id identifies the diagnostic message and *line the
// input line where parsing stopped.
//
static int
__kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
  kmp_i18n_id_t *const msg_id, FILE *f)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // First pass: scan the file, count the number of "processor" (osId)
    // fields, and find the highest value of <n> for a node_<n> field.
    //
    char buf[256];
    unsigned num_records = 0;
    while (! feof(f)) {
        // Sentinel byte: fgets() overwrites it with '\0' only when a line
        // fills the whole buffer; the second pass below uses this to
        // detect over-long lines.
        buf[sizeof(buf) - 1] = 1;
        if (! fgets(buf, sizeof(buf), f)) {
            //
            // Read errors presumably because of EOF
            //
            break;
        }

        char s1[] = "processor";
        if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
            num_records++;
            continue;
        }

        //
        // FIXME - this will match "node_<n> <garbage>"
        //
        unsigned level;
        // NOTE(review): "%d" with an unsigned target -- harmless for the
        // small non-negative values expected here, but "%u" would match
        // the type.
        if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
            if (nodeIdIndex + level >= maxIndex) {
                maxIndex = nodeIdIndex + level;
            }
            continue;
        }
    }

    //
    // Check for empty file / no valid processor records, or too many.
    // The number of records can't exceed the number of valid bits in the
    // affinity mask.
    //
    if (num_records == 0) {
        *line = 0;
        *msg_id = kmp_i18n_str_NoProcRecords;
        return -1;
    }
    if (num_records > (unsigned)__kmp_xproc) {
        *line = 0;
        *msg_id = kmp_i18n_str_TooManyProcRecords;
        return -1;
    }

    //
    // Set the file pointer back to the beginning, so that we can scan the
    // file again, this time performing a full parse of the data.
    // Allocate a vector of ProcCpuInfo object, where we will place the data.
    // Adding an extra element at the end allows us to remove a lot of extra
    // checks for termination conditions.
    //
    if (fseek(f, 0, SEEK_SET) != 0) {
        *line = 0;
        *msg_id = kmp_i18n_str_CantRewindCpuinfo;
        return -1;
    }

    //
    // Allocate the array of records to store the proc info in.  The dummy
    // element at the end makes the logic in filling them out easier to code.
    //
    unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
      * sizeof(unsigned *));
    unsigned i;
    for (i = 0; i <= num_records; i++) {
        threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
          * sizeof(unsigned));
    }

// Frees every record plus the table itself; used on every error path.
#define CLEANUP_THREAD_INFO \
    for (i = 0; i <= num_records; i++) {                                \
        __kmp_free(threadInfo[i]);                                      \
    }                                                                   \
    __kmp_free(threadInfo);

    //
    // A value of UINT_MAX means that we didn't find the field
    //
    unsigned __index;

#define INIT_PROC_INFO(p) \
    for (__index = 0; __index <= maxIndex; __index++) {                 \
        (p)[__index] = UINT_MAX;                                        \
    }

    for (i = 0; i <= num_records; i++) {
        INIT_PROC_INFO(threadInfo[i]);
    }

    // Second pass: parse each record into threadInfo[num_avail].
    unsigned num_avail = 0;
    *line = 0;
    while (! feof(f)) {
        //
        // Create an inner scoping level, so that all the goto targets at the
        // end of the loop appear in an outer scoping level.  This avoids
        // warnings about jumping past an initialization to a target in the
        // same block.
        //
        {
            buf[sizeof(buf) - 1] = 1;
            bool long_line = false;
            if (! fgets(buf, sizeof(buf), f)) {
                //
                // Read errors presumably because of EOF
                //
                // If there is valid data in threadInfo[num_avail], then fake
                // a blank line to ensure that the last address gets parsed.
                //
                bool valid = false;
                for (i = 0; i <= maxIndex; i++) {
                    if (threadInfo[num_avail][i] != UINT_MAX) {
                        valid = true;
                    }
                }
                if (! valid) {
                    break;
                }
                buf[0] = 0;
            } else if (!buf[sizeof(buf) - 1]) {
                //
                // The line is longer than the buffer.  Set a flag and don't
                // emit an error if we were going to ignore the line, anyway.
                //
                long_line = true;

// Error out on a too-long line, but only if it held a field we care about.
#define CHECK_LINE \
    if (long_line) {                                                    \
        CLEANUP_THREAD_INFO;                                            \
        *msg_id = kmp_i18n_str_LongLineCpuinfo;                         \
        return -1;                                                      \
    }
            }
            (*line)++;

            char s1[] = "processor";
            if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
                CHECK_LINE;
                char *p = strchr(buf + sizeof(s1) - 1, ':');
                unsigned val;
                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
                if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
                threadInfo[num_avail][osIdIndex] = val;
#if KMP_OS_LINUX && USE_SYSFS_INFO
                // On Linux, prefer the sysfs topology files over the
                // "physical id" / "core id" lines in /proc/cpuinfo.
                char path[256];
                KMP_SNPRINTF(path, sizeof(path),
                    "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
                    threadInfo[num_avail][osIdIndex]);
                __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);

                KMP_SNPRINTF(path, sizeof(path),
                    "/sys/devices/system/cpu/cpu%u/topology/core_id",
                    threadInfo[num_avail][osIdIndex]);
                __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
                continue;
#else
            }
            char s2[] = "physical id";
            if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
                CHECK_LINE;
                char *p = strchr(buf + sizeof(s2) - 1, ':');
                unsigned val;
                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
                if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
                threadInfo[num_avail][pkgIdIndex] = val;
                continue;
            }
            char s3[] = "core id";
            if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
                CHECK_LINE;
                char *p = strchr(buf + sizeof(s3) - 1, ':');
                unsigned val;
                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
                if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
                threadInfo[num_avail][coreIdIndex] = val;
                continue;
#endif // KMP_OS_LINUX && USE_SYSFS_INFO
            }
            char s4[] = "thread id";
            if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
                CHECK_LINE;
                char *p = strchr(buf + sizeof(s4) - 1, ':');
                unsigned val;
                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
                if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
                threadInfo[num_avail][threadIdIndex] = val;
                continue;
            }
            unsigned level;
            if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
                CHECK_LINE;
                // NOTE(review): reuses sizeof(s4) ("thread id") as the scan
                // start offset; works because strchr only needs to start
                // before the ':' -- confirm if the token format changes.
                char *p = strchr(buf + sizeof(s4) - 1, ':');
                unsigned val;
                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
                KMP_ASSERT(nodeIdIndex + level <= maxIndex);
                if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
                threadInfo[num_avail][nodeIdIndex + level] = val;
                continue;
            }

            //
            // We didn't recognize the leading token on the line.
            // There are lots of leading tokens that we don't recognize -
            // if the line isn't empty, go on to the next line.
            //
            if ((*buf != 0) && (*buf != '\n')) {
                //
                // If the line is longer than the buffer, read characters
                // until we find a newline.
                //
                if (long_line) {
                    int ch;
                    while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
                }
                continue;
            }

            //
            // A newline has signalled the end of the processor record.
            // Check that there aren't too many procs specified.
            //
            if ((int)num_avail == __kmp_xproc) {
                CLEANUP_THREAD_INFO;
                *msg_id = kmp_i18n_str_TooManyEntries;
                return -1;
            }

            //
            // Check for missing fields.  The osId field must be there, and we
            // currently require that the physical id field is specified, also.
            //
            if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
                CLEANUP_THREAD_INFO;
                *msg_id = kmp_i18n_str_MissingProcField;
                return -1;
            }
            // NOTE(review): checks record 0, not the current record --
            // presumably relies on all records having the same set of
            // fields; confirm before changing.
            if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
                CLEANUP_THREAD_INFO;
                *msg_id = kmp_i18n_str_MissingPhysicalIDField;
                return -1;
            }

            //
            // Skip this proc if it is not included in the machine model.
            //
            if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
                INIT_PROC_INFO(threadInfo[num_avail]);
                continue;
            }

            //
            // We have a successful parse of this proc's info.
            // Increment the counter, and prepare for the next proc.
            //
            num_avail++;
            KMP_ASSERT(num_avail <= num_records);
            INIT_PROC_INFO(threadInfo[num_avail]);
        }
        continue;

        no_val:
        CLEANUP_THREAD_INFO;
        *msg_id = kmp_i18n_str_MissingValCpuinfo;
        return -1;

        dup_field:
        CLEANUP_THREAD_INFO;
        *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
        return -1;
    }
    *line = 0;

# if KMP_MIC && REDUCE_TEAM_SIZE
    unsigned teamSize = 0;
# endif // KMP_MIC && REDUCE_TEAM_SIZE

    // check for num_records == __kmp_xproc ???

    //
    // If there's only one thread context to bind to, form an Address object
    // with depth 1 and return immediately (or, if affinity is off, set
    // address2os to NULL and return).
    //
    // If it is configured to omit the package level when there is only a
    // single package, the logic at the end of this routine won't work if
    // there is only a single thread - it would try to form an Address
    // object with depth 0.
    //
    KMP_ASSERT(num_avail > 0);
    KMP_ASSERT(num_avail <= num_records);
    if (num_avail == 1) {
        __kmp_ncores = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
        if (__kmp_affinity_verbose) {
            if (! KMP_AFFINITY_CAPABLE()) {
                KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
                KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            }
            else {
                char buf[KMP_AFFIN_MASK_PRINT_LEN];
                __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                  fullMask);
                KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
                if (__kmp_affinity_respect_mask) {
                    KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
                } else {
                    KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
                }
                KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            }
            int index;
            kmp_str_buf_t buf;
            __kmp_str_buf_init(&buf);
            __kmp_str_buf_print(&buf, "1");
            for (index = maxIndex - 1; index > pkgIdIndex; index--) {
                __kmp_str_buf_print(&buf, " x 1");
            }
            KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
            __kmp_str_buf_free(&buf);
        }

        if (__kmp_affinity_type == affinity_none) {
            CLEANUP_THREAD_INFO;
            return 0;
        }

        *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
        Address addr(1);
        addr.labels[0] = threadInfo[0][pkgIdIndex];
        (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
        }

        CLEANUP_THREAD_INFO;
        return 1;
    }

    //
    // Sort the threadInfo table by physical Id.
    //
    qsort(threadInfo, num_avail, sizeof(*threadInfo),
      __kmp_affinity_cmp_ProcCpuInfo_phys_id);

    //
    // The table is now sorted by pkgId / coreId / threadId, but we really
    // don't know the radix of any of the fields.  pkgId's may be sparsely
    // assigned among the chips on a system.  Although coreId's are usually
    // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
    // [0..threadsPerCore-1], we don't want to make any such assumptions.
    //
    // For that matter, we don't know what coresPerPkg and threadsPerCore
    // (or the total # packages) are at this point - we want to determine
    // that now.  We only have an upper bound on the first two figures.
    //
    unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
      * sizeof(unsigned));
    unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
      * sizeof(unsigned));
    unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
      * sizeof(unsigned));
    unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
      * sizeof(unsigned));

    bool assign_thread_ids = false;
    unsigned threadIdCt;
    unsigned index;

    // Restart target: if duplicate (unspecified) thread ids are found, we
    // jump back here with assign_thread_ids = true and synthesize them.
    restart_radix_check:
    threadIdCt = 0;

    //
    // Initialize the counter arrays with data from threadInfo[0].
    //
    if (assign_thread_ids) {
        if (threadInfo[0][threadIdIndex] == UINT_MAX) {
            threadInfo[0][threadIdIndex] = threadIdCt++;
        }
        else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
            threadIdCt = threadInfo[0][threadIdIndex] + 1;
        }
    }
    for (index = 0; index <= maxIndex; index++) {
        counts[index] = 1;
        maxCt[index] = 1;
        totals[index] = 1;
        lastId[index] = threadInfo[0][index];;
    }

    //
    // Run through the rest of the OS procs.
    //
    for (i = 1; i < num_avail; i++) {
        //
        // Find the most significant index whose id differs
        // from the id for the previous OS proc.
        //
        for (index = maxIndex; index >= threadIdIndex; index--) {
            if (assign_thread_ids && (index == threadIdIndex)) {
                //
                // Auto-assign the thread id field if it wasn't specified.
                //
                if (threadInfo[i][threadIdIndex] == UINT_MAX) {
                    threadInfo[i][threadIdIndex] = threadIdCt++;
                }

                //
                // Apparently the thread id field was specified for some
                // entries and not others.  Start the thread id counter
                // off at the next higher thread id.
                //
                else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
                    threadIdCt = threadInfo[i][threadIdIndex] + 1;
                }
            }
            if (threadInfo[i][index] != lastId[index]) {
                //
                // Run through all indices which are less significant,
                // and reset the counts to 1.
                //
                // At all levels up to and including index, we need to
                // increment the totals and record the last id.
                //
                unsigned index2;
                for (index2 = threadIdIndex; index2 < index; index2++) {
                    totals[index2]++;
                    if (counts[index2] > maxCt[index2]) {
                        maxCt[index2] = counts[index2];
                    }
                    counts[index2] = 1;
                    lastId[index2] = threadInfo[i][index2];
                }
                counts[index]++;
                totals[index]++;
                lastId[index] = threadInfo[i][index];

                if (assign_thread_ids && (index > threadIdIndex)) {

# if KMP_MIC && REDUCE_TEAM_SIZE
                    //
                    // The default team size is the total #threads in the machine
                    // minus 1 thread for every core that has 3 or more threads.
                    //
                    teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
# endif // KMP_MIC && REDUCE_TEAM_SIZE

                    //
                    // Restart the thread counter, as we are on a new core.
                    //
                    threadIdCt = 0;

                    //
                    // Auto-assign the thread id field if it wasn't specified.
                    //
                    if (threadInfo[i][threadIdIndex] == UINT_MAX) {
                        threadInfo[i][threadIdIndex] = threadIdCt++;
                    }

                    //
                    // Apparently the thread id field was specified for some
                    // entries and not others.  Start the thread id counter
                    // off at the next higher thread id.
                    //
                    else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
                        threadIdCt = threadInfo[i][threadIdIndex] + 1;
                    }
                }
                break;
            }
        }
        if (index < threadIdIndex) {
            //
            // If thread ids were specified, it is an error if they are not
            // unique.  Also, check that we haven't already restarted the
            // loop (to be safe - shouldn't need to).
            //
            if ((threadInfo[i][threadIdIndex] != UINT_MAX)
              || assign_thread_ids) {
                __kmp_free(lastId);
                __kmp_free(totals);
                __kmp_free(maxCt);
                __kmp_free(counts);
                CLEANUP_THREAD_INFO;
                *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
                return -1;
            }

            //
            // If the thread ids were not specified and we see entries
            // that are duplicates, start the loop over and
            // assign the thread ids manually.
            //
            assign_thread_ids = true;
            goto restart_radix_check;
        }
    }

# if KMP_MIC && REDUCE_TEAM_SIZE
    //
    // The default team size is the total #threads in the machine
    // minus 1 thread for every core that has 3 or more threads.
    //
    teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
# endif // KMP_MIC && REDUCE_TEAM_SIZE

    // Fold the counts from the final run of procs into the maxima.
    for (index = threadIdIndex; index <= maxIndex; index++) {
        if (counts[index] > maxCt[index]) {
            maxCt[index] = counts[index];
        }
    }

    __kmp_nThreadsPerCore = maxCt[threadIdIndex];
    nCoresPerPkg = maxCt[coreIdIndex];
    nPackages = totals[pkgIdIndex];

    //
    // Check to see if the machine topology is uniform
    //
    unsigned prod = totals[maxIndex];
    for (index = threadIdIndex; index < maxIndex; index++) {
       prod *= maxCt[index];
    }
    bool uniform = (prod == totals[threadIdIndex]);

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = totals[coreIdIndex];

    if (__kmp_affinity_verbose) {
        if (! KMP_AFFINITY_CAPABLE()) {
                KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
                KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
                if (uniform) {
                    KMP_INFORM(Uniform, "KMP_AFFINITY");
                } else {
                    KMP_INFORM(NonUniform, "KMP_AFFINITY");
                }
        }
        else {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
                KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
                if (__kmp_affinity_respect_mask) {
                    KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
                } else {
                    KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
                }
                KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
                if (uniform) {
                    KMP_INFORM(Uniform, "KMP_AFFINITY");
                } else {
                    KMP_INFORM(NonUniform, "KMP_AFFINITY");
                }
        }
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);

        __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
        for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
            __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
        }
        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str,  maxCt[coreIdIndex],
          maxCt[threadIdIndex], __kmp_ncores);

        __kmp_str_buf_free(&buf);
    }

# if KMP_MIC && REDUCE_TEAM_SIZE
    //
    // Set the default team size.
    //
    if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
        __kmp_dflt_team_nth = teamSize;
        KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
          __kmp_dflt_team_nth));
    }
# endif // KMP_MIC && REDUCE_TEAM_SIZE

    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(lastId);
        __kmp_free(totals);
        __kmp_free(maxCt);
        __kmp_free(counts);
        CLEANUP_THREAD_INFO;
        return 0;
    }

    //
    // Count the number of levels which have more nodes at that level than
    // at the parent's level (with there being an implicit root node of
    // the top level).  This is equivalent to saying that there is at least
    // one node at this level which has a sibling.  These levels are in the
    // map, and the package level is always in the map.
    //
    bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
    int level = 0;
    for (index = threadIdIndex; index < maxIndex; index++) {
        KMP_ASSERT(totals[index] >= totals[index + 1]);
        inMap[index] = (totals[index] > totals[index + 1]);
    }
    inMap[maxIndex] = (totals[maxIndex] > 1);
    inMap[pkgIdIndex] = true;

    int depth = 0;
    for (index = threadIdIndex; index <= maxIndex; index++) {
        if (inMap[index]) {
            depth++;
        }
    }
    KMP_ASSERT(depth > 0);

    //
    // Construct the data structure that is to be returned.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
    int pkgLevel = -1;
    int coreLevel = -1;
    int threadLevel = -1;

    for (i = 0; i < num_avail; ++i) {
        Address addr(depth);
        unsigned os = threadInfo[i][osIdIndex];
        int src_index;
        int dst_index = 0;

        // Copy only the levels that are in the map, most significant first,
        // recording where the pkg / core / thread levels land.
        for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
            if (! inMap[src_index]) {
                continue;
            }
            addr.labels[dst_index] = threadInfo[i][src_index];
            if (src_index == pkgIdIndex) {
                pkgLevel = dst_index;
            }
            else if (src_index == coreIdIndex) {
                coreLevel = dst_index;
            }
            else if (src_index == threadIdIndex) {
                threadLevel = dst_index;
            }
            dst_index++;
        }
        (*address2os)[i] = AddrUnsPair(addr, os);
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
        unsigned src_index;
        __kmp_affinity_gran_levels = 0;
        for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
            if (! inMap[src_index]) {
                continue;
            }
            switch (src_index) {
                case threadIdIndex:
                if (__kmp_affinity_gran > affinity_gran_thread) {
                    __kmp_affinity_gran_levels++;
                }

                break;
                case coreIdIndex:
                if (__kmp_affinity_gran > affinity_gran_core) {
                    __kmp_affinity_gran_levels++;
                }
                break;

                case pkgIdIndex:
                if (__kmp_affinity_gran > affinity_gran_package) {
                    __kmp_affinity_gran_levels++;
                }
                break;
            }
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    __kmp_free(inMap);
    __kmp_free(lastId);
    __kmp_free(totals);
    __kmp_free(maxCt);
    __kmp_free(counts);
    CLEANUP_THREAD_INFO;
    return depth;
}
2533 
2534 
2535 //
2536 // Create and return a table of affinity masks, indexed by OS thread ID.
2537 // This routine handles OR'ing together all the affinity masks of threads
2538 // that are sufficiently close, if granularity > fine.
2539 //
2540 static kmp_affin_mask_t *
2541 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2542   AddrUnsPair *address2os, unsigned numAddrs)
2543 {
2544     //
2545     // First form a table of affinity masks in order of OS thread id.
2546     //
2547     unsigned depth;
2548     unsigned maxOsId;
2549     unsigned i;
2550 
2551     KMP_ASSERT(numAddrs > 0);
2552     depth = address2os[0].first.depth;
2553 
2554     maxOsId = 0;
2555     for (i = 0; i < numAddrs; i++) {
2556         unsigned osId = address2os[i].second;
2557         if (osId > maxOsId) {
2558             maxOsId = osId;
2559         }
2560     }
2561     kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
2562       (maxOsId + 1) * __kmp_affin_mask_size);
2563 
2564     //
2565     // Sort the address2os table according to physical order.  Doing so
2566     // will put all threads on the same core/package/node in consecutive
2567     // locations.
2568     //
2569     qsort(address2os, numAddrs, sizeof(*address2os),
2570       __kmp_affinity_cmp_Address_labels);
2571 
2572     KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2573     if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2574         KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY",  __kmp_affinity_gran_levels);
2575     }
2576     if (__kmp_affinity_gran_levels >= (int)depth) {
2577         if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2578           && (__kmp_affinity_type != affinity_none))) {
2579             KMP_WARNING(AffThreadsMayMigrate);
2580         }
2581     }
2582 
2583     //
2584     // Run through the table, forming the masks for all threads on each
2585     // core.  Threads on the same core will have identical "Address"
2586     // objects, not considering the last level, which must be the thread
2587     // id.  All threads on a core will appear consecutively.
2588     //
2589     unsigned unique = 0;
2590     unsigned j = 0;                             // index of 1st thread on core
2591     unsigned leader = 0;
2592     Address *leaderAddr = &(address2os[0].first);
2593     kmp_affin_mask_t *sum
2594       = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
2595     KMP_CPU_ZERO(sum);
2596     KMP_CPU_SET(address2os[0].second, sum);
2597     for (i = 1; i < numAddrs; i++) {
2598         //
2599         // If this thread is sufficiently close to the leader (within the
2600         // granularity setting), then set the bit for this os thread in the
2601         // affinity mask for this group, and go on to the next thread.
2602         //
2603         if (leaderAddr->isClose(address2os[i].first,
2604           __kmp_affinity_gran_levels)) {
2605             KMP_CPU_SET(address2os[i].second, sum);
2606             continue;
2607         }
2608 
2609         //
2610         // For every thread in this group, copy the mask to the thread's
2611         // entry in the osId2Mask table.  Mark the first address as a
2612         // leader.
2613         //
2614         for (; j < i; j++) {
2615             unsigned osId = address2os[j].second;
2616             KMP_DEBUG_ASSERT(osId <= maxOsId);
2617             kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2618             KMP_CPU_COPY(mask, sum);
2619             address2os[j].first.leader = (j == leader);
2620         }
2621         unique++;
2622 
2623         //
2624         // Start a new mask.
2625         //
2626         leader = i;
2627         leaderAddr = &(address2os[i].first);
2628         KMP_CPU_ZERO(sum);
2629         KMP_CPU_SET(address2os[i].second, sum);
2630     }
2631 
2632     //
2633     // For every thread in last group, copy the mask to the thread's
2634     // entry in the osId2Mask table.
2635     //
2636     for (; j < i; j++) {
2637         unsigned osId = address2os[j].second;
2638         KMP_DEBUG_ASSERT(osId <= maxOsId);
2639         kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2640         KMP_CPU_COPY(mask, sum);
2641         address2os[j].first.leader = (j == leader);
2642     }
2643     unique++;
2644 
2645     *maxIndex = maxOsId;
2646     *numUnique = unique;
2647     return osId2Mask;
2648 }
2649 
2650 
2651 //
2652 // Stuff for the affinity proclist parsers.  It's easier to declare these vars
2653 // as file-static than to try and pass them through the calling sequence of
2654 // the recursive-descent OMP_PLACES parser.
2655 //
2656 static kmp_affin_mask_t *newMasks;
2657 static int numNewMasks;
2658 static int nextNewMask;
2659 
//
// Append a copy of _mask to the newMasks vector, doubling the vector's
// capacity first if it is full.  (newMasks must have been allocated with
// KMP_INTERNAL_MALLOC so the realloc here is valid.)
//
#define ADD_MASK(_mask) \
    {                                                                   \
        if (nextNewMask >= numNewMasks) {                               \
            numNewMasks *= 2;                                           \
            newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
              numNewMasks * __kmp_affin_mask_size);                     \
        }                                                               \
        KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));    \
        nextNewMask++;                                                  \
    }
2670 
//
// Append the mask for OS proc id _osId to newMasks, or warn (when verbose
// or warnings are enabled) and skip it if the id exceeds _maxOsId or is
// not present in the _osId2Mask machine model.
//
#define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
    {                                                                   \
        if (((_osId) > _maxOsId) ||                                     \
          (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
            if (__kmp_affinity_verbose || (__kmp_affinity_warnings      \
              && (__kmp_affinity_type != affinity_none))) {             \
                KMP_WARNING(AffIgnoreInvalidProcID, _osId);             \
            }                                                           \
        }                                                               \
        else {                                                          \
            ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));               \
        }                                                               \
    }
2684 
2685 
2686 //
2687 // Re-parse the proclist (for the explicit affinity type), and form the list
2688 // of affinity newMasks indexed by gtid.
2689 //
2690 static void
2691 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2692   unsigned int *out_numMasks, const char *proclist,
2693   kmp_affin_mask_t *osId2Mask, int maxOsId)
2694 {
2695     const char *scan = proclist;
2696     const char *next = proclist;
2697 
2698     //
2699     // We use malloc() for the temporary mask vector,
2700     // so that we can use realloc() to extend it.
2701     //
2702     numNewMasks = 2;
2703     newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2704       * __kmp_affin_mask_size);
2705     nextNewMask = 0;
2706     kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
2707       __kmp_affin_mask_size);
2708     int setSize = 0;
2709 
2710     for (;;) {
2711         int start, end, stride;
2712 
2713         SKIP_WS(scan);
2714         next = scan;
2715         if (*next == '\0') {
2716             break;
2717         }
2718 
2719         if (*next == '{') {
2720             int num;
2721             setSize = 0;
2722             next++;     // skip '{'
2723             SKIP_WS(next);
2724             scan = next;
2725 
2726             //
2727             // Read the first integer in the set.
2728             //
2729             KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2730               "bad proclist");
2731             SKIP_DIGITS(next);
2732             num = __kmp_str_to_int(scan, *next);
2733             KMP_ASSERT2(num >= 0, "bad explicit proc list");
2734 
2735             //
2736             // Copy the mask for that osId to the sum (union) mask.
2737             //
2738             if ((num > maxOsId) ||
2739               (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2740                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2741                   && (__kmp_affinity_type != affinity_none))) {
2742                     KMP_WARNING(AffIgnoreInvalidProcID, num);
2743                 }
2744                 KMP_CPU_ZERO(sumMask);
2745             }
2746             else {
2747                 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2748                 setSize = 1;
2749             }
2750 
2751             for (;;) {
2752                 //
2753                 // Check for end of set.
2754                 //
2755                 SKIP_WS(next);
2756                 if (*next == '}') {
2757                     next++;     // skip '}'
2758                     break;
2759                 }
2760 
2761                 //
2762                 // Skip optional comma.
2763                 //
2764                 if (*next == ',') {
2765                     next++;
2766                 }
2767                 SKIP_WS(next);
2768 
2769                 //
2770                 // Read the next integer in the set.
2771                 //
2772                 scan = next;
2773                 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2774                   "bad explicit proc list");
2775 
2776                 SKIP_DIGITS(next);
2777                 num = __kmp_str_to_int(scan, *next);
2778                 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2779 
2780                 //
2781                 // Add the mask for that osId to the sum mask.
2782                 //
2783                 if ((num > maxOsId) ||
2784                   (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2785                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2786                       && (__kmp_affinity_type != affinity_none))) {
2787                         KMP_WARNING(AffIgnoreInvalidProcID, num);
2788                     }
2789                 }
2790                 else {
2791                     KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2792                     setSize++;
2793                 }
2794             }
2795             if (setSize > 0) {
2796                 ADD_MASK(sumMask);
2797             }
2798 
2799             SKIP_WS(next);
2800             if (*next == ',') {
2801                 next++;
2802             }
2803             scan = next;
2804             continue;
2805         }
2806 
2807         //
2808         // Read the first integer.
2809         //
2810         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2811         SKIP_DIGITS(next);
2812         start = __kmp_str_to_int(scan, *next);
2813         KMP_ASSERT2(start >= 0, "bad explicit proc list");
2814         SKIP_WS(next);
2815 
2816         //
2817         // If this isn't a range, then add a mask to the list and go on.
2818         //
2819         if (*next != '-') {
2820             ADD_MASK_OSID(start, osId2Mask, maxOsId);
2821 
2822             //
2823             // Skip optional comma.
2824             //
2825             if (*next == ',') {
2826                 next++;
2827             }
2828             scan = next;
2829             continue;
2830         }
2831 
2832         //
2833         // This is a range.  Skip over the '-' and read in the 2nd int.
2834         //
2835         next++;         // skip '-'
2836         SKIP_WS(next);
2837         scan = next;
2838         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2839         SKIP_DIGITS(next);
2840         end = __kmp_str_to_int(scan, *next);
2841         KMP_ASSERT2(end >= 0, "bad explicit proc list");
2842 
2843         //
2844         // Check for a stride parameter
2845         //
2846         stride = 1;
2847         SKIP_WS(next);
2848         if (*next == ':') {
2849             //
2850             // A stride is specified.  Skip over the ':" and read the 3rd int.
2851             //
2852             int sign = +1;
2853             next++;         // skip ':'
2854             SKIP_WS(next);
2855             scan = next;
2856             if (*next == '-') {
2857                 sign = -1;
2858                 next++;
2859                 SKIP_WS(next);
2860                 scan = next;
2861             }
2862             KMP_ASSERT2((*next >=  '0') && (*next <= '9'),
2863               "bad explicit proc list");
2864             SKIP_DIGITS(next);
2865             stride = __kmp_str_to_int(scan, *next);
2866             KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2867             stride *= sign;
2868         }
2869 
2870         //
2871         // Do some range checks.
2872         //
2873         KMP_ASSERT2(stride != 0, "bad explicit proc list");
2874         if (stride > 0) {
2875             KMP_ASSERT2(start <= end, "bad explicit proc list");
2876         }
2877         else {
2878             KMP_ASSERT2(start >= end, "bad explicit proc list");
2879         }
2880         KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2881 
2882         //
2883         // Add the mask for each OS proc # to the list.
2884         //
2885         if (stride > 0) {
2886             do {
2887                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2888                 start += stride;
2889             } while (start <= end);
2890         }
2891         else {
2892             do {
2893                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2894                 start += stride;
2895             } while (start >= end);
2896         }
2897 
2898         //
2899         // Skip optional comma.
2900         //
2901         SKIP_WS(next);
2902         if (*next == ',') {
2903             next++;
2904         }
2905         scan = next;
2906     }
2907 
2908     *out_numMasks = nextNewMask;
2909     if (nextNewMask == 0) {
2910         *out_masks = NULL;
2911         KMP_INTERNAL_FREE(newMasks);
2912         return;
2913     }
2914     *out_masks
2915       = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
2916     KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
2917     __kmp_free(sumMask);
2918     KMP_INTERNAL_FREE(newMasks);
2919 }
2920 
2921 
2922 # if OMP_40_ENABLED
2923 
2924 /*-----------------------------------------------------------------------------
2925 
2926 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
2927 places.  Again, Here is the grammar:
2928 
2929 place_list := place
2930 place_list := place , place_list
2931 place := num
2932 place := place : num
2933 place := place : num : signed
2934 place := { subplacelist }
2935 place := ! place                  // (lowest priority)
2936 subplace_list := subplace
2937 subplace_list := subplace , subplace_list
2938 subplace := num
2939 subplace := num : num
2940 subplace := num : num : signed
2941 signed := num
2942 signed := + signed
2943 signed := - signed
2944 
2945 -----------------------------------------------------------------------------*/
2946 
//
// Parse one subplace_list (the interior of "{ ... }") from *scan, OR'ing
// each named OS proc's mask into tempMask and counting valid procs in
// *setSize.  Grammar per the comment above: num | num:count | num:count:stride.
// On return *scan points at the terminating '}'.  Ids greater than maxOsId
// or absent from osId2Mask are warned about and skipped.
//
static void
__kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
{
    const char *next;

    for (;;) {
        int start, count, stride, i;

        //
        // Read in the starting proc id
        //
        SKIP_WS(*scan);
        KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
          "bad explicit places list");
        next = *scan;
        SKIP_DIGITS(next);
        start = __kmp_str_to_int(*scan, *next);
        KMP_ASSERT(start >= 0);
        *scan = next;

        //
        // valid follow sets are ',' ':' and '}'
        //
        SKIP_WS(*scan);
        if (**scan == '}' || **scan == ',') {
            // Single proc id (no count/stride): union it in if valid.
            if ((start > maxOsId) ||
              (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
                if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                  && (__kmp_affinity_type != affinity_none))) {
                    KMP_WARNING(AffIgnoreInvalidProcID, start);
                }
            }
            else {
                KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
                (*setSize)++;
            }
            if (**scan == '}') {
                break;
            }
            (*scan)++;  // skip ','
            continue;
        }
        KMP_ASSERT2(**scan == ':', "bad explicit places list");
        (*scan)++;      // skip ':'

        //
        // Read count parameter
        //
        SKIP_WS(*scan);
        KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
          "bad explicit places list");
        next = *scan;
        SKIP_DIGITS(next);
        count = __kmp_str_to_int(*scan, *next);
        KMP_ASSERT(count >= 0);
        *scan = next;

        //
        // valid follow sets are ',' ':' and '}'
        //
        SKIP_WS(*scan);
        if (**scan == '}' || **scan == ',') {
            // num:count — add "count" consecutive proc ids starting at "start".
            for (i = 0; i < count; i++) {
                if ((start > maxOsId) ||
                  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
                    if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                      && (__kmp_affinity_type != affinity_none))) {
                        KMP_WARNING(AffIgnoreInvalidProcID, start);
                    }
                    break;  // don't proliferate warnings for large count
                }
                else {
                    KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
                    start++;
                    (*setSize)++;
                }
            }
            if (**scan == '}') {
                break;
            }
            (*scan)++;  // skip ','
            continue;
        }
        KMP_ASSERT2(**scan == ':', "bad explicit places list");
        (*scan)++;      // skip ':'

        //
        // Read stride parameter
        //
        // Consume any sequence of '+' / '-' signs; each '-' flips the sign.
        int sign = +1;
        for (;;) {
            SKIP_WS(*scan);
            if (**scan == '+') {
                (*scan)++; // skip '+'
                continue;
            }
            if (**scan == '-') {
                sign *= -1;
                (*scan)++; // skip '-'
                continue;
            }
            break;
        }
        SKIP_WS(*scan);
        KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
          "bad explicit places list");
        next = *scan;
        SKIP_DIGITS(next);
        stride = __kmp_str_to_int(*scan, *next);
        KMP_ASSERT(stride >= 0);
        *scan = next;
        stride *= sign;

        //
        // valid follow sets are ',' and '}'
        //
        SKIP_WS(*scan);
        if (**scan == '}' || **scan == ',') {
            // num:count:stride — add "count" proc ids, "stride" apart.
            for (i = 0; i < count; i++) {
                if ((start > maxOsId) ||
                  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
                    if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                      && (__kmp_affinity_type != affinity_none))) {
                        KMP_WARNING(AffIgnoreInvalidProcID, start);
                    }
                    break;  // don't proliferate warnings for large count
                }
                else {
                    KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
                    start += stride;
                    (*setSize)++;
                }
            }
            if (**scan == '}') {
                break;
            }
            (*scan)++;  // skip ','
            continue;
        }

        KMP_ASSERT2(0, "bad explicit places list");
    }
}
3091 
3092 
3093 static void
3094 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3095   int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3096 {
3097     const char *next;
3098 
3099     //
3100     // valid follow sets are '{' '!' and num
3101     //
3102     SKIP_WS(*scan);
3103     if (**scan == '{') {
3104         (*scan)++;      // skip '{'
3105         __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
3106           setSize);
3107         KMP_ASSERT2(**scan == '}', "bad explicit places list");
3108         (*scan)++;      // skip '}'
3109     }
3110     else if (**scan == '!') {
3111         __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3112         KMP_CPU_COMPLEMENT(tempMask);
3113         (*scan)++;      // skip '!'
3114     }
3115     else if ((**scan >= '0') && (**scan <= '9')) {
3116         next = *scan;
3117         SKIP_DIGITS(next);
3118         int num = __kmp_str_to_int(*scan, *next);
3119         KMP_ASSERT(num >= 0);
3120         if ((num > maxOsId) ||
3121           (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3122             if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3123               && (__kmp_affinity_type != affinity_none))) {
3124                 KMP_WARNING(AffIgnoreInvalidProcID, num);
3125             }
3126         }
3127         else {
3128             KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3129             (*setSize)++;
3130         }
3131         *scan = next;  // skip num
3132     }
3133     else {
3134         KMP_ASSERT2(0, "bad explicit places list");
3135     }
3136 }
3137 
3138 
3139 //static void
3140 void
3141 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3142   unsigned int *out_numMasks, const char *placelist,
3143   kmp_affin_mask_t *osId2Mask, int maxOsId)
3144 {
3145     const char *scan = placelist;
3146     const char *next = placelist;
3147 
3148     numNewMasks = 2;
3149     newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
3150       * __kmp_affin_mask_size);
3151     nextNewMask = 0;
3152 
3153     kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
3154       __kmp_affin_mask_size);
3155     KMP_CPU_ZERO(tempMask);
3156     int setSize = 0;
3157 
3158     for (;;) {
3159         __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3160 
3161         //
3162         // valid follow sets are ',' ':' and EOL
3163         //
3164         SKIP_WS(scan);
3165         if (*scan == '\0' || *scan == ',') {
3166             if (setSize > 0) {
3167                 ADD_MASK(tempMask);
3168             }
3169             KMP_CPU_ZERO(tempMask);
3170             setSize = 0;
3171             if (*scan == '\0') {
3172                 break;
3173             }
3174             scan++;     // skip ','
3175             continue;
3176         }
3177 
3178         KMP_ASSERT2(*scan == ':', "bad explicit places list");
3179         scan++;         // skip ':'
3180 
3181         //
3182         // Read count parameter
3183         //
3184         SKIP_WS(scan);
3185         KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3186           "bad explicit places list");
3187         next = scan;
3188         SKIP_DIGITS(next);
3189         int count = __kmp_str_to_int(scan, *next);
3190         KMP_ASSERT(count >= 0);
3191         scan = next;
3192 
3193         //
3194         // valid follow sets are ',' ':' and EOL
3195         //
3196         SKIP_WS(scan);
3197         int stride;
3198         if (*scan == '\0' || *scan == ',') {
3199             stride = +1;
3200         }
3201         else {
3202             KMP_ASSERT2(*scan == ':', "bad explicit places list");
3203             scan++;         // skip ':'
3204 
3205             //
3206             // Read stride parameter
3207             //
3208             int sign = +1;
3209             for (;;) {
3210                 SKIP_WS(scan);
3211                 if (*scan == '+') {
3212                     scan++; // skip '+'
3213                     continue;
3214                 }
3215                 if (*scan == '-') {
3216                     sign *= -1;
3217                     scan++; // skip '-'
3218                     continue;
3219                 }
3220                 break;
3221             }
3222             SKIP_WS(scan);
3223             KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3224               "bad explicit places list");
3225             next = scan;
3226             SKIP_DIGITS(next);
3227             stride = __kmp_str_to_int(scan, *next);
3228             KMP_DEBUG_ASSERT(stride >= 0);
3229             scan = next;
3230             stride *= sign;
3231         }
3232 
3233         if (stride > 0) {
3234             int i;
3235             for (i = 0; i < count; i++) {
3236                 int j;
3237                 if (setSize == 0) {
3238                     break;
3239                 }
3240                 ADD_MASK(tempMask);
3241                 setSize = 0;
3242                 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
3243                     if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3244                         KMP_CPU_CLR(j, tempMask);
3245                     }
3246                     else if ((j > maxOsId) ||
3247                       (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3248                         if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3249                           && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
3250                             KMP_WARNING(AffIgnoreInvalidProcID, j);
3251                         }
3252                         KMP_CPU_CLR(j, tempMask);
3253                     }
3254                     else {
3255                         KMP_CPU_SET(j, tempMask);
3256                         setSize++;
3257                     }
3258                 }
3259                 for (; j >= 0; j--) {
3260                     KMP_CPU_CLR(j, tempMask);
3261                 }
3262             }
3263         }
3264         else {
3265             int i;
3266             for (i = 0; i < count; i++) {
3267                 int j;
3268                 if (setSize == 0) {
3269                     break;
3270                 }
3271                 ADD_MASK(tempMask);
3272                 setSize = 0;
3273                 for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride;
3274                   j++) {
3275                     if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3276                         KMP_CPU_CLR(j, tempMask);
3277                     }
3278                     else if ((j > maxOsId) ||
3279                       (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3280                         if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3281                           && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
3282                             KMP_WARNING(AffIgnoreInvalidProcID, j);
3283                         }
3284                         KMP_CPU_CLR(j, tempMask);
3285                     }
3286                     else {
3287                         KMP_CPU_SET(j, tempMask);
3288                         setSize++;
3289                     }
3290                 }
3291                 for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
3292                     KMP_CPU_CLR(j, tempMask);
3293                 }
3294             }
3295         }
3296         KMP_CPU_ZERO(tempMask);
3297         setSize = 0;
3298 
3299         //
3300         // valid follow sets are ',' and EOL
3301         //
3302         SKIP_WS(scan);
3303         if (*scan == '\0') {
3304             break;
3305         }
3306         if (*scan == ',') {
3307             scan++;     // skip ','
3308             continue;
3309         }
3310 
3311         KMP_ASSERT2(0, "bad explicit places list");
3312     }
3313 
3314     *out_numMasks = nextNewMask;
3315     if (nextNewMask == 0) {
3316         *out_masks = NULL;
3317         KMP_INTERNAL_FREE(newMasks);
3318         return;
3319     }
3320     *out_masks
3321       = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
3322     KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
3323     __kmp_free(tempMask);
3324     KMP_INTERNAL_FREE(newMasks);
3325 }
3326 
3327 # endif /* OMP_40_ENABLED */
3328 
3329 #undef ADD_MASK
3330 #undef ADD_MASK_OSID
3331 
3332 static void
3333 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3334 {
3335     if ( __kmp_place_num_cores == 0 ) {
3336         if ( __kmp_place_num_threads_per_core == 0 ) {
3337             return;   // no cores limiting actions requested, exit
3338         }
3339         __kmp_place_num_cores = nCoresPerPkg;   // use all available cores
3340     }
3341     if ( !__kmp_affinity_uniform_topology() ) {
3342         KMP_WARNING( AffThrPlaceNonUniform );
3343         return; // don't support non-uniform topology
3344     }
3345     if ( depth != 3 ) {
3346         KMP_WARNING( AffThrPlaceNonThreeLevel );
3347         return; // don't support not-3-level topology
3348     }
3349     if ( __kmp_place_num_threads_per_core == 0 ) {
3350         __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore;  // use all HW contexts
3351     }
3352     if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
3353         KMP_WARNING( AffThrPlaceManyCores );
3354         return;
3355     }
3356 
3357     AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3358                             nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3359     int i, j, k, n_old = 0, n_new = 0;
3360     for ( i = 0; i < nPackages; ++i ) {
3361         for ( j = 0; j < nCoresPerPkg; ++j ) {
3362             if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) {
3363                 n_old += __kmp_nThreadsPerCore;   // skip not-requested core
3364             } else {
3365                 for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) {
3366                     if ( k < __kmp_place_num_threads_per_core ) {
3367                         newAddr[n_new] = (*pAddr)[n_old];   // copy requested core' data to new location
3368                         n_new++;
3369                     }
3370                     n_old++;
3371                 }
3372             }
3373         }
3374     }
3375     nCoresPerPkg = __kmp_place_num_cores;                     // correct nCoresPerPkg
3376     __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3377     __kmp_avail_proc = n_new;                                 // correct avail_proc
3378     __kmp_ncores = nPackages * __kmp_place_num_cores;         // correct ncores
3379 
3380     __kmp_free( *pAddr );
3381     *pAddr = newAddr;      // replace old topology with new one
3382 }
3383 
3384 
// Full topology table, one entry per available OS proc; built by
// __kmp_aux_affinity_initialize() and later sorted per __kmp_affinity_compact.
static AddrUnsPair *address2os = NULL;
// Map "core * nth_per_core + thread  ->  OS proc id" built for balanced
// affinity on non-uniform topologies; -1 marks a missing HW context.
static int           * procarr = NULL;
// Topology depth saved by the balanced-affinity path for later use.
static int     __kmp_aff_depth = 0;
3388 
//
// Core affinity initialization: build the "full" processor mask, discover
// the machine topology (via one of several methods, chosen by
// __kmp_affinity_top_method), optionally trim it per KMP_PLACE_THREADS,
// and finally construct __kmp_affinity_masks — the table of per-place
// affinity masks indexed by place/thread id.  On any unrecoverable
// discovery failure, affinity is turned off (affinity_none) rather than
// aborting, except when the user forced a specific discovery method.
//
static void
__kmp_aux_affinity_initialize(void)
{
    // Already initialized - nothing to do.
    if (__kmp_affinity_masks != NULL) {
        KMP_ASSERT(fullMask != NULL);
        return;
    }

    //
    // Create the "full" mask - this defines all of the processors that we
    // consider to be in the machine model.  If respect is set, then it is
    // the initialization thread's affinity mask.  Otherwise, it is all
    // processors that we know about on the machine.
    //
    if (fullMask == NULL) {
        fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
    }
    if (KMP_AFFINITY_CAPABLE()) {
        if (__kmp_affinity_respect_mask) {
            __kmp_get_system_affinity(fullMask, TRUE);

            //
            // Count the number of available processors.
            //
            unsigned i;
            __kmp_avail_proc = 0;
            for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
                if (! KMP_CPU_ISSET(i, fullMask)) {
                    continue;
                }
                __kmp_avail_proc++;
            }
            // Sanity check: more procs in the mask than the machine reports
            // means the mask is unusable - disable affinity.
            if (__kmp_avail_proc > __kmp_xproc) {
                if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                  && (__kmp_affinity_type != affinity_none))) {
                    KMP_WARNING(ErrorInitializeAffinity);
                }
                __kmp_affinity_type = affinity_none;
                KMP_AFFINITY_DISABLE();
                return;
            }
        }
        else {
            __kmp_affinity_entire_machine_mask(fullMask);
            __kmp_avail_proc = __kmp_xproc;
        }
    }

    // depth < 0 means "topology not discovered yet / discovery failed".
    int depth = -1;
    kmp_i18n_id_t msg_id = kmp_i18n_null;

    //
    // For backward compatibility, setting KMP_CPUINFO_FILE =>
    // KMP_TOPOLOGY_METHOD=cpuinfo
    //
    if ((__kmp_cpuinfo_file != NULL) &&
      (__kmp_affinity_top_method == affinity_top_method_all)) {
        __kmp_affinity_top_method = affinity_top_method_cpuinfo;
    }

    if (__kmp_affinity_top_method == affinity_top_method_all) {
        //
        // In the default code path, errors are not fatal - we just try using
        // another method.  We only emit a warning message if affinity is on,
        // or the verbose flag is set, and the nowarnings flag was not set.
        //
        const char *file_name = NULL;
        int line = 0;

# if KMP_ARCH_X86 || KMP_ARCH_X86_64

        // First choice on x86: CPUID leaf 11 (x2APIC) topology enumeration.
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
        }

        file_name = NULL;
        depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
        if (depth == 0) {
            // depth == 0: discovery succeeded but affinity is off.
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }

        if (depth < 0) {
            // x2APIC failed - fall back to the legacy APIC id method.
            if (__kmp_affinity_verbose) {
                if (msg_id != kmp_i18n_null) {
                    KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
                      KMP_I18N_STR(DecodingLegacyAPIC));
                }
                else {
                    KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
                }
            }

            file_name = NULL;
            depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
            if (depth == 0) {
                KMP_ASSERT(__kmp_affinity_type == affinity_none);
                KMP_ASSERT(address2os == NULL);
                return;
            }
        }

# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

# if KMP_OS_LINUX

        // Next fallback on Linux: parse /proc/cpuinfo.
        if (depth < 0) {
            if (__kmp_affinity_verbose) {
                if (msg_id != kmp_i18n_null) {
                    KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
                }
                else {
                    KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
                }
            }

            FILE *f = fopen("/proc/cpuinfo", "r");
            if (f == NULL) {
                msg_id = kmp_i18n_str_CantOpenCpuinfo;
            }
            else {
                file_name = "/proc/cpuinfo";
                depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
                fclose(f);
                if (depth == 0) {
                    KMP_ASSERT(__kmp_affinity_type == affinity_none);
                    KMP_ASSERT(address2os == NULL);
                    return;
                }
            }
        }

# endif /* KMP_OS_LINUX */

# if KMP_GROUP_AFFINITY

        // On Windows with multiple processor groups, fall back to the
        // processor-group map.
        if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
            if (__kmp_affinity_verbose) {
                KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
            }

            depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
            KMP_ASSERT(depth != 0);
        }

# endif /* KMP_GROUP_AFFINITY */

        // Last resort: flat "OS proc id" topology; this should not fail.
        if (depth < 0) {
            if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
                if (file_name == NULL) {
                    KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
                }
                else if (line == 0) {
                    KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
                }
                else {
                    KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
                }
            }
            // FIXME - print msg if msg_id = kmp_i18n_null ???

            file_name = "";
            depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
            if (depth == 0) {
                KMP_ASSERT(__kmp_affinity_type == affinity_none);
                KMP_ASSERT(address2os == NULL);
                return;
            }
            KMP_ASSERT(depth > 0);
            KMP_ASSERT(address2os != NULL);
        }
    }

    //
    // If the user has specified that a particular topology discovery method
    // is to be used, then we abort if that method fails.  The exception is
    // group affinity, which might have been implicitly set.
    //

# if KMP_ARCH_X86 || KMP_ARCH_X86_64

    else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
              KMP_I18N_STR(Decodingx2APIC));
        }

        depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
        }
    }
    else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
              KMP_I18N_STR(DecodingLegacyAPIC));
        }

        depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
        }
    }

# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

    else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
        const char *filename;
        if (__kmp_cpuinfo_file != NULL) {
            filename = __kmp_cpuinfo_file;
        }
        else {
            filename = "/proc/cpuinfo";
        }

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
        }

        FILE *f = fopen(filename, "r");
        if (f == NULL) {
            // Open failure is fatal here since the user explicitly requested
            // this file/method; the hint differs depending on whether the
            // name came from KMP_CPUINFO_FILE.
            int code = errno;
            if (__kmp_cpuinfo_file != NULL) {
                __kmp_msg(
                    kmp_ms_fatal,
                    KMP_MSG(CantOpenFileForReading, filename),
                    KMP_ERR(code),
                    KMP_HNT(NameComesFrom_CPUINFO_FILE),
                    __kmp_msg_null
                );
            }
            else {
                __kmp_msg(
                    kmp_ms_fatal,
                    KMP_MSG(CantOpenFileForReading, filename),
                    KMP_ERR(code),
                    __kmp_msg_null
                );
            }
        }
        int line = 0;
        depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
        fclose(f);
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            if (line > 0) {
                KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
            }
            else {
                KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
            }
        }
        if (__kmp_affinity_type == affinity_none) {
            KMP_ASSERT(depth == 0);
            KMP_ASSERT(address2os == NULL);
            return;
        }
    }

# if KMP_GROUP_AFFINITY

    else if (__kmp_affinity_top_method == affinity_top_method_group) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
        }

        depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
        KMP_ASSERT(depth != 0);
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
        }
    }

# endif /* KMP_GROUP_AFFINITY */

    else if (__kmp_affinity_top_method == affinity_top_method_flat) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
        }

        depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }
        // should not fail
        KMP_ASSERT(depth > 0);
        KMP_ASSERT(address2os != NULL);
    }

    // All discovery methods failed - warn (if appropriate) and disable.
    if (address2os == NULL) {
        if (KMP_AFFINITY_CAPABLE()
          && (__kmp_affinity_verbose || (__kmp_affinity_warnings
          && (__kmp_affinity_type != affinity_none)))) {
            KMP_WARNING(ErrorInitializeAffinity);
        }
        __kmp_affinity_type = affinity_none;
        KMP_AFFINITY_DISABLE();
        return;
    }

    // Apply any KMP_PLACE_THREADS trimming to the discovered topology.
    __kmp_apply_thread_places(&address2os, depth);

    //
    // Create the table of masks, indexed by thread Id.
    //
    unsigned maxIndex;
    unsigned numUnique;
    kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
      address2os, __kmp_avail_proc);
    if (__kmp_affinity_gran_levels == 0) {
        // Finest granularity: every proc must map to a distinct mask.
        KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
    }

    //
    // Set the childNums vector in all Address objects.  This must be done
    // before we can sort using __kmp_affinity_cmp_Address_child_num(),
    // which takes into account the setting of __kmp_affinity_compact.
    //
    __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);

    // Build __kmp_affinity_masks according to the affinity type.
    switch (__kmp_affinity_type) {

        case affinity_explicit:
        KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
# if OMP_40_ENABLED
        if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
        {
            // KMP_AFFINITY=proclist syntax.
            __kmp_affinity_process_proclist(&__kmp_affinity_masks,
              &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
              maxIndex);
        }
# if OMP_40_ENABLED
        else {
            // OMP_PLACES syntax.
            __kmp_affinity_process_placelist(&__kmp_affinity_masks,
              &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
              maxIndex);
        }
# endif
        if (__kmp_affinity_num_masks == 0) {
            if (__kmp_affinity_verbose || (__kmp_affinity_warnings
              && (__kmp_affinity_type != affinity_none))) {
                KMP_WARNING(AffNoValidProcID);
            }
            __kmp_affinity_type = affinity_none;
            return;
        }
        break;

        //
        // The other affinity types rely on sorting the Addresses according
        // to some permutation of the machine topology tree.  Set
        // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
        // then jump to a common code fragment to do the sort and create
        // the array of affinity masks.
        //

        case affinity_logical:
        __kmp_affinity_compact = 0;
        if (__kmp_affinity_offset) {
            __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
              % __kmp_avail_proc;
        }
        goto sortAddresses;

        case affinity_physical:
        if (__kmp_nThreadsPerCore > 1) {
            __kmp_affinity_compact = 1;
            if (__kmp_affinity_compact >= depth) {
                __kmp_affinity_compact = 0;
            }
        } else {
            __kmp_affinity_compact = 0;
        }
        if (__kmp_affinity_offset) {
            __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
              % __kmp_avail_proc;
        }
        goto sortAddresses;

        case affinity_scatter:
        if (__kmp_affinity_compact >= depth) {
            __kmp_affinity_compact = 0;
        }
        else {
            // scatter is the mirror image of compact.
            __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
        }
        goto sortAddresses;

        case affinity_compact:
        if (__kmp_affinity_compact >= depth) {
            __kmp_affinity_compact = depth - 1;
        }
        goto sortAddresses;

        case affinity_balanced:
        // Balanced works only for the case of a single package and uniform topology
        if( nPackages > 1 ) {
            if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
                KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
            }
            __kmp_affinity_type = affinity_none;
            return;
        } else if( __kmp_affinity_uniform_topology() ) {
            break;
        } else { // Non-uniform topology

            // Save the depth for further usage
            __kmp_aff_depth = depth;

            // Number of hyper threads per core in HT machine
            int nth_per_core = __kmp_nThreadsPerCore;

            int core_level;
            if( nth_per_core > 1 ) {
                core_level = depth - 2;
            } else {
                core_level = depth - 1;
            }
            int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
            int nproc = nth_per_core * ncores;

            // procarr holds one slot per (core, context); -1 = context absent.
            procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                procarr[ i ] = -1;
            }

            for( int i = 0; i < __kmp_avail_proc; i++ ) {
                int proc = address2os[ i ].second;
                // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
                // If there is only one thread per core then depth == 2: level 0 - package,
                // level 1 - core.
                int level = depth - 1;

                // __kmp_nth_per_core == 1
                int thread = 0;
                int core = address2os[ i ].first.labels[ level ];
                // If the thread level exists, that is we have more than one thread context per core
                if( nth_per_core > 1 ) {
                    thread = address2os[ i ].first.labels[ level ] % nth_per_core;
                    core = address2os[ i ].first.labels[ level - 1 ];
                }
                procarr[ core * nth_per_core + thread ] = proc;
            }

            break;
        }

        sortAddresses:
        //
        // Allocate the gtid->affinity mask table.
        //
        if (__kmp_affinity_dups) {
            __kmp_affinity_num_masks = __kmp_avail_proc;
        }
        else {
            __kmp_affinity_num_masks = numUnique;
        }

# if OMP_40_ENABLED
        // OMP_PLACES may request fewer places than the topology offers.
        if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
          && ( __kmp_affinity_num_places > 0 )
          && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
            __kmp_affinity_num_masks = __kmp_affinity_num_places;
        }
# endif

        __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
          __kmp_affinity_num_masks * __kmp_affin_mask_size);

        //
        // Sort the address2os table according to the current setting of
        // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
        //
        qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
          __kmp_affinity_cmp_Address_child_num);
        {
            int i;
            unsigned j;
            for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
                // Without dups, only the "leader" proc of each equivalence
                // class contributes a mask.
                if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
                    continue;
                }
                unsigned osId = address2os[i].second;
                kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
                kmp_affin_mask_t *dest
                  = KMP_CPU_INDEX(__kmp_affinity_masks, j);
                KMP_ASSERT(KMP_CPU_ISSET(osId, src));
                KMP_CPU_COPY(dest, src);
                if (++j >= __kmp_affinity_num_masks) {
                    break;
                }
            }
            KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
        }
        break;

        default:
        KMP_ASSERT2(0, "Unexpected affinity setting");
    }

    // osId2Mask was only needed to build __kmp_affinity_masks.
    __kmp_free(osId2Mask);
    // Seed the hierarchical barrier machinery with the final topology.
    machine_hierarchy.init(address2os, __kmp_avail_proc);
}
3909 
3910 
3911 void
3912 __kmp_affinity_initialize(void)
3913 {
3914     //
3915     // Much of the code above was written assumming that if a machine was not
3916     // affinity capable, then __kmp_affinity_type == affinity_none.  We now
3917     // explicitly represent this as __kmp_affinity_type == affinity_disabled.
3918     //
3919     // There are too many checks for __kmp_affinity_type == affinity_none
3920     // in this code.  Instead of trying to change them all, check if
3921     // __kmp_affinity_type == affinity_disabled, and if so, slam it with
3922     // affinity_none, call the real initialization routine, then restore
3923     // __kmp_affinity_type to affinity_disabled.
3924     //
3925     int disabled = (__kmp_affinity_type == affinity_disabled);
3926     if (! KMP_AFFINITY_CAPABLE()) {
3927         KMP_ASSERT(disabled);
3928     }
3929     if (disabled) {
3930         __kmp_affinity_type = affinity_none;
3931     }
3932     __kmp_aux_affinity_initialize();
3933     if (disabled) {
3934         __kmp_affinity_type = affinity_disabled;
3935     }
3936 }
3937 
3938 
3939 void
3940 __kmp_affinity_uninitialize(void)
3941 {
3942     if (__kmp_affinity_masks != NULL) {
3943         __kmp_free(__kmp_affinity_masks);
3944         __kmp_affinity_masks = NULL;
3945     }
3946     if (fullMask != NULL) {
3947         KMP_CPU_FREE(fullMask);
3948         fullMask = NULL;
3949     }
3950     __kmp_affinity_num_masks = 0;
3951 # if OMP_40_ENABLED
3952     __kmp_affinity_num_places = 0;
3953 # endif
3954     if (__kmp_affinity_proclist != NULL) {
3955         __kmp_free(__kmp_affinity_proclist);
3956         __kmp_affinity_proclist = NULL;
3957     }
3958     if( address2os != NULL ) {
3959         __kmp_free( address2os );
3960         address2os = NULL;
3961     }
3962     if( procarr != NULL ) {
3963         __kmp_free( procarr );
3964         procarr = NULL;
3965     }
3966 }
3967 
3968 
//
// Compute and apply the initial affinity mask for thread gtid.  isa_root
// is nonzero for root (initial/master) threads, which also get their place
// partition initialized here.  The mask is either the "full" machine mask
// (affinity none/balanced, or 4.0 proc-bind false) or one entry of
// __kmp_affinity_masks selected by (gtid + offset) % num_masks.
//
void
__kmp_affinity_set_init_mask(int gtid, int isa_root)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
    if (th->th.th_affin_mask == NULL) {
        KMP_CPU_ALLOC(th->th.th_affin_mask);
    }
    else {
        KMP_CPU_ZERO(th->th.th_affin_mask);
    }

    //
    // Copy the thread mask to the kmp_info_t structure.
    // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
    // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
    // is set, then the full mask is the same as the mask of the initialization
    // thread.
    //
    kmp_affin_mask_t *mask;
    int i;          // place index chosen for this thread (KMP_PLACE_ALL = full mask)

# if OMP_40_ENABLED
    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
    {
        // KMP_AFFINITY (non-OMP-4.0) path.
        if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)
          ) {
# if KMP_GROUP_AFFINITY
            // With multiple Windows processor groups there is no single
            // full mask to bind to - leave the thread unbound.
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# if OMP_40_ENABLED
    else {
        // OMP 4.0 proc-bind path.
        if ((! isa_root)
          || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
#  if KMP_GROUP_AFFINITY
            if (__kmp_num_proc_groups > 1) {
                return;
            }
#  endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            //
            // int i = some hash function or just a counter that doesn't
            // always start at 0.  Use gtid for now.
            //
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# endif

# if OMP_40_ENABLED
    th->th.th_current_place = i;
    if (isa_root) {
        // Root threads get the widest possible place partition.
        th->th.th_new_place = i;
        th->th.th_first_place = 0;
        th->th.th_last_place = __kmp_affinity_num_masks - 1;
    }

    if (i == KMP_PLACE_ALL) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
          gtid, i));
    }
# else
    if (i == -1) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
          gtid, i));
    }
# endif /* OMP_40_ENABLED */

    KMP_CPU_COPY(th->th.th_affin_mask, mask);

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
          buf);
    }

# if KMP_OS_WINDOWS
    //
    // On Windows* OS, the process affinity mask might have changed.
    // If the user didn't request affinity and this call fails,
    // just continue silently.  See CQ171393.
    //
    if ( __kmp_affinity_type == affinity_none ) {
        __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
    }
    else
# endif
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}
4090 
4091 
4092 # if OMP_40_ENABLED
4093 
4094 void
4095 __kmp_affinity_set_place(int gtid)
4096 {
4097     int retval;
4098 
4099     if (! KMP_AFFINITY_CAPABLE()) {
4100         return;
4101     }
4102 
4103     kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4104 
4105     KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
4106       gtid, th->th.th_new_place, th->th.th_current_place));
4107 
4108     //
4109     // Check that the new place is within this thread's partition.
4110     //
4111     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4112     KMP_ASSERT(th->th.th_new_place >= 0);
4113     KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4114     if (th->th.th_first_place <= th->th.th_last_place) {
4115         KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
4116          && (th->th.th_new_place <= th->th.th_last_place));
4117     }
4118     else {
4119         KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
4120          || (th->th.th_new_place >= th->th.th_last_place));
4121     }
4122 
4123     //
4124     // Copy the thread mask to the kmp_info_t strucuture,
4125     // and set this thread's affinity.
4126     //
4127     kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
4128       th->th.th_new_place);
4129     KMP_CPU_COPY(th->th.th_affin_mask, mask);
4130     th->th.th_current_place = th->th.th_new_place;
4131 
4132     if (__kmp_affinity_verbose) {
4133         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4134         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4135           th->th.th_affin_mask);
4136         KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4137           gtid, buf);
4138     }
4139     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4140 }
4141 
4142 # endif /* OMP_40_ENABLED */
4143 
4144 
//
// Implementation of kmp_set_affinity(): set the calling thread's affinity
// to *mask.  Returns -1 if affinity is not supported, otherwise the result
// of the underlying system call (0 on success).  With consistency checking
// enabled, an empty mask, a mask containing procs outside fullMask, or
// (with processor groups) a multi-group mask is a fatal error.
//
int
__kmp_aux_set_affinity(void **mask)
{
    int gtid;
    kmp_info_t *th;
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
          gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        else {
            unsigned proc;
            int num_procs = 0;

            // Every proc in the user's mask must also be in the machine's
            // full mask, and the mask must not be empty.
            for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
                if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
                    continue;
                }
                num_procs++;
                if (! KMP_CPU_ISSET(proc, fullMask)) {
                    KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
                    break;
                }
            }
            if (num_procs == 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }

# if KMP_GROUP_AFFINITY
            // A mask spanning multiple Windows processor groups cannot be set.
            if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }
# endif /* KMP_GROUP_AFFINITY */

        }
    }

    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    if (retval == 0) {
        // Only record the new mask if the OS accepted it.
        KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
    }

# if OMP_40_ENABLED
    // The thread no longer sits on a runtime-managed place.
    th->th.th_current_place = KMP_PLACE_UNDEFINED;
    th->th.th_new_place = KMP_PLACE_UNDEFINED;
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;

    //
    // Turn off 4.0 affinity for the current tread at this parallel level.
    //
    th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
# endif

    return retval;
}
4217 
4218 
//
// Implementation of kmp_get_affinity(): copy the calling thread's affinity
// into *mask.  Returns -1 if affinity is not supported.  On non-Windows
// systems the mask is queried from the OS; on Windows the stored
// th_affin_mask is returned instead.
//
int
__kmp_aux_get_affinity(void **mask)
{
    int gtid;
    int retval;
    kmp_info_t *th;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);

    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
        }
    }

# if !KMP_OS_WINDOWS

    // Ask the OS for the thread's current affinity.
    retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
    });
    return retval;

# else

    // Windows: return the runtime's stored copy of the mask.
    KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
    return 0;

# endif /* KMP_OS_WINDOWS */

}
4266 
4267 int
4268 __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
4269 {
4270     int retval;
4271 
4272     if (! KMP_AFFINITY_CAPABLE()) {
4273         return -1;
4274     }
4275 
4276     KA_TRACE(1000, ;{
4277         int gtid = __kmp_entry_gtid();
4278         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4279         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4280           (kmp_affin_mask_t *)(*mask));
4281         __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4282           proc, gtid, buf);
4283     });
4284 
4285     if (__kmp_env_consistency_check) {
4286         if ((mask == NULL) || (*mask == NULL)) {
4287             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4288         }
4289     }
4290 
4291     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4292         return -1;
4293     }
4294     if (! KMP_CPU_ISSET(proc, fullMask)) {
4295         return -2;
4296     }
4297 
4298     KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4299     return 0;
4300 }
4301 
4302 
4303 int
4304 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4305 {
4306     int retval;
4307 
4308     if (! KMP_AFFINITY_CAPABLE()) {
4309         return -1;
4310     }
4311 
4312     KA_TRACE(1000, ;{
4313         int gtid = __kmp_entry_gtid();
4314         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4315         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4316           (kmp_affin_mask_t *)(*mask));
4317         __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4318           proc, gtid, buf);
4319     });
4320 
4321     if (__kmp_env_consistency_check) {
4322         if ((mask == NULL) || (*mask == NULL)) {
4323             KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4324         }
4325     }
4326 
4327     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4328         return -1;
4329     }
4330     if (! KMP_CPU_ISSET(proc, fullMask)) {
4331         return -2;
4332     }
4333 
4334     KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4335     return 0;
4336 }
4337 
4338 
4339 int
4340 __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4341 {
4342     int retval;
4343 
4344     if (! KMP_AFFINITY_CAPABLE()) {
4345         return -1;
4346     }
4347 
4348     KA_TRACE(1000, ;{
4349         int gtid = __kmp_entry_gtid();
4350         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4351         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4352           (kmp_affin_mask_t *)(*mask));
4353         __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4354           proc, gtid, buf);
4355     });
4356 
4357     if (__kmp_env_consistency_check) {
4358         if ((mask == NULL) || (*mask == NULL)) {
4359             KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
4360         }
4361     }
4362 
4363     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4364         return 0;
4365     }
4366     if (! KMP_CPU_ISSET(proc, fullMask)) {
4367         return 0;
4368     }
4369 
4370     return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4371 }
4372 
4373 
// Dynamic affinity settings - Affinity balanced
//
// Bind the calling thread (logical id 'tid', 0-based, out of 'nthreads'
// team threads) to a processor set chosen to spread threads as evenly as
// possible across physical cores.  __kmp_affinity_gran selects whether the
// mask holds a single hardware thread (fine/thread granularity) or the
// whole core (core granularity).  The mask is built on the stack and
// installed via __kmp_set_system_affinity().
void __kmp_balanced_affinity( int tid, int nthreads )
{
    if( __kmp_affinity_uniform_topology() ) {
        int coreID;
        int threadID;
        // Number of hyper threads per core in HT machine
        int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
        // Number of cores
        int ncores = __kmp_ncores;
        // How many threads will be bound to each core
        int chunk = nthreads / ncores;
        // How many cores will have an additional thread bound to it - "big cores"
        int big_cores = nthreads % ncores;
        // Number of threads on the big cores
        int big_nth = ( chunk + 1 ) * big_cores;
        if( tid < big_nth ) {
            // Thread lands on one of the "big" cores (chunk+1 threads each).
            coreID = tid / (chunk + 1 );
            threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
        } else { //tid >= big_nth
            // Remaining cores get 'chunk' threads each.  The subtraction of
            // big_cores (not big_nth) is intentional: at tid == big_nth this
            // yields coreID == big_cores, i.e. the first non-big core.
            coreID = ( tid - big_cores ) / chunk;
            threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
        }

        KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
          "Illegal set affinity operation when not capable");

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Granularity == thread
        if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
            // address2os maps a topology position to its OS proc id; pick the
            // single hardware thread computed above.
            int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
            KMP_CPU_SET( osID, mask);
        } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
            // Include every hardware thread of the chosen core in the mask.
            for( int i = 0; i < __kmp_nth_per_core; i++ ) {
                int osID;
                osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
                KMP_CPU_SET( osID, mask);
            }
        }
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    } else { // Non-uniform topology

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Number of hyper threads per core in HT machine
        int nth_per_core = __kmp_nThreadsPerCore;
        // Depth index of the "core" level in the topology labels: one above
        // the bottom level when hyperthreads exist, else the bottom level.
        int core_level;
        if( nth_per_core > 1 ) {
            core_level = __kmp_aff_depth - 2;
        } else {
            core_level = __kmp_aff_depth - 1;
        }

        // Number of cores - maximum value; it does not count trail cores with 0 processors
        int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;

        // For performance gain consider the special case nthreads == __kmp_avail_proc
        if( nthreads == __kmp_avail_proc ) {
            // Exactly one thread per available proc: tid indexes address2os
            // directly.
            if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                int osID = address2os[ tid ].second;
                KMP_CPU_SET( osID, mask);
            } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                int coreID = address2os[ tid ].first.labels[ core_level ];
                // We'll count found osIDs for the current core; they can be not more than nth_per_core;
                // since the address2os is sorted we can break when cnt==nth_per_core
                int cnt = 0;
                for( int i = 0; i < __kmp_avail_proc; i++ ) {
                    int osID = address2os[ i ].second;
                    int core = address2os[ i ].first.labels[ core_level ];
                    if( core == coreID ) {
                        KMP_CPU_SET( osID, mask);
                        cnt++;
                        if( cnt == nth_per_core ) {
                            break;
                        }
                    }
                }
            }
        } else if( nthreads <= __kmp_ncores ) {
            // At most one thread per core: bind thread 'tid' to the tid-th
            // core that actually has available processors.
            // NOTE(review): procarr is assumed to be an ncores x nth_per_core
            // table of osIDs built during affinity initialization, with -1
            // marking unavailable slots -- confirm against the builder.
            int core = 0;
            for( int i = 0; i < ncores; i++ ) {
                // Check if this core from procarr[] is in the mask
                int in_mask = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != - 1 ) {
                        in_mask = 1;
                        break;
                    }
                }
                if( in_mask ) {
                    if( tid == core ) {
                        for( int j = 0; j < nth_per_core; j++ ) {
                            int osID = procarr[ i * nth_per_core + j ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask );
                                // For granularity=thread it is enough to set the first available osID for this core
                                if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                                    break;
                                }
                            }
                        }
                        break;
                    } else {
                        core++;
                    }
                }
            }

        } else { // nthreads > __kmp_ncores
            // Oversubscribed: distribute threads over hardware contexts as
            // evenly as possible, then locate the context slot 'tid' owns.
            // NOTE(review): the three arrays below are variable-length
            // arrays (a compiler extension in C++), sized by runtime
            // topology values.

            // Array to save the number of processors at each core
            int nproc_at_core[ ncores ];
            // Array to save the number of cores with "x" available processors;
            int ncores_with_x_procs[ nth_per_core + 1 ];
            // Array to save the number of cores with # procs from x to nth_per_core
            int ncores_with_x_to_max_procs[ nth_per_core + 1 ];

            for( int i = 0; i <= nth_per_core; i++ ) {
                ncores_with_x_procs[ i ] = 0;
                ncores_with_x_to_max_procs[ i ] = 0;
            }

            // Histogram: count the usable contexts (procarr slot != -1) on
            // each core.
            for( int i = 0; i < ncores; i++ ) {
                int cnt = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        cnt++;
                    }
                }
                nproc_at_core[ i ] = cnt;
                ncores_with_x_procs[ cnt ]++;
            }

            // Suffix sums: number of cores having at least i usable contexts.
            for( int i = 0; i <= nth_per_core; i++ ) {
                for( int j = i; j <= nth_per_core; j++ ) {
                    ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
                }
            }

            // Max number of processors
            int nproc = nth_per_core * ncores;
            // An array to keep number of threads per each context
            int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                newarr[ i ] = 0;
            }

            // Round-robin the nthreads over usable contexts: the first pass
            // (flag == 0) gives each context at most one thread; subsequent
            // passes (flag != 0) stack additional threads onto already-used
            // contexts until all threads are placed.
            int nth = nthreads;
            int flag = 0;
            while( nth > 0 ) {
                for( int j = 1; j <= nth_per_core; j++ ) {
                    int cnt = ncores_with_x_to_max_procs[ j ];
                    for( int i = 0; i < ncores; i++ ) {
                        // Skip the core with 0 processors
                        if( nproc_at_core[ i ] == 0 ) {
                            continue;
                        }
                        for( int k = 0; k < nth_per_core; k++ ) {
                            if( procarr[ i * nth_per_core + k ] != -1 ) {
                                if( newarr[ i * nth_per_core + k ] == 0 ) {
                                    newarr[ i * nth_per_core + k ] = 1;
                                    cnt--;
                                    nth--;
                                    break;
                                } else {
                                    if( flag != 0 ) {
                                        newarr[ i * nth_per_core + k ] ++;
                                        cnt--;
                                        nth--;
                                        break;
                                    }
                                }
                            }
                        }
                        if( cnt == 0 || nth == 0 ) {
                            break;
                        }
                    }
                    if( nth == 0 ) {
                        break;
                    }
                }
                flag = 1;
            }
            // Walk the per-context thread counts; the context whose running
            // sum first exceeds tid is the slot this thread binds to.
            int sum = 0;
            for( int i = 0; i < nproc; i++ ) {
                sum += newarr[ i ];
                if( sum > tid ) {
                    // Granularity == thread
                    if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                        int osID = procarr[ i ];
                        KMP_CPU_SET( osID, mask);
                    } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                        int coreID = i / nth_per_core;
                        for( int ii = 0; ii < nth_per_core; ii++ ) {
                            int osID = procarr[ coreID * nth_per_core + ii ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask);
                            }
                        }
                    }
                    break;
                }
            }
            __kmp_free( newarr );
        }

        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    }
}
4600 
4601 #else
4602     // affinity not supported
4603 
// Fallback machine-hierarchy data used when affinity is not supported:
// a synthetic balanced tree over the available threads.
kmp_uint32 mac_skipPerLevel[7];   // stride (subtree size) at each level
kmp_uint32 mac_depth;             // number of levels in the synthetic tree
kmp_uint8 mac_leaf_kids;          // children per leaf-level parent, minus 1
// Build (once, on first call) a synthetic hierarchy over 'nproc' threads and
// point 'thr_bar' at the shared result.  Level widths are repeatedly halved
// (rounding odd widths up) until each level fits under the branching factor;
// level 0 is additionally capped at 4.
// NOTE(review): the static 'first' latch is not synchronized -- assumed to
// be initialized from a single thread before concurrent use; confirm against
// callers.
void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    static int first = 1;
    if (first) {
        const kmp_uint32 maxLevels = 7;
        kmp_uint32 numPerLevel[maxLevels];

        for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
            numPerLevel[i] = 1;
            mac_skipPerLevel[i] = 1;
        }

        mac_depth = 2;
        numPerLevel[0] = nproc;

        kmp_uint32 branch = 4;
        if (numPerLevel[0] == 1) branch = nproc/4;
        if (branch<4) branch=4;
        // Note: mac_depth can grow inside this loop, extending the loop
        // bound as new levels are created.
        for (kmp_uint32 d=0; d<mac_depth-1; ++d) { // optimize hierarchy width
            while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
                if (numPerLevel[d] & 1) numPerLevel[d]++; // round odd widths up before halving
                numPerLevel[d] = numPerLevel[d] >> 1;
                if (numPerLevel[d+1] == 1) mac_depth++; // a new level came into use
                numPerLevel[d+1] = numPerLevel[d+1] << 1;
            }
            if(numPerLevel[0] == 1) {
                branch = branch >> 1;
                if (branch<4) branch = 4;
            }
        }

        // skip[i] = product of widths below level i, i.e. the stride between
        // adjacent subtrees at that level.
        for (kmp_uint32 i=1; i<mac_depth; ++i)
            mac_skipPerLevel[i] = numPerLevel[i-1] * mac_skipPerLevel[i-1];
        mac_leaf_kids = (kmp_uint8)numPerLevel[0]-1;
        first=0;
    }
    // Publish the (shared) hierarchy into the caller's barrier state.
    thr_bar->depth = mac_depth;
    thr_bar->base_leaf_kids = mac_leaf_kids;
    thr_bar->skip_per_level = mac_skipPerLevel;
}
4646 
4647 #endif // KMP_AFFINITY_SUPPORTED
4648