1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_io.h"
19 #include "kmp_str.h"
20 #include "kmp_wrapper_getpid.h"
21 
22 #if KMP_AFFINITY_SUPPORTED
23 
24 //
25 // Print the affinity mask to the character array in a pretty format.
26 //
27 char *
28 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
29 {
30     KMP_ASSERT(buf_len >= 40);
31     char *scan = buf;
32     char *end = buf + buf_len - 1;
33 
34     //
35     // Find first element / check for empty set.
36     //
37     size_t i;
38     for (i = 0; i < KMP_CPU_SETSIZE; i++) {
39         if (KMP_CPU_ISSET(i, mask)) {
40             break;
41         }
42     }
43     if (i == KMP_CPU_SETSIZE) {
44         sprintf(scan, "{<empty>}");
45         while (*scan != '\0') scan++;
46         KMP_ASSERT(scan <= end);
47         return buf;
48     }
49 
50     sprintf(scan, "{%ld", (long)i);
51     while (*scan != '\0') scan++;
52     i++;
53     for (; i < KMP_CPU_SETSIZE; i++) {
54         if (! KMP_CPU_ISSET(i, mask)) {
55             continue;
56         }
57 
58         //
59         // Check for buffer overflow.  A string of the form ",<n>" will have
        // at most 10 characters, plus we want to leave room to print ",...}"
        // if the set is too large to print, for a total of 15 characters.
62         // We already left room for '\0' in setting end.
63         //
64         if (end - scan < 15) {
65            break;
66         }
67         sprintf(scan, ",%-ld", (long)i);
68         while (*scan != '\0') scan++;
69     }
70     if (i < KMP_CPU_SETSIZE) {
71         sprintf(scan, ",...");
72         while (*scan != '\0') scan++;
73     }
74     sprintf(scan, "}");
75     while (*scan != '\0') scan++;
76     KMP_ASSERT(scan <= end);
77     return buf;
78 }
79 
80 
81 void
82 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
83 {
84     KMP_CPU_ZERO(mask);
85 
86 # if KMP_GROUP_AFFINITY
87 
88     if (__kmp_num_proc_groups > 1) {
89         int group;
90         KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
91         for (group = 0; group < __kmp_num_proc_groups; group++) {
92             int i;
93             int num = __kmp_GetActiveProcessorCount(group);
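            // Each group owns CHAR_BIT * sizeof(DWORD_PTR) bits of the mask
            // (64 on a 64-bit OS), so e.g. proc 3 of group 1 maps to bit 67.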
94             for (i = 0; i < num; i++) {
95                 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
96             }
97         }
98     }
99     else
100 
101 # endif /* KMP_GROUP_AFFINITY */
102 
103     {
104         int proc;
105         for (proc = 0; proc < __kmp_xproc; proc++) {
106             KMP_CPU_SET(proc, mask);
107         }
108     }
109 }
110 
111 
112 //
113 // In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
114 // functions.
115 //
116 // The icc codegen emits sections with extremely long names, of the form
117 // ".gnu.linkonce.<mangled_name>".  There seems to have been a linker bug
118 // introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
119 // some sort of memory corruption or table overflow that is triggered by
120 // these long strings.  I checked the latest version of the linker -
121 // GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
122 // fixed.
123 //
124 // Unfortunately, my attempts to reproduce it in a smaller example have
125 // failed - I'm not sure what the prospects are of getting it fixed
126 // properly - but we need a reproducer smaller than all of libiomp.
127 //
128 // Work around the problem by avoiding inline constructors in such builds.
129 // We do this for all platforms, not just Linux* OS - non-inline functions are
// more debuggable and provide better coverage than inline functions.
131 // Use inline functions in shipping libs, for performance.
132 //
133 
134 # if !defined(KMP_DEBUG) && !defined(COVER)
135 
136 class Address {
137 public:
138     static const unsigned maxDepth = 32;
139     unsigned labels[maxDepth];
140     unsigned childNums[maxDepth];
141     unsigned depth;
142     unsigned leader;
143     Address(unsigned _depth)
144       : depth(_depth), leader(FALSE) {
145     }
146     Address &operator=(const Address &b) {
147         depth = b.depth;
148         for (unsigned i = 0; i < depth; i++) {
149             labels[i] = b.labels[i];
150             childNums[i] = b.childNums[i];
151         }
152         leader = FALSE;
153         return *this;
154     }
155     bool operator==(const Address &b) const {
156         if (depth != b.depth)
157             return false;
158         for (unsigned i = 0; i < depth; i++)
159             if(labels[i] != b.labels[i])
160                 return false;
161         return true;
162     }
163     bool isClose(const Address &b, int level) const {
164         if (depth != b.depth)
165             return false;
166         if ((unsigned)level >= depth)
167             return true;
168         for (unsigned i = 0; i < (depth - level); i++)
169             if(labels[i] != b.labels[i])
170                 return false;
171         return true;
172     }
173     bool operator!=(const Address &b) const {
174         return !operator==(b);
175     }
176 };
177 
178 class AddrUnsPair {
179 public:
180     Address first;
181     unsigned second;
182     AddrUnsPair(Address _first, unsigned _second)
183       : first(_first), second(_second) {
184     }
185     AddrUnsPair &operator=(const AddrUnsPair &b)
186     {
187         first = b.first;
188         second = b.second;
189         return *this;
190     }
191 };
192 
193 # else
194 
195 class Address {
196 public:
197     static const unsigned maxDepth = 32;
198     unsigned labels[maxDepth];
199     unsigned childNums[maxDepth];
200     unsigned depth;
201     unsigned leader;
202     Address(unsigned _depth);
203     Address &operator=(const Address &b);
204     bool operator==(const Address &b) const;
205     bool isClose(const Address &b, int level) const;
206     bool operator!=(const Address &b) const;
207 };
208 
209 Address::Address(unsigned _depth)
210 {
211     depth = _depth;
212     leader = FALSE;
213 }
214 
215 Address &Address::operator=(const Address &b) {
216     depth = b.depth;
217     for (unsigned i = 0; i < depth; i++) {
218         labels[i] = b.labels[i];
219         childNums[i] = b.childNums[i];
220     }
221     leader = FALSE;
222     return *this;
223 }
224 
225 bool Address::operator==(const Address &b) const {
226     if (depth != b.depth)
227         return false;
228     for (unsigned i = 0; i < depth; i++)
229         if(labels[i] != b.labels[i])
230             return false;
231     return true;
232 }
233 
234 bool Address::isClose(const Address &b, int level) const {
235     if (depth != b.depth)
236         return false;
237     if ((unsigned)level >= depth)
238         return true;
239     for (unsigned i = 0; i < (depth - level); i++)
240         if(labels[i] != b.labels[i])
241             return false;
242     return true;
243 }
244 
245 bool Address::operator!=(const Address &b) const {
246     return !operator==(b);
247 }
248 
249 class AddrUnsPair {
250 public:
251     Address first;
252     unsigned second;
253     AddrUnsPair(Address _first, unsigned _second);
254     AddrUnsPair &operator=(const AddrUnsPair &b);
255 };
256 
257 AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
258   : first(_first), second(_second)
259 {
260 }
261 
262 AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
263 {
264     first = b.first;
265     second = b.second;
266     return *this;
267 }
268 
269 # endif /* !defined(KMP_DEBUG) && !defined(COVER) */
270 
271 
272 static int
273 __kmp_affinity_cmp_Address_labels(const void *a, const void *b)
274 {
275     const Address *aa = (const Address *)&(((AddrUnsPair *)a)
276       ->first);
277     const Address *bb = (const Address *)&(((AddrUnsPair *)b)
278       ->first);
279     unsigned depth = aa->depth;
280     unsigned i;
281     KMP_DEBUG_ASSERT(depth == bb->depth);
    for (i = 0; i < depth; i++) {
283         if (aa->labels[i] < bb->labels[i]) return -1;
284         if (aa->labels[i] > bb->labels[i]) return 1;
285     }
286     return 0;
287 }
288 
289 
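//
// Compare two addresses by child numbers, rotated by __kmp_affinity_compact.
// For example, with depth == 3 and __kmp_affinity_compact == 1, the fields
// are compared in the order childNums[2] (the innermost level) first, then
// childNums[0], then childNums[1].
//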
290 static int
291 __kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
292 {
293     const Address *aa = (const Address *)&(((AddrUnsPair *)a)
294       ->first);
295     const Address *bb = (const Address *)&(((AddrUnsPair *)b)
296       ->first);
297     unsigned depth = aa->depth;
298     unsigned i;
299     KMP_DEBUG_ASSERT(depth == bb->depth);
300     KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
301     KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
302     for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
303         int j = depth - i - 1;
304         if (aa->childNums[j] < bb->childNums[j]) return -1;
305         if (aa->childNums[j] > bb->childNums[j]) return 1;
306     }
307     for (; i < depth; i++) {
308         int j = i - __kmp_affinity_compact;
309         if (aa->childNums[j] < bb->childNums[j]) return -1;
310         if (aa->childNums[j] > bb->childNums[j]) return 1;
311     }
312     return 0;
313 }
314 
315 /** A structure for holding machine-specific hierarchy info to be computed once at init. */
316 class hierarchy_info {
317 public:
318     /** Typical levels are threads/core, cores/package or socket, packages/node, nodes/machine,
319         etc.  We don't want to get specific with nomenclature */
320     static const kmp_uint32 maxLevels=7;
321 
322     /** This is specifically the depth of the machine configuration hierarchy, in terms of the
323         number of levels along the longest path from root to any leaf. It corresponds to the
324         number of entries in numPerLevel if we exclude all but one trailing 1. */
325     kmp_uint32 depth;
326     kmp_uint32 base_depth;
327     kmp_uint32 base_num_threads;
328     bool uninitialized;
329 
330     /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a
331         node at level i has. For example, if we have a machine with 4 packages, 4 cores/package
332         and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}. All empty levels are set to 1. */
333     kmp_uint32 numPerLevel[maxLevels];
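    /** skipPerLevel[i] is the stride, in leaves, between consecutive subtrees rooted at
        level i; init() computes it as a running product of numPerLevel.  E.g., ignoring
        the width-balancing step in init(), numPerLevel = {2, 4, 4, 1, 1} gives
        skipPerLevel = {1, 2, 8, 32, 32}. */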
334     kmp_uint32 skipPerLevel[maxLevels];
335 
336     void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
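        // Walk childNums from the finest level (index hier_depth-1) to the
        // coarsest, recording max ordinal + 1 as the child count at each
        // level; numPerLevel is filled leaf-first (level 0 = leaves).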
337         int hier_depth = adr2os[0].first.depth;
338         int level = 0;
339         for (int i=hier_depth-1; i>=0; --i) {
340             int max = -1;
341             for (int j=0; j<num_addrs; ++j) {
342                 int next = adr2os[j].first.childNums[i];
343                 if (next > max) max = next;
344             }
345             numPerLevel[level] = max+1;
346             ++level;
347         }
348     }
349 
350     hierarchy_info() : depth(1), uninitialized(true) {}
351     void init(AddrUnsPair *adr2os, int num_addrs)
352     {
353         /* Added explicit initialization of the depth here to prevent usage of dirty value
354            observed when static library is re-initialized multiple times (e.g. when
355            non-OpenMP thread repeatedly launches/joins thread that uses OpenMP). */
356         depth = 1;
357         uninitialized = false;
358         for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
359             numPerLevel[i] = 1;
360             skipPerLevel[i] = 1;
361         }
362 
363         // Sort table by physical ID
364         if (adr2os) {
365             qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
366             deriveLevels(adr2os, num_addrs);
367         }
368         else {
369             numPerLevel[0] = 4;
370             numPerLevel[1] = num_addrs/4;
371             if (num_addrs%4) numPerLevel[1]++;
372         }
373 
374         base_num_threads = num_addrs;
375         for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
376             if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
377                 depth++;
378 
379         kmp_uint32 branch = 4;
380         if (numPerLevel[0] == 1) branch = num_addrs/4;
381         if (branch<4) branch=4;
382         for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
383             while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
384                 if (numPerLevel[d] & 1) numPerLevel[d]++;
385                 numPerLevel[d] = numPerLevel[d] >> 1;
386                 if (numPerLevel[d+1] == 1) depth++;
387                 numPerLevel[d+1] = numPerLevel[d+1] << 1;
388             }
389             if(numPerLevel[0] == 1) {
390                 branch = branch >> 1;
391                 if (branch<4) branch = 4;
392             }
393         }
394 
395         for (kmp_uint32 i=1; i<depth; ++i)
396             skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];
397 
398         base_depth = depth;
399     }
400 };
401 
402 static hierarchy_info machine_hierarchy;
403 
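//
// Fill in the barrier state's view of the machine hierarchy, extending the
// hierarchy on the fly if nproc exceeds its base size.  For example
// (hypothetical numbers): with base_num_threads == 8, base_depth == 3, and
// skipPerLevel == {1, 4, 8}, a call with nproc == 32 extends the hierarchy
// twice, leaving depth == 5 and skipPerLevel == {1, 4, 8, 16, 32}.
//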
404 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
405     if (machine_hierarchy.uninitialized)
406         machine_hierarchy.init(NULL, nproc);
407 
408     if (nproc <= machine_hierarchy.base_num_threads)
409         machine_hierarchy.depth = machine_hierarchy.base_depth;
410     KMP_DEBUG_ASSERT(machine_hierarchy.depth > 0);
411     while (nproc > machine_hierarchy.skipPerLevel[machine_hierarchy.depth-1]) {
412         machine_hierarchy.depth++;
        machine_hierarchy.skipPerLevel[machine_hierarchy.depth-1] =
          2 * machine_hierarchy.skipPerLevel[machine_hierarchy.depth-2];
414     }
415     thr_bar->depth = machine_hierarchy.depth;
416     thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
417     thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
418 }
419 
420 //
421 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
422 // called to renumber the labels from [0..n] and place them into the child_num
423 // vector of the address object.  This is done in case the labels used for
424 // the children at one node of the hierarchy differ from those used for
425 // another node at the same level.  Example:  suppose the machine has 2 nodes
// with 2 packages each.  The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604.  If we try to sort the
// table
428 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
429 // because we are paying attention to the labels themselves, not the ordinal
430 // child numbers.  By using the child numbers in the sort, the result is
431 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
432 //
433 static void
434 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
435   int numAddrs)
436 {
437     KMP_DEBUG_ASSERT(numAddrs > 0);
438     int depth = address2os->first.depth;
439     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
442     int labCt;
443     for (labCt = 0; labCt < depth; labCt++) {
444         address2os[0].first.childNums[labCt] = counts[labCt] = 0;
445         lastLabel[labCt] = address2os[0].first.labels[labCt];
446     }
447     int i;
448     for (i = 1; i < numAddrs; i++) {
449         for (labCt = 0; labCt < depth; labCt++) {
450             if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
451                 int labCt2;
452                 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
453                     counts[labCt2] = 0;
454                     lastLabel[labCt2] = address2os[i].first.labels[labCt2];
455                 }
456                 counts[labCt]++;
457                 lastLabel[labCt] = address2os[i].first.labels[labCt];
458                 break;
459             }
460         }
461         for (labCt = 0; labCt < depth; labCt++) {
462             address2os[i].first.childNums[labCt] = counts[labCt];
463         }
464         for (; labCt < (int)Address::maxDepth; labCt++) {
465             address2os[i].first.childNums[labCt] = 0;
466         }
467     }
468 }
469 
470 
471 //
472 // All of the __kmp_affinity_create_*_map() routines should set
473 // __kmp_affinity_masks to a vector of affinity mask objects of length
474 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
475 // return the number of levels in the machine topology tree (zero if
476 // __kmp_affinity_type == affinity_none).
477 //
478 // All of the __kmp_affinity_create_*_map() routines should set *fullMask
479 // to the affinity mask for the initialization thread.  They need to save and
480 // restore the mask, and it could be needed later, so saving it is just an
481 // optimization to avoid calling kmp_get_system_affinity() again.
482 //
483 static kmp_affin_mask_t *fullMask = NULL;
484 
485 kmp_affin_mask_t *
486 __kmp_affinity_get_fullMask() { return fullMask; }
487 
488 
489 static int nCoresPerPkg, nPackages;
490 static int __kmp_nThreadsPerCore;
491 #ifndef KMP_DFLT_NTH_CORES
492 static int __kmp_ncores;
493 #endif
494 
495 //
496 // __kmp_affinity_uniform_topology() doesn't work when called from
497 // places which support arbitrarily many levels in the machine topology
498 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// and __kmp_affinity_create_x2apicid_map().
500 //
501 inline static bool
502 __kmp_affinity_uniform_topology()
503 {
504     return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
505 }
506 
507 
508 //
509 // Print out the detailed machine topology map, i.e. the physical locations
510 // of each OS proc.
511 //
512 static void
513 __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
514   int pkgLevel, int coreLevel, int threadLevel)
515 {
516     int proc;
517 
518     KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
519     for (proc = 0; proc < len; proc++) {
520         int level;
521         kmp_str_buf_t buf;
522         __kmp_str_buf_init(&buf);
523         for (level = 0; level < depth; level++) {
524             if (level == threadLevel) {
525                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
526             }
527             else if (level == coreLevel) {
528                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
529             }
530             else if (level == pkgLevel) {
531                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
532             }
533             else if (level > pkgLevel) {
534                 __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
535                   level - pkgLevel - 1);
536             }
537             else {
538                 __kmp_str_buf_print(&buf, "L%d ", level);
539             }
540             __kmp_str_buf_print(&buf, "%d ",
541               address2os[proc].first.labels[level]);
542         }
543         KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
544           buf.str);
545         __kmp_str_buf_free(&buf);
546     }
547 }
548 
549 
550 //
551 // If we don't know how to retrieve the machine's processor topology, or
552 // encounter an error in doing so, this routine is called to form a "flat"
553 // mapping of os thread id's <-> processor id's.
554 //
555 static int
556 __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
557   kmp_i18n_id_t *const msg_id)
558 {
559     *address2os = NULL;
560     *msg_id = kmp_i18n_null;
561 
562     //
563     // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ncores, as well as
565     // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
566     //
567     if (! KMP_AFFINITY_CAPABLE()) {
568         KMP_ASSERT(__kmp_affinity_type == affinity_none);
569         __kmp_ncores = nPackages = __kmp_xproc;
570         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
571         if (__kmp_affinity_verbose) {
572             KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
573             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
574             KMP_INFORM(Uniform, "KMP_AFFINITY");
575             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
576               __kmp_nThreadsPerCore, __kmp_ncores);
577         }
578         return 0;
579     }
580 
581     //
582     // When affinity is off, this routine will still be called to set
583     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
584     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
585     //  correctly, and return now if affinity is not enabled.
586     //
587     __kmp_ncores = nPackages = __kmp_avail_proc;
588     __kmp_nThreadsPerCore = nCoresPerPkg = 1;
589     if (__kmp_affinity_verbose) {
590         char buf[KMP_AFFIN_MASK_PRINT_LEN];
591         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
592 
593         KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
594         if (__kmp_affinity_respect_mask) {
595             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
596         } else {
597             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
598         }
599         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
600         KMP_INFORM(Uniform, "KMP_AFFINITY");
601         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
602           __kmp_nThreadsPerCore, __kmp_ncores);
603     }
604     if (__kmp_affinity_type == affinity_none) {
605         return 0;
606     }
607 
608     //
    // Construct the data structure to be returned.
610     //
611     *address2os = (AddrUnsPair*)
612       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
613     int avail_ct = 0;
614     unsigned int i;
615     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
616         //
617         // Skip this proc if it is not included in the machine model.
618         //
619         if (! KMP_CPU_ISSET(i, fullMask)) {
620             continue;
621         }
622 
623         Address addr(1);
624         addr.labels[0] = i;
625         (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
626     }
627     if (__kmp_affinity_verbose) {
628         KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
629     }
630 
631     if (__kmp_affinity_gran_levels < 0) {
632         //
633         // Only the package level is modeled in the machine topology map,
634         // so the #levels of granularity is either 0 or 1.
635         //
636         if (__kmp_affinity_gran > affinity_gran_package) {
637             __kmp_affinity_gran_levels = 1;
638         }
639         else {
640             __kmp_affinity_gran_levels = 0;
641         }
642     }
643     return 1;
644 }
645 
646 
647 # if KMP_GROUP_AFFINITY
648 
649 //
650 // If multiple Windows* OS processor groups exist, we can create a 2-level
651 // topology map with the groups at level 0 and the individual procs at
652 // level 1.
653 //
654 // This facilitates letting the threads float among all procs in a group,
655 // if granularity=group (the default when there are multiple groups).
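//
// E.g., on a 64-bit Windows* OS system with 2 groups of 64 procs each,
// OS proc 70 gets label 1 (its group) at level 0 and label 6 (its index
// within the group) at level 1.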
656 //
657 static int
658 __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
659   kmp_i18n_id_t *const msg_id)
660 {
661     *address2os = NULL;
662     *msg_id = kmp_i18n_null;
663 
664     //
665     // If we don't have multiple processor groups, return now.
666     // The flat mapping will be used.
667     //
668     if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
669         // FIXME set *msg_id
670         return -1;
671     }
672 
673     //
    // Construct the data structure to be returned.
675     //
676     *address2os = (AddrUnsPair*)
677       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
678     int avail_ct = 0;
679     int i;
680     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
681         //
682         // Skip this proc if it is not included in the machine model.
683         //
684         if (! KMP_CPU_ISSET(i, fullMask)) {
685             continue;
686         }
687 
688         Address addr(2);
689         addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
690         addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
691         (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
692 
693         if (__kmp_affinity_verbose) {
694             KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
695               addr.labels[1]);
696         }
697     }
698 
699     if (__kmp_affinity_gran_levels < 0) {
700         if (__kmp_affinity_gran == affinity_gran_group) {
701             __kmp_affinity_gran_levels = 1;
702         }
703         else if ((__kmp_affinity_gran == affinity_gran_fine)
704           || (__kmp_affinity_gran == affinity_gran_thread)) {
705             __kmp_affinity_gran_levels = 0;
706         }
707         else {
708             const char *gran_str = NULL;
709             if (__kmp_affinity_gran == affinity_gran_core) {
710                 gran_str = "core";
711             }
712             else if (__kmp_affinity_gran == affinity_gran_package) {
713                 gran_str = "package";
714             }
715             else if (__kmp_affinity_gran == affinity_gran_node) {
716                 gran_str = "node";
717             }
718             else {
719                 KMP_ASSERT(0);
720             }
721 
            // Warning: can't use affinity granularity "gran" with group topology method; using "thread"
723             __kmp_affinity_gran_levels = 0;
724         }
725     }
726     return 2;
727 }
728 
729 # endif /* KMP_GROUP_AFFINITY */
730 
731 
732 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
733 
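//
// Return the number of low-order bits needed to represent [0..count-1],
// i.e. ceil(log2(count)).  E.g., __kmp_cpuid_mask_width(1) == 0,
// __kmp_cpuid_mask_width(2) == 1, __kmp_cpuid_mask_width(4) == 2, and a
// non-power-of-2 count rounds up: __kmp_cpuid_mask_width(6) == 3.
//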
734 static int
735 __kmp_cpuid_mask_width(int count) {
736     int r = 0;
737 
738     while((1<<r) < count)
739         ++r;
740     return r;
741 }
742 
743 
744 class apicThreadInfo {
745 public:
746     unsigned osId;              // param to __kmp_affinity_bind_thread
747     unsigned apicId;            // from cpuid after binding
748     unsigned maxCoresPerPkg;    //      ""
749     unsigned maxThreadsPerPkg;  //      ""
750     unsigned pkgId;             // inferred from above values
751     unsigned coreId;            //      ""
752     unsigned threadId;          //      ""
753 };
754 
755 
756 static int
757 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
758 {
759     const apicThreadInfo *aa = (const apicThreadInfo *)a;
760     const apicThreadInfo *bb = (const apicThreadInfo *)b;
761     if (aa->osId < bb->osId) return -1;
762     if (aa->osId > bb->osId) return 1;
763     return 0;
764 }
765 
766 
767 static int
768 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
769 {
770     const apicThreadInfo *aa = (const apicThreadInfo *)a;
771     const apicThreadInfo *bb = (const apicThreadInfo *)b;
772     if (aa->pkgId < bb->pkgId) return -1;
773     if (aa->pkgId > bb->pkgId) return 1;
774     if (aa->coreId < bb->coreId) return -1;
775     if (aa->coreId > bb->coreId) return 1;
776     if (aa->threadId < bb->threadId) return -1;
777     if (aa->threadId > bb->threadId) return 1;
778     return 0;
779 }
780 
781 
782 //
783 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
784 // an algorithm which cycles through the available os threads, setting
785 // the current thread's affinity mask to that thread, and then retrieves
786 // the Apic Id for each thread context using the cpuid instruction.
787 //
788 static int
789 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
790   kmp_i18n_id_t *const msg_id)
791 {
792     kmp_cpuid buf;
793     int rc;
794     *address2os = NULL;
795     *msg_id = kmp_i18n_null;
796 
797     //
798     // Check if cpuid leaf 4 is supported.
799     //
800         __kmp_x86_cpuid(0, 0, &buf);
801         if (buf.eax < 4) {
802             *msg_id = kmp_i18n_str_NoLeaf4Support;
803             return -1;
804         }
805 
806     //
807     // The algorithm used starts by setting the affinity to each available
808     // thread and retrieving info from the cpuid instruction, so if we are
809     // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
811     // the defaults that we calculated from issuing cpuid without binding
812     // to each proc.
813     //
814     if (! KMP_AFFINITY_CAPABLE()) {
815         //
816         // Hack to try and infer the machine topology using only the data
817         // available from cpuid on the current thread, and __kmp_xproc.
818         //
819         KMP_ASSERT(__kmp_affinity_type == affinity_none);
820 
821         //
822         // Get an upper bound on the number of threads per package using
823         // cpuid(1).
824         //
        // On some OS/chip combinations where HT is supported by the chip
826         // but is disabled, this value will be 2 on a single core chip.
827         // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
828         //
829         __kmp_x86_cpuid(1, 0, &buf);
830         int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
831         if (maxThreadsPerPkg == 0) {
832             maxThreadsPerPkg = 1;
833         }
834 
835         //
836         // The num cores per pkg comes from cpuid(4).
837         // 1 must be added to the encoded value.
838         //
        // The author of cpu_count.cpp treated this as only an upper bound
840         // on the number of cores, but I haven't seen any cases where it
841         // was greater than the actual number of cores, so we will treat
842         // it as exact in this block of code.
843         //
844         // First, we need to check if cpuid(4) is supported on this chip.
845         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
846         // has the value n or greater.
847         //
848         __kmp_x86_cpuid(0, 0, &buf);
849         if (buf.eax >= 4) {
850             __kmp_x86_cpuid(4, 0, &buf);
851             nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
852         }
853         else {
854             nCoresPerPkg = 1;
855         }
856 
857         //
858         // There is no way to reliably tell if HT is enabled without issuing
        // the cpuid instruction from every thread and correlating the cpuid
860         // info, so if the machine is not affinity capable, we assume that HT
861         // is off.  We have seen quite a few machines where maxThreadsPerPkg
862         // is 2, yet the machine does not support HT.
863         //
864         // - Older OSes are usually found on machines with older chips, which
865         //   do not support HT.
866         //
867         // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly set
        //   to 0) is greater than the penalty for mistakenly identifying
870         //   a machine as being 1 thread/core when it is really HT enabled
871         //   (which results in blocktime being incorrectly set to a positive
872         //   value).
873         //
874         __kmp_ncores = __kmp_xproc;
875         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
876         __kmp_nThreadsPerCore = 1;
877         if (__kmp_affinity_verbose) {
878             KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
879             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
880             if (__kmp_affinity_uniform_topology()) {
881                 KMP_INFORM(Uniform, "KMP_AFFINITY");
882             } else {
883                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
884             }
885             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
886               __kmp_nThreadsPerCore, __kmp_ncores);
887         }
888         return 0;
889     }
890 
891     //
892     //
893     // From here on, we can assume that it is safe to call
894     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
895     // even if __kmp_affinity_type = affinity_none.
896     //
897 
898     //
899     // Save the affinity mask for the current thread.
900     //
901     kmp_affin_mask_t *oldMask;
902     KMP_CPU_ALLOC(oldMask);
903     KMP_ASSERT(oldMask != NULL);
904     __kmp_get_system_affinity(oldMask, TRUE);
905 
906     //
907     // Run through each of the available contexts, binding the current thread
908     // to it, and obtaining the pertinent information using the cpuid instr.
909     //
910     // The relevant information is:
911     //
912     // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //    has a unique Apic Id, which is of the form pkg# : core# : thread#.
914     //
    // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
    //    value of this field determines the width of the core# + thread#
    //    fields in the Apic Id.  It is also an upper bound on the number
    //    of threads per package, but it has been verified that situations
    //    happen where it is not exact.  In particular, on certain OS/chip
    //    combinations where Intel(R) Hyper-Threading Technology is supported
    //    by the chip but has been disabled, the value of this field will be 2
    //    (for a single core chip).  On other OS/chip combinations supporting
    //    Intel(R) Hyper-Threading Technology, the value of this field will be
    //    1 when Intel(R) Hyper-Threading Technology is disabled and 2 when it
    //    is enabled.
927     //
928     // Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4).  The
929     //    value of this field (+1) determines the width of the core# field in
930     //    the Apic Id.  The comments in "cpucount.cpp" say that this value is
931     //    an upper bound, but the IA-32 architecture manual says that it is
932     //    exactly the number of cores per package, and I haven't seen any
933     //    case where it wasn't.
934     //
935     // From this information, deduce the package Id, core Id, and thread Id,
936     // and set the corresponding fields in the apicThreadInfo struct.
937     //
938     unsigned i;
939     apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
940       __kmp_avail_proc * sizeof(apicThreadInfo));
941     unsigned nApics = 0;
942     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
943         //
944         // Skip this proc if it is not included in the machine model.
945         //
946         if (! KMP_CPU_ISSET(i, fullMask)) {
947             continue;
948         }
949         KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
950 
951         __kmp_affinity_bind_thread(i);
952         threadInfo[nApics].osId = i;
953 
954         //
955         // The apic id and max threads per pkg come from cpuid(1).
956         //
957         __kmp_x86_cpuid(1, 0, &buf);
958         if (! (buf.edx >> 9) & 1) {
959             __kmp_set_system_affinity(oldMask, TRUE);
960             __kmp_free(threadInfo);
961             KMP_CPU_FREE(oldMask);
962             *msg_id = kmp_i18n_str_ApicNotPresent;
963             return -1;
964         }
965         threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
966         threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
967         if (threadInfo[nApics].maxThreadsPerPkg == 0) {
968             threadInfo[nApics].maxThreadsPerPkg = 1;
969         }
970 
971         //
972         // Max cores per pkg comes from cpuid(4).
973         // 1 must be added to the encoded value.
974         //
975         // First, we need to check if cpuid(4) is supported on this chip.
976         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
977         // has the value n or greater.
978         //
979         __kmp_x86_cpuid(0, 0, &buf);
980         if (buf.eax >= 4) {
981             __kmp_x86_cpuid(4, 0, &buf);
982             threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
983         }
984         else {
985             threadInfo[nApics].maxCoresPerPkg = 1;
986         }
987 
988         //
989         // Infer the pkgId / coreId / threadId using only the info
990         // obtained locally.
991         //
992         int widthCT = __kmp_cpuid_mask_width(
993           threadInfo[nApics].maxThreadsPerPkg);
994         threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
995 
996         int widthC = __kmp_cpuid_mask_width(
997           threadInfo[nApics].maxCoresPerPkg);
998         int widthT = widthCT - widthC;
999         if (widthT < 0) {
1000             //
1001             // I've never seen this one happen, but I suppose it could, if
1002             // the cpuid instruction on a chip was really screwed up.
1003             // Make sure to restore the affinity mask before the tail call.
1004             //
1005             __kmp_set_system_affinity(oldMask, TRUE);
1006             __kmp_free(threadInfo);
1007             KMP_CPU_FREE(oldMask);
1008             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1009             return -1;
1010         }
1011 
        int maskC = (1 << widthC) - 1;
        threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
          & maskC;

        int maskT = (1 << widthT) - 1;
        threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;
1018 
1019         nApics++;
1020     }
1021 
1022     //
1023     // We've collected all the info we need.
1024     // Restore the old affinity mask for this thread.
1025     //
1026     __kmp_set_system_affinity(oldMask, TRUE);
1027 
1028     //
1029     // If there's only one thread context to bind to, form an Address object
1030     // with depth 1 and return immediately (or, if affinity is off, set
1031     // address2os to NULL and return).
1032     //
1033     // If it is configured to omit the package level when there is only a
1034     // single package, the logic at the end of this routine won't work if
1035     // there is only a single thread - it would try to form an Address
1036     // object with depth 0.
1037     //
1038     KMP_ASSERT(nApics > 0);
1039     if (nApics == 1) {
1040         __kmp_ncores = nPackages = 1;
1041         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1042         if (__kmp_affinity_verbose) {
1043             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1044             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1045 
1046             KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1047             if (__kmp_affinity_respect_mask) {
1048                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1049             } else {
1050                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1051             }
1052             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1053             KMP_INFORM(Uniform, "KMP_AFFINITY");
1054             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1055               __kmp_nThreadsPerCore, __kmp_ncores);
1056         }
1057 
1058         if (__kmp_affinity_type == affinity_none) {
1059             __kmp_free(threadInfo);
1060             KMP_CPU_FREE(oldMask);
1061             return 0;
1062         }
1063 
1064         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
1065         Address addr(1);
1066         addr.labels[0] = threadInfo[0].pkgId;
1067         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
1068 
1069         if (__kmp_affinity_gran_levels < 0) {
1070             __kmp_affinity_gran_levels = 0;
1071         }
1072 
1073         if (__kmp_affinity_verbose) {
1074             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1075         }
1076 
1077         __kmp_free(threadInfo);
1078         KMP_CPU_FREE(oldMask);
1079         return 1;
1080     }
1081 
1082     //
1083     // Sort the threadInfo table by physical Id.
1084     //
1085     qsort(threadInfo, nApics, sizeof(*threadInfo),
1086       __kmp_affinity_cmp_apicThreadInfo_phys_id);
1087 
1088     //
1089     // The table is now sorted by pkgId / coreId / threadId, but we really
1090     // don't know the radix of any of the fields.  pkgId's may be sparsely
1091     // assigned among the chips on a system.  Although coreId's are usually
1092     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1093     // [0..threadsPerCore-1], we don't want to make any such assumptions.
1094     //
1095     // For that matter, we don't know what coresPerPkg and threadsPerCore
1096     // (or the total # packages) are at this point - we want to determine
1097     // that now.  We only have an upper bound on the first two figures.
1098     //
1099     // We also perform a consistency check at this point: the values returned
1100     // by the cpuid instruction for any thread bound to a given package had
1101     // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1102     //
1103     nPackages = 1;
1104     nCoresPerPkg = 1;
1105     __kmp_nThreadsPerCore = 1;
1106     unsigned nCores = 1;
1107 
1108     unsigned pkgCt = 1;                         // to determine radii
1109     unsigned lastPkgId = threadInfo[0].pkgId;
1110     unsigned coreCt = 1;
1111     unsigned lastCoreId = threadInfo[0].coreId;
1112     unsigned threadCt = 1;
1113     unsigned lastThreadId = threadInfo[0].threadId;
1114 
                                                // intra-pkg consistency checks
1116     unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1117     unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1118 
1119     for (i = 1; i < nApics; i++) {
1120         if (threadInfo[i].pkgId != lastPkgId) {
1121             nCores++;
1122             pkgCt++;
1123             lastPkgId = threadInfo[i].pkgId;
1124             if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1125             coreCt = 1;
1126             lastCoreId = threadInfo[i].coreId;
1127             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1128             threadCt = 1;
1129             lastThreadId = threadInfo[i].threadId;
1130 
1131             //
1132             // This is a different package, so go on to the next iteration
1133             // without doing any consistency checks.  Reset the consistency
1134             // check vars, though.
1135             //
1136             prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1137             prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1138             continue;
1139         }
1140 
1141         if (threadInfo[i].coreId != lastCoreId) {
1142             nCores++;
1143             coreCt++;
1144             lastCoreId = threadInfo[i].coreId;
1145             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1146             threadCt = 1;
1147             lastThreadId = threadInfo[i].threadId;
1148         }
1149         else if (threadInfo[i].threadId != lastThreadId) {
1150             threadCt++;
1151             lastThreadId = threadInfo[i].threadId;
1152         }
1153         else {
1154             __kmp_free(threadInfo);
1155             KMP_CPU_FREE(oldMask);
1156             *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1157             return -1;
1158         }
1159 
1160         //
1161         // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
        // fields agree between all the threads bound to a given package.
1163         //
1164         if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
1165           || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1166             __kmp_free(threadInfo);
1167             KMP_CPU_FREE(oldMask);
1168             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1169             return -1;
1170         }
1171     }
1172     nPackages = pkgCt;
1173     if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1174     if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1175 
1176     //
1177     // When affinity is off, this routine will still be called to set
1178     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1179     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1180     // correctly, and return now if affinity is not enabled.
1181     //
1182     __kmp_ncores = nCores;
1183     if (__kmp_affinity_verbose) {
1184         char buf[KMP_AFFIN_MASK_PRINT_LEN];
1185         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1186 
1187         KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1188         if (__kmp_affinity_respect_mask) {
1189             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1190         } else {
1191             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1192         }
1193         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1194         if (__kmp_affinity_uniform_topology()) {
1195             KMP_INFORM(Uniform, "KMP_AFFINITY");
1196         } else {
1197             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1198         }
1199         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1200           __kmp_nThreadsPerCore, __kmp_ncores);
1201 
1202     }
1203 
1204     if (__kmp_affinity_type == affinity_none) {
1205         __kmp_free(threadInfo);
1206         KMP_CPU_FREE(oldMask);
1207         return 0;
1208     }
1209 
1210     //
1211     // Now that we've determined the number of packages, the number of cores
1212     // per package, and the number of threads per core, we can construct the
1213     // data structure that is to be returned.
1214     //
1215     int pkgLevel = 0;
1216     int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1217     int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1218     unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
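    // E.g., on a multi-core machine with Intel(R) Hyper-Threading Technology
    // disabled, coreLevel == 1 and threadLevel == -1, so depth == 2
    // (package and core levels only).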
1219 
1220     KMP_ASSERT(depth > 0);
1221     *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1222 
1223     for (i = 0; i < nApics; ++i) {
1224         Address addr(depth);
1225         unsigned os = threadInfo[i].osId;
1226         int d = 0;
1227 
1228         if (pkgLevel >= 0) {
1229             addr.labels[d++] = threadInfo[i].pkgId;
1230         }
1231         if (coreLevel >= 0) {
1232             addr.labels[d++] = threadInfo[i].coreId;
1233         }
1234         if (threadLevel >= 0) {
1235             addr.labels[d++] = threadInfo[i].threadId;
1236         }
1237         (*address2os)[i] = AddrUnsPair(addr, os);
1238     }
1239 
1240     if (__kmp_affinity_gran_levels < 0) {
1241         //
1242         // Set the granularity level based on what levels are modeled
1243         // in the machine topology map.
1244         //
1245         __kmp_affinity_gran_levels = 0;
1246         if ((threadLevel >= 0)
1247           && (__kmp_affinity_gran > affinity_gran_thread)) {
1248             __kmp_affinity_gran_levels++;
1249         }
1250         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1251             __kmp_affinity_gran_levels++;
1252         }
1253         if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1254             __kmp_affinity_gran_levels++;
1255         }
1256     }
1257 
1258     if (__kmp_affinity_verbose) {
1259         __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1260           coreLevel, threadLevel);
1261     }
1262 
1263     __kmp_free(threadInfo);
1264     KMP_CPU_FREE(oldMask);
1265     return depth;
1266 }
1267 
1268 
1269 //
1270 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1271 // architectures support a newer interface for specifying the x2APIC Ids,
1272 // based on cpuid leaf 11.
1273 //
1274 static int
1275 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1276   kmp_i18n_id_t *const msg_id)
1277 {
1278     kmp_cpuid buf;
1279 
1280     *address2os = NULL;
1281     *msg_id = kmp_i18n_null;
1282 
1283     //
1284     // Check to see if cpuid leaf 11 is supported.
1285     //
1286     __kmp_x86_cpuid(0, 0, &buf);
1287     if (buf.eax < 11) {
1288         *msg_id = kmp_i18n_str_NoLeaf11Support;
1289         return -1;
1290     }
1291     __kmp_x86_cpuid(11, 0, &buf);
1292     if (buf.ebx == 0) {
1293         *msg_id = kmp_i18n_str_NoLeaf11Support;
1294         return -1;
1295     }
1296 
1297     //
1298     // Find the number of levels in the machine topology.  While we're at it,
1299     // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.  We will
1300     // try to get more accurate values later by explicitly counting them,
1301     // but get reasonable defaults now, in case we return early.
1302     //
1303     int level;
1304     int threadLevel = -1;
1305     int coreLevel = -1;
1306     int pkgLevel = -1;
1307     __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1308 
1309     for (level = 0;; level++) {
1310         if (level > 31) {
1311             //
1312             // FIXME: Hack for DPD200163180
1313             //
1314             // If level is big then something went wrong -> exiting
1315             //
1316             // There could actually be 32 valid levels in the machine topology,
1317             // but so far, the only machine we have seen which does not exit
1318             // this loop before iteration 32 has fubar x2APIC settings.
1319             //
1320             // For now, just reject this case based upon loop trip count.
1321             //
1322             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1323             return -1;
1324         }
1325         __kmp_x86_cpuid(11, level, &buf);
1326         if (buf.ebx == 0) {
1327             if (pkgLevel < 0) {
1328                 //
1329                 // Will infer nPackages from __kmp_xproc
1330                 //
1331                 pkgLevel = level;
1332                 level++;
1333             }
1334             break;
1335         }
1336         int kind = (buf.ecx >> 8) & 0xff;
1337         if (kind == 1) {
1338             //
1339             // SMT level
1340             //
1341             threadLevel = level;
1342             coreLevel = -1;
1343             pkgLevel = -1;
1344             __kmp_nThreadsPerCore = buf.ebx & 0xff;
1345             if (__kmp_nThreadsPerCore == 0) {
1346                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1347                 return -1;
1348             }
1349         }
1350         else if (kind == 2) {
1351             //
1352             // core level
1353             //
1354             coreLevel = level;
1355             pkgLevel = -1;
1356             nCoresPerPkg = buf.ebx & 0xff;
1357             if (nCoresPerPkg == 0) {
1358                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1359                 return -1;
1360             }
1361         }
1362         else {
1363             if (level <= 0) {
1364                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1365                 return -1;
1366             }
1367             if (pkgLevel >= 0) {
1368                 continue;
1369             }
1370             pkgLevel = level;
1371             nPackages = buf.ebx & 0xff;
1372             if (nPackages == 0) {
1373                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1374                 return -1;
1375             }
1376         }
1377     }
1378     int depth = level;
1379 
1380     //
1381     // In the above loop, "level" was counted from the finest level (usually
1382     // thread) to the coarsest.  The caller expects that we will place the
1383     // labels in (*address2os)[].first.labels[] in the inverse order, so
1384     // we need to invert the vars saying which level means what.
1385     //
1386     if (threadLevel >= 0) {
1387         threadLevel = depth - threadLevel - 1;
1388     }
1389     if (coreLevel >= 0) {
1390         coreLevel = depth - coreLevel - 1;
1391     }
1392     KMP_DEBUG_ASSERT(pkgLevel >= 0);
1393     pkgLevel = depth - pkgLevel - 1;
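    // E.g., if the loop above saw level 0 == SMT and level 1 == core before
    // terminating, then depth == 3 and the inverted indices are
    // pkgLevel == 0, coreLevel == 1, threadLevel == 2.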
1394 
1395     //
1396     // The algorithm used starts by setting the affinity to each available
1397     // thread and retrieving info from the cpuid instruction, so if we are
1398     // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
1400     // the defaults that we calculated from issuing cpuid without binding
1401     // to each proc.
1402     //
1403     if (! KMP_AFFINITY_CAPABLE())
1404     {
1405         //
1406         // Hack to try and infer the machine topology using only the data
1407         // available from cpuid on the current thread, and __kmp_xproc.
1408         //
1409         KMP_ASSERT(__kmp_affinity_type == affinity_none);
1410 
1411         __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1412         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1413         if (__kmp_affinity_verbose) {
1414             KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1415             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1416             if (__kmp_affinity_uniform_topology()) {
1417                 KMP_INFORM(Uniform, "KMP_AFFINITY");
1418             } else {
1419                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1420             }
1421             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1422               __kmp_nThreadsPerCore, __kmp_ncores);
1423         }
1424         return 0;
1425     }
1426 
1427     //
1428     //
1429     // From here on, we can assume that it is safe to call
1430     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1431     // even if __kmp_affinity_type = affinity_none.
1432     //
1433 
1434     //
1435     // Save the affinity mask for the current thread.
1436     //
1437     kmp_affin_mask_t *oldMask;
1438     KMP_CPU_ALLOC(oldMask);
1439     __kmp_get_system_affinity(oldMask, TRUE);
1440 
1441     //
1442     // Allocate the data structure to be returned.
1443     //
1444     AddrUnsPair *retval = (AddrUnsPair *)
1445       __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1446 
1447     //
1448     // Run through each of the available contexts, binding the current thread
1449     // to it, and obtaining the pertinent information using the cpuid instr.
1450     //
1451     unsigned int proc;
1452     int nApics = 0;
1453     for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
1454         //
1455         // Skip this proc if it is not included in the machine model.
1456         //
1457         if (! KMP_CPU_ISSET(proc, fullMask)) {
1458             continue;
1459         }
1460         KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1461 
1462         __kmp_affinity_bind_thread(proc);
1463 
1464         //
        // Extract the labels for each level in the machine topology map
1466         // from the Apic ID.
1467         //
1468         Address addr(depth);
1469         int prev_shift = 0;
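
        // For example (hypothetical widths): if cpuid(11, 0) reports a shift
        // of 1 (SMT level) and cpuid(11, 1) a shift of 5 (core level), then
        // bit 0 of the apicId becomes the thread label, bits 1..4 the core
        // label, and bits 5 and up the package label.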
1470 
1471         for (level = 0; level < depth; level++) {
1472             __kmp_x86_cpuid(11, level, &buf);
1473             unsigned apicId = buf.edx;
1474             if (buf.ebx == 0) {
1475                 if (level != depth - 1) {
1476                     KMP_CPU_FREE(oldMask);
1477                     *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1478                     return -1;
1479                 }
1480                 addr.labels[depth - level - 1] = apicId >> prev_shift;
1481                 level++;
1482                 break;
1483             }
1484             int shift = buf.eax & 0x1f;
1485             int mask = (1 << shift) - 1;
1486             addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1487             prev_shift = shift;
1488         }
1489         if (level != depth) {
1490             KMP_CPU_FREE(oldMask);
1491             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1492             return -1;
1493         }
1494 
1495         retval[nApics] = AddrUnsPair(addr, proc);
1496         nApics++;
1497     }
1498 
1499     //
1500     // We've collected all the info we need.
1501     // Restore the old affinity mask for this thread.
1502     //
1503     __kmp_set_system_affinity(oldMask, TRUE);
1504 
1505     //
1506     // If there's only one thread context to bind to, return now.
1507     //
1508     KMP_ASSERT(nApics > 0);
1509     if (nApics == 1) {
1510         __kmp_ncores = nPackages = 1;
1511         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1512         if (__kmp_affinity_verbose) {
1513             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1514             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1515 
1516             KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1517             if (__kmp_affinity_respect_mask) {
1518                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1519             } else {
1520                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1521             }
1522             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1523             KMP_INFORM(Uniform, "KMP_AFFINITY");
1524             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1525               __kmp_nThreadsPerCore, __kmp_ncores);
1526         }
1527 
1528         if (__kmp_affinity_type == affinity_none) {
1529             __kmp_free(retval);
1530             KMP_CPU_FREE(oldMask);
1531             return 0;
1532         }
1533 
1534         //
1535         // Form an Address object which only includes the package level.
1536         //
1537         Address addr(1);
1538         addr.labels[0] = retval[0].first.labels[pkgLevel];
1539         retval[0].first = addr;
1540 
1541         if (__kmp_affinity_gran_levels < 0) {
1542             __kmp_affinity_gran_levels = 0;
1543         }
1544 
1545         if (__kmp_affinity_verbose) {
1546             __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1547         }
1548 
1549         *address2os = retval;
1550         KMP_CPU_FREE(oldMask);
1551         return 1;
1552     }
1553 
1554     //
1555     // Sort the table by physical Id.
1556     //
1557     qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1558 
1559     //
1560     // Find the radix at each of the levels.
1561     //
1562     unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1563     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1564     unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1565     unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1566     for (level = 0; level < depth; level++) {
1567         totals[level] = 1;
1568         maxCt[level] = 1;
1569         counts[level] = 1;
1570         last[level] = retval[0].first.labels[level];
1571     }
1572 
1573     //
1574     // From here on, the iteration variable "level" runs from the finest
1575     // level to the coarsest, i.e. we iterate forward through
1576     // (*address2os)[].first.labels[] - in the previous loops, we iterated
1577     // backwards.
1578     //
1579     for (proc = 1; (int)proc < nApics; proc++) {
1580         int level;
1581         for (level = 0; level < depth; level++) {
1582             if (retval[proc].first.labels[level] != last[level]) {
1583                 int j;
1584                 for (j = level + 1; j < depth; j++) {
1585                     totals[j]++;
1586                     counts[j] = 1;
                    // Resetting maxCt[j] here (the line commented out below)
                    // causes incorrect topology information to be printed
                    // when the maximum count for some level is encountered
                    // before a smaller count while walking the array.  For
                    // example, if pkg0 has 4 cores and pkg1 has 2 cores,
                    // maxCt[1] would end up as 2, whereas it must be 4.
                    // TODO!!! Check whether leaving it commented out is safe.
                    //maxCt[j] = 1;
1594                     last[j] = retval[proc].first.labels[j];
1595                 }
1596                 totals[level]++;
1597                 counts[level]++;
1598                 if (counts[level] > maxCt[level]) {
1599                     maxCt[level] = counts[level];
1600                 }
1601                 last[level] = retval[proc].first.labels[level];
1602                 break;
1603             }
1604             else if (level == depth - 1) {
1605                 __kmp_free(last);
1606                 __kmp_free(maxCt);
1607                 __kmp_free(counts);
1608                 __kmp_free(totals);
1609                 __kmp_free(retval);
1610                 KMP_CPU_FREE(oldMask);
1611                 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1612                 return -1;
1613             }
1614         }
1615     }
1616 
1617     //
1618     // When affinity is off, this routine will still be called to set
1619     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1620     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1621     // correctly, and return if affinity is not enabled.
1622     //
1623     if (threadLevel >= 0) {
1624         __kmp_nThreadsPerCore = maxCt[threadLevel];
1625     }
1626     else {
1627         __kmp_nThreadsPerCore = 1;
1628     }
1629     nPackages = totals[pkgLevel];
1630 
1631     if (coreLevel >= 0) {
1632         __kmp_ncores = totals[coreLevel];
1633         nCoresPerPkg = maxCt[coreLevel];
1634     }
1635     else {
1636         __kmp_ncores = nPackages;
1637         nCoresPerPkg = 1;
1638     }
1639 
1640     //
1641     // Check to see if the machine topology is uniform
1642     //
1643     unsigned prod = maxCt[0];
1644     for (level = 1; level < depth; level++) {
1645        prod *= maxCt[level];
1646     }
1647     bool uniform = (prod == totals[level - 1]);
1648 
1649     //
1650     // Print the machine topology summary.
1651     //
1652     if (__kmp_affinity_verbose) {
1653         char mask[KMP_AFFIN_MASK_PRINT_LEN];
1654         __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1655 
1656         KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1657         if (__kmp_affinity_respect_mask) {
1658             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1659         } else {
1660             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1661         }
1662         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1663         if (uniform) {
1664             KMP_INFORM(Uniform, "KMP_AFFINITY");
1665         } else {
1666             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1667         }
1668 
1669         kmp_str_buf_t buf;
1670         __kmp_str_buf_init(&buf);
1671 
1672         __kmp_str_buf_print(&buf, "%d", totals[0]);
1673         for (level = 1; level <= pkgLevel; level++) {
1674             __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1675         }
1676         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1677           __kmp_nThreadsPerCore, __kmp_ncores);
1678 
1679         __kmp_str_buf_free(&buf);
1680     }
1681 
1682     if (__kmp_affinity_type == affinity_none) {
1683         __kmp_free(last);
1684         __kmp_free(maxCt);
1685         __kmp_free(counts);
1686         __kmp_free(totals);
1687         __kmp_free(retval);
1688         KMP_CPU_FREE(oldMask);
1689         return 0;
1690     }
1691 
1692     //
    // Find any levels with radix 1, and remove them from the map
1694     // (except for the package level).
1695     //
1696     int new_depth = 0;
1697     for (level = 0; level < depth; level++) {
1698         if ((maxCt[level] == 1) && (level != pkgLevel)) {
1699            continue;
1700         }
1701         new_depth++;
1702     }
1703 
1704     //
1705     // If we are removing any levels, allocate a new vector to return,
1706     // and copy the relevant information to it.
1707     //
1708     if (new_depth != depth) {
1709         AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1710           sizeof(AddrUnsPair) * nApics);
1711         for (proc = 0; (int)proc < nApics; proc++) {
1712             Address addr(new_depth);
1713             new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1714         }
1715         int new_level = 0;
1716         for (level = 0; level < depth; level++) {
1717             if ((maxCt[level] == 1) && (level != pkgLevel)) {
1718                if (level == threadLevel) {
1719                    threadLevel = -1;
1720                }
1721                else if ((threadLevel >= 0) && (level < threadLevel)) {
1722                    threadLevel--;
1723                }
1724                if (level == coreLevel) {
1725                    coreLevel = -1;
1726                }
1727                else if ((coreLevel >= 0) && (level < coreLevel)) {
1728                    coreLevel--;
1729                }
1730                if (level < pkgLevel) {
1731                    pkgLevel--;
1732                }
1733                continue;
1734             }
1735             for (proc = 0; (int)proc < nApics; proc++) {
1736                 new_retval[proc].first.labels[new_level]
1737                   = retval[proc].first.labels[level];
1738             }
1739             new_level++;
1740         }
1741 
1742         __kmp_free(retval);
1743         retval = new_retval;
1744         depth = new_depth;
1745     }
1746 
1747     if (__kmp_affinity_gran_levels < 0) {
1748         //
1749         // Set the granularity level based on what levels are modeled
1750         // in the machine topology map.
1751         //
1752         __kmp_affinity_gran_levels = 0;
1753         if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1754             __kmp_affinity_gran_levels++;
1755         }
1756         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1757             __kmp_affinity_gran_levels++;
1758         }
1759         if (__kmp_affinity_gran > affinity_gran_package) {
1760             __kmp_affinity_gran_levels++;
1761         }
1762     }
1763 
1764     if (__kmp_affinity_verbose) {
1765         __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1766           coreLevel, threadLevel);
1767     }
1768 
1769     __kmp_free(last);
1770     __kmp_free(maxCt);
1771     __kmp_free(counts);
1772     __kmp_free(totals);
1773     KMP_CPU_FREE(oldMask);
1774     *address2os = retval;
1775     return depth;
1776 }
1777 
1778 
1779 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1780 
1781 
1782 #define osIdIndex       0
1783 #define threadIdIndex   1
1784 #define coreIdIndex     2
1785 #define pkgIdIndex      3
1786 #define nodeIdIndex     4
1787 
1788 typedef unsigned *ProcCpuInfo;
1789 static unsigned maxIndex = pkgIdIndex;
1790 
1791 
1792 static int
1793 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1794 {
1795     const unsigned *aa = (const unsigned *)a;
1796     const unsigned *bb = (const unsigned *)b;
1797     if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1798     if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1799     return 0;
1800 };
1801 
1802 
1803 static int
1804 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1805 {
1806     unsigned i;
1807     const unsigned *aa = *((const unsigned **)a);
1808     const unsigned *bb = *((const unsigned **)b);
1809     for (i = maxIndex; ; i--) {
1810         if (aa[i] < bb[i]) return -1;
1811         if (aa[i] > bb[i]) return 1;
1812         if (i == osIdIndex) break;
1813     }
1814     return 0;
1815 }
1816 
1817 
1818 //
1819 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1820 // affinity map.
1821 //
1822 static int
1823 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1824   kmp_i18n_id_t *const msg_id, FILE *f)
1825 {
1826     *address2os = NULL;
1827     *msg_id = kmp_i18n_null;
1828 
1829     //
1830     // Scan of the file, and count the number of "processor" (osId) fields,
1831     // and find the highest value of <n> for a node_<n> field.
1832     //
1833     char buf[256];
1834     unsigned num_records = 0;
1835     while (! feof(f)) {
        buf[sizeof(buf) - 1] = 1;  // sentinel: fgets overwrites it only if the line fills the buffer
1837         if (! fgets(buf, sizeof(buf), f)) {
1838             //
1839             // Read errors presumably because of EOF
1840             //
1841             break;
1842         }
1843 
1844         char s1[] = "processor";
1845         if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1846             num_records++;
1847             continue;
1848         }
1849 
1850         //
1851         // FIXME - this will match "node_<n> <garbage>"
1852         //
1853         unsigned level;
1854         if (sscanf(buf, "node_%d id", &level) == 1) {
1855             if (nodeIdIndex + level >= maxIndex) {
1856                 maxIndex = nodeIdIndex + level;
1857             }
1858             continue;
1859         }
1860     }
1861 
1862     //
1863     // Check for empty file / no valid processor records, or too many.
1864     // The number of records can't exceed the number of valid bits in the
1865     // affinity mask.
1866     //
1867     if (num_records == 0) {
1868         *line = 0;
1869         *msg_id = kmp_i18n_str_NoProcRecords;
1870         return -1;
1871     }
1872     if (num_records > (unsigned)__kmp_xproc) {
1873         *line = 0;
1874         *msg_id = kmp_i18n_str_TooManyProcRecords;
1875         return -1;
1876     }
1877 
1878     //
1879     // Set the file pointer back to the begginning, so that we can scan the
1880     // file again, this time performing a full parse of the data.
1881     // Allocate a vector of ProcCpuInfo object, where we will place the data.
1882     // Adding an extra element at the end allows us to remove a lot of extra
1883     // checks for termination conditions.
1884     //
1885     if (fseek(f, 0, SEEK_SET) != 0) {
1886         *line = 0;
1887         *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1888         return -1;
1889     }
1890 
1891     //
1892     // Allocate the array of records to store the proc info in.  The dummy
1893     // element at the end makes the logic in filling them out easier to code.
1894     //
1895     unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1896       * sizeof(unsigned *));
1897     unsigned i;
1898     for (i = 0; i <= num_records; i++) {
1899         threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1900           * sizeof(unsigned));
1901     }
1902 
1903 #define CLEANUP_THREAD_INFO \
1904     for (i = 0; i <= num_records; i++) {                                \
1905         __kmp_free(threadInfo[i]);                                      \
1906     }                                                                   \
1907     __kmp_free(threadInfo);
1908 
1909     //
1910     // A value of UINT_MAX means that we didn't find the field
1911     //
1912     unsigned __index;
1913 
1914 #define INIT_PROC_INFO(p) \
1915     for (__index = 0; __index <= maxIndex; __index++) {                 \
1916         (p)[__index] = UINT_MAX;                                        \
1917     }
1918 
1919     for (i = 0; i <= num_records; i++) {
1920         INIT_PROC_INFO(threadInfo[i]);
1921     }
1922 
1923     unsigned num_avail = 0;
1924     *line = 0;
1925     while (! feof(f)) {
1926         //
1927         // Create an inner scoping level, so that all the goto targets at the
1928         // end of the loop appear in an outer scoping level.  This avoids
1929         // warnings about jumping past an initialization to a target in the
1930         // same block.
1931         //
1932         {
            buf[sizeof(buf) - 1] = 1;  // sentinel: fgets overwrites it only if the line fills the buffer
1934             bool long_line = false;
1935             if (! fgets(buf, sizeof(buf), f)) {
1936                 //
1937                 // Read errors presumably because of EOF
1938                 //
1939                 // If there is valid data in threadInfo[num_avail], then fake
1940                 // a blank line in ensure that the last address gets parsed.
1941                 //
1942                 bool valid = false;
1943                 for (i = 0; i <= maxIndex; i++) {
1944                     if (threadInfo[num_avail][i] != UINT_MAX) {
1945                         valid = true;
1946                     }
1947                 }
1948                 if (! valid) {
1949                     break;
1950                 }
1951                 buf[0] = 0;
1952             } else if (!buf[sizeof(buf) - 1]) {
1953                 //
1954                 // The line is longer than the buffer.  Set a flag and don't
1955                 // emit an error if we were going to ignore the line, anyway.
1956                 //
1957                 long_line = true;
1958 
1959 #define CHECK_LINE \
1960     if (long_line) {                                                    \
1961         CLEANUP_THREAD_INFO;                                            \
1962         *msg_id = kmp_i18n_str_LongLineCpuinfo;                         \
1963         return -1;                                                      \
1964     }
1965             }
1966             (*line)++;
1967 
1968             char s1[] = "processor";
1969             if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1970                 CHECK_LINE;
1971                 char *p = strchr(buf + sizeof(s1) - 1, ':');
1972                 unsigned val;
1973                 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1974                 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
1975                 threadInfo[num_avail][osIdIndex] = val;
1976 #if KMP_OS_LINUX && USE_SYSFS_INFO
1977                 char path[256];
1978                 snprintf(path, sizeof(path),
1979                     "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
1980                     threadInfo[num_avail][osIdIndex]);
1981                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
1982 
1983                 snprintf(path, sizeof(path),
1984                     "/sys/devices/system/cpu/cpu%u/topology/core_id",
1985                     threadInfo[num_avail][osIdIndex]);
1986                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
1987                 continue;
1988 #else
1989             }
1990             char s2[] = "physical id";
1991             if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
1992                 CHECK_LINE;
1993                 char *p = strchr(buf + sizeof(s2) - 1, ':');
1994                 unsigned val;
1995                 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1996                 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
1997                 threadInfo[num_avail][pkgIdIndex] = val;
1998                 continue;
1999             }
2000             char s3[] = "core id";
2001             if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2002                 CHECK_LINE;
2003                 char *p = strchr(buf + sizeof(s3) - 1, ':');
2004                 unsigned val;
2005                 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2006                 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
2007                 threadInfo[num_avail][coreIdIndex] = val;
2008                 continue;
2009 #endif // KMP_OS_LINUX && USE_SYSFS_INFO
2010             }
2011             char s4[] = "thread id";
2012             if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2013                 CHECK_LINE;
2014                 char *p = strchr(buf + sizeof(s4) - 1, ':');
2015                 unsigned val;
2016                 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2017                 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
2018                 threadInfo[num_avail][threadIdIndex] = val;
2019                 continue;
2020             }
2021             unsigned level;
2022             if (sscanf(buf, "node_%d id", &level) == 1) {
2023                 CHECK_LINE;
                char *p = strchr(buf, ':');
2025                 unsigned val;
2026                 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2027                 KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2028                 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
2029                 threadInfo[num_avail][nodeIdIndex + level] = val;
2030                 continue;
2031             }
2032 
2033             //
2034             // We didn't recognize the leading token on the line.
2035             // There are lots of leading tokens that we don't recognize -
2036             // if the line isn't empty, go on to the next line.
2037             //
2038             if ((*buf != 0) && (*buf != '\n')) {
2039                 //
2040                 // If the line is longer than the buffer, read characters
2041                 // until we find a newline.
2042                 //
2043                 if (long_line) {
2044                     int ch;
2045                     while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
2046                 }
2047                 continue;
2048             }
2049 
2050             //
2051             // A newline has signalled the end of the processor record.
2052             // Check that there aren't too many procs specified.
2053             //
2054             if ((int)num_avail == __kmp_xproc) {
2055                 CLEANUP_THREAD_INFO;
2056                 *msg_id = kmp_i18n_str_TooManyEntries;
2057                 return -1;
2058             }
2059 
2060             //
2061             // Check for missing fields.  The osId field must be there, and we
2062             // currently require that the physical id field is specified, also.
2063             //
2064             if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2065                 CLEANUP_THREAD_INFO;
2066                 *msg_id = kmp_i18n_str_MissingProcField;
2067                 return -1;
2068             }
            if (threadInfo[num_avail][pkgIdIndex] == UINT_MAX) {
2070                 CLEANUP_THREAD_INFO;
2071                 *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2072                 return -1;
2073             }
2074 
2075             //
2076             // Skip this proc if it is not included in the machine model.
2077             //
2078             if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
2079                 INIT_PROC_INFO(threadInfo[num_avail]);
2080                 continue;
2081             }
2082 
2083             //
2084             // We have a successful parse of this proc's info.
2085             // Increment the counter, and prepare for the next proc.
2086             //
2087             num_avail++;
2088             KMP_ASSERT(num_avail <= num_records);
2089             INIT_PROC_INFO(threadInfo[num_avail]);
2090         }
2091         continue;
2092 
2093         no_val:
2094         CLEANUP_THREAD_INFO;
2095         *msg_id = kmp_i18n_str_MissingValCpuinfo;
2096         return -1;
2097 
2098         dup_field:
2099         CLEANUP_THREAD_INFO;
2100         *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2101         return -1;
2102     }
2103     *line = 0;
2104 
2105 # if KMP_MIC && REDUCE_TEAM_SIZE
2106     unsigned teamSize = 0;
2107 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2108 
    // TODO: should we check for num_records == __kmp_xproc here?
2110 
2111     //
2112     // If there's only one thread context to bind to, form an Address object
2113     // with depth 1 and return immediately (or, if affinity is off, set
2114     // address2os to NULL and return).
2115     //
2116     // If it is configured to omit the package level when there is only a
2117     // single package, the logic at the end of this routine won't work if
2118     // there is only a single thread - it would try to form an Address
2119     // object with depth 0.
2120     //
2121     KMP_ASSERT(num_avail > 0);
2122     KMP_ASSERT(num_avail <= num_records);
2123     if (num_avail == 1) {
2124         __kmp_ncores = 1;
2125         __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2126         if (__kmp_affinity_verbose) {
2127             if (! KMP_AFFINITY_CAPABLE()) {
2128                 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2129                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2130                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2131             }
2132             else {
2133                 char buf[KMP_AFFIN_MASK_PRINT_LEN];
2134                 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2135                   fullMask);
2136                 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2137                 if (__kmp_affinity_respect_mask) {
2138                     KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2139                 } else {
2140                     KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2141                 }
2142                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2143                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2144             }
2145             int index;
2146             kmp_str_buf_t buf;
2147             __kmp_str_buf_init(&buf);
2148             __kmp_str_buf_print(&buf, "1");
2149             for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2150                 __kmp_str_buf_print(&buf, " x 1");
2151             }
2152             KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2153             __kmp_str_buf_free(&buf);
2154         }
2155 
2156         if (__kmp_affinity_type == affinity_none) {
2157             CLEANUP_THREAD_INFO;
2158             return 0;
2159         }
2160 
2161         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2162         Address addr(1);
2163         addr.labels[0] = threadInfo[0][pkgIdIndex];
2164         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2165 
2166         if (__kmp_affinity_gran_levels < 0) {
2167             __kmp_affinity_gran_levels = 0;
2168         }
2169 
2170         if (__kmp_affinity_verbose) {
2171             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2172         }
2173 
2174         CLEANUP_THREAD_INFO;
2175         return 1;
2176     }
2177 
2178     //
2179     // Sort the threadInfo table by physical Id.
2180     //
2181     qsort(threadInfo, num_avail, sizeof(*threadInfo),
2182       __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2183 
2184     //
2185     // The table is now sorted by pkgId / coreId / threadId, but we really
2186     // don't know the radix of any of the fields.  pkgId's may be sparsely
2187     // assigned among the chips on a system.  Although coreId's are usually
2188     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2189     // [0..threadsPerCore-1], we don't want to make any such assumptions.
2190     //
2191     // For that matter, we don't know what coresPerPkg and threadsPerCore
2192     // (or the total # packages) are at this point - we want to determine
2193     // that now.  We only have an upper bound on the first two figures.
2194     //
2195     unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2196       * sizeof(unsigned));
2197     unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2198       * sizeof(unsigned));
2199     unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2200       * sizeof(unsigned));
2201     unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2202       * sizeof(unsigned));
2203 
2204     bool assign_thread_ids = false;
2205     unsigned threadIdCt;
2206     unsigned index;
2207 
2208     restart_radix_check:
2209     threadIdCt = 0;
2210 
2211     //
2212     // Initialize the counter arrays with data from threadInfo[0].
2213     //
2214     if (assign_thread_ids) {
2215         if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2216             threadInfo[0][threadIdIndex] = threadIdCt++;
2217         }
2218         else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2219             threadIdCt = threadInfo[0][threadIdIndex] + 1;
2220         }
2221     }
2222     for (index = 0; index <= maxIndex; index++) {
2223         counts[index] = 1;
2224         maxCt[index] = 1;
2225         totals[index] = 1;
        lastId[index] = threadInfo[0][index];
2227     }
2228 
2229     //
2230     // Run through the rest of the OS procs.
2231     //
2232     for (i = 1; i < num_avail; i++) {
2233         //
2234         // Find the most significant index whose id differs
2235         // from the id for the previous OS proc.
2236         //
2237         for (index = maxIndex; index >= threadIdIndex; index--) {
2238             if (assign_thread_ids && (index == threadIdIndex)) {
2239                 //
2240                 // Auto-assign the thread id field if it wasn't specified.
2241                 //
2242                 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2243                     threadInfo[i][threadIdIndex] = threadIdCt++;
2244                 }
2245 
2246                 //
2247                 // Aparrently the thread id field was specified for some
2248                 // entries and not others.  Start the thread id counter
2249                 // off at the next higher thread id.
2250                 //
2251                 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2252                     threadIdCt = threadInfo[i][threadIdIndex] + 1;
2253                 }
2254             }
2255             if (threadInfo[i][index] != lastId[index]) {
2256                 //
2257                 // Run through all indices which are less significant,
2258                 // and reset the counts to 1.
2259                 //
2260                 // At all levels up to and including index, we need to
2261                 // increment the totals and record the last id.
2262                 //
2263                 unsigned index2;
2264                 for (index2 = threadIdIndex; index2 < index; index2++) {
2265                     totals[index2]++;
2266                     if (counts[index2] > maxCt[index2]) {
2267                         maxCt[index2] = counts[index2];
2268                     }
2269                     counts[index2] = 1;
2270                     lastId[index2] = threadInfo[i][index2];
2271                 }
2272                 counts[index]++;
2273                 totals[index]++;
2274                 lastId[index] = threadInfo[i][index];
2275 
2276                 if (assign_thread_ids && (index > threadIdIndex)) {
2277 
2278 # if KMP_MIC && REDUCE_TEAM_SIZE
2279                     //
2280                     // The default team size is the total #threads in the machine
2281                     // minus 1 thread for every core that has 3 or more threads.
2282                     //
2283                     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2284 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2285 
2286                     //
2287                     // Restart the thread counter, as we are on a new core.
2288                     //
2289                     threadIdCt = 0;
2290 
2291                     //
2292                     // Auto-assign the thread id field if it wasn't specified.
2293                     //
2294                     if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2295                         threadInfo[i][threadIdIndex] = threadIdCt++;
2296                     }
2297 
2298                     //
2299                     // Aparrently the thread id field was specified for some
2300                     // entries and not others.  Start the thread id counter
2301                     // off at the next higher thread id.
2302                     //
2303                     else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2304                         threadIdCt = threadInfo[i][threadIdIndex] + 1;
2305                     }
2306                 }
2307                 break;
2308             }
2309         }
2310         if (index < threadIdIndex) {
2311             //
2312             // If thread ids were specified, it is an error if they are not
2313             // unique.  Also, check that we waven't already restarted the
2314             // loop (to be safe - shouldn't need to).
2315             //
2316             if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2317               || assign_thread_ids) {
2318                 __kmp_free(lastId);
2319                 __kmp_free(totals);
2320                 __kmp_free(maxCt);
2321                 __kmp_free(counts);
2322                 CLEANUP_THREAD_INFO;
2323                 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2324                 return -1;
2325             }
2326 
2327             //
2328             // If the thread ids were not specified and we see entries
2329             // entries that are duplicates, start the loop over and
2330             // assign the thread ids manually.
2331             //
2332             assign_thread_ids = true;
2333             goto restart_radix_check;
2334         }
2335     }
2336 
2337 # if KMP_MIC && REDUCE_TEAM_SIZE
2338     //
2339     // The default team size is the total #threads in the machine
2340     // minus 1 thread for every core that has 3 or more threads.
2341     //
2342     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2343 # endif // KMP_MIC && REDUCE_TEAM_SIZE
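    //
    // For example, on a hypothetical 60-core part with 4 thread contexts
    // per core, each core contributes threadIdCt - 1 == 3, so the
    // accumulated teamSize is 60 * 3 == 180.
    //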
2344 
2345     for (index = threadIdIndex; index <= maxIndex; index++) {
2346         if (counts[index] > maxCt[index]) {
2347             maxCt[index] = counts[index];
2348         }
2349     }
2350 
2351     __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2352     nCoresPerPkg = maxCt[coreIdIndex];
2353     nPackages = totals[pkgIdIndex];
2354 
2355     //
2356     // Check to see if the machine topology is uniform
2357     //
2358     unsigned prod = totals[maxIndex];
2359     for (index = threadIdIndex; index < maxIndex; index++) {
2360        prod *= maxCt[index];
2361     }
2362     bool uniform = (prod == totals[threadIdIndex]);
2363 
2364     //
2365     // When affinity is off, this routine will still be called to set
2366     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
2367     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
2368     // correctly, and return now if affinity is not enabled.
2369     //
2370     __kmp_ncores = totals[coreIdIndex];
2371 
    if (__kmp_affinity_verbose) {
        if (! KMP_AFFINITY_CAPABLE()) {
            KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (uniform) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
        }
        else {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
            KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (uniform) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
        }
2398         kmp_str_buf_t buf;
2399         __kmp_str_buf_init(&buf);
2400 
2401         __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2402         for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2403             __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2404         }
2405         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str,  maxCt[coreIdIndex],
2406           maxCt[threadIdIndex], __kmp_ncores);
2407 
2408         __kmp_str_buf_free(&buf);
2409     }
2410 
2411 # if KMP_MIC && REDUCE_TEAM_SIZE
2412     //
2413     // Set the default team size.
2414     //
2415     if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2416         __kmp_dflt_team_nth = teamSize;
2417         KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2418           __kmp_dflt_team_nth));
2419     }
2420 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2421 
2422     if (__kmp_affinity_type == affinity_none) {
2423         __kmp_free(lastId);
2424         __kmp_free(totals);
2425         __kmp_free(maxCt);
2426         __kmp_free(counts);
2427         CLEANUP_THREAD_INFO;
2428         return 0;
2429     }
2430 
2431     //
2432     // Count the number of levels which have more nodes at that level than
2433     // at the parent's level (with there being an implicit root node of
2434     // the top level).  This is equivalent to saying that there is at least
2435     // one node at this level which has a sibling.  These levels are in the
2436     // map, and the package level is always in the map.
2437     //
2438     bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2440     for (index = threadIdIndex; index < maxIndex; index++) {
2441         KMP_ASSERT(totals[index] >= totals[index + 1]);
2442         inMap[index] = (totals[index] > totals[index + 1]);
2443     }
2444     inMap[maxIndex] = (totals[maxIndex] > 1);
2445     inMap[pkgIdIndex] = true;
2446 
2447     int depth = 0;
2448     for (index = threadIdIndex; index <= maxIndex; index++) {
2449         if (inMap[index]) {
2450             depth++;
2451         }
2452     }
2453     KMP_ASSERT(depth > 0);
2454 
2455     //
2456     // Construct the data structure that is to be returned.
2457     //
2458     *address2os = (AddrUnsPair*)
2459       __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2460     int pkgLevel = -1;
2461     int coreLevel = -1;
2462     int threadLevel = -1;
2463 
2464     for (i = 0; i < num_avail; ++i) {
2465         Address addr(depth);
2466         unsigned os = threadInfo[i][osIdIndex];
2467         int src_index;
2468         int dst_index = 0;
2469 
2470         for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2471             if (! inMap[src_index]) {
2472                 continue;
2473             }
2474             addr.labels[dst_index] = threadInfo[i][src_index];
2475             if (src_index == pkgIdIndex) {
2476                 pkgLevel = dst_index;
2477             }
2478             else if (src_index == coreIdIndex) {
2479                 coreLevel = dst_index;
2480             }
2481             else if (src_index == threadIdIndex) {
2482                 threadLevel = dst_index;
2483             }
2484             dst_index++;
2485         }
2486         (*address2os)[i] = AddrUnsPair(addr, os);
2487     }
2488 
2489     if (__kmp_affinity_gran_levels < 0) {
2490         //
2491         // Set the granularity level based on what levels are modeled
2492         // in the machine topology map.
2493         //
2494         unsigned src_index;
2495         __kmp_affinity_gran_levels = 0;
2496         for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2497             if (! inMap[src_index]) {
2498                 continue;
2499             }
2500             switch (src_index) {
2501                 case threadIdIndex:
2502                 if (__kmp_affinity_gran > affinity_gran_thread) {
2503                     __kmp_affinity_gran_levels++;
2504                 }
2505 
2506                 break;
2507                 case coreIdIndex:
2508                 if (__kmp_affinity_gran > affinity_gran_core) {
2509                     __kmp_affinity_gran_levels++;
2510                 }
2511                 break;
2512 
2513                 case pkgIdIndex:
2514                 if (__kmp_affinity_gran > affinity_gran_package) {
2515                     __kmp_affinity_gran_levels++;
2516                 }
2517                 break;
2518             }
2519         }
2520     }
2521 
2522     if (__kmp_affinity_verbose) {
2523         __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2524           coreLevel, threadLevel);
2525     }
2526 
2527     __kmp_free(inMap);
2528     __kmp_free(lastId);
2529     __kmp_free(totals);
2530     __kmp_free(maxCt);
2531     __kmp_free(counts);
2532     CLEANUP_THREAD_INFO;
2533     return depth;
2534 }
2535 
2536 
2537 //
2538 // Create and return a table of affinity masks, indexed by OS thread ID.
2539 // This routine handles OR'ing together all the affinity masks of threads
2540 // that are sufficiently close, if granularity > fine.
2541 //
2542 static kmp_affin_mask_t *
2543 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2544   AddrUnsPair *address2os, unsigned numAddrs)
2545 {
2546     //
2547     // First form a table of affinity masks in order of OS thread id.
2548     //
2549     unsigned depth;
2550     unsigned maxOsId;
2551     unsigned i;
2552 
2553     KMP_ASSERT(numAddrs > 0);
2554     depth = address2os[0].first.depth;
2555 
2556     maxOsId = 0;
2557     for (i = 0; i < numAddrs; i++) {
2558         unsigned osId = address2os[i].second;
2559         if (osId > maxOsId) {
2560             maxOsId = osId;
2561         }
2562     }
2563     kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
2564       (maxOsId + 1) * __kmp_affin_mask_size);
2565 
2566     //
2567     // Sort the address2os table according to physical order.  Doing so
2568     // will put all threads on the same core/package/node in consecutive
2569     // locations.
2570     //
2571     qsort(address2os, numAddrs, sizeof(*address2os),
2572       __kmp_affinity_cmp_Address_labels);
2573 
2574     KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2575     if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2576         KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY",  __kmp_affinity_gran_levels);
2577     }
2578     if (__kmp_affinity_gran_levels >= (int)depth) {
2579         if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2580           && (__kmp_affinity_type != affinity_none))) {
2581             KMP_WARNING(AffThreadsMayMigrate);
2582         }
2583     }
2584 
2585     //
2586     // Run through the table, forming the masks for all threads on each
2587     // core.  Threads on the same core will have identical "Address"
2588     // objects, not considering the last level, which must be the thread
2589     // id.  All threads on a core will appear consecutively.
2590     //
2591     unsigned unique = 0;
2592     unsigned j = 0;                             // index of 1st thread on core
2593     unsigned leader = 0;
2594     Address *leaderAddr = &(address2os[0].first);
2595     kmp_affin_mask_t *sum
2596       = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
2597     KMP_CPU_ZERO(sum);
2598     KMP_CPU_SET(address2os[0].second, sum);
2599     for (i = 1; i < numAddrs; i++) {
2600         //
2601         // If this thread is sufficiently close to the leader (within the
2602         // granularity setting), then set the bit for this os thread in the
2603         // affinity mask for this group, and go on to the next thread.
2604         //
2605         if (leaderAddr->isClose(address2os[i].first,
2606           __kmp_affinity_gran_levels)) {
2607             KMP_CPU_SET(address2os[i].second, sum);
2608             continue;
2609         }
2610 
2611         //
2612         // For every thread in this group, copy the mask to the thread's
2613         // entry in the osId2Mask table.  Mark the first address as a
2614         // leader.
2615         //
2616         for (; j < i; j++) {
2617             unsigned osId = address2os[j].second;
2618             KMP_DEBUG_ASSERT(osId <= maxOsId);
2619             kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2620             KMP_CPU_COPY(mask, sum);
2621             address2os[j].first.leader = (j == leader);
2622         }
2623         unique++;
2624 
2625         //
2626         // Start a new mask.
2627         //
2628         leader = i;
2629         leaderAddr = &(address2os[i].first);
2630         KMP_CPU_ZERO(sum);
2631         KMP_CPU_SET(address2os[i].second, sum);
2632     }
2633 
2634     //
2635     // For every thread in last group, copy the mask to the thread's
2636     // entry in the osId2Mask table.
2637     //
2638     for (; j < i; j++) {
2639         unsigned osId = address2os[j].second;
2640         KMP_DEBUG_ASSERT(osId <= maxOsId);
2641         kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2642         KMP_CPU_COPY(mask, sum);
2643         address2os[j].first.leader = (j == leader);
2644     }
2645     unique++;
2646 
2647     *maxIndex = maxOsId;
2648     *numUnique = unique;
2649     return osId2Mask;
2650 }
2651 
2652 
2653 //
2654 // Stuff for the affinity proclist parsers.  It's easier to declare these vars
2655 // as file-static than to try and pass them through the calling sequence of
2656 // the recursive-descent OMP_PLACES parser.
2657 //
2658 static kmp_affin_mask_t *newMasks;
2659 static int numNewMasks;
2660 static int nextNewMask;
2661 
2662 #define ADD_MASK(_mask) \
2663     {                                                                   \
2664         if (nextNewMask >= numNewMasks) {                               \
2665             numNewMasks *= 2;                                           \
2666             newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
2667               numNewMasks * __kmp_affin_mask_size);                     \
2668         }                                                               \
2669         KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));    \
2670         nextNewMask++;                                                  \
2671     }
2672 
2673 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2674     {                                                                   \
2675         if (((_osId) > _maxOsId) ||                                     \
2676           (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
2677             if (__kmp_affinity_verbose || (__kmp_affinity_warnings      \
2678               && (__kmp_affinity_type != affinity_none))) {             \
2679                 KMP_WARNING(AffIgnoreInvalidProcID, _osId);             \
2680             }                                                           \
2681         }                                                               \
2682         else {                                                          \
2683             ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));               \
2684         }                                                               \
2685     }
2686 
2687 
2688 //
2689 // Re-parse the proclist (for the explicit affinity type), and form the list
2690 // of affinity newMasks indexed by gtid.
2691 //
2692 static void
2693 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2694   unsigned int *out_numMasks, const char *proclist,
2695   kmp_affin_mask_t *osId2Mask, int maxOsId)
2696 {
2697     const char *scan = proclist;
2698     const char *next = proclist;
2699 
2700     //
2701     // We use malloc() for the temporary mask vector,
2702     // so that we can use realloc() to extend it.
2703     //
2704     numNewMasks = 2;
2705     newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2706       * __kmp_affin_mask_size);
2707     nextNewMask = 0;
2708     kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
2709       __kmp_affin_mask_size);
2710     int setSize = 0;
2711 
2712     for (;;) {
2713         int start, end, stride;
2714 
2715         SKIP_WS(scan);
2716         next = scan;
2717         if (*next == '\0') {
2718             break;
2719         }
2720 
2721         if (*next == '{') {
2722             int num;
2723             setSize = 0;
2724             next++;     // skip '{'
2725             SKIP_WS(next);
2726             scan = next;
2727 
2728             //
2729             // Read the first integer in the set.
2730             //
2731             KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2732               "bad proclist");
2733             SKIP_DIGITS(next);
2734             num = __kmp_str_to_int(scan, *next);
2735             KMP_ASSERT2(num >= 0, "bad explicit proc list");
2736 
2737             //
2738             // Copy the mask for that osId to the sum (union) mask.
2739             //
2740             if ((num > maxOsId) ||
2741               (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2742                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2743                   && (__kmp_affinity_type != affinity_none))) {
2744                     KMP_WARNING(AffIgnoreInvalidProcID, num);
2745                 }
2746                 KMP_CPU_ZERO(sumMask);
2747             }
2748             else {
2749                 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2750                 setSize = 1;
2751             }
2752 
2753             for (;;) {
2754                 //
2755                 // Check for end of set.
2756                 //
2757                 SKIP_WS(next);
2758                 if (*next == '}') {
2759                     next++;     // skip '}'
2760                     break;
2761                 }
2762 
2763                 //
2764                 // Skip optional comma.
2765                 //
2766                 if (*next == ',') {
2767                     next++;
2768                 }
2769                 SKIP_WS(next);
2770 
2771                 //
2772                 // Read the next integer in the set.
2773                 //
2774                 scan = next;
2775                 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2776                   "bad explicit proc list");
2777 
2778                 SKIP_DIGITS(next);
2779                 num = __kmp_str_to_int(scan, *next);
2780                 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2781 
2782                 //
2783                 // Add the mask for that osId to the sum mask.
2784                 //
2785                 if ((num > maxOsId) ||
2786                   (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2787                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2788                       && (__kmp_affinity_type != affinity_none))) {
2789                         KMP_WARNING(AffIgnoreInvalidProcID, num);
2790                     }
2791                 }
2792                 else {
2793                     KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2794                     setSize++;
2795                 }
2796             }
2797             if (setSize > 0) {
2798                 ADD_MASK(sumMask);
2799             }
2800 
2801             SKIP_WS(next);
2802             if (*next == ',') {
2803                 next++;
2804             }
2805             scan = next;
2806             continue;
2807         }
2808 
2809         //
2810         // Read the first integer.
2811         //
2812         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2813         SKIP_DIGITS(next);
2814         start = __kmp_str_to_int(scan, *next);
2815         KMP_ASSERT2(start >= 0, "bad explicit proc list");
2816         SKIP_WS(next);
2817 
2818         //
2819         // If this isn't a range, then add a mask to the list and go on.
2820         //
2821         if (*next != '-') {
2822             ADD_MASK_OSID(start, osId2Mask, maxOsId);
2823 
2824             //
2825             // Skip optional comma.
2826             //
2827             if (*next == ',') {
2828                 next++;
2829             }
2830             scan = next;
2831             continue;
2832         }
2833 
2834         //
2835         // This is a range.  Skip over the '-' and read in the 2nd int.
2836         //
2837         next++;         // skip '-'
2838         SKIP_WS(next);
2839         scan = next;
2840         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2841         SKIP_DIGITS(next);
2842         end = __kmp_str_to_int(scan, *next);
2843         KMP_ASSERT2(end >= 0, "bad explicit proc list");
2844 
2845         //
2846         // Check for a stride parameter
2847         //
2848         stride = 1;
2849         SKIP_WS(next);
2850         if (*next == ':') {
2851             //
2852             // A stride is specified.  Skip over the ':" and read the 3rd int.
2853             //
2854             int sign = +1;
2855             next++;         // skip ':'
2856             SKIP_WS(next);
2857             scan = next;
2858             if (*next == '-') {
2859                 sign = -1;
2860                 next++;
2861                 SKIP_WS(next);
2862                 scan = next;
2863             }
2864             KMP_ASSERT2((*next >=  '0') && (*next <= '9'),
2865               "bad explicit proc list");
2866             SKIP_DIGITS(next);
2867             stride = __kmp_str_to_int(scan, *next);
2868             KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2869             stride *= sign;
2870         }
2871 
2872         //
2873         // Do some range checks.
2874         //
2875         KMP_ASSERT2(stride != 0, "bad explicit proc list");
2876         if (stride > 0) {
2877             KMP_ASSERT2(start <= end, "bad explicit proc list");
2878         }
2879         else {
2880             KMP_ASSERT2(start >= end, "bad explicit proc list");
2881         }
2882         KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2883 
2884         //
2885         // Add the mask for each OS proc # to the list.
2886         //
2887         if (stride > 0) {
2888             do {
2889                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2890                 start += stride;
2891             } while (start <= end);
2892         }
2893         else {
2894             do {
2895                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2896                 start += stride;
2897             } while (start >= end);
2898         }
2899 
2900         //
2901         // Skip optional comma.
2902         //
2903         SKIP_WS(next);
2904         if (*next == ',') {
2905             next++;
2906         }
2907         scan = next;
2908     }
2909 
2910     *out_numMasks = nextNewMask;
2911     if (nextNewMask == 0) {
2912         *out_masks = NULL;
2913         KMP_INTERNAL_FREE(newMasks);
2914         return;
2915     }
2916     *out_masks
2917       = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
2918     memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
2919     __kmp_free(sumMask);
2920     KMP_INTERNAL_FREE(newMasks);
2921 }
2922 
2923 
2924 # if OMP_40_ENABLED
2925 
2926 /*-----------------------------------------------------------------------------
2927 
Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
places.  Again, here is the grammar:
2930 
2931 place_list := place
2932 place_list := place , place_list
2933 place := num
2934 place := place : num
2935 place := place : num : signed
2936 place := { subplacelist }
2937 place := ! place                  // (lowest priority)
2938 subplace_list := subplace
2939 subplace_list := subplace , subplace_list
2940 subplace := num
2941 subplace := num : num
2942 subplace := num : num : signed
2943 signed := num
2944 signed := + signed
2945 signed := - signed
2946 
2947 -----------------------------------------------------------------------------*/

static void
__kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
{
    const char *next;

    for (;;) {
        int start, count, stride, i;

        //
        // Read in the starting proc id
        //
        SKIP_WS(*scan);
        KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
          "bad explicit places list");
        next = *scan;
        SKIP_DIGITS(next);
        start = __kmp_str_to_int(*scan, *next);
        KMP_ASSERT(start >= 0);
        *scan = next;

        //
        // valid follow sets are ',' ':' and '}'
        //
        SKIP_WS(*scan);
        if (**scan == '}' || **scan == ',') {
            if ((start > maxOsId) ||
              (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
                if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                  && (__kmp_affinity_type != affinity_none))) {
                    KMP_WARNING(AffIgnoreInvalidProcID, start);
                }
            }
            else {
                KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
                (*setSize)++;
            }
            if (**scan == '}') {
                break;
            }
            (*scan)++;  // skip ','
            continue;
        }
        KMP_ASSERT2(**scan == ':', "bad explicit places list");
        (*scan)++;      // skip ':'

        //
        // Read count parameter
        //
        SKIP_WS(*scan);
        KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
          "bad explicit places list");
        next = *scan;
        SKIP_DIGITS(next);
        count = __kmp_str_to_int(*scan, *next);
        KMP_ASSERT(count >= 0);
        *scan = next;

        //
        // valid follow sets are ',' ':' and '}'
        //
        SKIP_WS(*scan);
        if (**scan == '}' || **scan == ',') {
            for (i = 0; i < count; i++) {
                if ((start > maxOsId) ||
                  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
                    if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                      && (__kmp_affinity_type != affinity_none))) {
                        KMP_WARNING(AffIgnoreInvalidProcID, start);
                    }
                    break;  // don't proliferate warnings for large count
                }
                else {
                    KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
                    start++;
                    (*setSize)++;
                }
            }
            if (**scan == '}') {
                break;
            }
            (*scan)++;  // skip ','
            continue;
        }
        KMP_ASSERT2(**scan == ':', "bad explicit places list");
        (*scan)++;      // skip ':'

        //
        // Read stride parameter
        //
        int sign = +1;
        for (;;) {
            SKIP_WS(*scan);
            if (**scan == '+') {
                (*scan)++; // skip '+'
                continue;
            }
            if (**scan == '-') {
                sign *= -1;
                (*scan)++; // skip '-'
                continue;
            }
            break;
        }
        SKIP_WS(*scan);
        KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
          "bad explicit places list");
        next = *scan;
        SKIP_DIGITS(next);
        stride = __kmp_str_to_int(*scan, *next);
        KMP_ASSERT(stride >= 0);
        *scan = next;
        stride *= sign;

        //
        // valid follow sets are ',' and '}'
        //
        SKIP_WS(*scan);
        if (**scan == '}' || **scan == ',') {
            for (i = 0; i < count; i++) {
                if ((start > maxOsId) ||
                  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
                    if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                      && (__kmp_affinity_type != affinity_none))) {
                        KMP_WARNING(AffIgnoreInvalidProcID, start);
                    }
                    break;  // don't proliferate warnings for large count
                }
                else {
                    KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
                    start += stride;
                    (*setSize)++;
                }
            }
            if (**scan == '}') {
                break;
            }
            (*scan)++;  // skip ','
            continue;
        }

        KMP_ASSERT2(0, "bad explicit places list");
    }
}
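
//
// Worked example (a sketch): for the subplace "1:3:2" the code above
// parses start == 1, count == 3, stride == 2, and unions the masks for
// OS procs 1, 3 and 5 into tempMask, bumping *setSize once per valid
// proc id.
//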


static void
__kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
{
    const char *next;

    //
    // valid follow sets are '{' '!' and num
    //
    SKIP_WS(*scan);
    if (**scan == '{') {
        (*scan)++;      // skip '{'
        __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask,
          setSize);
        KMP_ASSERT2(**scan == '}', "bad explicit places list");
        (*scan)++;      // skip '}'
    }
    else if (**scan == '!') {
        (*scan)++;      // skip '!' before recursing, or we would recurse forever
        __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
        KMP_CPU_COMPLEMENT(tempMask);
    }
    else if ((**scan >= '0') && (**scan <= '9')) {
        next = *scan;
        SKIP_DIGITS(next);
        int num = __kmp_str_to_int(*scan, *next);
        KMP_ASSERT(num >= 0);
        if ((num > maxOsId) ||
          (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
            if (__kmp_affinity_verbose || (__kmp_affinity_warnings
              && (__kmp_affinity_type != affinity_none))) {
                KMP_WARNING(AffIgnoreInvalidProcID, num);
            }
        }
        else {
            KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
            (*setSize)++;
        }
        *scan = next;  // skip num
    }
    else {
        KMP_ASSERT2(0, "bad explicit places list");
    }
}
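
//
// Illustrative note: a place such as "!{0,1}" reaches the '!' branch
// above -- the subplace {0,1} is first accumulated into tempMask, then
// KMP_CPU_COMPLEMENT flips every bit, selecting all OS procs except
// 0 and 1.
//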


void
__kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
  unsigned int *out_numMasks, const char *placelist,
  kmp_affin_mask_t *osId2Mask, int maxOsId)
{
    const char *scan = placelist;
    const char *next = placelist;

    numNewMasks = 2;
    newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
      * __kmp_affin_mask_size);
    nextNewMask = 0;

    kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
      __kmp_affin_mask_size);
    KMP_CPU_ZERO(tempMask);
    int setSize = 0;

    for (;;) {
        __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);

        //
        // valid follow sets are ',' ':' and EOL
        //
        SKIP_WS(scan);
        if (*scan == '\0' || *scan == ',') {
            if (setSize > 0) {
                ADD_MASK(tempMask);
            }
            KMP_CPU_ZERO(tempMask);
            setSize = 0;
            if (*scan == '\0') {
                break;
            }
            scan++;     // skip ','
            continue;
        }

        KMP_ASSERT2(*scan == ':', "bad explicit places list");
        scan++;         // skip ':'

        //
        // Read count parameter
        //
        SKIP_WS(scan);
        KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
          "bad explicit places list");
        next = scan;
        SKIP_DIGITS(next);
        int count = __kmp_str_to_int(scan, *next);
        KMP_ASSERT(count >= 0);
        scan = next;

        //
        // valid follow sets are ',' ':' and EOL
        //
        SKIP_WS(scan);
        int stride;
        if (*scan == '\0' || *scan == ',') {
            stride = +1;
        }
        else {
            KMP_ASSERT2(*scan == ':', "bad explicit places list");
            scan++;         // skip ':'

            //
            // Read stride parameter
            //
            int sign = +1;
            for (;;) {
                SKIP_WS(scan);
                if (*scan == '+') {
                    scan++; // skip '+'
                    continue;
                }
                if (*scan == '-') {
                    sign *= -1;
                    scan++; // skip '-'
                    continue;
                }
                break;
            }
            SKIP_WS(scan);
            KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
              "bad explicit places list");
            next = scan;
            SKIP_DIGITS(next);
            stride = __kmp_str_to_int(scan, *next);
            KMP_DEBUG_ASSERT(stride >= 0);
            scan = next;
            stride *= sign;
        }

        if (stride > 0) {
            int i;
            for (i = 0; i < count; i++) {
                int j;
                if (setSize == 0) {
                    break;
                }
                ADD_MASK(tempMask);
                setSize = 0;
                for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
                    if (! KMP_CPU_ISSET(j - stride, tempMask)) {
                        KMP_CPU_CLR(j, tempMask);
                    }
                    else if ((j > maxOsId) ||
                      (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
                        if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                          && (__kmp_affinity_type != affinity_none))) {
                            KMP_WARNING(AffIgnoreInvalidProcID, j);
                        }
                        KMP_CPU_CLR(j, tempMask);
                    }
                    else {
                        KMP_CPU_SET(j, tempMask);
                        setSize++;
                    }
                }
                for (; j >= 0; j--) {
                    KMP_CPU_CLR(j, tempMask);
                }
            }
        }
        else {
            int i;
            for (i = 0; i < count; i++) {
                int j;
                if (setSize == 0) {
                    break;
                }
                ADD_MASK(tempMask);
                setSize = 0;
                for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride;
                  j++) {
                    if (! KMP_CPU_ISSET(j - stride, tempMask)) {
                        KMP_CPU_CLR(j, tempMask);
                    }
                    else if ((j > maxOsId) ||
                      (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
                        if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                          && (__kmp_affinity_type != affinity_none))) {
                            KMP_WARNING(AffIgnoreInvalidProcID, j);
                        }
                        KMP_CPU_CLR(j, tempMask);
                    }
                    else {
                        KMP_CPU_SET(j, tempMask);
                        setSize++;
                    }
                }
                for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
                    KMP_CPU_CLR(j, tempMask);
                }
            }
        }
        KMP_CPU_ZERO(tempMask);
        setSize = 0;

        //
        // valid follow sets are ',' and EOL
        //
        SKIP_WS(scan);
        if (*scan == '\0') {
            break;
        }
        if (*scan == ',') {
            scan++;     // skip ','
            continue;
        }

        KMP_ASSERT2(0, "bad explicit places list");
    }

    *out_numMasks = nextNewMask;
    if (nextNewMask == 0) {
        *out_masks = NULL;
        KMP_INTERNAL_FREE(newMasks);
        return;
    }
    *out_masks
      = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
    memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
    __kmp_free(tempMask);
    KMP_INTERNAL_FREE(newMasks);
}
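
//
// Worked example (a sketch): OMP_PLACES="{0,1}:2:2" parses one place
// {0,1} with count == 2 and stride == 2.  The positive-stride loop
// above emits the mask {0,1} first, then shifts every set bit up by 2
// (dropping bits whose shifted source is unavailable) and emits {2,3}.
//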


# endif /* OMP_40_ENABLED */

#undef ADD_MASK
#undef ADD_MASK_OSID

static void
__kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
{
    if ( __kmp_place_num_cores == 0 ) {
        if ( __kmp_place_num_threads_per_core == 0 ) {
            return;   // no core-limiting actions requested; exit
        }
        __kmp_place_num_cores = nCoresPerPkg;   // use all available cores
    }
    if ( !__kmp_affinity_uniform_topology() ) {
        KMP_WARNING( AffThrPlaceNonUniform );
        return; // don't support non-uniform topology
    }
    if ( depth != 3 ) {
        KMP_WARNING( AffThrPlaceNonThreeLevel );
        return; // only 3-level topologies are supported
    }
    if ( __kmp_place_num_threads_per_core == 0 ) {
        __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore;  // use all HW contexts
    }
    if ( __kmp_place_core_offset + __kmp_place_num_cores > (unsigned int)nCoresPerPkg ) {
        KMP_WARNING( AffThrPlaceManyCores );
        return;
    }

    AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
                            nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
    int i, j, k, n_old = 0, n_new = 0;
    for ( i = 0; i < nPackages; ++i ) {
        for ( j = 0; j < nCoresPerPkg; ++j ) {
            if ( (unsigned int)j < __kmp_place_core_offset || (unsigned int)j >= __kmp_place_core_offset + __kmp_place_num_cores ) {
                n_old += __kmp_nThreadsPerCore;   // skip not-requested core
            } else {
                for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) {
                    if ( (unsigned int)k < __kmp_place_num_threads_per_core ) {
                        newAddr[n_new] = (*pAddr)[n_old];   // copy requested core's data to new location
                        n_new++;
                    }
                    n_old++;
                }
            }
        }
    }
    nCoresPerPkg = __kmp_place_num_cores;                     // correct nCoresPerPkg
    __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
    __kmp_avail_proc = n_new;                                 // correct avail_proc
    __kmp_ncores = nPackages * __kmp_place_num_cores;         // correct ncores

    __kmp_free( *pAddr );
    *pAddr = newAddr;      // replace old topology with new one
}
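
//
// Worked example (a sketch): on a uniform 2 x 8 x 2 machine
// (packages x cores x threads), __kmp_place_num_cores == 4,
// __kmp_place_core_offset == 2 and __kmp_place_num_threads_per_core == 1
// keep cores 2..5 of each package with one HW context apiece, so
// __kmp_avail_proc becomes 2 * 4 * 1 == 8.
//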


static AddrUnsPair *address2os = NULL;
static int *procarr = NULL;
static int __kmp_aff_depth = 0;

static void
__kmp_aux_affinity_initialize(void)
{
    if (__kmp_affinity_masks != NULL) {
        KMP_ASSERT(fullMask != NULL);
        return;
    }

    //
    // Create the "full" mask - this defines all of the processors that we
    // consider to be in the machine model.  If respect is set, then it is
    // the initialization thread's affinity mask.  Otherwise, it is all
    // processors that we know about on the machine.
    //
    if (fullMask == NULL) {
        fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
    }
    if (KMP_AFFINITY_CAPABLE()) {
        if (__kmp_affinity_respect_mask) {
            __kmp_get_system_affinity(fullMask, TRUE);

            //
            // Count the number of available processors.
            //
            unsigned i;
            __kmp_avail_proc = 0;
            for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
                if (! KMP_CPU_ISSET(i, fullMask)) {
                    continue;
                }
                __kmp_avail_proc++;
            }
            if (__kmp_avail_proc > __kmp_xproc) {
                if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                  && (__kmp_affinity_type != affinity_none))) {
                    KMP_WARNING(ErrorInitializeAffinity);
                }
                __kmp_affinity_type = affinity_none;
                __kmp_affin_mask_size = 0;
                return;
            }
        }
        else {
            __kmp_affinity_entire_machine_mask(fullMask);
            __kmp_avail_proc = __kmp_xproc;
        }
    }

    int depth = -1;
    kmp_i18n_id_t msg_id = kmp_i18n_null;

    //
    // For backward compatibility, setting KMP_CPUINFO_FILE =>
    // KMP_TOPOLOGY_METHOD=cpuinfo
    //
    if ((__kmp_cpuinfo_file != NULL) &&
      (__kmp_affinity_top_method == affinity_top_method_all)) {
        __kmp_affinity_top_method = affinity_top_method_cpuinfo;
    }

    if (__kmp_affinity_top_method == affinity_top_method_all) {
        //
        // In the default code path, errors are not fatal - we just try using
        // another method.  We only emit a warning message if affinity is on,
        // or the verbose flag is set, and the nowarnings flag was not set.
        //
        const char *file_name = NULL;
        int line = 0;

# if KMP_ARCH_X86 || KMP_ARCH_X86_64

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
        }

        file_name = NULL;
        depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }

        if (depth < 0) {
            if (__kmp_affinity_verbose) {
                if (msg_id != kmp_i18n_null) {
                    KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
                      KMP_I18N_STR(DecodingLegacyAPIC));
                }
                else {
                    KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
                }
            }

            file_name = NULL;
            depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
            if (depth == 0) {
                KMP_ASSERT(__kmp_affinity_type == affinity_none);
                KMP_ASSERT(address2os == NULL);
                return;
            }
        }

# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

# if KMP_OS_LINUX

        if (depth < 0) {
            if (__kmp_affinity_verbose) {
                if (msg_id != kmp_i18n_null) {
                    KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
                }
                else {
                    KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
                }
            }

            FILE *f = fopen("/proc/cpuinfo", "r");
            if (f == NULL) {
                msg_id = kmp_i18n_str_CantOpenCpuinfo;
            }
            else {
                file_name = "/proc/cpuinfo";
                depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
                fclose(f);
                if (depth == 0) {
                    KMP_ASSERT(__kmp_affinity_type == affinity_none);
                    KMP_ASSERT(address2os == NULL);
                    return;
                }
            }
        }

# endif /* KMP_OS_LINUX */

# if KMP_GROUP_AFFINITY

        if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
            if (__kmp_affinity_verbose) {
                KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
            }

            depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
            KMP_ASSERT(depth != 0);
        }

# endif /* KMP_GROUP_AFFINITY */

        if (depth < 0) {
            if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
                if (file_name == NULL) {
                    KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
                }
                else if (line == 0) {
                    KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
                }
                else {
                    KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
                }
            }
            // FIXME - print msg if msg_id = kmp_i18n_null ???

            file_name = "";
            depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
            if (depth == 0) {
                KMP_ASSERT(__kmp_affinity_type == affinity_none);
                KMP_ASSERT(address2os == NULL);
                return;
            }
            KMP_ASSERT(depth > 0);
            KMP_ASSERT(address2os != NULL);
        }
    }
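
    //
    // To summarize the default path above: the discovery methods are tried
    // in order -- x2APIC ids, then legacy APIC ids (x86 only), then
    // /proc/cpuinfo (Linux* OS only), then Windows* OS processor groups,
    // and finally the flat OS-proc map, which is not allowed to fail.
    //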

    //
    // If the user has specified that a particular topology discovery method
    // is to be used, then we abort if that method fails.  The exception is
    // group affinity, which might have been implicitly set.
    //

# if KMP_ARCH_X86 || KMP_ARCH_X86_64

    else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
              KMP_I18N_STR(Decodingx2APIC));
        }

        depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
        }
    }
    else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
              KMP_I18N_STR(DecodingLegacyAPIC));
        }

        depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
        }
    }

# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

    else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
        const char *filename;
        if (__kmp_cpuinfo_file != NULL) {
            filename = __kmp_cpuinfo_file;
        }
        else {
            filename = "/proc/cpuinfo";
        }

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
        }

        FILE *f = fopen(filename, "r");
        if (f == NULL) {
            int code = errno;
            if (__kmp_cpuinfo_file != NULL) {
                __kmp_msg(
                    kmp_ms_fatal,
                    KMP_MSG(CantOpenFileForReading, filename),
                    KMP_ERR(code),
                    KMP_HNT(NameComesFrom_CPUINFO_FILE),
                    __kmp_msg_null
                );
            }
            else {
                __kmp_msg(
                    kmp_ms_fatal,
                    KMP_MSG(CantOpenFileForReading, filename),
                    KMP_ERR(code),
                    __kmp_msg_null
                );
            }
        }
        int line = 0;
        depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
        fclose(f);
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            if (line > 0) {
                KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
            }
            else {
                KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
            }
        }
        if (__kmp_affinity_type == affinity_none) {
            KMP_ASSERT(depth == 0);
            KMP_ASSERT(address2os == NULL);
            return;
        }
    }

# if KMP_GROUP_AFFINITY

    else if (__kmp_affinity_top_method == affinity_top_method_group) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
        }

        depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
        KMP_ASSERT(depth != 0);
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
        }
    }

# endif /* KMP_GROUP_AFFINITY */

    else if (__kmp_affinity_top_method == affinity_top_method_flat) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
        }

        depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }
        // should not fail
        KMP_ASSERT(depth > 0);
        KMP_ASSERT(address2os != NULL);
    }

    if (address2os == NULL) {
        if (KMP_AFFINITY_CAPABLE()
          && (__kmp_affinity_verbose || (__kmp_affinity_warnings
          && (__kmp_affinity_type != affinity_none)))) {
            KMP_WARNING(ErrorInitializeAffinity);
        }
        __kmp_affinity_type = affinity_none;
        __kmp_affin_mask_size = 0;
        return;
    }

    __kmp_apply_thread_places(&address2os, depth);

    //
    // Create the table of masks, indexed by thread Id.
    //
    unsigned maxIndex;
    unsigned numUnique;
    kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
      address2os, __kmp_avail_proc);
    if (__kmp_affinity_gran_levels == 0) {
        KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
    }

    //
    // Set the childNums vector in all Address objects.  This must be done
    // before we can sort using __kmp_affinity_cmp_Address_child_num(),
    // which takes into account the setting of __kmp_affinity_compact.
    //
    __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);

    switch (__kmp_affinity_type) {

        case affinity_explicit:
        KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
# if OMP_40_ENABLED
        if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
        {
            __kmp_affinity_process_proclist(&__kmp_affinity_masks,
              &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
              maxIndex);
        }
# if OMP_40_ENABLED
        else {
            __kmp_affinity_process_placelist(&__kmp_affinity_masks,
              &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
              maxIndex);
        }
# endif
        if (__kmp_affinity_num_masks == 0) {
            if (__kmp_affinity_verbose || (__kmp_affinity_warnings
              && (__kmp_affinity_type != affinity_none))) {
                KMP_WARNING(AffNoValidProcID);
            }
            __kmp_affinity_type = affinity_none;
            return;
        }
        break;

        //
        // The other affinity types rely on sorting the Addresses according
        // to some permutation of the machine topology tree.  Set
        // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
        // then jump to a common code fragment to do the sort and create
        // the array of affinity masks.
        //
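        //
        // For example, with depth == 3: affinity_scatter maps a user
        // compact value of 0 to depth - 1 - 0 == 2, i.e. the opposite
        // end of the permutation from affinity_compact, which merely
        // clamps the value to at most depth - 1.
        //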

        case affinity_logical:
        __kmp_affinity_compact = 0;
        if (__kmp_affinity_offset) {
            __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
              % __kmp_avail_proc;
        }
        goto sortAddresses;

        case affinity_physical:
        if (__kmp_nThreadsPerCore > 1) {
            __kmp_affinity_compact = 1;
            if (__kmp_affinity_compact >= depth) {
                __kmp_affinity_compact = 0;
            }
        } else {
            __kmp_affinity_compact = 0;
        }
        if (__kmp_affinity_offset) {
            __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
              % __kmp_avail_proc;
        }
        goto sortAddresses;

        case affinity_scatter:
        if (__kmp_affinity_compact >= depth) {
            __kmp_affinity_compact = 0;
        }
        else {
            __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
        }
        goto sortAddresses;

        case affinity_compact:
        if (__kmp_affinity_compact >= depth) {
            __kmp_affinity_compact = depth - 1;
        }
        goto sortAddresses;

        case affinity_balanced:
        // Balanced works only for the case of a single package
        if( nPackages > 1 ) {
            if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
                KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
            }
            __kmp_affinity_type = affinity_none;
            return;
        } else if( __kmp_affinity_uniform_topology() ) {
            break;
        } else { // Non-uniform topology

            // Save the depth for further usage
            __kmp_aff_depth = depth;

            // Number of hyper threads per core in HT machine
            int nth_per_core = __kmp_nThreadsPerCore;

            int core_level;
            if( nth_per_core > 1 ) {
                core_level = depth - 2;
            } else {
                core_level = depth - 1;
            }
            int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
            int nproc = nth_per_core * ncores;

            procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                procarr[ i ] = -1;
            }

            for( int i = 0; i < __kmp_avail_proc; i++ ) {
                int proc = address2os[ i ].second;
                // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
                // If there is only one thread per core then depth == 2: level 0 - package,
                // level 1 - core.
                int level = depth - 1;

                // __kmp_nth_per_core == 1
                int thread = 0;
                int core = address2os[ i ].first.labels[ level ];
                // If the thread level exists, that is we have more than one thread context per core
                if( nth_per_core > 1 ) {
                    thread = address2os[ i ].first.labels[ level ] % nth_per_core;
                    core = address2os[ i ].first.labels[ level - 1 ];
                }
                procarr[ core * nth_per_core + thread ] = proc;
            }
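
            // Worked example (a sketch): with nth_per_core == 2, an OS proc
            // that decodes to core 3, thread 1 lands in
            // procarr[ 3 * 2 + 1 ] == procarr[ 7 ]; slots left at -1 mark
            // HW contexts with no available proc.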

            break;
        }

        sortAddresses:
        //
        // Allocate the gtid->affinity mask table.
        //
        if (__kmp_affinity_dups) {
            __kmp_affinity_num_masks = __kmp_avail_proc;
        }
        else {
            __kmp_affinity_num_masks = numUnique;
        }

# if OMP_40_ENABLED
        if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
          && ( __kmp_affinity_num_places > 0 )
          && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
            __kmp_affinity_num_masks = __kmp_affinity_num_places;
        }
# endif

        __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
          __kmp_affinity_num_masks * __kmp_affin_mask_size);

        //
        // Sort the address2os table according to the current setting of
        // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
        //
        qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
          __kmp_affinity_cmp_Address_child_num);
        {
            int i;
            unsigned j;
            for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
                if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
                    continue;
                }
                unsigned osId = address2os[i].second;
                kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
                kmp_affin_mask_t *dest
                  = KMP_CPU_INDEX(__kmp_affinity_masks, j);
                KMP_ASSERT(KMP_CPU_ISSET(osId, src));
                KMP_CPU_COPY(dest, src);
                if (++j >= __kmp_affinity_num_masks) {
                    break;
                }
            }
            KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
        }
        break;

        default:
        KMP_ASSERT2(0, "Unexpected affinity setting");
    }

    __kmp_free(osId2Mask);
    machine_hierarchy.init(address2os, __kmp_avail_proc);
}


void
__kmp_affinity_initialize(void)
{
    //
    // Much of the code above was written assuming that if a machine was not
    // affinity capable, then __kmp_affinity_type == affinity_none.  We now
    // explicitly represent this as __kmp_affinity_type == affinity_disabled.
    //
    // There are too many checks for __kmp_affinity_type == affinity_none
    // in this code.  Instead of trying to change them all, check if
    // __kmp_affinity_type == affinity_disabled, and if so, slam it with
    // affinity_none, call the real initialization routine, then restore
    // __kmp_affinity_type to affinity_disabled.
    //
    int disabled = (__kmp_affinity_type == affinity_disabled);
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(disabled);
    }
    if (disabled) {
        __kmp_affinity_type = affinity_none;
    }
    __kmp_aux_affinity_initialize();
    if (disabled) {
        __kmp_affinity_type = affinity_disabled;
    }
}


void
__kmp_affinity_uninitialize(void)
{
    if (__kmp_affinity_masks != NULL) {
        __kmp_free(__kmp_affinity_masks);
        __kmp_affinity_masks = NULL;
    }
    if (fullMask != NULL) {
        KMP_CPU_FREE(fullMask);
        fullMask = NULL;
    }
    __kmp_affinity_num_masks = 0;
# if OMP_40_ENABLED
    __kmp_affinity_num_places = 0;
# endif
    if (__kmp_affinity_proclist != NULL) {
        __kmp_free(__kmp_affinity_proclist);
        __kmp_affinity_proclist = NULL;
    }
    if( address2os != NULL ) {
        __kmp_free( address2os );
        address2os = NULL;
    }
    if( procarr != NULL ) {
        __kmp_free( procarr );
        procarr = NULL;
    }
}


void
__kmp_affinity_set_init_mask(int gtid, int isa_root)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
    if (th->th.th_affin_mask == NULL) {
        KMP_CPU_ALLOC(th->th.th_affin_mask);
    }
    else {
        KMP_CPU_ZERO(th->th.th_affin_mask);
    }

    //
    // Copy the thread mask to the kmp_info_t structure.
    // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
    // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
    // is set, then the full mask is the same as the mask of the initialization
    // thread.
    //
    kmp_affin_mask_t *mask;
    int i;

# if OMP_40_ENABLED
    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
    {
        if ((__kmp_affinity_type == affinity_none)
          || (__kmp_affinity_type == affinity_balanced)) {
# if KMP_GROUP_AFFINITY
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# if OMP_40_ENABLED
    else {
        if ((! isa_root)
          || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
#  if KMP_GROUP_AFFINITY
            if (__kmp_num_proc_groups > 1) {
                return;
            }
#  endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            //
            // int i = some hash function or just a counter that doesn't
            // always start at 0.  Use gtid for now.
            //
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# endif

# if OMP_40_ENABLED
    th->th.th_current_place = i;
    if (isa_root) {
        th->th.th_new_place = i;
        th->th.th_first_place = 0;
        th->th.th_last_place = __kmp_affinity_num_masks - 1;
    }

    if (i == KMP_PLACE_ALL) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
          gtid, i));
    }
# else
    if (i == -1) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
          gtid, i));
    }
# endif /* OMP_40_ENABLED */

    KMP_CPU_COPY(th->th.th_affin_mask, mask);

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
          buf);
    }

# if KMP_OS_WINDOWS
    //
    // On Windows* OS, the process affinity mask might have changed.
    // If the user didn't request affinity and this call fails,
    // just continue silently.  See CQ171393.
    //
    if ( __kmp_affinity_type == affinity_none ) {
        __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
    }
    else
# endif
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}


# if OMP_40_ENABLED

void
__kmp_affinity_set_place(int gtid)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);

    KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
      gtid, th->th.th_new_place, th->th.th_current_place));

    //
    // Check that the new place is within this thread's partition.
    //
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    KMP_ASSERT(th->th.th_new_place >= 0);
    KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
    if (th->th.th_first_place <= th->th.th_last_place) {
        KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
         && (th->th.th_new_place <= th->th.th_last_place));
    }
    else {
        KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
         || (th->th.th_new_place >= th->th.th_last_place));
    }

    //
    // Copy the thread mask to the kmp_info_t structure,
    // and set this thread's affinity.
    //
    kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
      th->th.th_new_place);
    KMP_CPU_COPY(th->th.th_affin_mask, mask);
    th->th.th_current_place = th->th.th_new_place;

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
          gtid, buf);
    }
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

# endif /* OMP_40_ENABLED */


int
__kmp_aux_set_affinity(void **mask)
{
    int gtid;
    kmp_info_t *th;
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
          gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        else {
            unsigned proc;
            int num_procs = 0;

            for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
                if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
                    continue;
                }
                num_procs++;
                if (! KMP_CPU_ISSET(proc, fullMask)) {
                    KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
                    break;
                }
            }
            if (num_procs == 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }

# if KMP_GROUP_AFFINITY
            if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }
# endif /* KMP_GROUP_AFFINITY */

        }
    }

    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    if (retval == 0) {
        KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
    }

# if OMP_40_ENABLED
    th->th.th_current_place = KMP_PLACE_UNDEFINED;
    th->th.th_new_place = KMP_PLACE_UNDEFINED;
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;

    //
    // Turn off 4.0 affinity for the current thread at this parallel level.
    //
    th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
# endif

    return retval;
}


int
__kmp_aux_get_affinity(void **mask)
{
    int gtid;
    int retval;
    kmp_info_t *th;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);

    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
        }
    }

# if !KMP_OS_WINDOWS

    retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
    });
    return retval;

# else

    KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
    return 0;

# endif /* KMP_OS_WINDOWS */

}

int
__kmp_aux_set_affinity_mask_proc(int proc, void **mask)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return -1;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return -2;
    }

    KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
    return 0;
}
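
//
// For reference, a minimal user-side sketch of how the mask-editing entry
// points above and below are reached through the public affinity
// extensions (assuming the kmp_* affinity API declared by this runtime's
// omp.h is available):
//
#if 0
#include <omp.h>

void pin_self_to_proc_zero(void)
{
    kmp_affinity_mask_t mask;
    kmp_create_affinity_mask(&mask);        // start with an empty mask
    kmp_set_affinity_mask_proc(0, &mask);   // add OS proc 0 to the mask
    kmp_set_affinity(&mask);                // bind the calling thread
    kmp_destroy_affinity_mask(&mask);
}
#endif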


int
__kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return -1;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return -2;
    }

    KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
    return 0;
}


int
__kmp_aux_get_affinity_mask_proc(int proc, void **mask)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return 0;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return 0;
    }

    return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}


// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity( int tid, int nthreads )
{
    if( __kmp_affinity_uniform_topology() ) {
        int coreID;
        int threadID;
        // Number of hyper threads per core in HT machine
        int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
        // Number of cores
        int ncores = __kmp_ncores;
        // How many threads will be bound to each core
        int chunk = nthreads / ncores;
        // How many cores will have an additional thread bound to it - "big cores"
        int big_cores = nthreads % ncores;
        // Number of threads on the big cores
        int big_nth = ( chunk + 1 ) * big_cores;
        if( tid < big_nth ) {
            coreID = tid / ( chunk + 1 );
            threadID = ( tid % ( chunk + 1 ) ) % __kmp_nth_per_core;
        } else { // tid >= big_nth
            coreID = ( tid - big_cores ) / chunk;
            threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core;
        }
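
        // Worked example (a sketch): nthreads == 10 on ncores == 4 gives
        // chunk == 2, big_cores == 2, big_nth == 6.  Then tid 3 (< 6) maps
        // to coreID 3 / 3 == 1, while tid 7 maps to
        // coreID ( 7 - 2 ) / 2 == 2.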

        KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
          "Illegal set affinity operation when not capable");

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Granularity == thread
        if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
            int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
            KMP_CPU_SET( osID, mask);
        } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
            for( int i = 0; i < __kmp_nth_per_core; i++ ) {
                int osID;
                osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
                KMP_CPU_SET( osID, mask);
            }
        }
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    } else { // Non-uniform topology

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Number of hyper threads per core in HT machine
        int nth_per_core = __kmp_nThreadsPerCore;
        int core_level;
        if( nth_per_core > 1 ) {
            core_level = __kmp_aff_depth - 2;
        } else {
            core_level = __kmp_aff_depth - 1;
        }

        // Number of cores - the maximum value; it does not count trailing cores with 0 processors
        int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;

        // For performance gain consider the special case nthreads == __kmp_avail_proc
        if( nthreads == __kmp_avail_proc ) {
            if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                int osID = address2os[ tid ].second;
                KMP_CPU_SET( osID, mask);
            } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                int coreID = address2os[ tid ].first.labels[ core_level ];
                // Count the osIDs found for the current core; there can be at most
                // nth_per_core of them, and since address2os is sorted we can break
                // as soon as cnt == nth_per_core.
                int cnt = 0;
                for( int i = 0; i < __kmp_avail_proc; i++ ) {
                    int osID = address2os[ i ].second;
                    int core = address2os[ i ].first.labels[ core_level ];
                    if( core == coreID ) {
                        KMP_CPU_SET( osID, mask);
                        cnt++;
                        if( cnt == nth_per_core ) {
                            break;
                        }
                    }
                }
            }
        } else if( nthreads <= __kmp_ncores ) {

            int core = 0;
            for( int i = 0; i < ncores; i++ ) {
                // Check if this core from procarr[] is in the mask
                int in_mask = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        in_mask = 1;
                        break;
                    }
                }
                if( in_mask ) {
                    if( tid == core ) {
                        for( int j = 0; j < nth_per_core; j++ ) {
                            int osID = procarr[ i * nth_per_core + j ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask );
                                // For granularity=thread it is enough to set the first available osID for this core
                                if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                                    break;
                                }
                            }
                        }
                        break;
                    } else {
                        core++;
                    }
                }
            }

4494         } else { // nthreads > __kmp_ncores
4495 
4496             // Array to save the number of processors at each core
4497             int nproc_at_core[ ncores ];
4498             // Array to save the number of cores with "x" available processors;
4499             int ncores_with_x_procs[ nth_per_core + 1 ];
4500             // Array to save the number of cores with # procs from x to nth_per_core
4501             int ncores_with_x_to_max_procs[ nth_per_core + 1 ];
4502 
4503             for( int i = 0; i <= nth_per_core; i++ ) {
4504                 ncores_with_x_procs[ i ] = 0;
4505                 ncores_with_x_to_max_procs[ i ] = 0;
4506             }
4507 
4508             for( int i = 0; i < ncores; i++ ) {
4509                 int cnt = 0;
4510                 for( int j = 0; j < nth_per_core; j++ ) {
4511                     if( procarr[ i * nth_per_core + j ] != -1 ) {
4512                         cnt++;
4513                     }
4514                 }
4515                 nproc_at_core[ i ] = cnt;
4516                 ncores_with_x_procs[ cnt ]++;
4517             }
4518 
4519             for( int i = 0; i <= nth_per_core; i++ ) {
4520                 for( int j = i; j <= nth_per_core; j++ ) {
4521                     ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4522                 }
4523             }
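            // Illustrative example: 4 cores with nth_per_core == 2 and
            // { 2, 0, 1, 2 } available processors per core gives
            // ncores_with_x_procs        == { 1, 1, 2 } and
            // ncores_with_x_to_max_procs == { 4, 3, 2 }.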

            // Maximum number of hardware contexts (including unavailable ones)
            int nproc = nth_per_core * ncores;
            // Number of threads assigned to each hardware context
            int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                newarr[ i ] = 0;
            }

            int nth = nthreads;
            int flag = 0;
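            // Distribute the threads as evenly as possible.  On the first sweep
            // (flag == 0) each context receives at most one thread: round j hands
            // one more context to every core that has at least j available
            // processors, so threads spread across cores before a core doubles
            // up.  Once every available context holds a thread, later sweeps
            // (flag == 1) stack the remaining threads onto already-used contexts.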
            while( nth > 0 ) {
                for( int j = 1; j <= nth_per_core; j++ ) {
                    int cnt = ncores_with_x_to_max_procs[ j ];
                    for( int i = 0; i < ncores; i++ ) {
                        // Skip cores with 0 available processors
                        if( nproc_at_core[ i ] == 0 ) {
                            continue;
                        }
                        for( int k = 0; k < nth_per_core; k++ ) {
                            if( procarr[ i * nth_per_core + k ] != -1 ) {
                                if( newarr[ i * nth_per_core + k ] == 0 ) {
                                    newarr[ i * nth_per_core + k ] = 1;
                                    cnt--;
                                    nth--;
                                    break;
                                } else {
                                    if( flag != 0 ) {
                                        newarr[ i * nth_per_core + k ]++;
                                        cnt--;
                                        nth--;
                                        break;
                                    }
                                }
                            }
                        }
                        if( cnt == 0 || nth == 0 ) {
                            break;
                        }
                    }
                    if( nth == 0 ) {
                        break;
                    }
                }
                flag = 1;
            }
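            // newarr[ i ] now holds the number of threads placed on context i;
            // walking the prefix sums finds the context the tid-th thread falls
            // into: the first i with sum > tid.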
            int sum = 0;
            for( int i = 0; i < nproc; i++ ) {
                sum += newarr[ i ];
                if( sum > tid ) {
                    // Granularity == fine or thread
                    if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread ) {
                        int osID = procarr[ i ];
                        KMP_CPU_SET( osID, mask );
                    } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                        int coreID = i / nth_per_core;
                        for( int ii = 0; ii < nth_per_core; ii++ ) {
                            int osID = procarr[ coreID * nth_per_core + ii ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask );
                            }
                        }
                    }
                    break;
                }
            }
            __kmp_free( newarr );
        }

        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    }
}

#else
    // Affinity not supported: provide a default machine hierarchy.

kmp_uint32 mac_skipPerLevel[7];
kmp_uint32 mac_depth;
kmp_uint8 mac_leaf_kids;
void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    static int first = 1;
    if (first) {
        const kmp_uint32 maxLevels = 7;
        kmp_uint32 numPerLevel[maxLevels];

        for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
            numPerLevel[i] = 1;
            mac_skipPerLevel[i] = 1;
        }

        mac_depth = 2;
        numPerLevel[0] = nproc;

        kmp_uint32 branch = 4;
        if (numPerLevel[0] == 1) branch = nproc/4;
        if (branch < 4) branch = 4;
        for (kmp_uint32 d=0; d<mac_depth-1; ++d) { // optimize hierarchy width
            while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // at most 4 children on level 0
                if (numPerLevel[d] & 1) numPerLevel[d]++; // round odd counts up
                numPerLevel[d] = numPerLevel[d] >> 1;
                if (numPerLevel[d+1] == 1) mac_depth++;
                numPerLevel[d+1] = numPerLevel[d+1] << 1;
            }
            if (numPerLevel[0] == 1) {
                branch = branch >> 1;
                if (branch < 4) branch = 4;
            }
        }
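        // For example, nproc == 16 halves numPerLevel[0] twice (16 -> 8 -> 4)
        // while doubling numPerLevel[1] (1 -> 2 -> 4), giving numPerLevel ==
        // {4, 4} and mac_depth == 3; the code below then derives
        // mac_skipPerLevel == {1, 4, 16} and mac_leaf_kids == 3.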

        for (kmp_uint32 i=1; i<mac_depth; ++i)
            mac_skipPerLevel[i] = numPerLevel[i-1] * mac_skipPerLevel[i-1];
        mac_leaf_kids = (kmp_uint8)numPerLevel[0]-1;
        first=0;
    }
    thr_bar->depth = mac_depth;
    thr_bar->base_leaf_kids = mac_leaf_kids;
    thr_bar->skip_per_level = mac_skipPerLevel;
}

#endif // KMP_AFFINITY_SUPPORTED
