1 /*
2  * kmp_affinity.cpp -- affinity management
3  * $Revision: 43473 $
4  * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $
5  */
6 
7 
8 //===----------------------------------------------------------------------===//
9 //
10 //                     The LLVM Compiler Infrastructure
11 //
12 // This file is dual licensed under the MIT and the University of Illinois Open
13 // Source Licenses. See LICENSE.txt for details.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 
18 #include "kmp.h"
19 #include "kmp_i18n.h"
20 #include "kmp_io.h"
21 #include "kmp_str.h"
22 #include "kmp_wrapper_getpid.h"
23 
24 #if KMP_AFFINITY_SUPPORTED
25 
26 //
27 // Print the affinity mask to the character array in a pretty format.
28 //
29 char *
30 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
31 {
32     KMP_ASSERT(buf_len >= 40);
33     char *scan = buf;
34     char *end = buf + buf_len - 1;
35 
36     //
37     // Find first element / check for empty set.
38     //
39     size_t i;
40     for (i = 0; i < KMP_CPU_SETSIZE; i++) {
41         if (KMP_CPU_ISSET(i, mask)) {
42             break;
43         }
44     }
45     if (i == KMP_CPU_SETSIZE) {
46         sprintf(scan, "{<empty>}");
47         while (*scan != '\0') scan++;
48         KMP_ASSERT(scan <= end);
49         return buf;
50     }
51 
52     sprintf(scan, "{%ld", (long)i);
53     while (*scan != '\0') scan++;
54     i++;
55     for (; i < KMP_CPU_SETSIZE; i++) {
56         if (! KMP_CPU_ISSET(i, mask)) {
57             continue;
58         }
59 
60         //
61         // Check for buffer overflow.  A string of the form ",<n>" will have
62         // at most 10 characters, plus we want to leave room to print ",...}"
63         // if the set is too large to print for a total of 15 characters.
64         // We already left room for '\0' in setting end.
65         //
66         if (end - scan < 15) {
67            break;
68         }
69         sprintf(scan, ",%-ld", (long)i);
70         while (*scan != '\0') scan++;
71     }
72     if (i < KMP_CPU_SETSIZE) {
73         sprintf(scan, ",...");
74         while (*scan != '\0') scan++;
75     }
76     sprintf(scan, "}");
77     while (*scan != '\0') scan++;
78     KMP_ASSERT(scan <= end);
79     return buf;
80 }
81 
82 
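//
// Set every proc in the machine in the mask.  When multiple Windows* OS
// processor groups are present, proc i of group g occupies bit
// g * (CHAR_BIT * sizeof(DWORD_PTR)) + i; otherwise, procs are simply
// numbered 0 .. __kmp_xproc - 1.
//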
83 void
84 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
85 {
86     KMP_CPU_ZERO(mask);
87 
88 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
89 
90     if (__kmp_num_proc_groups > 1) {
91         int group;
92         KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
93         for (group = 0; group < __kmp_num_proc_groups; group++) {
94             int i;
95             int num = __kmp_GetActiveProcessorCount(group);
96             for (i = 0; i < num; i++) {
97                 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
98             }
99         }
100     }
101     else
102 
103 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
104 
105     {
106         int proc;
107         for (proc = 0; proc < __kmp_xproc; proc++) {
108             KMP_CPU_SET(proc, mask);
109         }
110     }
111 }
112 
113 
114 //
115 // In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
116 // functions.
117 //
118 // The icc codegen emits sections with extremely long names, of the form
119 // ".gnu.linkonce.<mangled_name>".  There seems to have been a linker bug
120 // introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
121 // some sort of memory corruption or table overflow that is triggered by
122 // these long strings.  I checked the latest version of the linker -
123 // GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
124 // fixed.
125 //
126 // Unfortunately, my attempts to reproduce it in a smaller example have
127 // failed - I'm not sure what the prospects are of getting it fixed
128 // properly - but we need a reproducer smaller than all of libiomp.
129 //
130 // Work around the problem by avoiding inline constructors in such builds.
131 // We do this for all platforms, not just Linux* OS - non-inline functions are
// more debuggable and provide better coverage than inline functions.
133 // Use inline functions in shipping libs, for performance.
134 //
135 
136 # if !defined(KMP_DEBUG) && !defined(COVER)
137 
138 class Address {
139 public:
140     static const unsigned maxDepth = 32;
141     unsigned labels[maxDepth];
142     unsigned childNums[maxDepth];
143     unsigned depth;
144     unsigned leader;
145     Address(unsigned _depth)
146       : depth(_depth), leader(FALSE) {
147     }
148     Address &operator=(const Address &b) {
149         depth = b.depth;
150         for (unsigned i = 0; i < depth; i++) {
151             labels[i] = b.labels[i];
152             childNums[i] = b.childNums[i];
153         }
154         leader = FALSE;
155         return *this;
156     }
157     bool operator==(const Address &b) const {
158         if (depth != b.depth)
159             return false;
160         for (unsigned i = 0; i < depth; i++)
161             if(labels[i] != b.labels[i])
162                 return false;
163         return true;
164     }
165     bool isClose(const Address &b, int level) const {
166         if (depth != b.depth)
167             return false;
168         if ((unsigned)level >= depth)
169             return true;
170         for (unsigned i = 0; i < (depth - level); i++)
171             if(labels[i] != b.labels[i])
172                 return false;
173         return true;
174     }
175     bool operator!=(const Address &b) const {
176         return !operator==(b);
177     }
178 };
179 
180 class AddrUnsPair {
181 public:
182     Address first;
183     unsigned second;
184     AddrUnsPair(Address _first, unsigned _second)
185       : first(_first), second(_second) {
186     }
187     AddrUnsPair &operator=(const AddrUnsPair &b)
188     {
189         first = b.first;
190         second = b.second;
191         return *this;
192     }
193 };
194 
195 # else
196 
197 class Address {
198 public:
199     static const unsigned maxDepth = 32;
200     unsigned labels[maxDepth];
201     unsigned childNums[maxDepth];
202     unsigned depth;
203     unsigned leader;
204     Address(unsigned _depth);
205     Address &operator=(const Address &b);
206     bool operator==(const Address &b) const;
207     bool isClose(const Address &b, int level) const;
208     bool operator!=(const Address &b) const;
209 };
210 
211 Address::Address(unsigned _depth)
212 {
213     depth = _depth;
214     leader = FALSE;
215 }
216 
217 Address &Address::operator=(const Address &b) {
218     depth = b.depth;
219     for (unsigned i = 0; i < depth; i++) {
220         labels[i] = b.labels[i];
221         childNums[i] = b.childNums[i];
222     }
223     leader = FALSE;
224     return *this;
225 }
226 
227 bool Address::operator==(const Address &b) const {
228     if (depth != b.depth)
229         return false;
230     for (unsigned i = 0; i < depth; i++)
231         if(labels[i] != b.labels[i])
232             return false;
233     return true;
234 }
235 
236 bool Address::isClose(const Address &b, int level) const {
237     if (depth != b.depth)
238         return false;
239     if ((unsigned)level >= depth)
240         return true;
241     for (unsigned i = 0; i < (depth - level); i++)
242         if(labels[i] != b.labels[i])
243             return false;
244     return true;
245 }
246 
247 bool Address::operator!=(const Address &b) const {
248     return !operator==(b);
249 }
250 
251 class AddrUnsPair {
252 public:
253     Address first;
254     unsigned second;
255     AddrUnsPair(Address _first, unsigned _second);
256     AddrUnsPair &operator=(const AddrUnsPair &b);
257 };
258 
259 AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
260   : first(_first), second(_second)
261 {
262 }
263 
264 AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
265 {
266     first = b.first;
267     second = b.second;
268     return *this;
269 }
270 
271 # endif /* !defined(KMP_DEBUG) && !defined(COVER) */
272 
273 
274 static int
275 __kmp_affinity_cmp_Address_labels(const void *a, const void *b)
276 {
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    for (i = 0; i < depth; i++) {
285         if (aa->labels[i] < bb->labels[i]) return -1;
286         if (aa->labels[i] > bb->labels[i]) return 1;
287     }
288     return 0;
289 }
290 
291 
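//
// Compare addresses by ordinal child numbers for the "compact" sort: the
// innermost __kmp_affinity_compact levels are compared first, finest level
// first, followed by the remaining outer levels in their natural order.
//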
292 static int
293 __kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
294 {
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)->first);
299     unsigned depth = aa->depth;
300     unsigned i;
301     KMP_DEBUG_ASSERT(depth == bb->depth);
302     KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
303     KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
304     for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
305         int j = depth - i - 1;
306         if (aa->childNums[j] < bb->childNums[j]) return -1;
307         if (aa->childNums[j] > bb->childNums[j]) return 1;
308     }
309     for (; i < depth; i++) {
310         int j = i - __kmp_affinity_compact;
311         if (aa->childNums[j] < bb->childNums[j]) return -1;
312         if (aa->childNums[j] > bb->childNums[j]) return 1;
313     }
314     return 0;
315 }
316 
317 /** A structure for holding machine-specific hierarchy info to be computed once at init. */
318 class hierarchy_info {
319 public:
320     /** Typical levels are threads/core, cores/package or socket, packages/node, nodes/machine,
321         etc.  We don't want to get specific with nomenclature */
322     static const kmp_uint32 maxLevels=7;
323 
324     /** This is specifically the depth of the machine configuration hierarchy, in terms of the
325         number of levels along the longest path from root to any leaf. It corresponds to the
326         number of entries in numPerLevel if we exclude all but one trailing 1. */
327     kmp_uint32 depth;
328     kmp_uint32 base_depth;
329     kmp_uint32 base_num_threads;
330     bool uninitialized;
331 
332     /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a
333         node at level i has. For example, if we have a machine with 4 packages, 4 cores/package
334         and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}. All empty levels are set to 1. */
335     kmp_uint32 numPerLevel[maxLevels];
336     kmp_uint32 skipPerLevel[maxLevels];
337 
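    /** Derive numPerLevel from a sorted address table: for each level, walking
        from the finest to the coarsest, the fan-out recorded is one more than
        the largest ordinal child number observed at that level. */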
338     void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
339         int hier_depth = adr2os[0].first.depth;
340         int level = 0;
341         for (int i=hier_depth-1; i>=0; --i) {
342             int max = -1;
343             for (int j=0; j<num_addrs; ++j) {
344                 int next = adr2os[j].first.childNums[i];
345                 if (next > max) max = next;
346             }
347             numPerLevel[level] = max+1;
348             ++level;
349         }
350     }
351 
352     hierarchy_info() : depth(1), uninitialized(true) {}
353     void init(AddrUnsPair *adr2os, int num_addrs)
354     {
355         uninitialized = false;
356         for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
357             numPerLevel[i] = 1;
358             skipPerLevel[i] = 1;
359         }
360 
361         // Sort table by physical ID
362         if (adr2os) {
363             qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
364             deriveLevels(adr2os, num_addrs);
365         }
366         else {
367             numPerLevel[0] = 4;
368             numPerLevel[1] = num_addrs/4;
369             if (num_addrs%4) numPerLevel[1]++;
370         }
371 
372         base_num_threads = num_addrs;
373         for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
374             if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
375                 depth++;
376 
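        // Fold overly wide levels: repeatedly halve numPerLevel[d] (rounding
        // odd values up) and double numPerLevel[d+1], until every level's
        // fan-out is within 'branch' (and within 4 at the leaf level).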
377         kmp_uint32 branch = 4;
378         if (numPerLevel[0] == 1) branch = num_addrs/4;
379         if (branch<4) branch=4;
380         for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
381             while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
382                 if (numPerLevel[d] & 1) numPerLevel[d]++;
383                 numPerLevel[d] = numPerLevel[d] >> 1;
384                 if (numPerLevel[d+1] == 1) depth++;
385                 numPerLevel[d+1] = numPerLevel[d+1] << 1;
386             }
387             if(numPerLevel[0] == 1) {
388                 branch = branch >> 1;
389                 if (branch<4) branch = 4;
390             }
391         }
392 
393         for (kmp_uint32 i=1; i<depth; ++i)
394             skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];
395 
396         base_depth = depth;
397     }
398 };
399 
400 static hierarchy_info machine_hierarchy;
401 
402 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
403     if (machine_hierarchy.uninitialized)
404         machine_hierarchy.init(NULL, nproc);
405 
406     if (nproc <= machine_hierarchy.base_num_threads)
407         machine_hierarchy.depth = machine_hierarchy.base_depth;
408     KMP_DEBUG_ASSERT(machine_hierarchy.depth > 0);
409     while (nproc > machine_hierarchy.skipPerLevel[machine_hierarchy.depth-1]) {
410         machine_hierarchy.depth++;
411         machine_hierarchy.skipPerLevel[machine_hierarchy.depth-1] = 2*machine_hierarchy.skipPerLevel[machine_hierarchy.depth-2];
412     }
413     thr_bar->depth = machine_hierarchy.depth;
414     thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
415     thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
416 }
417 
418 //
419 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
420 // called to renumber the labels from [0..n] and place them into the child_num
421 // vector of the address object.  This is done in case the labels used for
422 // the children at one node of the hierarchy differ from those used for
423 // another node at the same level.  Example:  suppose the machine has 2 nodes
424 // with 2 packages each.  The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604.  If we try to sort the table
426 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
427 // because we are paying attention to the labels themselves, not the ordinal
428 // child numbers.  By using the child numbers in the sort, the result is
429 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
430 //
431 static void
432 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
433   int numAddrs)
434 {
435     KMP_DEBUG_ASSERT(numAddrs > 0);
436     int depth = address2os->first.depth;
437     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
438     unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
439       * sizeof(unsigned));
440     int labCt;
441     for (labCt = 0; labCt < depth; labCt++) {
442         address2os[0].first.childNums[labCt] = counts[labCt] = 0;
443         lastLabel[labCt] = address2os[0].first.labels[labCt];
444     }
445     int i;
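    // Walk the remaining addresses in sorted order.  At the first (coarsest)
    // level whose label differs from the previous address, bump that level's
    // counter and reset the counters of all finer levels.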
446     for (i = 1; i < numAddrs; i++) {
447         for (labCt = 0; labCt < depth; labCt++) {
448             if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
449                 int labCt2;
450                 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
451                     counts[labCt2] = 0;
452                     lastLabel[labCt2] = address2os[i].first.labels[labCt2];
453                 }
454                 counts[labCt]++;
455                 lastLabel[labCt] = address2os[i].first.labels[labCt];
456                 break;
457             }
458         }
459         for (labCt = 0; labCt < depth; labCt++) {
460             address2os[i].first.childNums[labCt] = counts[labCt];
461         }
462         for (; labCt < (int)Address::maxDepth; labCt++) {
463             address2os[i].first.childNums[labCt] = 0;
464         }
465     }
466 }
467 
468 
469 //
470 // All of the __kmp_affinity_create_*_map() routines should set
471 // __kmp_affinity_masks to a vector of affinity mask objects of length
472 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
473 // return the number of levels in the machine topology tree (zero if
474 // __kmp_affinity_type == affinity_none).
475 //
476 // All of the __kmp_affinity_create_*_map() routines should set *fullMask
477 // to the affinity mask for the initialization thread.  They need to save and
// restore the mask anyway, and it could be needed later, so saving it here
// is just an optimization to avoid calling __kmp_get_system_affinity() again.
480 //
481 static kmp_affin_mask_t *fullMask = NULL;
482 
483 kmp_affin_mask_t *
484 __kmp_affinity_get_fullMask() { return fullMask; }
485 
486 
487 static int nCoresPerPkg, nPackages;
488 int __kmp_nThreadsPerCore;
489 
490 //
491 // __kmp_affinity_uniform_topology() doesn't work when called from
492 // places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map() and
494 // __kmp_affinity_create_x2apicid_map().
495 //
496 inline static bool
497 __kmp_affinity_uniform_topology()
498 {
499     return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
500 }
501 
502 
503 //
504 // Print out the detailed machine topology map, i.e. the physical locations
505 // of each OS proc.
506 //
507 static void
508 __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
509   int pkgLevel, int coreLevel, int threadLevel)
510 {
511     int proc;
512 
513     KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
514     for (proc = 0; proc < len; proc++) {
515         int level;
516         kmp_str_buf_t buf;
517         __kmp_str_buf_init(&buf);
518         for (level = 0; level < depth; level++) {
519             if (level == threadLevel) {
520                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
521             }
522             else if (level == coreLevel) {
523                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
524             }
525             else if (level == pkgLevel) {
526                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
527             }
528             else if (level > pkgLevel) {
529                 __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
530                   level - pkgLevel - 1);
531             }
532             else {
533                 __kmp_str_buf_print(&buf, "L%d ", level);
534             }
535             __kmp_str_buf_print(&buf, "%d ",
536               address2os[proc].first.labels[level]);
537         }
538         KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
539           buf.str);
540         __kmp_str_buf_free(&buf);
541     }
542 }
543 
544 
545 //
546 // If we don't know how to retrieve the machine's processor topology, or
547 // encounter an error in doing so, this routine is called to form a "flat"
548 // mapping of os thread id's <-> processor id's.
549 //
550 static int
551 __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
552   kmp_i18n_id_t *const msg_id)
553 {
554     *address2os = NULL;
555     *msg_id = kmp_i18n_null;
556 
557     //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ht_enabled, & __kmp_ncores, as well as
560     // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
561     //
562     if (! KMP_AFFINITY_CAPABLE()) {
563         KMP_ASSERT(__kmp_affinity_type == affinity_none);
564         __kmp_ncores = nPackages = __kmp_xproc;
565         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
566         __kmp_ht_enabled = FALSE;
567         if (__kmp_affinity_verbose) {
568             KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
569             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
570             KMP_INFORM(Uniform, "KMP_AFFINITY");
571             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
572               __kmp_nThreadsPerCore, __kmp_ncores);
573         }
574         return 0;
575     }
576 
577     //
578     // When affinity is off, this routine will still be called to set
579     // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
580     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
582     //
583     __kmp_ncores = nPackages = __kmp_avail_proc;
584     __kmp_nThreadsPerCore = nCoresPerPkg = 1;
585     __kmp_ht_enabled = FALSE;
586     if (__kmp_affinity_verbose) {
587         char buf[KMP_AFFIN_MASK_PRINT_LEN];
588         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
589 
590         KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
591         if (__kmp_affinity_respect_mask) {
592             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
593         } else {
594             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
595         }
596         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
597         KMP_INFORM(Uniform, "KMP_AFFINITY");
598         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
599           __kmp_nThreadsPerCore, __kmp_ncores);
600     }
601     if (__kmp_affinity_type == affinity_none) {
602         return 0;
603     }
604 
605     //
    // Construct the data structure to be returned.
607     //
608     *address2os = (AddrUnsPair*)
609       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
610     int avail_ct = 0;
611     unsigned int i;
612     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
613         //
614         // Skip this proc if it is not included in the machine model.
615         //
616         if (! KMP_CPU_ISSET(i, fullMask)) {
617             continue;
618         }
619 
620         Address addr(1);
621         addr.labels[0] = i;
622         (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
623     }
624     if (__kmp_affinity_verbose) {
625         KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
626     }
627 
628     if (__kmp_affinity_gran_levels < 0) {
629         //
630         // Only the package level is modeled in the machine topology map,
631         // so the #levels of granularity is either 0 or 1.
632         //
633         if (__kmp_affinity_gran > affinity_gran_package) {
634             __kmp_affinity_gran_levels = 1;
635         }
636         else {
637             __kmp_affinity_gran_levels = 0;
638         }
639     }
640     return 1;
641 }
642 
643 
644 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
645 
646 //
647 // If multiple Windows* OS processor groups exist, we can create a 2-level
648 // topology map with the groups at level 0 and the individual procs at
649 // level 1.
650 //
651 // This facilitates letting the threads float among all procs in a group,
652 // if granularity=group (the default when there are multiple groups).
653 //
654 static int
655 __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
656   kmp_i18n_id_t *const msg_id)
657 {
658     *address2os = NULL;
659     *msg_id = kmp_i18n_null;
660 
661     //
662     // If we don't have multiple processor groups, return now.
663     // The flat mapping will be used.
664     //
665     if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
666         // FIXME set *msg_id
667         return -1;
668     }
669 
670     //
    // Construct the data structure to be returned.
672     //
673     *address2os = (AddrUnsPair*)
674       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
675     int avail_ct = 0;
676     int i;
677     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
678         //
679         // Skip this proc if it is not included in the machine model.
680         //
681         if (! KMP_CPU_ISSET(i, fullMask)) {
682             continue;
683         }
684 
685         Address addr(2);
686         addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
687         addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
688         (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
689 
690         if (__kmp_affinity_verbose) {
691             KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
692               addr.labels[1]);
693         }
694     }
695 
696     if (__kmp_affinity_gran_levels < 0) {
697         if (__kmp_affinity_gran == affinity_gran_group) {
698             __kmp_affinity_gran_levels = 1;
699         }
700         else if ((__kmp_affinity_gran == affinity_gran_fine)
701           || (__kmp_affinity_gran == affinity_gran_thread)) {
702             __kmp_affinity_gran_levels = 0;
703         }
704         else {
705             const char *gran_str = NULL;
706             if (__kmp_affinity_gran == affinity_gran_core) {
707                 gran_str = "core";
708             }
709             else if (__kmp_affinity_gran == affinity_gran_package) {
710                 gran_str = "package";
711             }
712             else if (__kmp_affinity_gran == affinity_gran_node) {
713                 gran_str = "node";
714             }
715             else {
716                 KMP_ASSERT(0);
717             }
718 
            // Warning: can't use affinity granularity "gran" with group topology method; using "thread".
720             __kmp_affinity_gran_levels = 0;
721         }
722     }
723     return 2;
724 }
725 
726 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
727 
728 
729 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
730 
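//
// Return the number of bits needed to represent 'count' distinct values,
// i.e. the smallest r such that (1 << r) >= count.  For example, a count
// of 6 yields a mask width of 3.
//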
731 static int
732 __kmp_cpuid_mask_width(int count) {
733     int r = 0;
734 
    while ((1 << r) < count)
736         ++r;
737     return r;
738 }
739 
740 
741 class apicThreadInfo {
742 public:
743     unsigned osId;              // param to __kmp_affinity_bind_thread
744     unsigned apicId;            // from cpuid after binding
745     unsigned maxCoresPerPkg;    //      ""
746     unsigned maxThreadsPerPkg;  //      ""
747     unsigned pkgId;             // inferred from above values
748     unsigned coreId;            //      ""
749     unsigned threadId;          //      ""
750 };
751 
752 
753 static int
754 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
755 {
756     const apicThreadInfo *aa = (const apicThreadInfo *)a;
757     const apicThreadInfo *bb = (const apicThreadInfo *)b;
758     if (aa->osId < bb->osId) return -1;
759     if (aa->osId > bb->osId) return 1;
760     return 0;
761 }
762 
763 
764 static int
765 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
766 {
767     const apicThreadInfo *aa = (const apicThreadInfo *)a;
768     const apicThreadInfo *bb = (const apicThreadInfo *)b;
769     if (aa->pkgId < bb->pkgId) return -1;
770     if (aa->pkgId > bb->pkgId) return 1;
771     if (aa->coreId < bb->coreId) return -1;
772     if (aa->coreId > bb->coreId) return 1;
773     if (aa->threadId < bb->threadId) return -1;
774     if (aa->threadId > bb->threadId) return 1;
775     return 0;
776 }
777 
778 
779 //
780 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
781 // an algorithm which cycles through the available os threads, setting
782 // the current thread's affinity mask to that thread, and then retrieves
783 // the Apic Id for each thread context using the cpuid instruction.
784 //
785 static int
786 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
787   kmp_i18n_id_t *const msg_id)
788 {
789     int rc;
790     *address2os = NULL;
791     *msg_id = kmp_i18n_null;
792 
793 #  if KMP_MIC
794     {
795         // The code below will use cpuid(4).
796         // Check if cpuid(4) is supported.
797         // FIXME? - this really doesn't need to be specific to MIC.
798         kmp_cpuid buf;
799         __kmp_x86_cpuid(0, 0, &buf);
800         if (buf.eax < 4) {
801             *msg_id = kmp_i18n_str_NoLeaf4Support;
802             return -1;
803         }
804     }
805 #  endif // KMP_MIC
806 
807     //
808     // Even if __kmp_affinity_type == affinity_none, this routine is still
809     // called to set __kmp_ht_enabled, & __kmp_ncores, as well as
810     // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
811     //
812     // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are not
    // capable of calling __kmp_get_system_affinity()/__kmp_set_system_affinity(),
815     // then we need to do something else.
816     //
817     if (! KMP_AFFINITY_CAPABLE()) {
818         //
819         // Hack to try and infer the machine topology using only the data
820         // available from cpuid on the current thread, and __kmp_xproc.
821         //
822         KMP_ASSERT(__kmp_affinity_type == affinity_none);
823 
824         //
825         // Get an upper bound on the number of threads per package using
826         // cpuid(1).
827         //
        // On some OS/chip combinations where HT is supported by the chip
829         // but is disabled, this value will be 2 on a single core chip.
830         // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
831         //
832         kmp_cpuid buf;
833         __kmp_x86_cpuid(1, 0, &buf);
834         int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
835         if (maxThreadsPerPkg == 0) {
836             maxThreadsPerPkg = 1;
837         }
838 
839         //
840         // The num cores per pkg comes from cpuid(4).
841         // 1 must be added to the encoded value.
842         //
        // The author of cpu_count.cpp treated this as only an upper bound
844         // on the number of cores, but I haven't seen any cases where it
845         // was greater than the actual number of cores, so we will treat
846         // it as exact in this block of code.
847         //
848         // First, we need to check if cpuid(4) is supported on this chip.
849         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
850         // has the value n or greater.
851         //
852         __kmp_x86_cpuid(0, 0, &buf);
853         if (buf.eax >= 4) {
854             __kmp_x86_cpuid(4, 0, &buf);
855             nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
856         }
857         else {
858             nCoresPerPkg = 1;
859         }
860 
861         //
862         // There is no way to reliably tell if HT is enabled without issuing
        // the cpuid instruction from every thread, and correlating the cpuid
864         // info, so if the machine is not affinity capable, we assume that HT
865         // is off.  We have seen quite a few machines where maxThreadsPerPkg
866         // is 2, yet the machine does not support HT.
867         //
868         // - Older OSes are usually found on machines with older chips, which
869         //   do not support HT.
870         //
871         // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly set
        //   to 0) is greater than the penalty for mistakenly identifying
874         //   a machine as being 1 thread/core when it is really HT enabled
875         //   (which results in blocktime being incorrectly set to a positive
876         //   value).
877         //
878         __kmp_ncores = __kmp_xproc;
879         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
880         __kmp_nThreadsPerCore = 1;
881         __kmp_ht_enabled = FALSE;
882         if (__kmp_affinity_verbose) {
883             KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
884             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
885             if (__kmp_affinity_uniform_topology()) {
886                 KMP_INFORM(Uniform, "KMP_AFFINITY");
887             } else {
888                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
889             }
890             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
891               __kmp_nThreadsPerCore, __kmp_ncores);
892         }
893         return 0;
894     }
895 
896     //
897     //
898     // From here on, we can assume that it is safe to call
899     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
900     // even if __kmp_affinity_type = affinity_none.
901     //
902 
903     //
904     // Save the affinity mask for the current thread.
905     //
906     kmp_affin_mask_t *oldMask;
907     KMP_CPU_ALLOC(oldMask);
908     KMP_ASSERT(oldMask != NULL);
909     __kmp_get_system_affinity(oldMask, TRUE);
910 
911     //
912     // Run through each of the available contexts, binding the current thread
913     // to it, and obtaining the pertinent information using the cpuid instr.
914     //
915     // The relevant information is:
916     //
917     // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //    has a unique Apic Id, which is of the form pkg# : core# : thread#.
919     //
920     // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
921     //    value of this field determines the width of the core# + thread#
922     //    fields in the Apic Id.  It is also an upper bound on the number
923     //    of threads per package, but it has been verified that situations
    //    happen where it is not exact.  In particular, on certain OS/chip
    //    combinations where Intel(R) Hyper-Threading Technology is supported
    //    by the chip but has been disabled, the value of this field will be
    //    2 (for a single core chip).  On other OS/chip combinations
    //    supporting Intel(R) Hyper-Threading Technology, the value of this
    //    field will be 1 when it is disabled and 2 when it is enabled.
932     //
933     // Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4).  The
934     //    value of this field (+1) determines the width of the core# field in
935     //    the Apic Id.  The comments in "cpucount.cpp" say that this value is
936     //    an upper bound, but the IA-32 architecture manual says that it is
937     //    exactly the number of cores per package, and I haven't seen any
938     //    case where it wasn't.
939     //
940     // From this information, deduce the package Id, core Id, and thread Id,
941     // and set the corresponding fields in the apicThreadInfo struct.
942     //
943     unsigned i;
944     apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
945       __kmp_avail_proc * sizeof(apicThreadInfo));
946     unsigned nApics = 0;
947     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
948         //
949         // Skip this proc if it is not included in the machine model.
950         //
951         if (! KMP_CPU_ISSET(i, fullMask)) {
952             continue;
953         }
954         KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
955 
956         __kmp_affinity_bind_thread(i);
957         threadInfo[nApics].osId = i;
958 
959         //
960         // The apic id and max threads per pkg come from cpuid(1).
961         //
962         kmp_cpuid buf;
963         __kmp_x86_cpuid(1, 0, &buf);
        if (! ((buf.edx >> 9) & 1)) {
965             __kmp_set_system_affinity(oldMask, TRUE);
966             __kmp_free(threadInfo);
967             KMP_CPU_FREE(oldMask);
968             *msg_id = kmp_i18n_str_ApicNotPresent;
969             return -1;
970         }
971         threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
972         threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
973         if (threadInfo[nApics].maxThreadsPerPkg == 0) {
974             threadInfo[nApics].maxThreadsPerPkg = 1;
975         }
976 
977         //
978         // Max cores per pkg comes from cpuid(4).
979         // 1 must be added to the encoded value.
980         //
981         // First, we need to check if cpuid(4) is supported on this chip.
982         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
983         // has the value n or greater.
984         //
985         __kmp_x86_cpuid(0, 0, &buf);
986         if (buf.eax >= 4) {
987             __kmp_x86_cpuid(4, 0, &buf);
988             threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
989         }
990         else {
991             threadInfo[nApics].maxCoresPerPkg = 1;
992         }
993 
994         //
995         // Infer the pkgId / coreId / threadId using only the info
996         // obtained locally.
997         //
998         int widthCT = __kmp_cpuid_mask_width(
999           threadInfo[nApics].maxThreadsPerPkg);
1000         threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
1001 
1002         int widthC = __kmp_cpuid_mask_width(
1003           threadInfo[nApics].maxCoresPerPkg);
1004         int widthT = widthCT - widthC;
1005         if (widthT < 0) {
1006             //
1007             // I've never seen this one happen, but I suppose it could, if
1008             // the cpuid instruction on a chip was really screwed up.
1009             // Make sure to restore the affinity mask before the tail call.
1010             //
1011             __kmp_set_system_affinity(oldMask, TRUE);
1012             __kmp_free(threadInfo);
1013             KMP_CPU_FREE(oldMask);
1014             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1015             return -1;
1016         }
1017 
        int maskC = (1 << widthC) - 1;
        threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
          & maskC;

        int maskT = (1 << widthT) - 1;
        threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;
1024 
1025         nApics++;
1026     }
1027 
1028     //
1029     // We've collected all the info we need.
1030     // Restore the old affinity mask for this thread.
1031     //
1032     __kmp_set_system_affinity(oldMask, TRUE);
1033 
1034     //
1035     // If there's only one thread context to bind to, form an Address object
1036     // with depth 1 and return immediately (or, if affinity is off, set
1037     // address2os to NULL and return).
1038     //
1039     // If it is configured to omit the package level when there is only a
1040     // single package, the logic at the end of this routine won't work if
1041     // there is only a single thread - it would try to form an Address
1042     // object with depth 0.
1043     //
1044     KMP_ASSERT(nApics > 0);
1045     if (nApics == 1) {
1046         __kmp_ncores = nPackages = 1;
1047         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1048         __kmp_ht_enabled = FALSE;
1049         if (__kmp_affinity_verbose) {
1050             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1051             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1052 
1053             KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1054             if (__kmp_affinity_respect_mask) {
1055                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1056             } else {
1057                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1058             }
1059             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1060             KMP_INFORM(Uniform, "KMP_AFFINITY");
1061             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1062               __kmp_nThreadsPerCore, __kmp_ncores);
1063         }
1064 
1065         if (__kmp_affinity_type == affinity_none) {
1066             __kmp_free(threadInfo);
1067             KMP_CPU_FREE(oldMask);
1068             return 0;
1069         }
1070 
1071         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
1072         Address addr(1);
1073         addr.labels[0] = threadInfo[0].pkgId;
1074         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
1075 
1076         if (__kmp_affinity_gran_levels < 0) {
1077             __kmp_affinity_gran_levels = 0;
1078         }
1079 
1080         if (__kmp_affinity_verbose) {
1081             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1082         }
1083 
1084         __kmp_free(threadInfo);
1085         KMP_CPU_FREE(oldMask);
1086         return 1;
1087     }
1088 
1089     //
1090     // Sort the threadInfo table by physical Id.
1091     //
1092     qsort(threadInfo, nApics, sizeof(*threadInfo),
1093       __kmp_affinity_cmp_apicThreadInfo_phys_id);
1094 
1095     //
1096     // The table is now sorted by pkgId / coreId / threadId, but we really
1097     // don't know the radix of any of the fields.  pkgId's may be sparsely
1098     // assigned among the chips on a system.  Although coreId's are usually
1099     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1100     // [0..threadsPerCore-1], we don't want to make any such assumptions.
1101     //
1102     // For that matter, we don't know what coresPerPkg and threadsPerCore
1103     // (or the total # packages) are at this point - we want to determine
1104     // that now.  We only have an upper bound on the first two figures.
1105     //
1106     // We also perform a consistency check at this point: the values returned
1107     // by the cpuid instruction for any thread bound to a given package had
1108     // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1109     //
1110     nPackages = 1;
1111     nCoresPerPkg = 1;
1112     __kmp_nThreadsPerCore = 1;
1113     unsigned nCores = 1;
1114 
1115     unsigned pkgCt = 1;                         // to determine radii
1116     unsigned lastPkgId = threadInfo[0].pkgId;
1117     unsigned coreCt = 1;
1118     unsigned lastCoreId = threadInfo[0].coreId;
1119     unsigned threadCt = 1;
1120     unsigned lastThreadId = threadInfo[0].threadId;
1121 
                                                // intra-pkg consistency checks
1123     unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1124     unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1125 
1126     for (i = 1; i < nApics; i++) {
1127         if (threadInfo[i].pkgId != lastPkgId) {
1128             nCores++;
1129             pkgCt++;
1130             lastPkgId = threadInfo[i].pkgId;
1131             if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1132             coreCt = 1;
1133             lastCoreId = threadInfo[i].coreId;
1134             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1135             threadCt = 1;
1136             lastThreadId = threadInfo[i].threadId;
1137 
1138             //
1139             // This is a different package, so go on to the next iteration
1140             // without doing any consistency checks.  Reset the consistency
1141             // check vars, though.
1142             //
1143             prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1144             prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1145             continue;
1146         }
1147 
1148         if (threadInfo[i].coreId != lastCoreId) {
1149             nCores++;
1150             coreCt++;
1151             lastCoreId = threadInfo[i].coreId;
1152             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1153             threadCt = 1;
1154             lastThreadId = threadInfo[i].threadId;
1155         }
1156         else if (threadInfo[i].threadId != lastThreadId) {
1157             threadCt++;
1158             lastThreadId = threadInfo[i].threadId;
1159         }
1160         else {
1161             __kmp_free(threadInfo);
1162             KMP_CPU_FREE(oldMask);
1163             *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1164             return -1;
1165         }
1166 
1167         //
1168         // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
        // fields agree between all the threads bound to a given package.
1170         //
1171         if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
1172           || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1173             __kmp_free(threadInfo);
1174             KMP_CPU_FREE(oldMask);
1175             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1176             return -1;
1177         }
1178     }
1179     nPackages = pkgCt;
1180     if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1181     if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1182 
1183     //
1184     // When affinity is off, this routine will still be called to set
1185     // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
1186     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1187     // correctly, and return now if affinity is not enabled.
1188     //
1189     __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);
1190     __kmp_ncores = nCores;
1191     if (__kmp_affinity_verbose) {
1192         char buf[KMP_AFFIN_MASK_PRINT_LEN];
1193         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1194 
1195         KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1196         if (__kmp_affinity_respect_mask) {
1197             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1198         } else {
1199             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1200         }
1201         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1202         if (__kmp_affinity_uniform_topology()) {
1203             KMP_INFORM(Uniform, "KMP_AFFINITY");
1204         } else {
1205             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1206         }
1207         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1208           __kmp_nThreadsPerCore, __kmp_ncores);
1209 
1210     }
1211 
1212     if (__kmp_affinity_type == affinity_none) {
1213         __kmp_free(threadInfo);
1214         KMP_CPU_FREE(oldMask);
1215         return 0;
1216     }
1217 
1218     //
1219     // Now that we've determined the number of packages, the number of cores
1220     // per package, and the number of threads per core, we can construct the
1221     // data structure that is to be returned.
1222     //
1223     int pkgLevel = 0;
1224     int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1225     int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1226     unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
1227 
1228     KMP_ASSERT(depth > 0);
1229     *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1230 
1231     for (i = 0; i < nApics; ++i) {
1232         Address addr(depth);
1233         unsigned os = threadInfo[i].osId;
1234         int d = 0;
1235 
1236         if (pkgLevel >= 0) {
1237             addr.labels[d++] = threadInfo[i].pkgId;
1238         }
1239         if (coreLevel >= 0) {
1240             addr.labels[d++] = threadInfo[i].coreId;
1241         }
1242         if (threadLevel >= 0) {
1243             addr.labels[d++] = threadInfo[i].threadId;
1244         }
1245         (*address2os)[i] = AddrUnsPair(addr, os);
1246     }
1247 
1248     if (__kmp_affinity_gran_levels < 0) {
1249         //
1250         // Set the granularity level based on what levels are modeled
1251         // in the machine topology map.
1252         //
1253         __kmp_affinity_gran_levels = 0;
1254         if ((threadLevel >= 0)
1255           && (__kmp_affinity_gran > affinity_gran_thread)) {
1256             __kmp_affinity_gran_levels++;
1257         }
1258         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1259             __kmp_affinity_gran_levels++;
1260         }
1261         if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1262             __kmp_affinity_gran_levels++;
1263         }
1264     }
1265 
1266     if (__kmp_affinity_verbose) {
1267         __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1268           coreLevel, threadLevel);
1269     }
1270 
1271     __kmp_free(threadInfo);
1272     KMP_CPU_FREE(oldMask);
1273     return depth;
1274 }
1275 
1276 
1277 //
1278 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1279 // architectures support a newer interface for specifying the x2APIC Ids,
1280 // based on cpuid leaf 11.
1281 //
1282 static int
1283 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1284   kmp_i18n_id_t *const msg_id)
1285 {
1286     kmp_cpuid buf;
1287 
1288     *address2os = NULL;
1289     *msg_id = kmp_i18n_null;
1290 
1291     //
1292     // Check to see if cpuid leaf 11 is supported.
1293     //
1294     __kmp_x86_cpuid(0, 0, &buf);
1295     if (buf.eax < 11) {
1296         *msg_id = kmp_i18n_str_NoLeaf11Support;
1297         return -1;
1298     }
1299     __kmp_x86_cpuid(11, 0, &buf);
1300     if (buf.ebx == 0) {
1301         *msg_id = kmp_i18n_str_NoLeaf11Support;
1302         return -1;
1303     }
1304 
1305     //
1306     // Find the number of levels in the machine topology.  While we're at it,
1307     // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.  We will
1308     // try to get more accurate values later by explicitly counting them,
1309     // but get reasonable defaults now, in case we return early.
1310     //
1311     int level;
1312     int threadLevel = -1;
1313     int coreLevel = -1;
1314     int pkgLevel = -1;
1315     __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
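
    // On typical hardware, cpuid(11) subleaf 0 reports the SMT level (kind 1)
    // and subleaf 1 the core level (kind 2); the enumeration ends at the
    // first subleaf that returns ebx == 0.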
1316 
1317     for (level = 0;; level++) {
1318         if (level > 31) {
1319             //
1320             // FIXME: Hack for DPD200163180
1321             //
1322             // If level is big then something went wrong -> exiting
1323             //
1324             // There could actually be 32 valid levels in the machine topology,
1325             // but so far, the only machine we have seen which does not exit
1326             // this loop before iteration 32 has fubar x2APIC settings.
1327             //
1328             // For now, just reject this case based upon loop trip count.
1329             //
1330             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1331             return -1;
1332         }
1333         __kmp_x86_cpuid(11, level, &buf);
1334         if (buf.ebx == 0) {
1335             if (pkgLevel < 0) {
1336                 //
1337                 // Will infer nPackages from __kmp_xproc
1338                 //
1339                 pkgLevel = level;
1340                 level++;
1341             }
1342             break;
1343         }
1344         int kind = (buf.ecx >> 8) & 0xff;
1345         if (kind == 1) {
1346             //
1347             // SMT level
1348             //
1349             threadLevel = level;
1350             coreLevel = -1;
1351             pkgLevel = -1;
1352             __kmp_nThreadsPerCore = buf.ebx & 0xff;
1353             if (__kmp_nThreadsPerCore == 0) {
1354                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1355                 return -1;
1356             }
1357         }
1358         else if (kind == 2) {
1359             //
1360             // core level
1361             //
1362             coreLevel = level;
1363             pkgLevel = -1;
1364             nCoresPerPkg = buf.ebx & 0xff;
1365             if (nCoresPerPkg == 0) {
1366                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1367                 return -1;
1368             }
1369         }
1370         else {
1371             if (level <= 0) {
1372                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1373                 return -1;
1374             }
1375             if (pkgLevel >= 0) {
1376                 continue;
1377             }
1378             pkgLevel = level;
1379             nPackages = buf.ebx & 0xff;
1380             if (nPackages == 0) {
1381                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1382                 return -1;
1383             }
1384         }
1385     }
1386     int depth = level;
1387 
1388     //
1389     // In the above loop, "level" was counted from the finest level (usually
1390     // thread) to the coarsest.  The caller expects that we will place the
1391     // labels in (*address2os)[].first.labels[] in the inverse order, so
1392     // we need to invert the vars saying which level means what.
1393     //
1394     if (threadLevel >= 0) {
1395         threadLevel = depth - threadLevel - 1;
1396     }
1397     if (coreLevel >= 0) {
1398         coreLevel = depth - coreLevel - 1;
1399     }
1400     KMP_DEBUG_ASSERT(pkgLevel >= 0);
1401     pkgLevel = depth - pkgLevel - 1;
1402 
1403     //
1404     // The algorithm used starts by setting the affinity to each available
1405     // thread and retrieving info from the cpuid instruction, so if we are not
    // capable of calling __kmp_get_system_affinity()/__kmp_set_system_affinity(),
1407     // then we need to do something else - use the defaults that we calculated
1408     // from issuing cpuid without binding to each proc.
1409     //
    if (! KMP_AFFINITY_CAPABLE()) {
1412         //
1413         // Hack to try and infer the machine topology using only the data
1414         // available from cpuid on the current thread, and __kmp_xproc.
1415         //
1416         KMP_ASSERT(__kmp_affinity_type == affinity_none);
1417 
1418         __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1419         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1420         __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);
1421         if (__kmp_affinity_verbose) {
1422             KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1423             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1424             if (__kmp_affinity_uniform_topology()) {
1425                 KMP_INFORM(Uniform, "KMP_AFFINITY");
1426             } else {
1427                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1428             }
1429             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1430               __kmp_nThreadsPerCore, __kmp_ncores);
1431         }
1432         return 0;
1433     }
1434 
1435     //
1436     //
1437     // From here on, we can assume that it is safe to call
1438     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1439     // even if __kmp_affinity_type = affinity_none.
1440     //
1441 
1442     //
1443     // Save the affinity mask for the current thread.
1444     //
1445     kmp_affin_mask_t *oldMask;
1446     KMP_CPU_ALLOC(oldMask);
1447     __kmp_get_system_affinity(oldMask, TRUE);
1448 
1449     //
1450     // Allocate the data structure to be returned.
1451     //
1452     AddrUnsPair *retval = (AddrUnsPair *)
1453       __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1454 
1455     //
1456     // Run through each of the available contexts, binding the current thread
1457     // to it, and obtaining the pertinent information using the cpuid instr.
1458     //
1459     unsigned int proc;
1460     int nApics = 0;
1461     for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
1462         //
1463         // Skip this proc if it is not included in the machine model.
1464         //
1465         if (! KMP_CPU_ISSET(proc, fullMask)) {
1466             continue;
1467         }
1468         KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1469 
1470         __kmp_affinity_bind_thread(proc);
1471 
1472         //
        // Extract the labels for each level in the machine topology map
1474         // from the Apic ID.
1475         //
1476         Address addr(depth);
1477         int prev_shift = 0;
1478 
1479         for (level = 0; level < depth; level++) {
1480             __kmp_x86_cpuid(11, level, &buf);
1481             unsigned apicId = buf.edx;
1482             if (buf.ebx == 0) {
1483                 if (level != depth - 1) {
1484                     KMP_CPU_FREE(oldMask);
1485                     *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1486                     return -1;
1487                 }
1488                 addr.labels[depth - level - 1] = apicId >> prev_shift;
1489                 level++;
1490                 break;
1491             }
1492             int shift = buf.eax & 0x1f;
1493             int mask = (1 << shift) - 1;
1494             addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1495             prev_shift = shift;
1496         }
1497         if (level != depth) {
1498             KMP_CPU_FREE(oldMask);
1499             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1500             return -1;
1501         }
1502 
1503         retval[nApics] = AddrUnsPair(addr, proc);
1504         nApics++;
1505     }
1506 
1507     //
1508     // We've collected all the info we need.
1509     // Restore the old affinity mask for this thread.
1510     //
1511     __kmp_set_system_affinity(oldMask, TRUE);
1512 
1513     //
1514     // If there's only one thread context to bind to, return now.
1515     //
1516     KMP_ASSERT(nApics > 0);
1517     if (nApics == 1) {
1518         __kmp_ncores = nPackages = 1;
1519         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1520         __kmp_ht_enabled = FALSE;
1521         if (__kmp_affinity_verbose) {
1522             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1523             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1524 
1525             KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1526             if (__kmp_affinity_respect_mask) {
1527                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1528             } else {
1529                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1530             }
1531             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1532             KMP_INFORM(Uniform, "KMP_AFFINITY");
1533             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1534               __kmp_nThreadsPerCore, __kmp_ncores);
1535         }
1536 
1537         if (__kmp_affinity_type == affinity_none) {
1538             __kmp_free(retval);
1539             KMP_CPU_FREE(oldMask);
1540             return 0;
1541         }
1542 
1543         //
1544         // Form an Address object which only includes the package level.
1545         //
1546         Address addr(1);
1547         addr.labels[0] = retval[0].first.labels[pkgLevel];
1548         retval[0].first = addr;
1549 
1550         if (__kmp_affinity_gran_levels < 0) {
1551             __kmp_affinity_gran_levels = 0;
1552         }
1553 
1554         if (__kmp_affinity_verbose) {
1555             __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1556         }
1557 
1558         *address2os = retval;
1559         KMP_CPU_FREE(oldMask);
1560         return 1;
1561     }
1562 
1563     //
1564     // Sort the table by physical Id.
1565     //
1566     qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1567 
1568     //
1569     // Find the radix at each of the levels.
1570     //
1571     unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1572     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1573     unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1574     unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1575     for (level = 0; level < depth; level++) {
1576         totals[level] = 1;
1577         maxCt[level] = 1;
1578         counts[level] = 1;
1579         last[level] = retval[0].first.labels[level];
1580     }
1581 
1582     //
1583     // From here on, the iteration variable "level" runs from the finest
1584     // level to the coarsest, i.e. we iterate forward through
1585     // (*address2os)[].first.labels[] - in the previous loops, we iterated
1586     // backwards.
1587     //
1588     for (proc = 1; (int)proc < nApics; proc++) {
1589         int level;
1590         for (level = 0; level < depth; level++) {
1591             if (retval[proc].first.labels[level] != last[level]) {
1592                 int j;
1593                 for (j = level + 1; j < depth; j++) {
1594                     totals[j]++;
1595                     counts[j] = 1;
                    // The line below would cause incorrect topology information
                    // to be printed whenever the maximum count for a level
                    // (maxCt[level]) appears earlier in the array than a smaller
                    // count.  For example, if pkg0 has 4 cores and pkg1 has 2
                    // cores, maxCt[1] would end up as 2 when it must be 4.
                    // TODO!!! Check whether it can safely stay commented out.
1602                     //maxCt[j] = 1;
1603                     last[j] = retval[proc].first.labels[j];
1604                 }
1605                 totals[level]++;
1606                 counts[level]++;
1607                 if (counts[level] > maxCt[level]) {
1608                     maxCt[level] = counts[level];
1609                 }
1610                 last[level] = retval[proc].first.labels[level];
1611                 break;
1612             }
1613             else if (level == depth - 1) {
1614                 __kmp_free(last);
1615                 __kmp_free(maxCt);
1616                 __kmp_free(counts);
1617                 __kmp_free(totals);
1618                 __kmp_free(retval);
1619                 KMP_CPU_FREE(oldMask);
1620                 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1621                 return -1;
1622             }
1623         }
1624     }
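
    //
    // Worked example (hypothetical labels, already sorted): with depth == 2
    // and per-proc labels (0,0) (0,1) (0,2) (0,3) (1,0) (1,1), the loop
    // above leaves totals[0] == 2 (packages), totals[1] == 6 (cores), and
    // maxCt[1] == 4 (the core count of the fullest package).
    //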
1625 
1626     //
1627     // When affinity is off, this routine will still be called to set
1628     // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
1629     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1630     // correctly, and return if affinity is not enabled.
1631     //
1632     if (threadLevel >= 0) {
1633         __kmp_nThreadsPerCore = maxCt[threadLevel];
1634     }
1635     else {
1636         __kmp_nThreadsPerCore = 1;
1637     }
1638     __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);
1639 
1640     nPackages = totals[pkgLevel];
1641 
1642     if (coreLevel >= 0) {
1643         __kmp_ncores = totals[coreLevel];
1644         nCoresPerPkg = maxCt[coreLevel];
1645     }
1646     else {
1647         __kmp_ncores = nPackages;
1648         nCoresPerPkg = 1;
1649     }
1650 
1651     //
1652     // Check to see if the machine topology is uniform
1653     //
1654     unsigned prod = maxCt[0];
1655     for (level = 1; level < depth; level++) {
1656        prod *= maxCt[level];
1657     }
1658     bool uniform = (prod == totals[level - 1]);
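
    //
    // E.g. (hypothetically), 2 packages x 4 cores x 2 threads is uniform
    // only if the finest-level total is the full 2 * 4 * 2 == 16; a missing
    // core or thread anywhere makes prod exceed that total.
    //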
1659 
1660     //
1661     // Print the machine topology summary.
1662     //
1663     if (__kmp_affinity_verbose) {
1664         char mask[KMP_AFFIN_MASK_PRINT_LEN];
1665         __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1666 
1667         KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1668         if (__kmp_affinity_respect_mask) {
1669             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1670         } else {
1671             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1672         }
1673         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1674         if (uniform) {
1675             KMP_INFORM(Uniform, "KMP_AFFINITY");
1676         } else {
1677             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1678         }
1679 
1680         kmp_str_buf_t buf;
1681         __kmp_str_buf_init(&buf);
1682 
1683         __kmp_str_buf_print(&buf, "%d", totals[0]);
1684         for (level = 1; level <= pkgLevel; level++) {
1685             __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1686         }
1687         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1688           __kmp_nThreadsPerCore, __kmp_ncores);
1689 
1690         __kmp_str_buf_free(&buf);
1691     }
1692 
1693     if (__kmp_affinity_type == affinity_none) {
1694         __kmp_free(last);
1695         __kmp_free(maxCt);
1696         __kmp_free(counts);
1697         __kmp_free(totals);
1698         __kmp_free(retval);
1699         KMP_CPU_FREE(oldMask);
1700         return 0;
1701     }
1702 
1703     //
    // Find any levels with radix 1, and remove them from the map
1705     // (except for the package level).
1706     //
1707     int new_depth = 0;
1708     for (level = 0; level < depth; level++) {
1709         if ((maxCt[level] == 1) && (level != pkgLevel)) {
1710            continue;
1711         }
1712         new_depth++;
1713     }
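
    //
    // E.g. (hypothetically), on a machine with one thread per core, the
    // thread level has maxCt == 1, so it is dropped and the map keeps only
    // the package and core levels.
    //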
1714 
1715     //
1716     // If we are removing any levels, allocate a new vector to return,
1717     // and copy the relevant information to it.
1718     //
1719     if (new_depth != depth) {
1720         AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1721           sizeof(AddrUnsPair) * nApics);
1722         for (proc = 0; (int)proc < nApics; proc++) {
1723             Address addr(new_depth);
1724             new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1725         }
1726         int new_level = 0;
1727         for (level = 0; level < depth; level++) {
1728             if ((maxCt[level] == 1) && (level != pkgLevel)) {
1729                if (level == threadLevel) {
1730                    threadLevel = -1;
1731                }
1732                else if ((threadLevel >= 0) && (level < threadLevel)) {
1733                    threadLevel--;
1734                }
1735                if (level == coreLevel) {
1736                    coreLevel = -1;
1737                }
1738                else if ((coreLevel >= 0) && (level < coreLevel)) {
1739                    coreLevel--;
1740                }
1741                if (level < pkgLevel) {
1742                    pkgLevel--;
1743                }
1744                continue;
1745             }
1746             for (proc = 0; (int)proc < nApics; proc++) {
1747                 new_retval[proc].first.labels[new_level]
1748                   = retval[proc].first.labels[level];
1749             }
1750             new_level++;
1751         }
1752 
1753         __kmp_free(retval);
1754         retval = new_retval;
1755         depth = new_depth;
1756     }
1757 
1758     if (__kmp_affinity_gran_levels < 0) {
1759         //
1760         // Set the granularity level based on what levels are modeled
1761         // in the machine topology map.
1762         //
1763         __kmp_affinity_gran_levels = 0;
1764         if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1765             __kmp_affinity_gran_levels++;
1766         }
1767         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1768             __kmp_affinity_gran_levels++;
1769         }
1770         if (__kmp_affinity_gran > affinity_gran_package) {
1771             __kmp_affinity_gran_levels++;
1772         }
1773     }
1774 
1775     if (__kmp_affinity_verbose) {
1776         __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1777           coreLevel, threadLevel);
1778     }
1779 
1780     __kmp_free(last);
1781     __kmp_free(maxCt);
1782     __kmp_free(counts);
1783     __kmp_free(totals);
1784     KMP_CPU_FREE(oldMask);
1785     *address2os = retval;
1786     return depth;
1787 }
1788 
1789 
1790 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1791 
1792 
1793 #define osIdIndex       0
1794 #define threadIdIndex   1
1795 #define coreIdIndex     2
1796 #define pkgIdIndex      3
1797 #define nodeIdIndex     4
1798 
1799 typedef unsigned *ProcCpuInfo;
1800 static unsigned maxIndex = pkgIdIndex;
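
//
// Each threadInfo[i] record below is a flat array of ids indexed by the
// constants above: [osId, threadId, coreId, pkgId, node0Id, ...].
// maxIndex grows past pkgIdIndex only if node_<n> fields are present.
//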
1801 
1802 
1803 static int
1804 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1805 {
1806     const unsigned *aa = (const unsigned *)a;
1807     const unsigned *bb = (const unsigned *)b;
1808     if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1809     if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1810     return 0;
}
1812 
1813 
1814 static int
1815 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1816 {
1817     unsigned i;
1818     const unsigned *aa = *((const unsigned **)a);
1819     const unsigned *bb = *((const unsigned **)b);
1820     for (i = maxIndex; ; i--) {
1821         if (aa[i] < bb[i]) return -1;
1822         if (aa[i] > bb[i]) return 1;
1823         if (i == osIdIndex) break;
1824     }
1825     return 0;
1826 }
1827 
1828 
1829 //
1830 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1831 // affinity map.
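//
// A typical record (illustrative) looks like:
//
//     processor       : 0
//     physical id     : 0
//     core id         : 0
//
// with a blank line terminating each processor record.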
1832 //
1833 static int
1834 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1835   kmp_i18n_id_t *const msg_id, FILE *f)
1836 {
1837     *address2os = NULL;
1838     *msg_id = kmp_i18n_null;
1839 
1840     //
    // Scan the file, counting the number of "processor" (osId) fields,
    // and find the highest value of <n> for any node_<n> field.
1843     //
1844     char buf[256];
1845     unsigned num_records = 0;
1846     while (! feof(f)) {
1847         buf[sizeof(buf) - 1] = 1;
1848         if (! fgets(buf, sizeof(buf), f)) {
1849             //
            // A read error presumably means we hit EOF.
1851             //
1852             break;
1853         }
1854 
1855         char s1[] = "processor";
1856         if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1857             num_records++;
1858             continue;
1859         }
1860 
1861         //
1862         // FIXME - this will match "node_<n> <garbage>"
1863         //
1864         unsigned level;
        if (sscanf(buf, "node_%u id", &level) == 1) {
1866             if (nodeIdIndex + level >= maxIndex) {
1867                 maxIndex = nodeIdIndex + level;
1868             }
1869             continue;
1870         }
1871     }
1872 
1873     //
1874     // Check for empty file / no valid processor records, or too many.
1875     // The number of records can't exceed the number of valid bits in the
1876     // affinity mask.
1877     //
1878     if (num_records == 0) {
1879         *line = 0;
1880         *msg_id = kmp_i18n_str_NoProcRecords;
1881         return -1;
1882     }
1883     if (num_records > (unsigned)__kmp_xproc) {
1884         *line = 0;
1885         *msg_id = kmp_i18n_str_TooManyProcRecords;
1886         return -1;
1887     }
1888 
1889     //
    // Set the file pointer back to the beginning, so that we can scan the
    // file again, this time performing a full parse of the data.
    // Allocate a vector of ProcCpuInfo objects, where we will place the data.
1893     // Adding an extra element at the end allows us to remove a lot of extra
1894     // checks for termination conditions.
1895     //
1896     if (fseek(f, 0, SEEK_SET) != 0) {
1897         *line = 0;
1898         *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1899         return -1;
1900     }
1901 
1902     //
1903     // Allocate the array of records to store the proc info in.  The dummy
1904     // element at the end makes the logic in filling them out easier to code.
1905     //
1906     unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1907       * sizeof(unsigned *));
1908     unsigned i;
1909     for (i = 0; i <= num_records; i++) {
1910         threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1911           * sizeof(unsigned));
1912     }
1913 
1914 #define CLEANUP_THREAD_INFO \
1915     for (i = 0; i <= num_records; i++) {                                \
1916         __kmp_free(threadInfo[i]);                                      \
1917     }                                                                   \
1918     __kmp_free(threadInfo);
1919 
1920     //
1921     // A value of UINT_MAX means that we didn't find the field
1922     //
1923     unsigned __index;
1924 
1925 #define INIT_PROC_INFO(p) \
1926     for (__index = 0; __index <= maxIndex; __index++) {                 \
1927         (p)[__index] = UINT_MAX;                                        \
1928     }
1929 
1930     for (i = 0; i <= num_records; i++) {
1931         INIT_PROC_INFO(threadInfo[i]);
1932     }
1933 
1934     unsigned num_avail = 0;
1935     *line = 0;
1936     while (! feof(f)) {
1937         //
1938         // Create an inner scoping level, so that all the goto targets at the
1939         // end of the loop appear in an outer scoping level.  This avoids
1940         // warnings about jumping past an initialization to a target in the
1941         // same block.
1942         //
1943         {
1944             buf[sizeof(buf) - 1] = 1;
1945             bool long_line = false;
1946             if (! fgets(buf, sizeof(buf), f)) {
1947                 //
                // A read error presumably means we hit EOF.
                //
                // If there is valid data in threadInfo[num_avail], then fake
                // a blank line to ensure that the last address gets parsed.
1952                 //
1953                 bool valid = false;
1954                 for (i = 0; i <= maxIndex; i++) {
1955                     if (threadInfo[num_avail][i] != UINT_MAX) {
1956                         valid = true;
1957                     }
1958                 }
1959                 if (! valid) {
1960                     break;
1961                 }
1962                 buf[0] = 0;
1963             } else if (!buf[sizeof(buf) - 1]) {
1964                 //
1965                 // The line is longer than the buffer.  Set a flag and don't
1966                 // emit an error if we were going to ignore the line, anyway.
1967                 //
1968                 long_line = true;
1969 
1970 #define CHECK_LINE \
1971     if (long_line) {                                                    \
1972         CLEANUP_THREAD_INFO;                                            \
1973         *msg_id = kmp_i18n_str_LongLineCpuinfo;                         \
1974         return -1;                                                      \
1975     }
1976             }
1977             (*line)++;
1978 
1979             char s1[] = "processor";
1980             if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1981                 CHECK_LINE;
1982                 char *p = strchr(buf + sizeof(s1) - 1, ':');
1983                 unsigned val;
1984                 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1985                 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
1986                 threadInfo[num_avail][osIdIndex] = val;
1987 #if KMP_OS_LINUX && USE_SYSFS_INFO
1988                 char path[256];
1989                 snprintf(path, sizeof(path),
1990                     "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
1991                     threadInfo[num_avail][osIdIndex]);
1992                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
1993 
1994                 snprintf(path, sizeof(path),
1995                     "/sys/devices/system/cpu/cpu%u/topology/core_id",
1996                     threadInfo[num_avail][osIdIndex]);
1997                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
1998                 continue;
1999 #else
2000             }
2001             char s2[] = "physical id";
2002             if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
2003                 CHECK_LINE;
2004                 char *p = strchr(buf + sizeof(s2) - 1, ':');
2005                 unsigned val;
2006                 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2007                 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
2008                 threadInfo[num_avail][pkgIdIndex] = val;
2009                 continue;
2010             }
2011             char s3[] = "core id";
2012             if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2013                 CHECK_LINE;
2014                 char *p = strchr(buf + sizeof(s3) - 1, ':');
2015                 unsigned val;
2016                 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2017                 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
2018                 threadInfo[num_avail][coreIdIndex] = val;
2019                 continue;
2020 #endif // KMP_OS_LINUX && USE_SYSFS_INFO
2021             }
2022             char s4[] = "thread id";
2023             if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2024                 CHECK_LINE;
2025                 char *p = strchr(buf + sizeof(s4) - 1, ':');
2026                 unsigned val;
2027                 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2028                 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
2029                 threadInfo[num_avail][threadIdIndex] = val;
2030                 continue;
2031             }
2032             unsigned level;
            if (sscanf(buf, "node_%u id", &level) == 1) {
2034                 CHECK_LINE;
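                // Note: sizeof(s4) is reused here only as a starting offset;
                // strchr still scans forward to the ':' after "node_<n> id".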
2035                 char *p = strchr(buf + sizeof(s4) - 1, ':');
2036                 unsigned val;
2037                 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2038                 KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2039                 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
2040                 threadInfo[num_avail][nodeIdIndex + level] = val;
2041                 continue;
2042             }
2043 
2044             //
2045             // We didn't recognize the leading token on the line.
2046             // There are lots of leading tokens that we don't recognize -
2047             // if the line isn't empty, go on to the next line.
2048             //
2049             if ((*buf != 0) && (*buf != '\n')) {
2050                 //
2051                 // If the line is longer than the buffer, read characters
2052                 // until we find a newline.
2053                 //
2054                 if (long_line) {
2055                     int ch;
2056                     while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
2057                 }
2058                 continue;
2059             }
2060 
2061             //
2062             // A newline has signalled the end of the processor record.
2063             // Check that there aren't too many procs specified.
2064             //
2065             if ((int)num_avail == __kmp_xproc) {
2066                 CLEANUP_THREAD_INFO;
2067                 *msg_id = kmp_i18n_str_TooManyEntries;
2068                 return -1;
2069             }
2070 
2071             //
2072             // Check for missing fields.  The osId field must be there, and we
2073             // currently require that the physical id field is specified, also.
2074             //
2075             if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2076                 CLEANUP_THREAD_INFO;
2077                 *msg_id = kmp_i18n_str_MissingProcField;
2078                 return -1;
2079             }
            if (threadInfo[num_avail][pkgIdIndex] == UINT_MAX) {
2081                 CLEANUP_THREAD_INFO;
2082                 *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2083                 return -1;
2084             }
2085 
2086             //
2087             // Skip this proc if it is not included in the machine model.
2088             //
2089             if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
2090                 INIT_PROC_INFO(threadInfo[num_avail]);
2091                 continue;
2092             }
2093 
2094             //
2095             // We have a successful parse of this proc's info.
2096             // Increment the counter, and prepare for the next proc.
2097             //
2098             num_avail++;
2099             KMP_ASSERT(num_avail <= num_records);
2100             INIT_PROC_INFO(threadInfo[num_avail]);
2101         }
2102         continue;
2103 
2104         no_val:
2105         CLEANUP_THREAD_INFO;
2106         *msg_id = kmp_i18n_str_MissingValCpuinfo;
2107         return -1;
2108 
2109         dup_field:
2110         CLEANUP_THREAD_INFO;
2111         *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2112         return -1;
2113     }
2114     *line = 0;
2115 
2116 # if KMP_MIC && REDUCE_TEAM_SIZE
2117     unsigned teamSize = 0;
2118 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2119 
    // TODO: should we check that num_records == __kmp_xproc?
2121 
2122     //
2123     // If there's only one thread context to bind to, form an Address object
2124     // with depth 1 and return immediately (or, if affinity is off, set
2125     // address2os to NULL and return).
2126     //
2127     // If it is configured to omit the package level when there is only a
2128     // single package, the logic at the end of this routine won't work if
2129     // there is only a single thread - it would try to form an Address
2130     // object with depth 0.
2131     //
2132     KMP_ASSERT(num_avail > 0);
2133     KMP_ASSERT(num_avail <= num_records);
2134     if (num_avail == 1) {
2135         __kmp_ncores = 1;
2136         __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2137         __kmp_ht_enabled = FALSE;
2138         if (__kmp_affinity_verbose) {
2139             if (! KMP_AFFINITY_CAPABLE()) {
2140                 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2141                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2142                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2143             }
2144             else {
2145                 char buf[KMP_AFFIN_MASK_PRINT_LEN];
2146                 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2147                   fullMask);
2148                 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2149                 if (__kmp_affinity_respect_mask) {
2150                     KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2151                 } else {
2152                     KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2153                 }
2154                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2155                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2156             }
2157             int index;
2158             kmp_str_buf_t buf;
2159             __kmp_str_buf_init(&buf);
2160             __kmp_str_buf_print(&buf, "1");
2161             for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2162                 __kmp_str_buf_print(&buf, " x 1");
2163             }
2164             KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2165             __kmp_str_buf_free(&buf);
2166         }
2167 
2168         if (__kmp_affinity_type == affinity_none) {
2169             CLEANUP_THREAD_INFO;
2170             return 0;
2171         }
2172 
2173         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2174         Address addr(1);
2175         addr.labels[0] = threadInfo[0][pkgIdIndex];
2176         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2177 
2178         if (__kmp_affinity_gran_levels < 0) {
2179             __kmp_affinity_gran_levels = 0;
2180         }
2181 
2182         if (__kmp_affinity_verbose) {
2183             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2184         }
2185 
2186         CLEANUP_THREAD_INFO;
2187         return 1;
2188     }
2189 
2190     //
2191     // Sort the threadInfo table by physical Id.
2192     //
2193     qsort(threadInfo, num_avail, sizeof(*threadInfo),
2194       __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2195 
2196     //
2197     // The table is now sorted by pkgId / coreId / threadId, but we really
2198     // don't know the radix of any of the fields.  pkgId's may be sparsely
2199     // assigned among the chips on a system.  Although coreId's are usually
2200     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2201     // [0..threadsPerCore-1], we don't want to make any such assumptions.
2202     //
2203     // For that matter, we don't know what coresPerPkg and threadsPerCore
2204     // (or the total # packages) are at this point - we want to determine
2205     // that now.  We only have an upper bound on the first two figures.
2206     //
2207     unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2208       * sizeof(unsigned));
2209     unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2210       * sizeof(unsigned));
2211     unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2212       * sizeof(unsigned));
2213     unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2214       * sizeof(unsigned));
2215 
2216     bool assign_thread_ids = false;
2217     unsigned threadIdCt;
2218     unsigned index;
2219 
2220     restart_radix_check:
2221     threadIdCt = 0;
2222 
2223     //
2224     // Initialize the counter arrays with data from threadInfo[0].
2225     //
2226     if (assign_thread_ids) {
2227         if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2228             threadInfo[0][threadIdIndex] = threadIdCt++;
2229         }
2230         else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2231             threadIdCt = threadInfo[0][threadIdIndex] + 1;
2232         }
2233     }
2234     for (index = 0; index <= maxIndex; index++) {
2235         counts[index] = 1;
2236         maxCt[index] = 1;
2237         totals[index] = 1;
        lastId[index] = threadInfo[0][index];
2239     }
2240 
2241     //
2242     // Run through the rest of the OS procs.
2243     //
2244     for (i = 1; i < num_avail; i++) {
2245         //
2246         // Find the most significant index whose id differs
2247         // from the id for the previous OS proc.
2248         //
2249         for (index = maxIndex; index >= threadIdIndex; index--) {
2250             if (assign_thread_ids && (index == threadIdIndex)) {
2251                 //
2252                 // Auto-assign the thread id field if it wasn't specified.
2253                 //
2254                 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2255                     threadInfo[i][threadIdIndex] = threadIdCt++;
2256                 }
2257 
2258                 //
                // Apparently the thread id field was specified for some
2260                 // entries and not others.  Start the thread id counter
2261                 // off at the next higher thread id.
2262                 //
2263                 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2264                     threadIdCt = threadInfo[i][threadIdIndex] + 1;
2265                 }
2266             }
2267             if (threadInfo[i][index] != lastId[index]) {
2268                 //
2269                 // Run through all indices which are less significant,
2270                 // and reset the counts to 1.
2271                 //
2272                 // At all levels up to and including index, we need to
2273                 // increment the totals and record the last id.
2274                 //
2275                 unsigned index2;
2276                 for (index2 = threadIdIndex; index2 < index; index2++) {
2277                     totals[index2]++;
2278                     if (counts[index2] > maxCt[index2]) {
2279                         maxCt[index2] = counts[index2];
2280                     }
2281                     counts[index2] = 1;
2282                     lastId[index2] = threadInfo[i][index2];
2283                 }
2284                 counts[index]++;
2285                 totals[index]++;
2286                 lastId[index] = threadInfo[i][index];
2287 
2288                 if (assign_thread_ids && (index > threadIdIndex)) {
2289 
2290 # if KMP_MIC && REDUCE_TEAM_SIZE
2291                     //
2292                     // The default team size is the total #threads in the machine
2293                     // minus 1 thread for every core that has 3 or more threads.
2294                     //
2295                     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2296 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2297 
2298                     //
2299                     // Restart the thread counter, as we are on a new core.
2300                     //
2301                     threadIdCt = 0;
2302 
2303                     //
2304                     // Auto-assign the thread id field if it wasn't specified.
2305                     //
2306                     if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2307                         threadInfo[i][threadIdIndex] = threadIdCt++;
2308                     }
2309 
2310                     //
                    // Apparently the thread id field was specified for some
2312                     // entries and not others.  Start the thread id counter
2313                     // off at the next higher thread id.
2314                     //
2315                     else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2316                         threadIdCt = threadInfo[i][threadIdIndex] + 1;
2317                     }
2318                 }
2319                 break;
2320             }
2321         }
2322         if (index < threadIdIndex) {
2323             //
            // If thread ids were specified, it is an error if they are not
            // unique.  Also, check that we haven't already restarted the
            // loop (to be safe - we shouldn't need to).
2327             //
2328             if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2329               || assign_thread_ids) {
2330                 __kmp_free(lastId);
2331                 __kmp_free(totals);
2332                 __kmp_free(maxCt);
2333                 __kmp_free(counts);
2334                 CLEANUP_THREAD_INFO;
2335                 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2336                 return -1;
2337             }
2338 
2339             //
            // If the thread ids were not specified and we see entries
            // that are duplicates, start the loop over and
            // assign the thread ids manually.
2343             //
2344             assign_thread_ids = true;
2345             goto restart_radix_check;
2346         }
2347     }
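
    //
    // At this point (illustrative): if the file omitted "thread id" fields,
    // the loop above has restarted once with assign_thread_ids set, and the
    // threads on each core are numbered 0, 1, ... in file order.
    //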
2348 
2349 # if KMP_MIC && REDUCE_TEAM_SIZE
2350     //
2351     // The default team size is the total #threads in the machine
2352     // minus 1 thread for every core that has 3 or more threads.
2353     //
2354     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2355 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2356 
2357     for (index = threadIdIndex; index <= maxIndex; index++) {
2358         if (counts[index] > maxCt[index]) {
2359             maxCt[index] = counts[index];
2360         }
2361     }
2362 
2363     __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2364     nCoresPerPkg = maxCt[coreIdIndex];
2365     nPackages = totals[pkgIdIndex];
2366 
2367     //
2368     // Check to see if the machine topology is uniform
2369     //
2370     unsigned prod = totals[maxIndex];
2371     for (index = threadIdIndex; index < maxIndex; index++) {
2372        prod *= maxCt[index];
2373     }
2374     bool uniform = (prod == totals[threadIdIndex]);
2375 
2376     //
2377     // When affinity is off, this routine will still be called to set
2378     // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
2379     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
2380     // correctly, and return now if affinity is not enabled.
2381     //
2382     __kmp_ht_enabled = (maxCt[threadIdIndex] > 1); // threads per core > 1
2383     __kmp_ncores = totals[coreIdIndex];
2384 
2385     if (__kmp_affinity_verbose) {
        if (! KMP_AFFINITY_CAPABLE()) {
            KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (uniform) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
        }
        else {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
            KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (uniform) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
        }
2411         kmp_str_buf_t buf;
2412         __kmp_str_buf_init(&buf);
2413 
2414         __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2415         for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2416             __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2417         }
2418         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str,  maxCt[coreIdIndex],
2419           maxCt[threadIdIndex], __kmp_ncores);
2420 
2421         __kmp_str_buf_free(&buf);
2422     }
2423 
2424 # if KMP_MIC && REDUCE_TEAM_SIZE
2425     //
2426     // Set the default team size.
2427     //
2428     if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2429         __kmp_dflt_team_nth = teamSize;
2430         KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2431           __kmp_dflt_team_nth));
2432     }
2433 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2434 
2435     if (__kmp_affinity_type == affinity_none) {
2436         __kmp_free(lastId);
2437         __kmp_free(totals);
2438         __kmp_free(maxCt);
2439         __kmp_free(counts);
2440         CLEANUP_THREAD_INFO;
2441         return 0;
2442     }
2443 
2444     //
2445     // Count the number of levels which have more nodes at that level than
2446     // at the parent's level (with there being an implicit root node of
2447     // the top level).  This is equivalent to saying that there is at least
2448     // one node at this level which has a sibling.  These levels are in the
2449     // map, and the package level is always in the map.
2450     //
2451     bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2452     int level = 0;
2453     for (index = threadIdIndex; index < maxIndex; index++) {
2454         KMP_ASSERT(totals[index] >= totals[index + 1]);
2455         inMap[index] = (totals[index] > totals[index + 1]);
2456     }
2457     inMap[maxIndex] = (totals[maxIndex] > 1);
2458     inMap[pkgIdIndex] = true;
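
    //
    // E.g. (hypothetically), with totals of 8 threads, 8 cores, and 2
    // packages, the thread level adds no nodes over the core level
    // (8 == 8), so inMap[threadIdIndex] is false and depth becomes 2.
    //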
2459 
2460     int depth = 0;
2461     for (index = threadIdIndex; index <= maxIndex; index++) {
2462         if (inMap[index]) {
2463             depth++;
2464         }
2465     }
2466     KMP_ASSERT(depth > 0);
2467 
2468     //
2469     // Construct the data structure that is to be returned.
2470     //
2471     *address2os = (AddrUnsPair*)
2472       __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2473     int pkgLevel = -1;
2474     int coreLevel = -1;
2475     int threadLevel = -1;
2476 
2477     for (i = 0; i < num_avail; ++i) {
2478         Address addr(depth);
2479         unsigned os = threadInfo[i][osIdIndex];
2480         int src_index;
2481         int dst_index = 0;
2482 
2483         for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2484             if (! inMap[src_index]) {
2485                 continue;
2486             }
2487             addr.labels[dst_index] = threadInfo[i][src_index];
2488             if (src_index == pkgIdIndex) {
2489                 pkgLevel = dst_index;
2490             }
2491             else if (src_index == coreIdIndex) {
2492                 coreLevel = dst_index;
2493             }
2494             else if (src_index == threadIdIndex) {
2495                 threadLevel = dst_index;
2496             }
2497             dst_index++;
2498         }
2499         (*address2os)[i] = AddrUnsPair(addr, os);
2500     }
2501 
2502     if (__kmp_affinity_gran_levels < 0) {
2503         //
2504         // Set the granularity level based on what levels are modeled
2505         // in the machine topology map.
2506         //
2507         unsigned src_index;
2508         __kmp_affinity_gran_levels = 0;
2509         for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2510             if (! inMap[src_index]) {
2511                 continue;
2512             }
            switch (src_index) {
            case threadIdIndex:
                if (__kmp_affinity_gran > affinity_gran_thread) {
                    __kmp_affinity_gran_levels++;
                }
                break;

            case coreIdIndex:
                if (__kmp_affinity_gran > affinity_gran_core) {
                    __kmp_affinity_gran_levels++;
                }
                break;

            case pkgIdIndex:
                if (__kmp_affinity_gran > affinity_gran_package) {
                    __kmp_affinity_gran_levels++;
                }
                break;
            }
2532         }
2533     }
2534 
2535     if (__kmp_affinity_verbose) {
2536         __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2537           coreLevel, threadLevel);
2538     }
2539 
2540     __kmp_free(inMap);
2541     __kmp_free(lastId);
2542     __kmp_free(totals);
2543     __kmp_free(maxCt);
2544     __kmp_free(counts);
2545     CLEANUP_THREAD_INFO;
2546     return depth;
2547 }
2548 
2549 
2550 //
2551 // Create and return a table of affinity masks, indexed by OS thread ID.
2552 // This routine handles OR'ing together all the affinity masks of threads
2553 // that are sufficiently close, if granularity > fine.
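//
// E.g. (hypothetically), with granularity=core on a machine with 2 threads
// per core, OS procs 0 and 1 share a core, so both of their entries in the
// returned table hold the mask {0,1}.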
2554 //
2555 static kmp_affin_mask_t *
2556 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2557   AddrUnsPair *address2os, unsigned numAddrs)
2558 {
2559     //
2560     // First form a table of affinity masks in order of OS thread id.
2561     //
2562     unsigned depth;
2563     unsigned maxOsId;
2564     unsigned i;
2565 
2566     KMP_ASSERT(numAddrs > 0);
2567     depth = address2os[0].first.depth;
2568 
2569     maxOsId = 0;
2570     for (i = 0; i < numAddrs; i++) {
2571         unsigned osId = address2os[i].second;
2572         if (osId > maxOsId) {
2573             maxOsId = osId;
2574         }
2575     }
2576     kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
2577       (maxOsId + 1) * __kmp_affin_mask_size);
2578 
2579     //
2580     // Sort the address2os table according to physical order.  Doing so
2581     // will put all threads on the same core/package/node in consecutive
2582     // locations.
2583     //
2584     qsort(address2os, numAddrs, sizeof(*address2os),
2585       __kmp_affinity_cmp_Address_labels);
2586 
2587     KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2588     if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2589         KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY",  __kmp_affinity_gran_levels);
2590     }
2591     if (__kmp_affinity_gran_levels >= (int)depth) {
2592         if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2593           && (__kmp_affinity_type != affinity_none))) {
2594             KMP_WARNING(AffThreadsMayMigrate);
2595         }
2596     }
2597 
2598     //
2599     // Run through the table, forming the masks for all threads on each
2600     // core.  Threads on the same core will have identical "Address"
2601     // objects, not considering the last level, which must be the thread
2602     // id.  All threads on a core will appear consecutively.
2603     //
2604     unsigned unique = 0;
2605     unsigned j = 0;                             // index of 1st thread on core
2606     unsigned leader = 0;
2607     Address *leaderAddr = &(address2os[0].first);
2608     kmp_affin_mask_t *sum
2609       = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
2610     KMP_CPU_ZERO(sum);
2611     KMP_CPU_SET(address2os[0].second, sum);
2612     for (i = 1; i < numAddrs; i++) {
2613         //
2614         // If this thread is sufficiently close to the leader (within the
2615         // granularity setting), then set the bit for this os thread in the
2616         // affinity mask for this group, and go on to the next thread.
2617         //
2618         if (leaderAddr->isClose(address2os[i].first,
2619           __kmp_affinity_gran_levels)) {
2620             KMP_CPU_SET(address2os[i].second, sum);
2621             continue;
2622         }
2623 
2624         //
2625         // For every thread in this group, copy the mask to the thread's
2626         // entry in the osId2Mask table.  Mark the first address as a
2627         // leader.
2628         //
2629         for (; j < i; j++) {
2630             unsigned osId = address2os[j].second;
2631             KMP_DEBUG_ASSERT(osId <= maxOsId);
2632             kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2633             KMP_CPU_COPY(mask, sum);
2634             address2os[j].first.leader = (j == leader);
2635         }
2636         unique++;
2637 
2638         //
2639         // Start a new mask.
2640         //
2641         leader = i;
2642         leaderAddr = &(address2os[i].first);
2643         KMP_CPU_ZERO(sum);
2644         KMP_CPU_SET(address2os[i].second, sum);
2645     }
2646 
2647     //
2648     // For every thread in last group, copy the mask to the thread's
2649     // entry in the osId2Mask table.
2650     //
2651     for (; j < i; j++) {
2652         unsigned osId = address2os[j].second;
2653         KMP_DEBUG_ASSERT(osId <= maxOsId);
2654         kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2655         KMP_CPU_COPY(mask, sum);
2656         address2os[j].first.leader = (j == leader);
2657     }
2658     unique++;
2659 
2660     *maxIndex = maxOsId;
2661     *numUnique = unique;
2662     return osId2Mask;
2663 }
2664 
2665 
2666 //
2667 // Stuff for the affinity proclist parsers.  It's easier to declare these vars
// as file-static than to try to pass them through the calling sequence of
2669 // the recursive-descent OMP_PLACES parser.
2670 //
2671 static kmp_affin_mask_t *newMasks;
2672 static int numNewMasks;
2673 static int nextNewMask;
2674 
2675 #define ADD_MASK(_mask) \
2676     {                                                                   \
2677         if (nextNewMask >= numNewMasks) {                               \
2678             numNewMasks *= 2;                                           \
2679             newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
2680               numNewMasks * __kmp_affin_mask_size);                     \
2681         }                                                               \
2682         KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));    \
2683         nextNewMask++;                                                  \
2684     }
2685 
2686 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2687     {                                                                   \
2688         if (((_osId) > _maxOsId) ||                                     \
2689           (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
2690             if (__kmp_affinity_verbose || (__kmp_affinity_warnings      \
2691               && (__kmp_affinity_type != affinity_none))) {             \
2692                 KMP_WARNING(AffIgnoreInvalidProcID, _osId);             \
2693             }                                                           \
2694         }                                                               \
2695         else {                                                          \
2696             ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));               \
2697         }                                                               \
2698     }
2699 
2700 
2701 //
2702 // Re-parse the proclist (for the explicit affinity type), and form the list
2703 // of affinity newMasks indexed by gtid.
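//
// E.g. (hypothetically), a proclist of "0,2-6:2,{8,9}" yields the masks
// {0}, {2}, {4}, {6}, and {8,9}, in that order.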
2704 //
2705 static void
2706 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2707   unsigned int *out_numMasks, const char *proclist,
2708   kmp_affin_mask_t *osId2Mask, int maxOsId)
2709 {
2710     const char *scan = proclist;
2711     const char *next = proclist;
2712 
2713     //
2714     // We use malloc() for the temporary mask vector,
2715     // so that we can use realloc() to extend it.
2716     //
2717     numNewMasks = 2;
2718     newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2719       * __kmp_affin_mask_size);
2720     nextNewMask = 0;
2721     kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
2722       __kmp_affin_mask_size);
2723     int setSize = 0;
2724 
2725     for (;;) {
2726         int start, end, stride;
2727 
2728         SKIP_WS(scan);
2729         next = scan;
2730         if (*next == '\0') {
2731             break;
2732         }
2733 
2734         if (*next == '{') {
2735             int num;
2736             setSize = 0;
2737             next++;     // skip '{'
2738             SKIP_WS(next);
2739             scan = next;
2740 
2741             //
2742             // Read the first integer in the set.
2743             //
            KMP_ASSERT2((*next >= '0') && (*next <= '9'),
              "bad explicit proc list");
2746             SKIP_DIGITS(next);
2747             num = __kmp_str_to_int(scan, *next);
2748             KMP_ASSERT2(num >= 0, "bad explicit proc list");
2749 
2750             //
2751             // Copy the mask for that osId to the sum (union) mask.
2752             //
2753             if ((num > maxOsId) ||
2754               (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2755                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2756                   && (__kmp_affinity_type != affinity_none))) {
2757                     KMP_WARNING(AffIgnoreInvalidProcID, num);
2758                 }
2759                 KMP_CPU_ZERO(sumMask);
2760             }
2761             else {
2762                 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2763                 setSize = 1;
2764             }
2765 
2766             for (;;) {
2767                 //
2768                 // Check for end of set.
2769                 //
2770                 SKIP_WS(next);
2771                 if (*next == '}') {
2772                     next++;     // skip '}'
2773                     break;
2774                 }
2775 
2776                 //
2777                 // Skip optional comma.
2778                 //
2779                 if (*next == ',') {
2780                     next++;
2781                 }
2782                 SKIP_WS(next);
2783 
2784                 //
2785                 // Read the next integer in the set.
2786                 //
2787                 scan = next;
2788                 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2789                   "bad explicit proc list");
2790 
2791                 SKIP_DIGITS(next);
2792                 num = __kmp_str_to_int(scan, *next);
2793                 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2794 
2795                 //
2796                 // Add the mask for that osId to the sum mask.
2797                 //
2798                 if ((num > maxOsId) ||
2799                   (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2800                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2801                       && (__kmp_affinity_type != affinity_none))) {
2802                         KMP_WARNING(AffIgnoreInvalidProcID, num);
2803                     }
2804                 }
2805                 else {
2806                     KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2807                     setSize++;
2808                 }
2809             }
2810             if (setSize > 0) {
2811                 ADD_MASK(sumMask);
2812             }
2813 
2814             SKIP_WS(next);
2815             if (*next == ',') {
2816                 next++;
2817             }
2818             scan = next;
2819             continue;
2820         }
2821 
2822         //
2823         // Read the first integer.
2824         //
2825         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2826         SKIP_DIGITS(next);
2827         start = __kmp_str_to_int(scan, *next);
2828         KMP_ASSERT2(start >= 0, "bad explicit proc list");
2829         SKIP_WS(next);
2830 
2831         //
2832         // If this isn't a range, then add a mask to the list and go on.
2833         //
2834         if (*next != '-') {
2835             ADD_MASK_OSID(start, osId2Mask, maxOsId);
2836 
2837             //
2838             // Skip optional comma.
2839             //
2840             if (*next == ',') {
2841                 next++;
2842             }
2843             scan = next;
2844             continue;
2845         }
2846 
2847         //
2848         // This is a range.  Skip over the '-' and read in the 2nd int.
2849         //
2850         next++;         // skip '-'
2851         SKIP_WS(next);
2852         scan = next;
2853         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2854         SKIP_DIGITS(next);
2855         end = __kmp_str_to_int(scan, *next);
2856         KMP_ASSERT2(end >= 0, "bad explicit proc list");
2857 
2858         //
2859         // Check for a stride parameter
2860         //
2861         stride = 1;
2862         SKIP_WS(next);
2863         if (*next == ':') {
2864             //
            // A stride is specified.  Skip over the ':' and read the 3rd int.
2866             //
2867             int sign = +1;
2868             next++;         // skip ':'
2869             SKIP_WS(next);
2870             scan = next;
2871             if (*next == '-') {
2872                 sign = -1;
2873                 next++;
2874                 SKIP_WS(next);
2875                 scan = next;
2876             }
2877             KMP_ASSERT2((*next >=  '0') && (*next <= '9'),
2878               "bad explicit proc list");
2879             SKIP_DIGITS(next);
2880             stride = __kmp_str_to_int(scan, *next);
2881             KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2882             stride *= sign;
2883         }
2884 
2885         //
2886         // Do some range checks.
2887         //
2888         KMP_ASSERT2(stride != 0, "bad explicit proc list");
2889         if (stride > 0) {
2890             KMP_ASSERT2(start <= end, "bad explicit proc list");
2891         }
2892         else {
2893             KMP_ASSERT2(start >= end, "bad explicit proc list");
2894         }
2895         KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2896 
2897         //
2898         // Add the mask for each OS proc # to the list.
2899         //
2900         if (stride > 0) {
2901             do {
2902                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2903                 start += stride;
2904             } while (start <= end);
2905         }
2906         else {
2907             do {
2908                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2909                 start += stride;
2910             } while (start >= end);
2911         }
2912 
2913         //
2914         // Skip optional comma.
2915         //
2916         SKIP_WS(next);
2917         if (*next == ',') {
2918             next++;
2919         }
2920         scan = next;
2921     }
2922 
2923     *out_numMasks = nextNewMask;
    if (nextNewMask == 0) {
        *out_masks = NULL;
        __kmp_free(sumMask);
        KMP_INTERNAL_FREE(newMasks);
        return;
    }
2929     *out_masks
2930       = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
2931     memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
2932     __kmp_free(sumMask);
2933     KMP_INTERNAL_FREE(newMasks);
2934 }
2935 
2936 
2937 # if OMP_40_ENABLED
2938 
2939 /*-----------------------------------------------------------------------------
2940 
2941 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
places.  Again, here is the grammar:
2943 
2944 place_list := place
2945 place_list := place , place_list
2946 place := num
2947 place := place : num
2948 place := place : num : signed
place := { subplace_list }
2950 place := ! place                  // (lowest priority)
2951 subplace_list := subplace
2952 subplace_list := subplace , subplace_list
2953 subplace := num
2954 subplace := num : num
2955 subplace := num : num : signed
2956 signed := num
2957 signed := + signed
2958 signed := - signed
2959 
2960 -----------------------------------------------------------------------------*/
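
//
// E.g. (hypothetically), OMP_PLACES="{0:4},{4:4}" describes two places of
// four consecutive OS procs each, and "{0:2}:4:2" expands the subplace
// {0,1} into four places with stride 2: {0,1} {2,3} {4,5} {6,7}.
//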

static void
__kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
{
    const char *next;

    for (;;) {
        int start, count, stride, i;

        //
        // Read in the starting proc id
        //
        SKIP_WS(*scan);
        KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
          "bad explicit places list");
        next = *scan;
        SKIP_DIGITS(next);
        start = __kmp_str_to_int(*scan, *next);
        KMP_ASSERT(start >= 0);
        *scan = next;

        //
        // valid follow sets are ',' ':' and '}'
        //
        SKIP_WS(*scan);
        if (**scan == '}' || **scan == ',') {
            if ((start > maxOsId) ||
              (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
                if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                  && (__kmp_affinity_type != affinity_none))) {
                    KMP_WARNING(AffIgnoreInvalidProcID, start);
                }
            }
            else {
                KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
                (*setSize)++;
            }
            if (**scan == '}') {
                break;
            }
            (*scan)++;  // skip ','
            continue;
        }
        KMP_ASSERT2(**scan == ':', "bad explicit places list");
        (*scan)++;      // skip ':'

        //
        // Read count parameter
        //
        SKIP_WS(*scan);
        KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
          "bad explicit places list");
        next = *scan;
        SKIP_DIGITS(next);
        count = __kmp_str_to_int(*scan, *next);
        KMP_ASSERT(count >= 0);
        *scan = next;

        //
        // valid follow sets are ',' ':' and '}'
        //
        SKIP_WS(*scan);
        if (**scan == '}' || **scan == ',') {
            for (i = 0; i < count; i++) {
                if ((start > maxOsId) ||
                  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
                    if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                      && (__kmp_affinity_type != affinity_none))) {
                        KMP_WARNING(AffIgnoreInvalidProcID, start);
                    }
                    break;  // don't proliferate warnings for large count
                }
                else {
                    KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
                    start++;
                    (*setSize)++;
                }
            }
            if (**scan == '}') {
                break;
            }
            (*scan)++;  // skip ','
            continue;
        }
        KMP_ASSERT2(**scan == ':', "bad explicit places list");
        (*scan)++;      // skip ':'

        //
        // Read stride parameter
        //
        int sign = +1;
        for (;;) {
            SKIP_WS(*scan);
            if (**scan == '+') {
                (*scan)++; // skip '+'
                continue;
            }
            if (**scan == '-') {
                sign *= -1;
                (*scan)++; // skip '-'
                continue;
            }
            break;
        }
        SKIP_WS(*scan);
        KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
          "bad explicit places list");
        next = *scan;
        SKIP_DIGITS(next);
        stride = __kmp_str_to_int(*scan, *next);
        KMP_ASSERT(stride >= 0);
        *scan = next;
        stride *= sign;

        //
        // valid follow sets are ',' and '}'
        //
        SKIP_WS(*scan);
        if (**scan == '}' || **scan == ',') {
            for (i = 0; i < count; i++) {
                if ((start > maxOsId) ||
                  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
                    if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                      && (__kmp_affinity_type != affinity_none))) {
                        KMP_WARNING(AffIgnoreInvalidProcID, start);
                    }
                    break;  // don't proliferate warnings for large count
                }
                else {
                    KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
                    start += stride;
                    (*setSize)++;
                }
            }
            if (**scan == '}') {
                break;
            }
            (*scan)++;  // skip ','
            continue;
        }

        KMP_ASSERT2(0, "bad explicit places list");
    }
}
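
//
// For example, the subplace "0:4:2" (start 0, count 4, stride 2) unions the
// masks for OS procs 0, 2, 4, and 6 into tempMask, warning about and
// skipping any proc id that is not present in the osId2Mask table.
//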


static void
__kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
{
    const char *next;

    //
    // valid follow sets are '{' '!' and num
    //
    SKIP_WS(*scan);
    if (**scan == '{') {
        (*scan)++;      // skip '{'
        __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask,
          setSize);
        KMP_ASSERT2(**scan == '}', "bad explicit places list");
        (*scan)++;      // skip '}'
    }
    else if (**scan == '!') {
        (*scan)++;      // skip '!' *before* recursing, or we would recurse forever
        __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
        KMP_CPU_COMPLEMENT(tempMask);
    }
    else if ((**scan >= '0') && (**scan <= '9')) {
        next = *scan;
        SKIP_DIGITS(next);
        int num = __kmp_str_to_int(*scan, *next);
        KMP_ASSERT(num >= 0);
        if ((num > maxOsId) ||
          (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
            if (__kmp_affinity_verbose || (__kmp_affinity_warnings
              && (__kmp_affinity_type != affinity_none))) {
                KMP_WARNING(AffIgnoreInvalidProcID, num);
            }
        }
        else {
            KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
            (*setSize)++;
        }
        *scan = next;  // skip num
    }
    else {
        KMP_ASSERT2(0, "bad explicit places list");
    }
}
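
//
// For example, the place "!{0,1}" first accumulates procs 0 and 1 into
// tempMask and then complements it, yielding every bit position in the mask
// except 0 and 1.  Note that setSize still reflects the procs named before
// the complement was applied.
//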


void
__kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
  unsigned int *out_numMasks, const char *placelist,
  kmp_affin_mask_t *osId2Mask, int maxOsId)
{
    const char *scan = placelist;
    const char *next = placelist;

    numNewMasks = 2;
    newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
      * __kmp_affin_mask_size);
    nextNewMask = 0;

    kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
      __kmp_affin_mask_size);
    KMP_CPU_ZERO(tempMask);
    int setSize = 0;

    for (;;) {
        __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);

        //
        // valid follow sets are ',' ':' and EOL
        //
        SKIP_WS(scan);
        if (*scan == '\0' || *scan == ',') {
            if (setSize > 0) {
                ADD_MASK(tempMask);
            }
            KMP_CPU_ZERO(tempMask);
            setSize = 0;
            if (*scan == '\0') {
                break;
            }
            scan++;     // skip ','
            continue;
        }

        KMP_ASSERT2(*scan == ':', "bad explicit places list");
        scan++;         // skip ':'

        //
        // Read count parameter
        //
        SKIP_WS(scan);
        KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
          "bad explicit places list");
        next = scan;
        SKIP_DIGITS(next);
        int count = __kmp_str_to_int(scan, *next);
        KMP_ASSERT(count >= 0);
        scan = next;

        //
        // valid follow sets are ',' ':' and EOL
        //
        SKIP_WS(scan);
        int stride;
        if (*scan == '\0' || *scan == ',') {
            stride = +1;
        }
        else {
            KMP_ASSERT2(*scan == ':', "bad explicit places list");
            scan++;         // skip ':'

            //
            // Read stride parameter
            //
            int sign = +1;
            for (;;) {
                SKIP_WS(scan);
                if (*scan == '+') {
                    scan++; // skip '+'
                    continue;
                }
                if (*scan == '-') {
                    sign *= -1;
                    scan++; // skip '-'
                    continue;
                }
                break;
            }
            SKIP_WS(scan);
            KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
              "bad explicit places list");
            next = scan;
            SKIP_DIGITS(next);
            stride = __kmp_str_to_int(scan, *next);
            KMP_DEBUG_ASSERT(stride >= 0);
            scan = next;
            stride *= sign;
        }

        if (stride > 0) {
            int i;
            for (i = 0; i < count; i++) {
                int j;
                if (setSize == 0) {
                    break;
                }
                ADD_MASK(tempMask);
                setSize = 0;
                for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
                    if (! KMP_CPU_ISSET(j - stride, tempMask)) {
                        KMP_CPU_CLR(j, tempMask);
                    }
                    else if ((j > maxOsId) ||
                      (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
                        if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                          && (__kmp_affinity_type != affinity_none))) {
                            KMP_WARNING(AffIgnoreInvalidProcID, j);
                        }
                        KMP_CPU_CLR(j, tempMask);
                    }
                    else {
                        KMP_CPU_SET(j, tempMask);
                        setSize++;
                    }
                }
                for (; j >= 0; j--) {
                    KMP_CPU_CLR(j, tempMask);
                }
            }
        }
        else {
            int i;
            for (i = 0; i < count; i++) {
                int j;
                if (setSize == 0) {
                    break;
                }
                ADD_MASK(tempMask);
                setSize = 0;
                for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride;
                  j++) {
                    if (! KMP_CPU_ISSET(j - stride, tempMask)) {
                        KMP_CPU_CLR(j, tempMask);
                    }
                    else if ((j > maxOsId) ||
                      (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
                        if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                          && (__kmp_affinity_type != affinity_none))) {
                            KMP_WARNING(AffIgnoreInvalidProcID, j);
                        }
                        KMP_CPU_CLR(j, tempMask);
                    }
                    else {
                        KMP_CPU_SET(j, tempMask);
                        setSize++;
                    }
                }
                for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
                    KMP_CPU_CLR(j, tempMask);
                }
            }
        }
        KMP_CPU_ZERO(tempMask);
        setSize = 0;

        //
        // valid follow sets are ',' and EOL
        //
        SKIP_WS(scan);
        if (*scan == '\0') {
            break;
        }
        if (*scan == ',') {
            scan++;     // skip ','
            continue;
        }

        KMP_ASSERT2(0, "bad explicit places list");
    }

    *out_numMasks = nextNewMask;
    if (nextNewMask == 0) {
        *out_masks = NULL;
        KMP_INTERNAL_FREE(newMasks);
        return;
    }
    *out_masks
      = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
    memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
    __kmp_free(tempMask);
    KMP_INTERNAL_FREE(newMasks);
}
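
//
// For example, "{0,1}:4:8" names the place {0,1} and then replicates it
// count = 4 times at stride = 8: the shift loops above emit the masks
// {0,1}, {8,9}, {16,17}, and {24,25}, dropping any bit that falls outside
// the osId2Mask table.
//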

# endif /* OMP_40_ENABLED */

#undef ADD_MASK
#undef ADD_MASK_OSID


# if KMP_MIC

static void
__kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
{
    if ( __kmp_place_num_cores == 0 ) {
        if ( __kmp_place_num_threads_per_core == 0 ) {
            return;   // no core-limiting actions requested, exit
        }
        __kmp_place_num_cores = nCoresPerPkg;   // use all available cores
    }
    if ( !__kmp_affinity_uniform_topology() ) {
        KMP_WARNING( AffThrPlaceNonUniform );
        return; // don't support non-uniform topology
    }
    if ( depth != 3 ) {
        KMP_WARNING( AffThrPlaceNonThreeLevel );
        return; // don't support non-3-level topology
    }
    if ( __kmp_place_num_threads_per_core == 0 ) {
        __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore;  // use all HW contexts
    }
    if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
        KMP_WARNING( AffThrPlaceManyCores );
        return;
    }

    AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
                            nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
    int i, j, k, n_old = 0, n_new = 0;
    for ( i = 0; i < nPackages; ++i ) {
        for ( j = 0; j < nCoresPerPkg; ++j ) {
            if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) {
                n_old += __kmp_nThreadsPerCore;   // skip not-requested core
            } else {
                for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) {
                    if ( k < __kmp_place_num_threads_per_core ) {
                        newAddr[n_new] = (*pAddr)[n_old];   // copy requested core's data to new location
                        n_new++;
                    }
                    n_old++;
                }
            }
        }
    }
    nCoresPerPkg = __kmp_place_num_cores;                     // correct nCoresPerPkg
    __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
    __kmp_avail_proc = n_new;                                 // correct avail_proc
    __kmp_ncores = nPackages * __kmp_place_num_cores;         // correct ncores

    __kmp_free( *pAddr );
    *pAddr = newAddr;      // replace old topology with new one
}
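
//
// A worked example (hypothetical sizes): with nPackages = 1,
// nCoresPerPkg = 61 and __kmp_nThreadsPerCore = 4, requesting
// __kmp_place_core_offset = 1, __kmp_place_num_cores = 60 and
// __kmp_place_num_threads_per_core = 2 keeps cores 1..60 and the first
// 2 HW contexts of each, so 1 * 60 * 2 = 120 of the original 244
// topology entries survive into newAddr.
//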

# endif /* KMP_MIC */


static AddrUnsPair *address2os = NULL;
static int           * procarr = NULL;
static int     __kmp_aff_depth = 0;

static void
__kmp_aux_affinity_initialize(void)
{
    if (__kmp_affinity_masks != NULL) {
        KMP_ASSERT(fullMask != NULL);
        return;
    }

    //
    // Create the "full" mask - this defines all of the processors that we
    // consider to be in the machine model.  If respect is set, then it is
    // the initialization thread's affinity mask.  Otherwise, it is all
    // processors that we know about on the machine.
    //
    if (fullMask == NULL) {
        fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
    }
    if (KMP_AFFINITY_CAPABLE()) {
        if (__kmp_affinity_respect_mask) {
            __kmp_get_system_affinity(fullMask, TRUE);

            //
            // Count the number of available processors.
            //
            unsigned i;
            __kmp_avail_proc = 0;
            for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
                if (! KMP_CPU_ISSET(i, fullMask)) {
                    continue;
                }
                __kmp_avail_proc++;
            }
            if (__kmp_avail_proc > __kmp_xproc) {
                if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                  && (__kmp_affinity_type != affinity_none))) {
                    KMP_WARNING(ErrorInitializeAffinity);
                }
                __kmp_affinity_type = affinity_none;
                __kmp_affin_mask_size = 0;
                return;
            }
        }
        else {
            __kmp_affinity_entire_machine_mask(fullMask);
            __kmp_avail_proc = __kmp_xproc;
        }
    }

    int depth = -1;
    kmp_i18n_id_t msg_id = kmp_i18n_null;

    //
    // For backward compatibility, setting KMP_CPUINFO_FILE =>
    // KMP_TOPOLOGY_METHOD=cpuinfo
    //
    if ((__kmp_cpuinfo_file != NULL) &&
      (__kmp_affinity_top_method == affinity_top_method_all)) {
        __kmp_affinity_top_method = affinity_top_method_cpuinfo;
    }

    if (__kmp_affinity_top_method == affinity_top_method_all) {
        //
        // In the default code path, errors are not fatal - we just try using
        // another method.  We only emit a warning message if affinity is on,
        // or the verbose flag is set, and the nowarnings flag was not set.
        //
        const char *file_name = NULL;
        int line = 0;

# if KMP_ARCH_X86 || KMP_ARCH_X86_64

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
        }

        file_name = NULL;
        depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }

        if (depth < 0) {
            if (__kmp_affinity_verbose) {
                if (msg_id != kmp_i18n_null) {
                    KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
                      KMP_I18N_STR(DecodingLegacyAPIC));
                }
                else {
                    KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
                }
            }

            file_name = NULL;
            depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
            if (depth == 0) {
                KMP_ASSERT(__kmp_affinity_type == affinity_none);
                KMP_ASSERT(address2os == NULL);
                return;
            }
        }

# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

# if KMP_OS_LINUX

        if (depth < 0) {
            if (__kmp_affinity_verbose) {
                if (msg_id != kmp_i18n_null) {
                    KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
                }
                else {
                    KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
                }
            }

            FILE *f = fopen("/proc/cpuinfo", "r");
            if (f == NULL) {
                msg_id = kmp_i18n_str_CantOpenCpuinfo;
            }
            else {
                file_name = "/proc/cpuinfo";
                depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
                fclose(f);
                if (depth == 0) {
                    KMP_ASSERT(__kmp_affinity_type == affinity_none);
                    KMP_ASSERT(address2os == NULL);
                    return;
                }
            }
        }

# endif /* KMP_OS_LINUX */

# if KMP_OS_WINDOWS && KMP_ARCH_X86_64

        if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
            if (__kmp_affinity_verbose) {
                KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
            }

            depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
            KMP_ASSERT(depth != 0);
        }

# endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */

        if (depth < 0) {
            if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
                if (file_name == NULL) {
                    KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
                }
                else if (line == 0) {
                    KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
                }
                else {
                    KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
                }
            }
            // FIXME - print msg if msg_id = kmp_i18n_null ???

            file_name = "";
            depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
            if (depth == 0) {
                KMP_ASSERT(__kmp_affinity_type == affinity_none);
                KMP_ASSERT(address2os == NULL);
                return;
            }
            KMP_ASSERT(depth > 0);
            KMP_ASSERT(address2os != NULL);
        }
    }
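
    //
    // To summarize the default path above: on x86 / x86_64 we first try the
    // x2APIC id method, then the legacy APIC id method; on Linux* OS we next
    // try parsing /proc/cpuinfo; on Windows* OS with multiple processor
    // groups we try the processor group map; and as a last resort we fall
    // back to the flat OS proc id map.
    //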

    //
    // If the user has specified that a particular topology discovery method
    // is to be used, then we abort if that method fails.  The exception is
    // group affinity, which might have been implicitly set.
    //

# if KMP_ARCH_X86 || KMP_ARCH_X86_64

    else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
              KMP_I18N_STR(Decodingx2APIC));
        }

        depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
        }
    }
    else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
              KMP_I18N_STR(DecodingLegacyAPIC));
        }

        depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
        }
    }

# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

    else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
        const char *filename;
        if (__kmp_cpuinfo_file != NULL) {
            filename = __kmp_cpuinfo_file;
        }
        else {
            filename = "/proc/cpuinfo";
        }

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
        }

        FILE *f = fopen(filename, "r");
        if (f == NULL) {
            int code = errno;
            if (__kmp_cpuinfo_file != NULL) {
                __kmp_msg(
                    kmp_ms_fatal,
                    KMP_MSG(CantOpenFileForReading, filename),
                    KMP_ERR(code),
                    KMP_HNT(NameComesFrom_CPUINFO_FILE),
                    __kmp_msg_null
                );
            }
            else {
                __kmp_msg(
                    kmp_ms_fatal,
                    KMP_MSG(CantOpenFileForReading, filename),
                    KMP_ERR(code),
                    __kmp_msg_null
                );
            }
        }
        int line = 0;
        depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
        fclose(f);
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            if (line > 0) {
                KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
            }
            else {
                KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
            }
        }
        if (__kmp_affinity_type == affinity_none) {
            KMP_ASSERT(depth == 0);
            KMP_ASSERT(address2os == NULL);
            return;
        }
    }

# if KMP_OS_WINDOWS && KMP_ARCH_X86_64

    else if (__kmp_affinity_top_method == affinity_top_method_group) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
        }

        depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
        KMP_ASSERT(depth != 0);
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
        }
    }

# endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */

    else if (__kmp_affinity_top_method == affinity_top_method_flat) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
        }

        depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }
        // should not fail
        KMP_ASSERT(depth > 0);
        KMP_ASSERT(address2os != NULL);
    }

    if (address2os == NULL) {
        if (KMP_AFFINITY_CAPABLE()
          && (__kmp_affinity_verbose || (__kmp_affinity_warnings
          && (__kmp_affinity_type != affinity_none)))) {
            KMP_WARNING(ErrorInitializeAffinity);
        }
        __kmp_affinity_type = affinity_none;
        __kmp_affin_mask_size = 0;
        return;
    }

# if KMP_MIC
    __kmp_apply_thread_places(&address2os, depth);
# endif

    //
    // Create the table of masks, indexed by OS proc id.
    //
    unsigned maxIndex;
    unsigned numUnique;
    kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
      address2os, __kmp_avail_proc);
    if (__kmp_affinity_gran_levels == 0) {
        KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
    }

    //
    // Set the childNums vector in all Address objects.  This must be done
    // before we can sort using __kmp_affinity_cmp_Address_child_num(),
    // which takes into account the setting of __kmp_affinity_compact.
    //
    __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);

    switch (__kmp_affinity_type) {

        case affinity_explicit:
        KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
# if OMP_40_ENABLED
        if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
        {
            __kmp_affinity_process_proclist(&__kmp_affinity_masks,
              &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
              maxIndex);
        }
# if OMP_40_ENABLED
        else {
            __kmp_affinity_process_placelist(&__kmp_affinity_masks,
              &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
              maxIndex);
        }
# endif
        if (__kmp_affinity_num_masks == 0) {
            if (__kmp_affinity_verbose || (__kmp_affinity_warnings
              && (__kmp_affinity_type != affinity_none))) {
                KMP_WARNING(AffNoValidProcID);
            }
            __kmp_affinity_type = affinity_none;
            return;
        }
        break;

        //
        // The other affinity types rely on sorting the Addresses according
        // to some permutation of the machine topology tree.  Set
        // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
        // then jump to a common code fragment to do the sort and create
        // the array of affinity masks.
        //

        case affinity_logical:
        __kmp_affinity_compact = 0;
        if (__kmp_affinity_offset) {
            __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
              % __kmp_avail_proc;
        }
        goto sortAddresses;

        case affinity_physical:
        if (__kmp_nThreadsPerCore > 1) {
            __kmp_affinity_compact = 1;
            if (__kmp_affinity_compact >= depth) {
                __kmp_affinity_compact = 0;
            }
        } else {
            __kmp_affinity_compact = 0;
        }
        if (__kmp_affinity_offset) {
            __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
              % __kmp_avail_proc;
        }
        goto sortAddresses;

        case affinity_scatter:
        if (__kmp_affinity_compact >= depth) {
            __kmp_affinity_compact = 0;
        }
        else {
            __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
        }
        goto sortAddresses;

        case affinity_compact:
        if (__kmp_affinity_compact >= depth) {
            __kmp_affinity_compact = depth - 1;
        }
        goto sortAddresses;

# if KMP_MIC
        case affinity_balanced:
        // Balanced affinity is supported only on a single package; a uniform
        // topology needs no extra work here, while a non-uniform topology
        // gets the special handling below.
        if( nPackages > 1 ) {
            if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
                KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
            }
            __kmp_affinity_type = affinity_none;
            return;
        } else if( __kmp_affinity_uniform_topology() ) {
            break;
        } else { // Non-uniform topology

            // Save the depth for further usage
            __kmp_aff_depth = depth;

            // Number of hyper threads per core in HT machine
            int nth_per_core = __kmp_nThreadsPerCore;

            int core_level;
            if( nth_per_core > 1 ) {
                core_level = depth - 2;
            } else {
                core_level = depth - 1;
            }
            int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
            int nproc = nth_per_core * ncores;

            procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                procarr[ i ] = -1;
            }

            for( int i = 0; i < __kmp_avail_proc; i++ ) {
                int proc = address2os[ i ].second;
                // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
                // If there is only one thread per core then depth == 2: level 0 - package,
                // level 1 - core.
                int level = depth - 1;

                // Defaults for the case __kmp_nThreadsPerCore == 1
                int thread = 0;
                int core = address2os[ i ].first.labels[ level ];
                // If the thread level exists, that is we have more than one thread context per core
                if( nth_per_core > 1 ) {
                    thread = address2os[ i ].first.labels[ level ] % nth_per_core;
                    core = address2os[ i ].first.labels[ level - 1 ];
                }
                procarr[ core * nth_per_core + thread ] = proc;
            }

            break;
        }
# endif

        sortAddresses:
        //
        // Allocate the gtid->affinity mask table.
        //
        if (__kmp_affinity_dups) {
            __kmp_affinity_num_masks = __kmp_avail_proc;
        }
        else {
            __kmp_affinity_num_masks = numUnique;
        }

# if OMP_40_ENABLED
        if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
          && ( __kmp_affinity_num_places > 0 )
          && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
            __kmp_affinity_num_masks = __kmp_affinity_num_places;
        }
# endif

        __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
          __kmp_affinity_num_masks * __kmp_affin_mask_size);

        //
        // Sort the address2os table according to the current setting of
        // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
        //
        qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
          __kmp_affinity_cmp_Address_child_num);
        {
            int i;
            unsigned j;
            for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
                if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
                    continue;
                }
                unsigned osId = address2os[i].second;
                kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
                kmp_affin_mask_t *dest
                  = KMP_CPU_INDEX(__kmp_affinity_masks, j);
                KMP_ASSERT(KMP_CPU_ISSET(osId, src));
                KMP_CPU_COPY(dest, src);
                if (++j >= __kmp_affinity_num_masks) {
                    break;
                }
            }
            KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
        }
        break;

        default:
        KMP_ASSERT2(0, "Unexpected affinity setting");
    }

    __kmp_free(osId2Mask);
    machine_hierarchy.init(address2os, __kmp_avail_proc);
}


void
__kmp_affinity_initialize(void)
{
    //
    // Much of the code above was written assuming that if a machine was not
    // affinity capable, then __kmp_affinity_type == affinity_none.  We now
    // explicitly represent this as __kmp_affinity_type == affinity_disabled.
    //
    // There are too many checks for __kmp_affinity_type == affinity_none
    // in this code.  Instead of trying to change them all, check if
    // __kmp_affinity_type == affinity_disabled, and if so, slam it with
    // affinity_none, call the real initialization routine, then restore
    // __kmp_affinity_type to affinity_disabled.
    //
    int disabled = (__kmp_affinity_type == affinity_disabled);
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(disabled);
    }
    if (disabled) {
        __kmp_affinity_type = affinity_none;
    }
    __kmp_aux_affinity_initialize();
    if (disabled) {
        __kmp_affinity_type = affinity_disabled;
    }
}


void
__kmp_affinity_uninitialize(void)
{
    if (__kmp_affinity_masks != NULL) {
        __kmp_free(__kmp_affinity_masks);
        __kmp_affinity_masks = NULL;
    }
    if (fullMask != NULL) {
        KMP_CPU_FREE(fullMask);
        fullMask = NULL;
    }
    __kmp_affinity_num_masks = 0;
# if OMP_40_ENABLED
    __kmp_affinity_num_places = 0;
# endif
    if (__kmp_affinity_proclist != NULL) {
        __kmp_free(__kmp_affinity_proclist);
        __kmp_affinity_proclist = NULL;
    }
    if (address2os != NULL) {
        __kmp_free(address2os);
        address2os = NULL;
    }
    if (procarr != NULL) {
        __kmp_free(procarr);
        procarr = NULL;
    }
}

void
__kmp_affinity_set_init_mask(int gtid, int isa_root)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
    if (th->th.th_affin_mask == NULL) {
        KMP_CPU_ALLOC(th->th.th_affin_mask);
    }
    else {
        KMP_CPU_ZERO(th->th.th_affin_mask);
    }

    //
    // Copy the thread mask to the kmp_info_t structure.
    // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e.
    // one that has all of the OS proc ids set; if __kmp_affinity_respect_mask
    // is also set, the full mask is the same as the mask of the
    // initialization thread.
    //
    kmp_affin_mask_t *mask;
    int i;

# if OMP_40_ENABLED
    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
    {
        if ((__kmp_affinity_type == affinity_none)
# if KMP_MIC
          || (__kmp_affinity_type == affinity_balanced)
# endif
          ) {
# if KMP_OS_WINDOWS && KMP_ARCH_X86_64
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# if OMP_40_ENABLED
    else {
        if ((! isa_root)
          || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
#  if KMP_OS_WINDOWS && KMP_ARCH_X86_64
            if (__kmp_num_proc_groups > 1) {
                return;
            }
#  endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            //
            // int i = some hash function or just a counter that doesn't
            // always start at 0.  Use gtid for now.
            //
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# endif

# if OMP_40_ENABLED
    th->th.th_current_place = i;
    if (isa_root) {
        th->th.th_new_place = i;
        th->th.th_first_place = 0;
        th->th.th_last_place = __kmp_affinity_num_masks - 1;
    }

    if (i == KMP_PLACE_ALL) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
          gtid, i));
    }
# else
    if (i == -1) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
          gtid, i));
    }
# endif /* OMP_40_ENABLED */

    KMP_CPU_COPY(th->th.th_affin_mask, mask);

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
          buf);
    }

# if KMP_OS_WINDOWS
    //
    // On Windows* OS, the process affinity mask might have changed.
    // If the user didn't request affinity and this call fails,
    // just continue silently.  See CQ171393.
    //
    if ( __kmp_affinity_type == affinity_none ) {
        __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
    }
    else
# endif
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}
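
//
// For example, with __kmp_affinity_num_masks == 4 and
// __kmp_affinity_offset == 0, the round-robin assignment above binds
// gtid 0 to mask 0, gtid 1 to mask 1, ..., and gtid 4 wraps back around
// to mask 0.
//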


# if OMP_40_ENABLED

void
__kmp_affinity_set_place(int gtid)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);

    KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
      gtid, th->th.th_new_place, th->th.th_current_place));

    //
    // Check that the new place is within this thread's partition.
    //
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    KMP_ASSERT(th->th.th_new_place >= 0);
    KMP_ASSERT((unsigned)th->th.th_new_place < __kmp_affinity_num_masks);
    if (th->th.th_first_place <= th->th.th_last_place) {
        KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
         && (th->th.th_new_place <= th->th.th_last_place));
    }
    else {
        KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
         || (th->th.th_new_place >= th->th.th_last_place));
    }

    //
    // Copy the thread mask to the kmp_info_t structure,
    // and set this thread's affinity.
    //
    kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
      th->th.th_new_place);
    KMP_CPU_COPY(th->th.th_affin_mask, mask);
    th->th.th_current_place = th->th.th_new_place;

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
          gtid, buf);
    }
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

# endif /* OMP_40_ENABLED */


int
__kmp_aux_set_affinity(void **mask)
{
    int gtid;
    kmp_info_t *th;
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
          gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        else {
            unsigned proc;
            int num_procs = 0;

            for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
                if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
                    continue;
                }
                num_procs++;
                if (! KMP_CPU_ISSET(proc, fullMask)) {
                    KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
                    break;
                }
            }
            if (num_procs == 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }

# if KMP_OS_WINDOWS && KMP_ARCH_X86_64
            if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }
# endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */

        }
    }

    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    if (retval == 0) {
        KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
    }

# if OMP_40_ENABLED
    th->th.th_current_place = KMP_PLACE_UNDEFINED;
    th->th.th_new_place = KMP_PLACE_UNDEFINED;
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;

    //
    // Turn off 4.0 affinity for the current thread at this parallel level.
    //
    th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
# endif

    return retval;
}
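
//
// A minimal sketch of how user code reaches this entry point through the
// exported kmp_* affinity API (names per the Intel extension; the chosen
// proc id and error handling are illustrative only):
//
//     kmp_affinity_mask_t mask;
//     kmp_create_affinity_mask(&mask);
//     kmp_set_affinity_mask_proc(3, &mask);     // request OS proc 3
//     if (kmp_set_affinity(&mask) != 0) {
//         // the mask was rejected, e.g. proc 3 is not in fullMask
//     }
//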


int
__kmp_aux_get_affinity(void **mask)
{
    int gtid;
    int retval;
    kmp_info_t *th;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);

    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
        }
    }

# if !KMP_OS_WINDOWS

    retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
    });
    return retval;

# else

    KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
    return 0;

# endif /* KMP_OS_WINDOWS */

}

int
__kmp_aux_set_affinity_mask_proc(int proc, void **mask)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return -1;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return -2;
    }

    KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
    return 0;
}


int
__kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return -1;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return -2;
    }

    KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
    return 0;
}


int
__kmp_aux_get_affinity_mask_proc(int proc, void **mask)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return 0;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return 0;
    }

    return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}

# if KMP_MIC

// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity( int tid, int nthreads )
{
    if( __kmp_affinity_uniform_topology() ) {
        int coreID;
        int threadID;
        // Number of hyper threads per core in HT machine
        int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
        // Number of cores
        int ncores = __kmp_ncores;
        // How many threads will be bound to each core
        int chunk = nthreads / ncores;
        // How many cores will have an additional thread bound to them - "big cores"
        int big_cores = nthreads % ncores;
        // Number of threads on the big cores
        int big_nth = ( chunk + 1 ) * big_cores;
        if( tid < big_nth ) {
            coreID = tid / (chunk + 1 );
            threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
        } else { //tid >= big_nth
            coreID = ( tid - big_cores ) / chunk;
            threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
        }
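
        // Worked example: nthreads = 10 on ncores = 4 gives chunk = 2,
        // big_cores = 2 and big_nth = 6, so tids 0-5 land on the two big
        // cores (3 threads each) and tids 6-9 on cores 2 and 3 (2 threads
        // each); e.g. tid = 7 maps to coreID = ( 7 - 2 ) / 2 = 2.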
4425 
4426         KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4427           "Illegal set affinity operation when not capable");
4428 
4429         kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
4430         KMP_CPU_ZERO(mask);
4431 
4432         // Granularity == thread
4433         if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4434             int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
4435             KMP_CPU_SET( osID, mask);
4436         } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4437             for( int i = 0; i < __kmp_nth_per_core; i++ ) {
4438                 int osID;
4439                 osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
4440                 KMP_CPU_SET( osID, mask);
4441             }
4442         }
4443         if (__kmp_affinity_verbose) {
4444             char buf[KMP_AFFIN_MASK_PRINT_LEN];
4445             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4446             KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4447               tid, buf);
4448         }
4449         __kmp_set_system_affinity( mask, TRUE );
4450     } else { // Non-uniform topology
4451 
4452         kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
4453         KMP_CPU_ZERO(mask);
4454 
4455         // Number of hyper threads per core in HT machine
4456         int nth_per_core = __kmp_nThreadsPerCore;
4457         int core_level;
4458         if( nth_per_core > 1 ) {
4459             core_level = __kmp_aff_depth - 2;
4460         } else {
4461             core_level = __kmp_aff_depth - 1;
4462         }
4463 
4464         // Number of cores - maximum value; it does not count trail cores with 0 processors
4465         int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
4466 
4467         // For performance gain consider the special case nthreads == __kmp_avail_proc
4468         if( nthreads == __kmp_avail_proc ) {
4469             if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4470                 int osID = address2os[ tid ].second;
4471                 KMP_CPU_SET( osID, mask);
4472             } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4473                 int coreID = address2os[ tid ].first.labels[ core_level ];
4474                 // We'll count found osIDs for the current core; they can be not more than nth_per_core;
4475                 // since the address2os is sortied we can break when cnt==nth_per_core
4476                 int cnt = 0;
4477                 for( int i = 0; i < __kmp_avail_proc; i++ ) {
4478                     int osID = address2os[ i ].second;
4479                     int core = address2os[ i ].first.labels[ core_level ];
4480                     if( core == coreID ) {
4481                         KMP_CPU_SET( osID, mask);
4482                         cnt++;
4483                         if( cnt == nth_per_core ) {
4484                             break;
4485                         }
4486                     }
4487                 }
4488             }
4489         } else if( nthreads <= __kmp_ncores ) {
4490 
4491             int core = 0;
4492             for( int i = 0; i < ncores; i++ ) {
4493                 // Check if this core from procarr[] is in the mask
4494                 int in_mask = 0;
4495                 for( int j = 0; j < nth_per_core; j++ ) {
4496                     if( procarr[ i * nth_per_core + j ] != - 1 ) {
4497                         in_mask = 1;
4498                         break;
4499                     }
4500                 }
4501                 if( in_mask ) {
4502                     if( tid == core ) {
4503                         for( int j = 0; j < nth_per_core; j++ ) {
4504                             int osID = procarr[ i * nth_per_core + j ];
4505                             if( osID != -1 ) {
4506                                 KMP_CPU_SET( osID, mask );
4507                                 // For granularity=thread it is enough to set the first available osID for this core
                                if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread ) {
4509                                     break;
4510                                 }
4511                             }
4512                         }
4513                         break;
4514                     } else {
4515                         core++;
4516                     }
4517                 }
4518             }
4519 
4520         } else { // nthreads > __kmp_ncores
4521 
            // Array holding the number of available processors at each core
            int * nproc_at_core = ( int * )alloca( sizeof( int ) * ncores );
            // Array holding the number of cores with exactly "x" available processors
            int * ncores_with_x_procs = ( int * )alloca( sizeof( int ) * ( nth_per_core + 1 ) );
            // Array holding the number of cores with at least "x" available
            // processors (from "x" up to nth_per_core)
            int * ncores_with_x_to_max_procs = ( int * )alloca( sizeof( int ) * ( nth_per_core + 1 ) );
4528 
4529             for( int i = 0; i <= nth_per_core; i++ ) {
4530                 ncores_with_x_procs[ i ] = 0;
4531                 ncores_with_x_to_max_procs[ i ] = 0;
4532             }
4533 
            // Count the available processors on each core and build a
            // histogram of cores by that count.
            for( int i = 0; i < ncores; i++ ) {
4535                 int cnt = 0;
4536                 for( int j = 0; j < nth_per_core; j++ ) {
4537                     if( procarr[ i * nth_per_core + j ] != -1 ) {
4538                         cnt++;
4539                     }
4540                 }
4541                 nproc_at_core[ i ] = cnt;
4542                 ncores_with_x_procs[ cnt ]++;
4543             }
4544 
            // Suffix sums: ncores_with_x_to_max_procs[ i ] becomes the number
            // of cores that have at least i available processors.
            for( int i = 0; i <= nth_per_core; i++ ) {
4546                 for( int j = i; j <= nth_per_core; j++ ) {
4547                     ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4548                 }
4549             }
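            // Illustrative example: with nth_per_core == 4 and four cores
            // offering { 4, 2, 0, 4 } available processors,
            // ncores_with_x_procs == { 1, 0, 1, 0, 2 } and
            // ncores_with_x_to_max_procs == { 4, 3, 3, 2, 2 }.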
4550 
            // Maximum possible number of thread contexts on the machine
            int nproc = nth_per_core * ncores;
            // Array holding the number of threads assigned to each thread context
            int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4555             for( int i = 0; i < nproc; i++ ) {
4556                 newarr[ i ] = 0;
4557             }
4558 
4559             int nth = nthreads;
4560             int flag = 0;
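            // Spread the nthreads threads over the available thread contexts:
            // the first sweep (flag == 0) places at most one thread on each
            // available context; if threads remain once every context is
            // occupied, later sweeps (flag == 1) stack additional threads
            // onto contexts that are already in use.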
4561             while( nth > 0 ) {
                for( int j = 1; j <= nth_per_core; j++ ) {
                    // At most one placement per core that has at least j
                    // available contexts in this sweep.
                    int cnt = ncores_with_x_to_max_procs[ j ];
4564                     for( int i = 0; i < ncores; i++ ) {
                        // Skip cores with no available processors
4566                         if( nproc_at_core[ i ] == 0 ) {
4567                             continue;
4568                         }
4569                         for( int k = 0; k < nth_per_core; k++ ) {
4570                             if( procarr[ i * nth_per_core + k ] != -1 ) {
4571                                 if( newarr[ i * nth_per_core + k ] == 0 ) {
4572                                     newarr[ i * nth_per_core + k ] = 1;
4573                                     cnt--;
4574                                     nth--;
4575                                     break;
                                } else {
                                    // On later sweeps (flag != 0) stack an
                                    // additional thread onto an already-used
                                    // context.
                                    if( flag != 0 ) {
                                        newarr[ i * nth_per_core + k ]++;
                                        cnt--;
                                        nth--;
                                        break;
                                    }
                                }
4584                             }
4585                         }
4586                         if( cnt == 0 || nth == 0 ) {
4587                             break;
4588                         }
4589                     }
4590                     if( nth == 0 ) {
4591                         break;
4592                     }
4593                 }
                flag = 1; // Every available context now holds a thread; allow stacking.
4595             }
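            // newarr[ i ] now holds the number of threads assigned to thread
            // context i; a running sum over it locates the context that owns
            // logical thread tid.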
4596             int sum = 0;
4597             for( int i = 0; i < nproc; i++ ) {
4598                 sum += newarr[ i ];
4599                 if( sum > tid ) {
4600                     // Granularity == thread
                    if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread ) {
4602                         int osID = procarr[ i ];
4603                         KMP_CPU_SET( osID, mask);
4604                     } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4605                         int coreID = i / nth_per_core;
4606                         for( int ii = 0; ii < nth_per_core; ii++ ) {
4607                             int osID = procarr[ coreID * nth_per_core + ii ];
4608                             if( osID != -1 ) {
4609                                 KMP_CPU_SET( osID, mask);
4610                             }
4611                         }
4612                     }
4613                     break;
4614                 }
4615             }
4616             __kmp_free( newarr );
4617         }
4618 
4619         if (__kmp_affinity_verbose) {
4620             char buf[KMP_AFFIN_MASK_PRINT_LEN];
4621             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4622             KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4623               tid, buf);
4624         }
4625         __kmp_set_system_affinity( mask, TRUE );
4626     }
4627 }
4628 
4629 # endif /* KMP_MIC */
4630 
4631 #else
4632     // affinity not supported
4633 
4634 kmp_uint32 mac_skipPerLevel[7];
4635 kmp_uint32 mac_depth;
4636 kmp_uint8 mac_leaf_kids;
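// Affinity is not supported here, so build a machine hierarchy for the
// hierarchical barrier from the processor count alone.  The shape is computed
// once, cached in the mac_* globals above, and handed to each thread's
// barrier state on every call.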
4637 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
4638     static int first = 1;
4639     if (first) {
4640         const kmp_uint32 maxLevels = 7;
4641         kmp_uint32 numPerLevel[maxLevels];
4642 
4643         for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
4644             numPerLevel[i] = 1;
4645             mac_skipPerLevel[i] = 1;
4646         }
4647 
4648         mac_depth = 2;
4649         numPerLevel[0] = nproc;
4650 
4651         kmp_uint32 branch = 4;
4652         if (numPerLevel[0] == 1) branch = nproc/4;
4653         if (branch<4) branch=4;
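        // Balance the tree by repeatedly halving any over-wide level and
        // doubling the level above it, growing the depth when a new level is
        // first used.  Worked example (illustrative): nproc == 16 settles at
        // numPerLevel == {4, 4} with mac_depth == 3, i.e. 16 leaves arranged
        // as 4 groups of 4, and the first mac_depth entries of
        // mac_skipPerLevel become {1, 4, 16}.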
4654         for (kmp_uint32 d=0; d<mac_depth-1; ++d) { // optimize hierarchy width
4655             while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
4656                 if (numPerLevel[d] & 1) numPerLevel[d]++;
4657                 numPerLevel[d] = numPerLevel[d] >> 1;
4658                 if (numPerLevel[d+1] == 1) mac_depth++;
4659                 numPerLevel[d+1] = numPerLevel[d+1] << 1;
4660             }
            if (numPerLevel[0] == 1) {
4662                 branch = branch >> 1;
4663                 if (branch<4) branch = 4;
4664             }
4665         }
4666 
        // mac_skipPerLevel[i] is the number of leaves spanned by one node at
        // level i: the product of the branching factors of all lower levels.
        for (kmp_uint32 i=1; i<mac_depth; ++i)
            mac_skipPerLevel[i] = numPerLevel[i-1] * mac_skipPerLevel[i-1];
        mac_leaf_kids = (kmp_uint8)(numPerLevel[0] - 1);
4670         first=0;
4671     }
4672     thr_bar->depth = mac_depth;
4673     thr_bar->base_leaf_kids = mac_leaf_kids;
4674     thr_bar->skip_per_level = mac_skipPerLevel;
4675 }
4676 
4677 #endif // KMP_AFFINITY_SUPPORTED
4678