1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_io.h"
19 #include "kmp_str.h"
20 #include "kmp_wrapper_getpid.h"
21 
22 #if KMP_AFFINITY_SUPPORTED
23 
24 //
25 // Print the affinity mask to the character array in a pretty format.
26 //
27 char *
28 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
29 {
30     KMP_ASSERT(buf_len >= 40);
31     char *scan = buf;
32     char *end = buf + buf_len - 1;
33 
34     //
35     // Find first element / check for empty set.
36     //
37     size_t i;
38     for (i = 0; i < KMP_CPU_SETSIZE; i++) {
39         if (KMP_CPU_ISSET(i, mask)) {
40             break;
41         }
42     }
43     if (i == KMP_CPU_SETSIZE) {
44         KMP_SNPRINTF(scan, buf_len, "{<empty>}");
45         while (*scan != '\0') scan++;
46         KMP_ASSERT(scan <= end);
47         return buf;
48     }
49 
50     KMP_SNPRINTF(scan, buf_len, "{%ld", (long)i);
51     while (*scan != '\0') scan++;
52     i++;
53     for (; i < KMP_CPU_SETSIZE; i++) {
54         if (! KMP_CPU_ISSET(i, mask)) {
55             continue;
56         }
57 
58         //
59         // Check for buffer overflow.  A string of the form ",<n>" will have
60         // at most 10 characters, plus we want to leave room to print ",...}"
61         // if the set is too large to print for a total of 15 characters.
62         // We already left room for '\0' in setting end.
63         //
64         if (end - scan < 15) {
65            break;
66         }
67         KMP_SNPRINTF(scan, buf_len, ",%-ld", (long)i);
68         while (*scan != '\0') scan++;
69     }
70     if (i < KMP_CPU_SETSIZE) {
71         KMP_SNPRINTF(scan, buf_len,  ",...");
72         while (*scan != '\0') scan++;
73     }
74     KMP_SNPRINTF(scan, buf_len, "}");
75     while (*scan != '\0') scan++;
76     KMP_ASSERT(scan <= end);
77     return buf;
78 }
79 
80 
81 void
82 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
83 {
84     KMP_CPU_ZERO(mask);
85 
86 # if KMP_GROUP_AFFINITY
87 
88     if (__kmp_num_proc_groups > 1) {
89         int group;
90         KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
91         for (group = 0; group < __kmp_num_proc_groups; group++) {
92             int i;
93             int num = __kmp_GetActiveProcessorCount(group);
94             for (i = 0; i < num; i++) {
95                 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
96             }
97         }
98     }
99     else
100 
101 # endif /* KMP_GROUP_AFFINITY */
102 
103     {
104         int proc;
105         for (proc = 0; proc < __kmp_xproc; proc++) {
106             KMP_CPU_SET(proc, mask);
107         }
108     }
109 }
110 
111 
112 //
113 // In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
114 // functions.
115 //
116 // The icc codegen emits sections with extremely long names, of the form
117 // ".gnu.linkonce.<mangled_name>".  There seems to have been a linker bug
118 // introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
119 // some sort of memory corruption or table overflow that is triggered by
120 // these long strings.  I checked the latest version of the linker -
121 // GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
122 // fixed.
123 //
124 // Unfortunately, my attempts to reproduce it in a smaller example have
125 // failed - I'm not sure what the prospects are of getting it fixed
126 // properly - but we need a reproducer smaller than all of libiomp.
127 //
128 // Work around the problem by avoiding inline constructors in such builds.
129 // We do this for all platforms, not just Linux* OS - non-inline functions are
// more debuggable and provide better coverage than inline functions.
131 // Use inline functions in shipping libs, for performance.
132 //
133 
134 # if !defined(KMP_DEBUG) && !defined(COVER)
135 
136 class Address {
137 public:
138     static const unsigned maxDepth = 32;
139     unsigned labels[maxDepth];
140     unsigned childNums[maxDepth];
141     unsigned depth;
142     unsigned leader;
143     Address(unsigned _depth)
144       : depth(_depth), leader(FALSE) {
145     }
146     Address &operator=(const Address &b) {
147         depth = b.depth;
148         for (unsigned i = 0; i < depth; i++) {
149             labels[i] = b.labels[i];
150             childNums[i] = b.childNums[i];
151         }
152         leader = FALSE;
153         return *this;
154     }
155     bool operator==(const Address &b) const {
156         if (depth != b.depth)
157             return false;
158         for (unsigned i = 0; i < depth; i++)
159             if(labels[i] != b.labels[i])
160                 return false;
161         return true;
162     }
163     bool isClose(const Address &b, int level) const {
164         if (depth != b.depth)
165             return false;
166         if ((unsigned)level >= depth)
167             return true;
168         for (unsigned i = 0; i < (depth - level); i++)
169             if(labels[i] != b.labels[i])
170                 return false;
171         return true;
172     }
173     bool operator!=(const Address &b) const {
174         return !operator==(b);
175     }
176 };
177 
178 class AddrUnsPair {
179 public:
180     Address first;
181     unsigned second;
182     AddrUnsPair(Address _first, unsigned _second)
183       : first(_first), second(_second) {
184     }
185     AddrUnsPair &operator=(const AddrUnsPair &b)
186     {
187         first = b.first;
188         second = b.second;
189         return *this;
190     }
191 };
192 
193 # else
194 
// Out-of-line variant used in debug/cover builds to avoid inline member
// functions (see the linker-bug note above).  Must stay field-for-field
// identical to the inline variant.
class Address {
public:
    static const unsigned maxDepth = 32;
    unsigned labels[maxDepth];     // one label per hierarchy level, outermost first
    unsigned childNums[maxDepth];  // renumbered ordinal labels (see __kmp_affinity_assign_child_nums)
    unsigned depth;
    unsigned leader;
    Address(unsigned _depth);
    Address &operator=(const Address &b);
    bool operator==(const Address &b) const;
    bool isClose(const Address &b, int level) const;
    bool operator!=(const Address &b) const;
};
208 
209 Address::Address(unsigned _depth)
210 {
211     depth = _depth;
212     leader = FALSE;
213 }
214 
215 Address &Address::operator=(const Address &b) {
216     depth = b.depth;
217     for (unsigned i = 0; i < depth; i++) {
218         labels[i] = b.labels[i];
219         childNums[i] = b.childNums[i];
220     }
221     leader = FALSE;
222     return *this;
223 }
224 
225 bool Address::operator==(const Address &b) const {
226     if (depth != b.depth)
227         return false;
228     for (unsigned i = 0; i < depth; i++)
229         if(labels[i] != b.labels[i])
230             return false;
231     return true;
232 }
233 
234 bool Address::isClose(const Address &b, int level) const {
235     if (depth != b.depth)
236         return false;
237     if ((unsigned)level >= depth)
238         return true;
239     for (unsigned i = 0; i < (depth - level); i++)
240         if(labels[i] != b.labels[i])
241             return false;
242     return true;
243 }
244 
245 bool Address::operator!=(const Address &b) const {
246     return !operator==(b);
247 }
248 
// Out-of-line variant for debug/cover builds; pairs a topology Address
// with its OS processor id.
class AddrUnsPair {
public:
    Address first;    // topology address
    unsigned second;  // OS proc id
    AddrUnsPair(Address _first, unsigned _second);
    AddrUnsPair &operator=(const AddrUnsPair &b);
};
256 
257 AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
258   : first(_first), second(_second)
259 {
260 }
261 
262 AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
263 {
264     first = b.first;
265     second = b.second;
266     return *this;
267 }
268 
269 # endif /* !defined(KMP_DEBUG) && !defined(COVER) */
270 
271 
272 static int
273 __kmp_affinity_cmp_Address_labels(const void *a, const void *b)
274 {
275     const Address *aa = (const Address *)&(((AddrUnsPair *)a)
276       ->first);
277     const Address *bb = (const Address *)&(((AddrUnsPair *)b)
278       ->first);
279     unsigned depth = aa->depth;
280     unsigned i;
281     KMP_DEBUG_ASSERT(depth == bb->depth);
282     for (i  = 0; i < depth; i++) {
283         if (aa->labels[i] < bb->labels[i]) return -1;
284         if (aa->labels[i] > bb->labels[i]) return 1;
285     }
286     return 0;
287 }
288 
289 
//
// qsort comparator: orders AddrUnsPair entries by their ordinal child
// numbers, with the innermost __kmp_affinity_compact levels rotated to
// the most-significant compare positions (drives the compact/scatter
// placement controlled by __kmp_affinity_compact).
//
static int
__kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)
      ->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)
      ->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
    KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
    // Compare the bottom __kmp_affinity_compact levels first, innermost
    // level as the most significant key.
    for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
        int j = depth - i - 1;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    // Then compare the remaining levels in top-down order.
    for (; i < depth; i++) {
        int j = i - __kmp_affinity_compact;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    return 0;
}
314 
315 /** A structure for holding machine-specific hierarchy info to be computed once at init. */
class hierarchy_info {
public:
    /** Typical levels are threads/core, cores/package or socket, packages/node, nodes/machine,
        etc.  We don't want to get specific with nomenclature */
    static const kmp_uint32 maxLevels=7;

    /** This is specifically the depth of the machine configuration hierarchy, in terms of the
        number of levels along the longest path from root to any leaf. It corresponds to the
        number of entries in numPerLevel if we exclude all but one trailing 1. */
    kmp_uint32 depth;
    kmp_uint32 base_num_threads;
    volatile kmp_int8 uninitialized; // 0=initialized, 1=uninitialized, 2=initialization in progress

    /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a
        node at level i has. For example, if we have a machine with 4 packages, 4 cores/package
        and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}. All empty levels are set to 1. */
    kmp_uint32 numPerLevel[maxLevels];
    kmp_uint32 skipPerLevel[maxLevels];

    /** Derive numPerLevel[] from a label-sorted address table: for each
        topology level (leaf level first), record the maximum ordinal child
        number observed across all addresses, plus one. */
    void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
        int hier_depth = adr2os[0].first.depth;
        int level = 0;
        for (int i=hier_depth-1; i>=0; --i) {
            int max = -1;
            for (int j=0; j<num_addrs; ++j) {
                int next = adr2os[j].first.childNums[i];
                if (next > max) max = next;
            }
            numPerLevel[level] = max+1;
            ++level;
        }
    }

    hierarchy_info() : depth(1), uninitialized(1) {}
    /** One-time initialization, race-guarded by CAS on 'uninitialized':
        losers of the race spin until the winner publishes uninitialized=0.
        adr2os may be NULL (e.g. affinity "none"), in which case a default
        4-ary bottom level is assumed. */
    void init(AddrUnsPair *adr2os, int num_addrs)
    {
        kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&uninitialized, 1, 2);
        if (bool_result == 0) { // Wait for initialization
            while (TCR_1(uninitialized) != 0) KMP_CPU_PAUSE();
            return;
        }
        KMP_DEBUG_ASSERT(bool_result==1);

        /* Added explicit initialization of the depth here to prevent usage of dirty value
           observed when static library is re-initialized multiple times (e.g. when
           non-OpenMP thread repeatedly launches/joins thread that uses OpenMP). */
        depth = 1;
        for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
            numPerLevel[i] = 1;
            skipPerLevel[i] = 1;
        }

        // Sort table by physical ID
        if (adr2os) {
            qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
            deriveLevels(adr2os, num_addrs);
        }
        else {
            // No topology info: assume 4 leaves per bottom-level node.
            numPerLevel[0] = 4;
            numPerLevel[1] = num_addrs/4;
            if (num_addrs%4) numPerLevel[1]++;
        }

        base_num_threads = num_addrs;
        for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
            if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
                depth++;

        // Rebalance so no level has an excessive branching factor: halve an
        // over-wide level and push the factor of 2 up to the level above,
        // growing the depth when a new level becomes non-trivial.
        kmp_uint32 branch = 4;
        if (numPerLevel[0] == 1) branch = num_addrs/4;
        if (branch<4) branch=4;
        for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
            while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
                if (numPerLevel[d] & 1) numPerLevel[d]++;
                numPerLevel[d] = numPerLevel[d] >> 1;
                if (numPerLevel[d+1] == 1) depth++;
                numPerLevel[d+1] = numPerLevel[d+1] << 1;
            }
            if(numPerLevel[0] == 1) {
                branch = branch >> 1;
                if (branch<4) branch = 4;
            }
        }

        // skipPerLevel[i] = number of leaves under one node at level i.
        for (kmp_uint32 i=1; i<depth; ++i)
            skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];
        // Fill in hierarchy in the case of oversubscription
        for (kmp_uint32 i=depth; i<maxLevels; ++i)
            skipPerLevel[i] = 2*skipPerLevel[i-1];

        uninitialized = 0; // One writer

    }
};
410 
411 static hierarchy_info machine_hierarchy;
412 
//
// Fill in the hierarchical-barrier fields of thr_bar from the lazily
// initialized machine_hierarchy, widening the depth when nproc exceeds
// the number of leaves covered by the recorded hierarchy.
//
void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    kmp_uint32 depth;
    // The test below is true if affinity is available, but set to "none". Need to init on first use of hierarchical barrier.
    if (TCR_1(machine_hierarchy.uninitialized))
        machine_hierarchy.init(NULL, nproc);

    depth = machine_hierarchy.depth;
    KMP_DEBUG_ASSERT(depth > 0);
    // The loop below adjusts the depth in the case of oversubscription
    while (nproc > machine_hierarchy.skipPerLevel[depth-1] && depth<machine_hierarchy.maxLevels-1)
        depth++;

    thr_bar->depth = depth;
    // base_leaf_kids = children of a bottom-level node, excluding the node itself.
    thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
    thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}
429 
430 //
431 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
432 // called to renumber the labels from [0..n] and place them into the child_num
433 // vector of the address object.  This is done in case the labels used for
434 // the children at one node of the hierarchy differ from those used for
435 // another node at the same level.  Example:  suppose the machine has 2 nodes
436 // with 2 packages each.  The first node contains packages 601 and 602, and
437 // second node contains packages 603 and 604.  If we try to sort the table
438 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
439 // because we are paying attention to the labels themselves, not the ordinal
440 // child numbers.  By using the child numbers in the sort, the result is
441 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
442 //
443 static void
444 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
445   int numAddrs)
446 {
447     KMP_DEBUG_ASSERT(numAddrs > 0);
448     int depth = address2os->first.depth;
449     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
450     unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
451       * sizeof(unsigned));
452     int labCt;
453     for (labCt = 0; labCt < depth; labCt++) {
454         address2os[0].first.childNums[labCt] = counts[labCt] = 0;
455         lastLabel[labCt] = address2os[0].first.labels[labCt];
456     }
457     int i;
458     for (i = 1; i < numAddrs; i++) {
459         for (labCt = 0; labCt < depth; labCt++) {
460             if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
461                 int labCt2;
462                 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
463                     counts[labCt2] = 0;
464                     lastLabel[labCt2] = address2os[i].first.labels[labCt2];
465                 }
466                 counts[labCt]++;
467                 lastLabel[labCt] = address2os[i].first.labels[labCt];
468                 break;
469             }
470         }
471         for (labCt = 0; labCt < depth; labCt++) {
472             address2os[i].first.childNums[labCt] = counts[labCt];
473         }
474         for (; labCt < (int)Address::maxDepth; labCt++) {
475             address2os[i].first.childNums[labCt] = 0;
476         }
477     }
478 }
479 
480 
481 //
482 // All of the __kmp_affinity_create_*_map() routines should set
483 // __kmp_affinity_masks to a vector of affinity mask objects of length
484 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
485 // return the number of levels in the machine topology tree (zero if
486 // __kmp_affinity_type == affinity_none).
487 //
488 // All of the __kmp_affinity_create_*_map() routines should set *fullMask
489 // to the affinity mask for the initialization thread.  They need to save and
490 // restore the mask, and it could be needed later, so saving it is just an
491 // optimization to avoid calling kmp_get_system_affinity() again.
492 //
// Affinity mask of the initialization thread, saved by the
// __kmp_affinity_create_*_map() routines (see comment above).
static kmp_affin_mask_t *fullMask = NULL;

// Accessor for the saved initialization-thread mask; may return NULL
// before any create-map routine has run.
kmp_affin_mask_t *
__kmp_affinity_get_fullMask() { return fullMask; }
497 
498 
499 static int nCoresPerPkg, nPackages;
500 static int __kmp_nThreadsPerCore;
501 #ifndef KMP_DFLT_NTH_CORES
502 static int __kmp_ncores;
503 #endif
504 
505 //
506 // __kmp_affinity_uniform_topology() doesn't work when called from
507 // places which support arbitrarily many levels in the machine topology
508 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
509 // __kmp_affinity_create_x2apicid_map().
510 //
511 inline static bool
512 __kmp_affinity_uniform_topology()
513 {
514     return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
515 }
516 
517 
518 //
519 // Print out the detailed machine topology map, i.e. the physical locations
520 // of each OS proc.
521 //
static void
__kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
  int pkgLevel, int coreLevel, int threadLevel)
{
    int proc;

    KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
    // One informational line per OS proc, e.g. "package 0 core 1 thread 0".
    for (proc = 0; proc < len; proc++) {
        int level;
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        for (level = 0; level < depth; level++) {
            // Note: threadLevel/coreLevel/pkgLevel are checked in this
            // order; a level matching none of them gets a generic name.
            if (level == threadLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
            }
            else if (level == coreLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
            }
            else if (level == pkgLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
            }
            else if (level > pkgLevel) {
                // Levels above the package are printed as "node_<n>".
                __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                  level - pkgLevel - 1);
            }
            else {
                // Unrecognized level below the package: generic "L<n>" tag.
                __kmp_str_buf_print(&buf, "L%d ", level);
            }
            __kmp_str_buf_print(&buf, "%d ",
              address2os[proc].first.labels[level]);
        }
        KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
          buf.str);
        __kmp_str_buf_free(&buf);
    }
}
558 
559 
560 //
561 // If we don't know how to retrieve the machine's processor topology, or
562 // encounter an error in doing so, this routine is called to form a "flat"
563 // mapping of os thread id's <-> processor id's.
564 //
static int
__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(__kmp_affinity_type == affinity_none);
        __kmp_ncores = nPackages = __kmp_xproc;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    //  correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nPackages = __kmp_avail_proc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);

        KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        KMP_INFORM(Uniform, "KMP_AFFINITY");
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }
    if (__kmp_affinity_type == affinity_none) {
        return 0;
    }

    //
    // Construct the data structure to be returned: one single-level
    // (package-only) Address per available proc, labeled by OS proc id.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    unsigned int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(1);
        addr.labels[0] = i;
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
    }
    if (__kmp_affinity_verbose) {
        KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Only the package level is modeled in the machine topology map,
        // so the #levels of granularity is either 0 or 1.
        //
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels = 1;
        }
        else {
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 1;  // one topology level
}
655 
656 
657 # if KMP_GROUP_AFFINITY
658 
659 //
660 // If multiple Windows* OS processor groups exist, we can create a 2-level
661 // topology map with the groups at level 0 and the individual procs at
662 // level 1.
663 //
664 // This facilitates letting the threads float among all procs in a group,
665 // if granularity=group (the default when there are multiple groups).
666 //
static int
__kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // If we don't have multiple processor groups, return now.
    // The flat mapping will be used.
    //
    if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
        // FIXME set *msg_id
        return -1;
    }

    //
    // Construct the data structure to be returned: a 2-level map with the
    // group id at level 0 and the proc's bit position within its group's
    // DWORD_PTR-sized span at level 1.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(2);
        addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
        addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
              addr.labels[1]);
        }
    }

    if (__kmp_affinity_gran_levels < 0) {
        if (__kmp_affinity_gran == affinity_gran_group) {
            __kmp_affinity_gran_levels = 1;
        }
        else if ((__kmp_affinity_gran == affinity_gran_fine)
          || (__kmp_affinity_gran == affinity_gran_thread)) {
            __kmp_affinity_gran_levels = 0;
        }
        else {
            const char *gran_str = NULL;
            if (__kmp_affinity_gran == affinity_gran_core) {
                gran_str = "core";
            }
            else if (__kmp_affinity_gran == affinity_gran_package) {
                gran_str = "package";
            }
            else if (__kmp_affinity_gran == affinity_gran_node) {
                gran_str = "node";
            }
            else {
                KMP_ASSERT(0);
            }

            // NOTE(review): gran_str is computed above but never used --
            // presumably a warning ("can't use affinity granularity
            // \"gran\" with group topology method, using \"thread\"")
            // was meant to be emitted here; confirm against the i18n
            // message catalog.
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 2;  // two topology levels (group, proc-in-group)
}
738 
739 # endif /* KMP_GROUP_AFFINITY */
740 
741 
742 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
743 
//
// Smallest r such that (1 << r) >= count, i.e. the number of bits
// needed to encode values in [0, count).
//
static int
__kmp_cpuid_mask_width(int count) {
    int width = 0;
    while ((1 << width) < count) {
        width++;
    }
    return width;
}
752 
753 
// Per-OS-thread record of the topology info retrieved via cpuid while
// the current thread is bound to that OS proc.
class apicThreadInfo {
public:
    unsigned osId;              // param to __kmp_affinity_bind_thread
    unsigned apicId;            // from cpuid after binding
    unsigned maxCoresPerPkg;    //      ""
    unsigned maxThreadsPerPkg;  //      ""
    unsigned pkgId;             // inferred from above values
    unsigned coreId;            //      ""
    unsigned threadId;          //      ""
};
764 
765 
766 static int
767 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
768 {
769     const apicThreadInfo *aa = (const apicThreadInfo *)a;
770     const apicThreadInfo *bb = (const apicThreadInfo *)b;
771     if (aa->osId < bb->osId) return -1;
772     if (aa->osId > bb->osId) return 1;
773     return 0;
774 }
775 
776 
777 static int
778 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
779 {
780     const apicThreadInfo *aa = (const apicThreadInfo *)a;
781     const apicThreadInfo *bb = (const apicThreadInfo *)b;
782     if (aa->pkgId < bb->pkgId) return -1;
783     if (aa->pkgId > bb->pkgId) return 1;
784     if (aa->coreId < bb->coreId) return -1;
785     if (aa->coreId > bb->coreId) return 1;
786     if (aa->threadId < bb->threadId) return -1;
787     if (aa->threadId > bb->threadId) return 1;
788     return 0;
789 }
790 
791 
792 //
793 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
794 // an algorithm which cycles through the available os threads, setting
795 // the current thread's affinity mask to that thread, and then retrieves
796 // the Apic Id for each thread context using the cpuid instruction.
797 //
798 static int
799 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
800   kmp_i18n_id_t *const msg_id)
801 {
802     kmp_cpuid buf;
803     int rc;
804     *address2os = NULL;
805     *msg_id = kmp_i18n_null;
806 
807     //
808     // Check if cpuid leaf 4 is supported.
809     //
810         __kmp_x86_cpuid(0, 0, &buf);
811         if (buf.eax < 4) {
812             *msg_id = kmp_i18n_str_NoLeaf4Support;
813             return -1;
814         }
815 
816     //
817     // The algorithm used starts by setting the affinity to each available
818     // thread and retrieving info from the cpuid instruction, so if we are
819     // not capable of calling __kmp_get_system_affinity() and
820     // _kmp_get_system_affinity(), then we need to do something else - use
821     // the defaults that we calculated from issuing cpuid without binding
822     // to each proc.
823     //
824     if (! KMP_AFFINITY_CAPABLE()) {
825         //
826         // Hack to try and infer the machine topology using only the data
827         // available from cpuid on the current thread, and __kmp_xproc.
828         //
829         KMP_ASSERT(__kmp_affinity_type == affinity_none);
830 
831         //
832         // Get an upper bound on the number of threads per package using
833         // cpuid(1).
834         //
835         // On some OS/chps combinations where HT is supported by the chip
836         // but is disabled, this value will be 2 on a single core chip.
837         // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
838         //
839         __kmp_x86_cpuid(1, 0, &buf);
840         int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
841         if (maxThreadsPerPkg == 0) {
842             maxThreadsPerPkg = 1;
843         }
844 
845         //
846         // The num cores per pkg comes from cpuid(4).
847         // 1 must be added to the encoded value.
848         //
849         // The author of cpu_count.cpp treated this only an upper bound
850         // on the number of cores, but I haven't seen any cases where it
851         // was greater than the actual number of cores, so we will treat
852         // it as exact in this block of code.
853         //
854         // First, we need to check if cpuid(4) is supported on this chip.
855         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
856         // has the value n or greater.
857         //
858         __kmp_x86_cpuid(0, 0, &buf);
859         if (buf.eax >= 4) {
860             __kmp_x86_cpuid(4, 0, &buf);
861             nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
862         }
863         else {
864             nCoresPerPkg = 1;
865         }
866 
867         //
868         // There is no way to reliably tell if HT is enabled without issuing
869         // the cpuid instruction from every thread, can correlating the cpuid
870         // info, so if the machine is not affinity capable, we assume that HT
871         // is off.  We have seen quite a few machines where maxThreadsPerPkg
872         // is 2, yet the machine does not support HT.
873         //
874         // - Older OSes are usually found on machines with older chips, which
875         //   do not support HT.
876         //
877         // - The performance penalty for mistakenly identifying a machine as
878         //   HT when it isn't (which results in blocktime being incorrecly set
879         //   to 0) is greater than the penalty when for mistakenly identifying
880         //   a machine as being 1 thread/core when it is really HT enabled
881         //   (which results in blocktime being incorrectly set to a positive
882         //   value).
883         //
884         __kmp_ncores = __kmp_xproc;
885         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
886         __kmp_nThreadsPerCore = 1;
887         if (__kmp_affinity_verbose) {
888             KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
889             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
890             if (__kmp_affinity_uniform_topology()) {
891                 KMP_INFORM(Uniform, "KMP_AFFINITY");
892             } else {
893                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
894             }
895             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
896               __kmp_nThreadsPerCore, __kmp_ncores);
897         }
898         return 0;
899     }
900 
901     //
902     //
903     // From here on, we can assume that it is safe to call
904     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
905     // even if __kmp_affinity_type = affinity_none.
906     //
907 
908     //
909     // Save the affinity mask for the current thread.
910     //
911     kmp_affin_mask_t *oldMask;
912     KMP_CPU_ALLOC(oldMask);
913     KMP_ASSERT(oldMask != NULL);
914     __kmp_get_system_affinity(oldMask, TRUE);
915 
916     //
917     // Run through each of the available contexts, binding the current thread
918     // to it, and obtaining the pertinent information using the cpuid instr.
919     //
920     // The relevant information is:
921     //
922     // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //    has a unique Apic Id, which is of the form pkg# : core# : thread#.
924     //
925     // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
926     //    value of this field determines the width of the core# + thread#
927     //    fields in the Apic Id.  It is also an upper bound on the number
928     //    of threads per package, but it has been verified that situations
    //    happen where it is not exact.  In particular, on certain OS/chip
930     //    combinations where Intel(R) Hyper-Threading Technology is supported
931     //    by the chip but has
932     //    been disabled, the value of this field will be 2 (for a single core
933     //    chip).  On other OS/chip combinations supporting
934     //    Intel(R) Hyper-Threading Technology, the value of
935     //    this field will be 1 when Intel(R) Hyper-Threading Technology is
936     //    disabled and 2 when it is enabled.
937     //
938     // Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4).  The
939     //    value of this field (+1) determines the width of the core# field in
940     //    the Apic Id.  The comments in "cpucount.cpp" say that this value is
941     //    an upper bound, but the IA-32 architecture manual says that it is
942     //    exactly the number of cores per package, and I haven't seen any
943     //    case where it wasn't.
944     //
945     // From this information, deduce the package Id, core Id, and thread Id,
946     // and set the corresponding fields in the apicThreadInfo struct.
947     //
948     unsigned i;
949     apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
950       __kmp_avail_proc * sizeof(apicThreadInfo));
951     unsigned nApics = 0;
952     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
953         //
954         // Skip this proc if it is not included in the machine model.
955         //
956         if (! KMP_CPU_ISSET(i, fullMask)) {
957             continue;
958         }
959         KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
960 
961         __kmp_affinity_bind_thread(i);
962         threadInfo[nApics].osId = i;
963 
964         //
965         // The apic id and max threads per pkg come from cpuid(1).
966         //
967         __kmp_x86_cpuid(1, 0, &buf);
968         if (! (buf.edx >> 9) & 1) {
969             __kmp_set_system_affinity(oldMask, TRUE);
970             __kmp_free(threadInfo);
971             KMP_CPU_FREE(oldMask);
972             *msg_id = kmp_i18n_str_ApicNotPresent;
973             return -1;
974         }
975         threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
976         threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
977         if (threadInfo[nApics].maxThreadsPerPkg == 0) {
978             threadInfo[nApics].maxThreadsPerPkg = 1;
979         }
980 
981         //
982         // Max cores per pkg comes from cpuid(4).
983         // 1 must be added to the encoded value.
984         //
985         // First, we need to check if cpuid(4) is supported on this chip.
986         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
987         // has the value n or greater.
988         //
989         __kmp_x86_cpuid(0, 0, &buf);
990         if (buf.eax >= 4) {
991             __kmp_x86_cpuid(4, 0, &buf);
992             threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
993         }
994         else {
995             threadInfo[nApics].maxCoresPerPkg = 1;
996         }
997 
998         //
999         // Infer the pkgId / coreId / threadId using only the info
1000         // obtained locally.
1001         //
1002         int widthCT = __kmp_cpuid_mask_width(
1003           threadInfo[nApics].maxThreadsPerPkg);
1004         threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
1005 
1006         int widthC = __kmp_cpuid_mask_width(
1007           threadInfo[nApics].maxCoresPerPkg);
1008         int widthT = widthCT - widthC;
1009         if (widthT < 0) {
1010             //
1011             // I've never seen this one happen, but I suppose it could, if
1012             // the cpuid instruction on a chip was really screwed up.
1013             // Make sure to restore the affinity mask before the tail call.
1014             //
1015             __kmp_set_system_affinity(oldMask, TRUE);
1016             __kmp_free(threadInfo);
1017             KMP_CPU_FREE(oldMask);
1018             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1019             return -1;
1020         }
1021 
1022         int maskC = (1 << widthC) - 1;
1023         threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
1024           &maskC;
1025 
1026         int maskT = (1 << widthT) - 1;
1027         threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT;
1028 
1029         nApics++;
1030     }
1031 
1032     //
1033     // We've collected all the info we need.
1034     // Restore the old affinity mask for this thread.
1035     //
1036     __kmp_set_system_affinity(oldMask, TRUE);
1037 
1038     //
1039     // If there's only one thread context to bind to, form an Address object
1040     // with depth 1 and return immediately (or, if affinity is off, set
1041     // address2os to NULL and return).
1042     //
1043     // If it is configured to omit the package level when there is only a
1044     // single package, the logic at the end of this routine won't work if
1045     // there is only a single thread - it would try to form an Address
1046     // object with depth 0.
1047     //
1048     KMP_ASSERT(nApics > 0);
1049     if (nApics == 1) {
1050         __kmp_ncores = nPackages = 1;
1051         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1052         if (__kmp_affinity_verbose) {
1053             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1054             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1055 
1056             KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1057             if (__kmp_affinity_respect_mask) {
1058                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1059             } else {
1060                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1061             }
1062             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1063             KMP_INFORM(Uniform, "KMP_AFFINITY");
1064             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1065               __kmp_nThreadsPerCore, __kmp_ncores);
1066         }
1067 
1068         if (__kmp_affinity_type == affinity_none) {
1069             __kmp_free(threadInfo);
1070             KMP_CPU_FREE(oldMask);
1071             return 0;
1072         }
1073 
1074         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
1075         Address addr(1);
1076         addr.labels[0] = threadInfo[0].pkgId;
1077         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
1078 
1079         if (__kmp_affinity_gran_levels < 0) {
1080             __kmp_affinity_gran_levels = 0;
1081         }
1082 
1083         if (__kmp_affinity_verbose) {
1084             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1085         }
1086 
1087         __kmp_free(threadInfo);
1088         KMP_CPU_FREE(oldMask);
1089         return 1;
1090     }
1091 
1092     //
1093     // Sort the threadInfo table by physical Id.
1094     //
1095     qsort(threadInfo, nApics, sizeof(*threadInfo),
1096       __kmp_affinity_cmp_apicThreadInfo_phys_id);
1097 
1098     //
1099     // The table is now sorted by pkgId / coreId / threadId, but we really
1100     // don't know the radix of any of the fields.  pkgId's may be sparsely
1101     // assigned among the chips on a system.  Although coreId's are usually
1102     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1103     // [0..threadsPerCore-1], we don't want to make any such assumptions.
1104     //
1105     // For that matter, we don't know what coresPerPkg and threadsPerCore
1106     // (or the total # packages) are at this point - we want to determine
1107     // that now.  We only have an upper bound on the first two figures.
1108     //
1109     // We also perform a consistency check at this point: the values returned
1110     // by the cpuid instruction for any thread bound to a given package had
1111     // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1112     //
1113     nPackages = 1;
1114     nCoresPerPkg = 1;
1115     __kmp_nThreadsPerCore = 1;
1116     unsigned nCores = 1;
1117 
1118     unsigned pkgCt = 1;                         // to determine radii
1119     unsigned lastPkgId = threadInfo[0].pkgId;
1120     unsigned coreCt = 1;
1121     unsigned lastCoreId = threadInfo[0].coreId;
1122     unsigned threadCt = 1;
1123     unsigned lastThreadId = threadInfo[0].threadId;
1124 
1125                                                 // intra-pkg consist checks
1126     unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1127     unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1128 
1129     for (i = 1; i < nApics; i++) {
1130         if (threadInfo[i].pkgId != lastPkgId) {
1131             nCores++;
1132             pkgCt++;
1133             lastPkgId = threadInfo[i].pkgId;
1134             if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1135             coreCt = 1;
1136             lastCoreId = threadInfo[i].coreId;
1137             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1138             threadCt = 1;
1139             lastThreadId = threadInfo[i].threadId;
1140 
1141             //
1142             // This is a different package, so go on to the next iteration
1143             // without doing any consistency checks.  Reset the consistency
1144             // check vars, though.
1145             //
1146             prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1147             prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1148             continue;
1149         }
1150 
1151         if (threadInfo[i].coreId != lastCoreId) {
1152             nCores++;
1153             coreCt++;
1154             lastCoreId = threadInfo[i].coreId;
1155             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1156             threadCt = 1;
1157             lastThreadId = threadInfo[i].threadId;
1158         }
1159         else if (threadInfo[i].threadId != lastThreadId) {
1160             threadCt++;
1161             lastThreadId = threadInfo[i].threadId;
1162         }
1163         else {
1164             __kmp_free(threadInfo);
1165             KMP_CPU_FREE(oldMask);
1166             *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1167             return -1;
1168         }
1169 
1170         //
1171         // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
        // fields agree between all the threads bound to a given package.
1173         //
1174         if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
1175           || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1176             __kmp_free(threadInfo);
1177             KMP_CPU_FREE(oldMask);
1178             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1179             return -1;
1180         }
1181     }
1182     nPackages = pkgCt;
1183     if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1184     if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1185 
1186     //
1187     // When affinity is off, this routine will still be called to set
1188     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1189     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1190     // correctly, and return now if affinity is not enabled.
1191     //
1192     __kmp_ncores = nCores;
1193     if (__kmp_affinity_verbose) {
1194         char buf[KMP_AFFIN_MASK_PRINT_LEN];
1195         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1196 
1197         KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1198         if (__kmp_affinity_respect_mask) {
1199             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1200         } else {
1201             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1202         }
1203         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1204         if (__kmp_affinity_uniform_topology()) {
1205             KMP_INFORM(Uniform, "KMP_AFFINITY");
1206         } else {
1207             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1208         }
1209         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1210           __kmp_nThreadsPerCore, __kmp_ncores);
1211 
1212     }
1213 
1214     if (__kmp_affinity_type == affinity_none) {
1215         __kmp_free(threadInfo);
1216         KMP_CPU_FREE(oldMask);
1217         return 0;
1218     }
1219 
1220     //
1221     // Now that we've determined the number of packages, the number of cores
1222     // per package, and the number of threads per core, we can construct the
1223     // data structure that is to be returned.
1224     //
1225     int pkgLevel = 0;
1226     int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1227     int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1228     unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
1229 
1230     KMP_ASSERT(depth > 0);
1231     *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1232 
1233     for (i = 0; i < nApics; ++i) {
1234         Address addr(depth);
1235         unsigned os = threadInfo[i].osId;
1236         int d = 0;
1237 
1238         if (pkgLevel >= 0) {
1239             addr.labels[d++] = threadInfo[i].pkgId;
1240         }
1241         if (coreLevel >= 0) {
1242             addr.labels[d++] = threadInfo[i].coreId;
1243         }
1244         if (threadLevel >= 0) {
1245             addr.labels[d++] = threadInfo[i].threadId;
1246         }
1247         (*address2os)[i] = AddrUnsPair(addr, os);
1248     }
1249 
1250     if (__kmp_affinity_gran_levels < 0) {
1251         //
1252         // Set the granularity level based on what levels are modeled
1253         // in the machine topology map.
1254         //
1255         __kmp_affinity_gran_levels = 0;
1256         if ((threadLevel >= 0)
1257           && (__kmp_affinity_gran > affinity_gran_thread)) {
1258             __kmp_affinity_gran_levels++;
1259         }
1260         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1261             __kmp_affinity_gran_levels++;
1262         }
1263         if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1264             __kmp_affinity_gran_levels++;
1265         }
1266     }
1267 
1268     if (__kmp_affinity_verbose) {
1269         __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1270           coreLevel, threadLevel);
1271     }
1272 
1273     __kmp_free(threadInfo);
1274     KMP_CPU_FREE(oldMask);
1275     return depth;
1276 }
1277 
1278 
1279 //
1280 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1281 // architectures support a newer interface for specifying the x2APIC Ids,
1282 // based on cpuid leaf 11.
1283 //
1284 static int
1285 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1286   kmp_i18n_id_t *const msg_id)
1287 {
1288     kmp_cpuid buf;
1289 
1290     *address2os = NULL;
1291     *msg_id = kmp_i18n_null;
1292 
1293     //
1294     // Check to see if cpuid leaf 11 is supported.
1295     //
1296     __kmp_x86_cpuid(0, 0, &buf);
1297     if (buf.eax < 11) {
1298         *msg_id = kmp_i18n_str_NoLeaf11Support;
1299         return -1;
1300     }
1301     __kmp_x86_cpuid(11, 0, &buf);
1302     if (buf.ebx == 0) {
1303         *msg_id = kmp_i18n_str_NoLeaf11Support;
1304         return -1;
1305     }
1306 
1307     //
1308     // Find the number of levels in the machine topology.  While we're at it,
1309     // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.  We will
1310     // try to get more accurate values later by explicitly counting them,
1311     // but get reasonable defaults now, in case we return early.
1312     //
1313     int level;
1314     int threadLevel = -1;
1315     int coreLevel = -1;
1316     int pkgLevel = -1;
1317     __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1318 
1319     for (level = 0;; level++) {
1320         if (level > 31) {
1321             //
1322             // FIXME: Hack for DPD200163180
1323             //
1324             // If level is big then something went wrong -> exiting
1325             //
1326             // There could actually be 32 valid levels in the machine topology,
1327             // but so far, the only machine we have seen which does not exit
1328             // this loop before iteration 32 has fubar x2APIC settings.
1329             //
1330             // For now, just reject this case based upon loop trip count.
1331             //
1332             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1333             return -1;
1334         }
1335         __kmp_x86_cpuid(11, level, &buf);
1336         if (buf.ebx == 0) {
1337             if (pkgLevel < 0) {
1338                 //
1339                 // Will infer nPackages from __kmp_xproc
1340                 //
1341                 pkgLevel = level;
1342                 level++;
1343             }
1344             break;
1345         }
1346         int kind = (buf.ecx >> 8) & 0xff;
1347         if (kind == 1) {
1348             //
1349             // SMT level
1350             //
1351             threadLevel = level;
1352             coreLevel = -1;
1353             pkgLevel = -1;
1354             __kmp_nThreadsPerCore = buf.ebx & 0xff;
1355             if (__kmp_nThreadsPerCore == 0) {
1356                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1357                 return -1;
1358             }
1359         }
1360         else if (kind == 2) {
1361             //
1362             // core level
1363             //
1364             coreLevel = level;
1365             pkgLevel = -1;
1366             nCoresPerPkg = buf.ebx & 0xff;
1367             if (nCoresPerPkg == 0) {
1368                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1369                 return -1;
1370             }
1371         }
1372         else {
1373             if (level <= 0) {
1374                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1375                 return -1;
1376             }
1377             if (pkgLevel >= 0) {
1378                 continue;
1379             }
1380             pkgLevel = level;
1381             nPackages = buf.ebx & 0xff;
1382             if (nPackages == 0) {
1383                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1384                 return -1;
1385             }
1386         }
1387     }
1388     int depth = level;
1389 
1390     //
1391     // In the above loop, "level" was counted from the finest level (usually
1392     // thread) to the coarsest.  The caller expects that we will place the
1393     // labels in (*address2os)[].first.labels[] in the inverse order, so
1394     // we need to invert the vars saying which level means what.
1395     //
1396     if (threadLevel >= 0) {
1397         threadLevel = depth - threadLevel - 1;
1398     }
1399     if (coreLevel >= 0) {
1400         coreLevel = depth - coreLevel - 1;
1401     }
1402     KMP_DEBUG_ASSERT(pkgLevel >= 0);
1403     pkgLevel = depth - pkgLevel - 1;
1404 
1405     //
1406     // The algorithm used starts by setting the affinity to each available
1407     // thread and retrieving info from the cpuid instruction, so if we are
1408     // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
1410     // the defaults that we calculated from issuing cpuid without binding
1411     // to each proc.
1412     //
1413     if (! KMP_AFFINITY_CAPABLE())
1414     {
1415         //
1416         // Hack to try and infer the machine topology using only the data
1417         // available from cpuid on the current thread, and __kmp_xproc.
1418         //
1419         KMP_ASSERT(__kmp_affinity_type == affinity_none);
1420 
1421         __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1422         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1423         if (__kmp_affinity_verbose) {
1424             KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1425             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1426             if (__kmp_affinity_uniform_topology()) {
1427                 KMP_INFORM(Uniform, "KMP_AFFINITY");
1428             } else {
1429                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1430             }
1431             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1432               __kmp_nThreadsPerCore, __kmp_ncores);
1433         }
1434         return 0;
1435     }
1436 
1437     //
1438     //
1439     // From here on, we can assume that it is safe to call
1440     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1441     // even if __kmp_affinity_type = affinity_none.
1442     //
1443 
1444     //
1445     // Save the affinity mask for the current thread.
1446     //
1447     kmp_affin_mask_t *oldMask;
1448     KMP_CPU_ALLOC(oldMask);
1449     __kmp_get_system_affinity(oldMask, TRUE);
1450 
1451     //
1452     // Allocate the data structure to be returned.
1453     //
1454     AddrUnsPair *retval = (AddrUnsPair *)
1455       __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1456 
1457     //
1458     // Run through each of the available contexts, binding the current thread
1459     // to it, and obtaining the pertinent information using the cpuid instr.
1460     //
1461     unsigned int proc;
1462     int nApics = 0;
1463     for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
1464         //
1465         // Skip this proc if it is not included in the machine model.
1466         //
1467         if (! KMP_CPU_ISSET(proc, fullMask)) {
1468             continue;
1469         }
1470         KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1471 
1472         __kmp_affinity_bind_thread(proc);
1473 
1474         //
        // Extract the labels for each level in the machine topology map
1476         // from the Apic ID.
1477         //
1478         Address addr(depth);
1479         int prev_shift = 0;
1480 
1481         for (level = 0; level < depth; level++) {
1482             __kmp_x86_cpuid(11, level, &buf);
1483             unsigned apicId = buf.edx;
1484             if (buf.ebx == 0) {
1485                 if (level != depth - 1) {
1486                     KMP_CPU_FREE(oldMask);
1487                     *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1488                     return -1;
1489                 }
1490                 addr.labels[depth - level - 1] = apicId >> prev_shift;
1491                 level++;
1492                 break;
1493             }
1494             int shift = buf.eax & 0x1f;
1495             int mask = (1 << shift) - 1;
1496             addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1497             prev_shift = shift;
1498         }
1499         if (level != depth) {
1500             KMP_CPU_FREE(oldMask);
1501             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1502             return -1;
1503         }
1504 
1505         retval[nApics] = AddrUnsPair(addr, proc);
1506         nApics++;
1507     }
1508 
1509     //
1510     // We've collected all the info we need.
1511     // Restore the old affinity mask for this thread.
1512     //
1513     __kmp_set_system_affinity(oldMask, TRUE);
1514 
1515     //
1516     // If there's only one thread context to bind to, return now.
1517     //
1518     KMP_ASSERT(nApics > 0);
1519     if (nApics == 1) {
1520         __kmp_ncores = nPackages = 1;
1521         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1522         if (__kmp_affinity_verbose) {
1523             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1524             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1525 
1526             KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1527             if (__kmp_affinity_respect_mask) {
1528                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1529             } else {
1530                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1531             }
1532             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1533             KMP_INFORM(Uniform, "KMP_AFFINITY");
1534             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1535               __kmp_nThreadsPerCore, __kmp_ncores);
1536         }
1537 
1538         if (__kmp_affinity_type == affinity_none) {
1539             __kmp_free(retval);
1540             KMP_CPU_FREE(oldMask);
1541             return 0;
1542         }
1543 
1544         //
1545         // Form an Address object which only includes the package level.
1546         //
1547         Address addr(1);
1548         addr.labels[0] = retval[0].first.labels[pkgLevel];
1549         retval[0].first = addr;
1550 
1551         if (__kmp_affinity_gran_levels < 0) {
1552             __kmp_affinity_gran_levels = 0;
1553         }
1554 
1555         if (__kmp_affinity_verbose) {
1556             __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1557         }
1558 
1559         *address2os = retval;
1560         KMP_CPU_FREE(oldMask);
1561         return 1;
1562     }
1563 
1564     //
1565     // Sort the table by physical Id.
1566     //
1567     qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1568 
1569     //
1570     // Find the radix at each of the levels.
1571     //
1572     unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1573     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1574     unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1575     unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1576     for (level = 0; level < depth; level++) {
1577         totals[level] = 1;
1578         maxCt[level] = 1;
1579         counts[level] = 1;
1580         last[level] = retval[0].first.labels[level];
1581     }
1582 
1583     //
1584     // From here on, the iteration variable "level" runs from the finest
1585     // level to the coarsest, i.e. we iterate forward through
1586     // (*address2os)[].first.labels[] - in the previous loops, we iterated
1587     // backwards.
1588     //
1589     for (proc = 1; (int)proc < nApics; proc++) {
1590         int level;
1591         for (level = 0; level < depth; level++) {
1592             if (retval[proc].first.labels[level] != last[level]) {
1593                 int j;
1594                 for (j = level + 1; j < depth; j++) {
1595                     totals[j]++;
1596                     counts[j] = 1;
                    // Resetting maxCt[j] (the commented-out line below) caused
                    // incorrect topology information to be printed whenever the
                    // maximum count for some level (maxCt[level]) was encountered
                    // earlier than a smaller count while walking the array.
                    // For example, if pkg0 has 4 cores and pkg1 has 2 cores, then
                    // maxCt[1] would end up as 2, whereas it must be 4.
1602                     // TODO!!! Check if it can be commented safely
1603                     //maxCt[j] = 1;
1604                     last[j] = retval[proc].first.labels[j];
1605                 }
1606                 totals[level]++;
1607                 counts[level]++;
1608                 if (counts[level] > maxCt[level]) {
1609                     maxCt[level] = counts[level];
1610                 }
1611                 last[level] = retval[proc].first.labels[level];
1612                 break;
1613             }
1614             else if (level == depth - 1) {
1615                 __kmp_free(last);
1616                 __kmp_free(maxCt);
1617                 __kmp_free(counts);
1618                 __kmp_free(totals);
1619                 __kmp_free(retval);
1620                 KMP_CPU_FREE(oldMask);
1621                 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1622                 return -1;
1623             }
1624         }
1625     }
1626 
1627     //
1628     // When affinity is off, this routine will still be called to set
1629     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1630     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1631     // correctly, and return if affinity is not enabled.
1632     //
1633     if (threadLevel >= 0) {
1634         __kmp_nThreadsPerCore = maxCt[threadLevel];
1635     }
1636     else {
1637         __kmp_nThreadsPerCore = 1;
1638     }
1639     nPackages = totals[pkgLevel];
1640 
1641     if (coreLevel >= 0) {
1642         __kmp_ncores = totals[coreLevel];
1643         nCoresPerPkg = maxCt[coreLevel];
1644     }
1645     else {
1646         __kmp_ncores = nPackages;
1647         nCoresPerPkg = 1;
1648     }
1649 
1650     //
1651     // Check to see if the machine topology is uniform
1652     //
1653     unsigned prod = maxCt[0];
1654     for (level = 1; level < depth; level++) {
1655        prod *= maxCt[level];
1656     }
1657     bool uniform = (prod == totals[level - 1]);
1658 
1659     //
1660     // Print the machine topology summary.
1661     //
1662     if (__kmp_affinity_verbose) {
1663         char mask[KMP_AFFIN_MASK_PRINT_LEN];
1664         __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1665 
1666         KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1667         if (__kmp_affinity_respect_mask) {
1668             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1669         } else {
1670             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1671         }
1672         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1673         if (uniform) {
1674             KMP_INFORM(Uniform, "KMP_AFFINITY");
1675         } else {
1676             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1677         }
1678 
1679         kmp_str_buf_t buf;
1680         __kmp_str_buf_init(&buf);
1681 
1682         __kmp_str_buf_print(&buf, "%d", totals[0]);
1683         for (level = 1; level <= pkgLevel; level++) {
1684             __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1685         }
1686         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1687           __kmp_nThreadsPerCore, __kmp_ncores);
1688 
1689         __kmp_str_buf_free(&buf);
1690     }
1691 
1692     if (__kmp_affinity_type == affinity_none) {
1693         __kmp_free(last);
1694         __kmp_free(maxCt);
1695         __kmp_free(counts);
1696         __kmp_free(totals);
1697         __kmp_free(retval);
1698         KMP_CPU_FREE(oldMask);
1699         return 0;
1700     }
1701 
1702     //
    // Find any levels with radix 1, and remove them from the map
1704     // (except for the package level).
1705     //
1706     int new_depth = 0;
1707     for (level = 0; level < depth; level++) {
1708         if ((maxCt[level] == 1) && (level != pkgLevel)) {
1709            continue;
1710         }
1711         new_depth++;
1712     }
1713 
1714     //
1715     // If we are removing any levels, allocate a new vector to return,
1716     // and copy the relevant information to it.
1717     //
1718     if (new_depth != depth) {
1719         AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1720           sizeof(AddrUnsPair) * nApics);
1721         for (proc = 0; (int)proc < nApics; proc++) {
1722             Address addr(new_depth);
1723             new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1724         }
1725         int new_level = 0;
1726         for (level = 0; level < depth; level++) {
1727             if ((maxCt[level] == 1) && (level != pkgLevel)) {
1728                if (level == threadLevel) {
1729                    threadLevel = -1;
1730                }
1731                else if ((threadLevel >= 0) && (level < threadLevel)) {
1732                    threadLevel--;
1733                }
1734                if (level == coreLevel) {
1735                    coreLevel = -1;
1736                }
1737                else if ((coreLevel >= 0) && (level < coreLevel)) {
1738                    coreLevel--;
1739                }
1740                if (level < pkgLevel) {
1741                    pkgLevel--;
1742                }
1743                continue;
1744             }
1745             for (proc = 0; (int)proc < nApics; proc++) {
1746                 new_retval[proc].first.labels[new_level]
1747                   = retval[proc].first.labels[level];
1748             }
1749             new_level++;
1750         }
1751 
1752         __kmp_free(retval);
1753         retval = new_retval;
1754         depth = new_depth;
1755     }
1756 
1757     if (__kmp_affinity_gran_levels < 0) {
1758         //
1759         // Set the granularity level based on what levels are modeled
1760         // in the machine topology map.
1761         //
1762         __kmp_affinity_gran_levels = 0;
1763         if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1764             __kmp_affinity_gran_levels++;
1765         }
1766         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1767             __kmp_affinity_gran_levels++;
1768         }
1769         if (__kmp_affinity_gran > affinity_gran_package) {
1770             __kmp_affinity_gran_levels++;
1771         }
1772     }
1773 
1774     if (__kmp_affinity_verbose) {
1775         __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1776           coreLevel, threadLevel);
1777     }
1778 
1779     __kmp_free(last);
1780     __kmp_free(maxCt);
1781     __kmp_free(counts);
1782     __kmp_free(totals);
1783     KMP_CPU_FREE(oldMask);
1784     *address2os = retval;
1785     return depth;
1786 }
1787 
1788 
1789 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1790 
1791 
//
// Indices into each per-proc record parsed from /proc/cpuinfo.  The osId
// (processor number) is the least significant field; node_<n> levels, if
// present in the file, are appended starting at nodeIdIndex.
//
#define osIdIndex       0
#define threadIdIndex   1
#define coreIdIndex     2
#define pkgIdIndex      3
#define nodeIdIndex     4

typedef unsigned *ProcCpuInfo;

// Highest valid index into a per-proc record; grows beyond pkgIdIndex when
// node_<n> fields are discovered during the first scan of the file.
static unsigned maxIndex = pkgIdIndex;


//
// qsort() comparator: order per-proc records by OS processor id.
//
// NOTE: unlike __kmp_affinity_cmp_ProcCpuInfo_phys_id, this comparator
// expects 'a' and 'b' to point directly at the records (flat arrays of
// unsigned), not at pointers to them.
//
static int
__kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
{
    const unsigned *aa = (const unsigned *)a;
    const unsigned *bb = (const unsigned *)b;
    if (aa[osIdIndex] < bb[osIdIndex]) return -1;
    if (aa[osIdIndex] > bb[osIdIndex]) return 1;
    return 0;
}
1811 
1812 
1813 static int
1814 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1815 {
1816     unsigned i;
1817     const unsigned *aa = *((const unsigned **)a);
1818     const unsigned *bb = *((const unsigned **)b);
1819     for (i = maxIndex; ; i--) {
1820         if (aa[i] < bb[i]) return -1;
1821         if (aa[i] > bb[i]) return 1;
1822         if (i == osIdIndex) break;
1823     }
1824     return 0;
1825 }
1826 
1827 
1828 //
1829 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1830 // affinity map.
1831 //
1832 static int
1833 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1834   kmp_i18n_id_t *const msg_id, FILE *f)
1835 {
1836     *address2os = NULL;
1837     *msg_id = kmp_i18n_null;
1838 
1839     //
1840     // Scan of the file, and count the number of "processor" (osId) fields,
1841     // and find the highest value of <n> for a node_<n> field.
1842     //
1843     char buf[256];
1844     unsigned num_records = 0;
1845     while (! feof(f)) {
1846         buf[sizeof(buf) - 1] = 1;
1847         if (! fgets(buf, sizeof(buf), f)) {
1848             //
1849             // Read errors presumably because of EOF
1850             //
1851             break;
1852         }
1853 
1854         char s1[] = "processor";
1855         if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1856             num_records++;
1857             continue;
1858         }
1859 
1860         //
1861         // FIXME - this will match "node_<n> <garbage>"
1862         //
1863         unsigned level;
1864         if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
1865             if (nodeIdIndex + level >= maxIndex) {
1866                 maxIndex = nodeIdIndex + level;
1867             }
1868             continue;
1869         }
1870     }
1871 
1872     //
1873     // Check for empty file / no valid processor records, or too many.
1874     // The number of records can't exceed the number of valid bits in the
1875     // affinity mask.
1876     //
1877     if (num_records == 0) {
1878         *line = 0;
1879         *msg_id = kmp_i18n_str_NoProcRecords;
1880         return -1;
1881     }
1882     if (num_records > (unsigned)__kmp_xproc) {
1883         *line = 0;
1884         *msg_id = kmp_i18n_str_TooManyProcRecords;
1885         return -1;
1886     }
1887 
1888     //
    // Set the file pointer back to the beginning, so that we can scan the
1890     // file again, this time performing a full parse of the data.
1891     // Allocate a vector of ProcCpuInfo object, where we will place the data.
1892     // Adding an extra element at the end allows us to remove a lot of extra
1893     // checks for termination conditions.
1894     //
1895     if (fseek(f, 0, SEEK_SET) != 0) {
1896         *line = 0;
1897         *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1898         return -1;
1899     }
1900 
1901     //
1902     // Allocate the array of records to store the proc info in.  The dummy
1903     // element at the end makes the logic in filling them out easier to code.
1904     //
1905     unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1906       * sizeof(unsigned *));
1907     unsigned i;
1908     for (i = 0; i <= num_records; i++) {
1909         threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1910           * sizeof(unsigned));
1911     }
1912 
1913 #define CLEANUP_THREAD_INFO \
1914     for (i = 0; i <= num_records; i++) {                                \
1915         __kmp_free(threadInfo[i]);                                      \
1916     }                                                                   \
1917     __kmp_free(threadInfo);
1918 
1919     //
1920     // A value of UINT_MAX means that we didn't find the field
1921     //
1922     unsigned __index;
1923 
1924 #define INIT_PROC_INFO(p) \
1925     for (__index = 0; __index <= maxIndex; __index++) {                 \
1926         (p)[__index] = UINT_MAX;                                        \
1927     }
1928 
1929     for (i = 0; i <= num_records; i++) {
1930         INIT_PROC_INFO(threadInfo[i]);
1931     }
1932 
1933     unsigned num_avail = 0;
1934     *line = 0;
1935     while (! feof(f)) {
1936         //
1937         // Create an inner scoping level, so that all the goto targets at the
1938         // end of the loop appear in an outer scoping level.  This avoids
1939         // warnings about jumping past an initialization to a target in the
1940         // same block.
1941         //
1942         {
1943             buf[sizeof(buf) - 1] = 1;
1944             bool long_line = false;
1945             if (! fgets(buf, sizeof(buf), f)) {
1946                 //
1947                 // Read errors presumably because of EOF
1948                 //
1949                 // If there is valid data in threadInfo[num_avail], then fake
                // a blank line to ensure that the last address gets parsed.
1951                 //
1952                 bool valid = false;
1953                 for (i = 0; i <= maxIndex; i++) {
1954                     if (threadInfo[num_avail][i] != UINT_MAX) {
1955                         valid = true;
1956                     }
1957                 }
1958                 if (! valid) {
1959                     break;
1960                 }
1961                 buf[0] = 0;
1962             } else if (!buf[sizeof(buf) - 1]) {
1963                 //
1964                 // The line is longer than the buffer.  Set a flag and don't
1965                 // emit an error if we were going to ignore the line, anyway.
1966                 //
1967                 long_line = true;
1968 
1969 #define CHECK_LINE \
1970     if (long_line) {                                                    \
1971         CLEANUP_THREAD_INFO;                                            \
1972         *msg_id = kmp_i18n_str_LongLineCpuinfo;                         \
1973         return -1;                                                      \
1974     }
1975             }
1976             (*line)++;
1977 
1978             char s1[] = "processor";
1979             if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1980                 CHECK_LINE;
1981                 char *p = strchr(buf + sizeof(s1) - 1, ':');
1982                 unsigned val;
1983                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
1984                 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
1985                 threadInfo[num_avail][osIdIndex] = val;
1986 #if KMP_OS_LINUX && USE_SYSFS_INFO
1987                 char path[256];
1988                 KMP_SNPRINTF(path, sizeof(path),
1989                     "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
1990                     threadInfo[num_avail][osIdIndex]);
1991                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
1992 
1993                 KMP_SNPRINTF(path, sizeof(path),
1994                     "/sys/devices/system/cpu/cpu%u/topology/core_id",
1995                     threadInfo[num_avail][osIdIndex]);
1996                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
1997                 continue;
1998 #else
1999             }
2000             char s2[] = "physical id";
2001             if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
2002                 CHECK_LINE;
2003                 char *p = strchr(buf + sizeof(s2) - 1, ':');
2004                 unsigned val;
2005                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2006                 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
2007                 threadInfo[num_avail][pkgIdIndex] = val;
2008                 continue;
2009             }
2010             char s3[] = "core id";
2011             if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2012                 CHECK_LINE;
2013                 char *p = strchr(buf + sizeof(s3) - 1, ':');
2014                 unsigned val;
2015                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2016                 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
2017                 threadInfo[num_avail][coreIdIndex] = val;
2018                 continue;
2019 #endif // KMP_OS_LINUX && USE_SYSFS_INFO
2020             }
2021             char s4[] = "thread id";
2022             if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2023                 CHECK_LINE;
2024                 char *p = strchr(buf + sizeof(s4) - 1, ':');
2025                 unsigned val;
2026                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2027                 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
2028                 threadInfo[num_avail][threadIdIndex] = val;
2029                 continue;
2030             }
2031             unsigned level;
2032             if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
2033                 CHECK_LINE;
2034                 char *p = strchr(buf + sizeof(s4) - 1, ':');
2035                 unsigned val;
2036                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2037                 KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2038                 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
2039                 threadInfo[num_avail][nodeIdIndex + level] = val;
2040                 continue;
2041             }
2042 
2043             //
2044             // We didn't recognize the leading token on the line.
2045             // There are lots of leading tokens that we don't recognize -
2046             // if the line isn't empty, go on to the next line.
2047             //
2048             if ((*buf != 0) && (*buf != '\n')) {
2049                 //
2050                 // If the line is longer than the buffer, read characters
2051                 // until we find a newline.
2052                 //
2053                 if (long_line) {
2054                     int ch;
2055                     while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
2056                 }
2057                 continue;
2058             }
2059 
2060             //
2061             // A newline has signalled the end of the processor record.
2062             // Check that there aren't too many procs specified.
2063             //
2064             if ((int)num_avail == __kmp_xproc) {
2065                 CLEANUP_THREAD_INFO;
2066                 *msg_id = kmp_i18n_str_TooManyEntries;
2067                 return -1;
2068             }
2069 
2070             //
2071             // Check for missing fields.  The osId field must be there, and we
2072             // currently require that the physical id field is specified, also.
2073             //
2074             if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2075                 CLEANUP_THREAD_INFO;
2076                 *msg_id = kmp_i18n_str_MissingProcField;
2077                 return -1;
2078             }
2079             if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2080                 CLEANUP_THREAD_INFO;
2081                 *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2082                 return -1;
2083             }
2084 
2085             //
2086             // Skip this proc if it is not included in the machine model.
2087             //
2088             if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
2089                 INIT_PROC_INFO(threadInfo[num_avail]);
2090                 continue;
2091             }
2092 
2093             //
2094             // We have a successful parse of this proc's info.
2095             // Increment the counter, and prepare for the next proc.
2096             //
2097             num_avail++;
2098             KMP_ASSERT(num_avail <= num_records);
2099             INIT_PROC_INFO(threadInfo[num_avail]);
2100         }
2101         continue;
2102 
2103         no_val:
2104         CLEANUP_THREAD_INFO;
2105         *msg_id = kmp_i18n_str_MissingValCpuinfo;
2106         return -1;
2107 
2108         dup_field:
2109         CLEANUP_THREAD_INFO;
2110         *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2111         return -1;
2112     }
2113     *line = 0;
2114 
2115 # if KMP_MIC && REDUCE_TEAM_SIZE
2116     unsigned teamSize = 0;
2117 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2118 
2119     // check for num_records == __kmp_xproc ???
2120 
2121     //
2122     // If there's only one thread context to bind to, form an Address object
2123     // with depth 1 and return immediately (or, if affinity is off, set
2124     // address2os to NULL and return).
2125     //
2126     // If it is configured to omit the package level when there is only a
2127     // single package, the logic at the end of this routine won't work if
2128     // there is only a single thread - it would try to form an Address
2129     // object with depth 0.
2130     //
2131     KMP_ASSERT(num_avail > 0);
2132     KMP_ASSERT(num_avail <= num_records);
2133     if (num_avail == 1) {
2134         __kmp_ncores = 1;
2135         __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2136         if (__kmp_affinity_verbose) {
2137             if (! KMP_AFFINITY_CAPABLE()) {
2138                 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2139                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2140                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2141             }
2142             else {
2143                 char buf[KMP_AFFIN_MASK_PRINT_LEN];
2144                 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2145                   fullMask);
2146                 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2147                 if (__kmp_affinity_respect_mask) {
2148                     KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2149                 } else {
2150                     KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2151                 }
2152                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2153                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2154             }
2155             int index;
2156             kmp_str_buf_t buf;
2157             __kmp_str_buf_init(&buf);
2158             __kmp_str_buf_print(&buf, "1");
2159             for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2160                 __kmp_str_buf_print(&buf, " x 1");
2161             }
2162             KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2163             __kmp_str_buf_free(&buf);
2164         }
2165 
2166         if (__kmp_affinity_type == affinity_none) {
2167             CLEANUP_THREAD_INFO;
2168             return 0;
2169         }
2170 
2171         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2172         Address addr(1);
2173         addr.labels[0] = threadInfo[0][pkgIdIndex];
2174         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2175 
2176         if (__kmp_affinity_gran_levels < 0) {
2177             __kmp_affinity_gran_levels = 0;
2178         }
2179 
2180         if (__kmp_affinity_verbose) {
2181             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2182         }
2183 
2184         CLEANUP_THREAD_INFO;
2185         return 1;
2186     }
2187 
2188     //
2189     // Sort the threadInfo table by physical Id.
2190     //
2191     qsort(threadInfo, num_avail, sizeof(*threadInfo),
2192       __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2193 
2194     //
2195     // The table is now sorted by pkgId / coreId / threadId, but we really
2196     // don't know the radix of any of the fields.  pkgId's may be sparsely
2197     // assigned among the chips on a system.  Although coreId's are usually
2198     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2199     // [0..threadsPerCore-1], we don't want to make any such assumptions.
2200     //
2201     // For that matter, we don't know what coresPerPkg and threadsPerCore
2202     // (or the total # packages) are at this point - we want to determine
2203     // that now.  We only have an upper bound on the first two figures.
2204     //
2205     unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2206       * sizeof(unsigned));
2207     unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2208       * sizeof(unsigned));
2209     unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2210       * sizeof(unsigned));
2211     unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2212       * sizeof(unsigned));
2213 
2214     bool assign_thread_ids = false;
2215     unsigned threadIdCt;
2216     unsigned index;
2217 
2218     restart_radix_check:
2219     threadIdCt = 0;
2220 
2221     //
2222     // Initialize the counter arrays with data from threadInfo[0].
2223     //
2224     if (assign_thread_ids) {
2225         if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2226             threadInfo[0][threadIdIndex] = threadIdCt++;
2227         }
2228         else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2229             threadIdCt = threadInfo[0][threadIdIndex] + 1;
2230         }
2231     }
2232     for (index = 0; index <= maxIndex; index++) {
2233         counts[index] = 1;
2234         maxCt[index] = 1;
2235         totals[index] = 1;
2236         lastId[index] = threadInfo[0][index];;
2237     }
2238 
2239     //
2240     // Run through the rest of the OS procs.
2241     //
2242     for (i = 1; i < num_avail; i++) {
2243         //
2244         // Find the most significant index whose id differs
2245         // from the id for the previous OS proc.
2246         //
2247         for (index = maxIndex; index >= threadIdIndex; index--) {
2248             if (assign_thread_ids && (index == threadIdIndex)) {
2249                 //
2250                 // Auto-assign the thread id field if it wasn't specified.
2251                 //
2252                 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2253                     threadInfo[i][threadIdIndex] = threadIdCt++;
2254                 }
2255 
2256                 //
                // Apparently the thread id field was specified for some
2258                 // entries and not others.  Start the thread id counter
2259                 // off at the next higher thread id.
2260                 //
2261                 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2262                     threadIdCt = threadInfo[i][threadIdIndex] + 1;
2263                 }
2264             }
2265             if (threadInfo[i][index] != lastId[index]) {
2266                 //
2267                 // Run through all indices which are less significant,
2268                 // and reset the counts to 1.
2269                 //
2270                 // At all levels up to and including index, we need to
2271                 // increment the totals and record the last id.
2272                 //
2273                 unsigned index2;
2274                 for (index2 = threadIdIndex; index2 < index; index2++) {
2275                     totals[index2]++;
2276                     if (counts[index2] > maxCt[index2]) {
2277                         maxCt[index2] = counts[index2];
2278                     }
2279                     counts[index2] = 1;
2280                     lastId[index2] = threadInfo[i][index2];
2281                 }
2282                 counts[index]++;
2283                 totals[index]++;
2284                 lastId[index] = threadInfo[i][index];
2285 
2286                 if (assign_thread_ids && (index > threadIdIndex)) {
2287 
2288 # if KMP_MIC && REDUCE_TEAM_SIZE
2289                     //
2290                     // The default team size is the total #threads in the machine
2291                     // minus 1 thread for every core that has 3 or more threads.
2292                     //
2293                     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2294 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2295 
2296                     //
2297                     // Restart the thread counter, as we are on a new core.
2298                     //
2299                     threadIdCt = 0;
2300 
2301                     //
2302                     // Auto-assign the thread id field if it wasn't specified.
2303                     //
2304                     if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2305                         threadInfo[i][threadIdIndex] = threadIdCt++;
2306                     }
2307 
2308                     //
                    // Apparently the thread id field was specified for some
2310                     // entries and not others.  Start the thread id counter
2311                     // off at the next higher thread id.
2312                     //
2313                     else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2314                         threadIdCt = threadInfo[i][threadIdIndex] + 1;
2315                     }
2316                 }
2317                 break;
2318             }
2319         }
2320         if (index < threadIdIndex) {
2321             //
2322             // If thread ids were specified, it is an error if they are not
            // unique.  Also, check that we haven't already restarted the
2324             // loop (to be safe - shouldn't need to).
2325             //
2326             if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2327               || assign_thread_ids) {
2328                 __kmp_free(lastId);
2329                 __kmp_free(totals);
2330                 __kmp_free(maxCt);
2331                 __kmp_free(counts);
2332                 CLEANUP_THREAD_INFO;
2333                 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2334                 return -1;
2335             }
2336 
2337             //
            // If the thread ids were not specified and we see entries
            // that are duplicates, start the loop over and
2340             // assign the thread ids manually.
2341             //
2342             assign_thread_ids = true;
2343             goto restart_radix_check;
2344         }
2345     }
2346 
2347 # if KMP_MIC && REDUCE_TEAM_SIZE
2348     //
2349     // The default team size is the total #threads in the machine
2350     // minus 1 thread for every core that has 3 or more threads.
2351     //
2352     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2353 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2354 
2355     for (index = threadIdIndex; index <= maxIndex; index++) {
2356         if (counts[index] > maxCt[index]) {
2357             maxCt[index] = counts[index];
2358         }
2359     }
2360 
2361     __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2362     nCoresPerPkg = maxCt[coreIdIndex];
2363     nPackages = totals[pkgIdIndex];
2364 
2365     //
2366     // Check to see if the machine topology is uniform
2367     //
2368     unsigned prod = totals[maxIndex];
2369     for (index = threadIdIndex; index < maxIndex; index++) {
2370        prod *= maxCt[index];
2371     }
2372     bool uniform = (prod == totals[threadIdIndex]);
2373 
2374     //
2375     // When affinity is off, this routine will still be called to set
2376     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
2377     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
2378     // correctly, and return now if affinity is not enabled.
2379     //
2380     __kmp_ncores = totals[coreIdIndex];
2381 
2382     if (__kmp_affinity_verbose) {
2383         if (! KMP_AFFINITY_CAPABLE()) {
2384                 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2385                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2386                 if (uniform) {
2387                     KMP_INFORM(Uniform, "KMP_AFFINITY");
2388                 } else {
2389                     KMP_INFORM(NonUniform, "KMP_AFFINITY");
2390                 }
2391         }
2392         else {
2393             char buf[KMP_AFFIN_MASK_PRINT_LEN];
2394             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
2395                 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2396                 if (__kmp_affinity_respect_mask) {
2397                     KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2398                 } else {
2399                     KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2400                 }
2401                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2402                 if (uniform) {
2403                     KMP_INFORM(Uniform, "KMP_AFFINITY");
2404                 } else {
2405                     KMP_INFORM(NonUniform, "KMP_AFFINITY");
2406                 }
2407         }
2408         kmp_str_buf_t buf;
2409         __kmp_str_buf_init(&buf);
2410 
2411         __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2412         for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2413             __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2414         }
2415         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str,  maxCt[coreIdIndex],
2416           maxCt[threadIdIndex], __kmp_ncores);
2417 
2418         __kmp_str_buf_free(&buf);
2419     }
2420 
2421 # if KMP_MIC && REDUCE_TEAM_SIZE
2422     //
2423     // Set the default team size.
2424     //
2425     if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2426         __kmp_dflt_team_nth = teamSize;
2427         KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2428           __kmp_dflt_team_nth));
2429     }
2430 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2431 
2432     if (__kmp_affinity_type == affinity_none) {
2433         __kmp_free(lastId);
2434         __kmp_free(totals);
2435         __kmp_free(maxCt);
2436         __kmp_free(counts);
2437         CLEANUP_THREAD_INFO;
2438         return 0;
2439     }
2440 
2441     //
2442     // Count the number of levels which have more nodes at that level than
2443     // at the parent's level (with there being an implicit root node of
2444     // the top level).  This is equivalent to saying that there is at least
2445     // one node at this level which has a sibling.  These levels are in the
2446     // map, and the package level is always in the map.
2447     //
2448     bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2449     int level = 0;
2450     for (index = threadIdIndex; index < maxIndex; index++) {
2451         KMP_ASSERT(totals[index] >= totals[index + 1]);
2452         inMap[index] = (totals[index] > totals[index + 1]);
2453     }
2454     inMap[maxIndex] = (totals[maxIndex] > 1);
2455     inMap[pkgIdIndex] = true;
2456 
2457     int depth = 0;
2458     for (index = threadIdIndex; index <= maxIndex; index++) {
2459         if (inMap[index]) {
2460             depth++;
2461         }
2462     }
2463     KMP_ASSERT(depth > 0);
2464 
2465     //
2466     // Construct the data structure that is to be returned.
2467     //
2468     *address2os = (AddrUnsPair*)
2469       __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2470     int pkgLevel = -1;
2471     int coreLevel = -1;
2472     int threadLevel = -1;
2473 
2474     for (i = 0; i < num_avail; ++i) {
2475         Address addr(depth);
2476         unsigned os = threadInfo[i][osIdIndex];
2477         int src_index;
2478         int dst_index = 0;
2479 
2480         for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2481             if (! inMap[src_index]) {
2482                 continue;
2483             }
2484             addr.labels[dst_index] = threadInfo[i][src_index];
2485             if (src_index == pkgIdIndex) {
2486                 pkgLevel = dst_index;
2487             }
2488             else if (src_index == coreIdIndex) {
2489                 coreLevel = dst_index;
2490             }
2491             else if (src_index == threadIdIndex) {
2492                 threadLevel = dst_index;
2493             }
2494             dst_index++;
2495         }
2496         (*address2os)[i] = AddrUnsPair(addr, os);
2497     }
2498 
2499     if (__kmp_affinity_gran_levels < 0) {
2500         //
2501         // Set the granularity level based on what levels are modeled
2502         // in the machine topology map.
2503         //
2504         unsigned src_index;
2505         __kmp_affinity_gran_levels = 0;
2506         for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2507             if (! inMap[src_index]) {
2508                 continue;
2509             }
2510             switch (src_index) {
2511                 case threadIdIndex:
2512                 if (__kmp_affinity_gran > affinity_gran_thread) {
2513                     __kmp_affinity_gran_levels++;
2514                 }
2515 
2516                 break;
2517                 case coreIdIndex:
2518                 if (__kmp_affinity_gran > affinity_gran_core) {
2519                     __kmp_affinity_gran_levels++;
2520                 }
2521                 break;
2522 
2523                 case pkgIdIndex:
2524                 if (__kmp_affinity_gran > affinity_gran_package) {
2525                     __kmp_affinity_gran_levels++;
2526                 }
2527                 break;
2528             }
2529         }
2530     }
2531 
2532     if (__kmp_affinity_verbose) {
2533         __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2534           coreLevel, threadLevel);
2535     }
2536 
2537     __kmp_free(inMap);
2538     __kmp_free(lastId);
2539     __kmp_free(totals);
2540     __kmp_free(maxCt);
2541     __kmp_free(counts);
2542     CLEANUP_THREAD_INFO;
2543     return depth;
2544 }
2545 
2546 
2547 //
2548 // Create and return a table of affinity masks, indexed by OS thread ID.
2549 // This routine handles OR'ing together all the affinity masks of threads
2550 // that are sufficiently close, if granularity > fine.
2551 //
static kmp_affin_mask_t *
__kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
  AddrUnsPair *address2os, unsigned numAddrs)
{
    //
    // On return:
    //   *maxIndex  - largest OS proc id seen; the returned table is
    //                indexed 0..*maxIndex.
    //   *numUnique - number of distinct masks (thread groups) formed.
    // Side effect: address2os is sorted in place into physical order and
    // its "leader" flags are set.
    //
    // First form a table of affinity masks in order of OS thread id.
    //
    unsigned depth;
    unsigned maxOsId;
    unsigned i;

    KMP_ASSERT(numAddrs > 0);
    depth = address2os[0].first.depth;

    //
    // OS proc ids need not be dense, so size the table by the largest id
    // rather than by numAddrs.
    //
    maxOsId = 0;
    for (i = 0; i < numAddrs; i++) {
        unsigned osId = address2os[i].second;
        if (osId > maxOsId) {
            maxOsId = osId;
        }
    }
    kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
      (maxOsId + 1) * __kmp_affin_mask_size);

    //
    // Sort the address2os table according to physical order.  Doing so
    // will put all threads on the same core/package/node in consecutive
    // locations.
    //
    qsort(address2os, numAddrs, sizeof(*address2os),
      __kmp_affinity_cmp_Address_labels);

    KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
    if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
        KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY",  __kmp_affinity_gran_levels);
    }
    //
    // If granularity is at least as coarse as the whole topology map, each
    // mask covers the entire machine, so warn that threads may migrate.
    //
    if (__kmp_affinity_gran_levels >= (int)depth) {
        if (__kmp_affinity_verbose || (__kmp_affinity_warnings
          && (__kmp_affinity_type != affinity_none))) {
            KMP_WARNING(AffThreadsMayMigrate);
        }
    }

    //
    // Run through the table, forming the masks for all threads on each
    // core.  Threads on the same core will have identical "Address"
    // objects, not considering the last level, which must be the thread
    // id.  All threads on a core will appear consecutively.
    //
    unsigned unique = 0;
    unsigned j = 0;                             // index of 1st thread on core
    unsigned leader = 0;
    Address *leaderAddr = &(address2os[0].first);
    // "sum" accumulates the union of OS procs in the current group; it is
    // stack-allocated (KMP_ALLOCA) since it is only needed locally.
    kmp_affin_mask_t *sum
      = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
    KMP_CPU_ZERO(sum);
    KMP_CPU_SET(address2os[0].second, sum);
    for (i = 1; i < numAddrs; i++) {
        //
        // If this thread is sufficiently close to the leader (within the
        // granularity setting), then set the bit for this os thread in the
        // affinity mask for this group, and go on to the next thread.
        //
        if (leaderAddr->isClose(address2os[i].first,
          __kmp_affinity_gran_levels)) {
            KMP_CPU_SET(address2os[i].second, sum);
            continue;
        }

        //
        // For every thread in this group, copy the mask to the thread's
        // entry in the osId2Mask table.  Mark the first address as a
        // leader.
        //
        for (; j < i; j++) {
            unsigned osId = address2os[j].second;
            KMP_DEBUG_ASSERT(osId <= maxOsId);
            kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
            KMP_CPU_COPY(mask, sum);
            address2os[j].first.leader = (j == leader);
        }
        unique++;

        //
        // Start a new mask.
        //
        leader = i;
        leaderAddr = &(address2os[i].first);
        KMP_CPU_ZERO(sum);
        KMP_CPU_SET(address2os[i].second, sum);
    }

    //
    // For every thread in last group, copy the mask to the thread's
    // entry in the osId2Mask table.
    //
    for (; j < i; j++) {
        unsigned osId = address2os[j].second;
        KMP_DEBUG_ASSERT(osId <= maxOsId);
        kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
        KMP_CPU_COPY(mask, sum);
        address2os[j].first.leader = (j == leader);
    }
    unique++;

    *maxIndex = maxOsId;
    *numUnique = unique;
    return osId2Mask;
}
2661 
2662 
2663 //
2664 // Stuff for the affinity proclist parsers.  It's easier to declare these vars
2665 // as file-static than to try and pass them through the calling sequence of
2666 // the recursive-descent OMP_PLACES parser.
2667 //
2668 static kmp_affin_mask_t *newMasks;
2669 static int numNewMasks;
2670 static int nextNewMask;
2671 
//
// Append a copy of _mask to the file-static newMasks table, doubling the
// table's capacity first if it is full.  Wrapped in do { ... } while (0)
// so the macro expands to a single statement and composes safely with
// if/else and other control flow at the call site.
//
#define ADD_MASK(_mask) \
    do {                                                                \
        if (nextNewMask >= numNewMasks) {                               \
            numNewMasks *= 2;                                           \
            newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
              numNewMasks * __kmp_affin_mask_size);                     \
        }                                                               \
        KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));    \
        nextNewMask++;                                                  \
    } while (0)
2682 
//
// Validate OS proc id _osId against the _osId2Mask table (ids greater than
// _maxOsId, or not present in the table, are warned about and ignored),
// then append its mask via ADD_MASK.  All arguments are parenthesized and
// the body is a do/while(0) single statement, so the macro is safe in any
// expression context and with multiply-used arguments at the call site.
//
#define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \
    do {                                                                \
        if (((_osId) > (_maxOsId)) ||                                   \
          (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
            if (__kmp_affinity_verbose || (__kmp_affinity_warnings      \
              && (__kmp_affinity_type != affinity_none))) {             \
                KMP_WARNING(AffIgnoreInvalidProcID, _osId);             \
            }                                                           \
        }                                                               \
        else {                                                          \
            ADD_MASK(KMP_CPU_INDEX((_osId2Mask), (_osId)));             \
        }                                                               \
    } while (0)
2696 
2697 
2698 //
2699 // Re-parse the proclist (for the explicit affinity type), and form the list
2700 // of affinity newMasks indexed by gtid.
2701 //
2702 static void
2703 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2704   unsigned int *out_numMasks, const char *proclist,
2705   kmp_affin_mask_t *osId2Mask, int maxOsId)
2706 {
2707     const char *scan = proclist;
2708     const char *next = proclist;
2709 
2710     //
2711     // We use malloc() for the temporary mask vector,
2712     // so that we can use realloc() to extend it.
2713     //
2714     numNewMasks = 2;
2715     newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2716       * __kmp_affin_mask_size);
2717     nextNewMask = 0;
2718     kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
2719       __kmp_affin_mask_size);
2720     int setSize = 0;
2721 
2722     for (;;) {
2723         int start, end, stride;
2724 
2725         SKIP_WS(scan);
2726         next = scan;
2727         if (*next == '\0') {
2728             break;
2729         }
2730 
2731         if (*next == '{') {
2732             int num;
2733             setSize = 0;
2734             next++;     // skip '{'
2735             SKIP_WS(next);
2736             scan = next;
2737 
2738             //
2739             // Read the first integer in the set.
2740             //
2741             KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2742               "bad proclist");
2743             SKIP_DIGITS(next);
2744             num = __kmp_str_to_int(scan, *next);
2745             KMP_ASSERT2(num >= 0, "bad explicit proc list");
2746 
2747             //
2748             // Copy the mask for that osId to the sum (union) mask.
2749             //
2750             if ((num > maxOsId) ||
2751               (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2752                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2753                   && (__kmp_affinity_type != affinity_none))) {
2754                     KMP_WARNING(AffIgnoreInvalidProcID, num);
2755                 }
2756                 KMP_CPU_ZERO(sumMask);
2757             }
2758             else {
2759                 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2760                 setSize = 1;
2761             }
2762 
2763             for (;;) {
2764                 //
2765                 // Check for end of set.
2766                 //
2767                 SKIP_WS(next);
2768                 if (*next == '}') {
2769                     next++;     // skip '}'
2770                     break;
2771                 }
2772 
2773                 //
2774                 // Skip optional comma.
2775                 //
2776                 if (*next == ',') {
2777                     next++;
2778                 }
2779                 SKIP_WS(next);
2780 
2781                 //
2782                 // Read the next integer in the set.
2783                 //
2784                 scan = next;
2785                 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2786                   "bad explicit proc list");
2787 
2788                 SKIP_DIGITS(next);
2789                 num = __kmp_str_to_int(scan, *next);
2790                 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2791 
2792                 //
2793                 // Add the mask for that osId to the sum mask.
2794                 //
2795                 if ((num > maxOsId) ||
2796                   (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2797                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2798                       && (__kmp_affinity_type != affinity_none))) {
2799                         KMP_WARNING(AffIgnoreInvalidProcID, num);
2800                     }
2801                 }
2802                 else {
2803                     KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2804                     setSize++;
2805                 }
2806             }
2807             if (setSize > 0) {
2808                 ADD_MASK(sumMask);
2809             }
2810 
2811             SKIP_WS(next);
2812             if (*next == ',') {
2813                 next++;
2814             }
2815             scan = next;
2816             continue;
2817         }
2818 
2819         //
2820         // Read the first integer.
2821         //
2822         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2823         SKIP_DIGITS(next);
2824         start = __kmp_str_to_int(scan, *next);
2825         KMP_ASSERT2(start >= 0, "bad explicit proc list");
2826         SKIP_WS(next);
2827 
2828         //
2829         // If this isn't a range, then add a mask to the list and go on.
2830         //
2831         if (*next != '-') {
2832             ADD_MASK_OSID(start, osId2Mask, maxOsId);
2833 
2834             //
2835             // Skip optional comma.
2836             //
2837             if (*next == ',') {
2838                 next++;
2839             }
2840             scan = next;
2841             continue;
2842         }
2843 
2844         //
2845         // This is a range.  Skip over the '-' and read in the 2nd int.
2846         //
2847         next++;         // skip '-'
2848         SKIP_WS(next);
2849         scan = next;
2850         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2851         SKIP_DIGITS(next);
2852         end = __kmp_str_to_int(scan, *next);
2853         KMP_ASSERT2(end >= 0, "bad explicit proc list");
2854 
2855         //
2856         // Check for a stride parameter
2857         //
2858         stride = 1;
2859         SKIP_WS(next);
2860         if (*next == ':') {
2861             //
2862             // A stride is specified.  Skip over the ':" and read the 3rd int.
2863             //
2864             int sign = +1;
2865             next++;         // skip ':'
2866             SKIP_WS(next);
2867             scan = next;
2868             if (*next == '-') {
2869                 sign = -1;
2870                 next++;
2871                 SKIP_WS(next);
2872                 scan = next;
2873             }
2874             KMP_ASSERT2((*next >=  '0') && (*next <= '9'),
2875               "bad explicit proc list");
2876             SKIP_DIGITS(next);
2877             stride = __kmp_str_to_int(scan, *next);
2878             KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2879             stride *= sign;
2880         }
2881 
2882         //
2883         // Do some range checks.
2884         //
2885         KMP_ASSERT2(stride != 0, "bad explicit proc list");
2886         if (stride > 0) {
2887             KMP_ASSERT2(start <= end, "bad explicit proc list");
2888         }
2889         else {
2890             KMP_ASSERT2(start >= end, "bad explicit proc list");
2891         }
2892         KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2893 
2894         //
2895         // Add the mask for each OS proc # to the list.
2896         //
2897         if (stride > 0) {
2898             do {
2899                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2900                 start += stride;
2901             } while (start <= end);
2902         }
2903         else {
2904             do {
2905                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2906                 start += stride;
2907             } while (start >= end);
2908         }
2909 
2910         //
2911         // Skip optional comma.
2912         //
2913         SKIP_WS(next);
2914         if (*next == ',') {
2915             next++;
2916         }
2917         scan = next;
2918     }
2919 
2920     *out_numMasks = nextNewMask;
2921     if (nextNewMask == 0) {
2922         *out_masks = NULL;
2923         KMP_INTERNAL_FREE(newMasks);
2924         return;
2925     }
2926     *out_masks
2927       = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
2928     KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
2929     __kmp_free(sumMask);
2930     KMP_INTERNAL_FREE(newMasks);
2931 }
2932 
2933 
2934 # if OMP_40_ENABLED
2935 
2936 /*-----------------------------------------------------------------------------
2937 
2938 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
2939 places.  Again, Here is the grammar:
2940 
2941 place_list := place
2942 place_list := place , place_list
2943 place := num
2944 place := place : num
2945 place := place : num : signed
2946 place := { subplacelist }
2947 place := ! place                  // (lowest priority)
2948 subplace_list := subplace
2949 subplace_list := subplace , subplace_list
2950 subplace := num
2951 subplace := num : num
2952 subplace := num : num : signed
2953 signed := num
2954 signed := + signed
2955 signed := - signed
2956 
2957 -----------------------------------------------------------------------------*/
2958 
static void
__kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
{
    //
    // Parse a comma-separated subplace list (the interior of "{...}" in an
    // OMP_PLACES string), OR'ing the mask of each named OS proc into
    // tempMask and counting valid procs in *setSize.  Each subplace is
    // "num", "num : count", or "num : count : stride".  *scan is advanced
    // past the consumed text; on return it points at the closing '}'.
    // Malformed input aborts via KMP_ASSERT2.
    //
    const char *next;

    for (;;) {
        int start, count, stride, i;

        //
        // Read in the starting proc id
        //
        SKIP_WS(*scan);
        KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
          "bad explicit places list");
        next = *scan;
        SKIP_DIGITS(next);
        start = __kmp_str_to_int(*scan, *next);
        KMP_ASSERT(start >= 0);
        *scan = next;

        //
        // valid follow sets are ',' ':' and '}'
        //
        SKIP_WS(*scan);
        if (**scan == '}' || **scan == ',') {
            // Bare "num" subplace: add the single proc (if valid).
            if ((start > maxOsId) ||
              (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
                if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                  && (__kmp_affinity_type != affinity_none))) {
                    KMP_WARNING(AffIgnoreInvalidProcID, start);
                }
            }
            else {
                KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
                (*setSize)++;
            }
            if (**scan == '}') {
                break;
            }
            (*scan)++;  // skip ','
            continue;
        }
        KMP_ASSERT2(**scan == ':', "bad explicit places list");
        (*scan)++;      // skip ':'

        //
        // Read count parameter
        //
        SKIP_WS(*scan);
        KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
          "bad explicit places list");
        next = *scan;
        SKIP_DIGITS(next);
        count = __kmp_str_to_int(*scan, *next);
        KMP_ASSERT(count >= 0);
        *scan = next;

        //
        // valid follow sets are ',' ':' and '}'
        //
        SKIP_WS(*scan);
        if (**scan == '}' || **scan == ',') {
            // "num : count" subplace: add count consecutive procs.
            for (i = 0; i < count; i++) {
                if ((start > maxOsId) ||
                  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
                    if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                      && (__kmp_affinity_type != affinity_none))) {
                        KMP_WARNING(AffIgnoreInvalidProcID, start);
                    }
                    break;  // don't proliferate warnings for large count
                }
                else {
                    KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
                    start++;
                    (*setSize)++;
                }
            }
            if (**scan == '}') {
                break;
            }
            (*scan)++;  // skip ','
            continue;
        }
        KMP_ASSERT2(**scan == ':', "bad explicit places list");
        (*scan)++;      // skip ':'

        //
        // Read stride parameter.  Any run of '+'/'-' signs is accepted
        // before the number; each '-' flips the overall sign.
        //
        int sign = +1;
        for (;;) {
            SKIP_WS(*scan);
            if (**scan == '+') {
                (*scan)++; // skip '+'
                continue;
            }
            if (**scan == '-') {
                sign *= -1;
                (*scan)++; // skip '-'
                continue;
            }
            break;
        }
        SKIP_WS(*scan);
        KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
          "bad explicit places list");
        next = *scan;
        SKIP_DIGITS(next);
        stride = __kmp_str_to_int(*scan, *next);
        KMP_ASSERT(stride >= 0);
        *scan = next;
        stride *= sign;

        //
        // valid follow sets are ',' and '}'
        //
        SKIP_WS(*scan);
        if (**scan == '}' || **scan == ',') {
            // "num : count : stride" subplace: add count procs, stride apart.
            for (i = 0; i < count; i++) {
                if ((start > maxOsId) ||
                  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
                    if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                      && (__kmp_affinity_type != affinity_none))) {
                        KMP_WARNING(AffIgnoreInvalidProcID, start);
                    }
                    break;  // don't proliferate warnings for large count
                }
                else {
                    KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
                    start += stride;
                    (*setSize)++;
                }
            }
            if (**scan == '}') {
                break;
            }
            (*scan)++;  // skip ','
            continue;
        }

        KMP_ASSERT2(0, "bad explicit places list");
    }
}
3103 
3104 
3105 static void
3106 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3107   int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3108 {
3109     const char *next;
3110 
3111     //
3112     // valid follow sets are '{' '!' and num
3113     //
3114     SKIP_WS(*scan);
3115     if (**scan == '{') {
3116         (*scan)++;      // skip '{'
3117         __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
3118           setSize);
3119         KMP_ASSERT2(**scan == '}', "bad explicit places list");
3120         (*scan)++;      // skip '}'
3121     }
3122     else if (**scan == '!') {
3123         __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3124         KMP_CPU_COMPLEMENT(tempMask);
3125         (*scan)++;      // skip '!'
3126     }
3127     else if ((**scan >= '0') && (**scan <= '9')) {
3128         next = *scan;
3129         SKIP_DIGITS(next);
3130         int num = __kmp_str_to_int(*scan, *next);
3131         KMP_ASSERT(num >= 0);
3132         if ((num > maxOsId) ||
3133           (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3134             if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3135               && (__kmp_affinity_type != affinity_none))) {
3136                 KMP_WARNING(AffIgnoreInvalidProcID, num);
3137             }
3138         }
3139         else {
3140             KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3141             (*setSize)++;
3142         }
3143         *scan = next;  // skip num
3144     }
3145     else {
3146         KMP_ASSERT2(0, "bad explicit places list");
3147     }
3148 }
3149 
3150 
3151 //static void
3152 void
3153 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3154   unsigned int *out_numMasks, const char *placelist,
3155   kmp_affin_mask_t *osId2Mask, int maxOsId)
3156 {
3157     const char *scan = placelist;
3158     const char *next = placelist;
3159 
3160     numNewMasks = 2;
3161     newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
3162       * __kmp_affin_mask_size);
3163     nextNewMask = 0;
3164 
3165     kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
3166       __kmp_affin_mask_size);
3167     KMP_CPU_ZERO(tempMask);
3168     int setSize = 0;
3169 
3170     for (;;) {
3171         __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3172 
3173         //
3174         // valid follow sets are ',' ':' and EOL
3175         //
3176         SKIP_WS(scan);
3177         if (*scan == '\0' || *scan == ',') {
3178             if (setSize > 0) {
3179                 ADD_MASK(tempMask);
3180             }
3181             KMP_CPU_ZERO(tempMask);
3182             setSize = 0;
3183             if (*scan == '\0') {
3184                 break;
3185             }
3186             scan++;     // skip ','
3187             continue;
3188         }
3189 
3190         KMP_ASSERT2(*scan == ':', "bad explicit places list");
3191         scan++;         // skip ':'
3192 
3193         //
3194         // Read count parameter
3195         //
3196         SKIP_WS(scan);
3197         KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3198           "bad explicit places list");
3199         next = scan;
3200         SKIP_DIGITS(next);
3201         int count = __kmp_str_to_int(scan, *next);
3202         KMP_ASSERT(count >= 0);
3203         scan = next;
3204 
3205         //
3206         // valid follow sets are ',' ':' and EOL
3207         //
3208         SKIP_WS(scan);
3209         int stride;
3210         if (*scan == '\0' || *scan == ',') {
3211             stride = +1;
3212         }
3213         else {
3214             KMP_ASSERT2(*scan == ':', "bad explicit places list");
3215             scan++;         // skip ':'
3216 
3217             //
3218             // Read stride parameter
3219             //
3220             int sign = +1;
3221             for (;;) {
3222                 SKIP_WS(scan);
3223                 if (*scan == '+') {
3224                     scan++; // skip '+'
3225                     continue;
3226                 }
3227                 if (*scan == '-') {
3228                     sign *= -1;
3229                     scan++; // skip '-'
3230                     continue;
3231                 }
3232                 break;
3233             }
3234             SKIP_WS(scan);
3235             KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3236               "bad explicit places list");
3237             next = scan;
3238             SKIP_DIGITS(next);
3239             stride = __kmp_str_to_int(scan, *next);
3240             KMP_DEBUG_ASSERT(stride >= 0);
3241             scan = next;
3242             stride *= sign;
3243         }
3244 
3245         if (stride > 0) {
3246             int i;
3247             for (i = 0; i < count; i++) {
3248                 int j;
3249                 if (setSize == 0) {
3250                     break;
3251                 }
3252                 ADD_MASK(tempMask);
3253                 setSize = 0;
3254                 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
3255                     if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3256                         KMP_CPU_CLR(j, tempMask);
3257                     }
3258                     else if ((j > maxOsId) ||
3259                       (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3260                         if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3261                           && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
3262                             KMP_WARNING(AffIgnoreInvalidProcID, j);
3263                         }
3264                         KMP_CPU_CLR(j, tempMask);
3265                     }
3266                     else {
3267                         KMP_CPU_SET(j, tempMask);
3268                         setSize++;
3269                     }
3270                 }
3271                 for (; j >= 0; j--) {
3272                     KMP_CPU_CLR(j, tempMask);
3273                 }
3274             }
3275         }
3276         else {
3277             int i;
3278             for (i = 0; i < count; i++) {
3279                 int j;
3280                 if (setSize == 0) {
3281                     break;
3282                 }
3283                 ADD_MASK(tempMask);
3284                 setSize = 0;
3285                 for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride;
3286                   j++) {
3287                     if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3288                         KMP_CPU_CLR(j, tempMask);
3289                     }
3290                     else if ((j > maxOsId) ||
3291                       (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3292                         if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3293                           && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
3294                             KMP_WARNING(AffIgnoreInvalidProcID, j);
3295                         }
3296                         KMP_CPU_CLR(j, tempMask);
3297                     }
3298                     else {
3299                         KMP_CPU_SET(j, tempMask);
3300                         setSize++;
3301                     }
3302                 }
3303                 for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
3304                     KMP_CPU_CLR(j, tempMask);
3305                 }
3306             }
3307         }
3308         KMP_CPU_ZERO(tempMask);
3309         setSize = 0;
3310 
3311         //
3312         // valid follow sets are ',' and EOL
3313         //
3314         SKIP_WS(scan);
3315         if (*scan == '\0') {
3316             break;
3317         }
3318         if (*scan == ',') {
3319             scan++;     // skip ','
3320             continue;
3321         }
3322 
3323         KMP_ASSERT2(0, "bad explicit places list");
3324     }
3325 
3326     *out_numMasks = nextNewMask;
3327     if (nextNewMask == 0) {
3328         *out_masks = NULL;
3329         KMP_INTERNAL_FREE(newMasks);
3330         return;
3331     }
3332     *out_masks
3333       = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
3334     KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
3335     __kmp_free(tempMask);
3336     KMP_INTERNAL_FREE(newMasks);
3337 }
3338 
3339 # endif /* OMP_40_ENABLED */
3340 
3341 #undef ADD_MASK
3342 #undef ADD_MASK_OSID
3343 
3344 static void
3345 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3346 {
3347     if ( __kmp_place_num_cores == 0 ) {
3348         if ( __kmp_place_num_threads_per_core == 0 ) {
3349             return;   // no cores limiting actions requested, exit
3350         }
3351         __kmp_place_num_cores = nCoresPerPkg;   // use all available cores
3352     }
3353     if ( !__kmp_affinity_uniform_topology() ) {
3354         KMP_WARNING( AffThrPlaceNonUniform );
3355         return; // don't support non-uniform topology
3356     }
3357     if ( depth != 3 ) {
3358         KMP_WARNING( AffThrPlaceNonThreeLevel );
3359         return; // don't support not-3-level topology
3360     }
3361     if ( __kmp_place_num_threads_per_core == 0 ) {
3362         __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore;  // use all HW contexts
3363     }
3364     if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
3365         KMP_WARNING( AffThrPlaceManyCores );
3366         return;
3367     }
3368 
3369     AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3370                             nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3371     int i, j, k, n_old = 0, n_new = 0;
3372     for ( i = 0; i < nPackages; ++i ) {
3373         for ( j = 0; j < nCoresPerPkg; ++j ) {
3374             if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) {
3375                 n_old += __kmp_nThreadsPerCore;   // skip not-requested core
3376             } else {
3377                 for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) {
3378                     if ( k < __kmp_place_num_threads_per_core ) {
3379                         newAddr[n_new] = (*pAddr)[n_old];   // copy requested core' data to new location
3380                         n_new++;
3381                     }
3382                     n_old++;
3383                 }
3384             }
3385         }
3386     }
3387     nCoresPerPkg = __kmp_place_num_cores;                     // correct nCoresPerPkg
3388     __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3389     __kmp_avail_proc = n_new;                                 // correct avail_proc
3390     __kmp_ncores = nPackages * __kmp_place_num_cores;         // correct ncores
3391 
3392     __kmp_free( *pAddr );
3393     *pAddr = newAddr;      // replace old topology with new one
3394 }
3395 
3396 
// Machine topology table built by the discovery code below: one
// (labels, OS proc id) pair per available processor, plus state used
// by the "balanced" affinity type on non-uniform topologies.
static AddrUnsPair *address2os = NULL;
static int           * procarr = NULL;  // (core, thread) -> OS proc map; -1 = hole (balanced affinity)
static int     __kmp_aff_depth = 0;     // topology depth saved for balanced affinity
3400 
//
// Worker for __kmp_affinity_initialize().  Builds the "full" mask, discovers
// the machine topology (address2os) via the selected method - falling back
// through successively simpler methods in the default path, ending with the
// flat OS-proc map - then constructs the table of affinity masks
// (__kmp_affinity_masks) according to __kmp_affinity_type.  Idempotent:
// returns immediately if the masks were already created.
//
static void
__kmp_aux_affinity_initialize(void)
{
    if (__kmp_affinity_masks != NULL) {
        KMP_ASSERT(fullMask != NULL);
        return;
    }

    //
    // Create the "full" mask - this defines all of the processors that we
    // consider to be in the machine model.  If respect is set, then it is
    // the initialization thread's affinity mask.  Otherwise, it is all
    // processors that we know about on the machine.
    //
    if (fullMask == NULL) {
        fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
    }
    if (KMP_AFFINITY_CAPABLE()) {
        if (__kmp_affinity_respect_mask) {
            __kmp_get_system_affinity(fullMask, TRUE);

            //
            // Count the number of available processors.
            //
            unsigned i;
            __kmp_avail_proc = 0;
            for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
                if (! KMP_CPU_ISSET(i, fullMask)) {
                    continue;
                }
                __kmp_avail_proc++;
            }
            // More procs in the mask than the machine reports => the mask is
            // inconsistent; disable affinity rather than act on bad data.
            if (__kmp_avail_proc > __kmp_xproc) {
                if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                  && (__kmp_affinity_type != affinity_none))) {
                    KMP_WARNING(ErrorInitializeAffinity);
                }
                __kmp_affinity_type = affinity_none;
                KMP_AFFINITY_DISABLE();
                return;
            }
        }
        else {
            __kmp_affinity_entire_machine_mask(fullMask);
            __kmp_avail_proc = __kmp_xproc;
        }
    }

    int depth = -1;
    kmp_i18n_id_t msg_id = kmp_i18n_null;

    //
    // For backward compatibility, setting KMP_CPUINFO_FILE =>
    // KMP_TOPOLOGY_METHOD=cpuinfo
    //
    if ((__kmp_cpuinfo_file != NULL) &&
      (__kmp_affinity_top_method == affinity_top_method_all)) {
        __kmp_affinity_top_method = affinity_top_method_cpuinfo;
    }

    if (__kmp_affinity_top_method == affinity_top_method_all) {
        //
        // In the default code path, errors are not fatal - we just try using
        // another method.  We only emit a warning message if affinity is on,
        // or the verbose flag is set, and the nowarnings flag was not set.
        //
        const char *file_name = NULL;
        int line = 0;

# if KMP_ARCH_X86 || KMP_ARCH_X86_64

        // First choice on x86: the x2APIC (CPUID leaf 11) method.
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
        }

        file_name = NULL;
        depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
        if (depth == 0) {
            // depth == 0 means "affinity deliberately off" - not an error.
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }

        if (depth < 0) {
            // x2APIC failed - fall back to the legacy APIC id method.
            if (__kmp_affinity_verbose) {
                if (msg_id != kmp_i18n_null) {
                    KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
                      KMP_I18N_STR(DecodingLegacyAPIC));
                }
                else {
                    KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
                }
            }

            file_name = NULL;
            depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
            if (depth == 0) {
                KMP_ASSERT(__kmp_affinity_type == affinity_none);
                KMP_ASSERT(address2os == NULL);
                return;
            }
        }

# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

# if KMP_OS_LINUX

        // Next fallback on Linux: parse /proc/cpuinfo.
        if (depth < 0) {
            if (__kmp_affinity_verbose) {
                if (msg_id != kmp_i18n_null) {
                    KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
                }
                else {
                    KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
                }
            }

            FILE *f = fopen("/proc/cpuinfo", "r");
            if (f == NULL) {
                msg_id = kmp_i18n_str_CantOpenCpuinfo;
            }
            else {
                file_name = "/proc/cpuinfo";
                depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
                fclose(f);
                if (depth == 0) {
                    KMP_ASSERT(__kmp_affinity_type == affinity_none);
                    KMP_ASSERT(address2os == NULL);
                    return;
                }
            }
        }

# endif /* KMP_OS_LINUX */

# if KMP_GROUP_AFFINITY

        // On Windows with multiple processor groups, map by group.
        if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
            if (__kmp_affinity_verbose) {
                KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
            }

            depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
            KMP_ASSERT(depth != 0);
        }

# endif /* KMP_GROUP_AFFINITY */

        // Last resort: flat map (every OS proc is its own "package").
        if (depth < 0) {
            if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
                if (file_name == NULL) {
                    KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
                }
                else if (line == 0) {
                    KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
                }
                else {
                    KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
                }
            }
            // FIXME - print msg if msg_id = kmp_i18n_null ???

            file_name = "";
            depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
            if (depth == 0) {
                KMP_ASSERT(__kmp_affinity_type == affinity_none);
                KMP_ASSERT(address2os == NULL);
                return;
            }
            KMP_ASSERT(depth > 0);
            KMP_ASSERT(address2os != NULL);
        }
    }

    //
    // If the user has specified that a particular topology discovery method
    // is to be used, then we abort if that method fails.  The exception is
    // group affinity, which might have been implicitly set.
    //

# if KMP_ARCH_X86 || KMP_ARCH_X86_64

    else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
              KMP_I18N_STR(Decodingx2APIC));
        }

        depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
        }
    }
    else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
              KMP_I18N_STR(DecodingLegacyAPIC));
        }

        depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
        }
    }

# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

    else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
        const char *filename;
        if (__kmp_cpuinfo_file != NULL) {
            filename = __kmp_cpuinfo_file;
        }
        else {
            filename = "/proc/cpuinfo";
        }

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
        }

        FILE *f = fopen(filename, "r");
        if (f == NULL) {
            // Open failure is fatal here (user explicitly chose this method).
            int code = errno;
            if (__kmp_cpuinfo_file != NULL) {
                __kmp_msg(
                    kmp_ms_fatal,
                    KMP_MSG(CantOpenFileForReading, filename),
                    KMP_ERR(code),
                    KMP_HNT(NameComesFrom_CPUINFO_FILE),
                    __kmp_msg_null
                );
            }
            else {
                __kmp_msg(
                    kmp_ms_fatal,
                    KMP_MSG(CantOpenFileForReading, filename),
                    KMP_ERR(code),
                    __kmp_msg_null
                );
            }
        }
        int line = 0;
        depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
        fclose(f);
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            if (line > 0) {
                KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
            }
            else {
                KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
            }
        }
        if (__kmp_affinity_type == affinity_none) {
            KMP_ASSERT(depth == 0);
            KMP_ASSERT(address2os == NULL);
            return;
        }
    }

# if KMP_GROUP_AFFINITY

    else if (__kmp_affinity_top_method == affinity_top_method_group) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
        }

        depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
        KMP_ASSERT(depth != 0);
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
        }
    }

# endif /* KMP_GROUP_AFFINITY */

    else if (__kmp_affinity_top_method == affinity_top_method_flat) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
        }

        depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }
        // should not fail
        KMP_ASSERT(depth > 0);
        KMP_ASSERT(address2os != NULL);
    }

    // All discovery methods failed - warn (if requested) and turn affinity off.
    if (address2os == NULL) {
        if (KMP_AFFINITY_CAPABLE()
          && (__kmp_affinity_verbose || (__kmp_affinity_warnings
          && (__kmp_affinity_type != affinity_none)))) {
            KMP_WARNING(ErrorInitializeAffinity);
        }
        __kmp_affinity_type = affinity_none;
        KMP_AFFINITY_DISABLE();
        return;
    }

    // Apply KMP_PLACE_THREADS restrictions (no-op if none were requested).
    __kmp_apply_thread_places(&address2os, depth);

    //
    // Create the table of masks, indexed by thread Id.
    //
    unsigned maxIndex;
    unsigned numUnique;
    kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
      address2os, __kmp_avail_proc);
    if (__kmp_affinity_gran_levels == 0) {
        KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
    }

    //
    // Set the childNums vector in all Address objects.  This must be done
    // before we can sort using __kmp_affinity_cmp_Address_child_num(),
    // which takes into account the setting of __kmp_affinity_compact.
    //
    __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);

    // Build __kmp_affinity_masks according to the requested affinity type.
    switch (__kmp_affinity_type) {

        case affinity_explicit:
        KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
# if OMP_40_ENABLED
        if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
        {
            __kmp_affinity_process_proclist(&__kmp_affinity_masks,
              &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
              maxIndex);
        }
# if OMP_40_ENABLED
        else {
            __kmp_affinity_process_placelist(&__kmp_affinity_masks,
              &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
              maxIndex);
        }
# endif
        if (__kmp_affinity_num_masks == 0) {
            if (__kmp_affinity_verbose || (__kmp_affinity_warnings
              && (__kmp_affinity_type != affinity_none))) {
                KMP_WARNING(AffNoValidProcID);
            }
            __kmp_affinity_type = affinity_none;
            return;
        }
        break;

        //
        // The other affinity types rely on sorting the Addresses according
        // to some permutation of the machine topology tree.  Set
        // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
        // then jump to a common code fragment to do the sort and create
        // the array of affinity masks.
        //

        case affinity_logical:
        __kmp_affinity_compact = 0;
        if (__kmp_affinity_offset) {
            __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
              % __kmp_avail_proc;
        }
        goto sortAddresses;

        case affinity_physical:
        if (__kmp_nThreadsPerCore > 1) {
            __kmp_affinity_compact = 1;
            if (__kmp_affinity_compact >= depth) {
                __kmp_affinity_compact = 0;
            }
        } else {
            __kmp_affinity_compact = 0;
        }
        if (__kmp_affinity_offset) {
            __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
              % __kmp_avail_proc;
        }
        goto sortAddresses;

        case affinity_scatter:
        if (__kmp_affinity_compact >= depth) {
            __kmp_affinity_compact = 0;
        }
        else {
            __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
        }
        goto sortAddresses;

        case affinity_compact:
        if (__kmp_affinity_compact >= depth) {
            __kmp_affinity_compact = depth - 1;
        }
        goto sortAddresses;

        case affinity_balanced:
        // Balanced works only for the case of a single package and uniform topology
        if( nPackages > 1 ) {
            if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
                KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
            }
            __kmp_affinity_type = affinity_none;
            return;
        } else if( __kmp_affinity_uniform_topology() ) {
            break;
        } else { // Non-uniform topology

            // Save the depth for further usage
            __kmp_aff_depth = depth;

            // Number of hyper threads per core in HT machine
            int nth_per_core = __kmp_nThreadsPerCore;

            int core_level;
            if( nth_per_core > 1 ) {
                core_level = depth - 2;
            } else {
                core_level = depth - 1;
            }
            int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
            int nproc = nth_per_core * ncores;

            // procarr maps (core, thread) slots to OS proc ids; -1 marks
            // slots with no available proc (holes in the topology).
            procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                procarr[ i ] = -1;
            }

            for( int i = 0; i < __kmp_avail_proc; i++ ) {
                int proc = address2os[ i ].second;
                // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
                // If there is only one thread per core then depth == 2: level 0 - package,
                // level 1 - core.
                int level = depth - 1;

                // __kmp_nth_per_core == 1
                int thread = 0;
                int core = address2os[ i ].first.labels[ level ];
                // If the thread level exists, that is we have more than one thread context per core
                if( nth_per_core > 1 ) {
                    thread = address2os[ i ].first.labels[ level ] % nth_per_core;
                    core = address2os[ i ].first.labels[ level - 1 ];
                }
                procarr[ core * nth_per_core + thread ] = proc;
            }

            break;
        }

        sortAddresses:
        //
        // Allocate the gtid->affinity mask table.
        //
        if (__kmp_affinity_dups) {
            __kmp_affinity_num_masks = __kmp_avail_proc;
        }
        else {
            __kmp_affinity_num_masks = numUnique;
        }

# if OMP_40_ENABLED
        if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
          && ( __kmp_affinity_num_places > 0 )
          && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
            __kmp_affinity_num_masks = __kmp_affinity_num_places;
        }
# endif

        __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
          __kmp_affinity_num_masks * __kmp_affin_mask_size);

        //
        // Sort the address2os table according to the current setting of
        // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
        //
        qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
          __kmp_affinity_cmp_Address_child_num);
        {
            int i;
            unsigned j;
            for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
                if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
                    continue;
                }
                unsigned osId = address2os[i].second;
                kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
                kmp_affin_mask_t *dest
                  = KMP_CPU_INDEX(__kmp_affinity_masks, j);
                KMP_ASSERT(KMP_CPU_ISSET(osId, src));
                KMP_CPU_COPY(dest, src);
                if (++j >= __kmp_affinity_num_masks) {
                    break;
                }
            }
            KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
        }
        break;

        default:
        KMP_ASSERT2(0, "Unexpected affinity setting");
    }

    __kmp_free(osId2Mask);
    machine_hierarchy.init(address2os, __kmp_avail_proc);
}
3921 
3922 
3923 void
3924 __kmp_affinity_initialize(void)
3925 {
3926     //
3927     // Much of the code above was written assumming that if a machine was not
3928     // affinity capable, then __kmp_affinity_type == affinity_none.  We now
3929     // explicitly represent this as __kmp_affinity_type == affinity_disabled.
3930     //
3931     // There are too many checks for __kmp_affinity_type == affinity_none
3932     // in this code.  Instead of trying to change them all, check if
3933     // __kmp_affinity_type == affinity_disabled, and if so, slam it with
3934     // affinity_none, call the real initialization routine, then restore
3935     // __kmp_affinity_type to affinity_disabled.
3936     //
3937     int disabled = (__kmp_affinity_type == affinity_disabled);
3938     if (! KMP_AFFINITY_CAPABLE()) {
3939         KMP_ASSERT(disabled);
3940     }
3941     if (disabled) {
3942         __kmp_affinity_type = affinity_none;
3943     }
3944     __kmp_aux_affinity_initialize();
3945     if (disabled) {
3946         __kmp_affinity_type = affinity_disabled;
3947     }
3948 }
3949 
3950 
3951 void
3952 __kmp_affinity_uninitialize(void)
3953 {
3954     if (__kmp_affinity_masks != NULL) {
3955         __kmp_free(__kmp_affinity_masks);
3956         __kmp_affinity_masks = NULL;
3957     }
3958     if (fullMask != NULL) {
3959         KMP_CPU_FREE(fullMask);
3960         fullMask = NULL;
3961     }
3962     __kmp_affinity_num_masks = 0;
3963 # if OMP_40_ENABLED
3964     __kmp_affinity_num_places = 0;
3965 # endif
3966     if (__kmp_affinity_proclist != NULL) {
3967         __kmp_free(__kmp_affinity_proclist);
3968         __kmp_affinity_proclist = NULL;
3969     }
3970     if( address2os != NULL ) {
3971         __kmp_free( address2os );
3972         address2os = NULL;
3973     }
3974     if( procarr != NULL ) {
3975         __kmp_free( procarr );
3976         procarr = NULL;
3977     }
3978 }
3979 
3980 
//
// Choose and apply the initial affinity mask for thread gtid.  Selects
// either the full machine mask (KMP_PLACE_ALL) or one entry of
// __kmp_affinity_masks depending on the affinity type and (with OMP 4.0)
// the proc_bind setting; isa_root additionally initializes the root
// thread's place partition.
//
void
__kmp_affinity_set_init_mask(int gtid, int isa_root)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
    if (th->th.th_affin_mask == NULL) {
        KMP_CPU_ALLOC(th->th.th_affin_mask);
    }
    else {
        KMP_CPU_ZERO(th->th.th_affin_mask);
    }

    //
    // Copy the thread mask to the kmp_info_t structure.
    // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
    // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
    // is set, then the full mask is the same as the mask of the initialization
    // thread.
    //
    kmp_affin_mask_t *mask;
    int i;

# if OMP_40_ENABLED
    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
    {
        if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)
          ) {
# if KMP_GROUP_AFFINITY
            // With multiple processor groups there is no single full mask to
            // apply - leave the thread's affinity alone.
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# if OMP_40_ENABLED
    else {
        if ((! isa_root)
          || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
#  if KMP_GROUP_AFFINITY
            if (__kmp_num_proc_groups > 1) {
                return;
            }
#  endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            //
            // int i = some hash function or just a counter that doesn't
            // always start at 0.  Use gtid for now.
            //
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# endif

# if OMP_40_ENABLED
    th->th.th_current_place = i;
    if (isa_root) {
        // Root thread's partition initially covers all places.
        th->th.th_new_place = i;
        th->th.th_first_place = 0;
        th->th.th_last_place = __kmp_affinity_num_masks - 1;
    }

    if (i == KMP_PLACE_ALL) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
          gtid, i));
    }
# else
    if (i == -1) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
          gtid, i));
    }
# endif /* OMP_40_ENABLED */

    KMP_CPU_COPY(th->th.th_affin_mask, mask);

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
          buf);
    }

# if KMP_OS_WINDOWS
    //
    // On Windows* OS, the process affinity mask might have changed.
    // If the user didn't request affinity and this call fails,
    // just continue silently.  See CQ171393.
    //
    if ( __kmp_affinity_type == affinity_none ) {
        __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
    }
    else
# endif
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}
4102 
4103 
4104 # if OMP_40_ENABLED
4105 
4106 void
4107 __kmp_affinity_set_place(int gtid)
4108 {
4109     int retval;
4110 
4111     if (! KMP_AFFINITY_CAPABLE()) {
4112         return;
4113     }
4114 
4115     kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4116 
4117     KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
4118       gtid, th->th.th_new_place, th->th.th_current_place));
4119 
4120     //
4121     // Check that the new place is within this thread's partition.
4122     //
4123     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4124     KMP_ASSERT(th->th.th_new_place >= 0);
4125     KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4126     if (th->th.th_first_place <= th->th.th_last_place) {
4127         KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
4128          && (th->th.th_new_place <= th->th.th_last_place));
4129     }
4130     else {
4131         KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
4132          || (th->th.th_new_place >= th->th.th_last_place));
4133     }
4134 
4135     //
4136     // Copy the thread mask to the kmp_info_t strucuture,
4137     // and set this thread's affinity.
4138     //
4139     kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
4140       th->th.th_new_place);
4141     KMP_CPU_COPY(th->th.th_affin_mask, mask);
4142     th->th.th_current_place = th->th.th_new_place;
4143 
4144     if (__kmp_affinity_verbose) {
4145         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4146         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4147           th->th.th_affin_mask);
4148         KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4149           gtid, buf);
4150     }
4151     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4152 }
4153 
4154 # endif /* OMP_40_ENABLED */
4155 
4156 
//
// Implementation of kmp_set_affinity(): validate the user-supplied mask
// (when consistency checks are enabled), apply it to the calling thread,
// and record it in the thread's kmp_info_t.  Returns the system call's
// result, or -1 if affinity is not capable on this machine.
//
int
__kmp_aux_set_affinity(void **mask)
{
    int gtid;
    kmp_info_t *th;
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
          gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        else {
            // The mask must be non-empty and a subset of the full mask.
            unsigned proc;
            int num_procs = 0;

            for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
                if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
                    continue;
                }
                num_procs++;
                if (! KMP_CPU_ISSET(proc, fullMask)) {
                    KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
                    break;
                }
            }
            if (num_procs == 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }

# if KMP_GROUP_AFFINITY
            // The mask must not span multiple Windows processor groups.
            if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }
# endif /* KMP_GROUP_AFFINITY */

        }
    }

    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    if (retval == 0) {
        // Remember the mask only if the system accepted it.
        KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
    }

# if OMP_40_ENABLED
    th->th.th_current_place = KMP_PLACE_UNDEFINED;
    th->th.th_new_place = KMP_PLACE_UNDEFINED;
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;

    //
    // Turn off 4.0 affinity for the current thread at this parallel level.
    //
    th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
# endif

    return retval;
}
4229 
4230 
//
// Retrieve the calling thread's affinity mask into *mask (which the caller
// supplies as a pointer to a kmp_affin_mask_t).  On non-Windows platforms
// the mask is queried from the OS and the system call's result is returned;
// on Windows the runtime's stored copy is returned (return value 0).
// Returns -1 if the affinity mechanism is unavailable/uninitialized.
//
int
__kmp_aux_get_affinity(void **mask)
{
    int gtid;
    int retval;
    kmp_info_t *th;

    // Bail out early if affinity was never initialized or the platform
    // cannot support it.
    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);

    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
    });

    // The user-supplied pointer is validated only under consistency
    // checking; KMP_FATAL does not return.
    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
        }
    }

# if !KMP_OS_WINDOWS

    // Ask the OS for the thread's current affinity mask.
    retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
    });
    return retval;

# else

    // On Windows, return the runtime's stored copy of the mask instead of
    // querying the OS (presumably because of processor-group handling —
    // NOTE(review): confirm against the Windows affinity code).
    KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
    return 0;

# endif /* KMP_OS_WINDOWS */

}
4278 
4279 int
4280 __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
4281 {
4282     int retval;
4283 
4284     if (! KMP_AFFINITY_CAPABLE()) {
4285         return -1;
4286     }
4287 
4288     KA_TRACE(1000, ;{
4289         int gtid = __kmp_entry_gtid();
4290         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4291         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4292           (kmp_affin_mask_t *)(*mask));
4293         __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4294           proc, gtid, buf);
4295     });
4296 
4297     if (__kmp_env_consistency_check) {
4298         if ((mask == NULL) || (*mask == NULL)) {
4299             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4300         }
4301     }
4302 
4303     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4304         return -1;
4305     }
4306     if (! KMP_CPU_ISSET(proc, fullMask)) {
4307         return -2;
4308     }
4309 
4310     KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4311     return 0;
4312 }
4313 
4314 
4315 int
4316 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4317 {
4318     int retval;
4319 
4320     if (! KMP_AFFINITY_CAPABLE()) {
4321         return -1;
4322     }
4323 
4324     KA_TRACE(1000, ;{
4325         int gtid = __kmp_entry_gtid();
4326         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4327         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4328           (kmp_affin_mask_t *)(*mask));
4329         __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4330           proc, gtid, buf);
4331     });
4332 
4333     if (__kmp_env_consistency_check) {
4334         if ((mask == NULL) || (*mask == NULL)) {
4335             KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4336         }
4337     }
4338 
4339     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4340         return -1;
4341     }
4342     if (! KMP_CPU_ISSET(proc, fullMask)) {
4343         return -2;
4344     }
4345 
4346     KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4347     return 0;
4348 }
4349 
4350 
4351 int
4352 __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4353 {
4354     int retval;
4355 
4356     if (! KMP_AFFINITY_CAPABLE()) {
4357         return -1;
4358     }
4359 
4360     KA_TRACE(1000, ;{
4361         int gtid = __kmp_entry_gtid();
4362         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4363         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4364           (kmp_affin_mask_t *)(*mask));
4365         __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4366           proc, gtid, buf);
4367     });
4368 
4369     if (__kmp_env_consistency_check) {
4370         if ((mask == NULL) || (*mask == NULL)) {
4371             KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
4372         }
4373     }
4374 
4375     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4376         return 0;
4377     }
4378     if (! KMP_CPU_ISSET(proc, fullMask)) {
4379         return 0;
4380     }
4381 
4382     return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4383 }
4384 
4385 
// Dynamic affinity settings - Affinity balanced
//
// Compute and install an affinity mask for thread 'tid' out of 'nthreads'
// so that threads are spread as evenly as possible across physical cores.
// Binding granularity (single hardware thread vs. whole core) is taken
// from __kmp_affinity_gran.  Two strategies: a closed-form mapping for
// uniform topologies, and a walk over procarr[]/address2os for non-uniform
// ones.
void __kmp_balanced_affinity( int tid, int nthreads )
{
    if( __kmp_affinity_uniform_topology() ) {
        int coreID;
        int threadID;
        // Number of hyper threads per core in HT machine
        int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
        // Number of cores
        int ncores = __kmp_ncores;
        // How many threads will be bound to each core
        int chunk = nthreads / ncores;
        // How many cores will have an additional thread bound to it - "big cores"
        int big_cores = nthreads % ncores;
        // Number of threads on the big cores
        int big_nth = ( chunk + 1 ) * big_cores;
        if( tid < big_nth ) {
            coreID = tid / (chunk + 1 );
            threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
        } else { //tid >= big_nth
            // (tid - big_cores)/chunk equals big_cores + (tid - big_nth)/chunk,
            // i.e. small cores continue numbering right after the big ones.
            coreID = ( tid - big_cores ) / chunk;
            threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
        }

        KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
          "Illegal set affinity operation when not capable");

        // Build the mask on the stack; freed automatically on return.
        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Granularity == thread
        if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
            int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
            KMP_CPU_SET( osID, mask);
        } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
            // Bind to every hardware thread of the chosen core.
            for( int i = 0; i < __kmp_nth_per_core; i++ ) {
                int osID;
                osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
                KMP_CPU_SET( osID, mask);
            }
        }
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    } else { // Non-uniform topology

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Number of hyper threads per core in HT machine
        int nth_per_core = __kmp_nThreadsPerCore;
        int core_level;
        // Pick the topology-map level that identifies a core: one above the
        // leaf when hardware threads exist, the leaf itself otherwise.
        if( nth_per_core > 1 ) {
            core_level = __kmp_aff_depth - 2;
        } else {
            core_level = __kmp_aff_depth - 1;
        }

        // Number of cores - maximum value; it does not count trail cores with 0 processors
        int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;

        // For performance gain consider the special case nthreads == __kmp_avail_proc
        if( nthreads == __kmp_avail_proc ) {
            if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                // One thread per OS proc: tid indexes address2os directly.
                int osID = address2os[ tid ].second;
                KMP_CPU_SET( osID, mask);
            } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                int coreID = address2os[ tid ].first.labels[ core_level ];
                // We'll count found osIDs for the current core; they can be not more than nth_per_core;
                // since the address2os is sortied we can break when cnt==nth_per_core
                int cnt = 0;
                for( int i = 0; i < __kmp_avail_proc; i++ ) {
                    int osID = address2os[ i ].second;
                    int core = address2os[ i ].first.labels[ core_level ];
                    if( core == coreID ) {
                        KMP_CPU_SET( osID, mask);
                        cnt++;
                        if( cnt == nth_per_core ) {
                            break;
                        }
                    }
                }
            }
        } else if( nthreads <= __kmp_ncores ) {
            // At most one thread per core: give thread 'tid' the tid-th
            // core that actually has available processors.

            int core = 0;
            for( int i = 0; i < ncores; i++ ) {
                // Check if this core from procarr[] is in the mask
                int in_mask = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    // procarr[] is assumed laid out as ncores rows of
                    // nth_per_core osIDs, -1 marking unavailable slots.
                    if( procarr[ i * nth_per_core + j ] != - 1 ) {
                        in_mask = 1;
                        break;
                    }
                }
                if( in_mask ) {
                    if( tid == core ) {
                        for( int j = 0; j < nth_per_core; j++ ) {
                            int osID = procarr[ i * nth_per_core + j ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask );
                                // For granularity=thread it is enough to set the first available osID for this core
                                if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                                    break;
                                }
                            }
                        }
                        break;
                    } else {
                        core++;
                    }
                }
            }

        } else { // nthreads > __kmp_ncores
            // Oversubscription: distribute threads over hardware-thread
            // contexts, favoring cores that have more free contexts.

            // Array to save the number of processors at each core
            // NOTE(review): variable-length arrays are a compiler extension
            // in C++ — confirm all supported compilers accept this.
            int nproc_at_core[ ncores ];
            // Array to save the number of cores with "x" available processors;
            int ncores_with_x_procs[ nth_per_core + 1 ];
            // Array to save the number of cores with # procs from x to nth_per_core
            int ncores_with_x_to_max_procs[ nth_per_core + 1 ];

            for( int i = 0; i <= nth_per_core; i++ ) {
                ncores_with_x_procs[ i ] = 0;
                ncores_with_x_to_max_procs[ i ] = 0;
            }

            // Histogram: how many usable contexts does each core have?
            for( int i = 0; i < ncores; i++ ) {
                int cnt = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        cnt++;
                    }
                }
                nproc_at_core[ i ] = cnt;
                ncores_with_x_procs[ cnt ]++;
            }

            // Suffix sums of the histogram.
            for( int i = 0; i <= nth_per_core; i++ ) {
                for( int j = i; j <= nth_per_core; j++ ) {
                    ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
                }
            }

            // Max number of processors
            int nproc = nth_per_core * ncores;
            // An array to keep number of threads per each context
            int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                newarr[ i ] = 0;
            }

            // Round-robin threads onto contexts: first pass gives each
            // usable context at most one thread; subsequent passes
            // (flag != 0) stack additional threads on already-used contexts.
            int nth = nthreads;
            int flag = 0;
            while( nth > 0 ) {
                for( int j = 1; j <= nth_per_core; j++ ) {
                    int cnt = ncores_with_x_to_max_procs[ j ];
                    for( int i = 0; i < ncores; i++ ) {
                        // Skip the core with 0 processors
                        if( nproc_at_core[ i ] == 0 ) {
                            continue;
                        }
                        for( int k = 0; k < nth_per_core; k++ ) {
                            if( procarr[ i * nth_per_core + k ] != -1 ) {
                                if( newarr[ i * nth_per_core + k ] == 0 ) {
                                    newarr[ i * nth_per_core + k ] = 1;
                                    cnt--;
                                    nth--;
                                    break;
                                } else {
                                    if( flag != 0 ) {
                                        newarr[ i * nth_per_core + k ] ++;
                                        cnt--;
                                        nth--;
                                        break;
                                    }
                                }
                            }
                        }
                        if( cnt == 0 || nth == 0 ) {
                            break;
                        }
                    }
                    if( nth == 0 ) {
                        break;
                    }
                }
                flag = 1;
            }
            // Find the context this tid landed on: the first index where
            // the running total of assigned threads exceeds tid.
            int sum = 0;
            for( int i = 0; i < nproc; i++ ) {
                sum += newarr[ i ];
                if( sum > tid ) {
                    // Granularity == thread
                    if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                        int osID = procarr[ i ];
                        KMP_CPU_SET( osID, mask);
                    } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                        int coreID = i / nth_per_core;
                        for( int ii = 0; ii < nth_per_core; ii++ ) {
                            int osID = procarr[ coreID * nth_per_core + ii ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask);
                            }
                        }
                    }
                    break;
                }
            }
            __kmp_free( newarr );
        }

        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    }
}
4612 
4613 #else
4614     // affinity not supported
4615 
// Fabricated machine-hierarchy parameters used by the barrier code when
// real affinity information is unavailable.
static const kmp_uint32 noaff_maxLevels=7;
kmp_uint32 noaff_skipPerLevel[noaff_maxLevels]; // stride between adjacent subtrees at each level
kmp_uint32 noaff_depth;                         // number of hierarchy levels actually in use
kmp_uint8 noaff_leaf_kids;                      // fan-out of a leaf node, minus one
kmp_int8 noaff_uninitialized=1;                 // init state: 1 = not built, 2 = building, 0 = ready
4621 
4622 void noaff_init(int nprocs)
4623 {
4624     kmp_int8 result = KMP_COMPARE_AND_STORE_ACQ8(&noaff_uninitialized, 1, 2);
4625     if (result == 0) return; // Already initialized
4626     else if (result == 2) { // Someone else is initializing
4627         while (TCR_1(noaff_uninitialized) != 0) KMP_CPU_PAUSE();
4628         return;
4629     }
4630     KMP_DEBUG_ASSERT(result==1);
4631 
4632     kmp_uint32 numPerLevel[noaff_maxLevels];
4633     noaff_depth = 1;
4634     for (kmp_uint32 i=0; i<noaff_maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
4635         numPerLevel[i] = 1;
4636         noaff_skipPerLevel[i] = 1;
4637     }
4638 
4639     numPerLevel[0] = 4;
4640     numPerLevel[1] = nprocs/4;
4641     if (nprocs%4) numPerLevel[1]++;
4642 
4643     for (int i=noaff_maxLevels-1; i>=0; --i) // count non-empty levels to get depth
4644         if (numPerLevel[i] != 1 || noaff_depth > 1) // only count one top-level '1'
4645             noaff_depth++;
4646 
4647     kmp_uint32 branch = 4;
4648     if (numPerLevel[0] == 1) branch = nprocs/4;
4649     if (branch<4) branch=4;
4650     for (kmp_uint32 d=0; d<noaff_depth-1; ++d) { // optimize hierarchy width
4651         while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
4652             if (numPerLevel[d] & 1) numPerLevel[d]++;
4653             numPerLevel[d] = numPerLevel[d] >> 1;
4654             if (numPerLevel[d+1] == 1) noaff_depth++;
4655             numPerLevel[d+1] = numPerLevel[d+1] << 1;
4656         }
4657         if(numPerLevel[0] == 1) {
4658             branch = branch >> 1;
4659             if (branch<4) branch = 4;
4660         }
4661     }
4662 
4663     for (kmp_uint32 i=1; i<noaff_depth; ++i)
4664         noaff_skipPerLevel[i] = numPerLevel[i-1] * noaff_skipPerLevel[i-1];
4665     // Fill in hierarchy in the case of oversubscription
4666     for (kmp_uint32 i=noaff_depth; i<noaff_maxLevels; ++i)
4667         noaff_skipPerLevel[i] = 2*noaff_skipPerLevel[i-1];
4668     noaff_leaf_kids = (kmp_uint8)numPerLevel[0]-1;
4669     noaff_uninitialized = 0; // One writer
4670 
4671 }
4672 
4673 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
4674     if (noaff_uninitialized)
4675         noaff_init(nproc);
4676 
4677     thr_bar->depth = noaff_depth;
4678     thr_bar->base_leaf_kids = noaff_leaf_kids;
4679     thr_bar->skip_per_level = noaff_skipPerLevel;
4680 }
4681 
4682 #endif // KMP_AFFINITY_SUPPORTED
4683