1 /*
2  * kmp_affinity.cpp -- affinity management
3  * $Revision: 42613 $
4  * $Date: 2013-08-23 13:29:50 -0500 (Fri, 23 Aug 2013) $
5  */
6 
7 
8 //===----------------------------------------------------------------------===//
9 //
10 //                     The LLVM Compiler Infrastructure
11 //
12 // This file is dual licensed under the MIT and the University of Illinois Open
13 // Source Licenses. See LICENSE.txt for details.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 
18 #include "kmp.h"
19 #include "kmp_i18n.h"
20 #include "kmp_io.h"
21 #include "kmp_str.h"
22 
23 
24 #if KMP_OS_WINDOWS || KMP_OS_LINUX
25 
26 //
27 // Print the affinity mask to the character array in a pretty format.
28 //
29 char *
30 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
31 {
32     KMP_ASSERT(buf_len >= 40);
33     char *scan = buf;
34     char *end = buf + buf_len - 1;
35 
36     //
37     // Find first element / check for empty set.
38     //
39     size_t i;
40     for (i = 0; i < KMP_CPU_SETSIZE; i++) {
41         if (KMP_CPU_ISSET(i, mask)) {
42             break;
43         }
44     }
45     if (i == KMP_CPU_SETSIZE) {
46         sprintf(scan, "{<empty>}");
47         while (*scan != '\0') scan++;
48         KMP_ASSERT(scan <= end);
49         return buf;
50     }
51 
52     sprintf(scan, "{%ld", i);
53     while (*scan != '\0') scan++;
54     i++;
55     for (; i < KMP_CPU_SETSIZE; i++) {
56         if (! KMP_CPU_ISSET(i, mask)) {
57             continue;
58         }
59 
60         //
61         // Check for buffer overflow.  A string of the form ",<n>" will have
        // at most 10 characters, plus we want to leave room to print ",...}"
        // if the set is too large to print, for a total of 15 characters.
64         // We already left room for '\0' in setting end.
65         //
66         if (end - scan < 15) {
67            break;
68         }
69         sprintf(scan, ",%-ld", i);
70         while (*scan != '\0') scan++;
71     }
72     if (i < KMP_CPU_SETSIZE) {
73         sprintf(scan, ",...");
74         while (*scan != '\0') scan++;
75     }
76     sprintf(scan, "}");
77     while (*scan != '\0') scan++;
78     KMP_ASSERT(scan <= end);
79     return buf;
80 }
81 
82 
83 void
84 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
85 {
86     KMP_CPU_ZERO(mask);
87 
88 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
89 
90     if (__kmp_num_proc_groups > 1) {
91         int group;
92         struct GROUP_AFFINITY ga;
93         KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
94         for (group = 0; group < __kmp_num_proc_groups; group++) {
95             int i;
96             int num = __kmp_GetActiveProcessorCount(group);
97             for (i = 0; i < num; i++) {
98                 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
99             }
100         }
101     }
102     else
103 
104 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
105 
106     {
107         int proc;
108         for (proc = 0; proc < __kmp_xproc; proc++) {
109             KMP_CPU_SET(proc, mask);
110         }
111     }
112 }
113 
114 
115 //
116 // In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
117 // functions.
118 //
119 // The icc codegen emits sections with extremely long names, of the form
120 // ".gnu.linkonce.<mangled_name>".  There seems to have been a linker bug
121 // introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
122 // some sort of memory corruption or table overflow that is triggered by
123 // these long strings.  I checked the latest version of the linker -
124 // GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
125 // fixed.
126 //
127 // Unfortunately, my attempts to reproduce it in a smaller example have
128 // failed - I'm not sure what the prospects are of getting it fixed
129 // properly - but we need a reproducer smaller than all of libiomp.
130 //
131 // Work around the problem by avoiding inline constructors in such builds.
132 // We do this for all platforms, not just Linux* OS - non-inline functions are
// more debuggable and provide better coverage than inline functions.
134 // Use inline functions in shipping libs, for performance.
135 //
136 
137 # if !defined(KMP_DEBUG) && !defined(COVER)
138 
139 class Address {
140 public:
141     static const unsigned maxDepth = 32;
142     unsigned labels[maxDepth];
143     unsigned childNums[maxDepth];
144     unsigned depth;
145     unsigned leader;
146     Address(unsigned _depth)
147       : depth(_depth), leader(FALSE) {
148     }
149     Address &operator=(const Address &b) {
150         depth = b.depth;
151         for (unsigned i = 0; i < depth; i++) {
152             labels[i] = b.labels[i];
153             childNums[i] = b.childNums[i];
154         }
155         leader = FALSE;
156         return *this;
157     }
158     bool operator==(const Address &b) const {
159         if (depth != b.depth)
160             return false;
161         for (unsigned i = 0; i < depth; i++)
162             if(labels[i] != b.labels[i])
163                 return false;
164         return true;
165     }
166     bool isClose(const Address &b, int level) const {
167         if (depth != b.depth)
168             return false;
169         if ((unsigned)level >= depth)
170             return true;
171         for (unsigned i = 0; i < (depth - level); i++)
172             if(labels[i] != b.labels[i])
173                 return false;
174         return true;
175     }
176     bool operator!=(const Address &b) const {
177         return !operator==(b);
178     }
179 };
180 
181 class AddrUnsPair {
182 public:
183     Address first;
184     unsigned second;
185     AddrUnsPair(Address _first, unsigned _second)
186       : first(_first), second(_second) {
187     }
188     AddrUnsPair &operator=(const AddrUnsPair &b)
189     {
190         first = b.first;
191         second = b.second;
192         return *this;
193     }
194 };
195 
196 # else
197 
198 class Address {
199 public:
200     static const unsigned maxDepth = 32;
201     unsigned labels[maxDepth];
202     unsigned childNums[maxDepth];
203     unsigned depth;
204     unsigned leader;
205     Address(unsigned _depth);
206     Address &operator=(const Address &b);
207     bool operator==(const Address &b) const;
208     bool isClose(const Address &b, int level) const;
209     bool operator!=(const Address &b) const;
210 };
211 
212 Address::Address(unsigned _depth)
213 {
214     depth = _depth;
215     leader = FALSE;
216 }
217 
218 Address &Address::operator=(const Address &b) {
219     depth = b.depth;
220     for (unsigned i = 0; i < depth; i++) {
221         labels[i] = b.labels[i];
222         childNums[i] = b.childNums[i];
223     }
224     leader = FALSE;
225     return *this;
226 }
227 
228 bool Address::operator==(const Address &b) const {
229     if (depth != b.depth)
230         return false;
231     for (unsigned i = 0; i < depth; i++)
232         if(labels[i] != b.labels[i])
233             return false;
234     return true;
235 }
236 
237 bool Address::isClose(const Address &b, int level) const {
238     if (depth != b.depth)
239         return false;
240     if ((unsigned)level >= depth)
241         return true;
242     for (unsigned i = 0; i < (depth - level); i++)
243         if(labels[i] != b.labels[i])
244             return false;
245     return true;
246 }
247 
248 bool Address::operator!=(const Address &b) const {
249     return !operator==(b);
250 }
251 
252 class AddrUnsPair {
253 public:
254     Address first;
255     unsigned second;
256     AddrUnsPair(Address _first, unsigned _second);
257     AddrUnsPair &operator=(const AddrUnsPair &b);
258 };
259 
260 AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
261   : first(_first), second(_second)
262 {
263 }
264 
265 AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
266 {
267     first = b.first;
268     second = b.second;
269     return *this;
270 }
271 
272 # endif /* !defined(KMP_DEBUG) && !defined(COVER) */
273 
274 
275 static int
276 __kmp_affinity_cmp_Address_labels(const void *a, const void *b)
277 {
278     const Address *aa = (const Address *)&(((AddrUnsPair *)a)
279       ->first);
280     const Address *bb = (const Address *)&(((AddrUnsPair *)b)
281       ->first);
282     unsigned depth = aa->depth;
283     unsigned i;
284     KMP_DEBUG_ASSERT(depth == bb->depth);
285     for (i  = 0; i < depth; i++) {
286         if (aa->labels[i] < bb->labels[i]) return -1;
287         if (aa->labels[i] > bb->labels[i]) return 1;
288     }
289     return 0;
290 }
291 
292 
293 static int
294 __kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
295 {
296     const Address *aa = (const Address *)&(((AddrUnsPair *)a)
297       ->first);
298     const Address *bb = (const Address *)&(((AddrUnsPair *)b)
299       ->first);
300     unsigned depth = aa->depth;
301     unsigned i;
302     KMP_DEBUG_ASSERT(depth == bb->depth);
303     KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
304     KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
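    //
    // The innermost __kmp_affinity_compact levels of the address are
    // compared first and therefore carry the most weight in the sort;
    // the remaining, coarser levels break any ties.
    //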
305     for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
306         int j = depth - i - 1;
307         if (aa->childNums[j] < bb->childNums[j]) return -1;
308         if (aa->childNums[j] > bb->childNums[j]) return 1;
309     }
310     for (; i < depth; i++) {
311         int j = i - __kmp_affinity_compact;
312         if (aa->childNums[j] < bb->childNums[j]) return -1;
313         if (aa->childNums[j] > bb->childNums[j]) return 1;
314     }
315     return 0;
316 }
317 
318 
319 //
320 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
321 // called to renumber the labels from [0..n] and place them into the child_num
322 // vector of the address object.  This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level.  Example:  suppose the machine has 2 nodes
// with 2 packages each.  The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604.  If we try to sort the table
327 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
328 // because we are paying attention to the labels themselves, not the ordinal
329 // child numbers.  By using the child numbers in the sort, the result is
330 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
331 //
332 static void
333 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
334   int numAddrs)
335 {
336     KMP_DEBUG_ASSERT(numAddrs > 0);
337     int depth = address2os->first.depth;
338     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
339     unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
340       * sizeof(unsigned));
341     int labCt;
342     for (labCt = 0; labCt < depth; labCt++) {
343         address2os[0].first.childNums[labCt] = counts[labCt] = 0;
344         lastLabel[labCt] = address2os[0].first.labels[labCt];
345     }
346     int i;
347     for (i = 1; i < numAddrs; i++) {
348         for (labCt = 0; labCt < depth; labCt++) {
349             if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
350                 int labCt2;
351                 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
352                     counts[labCt2] = 0;
353                     lastLabel[labCt2] = address2os[i].first.labels[labCt2];
354                 }
355                 counts[labCt]++;
356                 lastLabel[labCt] = address2os[i].first.labels[labCt];
357                 break;
358             }
359         }
360         for (labCt = 0; labCt < depth; labCt++) {
361             address2os[i].first.childNums[labCt] = counts[labCt];
362         }
363         for (; labCt < (int)Address::maxDepth; labCt++) {
364             address2os[i].first.childNums[labCt] = 0;
365         }
366     }
367 }
368 
369 
370 //
371 // All of the __kmp_affinity_create_*_map() routines should set
372 // __kmp_affinity_masks to a vector of affinity mask objects of length
373 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
374 // return the number of levels in the machine topology tree (zero if
375 // __kmp_affinity_type == affinity_none).
376 //
// All of the __kmp_affinity_create_*_map() routines should set *fullMask
// to the affinity mask for the initialization thread.  They need to save
// and restore the mask anyway, and the mask may be needed later, so saving
// it here is just an optimization to avoid calling
// __kmp_get_system_affinity() again.
381 //
382 static kmp_affin_mask_t *fullMask = NULL;
383 
384 kmp_affin_mask_t *
385 __kmp_affinity_get_fullMask() { return fullMask; }
386 
387 
388 static int nCoresPerPkg, nPackages;
389 int __kmp_nThreadsPerCore;
390 
391 //
392 // __kmp_affinity_uniform_topology() doesn't work when called from
393 // places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// and __kmp_affinity_create_x2apicid_map().
396 //
397 inline static bool
398 __kmp_affinity_uniform_topology()
399 {
400     return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
401 }
402 
403 
404 //
405 // Print out the detailed machine topology map, i.e. the physical locations
406 // of each OS proc.
407 //
408 static void
409 __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
410   int pkgLevel, int coreLevel, int threadLevel)
411 {
412     int proc;
413 
414     KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
415     for (proc = 0; proc < len; proc++) {
416         int level;
417         kmp_str_buf_t buf;
418         __kmp_str_buf_init(&buf);
419         for (level = 0; level < depth; level++) {
420             if (level == threadLevel) {
421                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
422             }
423             else if (level == coreLevel) {
424                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
425             }
426             else if (level == pkgLevel) {
427                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
428             }
429             else if (level > pkgLevel) {
430                 __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
431                   level - pkgLevel - 1);
432             }
433             else {
434                 __kmp_str_buf_print(&buf, "L%d ", level);
435             }
436             __kmp_str_buf_print(&buf, "%d ",
437               address2os[proc].first.labels[level]);
438         }
439         KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
440           buf.str);
441         __kmp_str_buf_free(&buf);
442     }
443 }
444 
445 
446 //
447 // If we don't know how to retrieve the machine's processor topology, or
448 // encounter an error in doing so, this routine is called to form a "flat"
449 // mapping of os thread id's <-> processor id's.
450 //
451 static int
452 __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
453   kmp_i18n_id_t *const msg_id)
454 {
455     *address2os = NULL;
456     *msg_id = kmp_i18n_null;
457 
458     //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ht_enabled, & __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
462     //
463     if (! KMP_AFFINITY_CAPABLE()) {
464         KMP_ASSERT(__kmp_affinity_type == affinity_none);
465         __kmp_ncores = nPackages = __kmp_xproc;
466         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
467         __kmp_ht_enabled = FALSE;
468         if (__kmp_affinity_verbose) {
469             KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
470             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
471             KMP_INFORM(Uniform, "KMP_AFFINITY");
472             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
473               __kmp_nThreadsPerCore, __kmp_ncores);
474         }
475         return 0;
476     }
477 
478     //
479     // When affinity is off, this routine will still be called to set
480     // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
481     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
482     //  correctly, and return now if affinity is not enabled.
483     //
484     __kmp_ncores = nPackages = __kmp_avail_proc;
485     __kmp_nThreadsPerCore = nCoresPerPkg = 1;
486     __kmp_ht_enabled = FALSE;
487     if (__kmp_affinity_verbose) {
488         char buf[KMP_AFFIN_MASK_PRINT_LEN];
489         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
490 
491         KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
492         if (__kmp_affinity_respect_mask) {
493             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
494         } else {
495             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
496         }
497         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
498         KMP_INFORM(Uniform, "KMP_AFFINITY");
499         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
500           __kmp_nThreadsPerCore, __kmp_ncores);
501     }
502     if (__kmp_affinity_type == affinity_none) {
503         return 0;
504     }
505 
506     //
    // Construct the data structure to be returned.
508     //
509     *address2os = (AddrUnsPair*)
510       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
511     int avail_ct = 0;
512     unsigned int i;
513     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
514         //
515         // Skip this proc if it is not included in the machine model.
516         //
517         if (! KMP_CPU_ISSET(i, fullMask)) {
518             continue;
519         }
520 
521         Address addr(1);
522         addr.labels[0] = i;
523         (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
524     }
525     if (__kmp_affinity_verbose) {
526         KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
527     }
528 
529     if (__kmp_affinity_gran_levels < 0) {
530         //
531         // Only the package level is modeled in the machine topology map,
532         // so the #levels of granularity is either 0 or 1.
533         //
534         if (__kmp_affinity_gran > affinity_gran_package) {
535             __kmp_affinity_gran_levels = 1;
536         }
537         else {
538             __kmp_affinity_gran_levels = 0;
539         }
540     }
541     return 1;
542 }
543 
544 
545 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
546 
547 //
548 // If multiple Windows* OS processor groups exist, we can create a 2-level
549 // topology map with the groups at level 0 and the individual procs at
550 // level 1.
551 //
552 // This facilitates letting the threads float among all procs in a group,
553 // if granularity=group (the default when there are multiple groups).
554 //
555 static int
556 __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
557   kmp_i18n_id_t *const msg_id)
558 {
559     *address2os = NULL;
560     *msg_id = kmp_i18n_null;
561 
562     //
563     // If we don't have multiple processor groups, return now.
564     // The flat mapping will be used.
565     //
566     if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
567         // FIXME set *msg_id
568         return -1;
569     }
570 
571     //
    // Construct the data structure to be returned.
573     //
574     *address2os = (AddrUnsPair*)
575       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
576     int avail_ct = 0;
577     int i;
578     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
579         //
580         // Skip this proc if it is not included in the machine model.
581         //
582         if (! KMP_CPU_ISSET(i, fullMask)) {
583             continue;
584         }
585 
586         Address addr(2);
587         addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
588         addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
589         (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
590 
591         if (__kmp_affinity_verbose) {
592             KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
593               addr.labels[1]);
594         }
595     }
596 
597     if (__kmp_affinity_gran_levels < 0) {
598         if (__kmp_affinity_gran == affinity_gran_group) {
599             __kmp_affinity_gran_levels = 1;
600         }
601         else if ((__kmp_affinity_gran == affinity_gran_fine)
602           || (__kmp_affinity_gran == affinity_gran_thread)) {
603             __kmp_affinity_gran_levels = 0;
604         }
605         else {
606             const char *gran_str = NULL;
607             if (__kmp_affinity_gran == affinity_gran_core) {
608                 gran_str = "core";
609             }
610             else if (__kmp_affinity_gran == affinity_gran_package) {
611                 gran_str = "package";
612             }
613             else if (__kmp_affinity_gran == affinity_gran_node) {
614                 gran_str = "node";
615             }
616             else {
617                 KMP_ASSERT(0);
618             }
619 
620             // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
621             __kmp_affinity_gran_levels = 0;
622         }
623     }
624     return 2;
625 }
626 
627 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
628 
629 
630 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
631 
632 static int
633 __kmp_cpuid_mask_width(int count) {
634     int r = 0;
635 
636     while((1<<r) < count)
637         ++r;
638     return r;
639 }
640 
641 
642 class apicThreadInfo {
643 public:
644     unsigned osId;              // param to __kmp_affinity_bind_thread
645     unsigned apicId;            // from cpuid after binding
646     unsigned maxCoresPerPkg;    //      ""
647     unsigned maxThreadsPerPkg;  //      ""
648     unsigned pkgId;             // inferred from above values
649     unsigned coreId;            //      ""
650     unsigned threadId;          //      ""
651 };
652 
653 
654 static int
655 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
656 {
657     const apicThreadInfo *aa = (const apicThreadInfo *)a;
658     const apicThreadInfo *bb = (const apicThreadInfo *)b;
659     if (aa->osId < bb->osId) return -1;
660     if (aa->osId > bb->osId) return 1;
661     return 0;
662 }
663 
664 
665 static int
666 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
667 {
668     const apicThreadInfo *aa = (const apicThreadInfo *)a;
669     const apicThreadInfo *bb = (const apicThreadInfo *)b;
670     if (aa->pkgId < bb->pkgId) return -1;
671     if (aa->pkgId > bb->pkgId) return 1;
672     if (aa->coreId < bb->coreId) return -1;
673     if (aa->coreId > bb->coreId) return 1;
674     if (aa->threadId < bb->threadId) return -1;
675     if (aa->threadId > bb->threadId) return 1;
676     return 0;
677 }
678 
679 
680 //
681 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
682 // an algorithm which cycles through the available os threads, setting
683 // the current thread's affinity mask to that thread, and then retrieves
684 // the Apic Id for each thread context using the cpuid instruction.
685 //
686 static int
687 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
688   kmp_i18n_id_t *const msg_id)
689 {
690     int rc;
691     *address2os = NULL;
692     *msg_id = kmp_i18n_null;
693 
694 #  if KMP_MIC
695     {
696         // The code below will use cpuid(4).
697         // Check if cpuid(4) is supported.
698         // FIXME? - this really doesn't need to be specific to MIC.
699         kmp_cpuid buf;
700         __kmp_x86_cpuid(0, 0, &buf);
701         if (buf.eax < 4) {
702             *msg_id = kmp_i18n_str_NoLeaf4Support;
703             return -1;
704         }
705     }
706 #  endif // KMP_MIC
707 
708     //
709     // Even if __kmp_affinity_type == affinity_none, this routine is still
710     // called to set __kmp_ht_enabled, & __kmp_ncores, as well as
711     // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
712     //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are not
    // capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else.
717     //
718     if (! KMP_AFFINITY_CAPABLE()) {
719         //
720         // Hack to try and infer the machine topology using only the data
721         // available from cpuid on the current thread, and __kmp_xproc.
722         //
723         KMP_ASSERT(__kmp_affinity_type == affinity_none);
724 
725         //
726         // Get an upper bound on the number of threads per package using
727         // cpuid(1).
728         //
        // On some OS/chip combinations where HT is supported by the chip
730         // but is disabled, this value will be 2 on a single core chip.
731         // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
732         //
733         kmp_cpuid buf;
734         __kmp_x86_cpuid(1, 0, &buf);
735         int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
736         if (maxThreadsPerPkg == 0) {
737             maxThreadsPerPkg = 1;
738         }
739 
740         //
741         // The num cores per pkg comes from cpuid(4).
742         // 1 must be added to the encoded value.
743         //
        // The author of cpu_count.cpp treated this as only an upper bound
745         // on the number of cores, but I haven't seen any cases where it
746         // was greater than the actual number of cores, so we will treat
747         // it as exact in this block of code.
748         //
749         // First, we need to check if cpuid(4) is supported on this chip.
750         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
751         // has the value n or greater.
752         //
753         __kmp_x86_cpuid(0, 0, &buf);
754         if (buf.eax >= 4) {
755             __kmp_x86_cpuid(4, 0, &buf);
756             nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
757         }
758         else {
759             nCoresPerPkg = 1;
760         }
761 
762         //
763         // There is no way to reliably tell if HT is enabled without issuing
        // the cpuid instruction from every thread and correlating the cpuid
765         // info, so if the machine is not affinity capable, we assume that HT
766         // is off.  We have seen quite a few machines where maxThreadsPerPkg
767         // is 2, yet the machine does not support HT.
768         //
769         // - Older OSes are usually found on machines with older chips, which
770         //   do not support HT.
771         //
        // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly set
        //   to 0) is greater than the penalty for mistakenly identifying
        //   a machine as being 1 thread/core when it is really HT enabled
776         //   (which results in blocktime being incorrectly set to a positive
777         //   value).
778         //
779         __kmp_ncores = __kmp_xproc;
780         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
781         __kmp_nThreadsPerCore = 1;
782         __kmp_ht_enabled = FALSE;
783         if (__kmp_affinity_verbose) {
784             KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
785             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
786             if (__kmp_affinity_uniform_topology()) {
787                 KMP_INFORM(Uniform, "KMP_AFFINITY");
788             } else {
789                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
790             }
791             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
792               __kmp_nThreadsPerCore, __kmp_ncores);
793         }
794         return 0;
795     }
796 
797     //
798     //
799     // From here on, we can assume that it is safe to call
800     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
801     // even if __kmp_affinity_type = affinity_none.
802     //
803 
804     //
805     // Save the affinity mask for the current thread.
806     //
807     kmp_affin_mask_t *oldMask;
808     KMP_CPU_ALLOC(oldMask);
809     KMP_ASSERT(oldMask != NULL);
810     __kmp_get_system_affinity(oldMask, TRUE);
811 
812     //
813     // Run through each of the available contexts, binding the current thread
814     // to it, and obtaining the pertinent information using the cpuid instr.
815     //
816     // The relevant information is:
817     //
818     // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //    has a unique Apic Id, which is of the form pkg# : core# : thread#.
820     //
821     // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
822     //    value of this field determines the width of the core# + thread#
823     //    fields in the Apic Id.  It is also an upper bound on the number
    //    of threads per package, but it has been verified that situations
    //    happen where it is not exact.  In particular, on certain OS/chip
826     //    combinations where Intel(R) Hyper-Threading Technology is supported
827     //    by the chip but has
828     //    been disabled, the value of this field will be 2 (for a single core
829     //    chip).  On other OS/chip combinations supporting
830     //    Intel(R) Hyper-Threading Technology, the value of
831     //    this field will be 1 when Intel(R) Hyper-Threading Technology is
832     //    disabled and 2 when it is enabled.
833     //
834     // Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4).  The
835     //    value of this field (+1) determines the width of the core# field in
836     //    the Apic Id.  The comments in "cpucount.cpp" say that this value is
837     //    an upper bound, but the IA-32 architecture manual says that it is
838     //    exactly the number of cores per package, and I haven't seen any
839     //    case where it wasn't.
840     //
841     // From this information, deduce the package Id, core Id, and thread Id,
842     // and set the corresponding fields in the apicThreadInfo struct.
843     //
844     unsigned i;
845     apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
846       __kmp_avail_proc * sizeof(apicThreadInfo));
847     unsigned nApics = 0;
848     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
849         //
850         // Skip this proc if it is not included in the machine model.
851         //
852         if (! KMP_CPU_ISSET(i, fullMask)) {
853             continue;
854         }
855         KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
856 
857         __kmp_affinity_bind_thread(i);
858         threadInfo[nApics].osId = i;
859 
860         //
861         // The apic id and max threads per pkg come from cpuid(1).
862         //
863         kmp_cpuid buf;
864         __kmp_x86_cpuid(1, 0, &buf);
        if (! ((buf.edx >> 9) & 1)) {
866             __kmp_set_system_affinity(oldMask, TRUE);
867             __kmp_free(threadInfo);
868             KMP_CPU_FREE(oldMask);
869             *msg_id = kmp_i18n_str_ApicNotPresent;
870             return -1;
871         }
872         threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
873         threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
874         if (threadInfo[nApics].maxThreadsPerPkg == 0) {
875             threadInfo[nApics].maxThreadsPerPkg = 1;
876         }
877 
878         //
879         // Max cores per pkg comes from cpuid(4).
880         // 1 must be added to the encoded value.
881         //
882         // First, we need to check if cpuid(4) is supported on this chip.
883         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
884         // has the value n or greater.
885         //
886         __kmp_x86_cpuid(0, 0, &buf);
887         if (buf.eax >= 4) {
888             __kmp_x86_cpuid(4, 0, &buf);
889             threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
890         }
891         else {
892             threadInfo[nApics].maxCoresPerPkg = 1;
893         }
894 
895         //
896         // Infer the pkgId / coreId / threadId using only the info
897         // obtained locally.
898         //
899         int widthCT = __kmp_cpuid_mask_width(
900           threadInfo[nApics].maxThreadsPerPkg);
901         threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
902 
903         int widthC = __kmp_cpuid_mask_width(
904           threadInfo[nApics].maxCoresPerPkg);
905         int widthT = widthCT - widthC;
906         if (widthT < 0) {
907             //
908             // I've never seen this one happen, but I suppose it could, if
909             // the cpuid instruction on a chip was really screwed up.
910             // Make sure to restore the affinity mask before the tail call.
911             //
912             __kmp_set_system_affinity(oldMask, TRUE);
913             __kmp_free(threadInfo);
914             KMP_CPU_FREE(oldMask);
915             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
916             return -1;
917         }
918 
        int maskC = (1 << widthC) - 1;
        threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
          & maskC;

        int maskT = (1 << widthT) - 1;
        threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;
925 
926         nApics++;
927     }
928 
929     //
930     // We've collected all the info we need.
931     // Restore the old affinity mask for this thread.
932     //
933     __kmp_set_system_affinity(oldMask, TRUE);
934 
935     //
936     // If there's only one thread context to bind to, form an Address object
937     // with depth 1 and return immediately (or, if affinity is off, set
938     // address2os to NULL and return).
939     //
940     // If it is configured to omit the package level when there is only a
941     // single package, the logic at the end of this routine won't work if
942     // there is only a single thread - it would try to form an Address
943     // object with depth 0.
944     //
945     KMP_ASSERT(nApics > 0);
946     if (nApics == 1) {
947         __kmp_ncores = nPackages = 1;
948         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
949         __kmp_ht_enabled = FALSE;
950         if (__kmp_affinity_verbose) {
951             char buf[KMP_AFFIN_MASK_PRINT_LEN];
952             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
953 
954             KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
955             if (__kmp_affinity_respect_mask) {
956                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
957             } else {
958                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
959             }
960             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
961             KMP_INFORM(Uniform, "KMP_AFFINITY");
962             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
963               __kmp_nThreadsPerCore, __kmp_ncores);
964         }
965 
966         if (__kmp_affinity_type == affinity_none) {
967             __kmp_free(threadInfo);
968             KMP_CPU_FREE(oldMask);
969             return 0;
970         }
971 
972         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
973         Address addr(1);
974         addr.labels[0] = threadInfo[0].pkgId;
975         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
976 
977         if (__kmp_affinity_gran_levels < 0) {
978             __kmp_affinity_gran_levels = 0;
979         }
980 
981         if (__kmp_affinity_verbose) {
982             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
983         }
984 
985         __kmp_free(threadInfo);
986         KMP_CPU_FREE(oldMask);
987         return 1;
988     }
989 
990     //
991     // Sort the threadInfo table by physical Id.
992     //
993     qsort(threadInfo, nApics, sizeof(*threadInfo),
994       __kmp_affinity_cmp_apicThreadInfo_phys_id);
995 
996     //
997     // The table is now sorted by pkgId / coreId / threadId, but we really
998     // don't know the radix of any of the fields.  pkgId's may be sparsely
999     // assigned among the chips on a system.  Although coreId's are usually
1000     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1001     // [0..threadsPerCore-1], we don't want to make any such assumptions.
1002     //
1003     // For that matter, we don't know what coresPerPkg and threadsPerCore
1004     // (or the total # packages) are at this point - we want to determine
1005     // that now.  We only have an upper bound on the first two figures.
1006     //
1007     // We also perform a consistency check at this point: the values returned
1008     // by the cpuid instruction for any thread bound to a given package had
1009     // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1010     //
1011     nPackages = 1;
1012     nCoresPerPkg = 1;
1013     __kmp_nThreadsPerCore = 1;
1014     unsigned nCores = 1;
1015 
1016     unsigned pkgCt = 1;                         // to determine radii
1017     unsigned lastPkgId = threadInfo[0].pkgId;
1018     unsigned coreCt = 1;
1019     unsigned lastCoreId = threadInfo[0].coreId;
1020     unsigned threadCt = 1;
1021     unsigned lastThreadId = threadInfo[0].threadId;
1022 
                                                // intra-pkg consistency checks
1024     unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1025     unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1026 
1027     for (i = 1; i < nApics; i++) {
1028         if (threadInfo[i].pkgId != lastPkgId) {
1029             nCores++;
1030             pkgCt++;
1031             lastPkgId = threadInfo[i].pkgId;
1032             if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1033             coreCt = 1;
1034             lastCoreId = threadInfo[i].coreId;
1035             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1036             threadCt = 1;
1037             lastThreadId = threadInfo[i].threadId;
1038 
1039             //
1040             // This is a different package, so go on to the next iteration
1041             // without doing any consistency checks.  Reset the consistency
1042             // check vars, though.
1043             //
1044             prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1045             prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1046             continue;
1047         }
1048 
1049         if (threadInfo[i].coreId != lastCoreId) {
1050             nCores++;
1051             coreCt++;
1052             lastCoreId = threadInfo[i].coreId;
1053             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1054             threadCt = 1;
1055             lastThreadId = threadInfo[i].threadId;
1056         }
1057         else if (threadInfo[i].threadId != lastThreadId) {
1058             threadCt++;
1059             lastThreadId = threadInfo[i].threadId;
1060         }
1061         else {
1062             __kmp_free(threadInfo);
1063             KMP_CPU_FREE(oldMask);
1064             *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1065             return -1;
1066         }
1067 
1068         //
1069         // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
        // fields agree between all the threads bound to a given package.
1071         //
1072         if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
1073           || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1074             __kmp_free(threadInfo);
1075             KMP_CPU_FREE(oldMask);
1076             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1077             return -1;
1078         }
1079     }
1080     nPackages = pkgCt;
1081     if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1082     if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1083 
1084     //
1085     // When affinity is off, this routine will still be called to set
1086     // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
1087     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1088     // correctly, and return now if affinity is not enabled.
1089     //
1090     __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);
1091     __kmp_ncores = nCores;
1092     if (__kmp_affinity_verbose) {
1093         char buf[KMP_AFFIN_MASK_PRINT_LEN];
1094         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1095 
1096         KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1097         if (__kmp_affinity_respect_mask) {
1098             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1099         } else {
1100             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1101         }
1102         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1103         if (__kmp_affinity_uniform_topology()) {
1104             KMP_INFORM(Uniform, "KMP_AFFINITY");
1105         } else {
1106             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1107         }
1108         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1109           __kmp_nThreadsPerCore, __kmp_ncores);
1110 
1111     }
1112 
1113     if (__kmp_affinity_type == affinity_none) {
1114         __kmp_free(threadInfo);
1115         KMP_CPU_FREE(oldMask);
1116         return 0;
1117     }
1118 
1119     //
1120     // Now that we've determined the number of packages, the number of cores
1121     // per package, and the number of threads per core, we can construct the
1122     // data structure that is to be returned.
1123     //
1124     int pkgLevel = 0;
1125     int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1126     int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1127     unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
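
    //
    // For example, on a machine with Intel(R) Hyper-Threading Technology
    // enabled and more than one core per package, pkgLevel == 0,
    // coreLevel == 1, threadLevel == 2, and depth == 3.
    //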
1128 
1129     KMP_ASSERT(depth > 0);
1130     *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1131 
1132     for (i = 0; i < nApics; ++i) {
1133         Address addr(depth);
1134         unsigned os = threadInfo[i].osId;
1135         int d = 0;
1136 
1137         if (pkgLevel >= 0) {
1138             addr.labels[d++] = threadInfo[i].pkgId;
1139         }
1140         if (coreLevel >= 0) {
1141             addr.labels[d++] = threadInfo[i].coreId;
1142         }
1143         if (threadLevel >= 0) {
1144             addr.labels[d++] = threadInfo[i].threadId;
1145         }
1146         (*address2os)[i] = AddrUnsPair(addr, os);
1147     }
1148 
1149     if (__kmp_affinity_gran_levels < 0) {
1150         //
1151         // Set the granularity level based on what levels are modeled
1152         // in the machine topology map.
1153         //
1154         __kmp_affinity_gran_levels = 0;
1155         if ((threadLevel >= 0)
1156           && (__kmp_affinity_gran > affinity_gran_thread)) {
1157             __kmp_affinity_gran_levels++;
1158         }
1159         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1160             __kmp_affinity_gran_levels++;
1161         }
1162         if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1163             __kmp_affinity_gran_levels++;
1164         }
1165     }
1166 
1167     if (__kmp_affinity_verbose) {
1168         __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1169           coreLevel, threadLevel);
1170     }
1171 
1172     __kmp_free(threadInfo);
1173     KMP_CPU_FREE(oldMask);
1174     return depth;
1175 }
1176 
1177 
1178 //
1179 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1180 // architectures support a newer interface for specifying the x2APIC Ids,
1181 // based on cpuid leaf 11.
1182 //
1183 static int
1184 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1185   kmp_i18n_id_t *const msg_id)
1186 {
1187     kmp_cpuid buf;
1188 
1189     *address2os = NULL;
1190     *msg_id = kmp_i18n_null;
1191 
1192     //
1193     // Check to see if cpuid leaf 11 is supported.
1194     //
1195     __kmp_x86_cpuid(0, 0, &buf);
1196     if (buf.eax < 11) {
1197         *msg_id = kmp_i18n_str_NoLeaf11Support;
1198         return -1;
1199     }
1200     __kmp_x86_cpuid(11, 0, &buf);
1201     if (buf.ebx == 0) {
1202         *msg_id = kmp_i18n_str_NoLeaf11Support;
1203         return -1;
1204     }
1205 
1206     //
1207     // Find the number of levels in the machine topology.  While we're at it,
1208     // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.  We will
1209     // try to get more accurate values later by explicitly counting them,
1210     // but get reasonable defaults now, in case we return early.
1211     //
1212     int level;
1213     int threadLevel = -1;
1214     int coreLevel = -1;
1215     int pkgLevel = -1;
1216     __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1217 
1218     for (level = 0;; level++) {
1219         if (level > 31) {
1220             //
1221             // FIXME: Hack for DPD200163180
1222             //
1223             // If level is big then something went wrong -> exiting
1224             //
1225             // There could actually be 32 valid levels in the machine topology,
1226             // but so far, the only machine we have seen which does not exit
1227             // this loop before iteration 32 has fubar x2APIC settings.
1228             //
1229             // For now, just reject this case based upon loop trip count.
1230             //
1231             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1232             return -1;
1233         }
1234         __kmp_x86_cpuid(11, level, &buf);
1235         if (buf.ebx == 0) {
1236             if (pkgLevel < 0) {
1237                 //
1238                 // Will infer nPackages from __kmp_xproc
1239                 //
1240                 pkgLevel = level;
1241                 level++;
1242             }
1243             break;
1244         }
1245         int kind = (buf.ecx >> 8) & 0xff;
1246         if (kind == 1) {
1247             //
1248             // SMT level
1249             //
1250             threadLevel = level;
1251             coreLevel = -1;
1252             pkgLevel = -1;
1253             __kmp_nThreadsPerCore = buf.ebx & 0xff;
1254             if (__kmp_nThreadsPerCore == 0) {
1255                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1256                 return -1;
1257             }
1258         }
1259         else if (kind == 2) {
1260             //
1261             // core level
1262             //
1263             coreLevel = level;
1264             pkgLevel = -1;
1265             nCoresPerPkg = buf.ebx & 0xff;
1266             if (nCoresPerPkg == 0) {
1267                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1268                 return -1;
1269             }
1270         }
1271         else {
1272             if (level <= 0) {
1273                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1274                 return -1;
1275             }
1276             if (pkgLevel >= 0) {
1277                 continue;
1278             }
1279             pkgLevel = level;
1280             nPackages = buf.ebx & 0xff;
1281             if (nPackages == 0) {
1282                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1283                 return -1;
1284             }
1285         }
1286     }
1287     int depth = level;
1288 
1289     //
1290     // In the above loop, "level" was counted from the finest level (usually
1291     // thread) to the coarsest.  The caller expects that we will place the
1292     // labels in (*address2os)[].first.labels[] in the inverse order, so
1293     // we need to invert the vars saying which level means what.
1294     //
1295     if (threadLevel >= 0) {
1296         threadLevel = depth - threadLevel - 1;
1297     }
1298     if (coreLevel >= 0) {
1299         coreLevel = depth - coreLevel - 1;
1300     }
1301     KMP_DEBUG_ASSERT(pkgLevel >= 0);
1302     pkgLevel = depth - pkgLevel - 1;
1303 
1304     //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are not
    // capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding to
    // each proc.
1310     //
1311     if (! KMP_AFFINITY_CAPABLE())
1312     {
1313         //
1314         // Hack to try and infer the machine topology using only the data
1315         // available from cpuid on the current thread, and __kmp_xproc.
1316         //
1317         KMP_ASSERT(__kmp_affinity_type == affinity_none);
1318 
1319         __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1320         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1321         __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);
1322         if (__kmp_affinity_verbose) {
1323             KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1324             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1325             if (__kmp_affinity_uniform_topology()) {
1326                 KMP_INFORM(Uniform, "KMP_AFFINITY");
1327             } else {
1328                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1329             }
1330             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1331               __kmp_nThreadsPerCore, __kmp_ncores);
1332         }
1333         return 0;
1334     }
1335 
1336     //
1337     //
1338     // From here on, we can assume that it is safe to call
1339     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1340     // even if __kmp_affinity_type = affinity_none.
1341     //
1342 
1343     //
1344     // Save the affinity mask for the current thread.
1345     //
1346     kmp_affin_mask_t *oldMask;
1347     KMP_CPU_ALLOC(oldMask);
1348     __kmp_get_system_affinity(oldMask, TRUE);
1349 
1350     //
1351     // Allocate the data structure to be returned.
1352     //
1353     AddrUnsPair *retval = (AddrUnsPair *)
1354       __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1355 
1356     //
1357     // Run through each of the available contexts, binding the current thread
1358     // to it, and obtaining the pertinent information using the cpuid instr.
1359     //
1360     unsigned int proc;
1361     int nApics = 0;
1362     for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
1363         //
1364         // Skip this proc if it is not included in the machine model.
1365         //
1366         if (! KMP_CPU_ISSET(proc, fullMask)) {
1367             continue;
1368         }
1369         KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1370 
1371         __kmp_affinity_bind_thread(proc);
1372 
1373         //
        // Extract the labels for each level in the machine topology map
1375         // from the Apic ID.
1376         //
1377         Address addr(depth);
1378         int prev_shift = 0;
1379 
1380         for (level = 0; level < depth; level++) {
1381             __kmp_x86_cpuid(11, level, &buf);
1382             unsigned apicId = buf.edx;
1383             if (buf.ebx == 0) {
1384                 if (level != depth - 1) {
1385                     KMP_CPU_FREE(oldMask);
1386                     *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1387                     return -1;
1388                 }
1389                 addr.labels[depth - level - 1] = apicId >> prev_shift;
1390                 level++;
1391                 break;
1392             }
1393             int shift = buf.eax & 0x1f;
1394             int mask = (1 << shift) - 1;
1395             addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1396             prev_shift = shift;
1397         }
1398         if (level != depth) {
1399             KMP_CPU_FREE(oldMask);
1400             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1401             return -1;
1402         }
1403 
1404         retval[nApics] = AddrUnsPair(addr, proc);
1405         nApics++;
1406     }
1407 
1408     //
1409     // We've collected all the info we need.
1410     // Restore the old affinity mask for this thread.
1411     //
1412     __kmp_set_system_affinity(oldMask, TRUE);
1413 
1414     //
1415     // If there's only one thread context to bind to, return now.
1416     //
1417     KMP_ASSERT(nApics > 0);
1418     if (nApics == 1) {
1419         __kmp_ncores = nPackages = 1;
1420         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1421         __kmp_ht_enabled = FALSE;
1422         if (__kmp_affinity_verbose) {
1423             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1424             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1425 
1426             KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1427             if (__kmp_affinity_respect_mask) {
1428                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1429             } else {
1430                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1431             }
1432             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1433             KMP_INFORM(Uniform, "KMP_AFFINITY");
1434             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1435               __kmp_nThreadsPerCore, __kmp_ncores);
1436         }
1437 
1438         if (__kmp_affinity_type == affinity_none) {
1439             __kmp_free(retval);
1440             KMP_CPU_FREE(oldMask);
1441             return 0;
1442         }
1443 
1444         //
1445         // Form an Address object which only includes the package level.
1446         //
1447         Address addr(1);
1448         addr.labels[0] = retval[0].first.labels[pkgLevel];
1449         retval[0].first = addr;
1450 
1451         if (__kmp_affinity_gran_levels < 0) {
1452             __kmp_affinity_gran_levels = 0;
1453         }
1454 
1455         if (__kmp_affinity_verbose) {
1456             __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1457         }
1458 
1459         *address2os = retval;
1460         KMP_CPU_FREE(oldMask);
1461         return 1;
1462     }
1463 
1464     //
1465     // Sort the table by physical Id.
1466     //
1467     qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1468 
1469     //
1470     // Find the radix at each of the levels.
1471     //
1472     unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1473     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1474     unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1475     unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1476     for (level = 0; level < depth; level++) {
1477         totals[level] = 1;
1478         maxCt[level] = 1;
1479         counts[level] = 1;
1480         last[level] = retval[0].first.labels[level];
1481     }
1482 
1483     //
    // From here on, the iteration variable "level" runs from the coarsest
    // level to the finest, i.e. we iterate forward through
    // (*address2os)[].first.labels[] - in the previous loops, we iterated
    // backwards.
1488     //
1489     for (proc = 1; (int)proc < nApics; proc++) {
1490         int level;
1491         for (level = 0; level < depth; level++) {
1492             if (retval[proc].first.labels[level] != last[level]) {
1493                 int j;
1494                 for (j = level + 1; j < depth; j++) {
1495                     totals[j]++;
1496                     counts[j] = 1;
                    // The line below (now commented out) causes incorrect
                    // topology information to be printed when the maximum
                    // value for some level (maxCt[level]) is encountered
                    // before a smaller value while walking the array.
                    // For example, if pkg0 has 4 cores and pkg1 has 2 cores,
                    // maxCt[1] would end up as 2 whereas it must be 4.
                    // TODO!!! Check if it can be commented safely
1503                     //maxCt[j] = 1;
1504                     last[j] = retval[proc].first.labels[j];
1505                 }
1506                 totals[level]++;
1507                 counts[level]++;
1508                 if (counts[level] > maxCt[level]) {
1509                     maxCt[level] = counts[level];
1510                 }
1511                 last[level] = retval[proc].first.labels[level];
1512                 break;
1513             }
1514             else if (level == depth - 1) {
1515                 __kmp_free(last);
1516                 __kmp_free(maxCt);
1517                 __kmp_free(counts);
1518                 __kmp_free(totals);
1519                 __kmp_free(retval);
1520                 KMP_CPU_FREE(oldMask);
1521                 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1522                 return -1;
1523             }
1524         }
1525     }
1526 
1527     //
1528     // When affinity is off, this routine will still be called to set
1529     // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
1530     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1531     // correctly, and return if affinity is not enabled.
1532     //
1533     if (threadLevel >= 0) {
1534         __kmp_nThreadsPerCore = maxCt[threadLevel];
1535     }
1536     else {
1537         __kmp_nThreadsPerCore = 1;
1538     }
1539     __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);
1540 
1541     nPackages = totals[pkgLevel];
1542 
1543     if (coreLevel >= 0) {
1544         __kmp_ncores = totals[coreLevel];
1545         nCoresPerPkg = maxCt[coreLevel];
1546     }
1547     else {
1548         __kmp_ncores = nPackages;
1549         nCoresPerPkg = 1;
1550     }
1551 
1552     //
1553     // Check to see if the machine topology is uniform
1554     //
1555     unsigned prod = maxCt[0];
1556     for (level = 1; level < depth; level++) {
1557        prod *= maxCt[level];
1558     }
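    //
    // After the loop above, level == depth, so totals[level - 1] is
    // totals[depth - 1], which equals nApics (every unique address bumps it).
    // The topology is uniform iff the per-level maximums multiply out to
    // exactly that count.
    //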
1559     bool uniform = (prod == totals[level - 1]);
1560 
1561     //
1562     // Print the machine topology summary.
1563     //
1564     if (__kmp_affinity_verbose) {
1565         char mask[KMP_AFFIN_MASK_PRINT_LEN];
1566         __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1567 
1568         KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1569         if (__kmp_affinity_respect_mask) {
1570             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1571         } else {
1572             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1573         }
1574         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1575         if (uniform) {
1576             KMP_INFORM(Uniform, "KMP_AFFINITY");
1577         } else {
1578             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1579         }
1580 
1581         kmp_str_buf_t buf;
1582         __kmp_str_buf_init(&buf);
1583 
1584         __kmp_str_buf_print(&buf, "%d", totals[0]);
1585         for (level = 1; level <= pkgLevel; level++) {
1586             __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1587         }
1588         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1589           __kmp_nThreadsPerCore, __kmp_ncores);
1590 
1591         __kmp_str_buf_free(&buf);
1592     }
1593 
1594     if (__kmp_affinity_type == affinity_none) {
1595         __kmp_free(last);
1596         __kmp_free(maxCt);
1597         __kmp_free(counts);
1598         __kmp_free(totals);
1599         __kmp_free(retval);
1600         KMP_CPU_FREE(oldMask);
1601         return 0;
1602     }
1603 
1604     //
1605     // Find any levels with radix 1, and remove them from the map
1606     // (except for the package level).
1607     //
1608     int new_depth = 0;
1609     for (level = 0; level < depth; level++) {
1610         if ((maxCt[level] == 1) && (level != pkgLevel)) {
1611            continue;
1612         }
1613         new_depth++;
1614     }
1615 
1616     //
1617     // If we are removing any levels, allocate a new vector to return,
1618     // and copy the relevant information to it.
1619     //
1620     if (new_depth != depth) {
1621         AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1622           sizeof(AddrUnsPair) * nApics);
1623         for (proc = 0; (int)proc < nApics; proc++) {
1624             Address addr(new_depth);
1625             new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1626         }
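        //
        // Copy the labels of the surviving levels, and shift the pkg / core /
        // thread level indices down by one for each removed level that
        // precedes them.
        //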
1627         int new_level = 0;
1628         for (level = 0; level < depth; level++) {
1629             if ((maxCt[level] == 1) && (level != pkgLevel)) {
1630                if (level == threadLevel) {
1631                    threadLevel = -1;
1632                }
1633                else if ((threadLevel >= 0) && (level < threadLevel)) {
1634                    threadLevel--;
1635                }
1636                if (level == coreLevel) {
1637                    coreLevel = -1;
1638                }
1639                else if ((coreLevel >= 0) && (level < coreLevel)) {
1640                    coreLevel--;
1641                }
1642                if (level < pkgLevel) {
1643                    pkgLevel--;
1644                }
1645                continue;
1646             }
1647             for (proc = 0; (int)proc < nApics; proc++) {
1648                 new_retval[proc].first.labels[new_level]
1649                   = retval[proc].first.labels[level];
1650             }
1651             new_level++;
1652         }
1653 
1654         __kmp_free(retval);
1655         retval = new_retval;
1656         depth = new_depth;
1657     }
1658 
1659     if (__kmp_affinity_gran_levels < 0) {
1660         //
1661         // Set the granularity level based on what levels are modeled
1662         // in the machine topology map.
1663         //
1664         __kmp_affinity_gran_levels = 0;
1665         if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1666             __kmp_affinity_gran_levels++;
1667         }
1668         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1669             __kmp_affinity_gran_levels++;
1670         }
1671         if (__kmp_affinity_gran > affinity_gran_package) {
1672             __kmp_affinity_gran_levels++;
1673         }
1674     }
1675 
1676     if (__kmp_affinity_verbose) {
1677         __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1678           coreLevel, threadLevel);
1679     }
1680 
1681     __kmp_free(last);
1682     __kmp_free(maxCt);
1683     __kmp_free(counts);
1684     __kmp_free(totals);
1685     KMP_CPU_FREE(oldMask);
1686     *address2os = retval;
1687     return depth;
1688 }
1689 
1690 
1691 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1692 
1693 
1694 #define osIdIndex       0
1695 #define threadIdIndex   1
1696 #define coreIdIndex     2
1697 #define pkgIdIndex      3
1698 #define nodeIdIndex     4
1699 
1700 typedef unsigned *ProcCpuInfo;
1701 static unsigned maxIndex = pkgIdIndex;
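//
// Each row of the threadInfo table built below holds one proc's fields,
// indexed by the constants above; maxIndex is extended past pkgIdIndex when
// node_<n> id fields are present in the cpuinfo file.
//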
1702 
1703 
1704 static int
1705 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1706 {
1707     const unsigned *aa = (const unsigned *)a;
1708     const unsigned *bb = (const unsigned *)b;
1709     if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1710     if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1711     return 0;
1712 }
1713 
1714 
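//
// Compare two cpuinfo records by their ids, from the most significant field
// (node / package) down to the OS proc id, so that qsort groups procs by
// package, then core, then thread.
//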
1715 static int
1716 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1717 {
1718     unsigned i;
1719     const unsigned *aa = *((const unsigned **)a);
1720     const unsigned *bb = *((const unsigned **)b);
1721     for (i = maxIndex; ; i--) {
1722         if (aa[i] < bb[i]) return -1;
1723         if (aa[i] > bb[i]) return 1;
1724         if (i == osIdIndex) break;
1725     }
1726     return 0;
1727 }
1728 
1729 
1730 //
1731 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1732 // affinity map.
1733 //
1734 static int
1735 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1736   kmp_i18n_id_t *const msg_id, FILE *f)
1737 {
1738     *address2os = NULL;
1739     *msg_id = kmp_i18n_null;
1740 
1741     //
1742     // Scan the file and count the number of "processor" (osId) fields,
1743     // and find the highest value of <n> for a node_<n> field.
1744     //
1745     char buf[256];
1746     unsigned num_records = 0;
1747     while (! feof(f)) {
1748         buf[sizeof(buf) - 1] = 1;
1749         if (! fgets(buf, sizeof(buf), f)) {
1750             //
1751             // Read error, presumably because of EOF
1752             //
1753             break;
1754         }
1755 
1756         char s1[] = "processor";
1757         if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1758             num_records++;
1759             continue;
1760         }
1761 
1762         //
1763         // FIXME - this will match "node_<n> <garbage>"
1764         //
1765         unsigned level;
1766         if (sscanf(buf, "node_%u id", &level) == 1) {
1767             if (nodeIdIndex + level >= maxIndex) {
1768                 maxIndex = nodeIdIndex + level;
1769             }
1770             continue;
1771         }
1772     }
1773 
1774     //
1775     // Check for empty file / no valid processor records, or too many.
1776     // The number of records can't exceed the number of valid bits in the
1777     // affinity mask.
1778     //
1779     if (num_records == 0) {
1780         *line = 0;
1781         *msg_id = kmp_i18n_str_NoProcRecords;
1782         return -1;
1783     }
1784     if (num_records > (unsigned)__kmp_xproc) {
1785         *line = 0;
1786         *msg_id = kmp_i18n_str_TooManyProcRecords;
1787         return -1;
1788     }
1789 
1790     //
1791     // Set the file pointer back to the beginning, so that we can scan the
1792     // file again, this time performing a full parse of the data.
1793     // Allocate a vector of ProcCpuInfo objects, where we will place the data.
1794     // Adding an extra element at the end allows us to remove a lot of extra
1795     // checks for termination conditions.
1796     //
1797     if (fseek(f, 0, SEEK_SET) != 0) {
1798         *line = 0;
1799         *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1800         return -1;
1801     }
1802 
1803     //
1804     // Allocate the array of records to store the proc info in.  The dummy
1805     // element at the end makes the logic in filling them out easier to code.
1806     //
1807     unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1808       * sizeof(unsigned *));
1809     unsigned i;
1810     for (i = 0; i <= num_records; i++) {
1811         threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1812           * sizeof(unsigned));
1813     }
1814 
1815 #define CLEANUP_THREAD_INFO \
1816     for (i = 0; i <= num_records; i++) {                                \
1817         __kmp_free(threadInfo[i]);                                      \
1818     }                                                                   \
1819     __kmp_free(threadInfo);
1820 
1821     //
1822     // A value of UINT_MAX means that we didn't find the field
1823     //
1824     unsigned __index;
1825 
1826 #define INIT_PROC_INFO(p) \
1827     for (__index = 0; __index <= maxIndex; __index++) {                 \
1828         (p)[__index] = UINT_MAX;                                        \
1829     }
1830 
1831     for (i = 0; i <= num_records; i++) {
1832         INIT_PROC_INFO(threadInfo[i]);
1833     }
1834 
1835     unsigned num_avail = 0;
1836     *line = 0;
1837     while (! feof(f)) {
1838         //
1839         // Create an inner scoping level, so that all the goto targets at the
1840         // end of the loop appear in an outer scoping level.  This avoids
1841         // warnings about jumping past an initialization to a target in the
1842         // same block.
1843         //
1844         {
1845             buf[sizeof(buf) - 1] = 1;
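            //
            // buf[sizeof(buf) - 1] is a sentinel: fgets() overwrites it with
            // '\0' only when it fills the entire buffer, which is how
            // over-long lines are detected below.
            //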
1846             bool long_line = false;
1847             if (! fgets(buf, sizeof(buf), f)) {
1848                 //
1849                 // Read error, presumably because of EOF
1850                 //
1851                 // If there is valid data in threadInfo[num_avail], then fake
1852                 // a blank line to ensure that the last address gets parsed.
1853                 //
1854                 bool valid = false;
1855                 for (i = 0; i <= maxIndex; i++) {
1856                     if (threadInfo[num_avail][i] != UINT_MAX) {
1857                         valid = true;
1858                     }
1859                 }
1860                 if (! valid) {
1861                     break;
1862                 }
1863                 buf[0] = 0;
1864             } else if (!buf[sizeof(buf) - 1]) {
1865                 //
1866                 // The line is longer than the buffer.  Set a flag and don't
1867                 // emit an error if we were going to ignore the line, anyway.
1868                 //
1869                 long_line = true;
1870 
1871 #define CHECK_LINE \
1872     if (long_line) {                                                    \
1873         CLEANUP_THREAD_INFO;                                            \
1874         *msg_id = kmp_i18n_str_LongLineCpuinfo;                         \
1875         return -1;                                                      \
1876     }
1877             }
1878             (*line)++;
1879 
1880             char s1[] = "processor";
1881             if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1882                 CHECK_LINE;
1883                 char *p = strchr(buf + sizeof(s1) - 1, ':');
1884                 unsigned val;
1885                 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1886                 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
1887                 threadInfo[num_avail][osIdIndex] = val;
1888                 continue;
1889             }
1890             char s2[] = "physical id";
1891             if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
1892                 CHECK_LINE;
1893                 char *p = strchr(buf + sizeof(s2) - 1, ':');
1894                 unsigned val;
1895                 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1896                 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
1897                 threadInfo[num_avail][pkgIdIndex] = val;
1898                 continue;
1899             }
1900             char s3[] = "core id";
1901             if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
1902                 CHECK_LINE;
1903                 char *p = strchr(buf + sizeof(s3) - 1, ':');
1904                 unsigned val;
1905                 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1906                 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
1907                 threadInfo[num_avail][coreIdIndex] = val;
1908                 continue;
1909             }
1910             char s4[] = "thread id";
1911             if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
1912                 CHECK_LINE;
1913                 char *p = strchr(buf + sizeof(s4) - 1, ':');
1914                 unsigned val;
1915                 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1916                 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
1917                 threadInfo[num_avail][threadIdIndex] = val;
1918                 continue;
1919             }
1920             unsigned level;
1921             if (sscanf(buf, "node_%u id", &level) == 1) {
1922                 CHECK_LINE;
1923                 char *p = strchr(buf + sizeof(s4) - 1, ':');
1924                 unsigned val;
1925                 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1926                 KMP_ASSERT(nodeIdIndex + level <= maxIndex);
1927                 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
1928                 threadInfo[num_avail][nodeIdIndex + level] = val;
1929                 continue;
1930             }
1931 
1932             //
1933             // We didn't recognize the leading token on the line.
1934             // There are lots of leading tokens that we don't recognize -
1935             // if the line isn't empty, go on to the next line.
1936             //
1937             if ((*buf != 0) && (*buf != '\n')) {
1938                 //
1939                 // If the line is longer than the buffer, read characters
1940                 // until we find a newline.
1941                 //
1942                 if (long_line) {
1943                     int ch;
1944                     while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
1945                 }
1946                 continue;
1947             }
1948 
1949             //
1950             // A newline has signalled the end of the processor record.
1951             // Check that there aren't too many procs specified.
1952             //
1953             if (num_avail == __kmp_xproc) {
1954                 CLEANUP_THREAD_INFO;
1955                 *msg_id = kmp_i18n_str_TooManyEntries;
1956                 return -1;
1957             }
1958 
1959             //
1960             // Check for missing fields.  The osId field must be there, and we
1961             // currently require that the physical id field is specified, also.
1962             //
1963             if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
1964                 CLEANUP_THREAD_INFO;
1965                 *msg_id = kmp_i18n_str_MissingProcField;
1966                 return -1;
1967             }
1968             if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
1969                 CLEANUP_THREAD_INFO;
1970                 *msg_id = kmp_i18n_str_MissingPhysicalIDField;
1971                 return -1;
1972             }
1973 
1974             //
1975             // Skip this proc if it is not included in the machine model.
1976             //
1977             if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
1978                 INIT_PROC_INFO(threadInfo[num_avail]);
1979                 continue;
1980             }
1981 
1982             //
1983             // We have a successful parse of this proc's info.
1984             // Increment the counter, and prepare for the next proc.
1985             //
1986             num_avail++;
1987             KMP_ASSERT(num_avail <= num_records);
1988             INIT_PROC_INFO(threadInfo[num_avail]);
1989         }
1990         continue;
1991 
1992         no_val:
1993         CLEANUP_THREAD_INFO;
1994         *msg_id = kmp_i18n_str_MissingValCpuinfo;
1995         return -1;
1996 
1997         dup_field:
1998         CLEANUP_THREAD_INFO;
1999         *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2000         return -1;
2001     }
2002     *line = 0;
2003 
2004 # if KMP_MIC && REDUCE_TEAM_SIZE
2005     unsigned teamSize = 0;
2006 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2007 
2008     // check for num_records == __kmp_xproc ???
2009 
2010     //
2011     // If there's only one thread context to bind to, form an Address object
2012     // with depth 1 and return immediately (or, if affinity is off, set
2013     // address2os to NULL and return).
2014     //
2015     // If it is configured to omit the package level when there is only a
2016     // single package, the logic at the end of this routine won't work if
2017     // there is only a single thread - it would try to form an Address
2018     // object with depth 0.
2019     //
2020     KMP_ASSERT(num_avail > 0);
2021     KMP_ASSERT(num_avail <= num_records);
2022     if (num_avail == 1) {
2023         __kmp_ncores = 1;
2024         __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2025         __kmp_ht_enabled = FALSE;
2026         if (__kmp_affinity_verbose) {
2027             if (! KMP_AFFINITY_CAPABLE()) {
2028                 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2029                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2030                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2031             }
2032             else {
2033                 char buf[KMP_AFFIN_MASK_PRINT_LEN];
2034                 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2035                   fullMask);
2036                 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2037                 if (__kmp_affinity_respect_mask) {
2038                     KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2039                 } else {
2040                     KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2041                 }
2042                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2043                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2044             }
2045             int index;
2046             kmp_str_buf_t buf;
2047             __kmp_str_buf_init(&buf);
2048             __kmp_str_buf_print(&buf, "1");
2049             for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2050                 __kmp_str_buf_print(&buf, " x 1");
2051             }
2052             KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2053             __kmp_str_buf_free(&buf);
2054         }
2055 
2056         if (__kmp_affinity_type == affinity_none) {
2057             CLEANUP_THREAD_INFO;
2058             return 0;
2059         }
2060 
2061         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2062         Address addr(1);
2063         addr.labels[0] = threadInfo[0][pkgIdIndex];
2064         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2065 
2066         if (__kmp_affinity_gran_levels < 0) {
2067             __kmp_affinity_gran_levels = 0;
2068         }
2069 
2070         if (__kmp_affinity_verbose) {
2071             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2072         }
2073 
2074         CLEANUP_THREAD_INFO;
2075         return 1;
2076     }
2077 
2078     //
2079     // Sort the threadInfo table by physical Id.
2080     //
2081     qsort(threadInfo, num_avail, sizeof(*threadInfo),
2082       __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2083 
2084     //
2085     // The table is now sorted by pkgId / coreId / threadId, but we really
2086     // don't know the radix of any of the fields.  pkgId's may be sparsely
2087     // assigned among the chips on a system.  Although coreId's are usually
2088     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2089     // [0..threadsPerCore-1], we don't want to make any such assumptions.
2090     //
2091     // For that matter, we don't know what coresPerPkg and threadsPerCore
2092     // (or the total # packages) are at this point - we want to determine
2093     // that now.  We only have an upper bound on the first two figures.
2094     //
2095     unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2096       * sizeof(unsigned));
2097     unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2098       * sizeof(unsigned));
2099     unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2100       * sizeof(unsigned));
2101     unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2102       * sizeof(unsigned));
2103 
2104     bool assign_thread_ids = false;
2105     unsigned threadIdCt;
2106     unsigned index;
2107 
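    //
    // The scan below may run twice: if a record duplicates the previous one
    // at every level and thread ids were not given explicitly,
    // assign_thread_ids is set and we jump back here to synthesize them.
    //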
2108     restart_radix_check:
2109     threadIdCt = 0;
2110 
2111     //
2112     // Initialize the counter arrays with data from threadInfo[0].
2113     //
2114     if (assign_thread_ids) {
2115         if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2116             threadInfo[0][threadIdIndex] = threadIdCt++;
2117         }
2118         else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2119             threadIdCt = threadInfo[0][threadIdIndex] + 1;
2120         }
2121     }
2122     for (index = 0; index <= maxIndex; index++) {
2123         counts[index] = 1;
2124         maxCt[index] = 1;
2125         totals[index] = 1;
2126         lastId[index] = threadInfo[0][index];
2127     }
2128 
2129     //
2130     // Run through the rest of the OS procs.
2131     //
2132     for (i = 1; i < num_avail; i++) {
2133         //
2134         // Find the most significant index whose id differs
2135         // from the id for the previous OS proc.
2136         //
2137         for (index = maxIndex; index >= threadIdIndex; index--) {
2138             if (assign_thread_ids && (index == threadIdIndex)) {
2139                 //
2140                 // Auto-assign the thread id field if it wasn't specified.
2141                 //
2142                 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2143                     threadInfo[i][threadIdIndex] = threadIdCt++;
2144                 }
2145 
2146                 //
2147                 // Apparently the thread id field was specified for some
2148                 // entries and not others.  Start the thread id counter
2149                 // off at the next higher thread id.
2150                 //
2151                 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2152                     threadIdCt = threadInfo[i][threadIdIndex] + 1;
2153                 }
2154             }
2155             if (threadInfo[i][index] != lastId[index]) {
2156                 //
2157                 // Run through all indices which are less significant,
2158                 // and reset the counts to 1.
2159                 //
2160                 // At all levels up to and including index, we need to
2161                 // increment the totals and record the last id.
2162                 //
2163                 unsigned index2;
2164                 for (index2 = threadIdIndex; index2 < index; index2++) {
2165                     totals[index2]++;
2166                     if (counts[index2] > maxCt[index2]) {
2167                         maxCt[index2] = counts[index2];
2168                     }
2169                     counts[index2] = 1;
2170                     lastId[index2] = threadInfo[i][index2];
2171                 }
2172                 counts[index]++;
2173                 totals[index]++;
2174                 lastId[index] = threadInfo[i][index];
2175 
2176                 if (assign_thread_ids && (index > threadIdIndex)) {
2177 
2178 # if KMP_MIC && REDUCE_TEAM_SIZE
2179                     //
2180                     // The default team size is the total #threads in the machine
2181                     // minus 1 thread for every core that has 3 or more threads.
2182                     //
2183                     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2184 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2185 
2186                     //
2187                     // Restart the thread counter, as we are on a new core.
2188                     //
2189                     threadIdCt = 0;
2190 
2191                     //
2192                     // Auto-assign the thread id field if it wasn't specified.
2193                     //
2194                     if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2195                         threadInfo[i][threadIdIndex] = threadIdCt++;
2196                     }
2197 
2198                     //
2199                     // Apparently the thread id field was specified for some
2200                     // entries and not others.  Start the thread id counter
2201                     // off at the next higher thread id.
2202                     //
2203                     else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2204                         threadIdCt = threadInfo[i][threadIdIndex] + 1;
2205                     }
2206                 }
2207                 break;
2208             }
2209         }
2210         if (index < threadIdIndex) {
2211             //
2212             // If thread ids were specified, it is an error if they are not
2213             // unique.  Also, check that we haven't already restarted the
2214             // loop (to be safe - shouldn't need to).
2215             //
2216             if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2217               || assign_thread_ids) {
2218                 __kmp_free(lastId);
2219                 __kmp_free(totals);
2220                 __kmp_free(maxCt);
2221                 __kmp_free(counts);
2222                 CLEANUP_THREAD_INFO;
2223                 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2224                 return -1;
2225             }
2226 
2227             //
2228             // If the thread ids were not specified and we see duplicate
2229             // entries, start the loop over and
2230             // assign the thread ids manually.
2231             //
2232             assign_thread_ids = true;
2233             goto restart_radix_check;
2234         }
2235     }
2236 
2237 # if KMP_MIC && REDUCE_TEAM_SIZE
2238     //
2239     // The default team size is the total #threads in the machine
2240     // minus 1 thread for every core that has 3 or more threads.
2241     //
2242     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2243 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2244 
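    //
    // Fold the counts from the final group of procs into the per-level
    // maximums.
    //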
2245     for (index = threadIdIndex; index <= maxIndex; index++) {
2246         if (counts[index] > maxCt[index]) {
2247             maxCt[index] = counts[index];
2248         }
2249     }
2250 
2251     __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2252     nCoresPerPkg = maxCt[coreIdIndex];
2253     nPackages = totals[pkgIdIndex];
2254 
2255     //
2256     // Check to see if the machine topology is uniform
2257     //
2258     unsigned prod = totals[maxIndex];
2259     for (index = threadIdIndex; index < maxIndex; index++) {
2260        prod *= maxCt[index];
2261     }
2262     bool uniform = (prod == totals[threadIdIndex]);
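    // That is, the number of top-level nodes times the maximum branching
    // factor at each level below must account for every available OS proc.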
2263 
2264     //
2265     // When affinity is off, this routine will still be called to set
2266     // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
2267     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
2268     // correctly, and return now if affinity is not enabled.
2269     //
2270     __kmp_ht_enabled = (maxCt[threadIdIndex] > 1); // threads per core > 1
2271     __kmp_ncores = totals[coreIdIndex];
2272 
2273     if (__kmp_affinity_verbose) {
2274         if (! KMP_AFFINITY_CAPABLE()) {
2275             KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2276             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2277             if (uniform) {
2278                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2279             } else {
2280                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
2281             }
2282         }
2283         else {
2284             char buf[KMP_AFFIN_MASK_PRINT_LEN];
2285             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
2286             KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2287             if (__kmp_affinity_respect_mask) {
2288                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2289             } else {
2290                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2291             }
2292             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2293             if (uniform) {
2294                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2295             } else {
2296                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
2297             }
2298         }
2299         kmp_str_buf_t buf;
2300         __kmp_str_buf_init(&buf);
2301 
2302         __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2303         for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2304             __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2305         }
2306         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str,  maxCt[coreIdIndex],
2307           maxCt[threadIdIndex], __kmp_ncores);
2308 
2309         __kmp_str_buf_free(&buf);
2310     }
2311 
2312 # if KMP_MIC && REDUCE_TEAM_SIZE
2313     //
2314     // Set the default team size.
2315     //
2316     if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2317         __kmp_dflt_team_nth = teamSize;
2318         KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2319           __kmp_dflt_team_nth));
2320     }
2321 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2322 
2323     if (__kmp_affinity_type == affinity_none) {
2324         __kmp_free(lastId);
2325         __kmp_free(totals);
2326         __kmp_free(maxCt);
2327         __kmp_free(counts);
2328         CLEANUP_THREAD_INFO;
2329         return 0;
2330     }
2331 
2332     //
2333     // Count the number of levels which have more nodes at that level than
2334     // at the parent's level (with there being an implicit root node of
2335     // the top level).  This is equivalent to saying that there is at least
2336     // one node at this level which has a sibling.  These levels are in the
2337     // map, and the package level is always in the map.
2338     //
2339     bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2340     int level = 0;
2341     for (index = threadIdIndex; index < maxIndex; index++) {
2342         KMP_ASSERT(totals[index] >= totals[index + 1]);
2343         inMap[index] = (totals[index] > totals[index + 1]);
2344     }
2345     inMap[maxIndex] = (totals[maxIndex] > 1);
2346     inMap[pkgIdIndex] = true;
2347 
2348     int depth = 0;
2349     for (index = threadIdIndex; index <= maxIndex; index++) {
2350         if (inMap[index]) {
2351             depth++;
2352         }
2353     }
2354     KMP_ASSERT(depth > 0);
2355 
2356     //
2357     // Construct the data structure that is to be returned.
2358     //
2359     *address2os = (AddrUnsPair*)
2360       __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2361     int pkgLevel = -1;
2362     int coreLevel = -1;
2363     int threadLevel = -1;
2364 
2365     for (i = 0; i < num_avail; ++i) {
2366         Address addr(depth);
2367         unsigned os = threadInfo[i][osIdIndex];
2368         int src_index;
2369         int dst_index = 0;
2370 
2371         for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2372             if (! inMap[src_index]) {
2373                 continue;
2374             }
2375             addr.labels[dst_index] = threadInfo[i][src_index];
2376             if (src_index == pkgIdIndex) {
2377                 pkgLevel = dst_index;
2378             }
2379             else if (src_index == coreIdIndex) {
2380                 coreLevel = dst_index;
2381             }
2382             else if (src_index == threadIdIndex) {
2383                 threadLevel = dst_index;
2384             }
2385             dst_index++;
2386         }
2387         (*address2os)[i] = AddrUnsPair(addr, os);
2388     }
2389 
2390     if (__kmp_affinity_gran_levels < 0) {
2391         //
2392         // Set the granularity level based on what levels are modeled
2393         // in the machine topology map.
2394         //
2395         unsigned src_index;
2396         __kmp_affinity_gran_levels = 0;
2397         for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2398             if (! inMap[src_index]) {
2399                 continue;
2400             }
2401             switch (src_index) {
2402                 case threadIdIndex:
2403                 if (__kmp_affinity_gran > affinity_gran_thread) {
2404                     __kmp_affinity_gran_levels++;
2405                 }
2406 
2407                 break;
2408                 case coreIdIndex:
2409                 if (__kmp_affinity_gran > affinity_gran_core) {
2410                     __kmp_affinity_gran_levels++;
2411                 }
2412                 break;
2413 
2414                 case pkgIdIndex:
2415                 if (__kmp_affinity_gran > affinity_gran_package) {
2416                     __kmp_affinity_gran_levels++;
2417                 }
2418                 break;
2419             }
2420         }
2421     }
2422 
2423     if (__kmp_affinity_verbose) {
2424         __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2425           coreLevel, threadLevel);
2426     }
2427 
2428     __kmp_free(inMap);
2429     __kmp_free(lastId);
2430     __kmp_free(totals);
2431     __kmp_free(maxCt);
2432     __kmp_free(counts);
2433     CLEANUP_THREAD_INFO;
2434     return depth;
2435 }
2436 
2437 
2438 //
2439 // Create and return a table of affinity masks, indexed by OS thread ID.
2440 // This routine handles OR'ing together all the affinity masks of threads
2441 // that are sufficiently close, if granularity > fine.
2442 //
2443 static kmp_affin_mask_t *
2444 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2445   AddrUnsPair *address2os, unsigned numAddrs)
2446 {
2447     //
2448     // First form a table of affinity masks in order of OS thread id.
2449     //
2450     unsigned depth;
2451     unsigned maxOsId;
2452     unsigned i;
2453 
2454     KMP_ASSERT(numAddrs > 0);
2455     depth = address2os[0].first.depth;
2456 
2457     maxOsId = 0;
2458     for (i = 0; i < numAddrs; i++) {
2459         unsigned osId = address2os[i].second;
2460         if (osId > maxOsId) {
2461             maxOsId = osId;
2462         }
2463     }
2464     kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
2465       (maxOsId + 1) * __kmp_affin_mask_size);
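    //
    // osId2Mask is indexed directly by OS proc id, hence maxOsId + 1 entries.
    //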
2466 
2467     //
2468     // Sort the address2os table according to physical order.  Doing so
2469     // will put all threads on the same core/package/node in consecutive
2470     // locations.
2471     //
2472     qsort(address2os, numAddrs, sizeof(*address2os),
2473       __kmp_affinity_cmp_Address_labels);
2474 
2475     KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2476     if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2477         KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY",  __kmp_affinity_gran_levels);
2478     }
2479     if (__kmp_affinity_gran_levels >= (int)depth) {
2480         if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2481           && (__kmp_affinity_type != affinity_none))) {
2482             KMP_WARNING(AffThreadsMayMigrate);
2483         }
2484     }
2485 
2486     //
2487     // Run through the table, forming the masks for all threads on each
2488     // core.  Threads on the same core will have identical "Address"
2489     // objects, not considering the last level, which must be the thread
2490     // id.  All threads on a core will appear consecutively.
2491     //
2492     unsigned unique = 0;
2493     unsigned j = 0;                             // index of 1st thread on core
2494     unsigned leader = 0;
2495     Address *leaderAddr = &(address2os[0].first);
2496     kmp_affin_mask_t *sum
2497       = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
2498     KMP_CPU_ZERO(sum);
2499     KMP_CPU_SET(address2os[0].second, sum);
2500     for (i = 1; i < numAddrs; i++) {
2501         //
2502         // If this thread is sufficiently close to the leader (within the
2503         // granularity setting), then set the bit for this os thread in the
2504         // affinity mask for this group, and go on to the next thread.
2505         //
2506         if (leaderAddr->isClose(address2os[i].first,
2507           __kmp_affinity_gran_levels)) {
2508             KMP_CPU_SET(address2os[i].second, sum);
2509             continue;
2510         }
2511 
2512         //
2513         // For every thread in this group, copy the mask to the thread's
2514         // entry in the osId2Mask table.  Mark the first address as a
2515         // leader.
2516         //
2517         for (; j < i; j++) {
2518             unsigned osId = address2os[j].second;
2519             KMP_DEBUG_ASSERT(osId <= maxOsId);
2520             kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2521             KMP_CPU_COPY(mask, sum);
2522             address2os[j].first.leader = (j == leader);
2523         }
2524         unique++;
2525 
2526         //
2527         // Start a new mask.
2528         //
2529         leader = i;
2530         leaderAddr = &(address2os[i].first);
2531         KMP_CPU_ZERO(sum);
2532         KMP_CPU_SET(address2os[i].second, sum);
2533     }
2534 
2535     //
2536     // For every thread in last group, copy the mask to the thread's
2537     // entry in the osId2Mask table.
2538     //
2539     for (; j < i; j++) {
2540         unsigned osId = address2os[j].second;
2541         KMP_DEBUG_ASSERT(osId <= maxOsId);
2542         kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2543         KMP_CPU_COPY(mask, sum);
2544         address2os[j].first.leader = (j == leader);
2545     }
2546     unique++;
2547 
2548     *maxIndex = maxOsId;
2549     *numUnique = unique;
2550     return osId2Mask;
2551 }
2552 
2553 
2554 //
2555 // Stuff for the affinity proclist parsers.  It's easier to declare these vars
2556 // as file-static than to try and pass them through the calling sequence of
2557 // the recursive-descent OMP_PLACES parser.
2558 //
2559 static kmp_affin_mask_t *newMasks;
2560 static int numNewMasks;
2561 static int nextNewMask;
2562 
2563 #define ADD_MASK(_mask) \
2564     {                                                                   \
2565         if (nextNewMask >= numNewMasks) {                               \
2566             numNewMasks *= 2;                                           \
2567             newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
2568               numNewMasks * __kmp_affin_mask_size);                     \
2569         }                                                               \
2570         KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));    \
2571         nextNewMask++;                                                  \
2572     }
2573 
2574 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2575     {                                                                   \
2576         if (((_osId) > _maxOsId) ||                                     \
2577           (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX(_osId2Mask, (_osId))))) {\
2578             if (__kmp_affinity_verbose || (__kmp_affinity_warnings      \
2579               && (__kmp_affinity_type != affinity_none))) {             \
2580                 KMP_WARNING(AffIgnoreInvalidProcID, _osId);             \
2581             }                                                           \
2582         }                                                               \
2583         else {                                                          \
2584             ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));               \
2585         }                                                               \
2586     }
2587 
2588 
2589 //
2590 // Re-parse the proclist (for the explicit affinity type), and form the list
2591 // of affinity newMasks indexed by gtid.
2592 //
2593 static void
2594 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2595   unsigned int *out_numMasks, const char *proclist,
2596   kmp_affin_mask_t *osId2Mask, int maxOsId)
2597 {
2598     const char *scan = proclist;
2599     const char *next = proclist;
2600 
2601     //
2602     // We use malloc() for the temporary mask vector,
2603     // so that we can use realloc() to extend it.
2604     //
2605     numNewMasks = 2;
2606     newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2607       * __kmp_affin_mask_size);
2608     nextNewMask = 0;
2609     kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
2610       __kmp_affin_mask_size);
2611     int setSize = 0;
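    //
    // For example (hypothetical input), a proclist such as "0,2-8:2,{9,10}"
    // yields one mask for the single id, one mask per element of the strided
    // range, and a single combined mask for the braced group.
    //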
2612 
2613     for (;;) {
2614         int start, end, stride;
2615 
2616         SKIP_WS(scan);
2617         next = scan;
2618         if (*next == '\0') {
2619             break;
2620         }
2621 
2622         if (*next == '{') {
2623             int num;
2624             setSize = 0;
2625             next++;     // skip '{'
2626             SKIP_WS(next);
2627             scan = next;
2628 
2629             //
2630             // Read the first integer in the set.
2631             //
2632             KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2633               "bad proclist");
2634             SKIP_DIGITS(next);
2635             num = __kmp_str_to_int(scan, *next);
2636             KMP_ASSERT2(num >= 0, "bad explicit proc list");
2637 
2638             //
2639             // Copy the mask for that osId to the sum (union) mask.
2640             //
2641             if ((num > maxOsId) ||
2642               (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2643                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2644                   && (__kmp_affinity_type != affinity_none))) {
2645                     KMP_WARNING(AffIgnoreInvalidProcID, num);
2646                 }
2647                 KMP_CPU_ZERO(sumMask);
2648             }
2649             else {
2650                 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2651                 setSize = 1;
2652             }
2653 
2654             for (;;) {
2655                 //
2656                 // Check for end of set.
2657                 //
2658                 SKIP_WS(next);
2659                 if (*next == '}') {
2660                     next++;     // skip '}'
2661                     break;
2662                 }
2663 
2664                 //
2665                 // Skip optional comma.
2666                 //
2667                 if (*next == ',') {
2668                     next++;
2669                 }
2670                 SKIP_WS(next);
2671 
2672                 //
2673                 // Read the next integer in the set.
2674                 //
2675                 scan = next;
2676                 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2677                   "bad explicit proc list");
2678 
2679                 SKIP_DIGITS(next);
2680                 num = __kmp_str_to_int(scan, *next);
2681                 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2682 
2683                 //
2684                 // Add the mask for that osId to the sum mask.
2685                 //
2686                 if ((num > maxOsId) ||
2687                   (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2688                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2689                       && (__kmp_affinity_type != affinity_none))) {
2690                         KMP_WARNING(AffIgnoreInvalidProcID, num);
2691                     }
2692                 }
2693                 else {
2694                     KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2695                     setSize++;
2696                 }
2697             }
2698             if (setSize > 0) {
2699                 ADD_MASK(sumMask);
2700             }
2701 
2702             SKIP_WS(next);
2703             if (*next == ',') {
2704                 next++;
2705             }
2706             scan = next;
2707             continue;
2708         }
2709 
2710         //
2711         // Read the first integer.
2712         //
2713         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2714         SKIP_DIGITS(next);
2715         start = __kmp_str_to_int(scan, *next);
2716         KMP_ASSERT2(start >= 0, "bad explicit proc list");
2717         SKIP_WS(next);
2718 
2719         //
2720         // If this isn't a range, then add a mask to the list and go on.
2721         //
2722         if (*next != '-') {
2723             ADD_MASK_OSID(start, osId2Mask, maxOsId);
2724 
2725             //
2726             // Skip optional comma.
2727             //
2728             if (*next == ',') {
2729                 next++;
2730             }
2731             scan = next;
2732             continue;
2733         }
2734 
2735         //
2736         // This is a range.  Skip over the '-' and read in the 2nd int.
2737         //
2738         next++;         // skip '-'
2739         SKIP_WS(next);
2740         scan = next;
2741         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2742         SKIP_DIGITS(next);
2743         end = __kmp_str_to_int(scan, *next);
2744         KMP_ASSERT2(end >= 0, "bad explicit proc list");
2745 
2746         //
2747         // Check for a stride parameter
2748         //
2749         stride = 1;
2750         SKIP_WS(next);
2751         if (*next == ':') {
2752             //
2753             // A stride is specified.  Skip over the ':' and read the 3rd int.
2754             //
2755             int sign = +1;
2756             next++;         // skip ':'
2757             SKIP_WS(next);
2758             scan = next;
2759             if (*next == '-') {
2760                 sign = -1;
2761                 next++;
2762                 SKIP_WS(next);
2763                 scan = next;
2764             }
2765             KMP_ASSERT2((*next >=  '0') && (*next <= '9'),
2766               "bad explicit proc list");
2767             SKIP_DIGITS(next);
2768             stride = __kmp_str_to_int(scan, *next);
2769             KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2770             stride *= sign;
2771         }
2772 
2773         //
2774         // Do some range checks.
2775         //
2776         KMP_ASSERT2(stride != 0, "bad explicit proc list");
2777         if (stride > 0) {
2778             KMP_ASSERT2(start <= end, "bad explicit proc list");
2779         }
2780         else {
2781             KMP_ASSERT2(start >= end, "bad explicit proc list");
2782         }
2783         KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2784 
2785         //
2786         // Add the mask for each OS proc # to the list.
2787         //
2788         if (stride > 0) {
2789             do {
2790                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2791                 start += stride;
2792             } while (start <= end);
2793         }
2794         else {
2795             do {
2796                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2797                 start += stride;
2798             } while (start >= end);
2799         }
2800 
2801         //
2802         // Skip optional comma.
2803         //
2804         SKIP_WS(next);
2805         if (*next == ',') {
2806             next++;
2807         }
2808         scan = next;
2809     }
2810 
2811     *out_numMasks = nextNewMask;
2812     if (nextNewMask == 0) {
2813         *out_masks = NULL;
2814         __kmp_free(sumMask); KMP_INTERNAL_FREE(newMasks);   // also release the scratch sumMask
2815         return;
2816     }
2817     *out_masks
2818       = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
2819     memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
2820     __kmp_free(sumMask);
2821     KMP_INTERNAL_FREE(newMasks);
2822 }
2823 
2824 
2825 # if OMP_40_ENABLED
2826 
2827 /*-----------------------------------------------------------------------------
2828 
2829 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
2830 places.  Again, here is the grammar:
2831 
2832 place_list := place
2833 place_list := place , place_list
2834 place := num
2835 place := place : num
2836 place := place : num : signed
2837 place := { subplace_list }
2838 place := ! place                  // (lowest priority)
2839 subplace_list := subplace
2840 subplace_list := subplace , subplace_list
2841 subplace := num
2842 subplace := num : num
2843 subplace := num : num : signed
2844 signed := num
2845 signed := + signed
2846 signed := - signed
2847 
2848 -----------------------------------------------------------------------------*/
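//
// For example (hypothetical settings), OMP_PLACES="{0,1,2,3},{4,5,6,7}" names
// two explicit places of four procs each, and "{0:4}:2:4" names the place
// {0,1,2,3} plus one more copy shifted by a stride of 4, i.e. {4,5,6,7}.
//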
2849 
2850 static void
2851 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
2852   int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
2853 {
2854     const char *next;
2855 
2856     for (;;) {
2857         int start, count, stride, i;
2858 
2859         //
2860         // Read in the starting proc id
2861         //
2862         SKIP_WS(*scan);
2863         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2864           "bad explicit places list");
2865         next = *scan;
2866         SKIP_DIGITS(next);
2867         start = __kmp_str_to_int(*scan, *next);
2868         KMP_ASSERT(start >= 0);
2869         *scan = next;
2870 
2871         //
2872         // valid follow sets are ',' ':' and '}'
2873         //
2874         SKIP_WS(*scan);
2875         if (**scan == '}' || **scan == ',') {
2876             if ((start > maxOsId) ||
2877               (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2878                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2879                   && (__kmp_affinity_type != affinity_none))) {
2880                     KMP_WARNING(AffIgnoreInvalidProcID, start);
2881                 }
2882             }
2883             else {
2884                 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2885                 (*setSize)++;
2886             }
2887             if (**scan == '}') {
2888                 break;
2889             }
2890             (*scan)++;  // skip ','
2891             continue;
2892         }
2893         KMP_ASSERT2(**scan == ':', "bad explicit places list");
2894         (*scan)++;      // skip ':'
2895 
2896         //
2897         // Read count parameter
2898         //
2899         SKIP_WS(*scan);
2900         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2901           "bad explicit places list");
2902         next = *scan;
2903         SKIP_DIGITS(next);
2904         count = __kmp_str_to_int(*scan, *next);
2905         KMP_ASSERT(count >= 0);
2906         *scan = next;
2907 
2908         //
2909         // valid follow sets are ',' ':' and '}'
2910         //
2911         SKIP_WS(*scan);
2912         if (**scan == '}' || **scan == ',') {
2913             for (i = 0; i < count; i++) {
2914                 if ((start > maxOsId) ||
2915                   (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2916                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2917                       && (__kmp_affinity_type != affinity_none))) {
2918                         KMP_WARNING(AffIgnoreInvalidProcID, start);
2919                     }
2920                     break;  // don't proliferate warnings for large count
2921                 }
2922                 else {
2923                     KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2924                     start++;
2925                     (*setSize)++;
2926                 }
2927             }
2928             if (**scan == '}') {
2929                 break;
2930             }
2931             (*scan)++;  // skip ','
2932             continue;
2933         }
2934         KMP_ASSERT2(**scan == ':', "bad explicit places list");
2935         (*scan)++;      // skip ':'
2936 
2937         //
2938         // Read stride parameter
2939         //
2940         int sign = +1;
2941         for (;;) {
2942             SKIP_WS(*scan);
2943             if (**scan == '+') {
2944                 (*scan)++; // skip '+'
2945                 continue;
2946             }
2947             if (**scan == '-') {
2948                 sign *= -1;
2949                 (*scan)++; // skip '-'
2950                 continue;
2951             }
2952             break;
2953         }
2954         SKIP_WS(*scan);
2955         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2956           "bad explicit places list");
2957         next = *scan;
2958         SKIP_DIGITS(next);
2959         stride = __kmp_str_to_int(*scan, *next);
2960         KMP_ASSERT(stride >= 0);
2961         *scan = next;
2962         stride *= sign;
2963 
2964         //
2965         // valid follow sets are ',' and '}'
2966         //
2967         SKIP_WS(*scan);
2968         if (**scan == '}' || **scan == ',') {
2969             for (i = 0; i < count; i++) {
2970                 if ((start > maxOsId) ||
2971                   (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2972                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2973                       && (__kmp_affinity_type != affinity_none))) {
2974                         KMP_WARNING(AffIgnoreInvalidProcID, start);
2975                     }
2976                     break;  // don't proliferate warnings for large count
2977                 }
2978                 else {
2979                     KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2980                     start += stride;
2981                     (*setSize)++;
2982                 }
2983             }
2984             if (**scan == '}') {
2985                 break;
2986             }
2987             (*scan)++;  // skip ','
2988             continue;
2989         }
2990 
2991         KMP_ASSERT2(0, "bad explicit places list");
2992     }
2993 }
2994 
2995 
2996 static void
2997 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
2998   int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
2999 {
3000     const char *next;
3001 
3002     //
3003     // valid follow sets are '{' '!' and num
3004     //
3005     SKIP_WS(*scan);
3006     if (**scan == '{') {
3007         (*scan)++;      // skip '{'
3008         __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
3009           setSize);
3010         KMP_ASSERT2(**scan == '}', "bad explicit places list");
3011         (*scan)++;      // skip '}'
3012     }
    else if (**scan == '!') {
        (*scan)++;      // skip '!'
        __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
        KMP_CPU_COMPLEMENT(tempMask);
    }
3018     else if ((**scan >= '0') && (**scan <= '9')) {
3019         next = *scan;
3020         SKIP_DIGITS(next);
3021         int num = __kmp_str_to_int(*scan, *next);
3022         KMP_ASSERT(num >= 0);
3023         if ((num > maxOsId) ||
3024           (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3025             if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3026               && (__kmp_affinity_type != affinity_none))) {
3027                 KMP_WARNING(AffIgnoreInvalidProcID, num);
3028             }
3029         }
3030         else {
3031             KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3032             (*setSize)++;
3033         }
        *scan = next;  // skip num
    }
3036     else {
3037         KMP_ASSERT2(0, "bad explicit places list");
3038     }
3039 }
3040 
3041 
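//
// Parse an explicit places list: a comma-separated sequence of places, where
// each place may be followed by ":<count>" and an optional ":<stride>" that
// replicate the place.  The result is an array of affinity masks, one per
// place, returned through out_masks / out_numMasks.
//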
3042 static void
3043 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3044   unsigned int *out_numMasks, const char *placelist,
3045   kmp_affin_mask_t *osId2Mask, int maxOsId)
3046 {
3047     const char *scan = placelist;
3048     const char *next = placelist;
3049 
3050     numNewMasks = 2;
3051     newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
3052       * __kmp_affin_mask_size);
3053     nextNewMask = 0;
3054 
3055     kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
3056       __kmp_affin_mask_size);
3057     KMP_CPU_ZERO(tempMask);
3058     int setSize = 0;
3059 
3060     for (;;) {
3061         int start, count, stride;
3062 
3063         __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3064 
3065         //
3066         // valid follow sets are ',' ':' and EOL
3067         //
3068         SKIP_WS(scan);
3069         if (*scan == '\0' || *scan == ',') {
3070             if (setSize > 0) {
3071                 ADD_MASK(tempMask);
3072             }
3073             KMP_CPU_ZERO(tempMask);
3074             setSize = 0;
3075             if (*scan == '\0') {
3076                 break;
3077             }
3078             scan++;     // skip ','
3079             continue;
3080         }
3081 
3082         KMP_ASSERT2(*scan == ':', "bad explicit places list");
3083         scan++;         // skip ':'
3084 
3085         //
3086         // Read count parameter
3087         //
3088         SKIP_WS(scan);
3089         KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3090           "bad explicit places list");
3091         next = scan;
3092         SKIP_DIGITS(next);
3093         count = __kmp_str_to_int(scan, *next);
3094         KMP_ASSERT(count >= 0);
3095         scan = next;
3096 
3097         //
3098         // valid follow sets are ',' ':' and EOL
3099         //
3100         SKIP_WS(scan);
        if (*scan == '\0' || *scan == ',') {
            int i;
            stride = +1;    // "place:count" with no stride implies a unit stride
            for (i = 0; i < count; i++) {
                int j;
                if (setSize == 0) {
                    break;
                }
                ADD_MASK(tempMask);
                setSize = 0;
                //
                // Shift the bits in tempMask up by stride positions to form
                // the next place; bits below stride are cleared afterwards.
                //
                for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
                    if (KMP_CPU_ISSET(j - stride, tempMask)) {
3116                         KMP_CPU_SET(j, tempMask);
3117                         setSize++;
3118                     }
3119                     else {
3120                         KMP_CPU_CLR(j, tempMask);
3121                     }
3122                 }
3123                 for (; j >= 0; j--) {
3124                     KMP_CPU_CLR(j, tempMask);
3125                 }
3126             }
3127             KMP_CPU_ZERO(tempMask);
3128             setSize = 0;
3129 
3130             if (*scan == '\0') {
3131                 break;
3132             }
3133             scan++;     // skip ','
3134             continue;
3135         }
3136 
3137         KMP_ASSERT2(*scan == ':', "bad explicit places list");
3138         scan++;         // skip ':'
3139 
3140         //
3141         // Read stride parameter
3142         //
3143         int sign = +1;
3144         for (;;) {
3145             SKIP_WS(scan);
3146             if (*scan == '+') {
3147                 scan++; // skip '+'
3148                 continue;
3149             }
3150             if (*scan == '-') {
3151                 sign *= -1;
3152                 scan++; // skip '-'
3153                 continue;
3154             }
3155             break;
3156         }
3157         SKIP_WS(scan);
3158         KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3159           "bad explicit places list");
3160         next = scan;
3161         SKIP_DIGITS(next);
3162         stride = __kmp_str_to_int(scan, *next);
3163         KMP_DEBUG_ASSERT(stride >= 0);
3164         scan = next;
3165         stride *= sign;
3166 
3167         if (stride > 0) {
3168             int i;
3169             for (i = 0; i < count; i++) {
3170                 int j;
3171                 if (setSize == 0) {
3172                     break;
3173                 }
3174                 ADD_MASK(tempMask);
3175                 setSize = 0;
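                //
                // Shift the bits in tempMask up by stride positions to form
                // the next place; bits below stride are cleared afterwards.
                //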
3176                 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
3177                     if (KMP_CPU_ISSET(j - stride, tempMask)) {
3178                         KMP_CPU_SET(j, tempMask);
3179                         setSize++;
3180                     }
3181                     else {
3182                         KMP_CPU_CLR(j, tempMask);
3183                     }
3184                 }
3185                 for (; j >= 0; j--) {
3186                     KMP_CPU_CLR(j, tempMask);
3187                 }
3188             }
3189         }
3190         else {
3191             int i;
3192             for (i = 0; i < count; i++) {
3193                 unsigned j;
3194                 if (setSize == 0) {
3195                     break;
3196                 }
3197                 ADD_MASK(tempMask);
3198                 setSize = 0;
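                //
                // stride is negative here: shift the bits in tempMask down by
                // |stride| positions, then clear the bits that fall off the top.
                //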
3199                 for (j = 0; j < (__kmp_affin_mask_size * CHAR_BIT) + stride;
3200                   j++) {
3201                     if (KMP_CPU_ISSET(j - stride, tempMask)) {
3202                         KMP_CPU_SET(j, tempMask);
3203                         setSize++;
3204                     }
3205                     else {
3206                         KMP_CPU_CLR(j, tempMask);
3207                     }
3208                 }
3209                 for (; j < __kmp_affin_mask_size * CHAR_BIT; j++) {
3210                     KMP_CPU_CLR(j, tempMask);
3211                 }
3212             }
3213         }
3214         KMP_CPU_ZERO(tempMask);
3215         setSize = 0;
3216 
3217         //
3218         // valid follow sets are ',' and EOL
3219         //
3220         SKIP_WS(scan);
3221         if (*scan == '\0') {
3222             break;
3223         }
3224         if (*scan == ',') {
3225             scan++;     // skip ','
3226             continue;
3227         }
3228 
3229         KMP_ASSERT2(0, "bad explicit places list");
3230     }
3231 
3232     *out_numMasks = nextNewMask;
    if (nextNewMask == 0) {
        *out_masks = NULL;
        __kmp_free(tempMask);
        KMP_INTERNAL_FREE(newMasks);
        return;
    }
3238     *out_masks
3239       = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
3240     memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
3241     __kmp_free(tempMask);
3242     KMP_INTERNAL_FREE(newMasks);
3243 }
3244 
3245 # endif /* OMP_40_ENABLED */
3246 
3247 #undef ADD_MASK
3248 #undef ADD_MASK_OSID
3249 
3250 
3251 # if KMP_MIC
3252 
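//
// Trim the full topology map down to the requested number of cores per
// package (__kmp_place_num_cores, starting at __kmp_place_core_offset) and
// thread contexts per core (__kmp_place_num_threads_per_core), then update
// the global topology counters to match.
//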
3253 static void
3254 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3255 {
3256     if ( __kmp_place_num_cores == 0 ) {
3257         if ( __kmp_place_num_threads_per_core == 0 ) {
3258             return;   // no cores limiting actions requested, exit
3259         }
3260         __kmp_place_num_cores = nCoresPerPkg;   // use all available cores
3261     }
3262     if ( !__kmp_affinity_uniform_topology() || depth != 3 ) {
3263         KMP_WARNING( AffThrPlaceUnsupported );
        return; // only uniform, three-level (package/core/thread) topologies are supported
3265     }
3266     if ( __kmp_place_num_threads_per_core == 0 ) {
3267         __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore;  // use all HW contexts
3268     }
3269     if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
3270         KMP_WARNING( AffThrPlaceManyCores );
3271         return;
3272     }
3273 
3274     AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3275                             nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3276     int i, j, k, n_old = 0, n_new = 0;
3277     for ( i = 0; i < nPackages; ++i ) {
3278         for ( j = 0; j < nCoresPerPkg; ++j ) {
3279             if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) {
3280                 n_old += __kmp_nThreadsPerCore;   // skip not-requested core
3281             } else {
3282                 for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) {
3283                     if ( k < __kmp_place_num_threads_per_core ) {
                        newAddr[n_new] = (*pAddr)[n_old];   // copy requested core's data to new location
3285                         n_new++;
3286                     }
3287                     n_old++;
3288                 }
3289             }
3290         }
3291     }
3292     nCoresPerPkg = __kmp_place_num_cores;                     // correct nCoresPerPkg
3293     __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3294     __kmp_avail_proc = n_new;                                 // correct avail_proc
3295     __kmp_ncores = nPackages * __kmp_place_num_cores;         // correct ncores
3296 
3297     __kmp_free( *pAddr );
3298     *pAddr = newAddr;      // replace old topology with new one
3299 }
3300 
3301 # endif /* KMP_MIC */
3302 
3303 
3304 static AddrUnsPair *address2os = NULL;
3305 static int           * procarr = NULL;
3306 static int     __kmp_aff_depth = 0;
3307 
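//
// The real affinity initialization: build the "full" mask, detect the machine
// topology (x2APIC ids, legacy APIC ids, /proc/cpuinfo, processor groups, or
// a flat map), build the osId -> mask table, and finally construct the array
// of affinity masks (__kmp_affinity_masks) according to __kmp_affinity_type.
//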
3308 static void
3309 __kmp_aux_affinity_initialize(void)
3310 {
3311     if (__kmp_affinity_masks != NULL) {
3312         KMP_ASSERT(fullMask != NULL);
3313         return;
3314     }
3315 
3316     //
3317     // Create the "full" mask - this defines all of the processors that we
3318     // consider to be in the machine model.  If respect is set, then it is
3319     // the initialization thread's affinity mask.  Otherwise, it is all
3320     // processors that we know about on the machine.
3321     //
3322     if (fullMask == NULL) {
3323         fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
3324     }
3325     if (KMP_AFFINITY_CAPABLE()) {
3326         if (__kmp_affinity_respect_mask) {
3327             __kmp_get_system_affinity(fullMask, TRUE);
3328 
3329             //
3330             // Count the number of available processors.
3331             //
3332             unsigned i;
3333             __kmp_avail_proc = 0;
3334             for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
3335                 if (! KMP_CPU_ISSET(i, fullMask)) {
3336                     continue;
3337                 }
3338                 __kmp_avail_proc++;
3339             }
3340             if (__kmp_avail_proc > __kmp_xproc) {
3341                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3342                   && (__kmp_affinity_type != affinity_none))) {
3343                     KMP_WARNING(ErrorInitializeAffinity);
3344                 }
3345                 __kmp_affinity_type = affinity_none;
3346                 __kmp_affin_mask_size = 0;
3347                 return;
3348             }
3349         }
3350         else {
3351             __kmp_affinity_entire_machine_mask(fullMask);
3352             __kmp_avail_proc = __kmp_xproc;
3353         }
3354     }
3355 
3356     int depth = -1;
3357     kmp_i18n_id_t msg_id = kmp_i18n_null;
3358 
3359     //
    // For backward compatibility, setting KMP_CPUINFO_FILE =>
3361     // KMP_TOPOLOGY_METHOD=cpuinfo
3362     //
3363     if ((__kmp_cpuinfo_file != NULL) &&
3364       (__kmp_affinity_top_method == affinity_top_method_all)) {
3365         __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3366     }
3367 
3368     if (__kmp_affinity_top_method == affinity_top_method_all) {
3369         //
3370         // In the default code path, errors are not fatal - we just try using
3371         // another method.  We only emit a warning message if affinity is on,
        // or the verbose flag is set, and the nowarnings flag was not set.
3373         //
3374         const char *file_name = NULL;
3375         int line = 0;
3376 
3377 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3378 
3379         if (__kmp_affinity_verbose) {
3380             KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3381         }
3382 
3383         file_name = NULL;
3384         depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3385         if (depth == 0) {
3386             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3387             KMP_ASSERT(address2os == NULL);
3388             return;
3389         }
3390 
3391         if (depth < 0) {
3392             if ((msg_id != kmp_i18n_null)
3393               && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3394               && (__kmp_affinity_type != affinity_none)))) {
3395 #  if KMP_MIC
3396                 if (__kmp_affinity_verbose) {
3397                     KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3398                       KMP_I18N_STR(DecodingLegacyAPIC));
3399                 }
3400 #  else
3401                 KMP_WARNING(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3402                   KMP_I18N_STR(DecodingLegacyAPIC));
3403 #  endif
3404             }
3405 
3406             file_name = NULL;
3407             depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3408             if (depth == 0) {
3409                 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3410                 KMP_ASSERT(address2os == NULL);
3411                 return;
3412             }
3413         }
3414 
3415 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3416 
3417 # if KMP_OS_LINUX
3418 
3419         if (depth < 0) {
3420             if ((msg_id != kmp_i18n_null)
3421               && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3422               && (__kmp_affinity_type != affinity_none)))) {
3423 #  if KMP_MIC
3424                 if (__kmp_affinity_verbose) {
3425                     KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3426                 }
3427 #  else
3428                 KMP_WARNING(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3429 #  endif
3430             }
3431             else if (__kmp_affinity_verbose) {
3432                 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3433             }
3434 
3435             FILE *f = fopen("/proc/cpuinfo", "r");
3436             if (f == NULL) {
3437                 msg_id = kmp_i18n_str_CantOpenCpuinfo;
3438             }
3439             else {
3440                 file_name = "/proc/cpuinfo";
3441                 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3442                 fclose(f);
3443                 if (depth == 0) {
3444                     KMP_ASSERT(__kmp_affinity_type == affinity_none);
3445                     KMP_ASSERT(address2os == NULL);
3446                     return;
3447                 }
3448             }
3449         }
3450 
3451 # endif /* KMP_OS_LINUX */
3452 
3453         if (depth < 0) {
3454             if (msg_id != kmp_i18n_null
3455               && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3456               && (__kmp_affinity_type != affinity_none)))) {
3457                 if (file_name == NULL) {
3458                     KMP_WARNING(UsingFlatOS, __kmp_i18n_catgets(msg_id));
3459                 }
3460                 else if (line == 0) {
3461                     KMP_WARNING(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
3462                 }
3463                 else {
3464                     KMP_WARNING(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
3465                 }
3466             }
3467 
3468             file_name = "";
3469             depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3470             if (depth == 0) {
3471                 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3472                 KMP_ASSERT(address2os == NULL);
3473                 return;
3474             }
3475             KMP_ASSERT(depth > 0);
3476             KMP_ASSERT(address2os != NULL);
3477         }
3478     }
3479 
3480     //
    // If the user has specified that a particular topology discovery method
3482     // is to be used, then we abort if that method fails.  The exception is
3483     // group affinity, which might have been implicitly set.
3484     //
3485 
3486 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3487 
3488     else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3489         if (__kmp_affinity_verbose) {
3490             KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3491               KMP_I18N_STR(Decodingx2APIC));
3492         }
3493 
3494         depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3495         if (depth == 0) {
3496             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3497             KMP_ASSERT(address2os == NULL);
3498             return;
3499         }
3500 
3501         if (depth < 0) {
3502             KMP_ASSERT(msg_id != kmp_i18n_null);
3503             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3504         }
3505     }
3506     else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3507         if (__kmp_affinity_verbose) {
3508             KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3509               KMP_I18N_STR(DecodingLegacyAPIC));
3510         }
3511 
3512         depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3513         if (depth == 0) {
3514             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3515             KMP_ASSERT(address2os == NULL);
3516             return;
3517         }
3518 
3519         if (depth < 0) {
3520             KMP_ASSERT(msg_id != kmp_i18n_null);
3521             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3522         }
3523     }
3524 
3525 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3526 
3527     else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3528         const char *filename;
3529         if (__kmp_cpuinfo_file != NULL) {
3530             filename = __kmp_cpuinfo_file;
3531         }
3532         else {
3533             filename = "/proc/cpuinfo";
3534         }
3535 
3536         if (__kmp_affinity_verbose) {
3537             KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
3538         }
3539 
3540         FILE *f = fopen(filename, "r");
3541         if (f == NULL) {
3542             int code = errno;
3543             if (__kmp_cpuinfo_file != NULL) {
3544                 __kmp_msg(
3545                     kmp_ms_fatal,
3546                     KMP_MSG(CantOpenFileForReading, filename),
3547                     KMP_ERR(code),
3548                     KMP_HNT(NameComesFrom_CPUINFO_FILE),
3549                     __kmp_msg_null
3550                 );
3551             }
3552             else {
3553                 __kmp_msg(
3554                     kmp_ms_fatal,
3555                     KMP_MSG(CantOpenFileForReading, filename),
3556                     KMP_ERR(code),
3557                     __kmp_msg_null
3558                 );
3559             }
3560         }
3561         int line = 0;
3562         depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3563         fclose(f);
3564         if (depth < 0) {
3565             KMP_ASSERT(msg_id != kmp_i18n_null);
3566             if (line > 0) {
3567                 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
3568             }
3569             else {
3570                 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3571             }
3572         }
3573         if (__kmp_affinity_type == affinity_none) {
3574             KMP_ASSERT(depth == 0);
3575             KMP_ASSERT(address2os == NULL);
3576             return;
3577         }
3578     }
3579 
3580 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
3581 
3582     else if (__kmp_affinity_top_method == affinity_top_method_group) {
3583         if (__kmp_affinity_verbose) {
3584             KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3585         }
3586 
3587         depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3588         KMP_ASSERT(depth != 0);
3589 
3590         if (depth < 0) {
3591             if ((msg_id != kmp_i18n_null)
3592               && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3593               && (__kmp_affinity_type != affinity_none)))) {
3594                 KMP_WARNING(UsingFlatOS, __kmp_i18n_catgets(msg_id));
3595             }
3596 
3597             depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3598             if (depth == 0) {
3599                 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3600                 KMP_ASSERT(address2os == NULL);
3601                 return;
3602             }
3603             // should not fail
3604             KMP_ASSERT(depth > 0);
3605             KMP_ASSERT(address2os != NULL);
3606         }
3607     }
3608 
3609 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
3610 
3611     else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3612         if (__kmp_affinity_verbose) {
3613             KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
3614         }
3615 
3616         depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3617         if (depth == 0) {
3618             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3619             KMP_ASSERT(address2os == NULL);
3620             return;
3621         }
3622         // should not fail
3623         KMP_ASSERT(depth > 0);
3624         KMP_ASSERT(address2os != NULL);
3625     }
3626 
3627     if (address2os == NULL) {
3628         if (KMP_AFFINITY_CAPABLE()
3629           && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3630           && (__kmp_affinity_type != affinity_none)))) {
3631             KMP_WARNING(ErrorInitializeAffinity);
3632         }
3633         __kmp_affinity_type = affinity_none;
3634         __kmp_affin_mask_size = 0;
3635         return;
3636     }
3637 
3638 # if KMP_MIC
3639     __kmp_apply_thread_places(&address2os, depth);
3640 # endif
3641 
3642     //
3643     // Create the table of masks, indexed by thread Id.
3644     //
3645     unsigned maxIndex;
3646     unsigned numUnique;
3647     kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
3648       address2os, __kmp_avail_proc);
3649     if (__kmp_affinity_gran_levels == 0) {
3650         KMP_DEBUG_ASSERT(numUnique == __kmp_avail_proc);
3651     }
3652 
3653     //
3654     // Set the childNums vector in all Address objects.  This must be done
3655     // before we can sort using __kmp_affinity_cmp_Address_child_num(),
3656     // which takes into account the setting of __kmp_affinity_compact.
3657     //
3658     __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
3659 
3660     switch (__kmp_affinity_type) {
3661 
3662         case affinity_explicit:
3663         KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3664 # if OMP_40_ENABLED
3665         if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3666 # endif
3667         {
3668             __kmp_affinity_process_proclist(&__kmp_affinity_masks,
3669               &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3670               maxIndex);
3671         }
3672 # if OMP_40_ENABLED
3673         else {
3674             __kmp_affinity_process_placelist(&__kmp_affinity_masks,
3675               &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3676               maxIndex);
3677         }
3678 # endif
3679         if (__kmp_affinity_num_masks == 0) {
3680             if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3681               && (__kmp_affinity_type != affinity_none))) {
3682                 KMP_WARNING(AffNoValidProcID);
3683             }
3684             __kmp_affinity_type = affinity_none;
3685             return;
3686         }
3687         break;
3688 
3689         //
3690         // The other affinity types rely on sorting the Addresses according
3691         // to some permutation of the machine topology tree.  Set
3692         // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
3693         // then jump to a common code fragment to do the sort and create
3694         // the array of affinity masks.
3695         //
3696 
3697         case affinity_logical:
3698         __kmp_affinity_compact = 0;
3699         if (__kmp_affinity_offset) {
3700             __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3701               % __kmp_avail_proc;
3702         }
3703         goto sortAddresses;
3704 
3705         case affinity_physical:
3706         if (__kmp_nThreadsPerCore > 1) {
3707             __kmp_affinity_compact = 1;
3708             if (__kmp_affinity_compact >= depth) {
3709                 __kmp_affinity_compact = 0;
3710             }
3711         } else {
3712             __kmp_affinity_compact = 0;
3713         }
3714         if (__kmp_affinity_offset) {
3715             __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3716               % __kmp_avail_proc;
3717         }
3718         goto sortAddresses;
3719 
3720         case affinity_scatter:
3721         if (__kmp_affinity_compact >= depth) {
3722             __kmp_affinity_compact = 0;
3723         }
3724         else {
3725             __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
3726         }
3727         goto sortAddresses;
3728 
3729         case affinity_compact:
3730         if (__kmp_affinity_compact >= depth) {
3731             __kmp_affinity_compact = depth - 1;
3732         }
3733         goto sortAddresses;
3734 
3735 # if KMP_MIC
3736         case affinity_balanced:
        // Balanced affinity is only supported on a single package; a non-uniform
        // topology within the package is handled via procarr below.
3738         if( nPackages > 1 ) {
3739             if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
3740                 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
3741             }
3742             __kmp_affinity_type = affinity_none;
3743             return;
3744         } else if( __kmp_affinity_uniform_topology() ) {
3745             break;
3746         } else { // Non-uniform topology
3747 
3748             // Save the depth for further usage
3749             __kmp_aff_depth = depth;
3750 
3751             // Number of hyper threads per core in HT machine
3752             int nth_per_core = __kmp_nThreadsPerCore;
3753 
3754             int core_level;
3755             if( nth_per_core > 1 ) {
3756                 core_level = depth - 2;
3757             } else {
3758                 core_level = depth - 1;
3759             }
3760             int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
3761             int nproc = nth_per_core * ncores;
3762 
3763             procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
3764             for( int i = 0; i < nproc; i++ ) {
3765                 procarr[ i ] = -1;
3766             }
3767 
3768             for( int i = 0; i < __kmp_avail_proc; i++ ) {
3769                 int proc = address2os[ i ].second;
3770                 // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
3771                 // If there is only one thread per core then depth == 2: level 0 - package,
3772                 // level 1 - core.
3773                 int level = depth - 1;
3774 
3775                 // __kmp_nth_per_core == 1
3776                 int thread = 0;
3777                 int core = address2os[ i ].first.labels[ level ];
3778                 // If the thread level exists, that is we have more than one thread context per core
3779                 if( nth_per_core > 1 ) {
3780                     thread = address2os[ i ].first.labels[ level ] % nth_per_core;
3781                     core = address2os[ i ].first.labels[ level - 1 ];
3782                 }
3783                 procarr[ core * nth_per_core + thread ] = proc;
3784             }
3785 
3786             break;
3787         }
3788 # endif
3789 
3790         sortAddresses:
3791         //
3792         // Allocate the gtid->affinity mask table.
3793         //
3794         if (__kmp_affinity_dups) {
3795             __kmp_affinity_num_masks = __kmp_avail_proc;
3796         }
3797         else {
3798             __kmp_affinity_num_masks = numUnique;
3799         }
3800 
3801 # if OMP_40_ENABLED
3802         if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
3803           && ( __kmp_affinity_num_places > 0 )
3804           && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
3805             __kmp_affinity_num_masks = __kmp_affinity_num_places;
3806         }
3807 # endif
3808 
3809         __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
3810           __kmp_affinity_num_masks * __kmp_affin_mask_size);
3811 
3812         //
3813         // Sort the address2os table according to the current setting of
3814         // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
3815         //
3816         qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
3817           __kmp_affinity_cmp_Address_child_num);
3818         {
3819             int i;
3820             unsigned j;
3821             for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
3822                 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
3823                     continue;
3824                 }
3825                 unsigned osId = address2os[i].second;
3826                 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
3827                 kmp_affin_mask_t *dest
3828                   = KMP_CPU_INDEX(__kmp_affinity_masks, j);
3829                 KMP_ASSERT(KMP_CPU_ISSET(osId, src));
3830                 KMP_CPU_COPY(dest, src);
3831                 if (++j >= __kmp_affinity_num_masks) {
3832                     break;
3833                 }
3834             }
3835             KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
3836         }
3837         break;
3838 
3839         default:
3840         KMP_ASSERT2(0, "Unexpected affinity setting");
3841     }
3842 
3843     __kmp_free(osId2Mask);
3844 }
3845 
3846 
3847 void
3848 __kmp_affinity_initialize(void)
3849 {
3850     //
    // Much of the code above was written assuming that if a machine was not
3852     // affinity capable, then __kmp_affinity_type == affinity_none.  We now
3853     // explicitly represent this as __kmp_affinity_type == affinity_disabled.
3854     //
3855     // There are too many checks for __kmp_affinity_type == affinity_none
3856     // in this code.  Instead of trying to change them all, check if
3857     // __kmp_affinity_type == affinity_disabled, and if so, slam it with
3858     // affinity_none, call the real initialization routine, then restore
3859     // __kmp_affinity_type to affinity_disabled.
3860     //
3861     int disabled = (__kmp_affinity_type == affinity_disabled);
3862     if (! KMP_AFFINITY_CAPABLE()) {
3863         KMP_ASSERT(disabled);
3864     }
3865     if (disabled) {
3866         __kmp_affinity_type = affinity_none;
3867     }
3868     __kmp_aux_affinity_initialize();
3869     if (disabled) {
3870         __kmp_affinity_type = affinity_disabled;
3871     }
3872 }
3873 
3874 
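//
// Release all memory allocated by the affinity initialization code.
//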
3875 void
3876 __kmp_affinity_uninitialize(void)
3877 {
3878     if (__kmp_affinity_masks != NULL) {
3879         __kmp_free(__kmp_affinity_masks);
3880         __kmp_affinity_masks = NULL;
3881     }
3882     if (fullMask != NULL) {
3883         KMP_CPU_FREE(fullMask);
3884         fullMask = NULL;
3885     }
3886     __kmp_affinity_num_masks = 0;
3887 # if OMP_40_ENABLED
3888     __kmp_affinity_num_places = 0;
3889 # endif
3890     if (__kmp_affinity_proclist != NULL) {
3891         __kmp_free(__kmp_affinity_proclist);
3892         __kmp_affinity_proclist = NULL;
3893     }
3894     if( address2os != NULL ) {
3895         __kmp_free( address2os );
3896         address2os = NULL;
3897     }
3898     if( procarr != NULL ) {
3899         __kmp_free( procarr );
3900         procarr = NULL;
3901     }
3902 }
3903 
3904 
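//
// Bind thread gtid to its initial affinity mask: either the full machine mask
// or the place selected by (gtid + __kmp_affinity_offset) %
// __kmp_affinity_num_masks, depending on the affinity type and proc-bind
// settings.
//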
3905 void
3906 __kmp_affinity_set_init_mask(int gtid, int isa_root)
3907 {
3908     if (! KMP_AFFINITY_CAPABLE()) {
3909         return;
3910     }
3911 
3912     kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
3913     if (th->th.th_affin_mask == NULL) {
3914         KMP_CPU_ALLOC(th->th.th_affin_mask);
3915     }
3916     else {
3917         KMP_CPU_ZERO(th->th.th_affin_mask);
3918     }
3919 
3920     //
    // Copy the thread mask to the kmp_info_t structure.
3922     // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
3923     // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
3924     // is set, then the full mask is the same as the mask of the initialization
3925     // thread.
3926     //
3927     kmp_affin_mask_t *mask;
3928     int i;
3929 
3930 # if OMP_40_ENABLED
3931     if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3932 # endif
3933     {
3934         if ((__kmp_affinity_type == affinity_none)
3935 # if KMP_MIC
3936           || (__kmp_affinity_type == affinity_balanced)
3937 # endif
3938           ) {
3939 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
3940             if (__kmp_num_proc_groups > 1) {
3941                 return;
3942             }
3943 # endif
3944             KMP_ASSERT(fullMask != NULL);
3945             i = -1;
3946             mask = fullMask;
3947         }
3948         else {
3949             KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
3950             i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
3951             mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
3952         }
3953     }
3954 # if OMP_40_ENABLED
3955     else {
3956         if ((! isa_root)
3957           || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
3958 #  if KMP_OS_WINDOWS && KMP_ARCH_X86_64
3959             if (__kmp_num_proc_groups > 1) {
3960                 return;
3961             }
3962 #  endif
3963             KMP_ASSERT(fullMask != NULL);
3964             i = KMP_PLACE_ALL;
3965             mask = fullMask;
3966         }
3967         else {
3968             //
3969             // int i = some hash function or just a counter that doesn't
3970             // always start at 0.  Use gtid for now.
3971             //
3972             KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
3973             i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
3974             mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
3975         }
3976     }
3977 # endif
3978 
3979 # if OMP_40_ENABLED
3980     th->th.th_current_place = i;
3981     if (isa_root) {
3982         th->th.th_new_place = i;
3983         th->th.th_first_place = 0;
3984         th->th.th_last_place = __kmp_affinity_num_masks - 1;
3985     }
3986 
3987     if (i == KMP_PLACE_ALL) {
3988         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
3989           gtid));
3990     }
3991     else {
3992         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
3993           gtid, i));
3994     }
3995 # else
3996     if (i == -1) {
3997         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
3998           gtid));
3999     }
4000     else {
4001         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
4002           gtid, i));
4003     }
4004 # endif /* OMP_40_ENABLED */
4005 
4006     KMP_CPU_COPY(th->th.th_affin_mask, mask);
4007 
4008     if (__kmp_affinity_verbose) {
4009         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4010         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4011           th->th.th_affin_mask);
4012         KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", gtid, buf);
4013     }
4014 
4015 # if KMP_OS_WINDOWS
4016     //
4017     // On Windows* OS, the process affinity mask might have changed.
4018     // If the user didn't request affinity and this call fails,
4019     // just continue silently.  See CQ171393.
4020     //
4021     if ( __kmp_affinity_type == affinity_none ) {
4022         __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4023     }
4024     else
4025 # endif
4026     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4027 }
4028 
4029 
4030 # if OMP_40_ENABLED
4031 
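//
// Move thread gtid to the place recorded in th_new_place and apply the
// corresponding affinity mask.
//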
4032 void
4033 __kmp_affinity_set_place(int gtid)
4034 {
4037     if (! KMP_AFFINITY_CAPABLE()) {
4038         return;
4039     }
4040 
4041     kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4042 
4043     KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
4044       gtid, th->th.th_new_place, th->th.th_current_place));
4045 
4046     //
    // Check that the new place is within this thread's partition.
4048     //
4049     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4050     KMP_DEBUG_ASSERT(th->th.th_new_place >= 0);
4051     KMP_DEBUG_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4052     if (th->th.th_first_place <= th->th.th_last_place) {
4053         KMP_DEBUG_ASSERT((th->th.th_new_place >= th->th.th_first_place)
4054          && (th->th.th_new_place <= th->th.th_last_place));
4055     }
4056     else {
4057         KMP_DEBUG_ASSERT((th->th.th_new_place <= th->th.th_first_place)
4058          || (th->th.th_new_place >= th->th.th_last_place));
4059     }
4060 
4061     //
    // Copy the thread mask to the kmp_info_t structure,
4063     // and set this thread's affinity.
4064     //
4065     kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
4066       th->th.th_new_place);
4067     KMP_CPU_COPY(th->th.th_affin_mask, mask);
4068     th->th.th_current_place = th->th.th_new_place;
4069 
4070     if (__kmp_affinity_verbose) {
4071         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4072         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4073           th->th.th_affin_mask);
4074         KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", gtid, buf);
4075     }
4076     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4077 }
4078 
4079 # endif /* OMP_40_ENABLED */
4080 
4081 
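//
// Implementation of kmp_set_affinity(): optionally validate the user-supplied
// mask against fullMask, apply it to the calling thread, and record it in the
// thread's kmp_info_t.  Returns the result of the system affinity call, or -1
// if affinity is not supported.
//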
4082 int
4083 __kmp_aux_set_affinity(void **mask)
4084 {
4085     int gtid;
4086     kmp_info_t *th;
4087     int retval;
4088 
4089     if (! KMP_AFFINITY_CAPABLE()) {
4090         return -1;
4091     }
4092 
4093     gtid = __kmp_entry_gtid();
4094     KA_TRACE(1000, ;{
4095         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4096         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4097           (kmp_affin_mask_t *)(*mask));
4098         __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4099           gtid, buf);
4100     });
4101 
4102     if (__kmp_env_consistency_check) {
4103         if ((mask == NULL) || (*mask == NULL)) {
4104             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4105         }
4106         else {
4107             unsigned proc;
4108             int num_procs = 0;
4109 
4110             for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
4111                 if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4112                     continue;
4113                 }
4114                 num_procs++;
4115                 if (! KMP_CPU_ISSET(proc, fullMask)) {
4116                     KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4117                     break;
4118                 }
4119             }
4120             if (num_procs == 0) {
4121                 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4122             }
4123 
4124 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
4125             if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4126                 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4127             }
4128 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
4129 
4130         }
4131     }
4132 
4133     th = __kmp_threads[gtid];
4134     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4135     retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4136     if (retval == 0) {
4137         KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4138     }
4139 
4140 # if OMP_40_ENABLED
4141     th->th.th_current_place = KMP_PLACE_UNDEFINED;
4142     th->th.th_new_place = KMP_PLACE_UNDEFINED;
4143     th->th.th_first_place = 0;
4144     th->th.th_last_place = __kmp_affinity_num_masks - 1;
4145 # endif
4146 
4147     return retval;
4148 }
4149 
4150 
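//
// Implementation of kmp_get_affinity(): on Windows* OS return the mask stored
// in the thread descriptor; otherwise query the system affinity directly.
//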
4151 int
4152 __kmp_aux_get_affinity(void **mask)
4153 {
4154     int gtid;
4155     int retval;
4156     kmp_info_t *th;
4157 
4158     if (! KMP_AFFINITY_CAPABLE()) {
4159         return -1;
4160     }
4161 
4162     gtid = __kmp_entry_gtid();
4163     th = __kmp_threads[gtid];
4164     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4165 
4166     KA_TRACE(1000, ;{
4167         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4168         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4169           th->th.th_affin_mask);
4170         __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
4171     });
4172 
4173     if (__kmp_env_consistency_check) {
4174         if ((mask == NULL) || (*mask == NULL)) {
4175             KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4176         }
4177     }
4178 
4179 # if !KMP_OS_WINDOWS
4180 
4181     retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4182     KA_TRACE(1000, ;{
4183         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4184         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4185           (kmp_affin_mask_t *)(*mask));
4186         __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
4187     });
4188     return retval;
4189 
4190 # else
4191 
4192     KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4193     return 0;
4194 
4195 # endif /* KMP_OS_WINDOWS */
4196 
4197 }
4198 
4199 
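//
// Implementation of kmp_set_affinity_mask_proc(): add OS proc id 'proc' to
// the user-supplied mask.  Returns -1 if proc is out of range, -2 if proc is
// not in fullMask, and 0 on success.
//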
4200 int
4201 __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
4202 {
4205     if (! KMP_AFFINITY_CAPABLE()) {
4206         return -1;
4207     }
4208 
4209     KA_TRACE(1000, ;{
4210         int gtid = __kmp_entry_gtid();
4211         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4212         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4213           (kmp_affin_mask_t *)(*mask));
4214         __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4215           proc, gtid, buf);
4216     });
4217 
4218     if (__kmp_env_consistency_check) {
4219         if ((mask == NULL) || (*mask == NULL)) {
4220             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4221         }
4222     }
4223 
4224     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4225         return -1;
4226     }
4227     if (! KMP_CPU_ISSET(proc, fullMask)) {
4228         return -2;
4229     }
4230 
4231     KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4232     return 0;
4233 }
4234 
4235 
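//
// Implementation of kmp_unset_affinity_mask_proc(): remove OS proc id 'proc'
// from the user-supplied mask, with the same return values as above.
//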
4236 int
4237 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4238 {
4241     if (! KMP_AFFINITY_CAPABLE()) {
4242         return -1;
4243     }
4244 
4245     KA_TRACE(1000, ;{
4246         int gtid = __kmp_entry_gtid();
4247         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4248         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4249           (kmp_affin_mask_t *)(*mask));
4250         __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4251           proc, gtid, buf);
4252     });
4253 
4254     if (__kmp_env_consistency_check) {
4255         if ((mask == NULL) || (*mask == NULL)) {
4256             KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4257         }
4258     }
4259 
4260     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4261         return -1;
4262     }
4263     if (! KMP_CPU_ISSET(proc, fullMask)) {
4264         return -2;
4265     }
4266 
4267     KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4268     return 0;
4269 }
4270 
4271 
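//
// Implementation of kmp_get_affinity_mask_proc(): query whether OS proc id
// 'proc' is set in the user-supplied mask; returns 0 for out-of-range procs
// or procs not present in fullMask.
//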
4272 int
4273 __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4274 {
4277     if (! KMP_AFFINITY_CAPABLE()) {
4278         return -1;
4279     }
4280 
4281     KA_TRACE(1000, ;{
4282         int gtid = __kmp_entry_gtid();
4283         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4284         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4285           (kmp_affin_mask_t *)(*mask));
4286         __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4287           proc, gtid, buf);
4288     });
4289 
4290     if (__kmp_env_consistency_check) {
4291         if ((mask == NULL) || (*mask == NULL)) {
4292             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4293         }
4294     }
4295 
4296     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4297         return 0;
4298     }
4299     if (! KMP_CPU_ISSET(proc, fullMask)) {
4300         return 0;
4301     }
4302 
4303     return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4304 }
4305 
4306 # if KMP_MIC
4307 
4308 // Dynamic affinity settings - Affinity balanced
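// For a uniform topology threads are spread evenly over the cores, with the
// first (nthreads % ncores) cores receiving one extra thread.  For a
// non-uniform topology the per-core proc table (procarr) built during
// initialization is used to hand out HW contexts one at a time.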
4309 void __kmp_balanced_affinity( int tid, int nthreads )
4310 {
4311     if( __kmp_affinity_uniform_topology() ) {
4312         int coreID;
4313         int threadID;
4314         // Number of hyper threads per core in HT machine
4315         int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4316         // Number of cores
4317         int ncores = __kmp_ncores;
4318         // How many threads will be bound to each core
4319         int chunk = nthreads / ncores;
        // How many cores will have an additional thread bound to them - "big cores"
4321         int big_cores = nthreads % ncores;
4322         // Number of threads on the big cores
4323         int big_nth = ( chunk + 1 ) * big_cores;
4324         if( tid < big_nth ) {
4325             coreID = tid / (chunk + 1 );
4326             threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
4327         } else { //tid >= big_nth
4328             coreID = ( tid - big_cores ) / chunk;
4329             threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
4330         }
4331 
4332         KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4333           "Illegal set affinity operation when not capable");
4334 
4335         kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
4336         KMP_CPU_ZERO(mask);
4337 
4338         // Granularity == thread
4339         if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4340             int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
4341             KMP_CPU_SET( osID, mask);
4342         } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4343             for( int i = 0; i < __kmp_nth_per_core; i++ ) {
4344                 int osID;
4345                 osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
4346                 KMP_CPU_SET( osID, mask);
4347             }
4348         }
4349         if (__kmp_affinity_verbose) {
4350             char buf[KMP_AFFIN_MASK_PRINT_LEN];
4351             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4352             KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", tid, buf);
4353         }
4354         __kmp_set_system_affinity( mask, TRUE );
4355     } else { // Non-uniform topology
4356 
4357         kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
4358         KMP_CPU_ZERO(mask);
4359 
4360         // Number of hyper threads per core in HT machine
4361         int nth_per_core = __kmp_nThreadsPerCore;
4362         int core_level;
4363         if( nth_per_core > 1 ) {
4364             core_level = __kmp_aff_depth - 2;
4365         } else {
4366             core_level = __kmp_aff_depth - 1;
4367         }
4368 
        // Number of cores - maximum value; it does not count trailing cores with 0 processors
4370         int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
4371 
4372         // For performance gain consider the special case nthreads == __kmp_avail_proc
4373         if( nthreads == __kmp_avail_proc ) {
4374             if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4375                 int osID = address2os[ tid ].second;
4376                 KMP_CPU_SET( osID, mask);
4377             } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4378                 int coreID = address2os[ tid ].first.labels[ core_level ];
                // Count the osIDs found for the current core; there can be at most
                // nth_per_core of them.  Since address2os is sorted we can break
                // once cnt == nth_per_core.
4381                 int cnt = 0;
4382                 for( int i = 0; i < __kmp_avail_proc; i++ ) {
4383                     int osID = address2os[ i ].second;
4384                     int core = address2os[ i ].first.labels[ core_level ];
4385                     if( core == coreID ) {
4386                         KMP_CPU_SET( osID, mask);
4387                         cnt++;
4388                         if( cnt == nth_per_core ) {
4389                             break;
4390                         }
4391                     }
4392                 }
4393             }
4394         } else if( nthreads <= __kmp_ncores ) {
4395 
4396             int core = 0;
4397             for( int i = 0; i < ncores; i++ ) {
4398                 // Check if this core from procarr[] is in the mask
4399                 int in_mask = 0;
4400                 for( int j = 0; j < nth_per_core; j++ ) {
4401                     if( procarr[ i * nth_per_core + j ] != - 1 ) {
4402                         in_mask = 1;
4403                         break;
4404                     }
4405                 }
4406                 if( in_mask ) {
4407                     if( tid == core ) {
4408                         for( int j = 0; j < nth_per_core; j++ ) {
4409                             int osID = procarr[ i * nth_per_core + j ];
4410                             if( osID != -1 ) {
4411                                 KMP_CPU_SET( osID, mask );
4412                                 // For granularity=thread it is enough to set the first available osID for this core
4413                                 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4414                                     break;
4415                                 }
4416                             }
4417                         }
4418                         break;
4419                     } else {
4420                         core++;
4421                     }
4422                 }
4423             }
4424 
4425         } else { // nthreads > __kmp_ncores
4426 
4427             // Array to save the number of processors at each core
4428             int nproc_at_core[ ncores ];
4429             // Array to save the number of cores with "x" available processors;
4430             int ncores_with_x_procs[ nth_per_core + 1 ];
4431             // Array to save the number of cores with # procs from x to nth_per_core
4432             int ncores_with_x_to_max_procs[ nth_per_core + 1 ];
4433 
4434             for( int i = 0; i <= nth_per_core; i++ ) {
4435                 ncores_with_x_procs[ i ] = 0;
4436                 ncores_with_x_to_max_procs[ i ] = 0;
4437             }
4438 
4439             for( int i = 0; i < ncores; i++ ) {
4440                 int cnt = 0;
4441                 for( int j = 0; j < nth_per_core; j++ ) {
4442                     if( procarr[ i * nth_per_core + j ] != -1 ) {
4443                         cnt++;
4444                     }
4445                 }
4446                 nproc_at_core[ i ] = cnt;
4447                 ncores_with_x_procs[ cnt ]++;
4448             }
4449 
4450             for( int i = 0; i <= nth_per_core; i++ ) {
4451                 for( int j = i; j <= nth_per_core; j++ ) {
4452                     ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4453                 }
4454             }
4455 
4456             // Max number of processors
4457             int nproc = nth_per_core * ncores;
            // An array to keep the number of threads per context
4459             int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4460             for( int i = 0; i < nproc; i++ ) {
4461                 newarr[ i ] = 0;
4462             }
4463 
4464             int nth = nthreads;
4465             int flag = 0;
4466             while( nth > 0 ) {
4467                 for( int j = 1; j <= nth_per_core; j++ ) {
4468                     int cnt = ncores_with_x_to_max_procs[ j ];
4469                     for( int i = 0; i < ncores; i++ ) {
4470                         // Skip the core with 0 processors
4471                         if( nproc_at_core[ i ] == 0 ) {
4472                             continue;
4473                         }
4474                         for( int k = 0; k < nth_per_core; k++ ) {
4475                             if( procarr[ i * nth_per_core + k ] != -1 ) {
4476                                 if( newarr[ i * nth_per_core + k ] == 0 ) {
4477                                     newarr[ i * nth_per_core + k ] = 1;
4478                                     cnt--;
4479                                     nth--;
4480                                     break;
4481                                 } else {
4482                                     if( flag != 0 ) {
4483                                         newarr[ i * nth_per_core + k ] ++;
4484                                         cnt--;
4485                                         nth--;
4486                                         break;
4487                                     }
4488                                 }
4489                             }
4490                         }
4491                         if( cnt == 0 || nth == 0 ) {
4492                             break;
4493                         }
4494                     }
4495                     if( nth == 0 ) {
4496                         break;
4497                     }
4498                 }
4499                 flag = 1;
4500             }
4501             int sum = 0;
4502             for( int i = 0; i < nproc; i++ ) {
4503                 sum += newarr[ i ];
4504                 if( sum > tid ) {
4505                     // Granularity == thread
4506                     if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4507                         int osID = procarr[ i ];
4508                         KMP_CPU_SET( osID, mask);
4509                     } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4510                         int coreID = i / nth_per_core;
4511                         for( int ii = 0; ii < nth_per_core; ii++ ) {
4512                             int osID = procarr[ coreID * nth_per_core + ii ];
4513                             if( osID != -1 ) {
4514                                 KMP_CPU_SET( osID, mask);
4515                             }
4516                         }
4517                     }
4518                     break;
4519                 }
4520             }
4521             __kmp_free( newarr );
4522         }
4523 
4524         if (__kmp_affinity_verbose) {
4525             char buf[KMP_AFFIN_MASK_PRINT_LEN];
4526             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4527             KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", tid, buf);
4528         }
4529         __kmp_set_system_affinity( mask, TRUE );
4530     }
4531 }
4532 
4533 # endif /* KMP_MIC */
4534 
4535 #elif KMP_OS_DARWIN
4536     // affinity not supported
4537 #else
4538     #error "Unknown or unsupported OS"
4539 #endif // KMP_OS_WINDOWS || KMP_OS_LINUX
4540 
4541