1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_io.h"
19 #include "kmp_str.h"
20 #include "kmp_wrapper_getpid.h"
21 #include "kmp_affinity.h"
22 
23 // Store the real or imagined machine hierarchy here
24 static hierarchy_info machine_hierarchy;
25 
26 void __kmp_cleanup_hierarchy() {
27     machine_hierarchy.fini();
28 }
29 
30 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
31     kmp_uint32 depth;
32     // The test below is true if affinity is available, but set to "none". Need to init on first use of hierarchical barrier.
33     if (TCR_1(machine_hierarchy.uninitialized))
34         machine_hierarchy.init(NULL, nproc);
35 
36     depth = machine_hierarchy.depth;
37     KMP_DEBUG_ASSERT(depth > 0);
38     // Adjust the hierarchy in case num threads exceeds original
39     if (nproc > machine_hierarchy.skipPerLevel[depth-1])
40         machine_hierarchy.resize(nproc);
41 
42     thr_bar->depth = depth;
43     thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
44     thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
45 }
46 
47 #if KMP_AFFINITY_SUPPORTED
48 
49 //
50 // Print the affinity mask to the character array in a pretty format.
51 //
52 char *
53 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
54 {
55     KMP_ASSERT(buf_len >= 40);
56     char *scan = buf;
57     char *end = buf + buf_len - 1;
58 
59     //
60     // Find first element / check for empty set.
61     //
62     size_t i;
63     for (i = 0; i < KMP_CPU_SETSIZE; i++) {
64         if (KMP_CPU_ISSET(i, mask)) {
65             break;
66         }
67     }
68     if (i == KMP_CPU_SETSIZE) {
69         KMP_SNPRINTF(scan, end-scan+1, "{<empty>}");
70         while (*scan != '\0') scan++;
71         KMP_ASSERT(scan <= end);
72         return buf;
73     }
74 
75     KMP_SNPRINTF(scan, end-scan+1, "{%ld", (long)i);
76     while (*scan != '\0') scan++;
77     i++;
78     for (; i < KMP_CPU_SETSIZE; i++) {
79         if (! KMP_CPU_ISSET(i, mask)) {
80             continue;
81         }
82 
83         //
84         // Check for buffer overflow.  A string of the form ",<n>" will have
85         // at most 10 characters, plus we want to leave room to print ",...}"
86         // if the set is too large to print for a total of 15 characters.
87         // We already left room for '\0' in setting end.
88         //
89         if (end - scan < 15) {
90            break;
91         }
92         KMP_SNPRINTF(scan, end-scan+1, ",%-ld", (long)i);
93         while (*scan != '\0') scan++;
94     }
95     if (i < KMP_CPU_SETSIZE) {
96         KMP_SNPRINTF(scan, end-scan+1,  ",...");
97         while (*scan != '\0') scan++;
98     }
99     KMP_SNPRINTF(scan, end-scan+1, "}");
100     while (*scan != '\0') scan++;
101     KMP_ASSERT(scan <= end);
102     return buf;
103 }
104 
105 
106 void
107 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
108 {
109     KMP_CPU_ZERO(mask);
110 
111 # if KMP_GROUP_AFFINITY
112 
113     if (__kmp_num_proc_groups > 1) {
114         int group;
115         KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
116         for (group = 0; group < __kmp_num_proc_groups; group++) {
117             int i;
118             int num = __kmp_GetActiveProcessorCount(group);
119             for (i = 0; i < num; i++) {
120                 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
121             }
122         }
123     }
124     else
125 
126 # endif /* KMP_GROUP_AFFINITY */
127 
128     {
129         int proc;
130         for (proc = 0; proc < __kmp_xproc; proc++) {
131             KMP_CPU_SET(proc, mask);
132         }
133     }
134 }
135 
136 //
137 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
138 // called to renumber the labels from [0..n] and place them into the child_num
139 // vector of the address object.  This is done in case the labels used for
140 // the children at one node of the hierarchy differ from those used for
141 // another node at the same level.  Example:  suppose the machine has 2 nodes
142 // with 2 packages each.  The first node contains packages 601 and 602, and
143 // second node contains packages 603 and 604.  If we try to sort the table
144 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
145 // because we are paying attention to the labels themselves, not the ordinal
146 // child numbers.  By using the child numbers in the sort, the result is
147 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
148 //
149 static void
150 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
151   int numAddrs)
152 {
153     KMP_DEBUG_ASSERT(numAddrs > 0);
154     int depth = address2os->first.depth;
155     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
156     unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
157       * sizeof(unsigned));
158     int labCt;
159     for (labCt = 0; labCt < depth; labCt++) {
160         address2os[0].first.childNums[labCt] = counts[labCt] = 0;
161         lastLabel[labCt] = address2os[0].first.labels[labCt];
162     }
163     int i;
164     for (i = 1; i < numAddrs; i++) {
165         for (labCt = 0; labCt < depth; labCt++) {
166             if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
167                 int labCt2;
168                 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
169                     counts[labCt2] = 0;
170                     lastLabel[labCt2] = address2os[i].first.labels[labCt2];
171                 }
172                 counts[labCt]++;
173                 lastLabel[labCt] = address2os[i].first.labels[labCt];
174                 break;
175             }
176         }
177         for (labCt = 0; labCt < depth; labCt++) {
178             address2os[i].first.childNums[labCt] = counts[labCt];
179         }
180         for (; labCt < (int)Address::maxDepth; labCt++) {
181             address2os[i].first.childNums[labCt] = 0;
182         }
183     }
184 }
185 
186 
187 //
188 // All of the __kmp_affinity_create_*_map() routines should set
189 // __kmp_affinity_masks to a vector of affinity mask objects of length
190 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
191 // return the number of levels in the machine topology tree (zero if
192 // __kmp_affinity_type == affinity_none).
193 //
194 // All of the __kmp_affinity_create_*_map() routines should set *fullMask
195 // to the affinity mask for the initialization thread.  They need to save and
196 // restore the mask, and it could be needed later, so saving it is just an
197 // optimization to avoid calling kmp_get_system_affinity() again.
198 //
199 static kmp_affin_mask_t *fullMask = NULL;
200 
201 kmp_affin_mask_t *
202 __kmp_affinity_get_fullMask() { return fullMask; }
203 
204 
205 static int nCoresPerPkg, nPackages;
206 static int __kmp_nThreadsPerCore;
207 #ifndef KMP_DFLT_NTH_CORES
208 static int __kmp_ncores;
209 #endif
210 
211 //
212 // __kmp_affinity_uniform_topology() doesn't work when called from
213 // places which support arbitrarily many levels in the machine topology
214 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
215 // __kmp_affinity_create_x2apicid_map().
216 //
217 inline static bool
218 __kmp_affinity_uniform_topology()
219 {
220     return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
221 }
222 
223 
224 //
225 // Print out the detailed machine topology map, i.e. the physical locations
226 // of each OS proc.
227 //
228 static void
229 __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
230   int pkgLevel, int coreLevel, int threadLevel)
231 {
232     int proc;
233 
234     KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
235     for (proc = 0; proc < len; proc++) {
236         int level;
237         kmp_str_buf_t buf;
238         __kmp_str_buf_init(&buf);
239         for (level = 0; level < depth; level++) {
240             if (level == threadLevel) {
241                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
242             }
243             else if (level == coreLevel) {
244                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
245             }
246             else if (level == pkgLevel) {
247                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
248             }
249             else if (level > pkgLevel) {
250                 __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
251                   level - pkgLevel - 1);
252             }
253             else {
254                 __kmp_str_buf_print(&buf, "L%d ", level);
255             }
256             __kmp_str_buf_print(&buf, "%d ",
257               address2os[proc].first.labels[level]);
258         }
259         KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
260           buf.str);
261         __kmp_str_buf_free(&buf);
262     }
263 }
264 
265 
266 //
267 // If we don't know how to retrieve the machine's processor topology, or
268 // encounter an error in doing so, this routine is called to form a "flat"
269 // mapping of os thread id's <-> processor id's.
270 //
271 static int
272 __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
273   kmp_i18n_id_t *const msg_id)
274 {
275     *address2os = NULL;
276     *msg_id = kmp_i18n_null;
277 
278     //
279     // Even if __kmp_affinity_type == affinity_none, this routine might still
280     // called to set __kmp_ncores, as well as
281     // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
282     //
283     if (! KMP_AFFINITY_CAPABLE()) {
284         KMP_ASSERT(__kmp_affinity_type == affinity_none);
285         __kmp_ncores = nPackages = __kmp_xproc;
286         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
287         if (__kmp_affinity_verbose) {
288             KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
289             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
290             KMP_INFORM(Uniform, "KMP_AFFINITY");
291             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
292               __kmp_nThreadsPerCore, __kmp_ncores);
293         }
294         return 0;
295     }
296 
297     //
298     // When affinity is off, this routine will still be called to set
299     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
300     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
301     //  correctly, and return now if affinity is not enabled.
302     //
303     __kmp_ncores = nPackages = __kmp_avail_proc;
304     __kmp_nThreadsPerCore = nCoresPerPkg = 1;
305     if (__kmp_affinity_verbose) {
306         char buf[KMP_AFFIN_MASK_PRINT_LEN];
307         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
308 
309         KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
310         if (__kmp_affinity_respect_mask) {
311             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
312         } else {
313             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
314         }
315         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
316         KMP_INFORM(Uniform, "KMP_AFFINITY");
317         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
318           __kmp_nThreadsPerCore, __kmp_ncores);
319     }
320     if (__kmp_affinity_type == affinity_none) {
321         return 0;
322     }
323 
324     //
325     // Contruct the data structure to be returned.
326     //
327     *address2os = (AddrUnsPair*)
328       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
329     int avail_ct = 0;
330     unsigned int i;
331     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
332         //
333         // Skip this proc if it is not included in the machine model.
334         //
335         if (! KMP_CPU_ISSET(i, fullMask)) {
336             continue;
337         }
338 
339         Address addr(1);
340         addr.labels[0] = i;
341         (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
342     }
343     if (__kmp_affinity_verbose) {
344         KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
345     }
346 
347     if (__kmp_affinity_gran_levels < 0) {
348         //
349         // Only the package level is modeled in the machine topology map,
350         // so the #levels of granularity is either 0 or 1.
351         //
352         if (__kmp_affinity_gran > affinity_gran_package) {
353             __kmp_affinity_gran_levels = 1;
354         }
355         else {
356             __kmp_affinity_gran_levels = 0;
357         }
358     }
359     return 1;
360 }
361 
362 
363 # if KMP_GROUP_AFFINITY
364 
365 //
366 // If multiple Windows* OS processor groups exist, we can create a 2-level
367 // topology map with the groups at level 0 and the individual procs at
368 // level 1.
369 //
370 // This facilitates letting the threads float among all procs in a group,
371 // if granularity=group (the default when there are multiple groups).
372 //
373 static int
374 __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
375   kmp_i18n_id_t *const msg_id)
376 {
377     *address2os = NULL;
378     *msg_id = kmp_i18n_null;
379 
380     //
381     // If we don't have multiple processor groups, return now.
382     // The flat mapping will be used.
383     //
384     if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
385         // FIXME set *msg_id
386         return -1;
387     }
388 
389     //
390     // Contruct the data structure to be returned.
391     //
392     *address2os = (AddrUnsPair*)
393       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
394     int avail_ct = 0;
395     int i;
396     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
397         //
398         // Skip this proc if it is not included in the machine model.
399         //
400         if (! KMP_CPU_ISSET(i, fullMask)) {
401             continue;
402         }
403 
404         Address addr(2);
405         addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
406         addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
407         (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
408 
409         if (__kmp_affinity_verbose) {
410             KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
411               addr.labels[1]);
412         }
413     }
414 
415     if (__kmp_affinity_gran_levels < 0) {
416         if (__kmp_affinity_gran == affinity_gran_group) {
417             __kmp_affinity_gran_levels = 1;
418         }
419         else if ((__kmp_affinity_gran == affinity_gran_fine)
420           || (__kmp_affinity_gran == affinity_gran_thread)) {
421             __kmp_affinity_gran_levels = 0;
422         }
423         else {
424             const char *gran_str = NULL;
425             if (__kmp_affinity_gran == affinity_gran_core) {
426                 gran_str = "core";
427             }
428             else if (__kmp_affinity_gran == affinity_gran_package) {
429                 gran_str = "package";
430             }
431             else if (__kmp_affinity_gran == affinity_gran_node) {
432                 gran_str = "node";
433             }
434             else {
435                 KMP_ASSERT(0);
436             }
437 
438             // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
439             __kmp_affinity_gran_levels = 0;
440         }
441     }
442     return 2;
443 }
444 
445 # endif /* KMP_GROUP_AFFINITY */
446 
447 
448 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
449 
450 static int
451 __kmp_cpuid_mask_width(int count) {
452     int r = 0;
453 
454     while((1<<r) < count)
455         ++r;
456     return r;
457 }
458 
459 
460 class apicThreadInfo {
461 public:
462     unsigned osId;              // param to __kmp_affinity_bind_thread
463     unsigned apicId;            // from cpuid after binding
464     unsigned maxCoresPerPkg;    //      ""
465     unsigned maxThreadsPerPkg;  //      ""
466     unsigned pkgId;             // inferred from above values
467     unsigned coreId;            //      ""
468     unsigned threadId;          //      ""
469 };
470 
471 
472 static int
473 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
474 {
475     const apicThreadInfo *aa = (const apicThreadInfo *)a;
476     const apicThreadInfo *bb = (const apicThreadInfo *)b;
477     if (aa->osId < bb->osId) return -1;
478     if (aa->osId > bb->osId) return 1;
479     return 0;
480 }
481 
482 
483 static int
484 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
485 {
486     const apicThreadInfo *aa = (const apicThreadInfo *)a;
487     const apicThreadInfo *bb = (const apicThreadInfo *)b;
488     if (aa->pkgId < bb->pkgId) return -1;
489     if (aa->pkgId > bb->pkgId) return 1;
490     if (aa->coreId < bb->coreId) return -1;
491     if (aa->coreId > bb->coreId) return 1;
492     if (aa->threadId < bb->threadId) return -1;
493     if (aa->threadId > bb->threadId) return 1;
494     return 0;
495 }
496 
497 
498 //
499 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
500 // an algorithm which cycles through the available os threads, setting
501 // the current thread's affinity mask to that thread, and then retrieves
502 // the Apic Id for each thread context using the cpuid instruction.
503 //
504 static int
505 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
506   kmp_i18n_id_t *const msg_id)
507 {
508     kmp_cpuid buf;
509     int rc;
510     *address2os = NULL;
511     *msg_id = kmp_i18n_null;
512 
513     //
514     // Check if cpuid leaf 4 is supported.
515     //
516         __kmp_x86_cpuid(0, 0, &buf);
517         if (buf.eax < 4) {
518             *msg_id = kmp_i18n_str_NoLeaf4Support;
519             return -1;
520         }
521 
522     //
523     // The algorithm used starts by setting the affinity to each available
524     // thread and retrieving info from the cpuid instruction, so if we are
525     // not capable of calling __kmp_get_system_affinity() and
526     // _kmp_get_system_affinity(), then we need to do something else - use
527     // the defaults that we calculated from issuing cpuid without binding
528     // to each proc.
529     //
530     if (! KMP_AFFINITY_CAPABLE()) {
531         //
532         // Hack to try and infer the machine topology using only the data
533         // available from cpuid on the current thread, and __kmp_xproc.
534         //
535         KMP_ASSERT(__kmp_affinity_type == affinity_none);
536 
537         //
538         // Get an upper bound on the number of threads per package using
539         // cpuid(1).
540         //
541         // On some OS/chps combinations where HT is supported by the chip
542         // but is disabled, this value will be 2 on a single core chip.
543         // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
544         //
545         __kmp_x86_cpuid(1, 0, &buf);
546         int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
547         if (maxThreadsPerPkg == 0) {
548             maxThreadsPerPkg = 1;
549         }
550 
551         //
552         // The num cores per pkg comes from cpuid(4).
553         // 1 must be added to the encoded value.
554         //
555         // The author of cpu_count.cpp treated this only an upper bound
556         // on the number of cores, but I haven't seen any cases where it
557         // was greater than the actual number of cores, so we will treat
558         // it as exact in this block of code.
559         //
560         // First, we need to check if cpuid(4) is supported on this chip.
561         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
562         // has the value n or greater.
563         //
564         __kmp_x86_cpuid(0, 0, &buf);
565         if (buf.eax >= 4) {
566             __kmp_x86_cpuid(4, 0, &buf);
567             nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
568         }
569         else {
570             nCoresPerPkg = 1;
571         }
572 
573         //
574         // There is no way to reliably tell if HT is enabled without issuing
575         // the cpuid instruction from every thread, can correlating the cpuid
576         // info, so if the machine is not affinity capable, we assume that HT
577         // is off.  We have seen quite a few machines where maxThreadsPerPkg
578         // is 2, yet the machine does not support HT.
579         //
580         // - Older OSes are usually found on machines with older chips, which
581         //   do not support HT.
582         //
583         // - The performance penalty for mistakenly identifying a machine as
584         //   HT when it isn't (which results in blocktime being incorrecly set
585         //   to 0) is greater than the penalty when for mistakenly identifying
586         //   a machine as being 1 thread/core when it is really HT enabled
587         //   (which results in blocktime being incorrectly set to a positive
588         //   value).
589         //
590         __kmp_ncores = __kmp_xproc;
591         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
592         __kmp_nThreadsPerCore = 1;
593         if (__kmp_affinity_verbose) {
594             KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
595             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
596             if (__kmp_affinity_uniform_topology()) {
597                 KMP_INFORM(Uniform, "KMP_AFFINITY");
598             } else {
599                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
600             }
601             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
602               __kmp_nThreadsPerCore, __kmp_ncores);
603         }
604         return 0;
605     }
606 
607     //
608     //
609     // From here on, we can assume that it is safe to call
610     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
611     // even if __kmp_affinity_type = affinity_none.
612     //
613 
614     //
615     // Save the affinity mask for the current thread.
616     //
617     kmp_affin_mask_t *oldMask;
618     KMP_CPU_ALLOC(oldMask);
619     KMP_ASSERT(oldMask != NULL);
620     __kmp_get_system_affinity(oldMask, TRUE);
621 
622     //
623     // Run through each of the available contexts, binding the current thread
624     // to it, and obtaining the pertinent information using the cpuid instr.
625     //
626     // The relevant information is:
627     //
628     // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
629     //    has a uniqie Apic Id, which is of the form pkg# : core# : thread#.
630     //
631     // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
632     //    value of this field determines the width of the core# + thread#
633     //    fields in the Apic Id.  It is also an upper bound on the number
634     //    of threads per package, but it has been verified that situations
635     //    happen were it is not exact.  In particular, on certain OS/chip
636     //    combinations where Intel(R) Hyper-Threading Technology is supported
637     //    by the chip but has
638     //    been disabled, the value of this field will be 2 (for a single core
639     //    chip).  On other OS/chip combinations supporting
640     //    Intel(R) Hyper-Threading Technology, the value of
641     //    this field will be 1 when Intel(R) Hyper-Threading Technology is
642     //    disabled and 2 when it is enabled.
643     //
644     // Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4).  The
645     //    value of this field (+1) determines the width of the core# field in
646     //    the Apic Id.  The comments in "cpucount.cpp" say that this value is
647     //    an upper bound, but the IA-32 architecture manual says that it is
648     //    exactly the number of cores per package, and I haven't seen any
649     //    case where it wasn't.
650     //
651     // From this information, deduce the package Id, core Id, and thread Id,
652     // and set the corresponding fields in the apicThreadInfo struct.
653     //
654     unsigned i;
655     apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
656       __kmp_avail_proc * sizeof(apicThreadInfo));
657     unsigned nApics = 0;
658     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
659         //
660         // Skip this proc if it is not included in the machine model.
661         //
662         if (! KMP_CPU_ISSET(i, fullMask)) {
663             continue;
664         }
665         KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
666 
667         __kmp_affinity_bind_thread(i);
668         threadInfo[nApics].osId = i;
669 
670         //
671         // The apic id and max threads per pkg come from cpuid(1).
672         //
673         __kmp_x86_cpuid(1, 0, &buf);
674         if (! (buf.edx >> 9) & 1) {
675             __kmp_set_system_affinity(oldMask, TRUE);
676             __kmp_free(threadInfo);
677             KMP_CPU_FREE(oldMask);
678             *msg_id = kmp_i18n_str_ApicNotPresent;
679             return -1;
680         }
681         threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
682         threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
683         if (threadInfo[nApics].maxThreadsPerPkg == 0) {
684             threadInfo[nApics].maxThreadsPerPkg = 1;
685         }
686 
687         //
688         // Max cores per pkg comes from cpuid(4).
689         // 1 must be added to the encoded value.
690         //
691         // First, we need to check if cpuid(4) is supported on this chip.
692         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
693         // has the value n or greater.
694         //
695         __kmp_x86_cpuid(0, 0, &buf);
696         if (buf.eax >= 4) {
697             __kmp_x86_cpuid(4, 0, &buf);
698             threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
699         }
700         else {
701             threadInfo[nApics].maxCoresPerPkg = 1;
702         }
703 
704         //
705         // Infer the pkgId / coreId / threadId using only the info
706         // obtained locally.
707         //
708         int widthCT = __kmp_cpuid_mask_width(
709           threadInfo[nApics].maxThreadsPerPkg);
710         threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
711 
712         int widthC = __kmp_cpuid_mask_width(
713           threadInfo[nApics].maxCoresPerPkg);
714         int widthT = widthCT - widthC;
715         if (widthT < 0) {
716             //
717             // I've never seen this one happen, but I suppose it could, if
718             // the cpuid instruction on a chip was really screwed up.
719             // Make sure to restore the affinity mask before the tail call.
720             //
721             __kmp_set_system_affinity(oldMask, TRUE);
722             __kmp_free(threadInfo);
723             KMP_CPU_FREE(oldMask);
724             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
725             return -1;
726         }
727 
728         int maskC = (1 << widthC) - 1;
729         threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
730           &maskC;
731 
732         int maskT = (1 << widthT) - 1;
733         threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT;
734 
735         nApics++;
736     }
737 
738     //
739     // We've collected all the info we need.
740     // Restore the old affinity mask for this thread.
741     //
742     __kmp_set_system_affinity(oldMask, TRUE);
743 
744     //
745     // If there's only one thread context to bind to, form an Address object
746     // with depth 1 and return immediately (or, if affinity is off, set
747     // address2os to NULL and return).
748     //
749     // If it is configured to omit the package level when there is only a
750     // single package, the logic at the end of this routine won't work if
751     // there is only a single thread - it would try to form an Address
752     // object with depth 0.
753     //
754     KMP_ASSERT(nApics > 0);
755     if (nApics == 1) {
756         __kmp_ncores = nPackages = 1;
757         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
758         if (__kmp_affinity_verbose) {
759             char buf[KMP_AFFIN_MASK_PRINT_LEN];
760             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
761 
762             KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
763             if (__kmp_affinity_respect_mask) {
764                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
765             } else {
766                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
767             }
768             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
769             KMP_INFORM(Uniform, "KMP_AFFINITY");
770             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
771               __kmp_nThreadsPerCore, __kmp_ncores);
772         }
773 
774         if (__kmp_affinity_type == affinity_none) {
775             __kmp_free(threadInfo);
776             KMP_CPU_FREE(oldMask);
777             return 0;
778         }
779 
780         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
781         Address addr(1);
782         addr.labels[0] = threadInfo[0].pkgId;
783         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
784 
785         if (__kmp_affinity_gran_levels < 0) {
786             __kmp_affinity_gran_levels = 0;
787         }
788 
789         if (__kmp_affinity_verbose) {
790             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
791         }
792 
793         __kmp_free(threadInfo);
794         KMP_CPU_FREE(oldMask);
795         return 1;
796     }
797 
798     //
799     // Sort the threadInfo table by physical Id.
800     //
801     qsort(threadInfo, nApics, sizeof(*threadInfo),
802       __kmp_affinity_cmp_apicThreadInfo_phys_id);
803 
804     //
805     // The table is now sorted by pkgId / coreId / threadId, but we really
806     // don't know the radix of any of the fields.  pkgId's may be sparsely
807     // assigned among the chips on a system.  Although coreId's are usually
808     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
809     // [0..threadsPerCore-1], we don't want to make any such assumptions.
810     //
811     // For that matter, we don't know what coresPerPkg and threadsPerCore
812     // (or the total # packages) are at this point - we want to determine
813     // that now.  We only have an upper bound on the first two figures.
814     //
815     // We also perform a consistency check at this point: the values returned
816     // by the cpuid instruction for any thread bound to a given package had
817     // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
818     //
819     nPackages = 1;
820     nCoresPerPkg = 1;
821     __kmp_nThreadsPerCore = 1;
822     unsigned nCores = 1;
823 
824     unsigned pkgCt = 1;                         // to determine radii
825     unsigned lastPkgId = threadInfo[0].pkgId;
826     unsigned coreCt = 1;
827     unsigned lastCoreId = threadInfo[0].coreId;
828     unsigned threadCt = 1;
829     unsigned lastThreadId = threadInfo[0].threadId;
830 
831                                                 // intra-pkg consist checks
832     unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
833     unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
834 
835     for (i = 1; i < nApics; i++) {
836         if (threadInfo[i].pkgId != lastPkgId) {
837             nCores++;
838             pkgCt++;
839             lastPkgId = threadInfo[i].pkgId;
840             if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
841             coreCt = 1;
842             lastCoreId = threadInfo[i].coreId;
843             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
844             threadCt = 1;
845             lastThreadId = threadInfo[i].threadId;
846 
847             //
848             // This is a different package, so go on to the next iteration
849             // without doing any consistency checks.  Reset the consistency
850             // check vars, though.
851             //
852             prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
853             prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
854             continue;
855         }
856 
857         if (threadInfo[i].coreId != lastCoreId) {
858             nCores++;
859             coreCt++;
860             lastCoreId = threadInfo[i].coreId;
861             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
862             threadCt = 1;
863             lastThreadId = threadInfo[i].threadId;
864         }
865         else if (threadInfo[i].threadId != lastThreadId) {
866             threadCt++;
867             lastThreadId = threadInfo[i].threadId;
868         }
869         else {
870             __kmp_free(threadInfo);
871             KMP_CPU_FREE(oldMask);
872             *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
873             return -1;
874         }
875 
876         //
877         // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
878         // fields agree between all the threads bounds to a given package.
879         //
880         if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
881           || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
882             __kmp_free(threadInfo);
883             KMP_CPU_FREE(oldMask);
884             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
885             return -1;
886         }
887     }
888     nPackages = pkgCt;
889     if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
890     if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
891 
892     //
893     // When affinity is off, this routine will still be called to set
894     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
895     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
896     // correctly, and return now if affinity is not enabled.
897     //
898     __kmp_ncores = nCores;
899     if (__kmp_affinity_verbose) {
900         char buf[KMP_AFFIN_MASK_PRINT_LEN];
901         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
902 
903         KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
904         if (__kmp_affinity_respect_mask) {
905             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
906         } else {
907             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
908         }
909         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
910         if (__kmp_affinity_uniform_topology()) {
911             KMP_INFORM(Uniform, "KMP_AFFINITY");
912         } else {
913             KMP_INFORM(NonUniform, "KMP_AFFINITY");
914         }
915         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
916           __kmp_nThreadsPerCore, __kmp_ncores);
917 
918     }
919 
920     if (__kmp_affinity_type == affinity_none) {
921         __kmp_free(threadInfo);
922         KMP_CPU_FREE(oldMask);
923         return 0;
924     }
925 
926     //
927     // Now that we've determined the number of packages, the number of cores
928     // per package, and the number of threads per core, we can construct the
929     // data structure that is to be returned.
930     //
931     int pkgLevel = 0;
932     int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
933     int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
934     unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
935 
936     KMP_ASSERT(depth > 0);
937     *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
938 
939     for (i = 0; i < nApics; ++i) {
940         Address addr(depth);
941         unsigned os = threadInfo[i].osId;
942         int d = 0;
943 
944         if (pkgLevel >= 0) {
945             addr.labels[d++] = threadInfo[i].pkgId;
946         }
947         if (coreLevel >= 0) {
948             addr.labels[d++] = threadInfo[i].coreId;
949         }
950         if (threadLevel >= 0) {
951             addr.labels[d++] = threadInfo[i].threadId;
952         }
953         (*address2os)[i] = AddrUnsPair(addr, os);
954     }
955 
956     if (__kmp_affinity_gran_levels < 0) {
957         //
958         // Set the granularity level based on what levels are modeled
959         // in the machine topology map.
960         //
961         __kmp_affinity_gran_levels = 0;
962         if ((threadLevel >= 0)
963           && (__kmp_affinity_gran > affinity_gran_thread)) {
964             __kmp_affinity_gran_levels++;
965         }
966         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
967             __kmp_affinity_gran_levels++;
968         }
969         if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
970             __kmp_affinity_gran_levels++;
971         }
972     }
973 
974     if (__kmp_affinity_verbose) {
975         __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
976           coreLevel, threadLevel);
977     }
978 
979     __kmp_free(threadInfo);
980     KMP_CPU_FREE(oldMask);
981     return depth;
982 }
983 
984 
985 //
986 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
987 // architectures support a newer interface for specifying the x2APIC Ids,
988 // based on cpuid leaf 11.
989 //
990 static int
991 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
992   kmp_i18n_id_t *const msg_id)
993 {
994     kmp_cpuid buf;
995 
996     *address2os = NULL;
997     *msg_id = kmp_i18n_null;
998 
999     //
1000     // Check to see if cpuid leaf 11 is supported.
1001     //
1002     __kmp_x86_cpuid(0, 0, &buf);
1003     if (buf.eax < 11) {
1004         *msg_id = kmp_i18n_str_NoLeaf11Support;
1005         return -1;
1006     }
1007     __kmp_x86_cpuid(11, 0, &buf);
1008     if (buf.ebx == 0) {
1009         *msg_id = kmp_i18n_str_NoLeaf11Support;
1010         return -1;
1011     }
1012 
1013     //
1014     // Find the number of levels in the machine topology.  While we're at it,
1015     // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.  We will
1016     // try to get more accurate values later by explicitly counting them,
1017     // but get reasonable defaults now, in case we return early.
1018     //
1019     int level;
1020     int threadLevel = -1;
1021     int coreLevel = -1;
1022     int pkgLevel = -1;
1023     __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1024 
1025     for (level = 0;; level++) {
1026         if (level > 31) {
1027             //
1028             // FIXME: Hack for DPD200163180
1029             //
1030             // If level is big then something went wrong -> exiting
1031             //
1032             // There could actually be 32 valid levels in the machine topology,
1033             // but so far, the only machine we have seen which does not exit
1034             // this loop before iteration 32 has fubar x2APIC settings.
1035             //
1036             // For now, just reject this case based upon loop trip count.
1037             //
1038             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1039             return -1;
1040         }
1041         __kmp_x86_cpuid(11, level, &buf);
1042         if (buf.ebx == 0) {
1043             if (pkgLevel < 0) {
1044                 //
1045                 // Will infer nPackages from __kmp_xproc
1046                 //
1047                 pkgLevel = level;
1048                 level++;
1049             }
1050             break;
1051         }
1052         int kind = (buf.ecx >> 8) & 0xff;
1053         if (kind == 1) {
1054             //
1055             // SMT level
1056             //
1057             threadLevel = level;
1058             coreLevel = -1;
1059             pkgLevel = -1;
1060             __kmp_nThreadsPerCore = buf.ebx & 0xff;
1061             if (__kmp_nThreadsPerCore == 0) {
1062                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1063                 return -1;
1064             }
1065         }
1066         else if (kind == 2) {
1067             //
1068             // core level
1069             //
1070             coreLevel = level;
1071             pkgLevel = -1;
1072             nCoresPerPkg = buf.ebx & 0xff;
1073             if (nCoresPerPkg == 0) {
1074                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1075                 return -1;
1076             }
1077         }
1078         else {
1079             if (level <= 0) {
1080                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1081                 return -1;
1082             }
1083             if (pkgLevel >= 0) {
1084                 continue;
1085             }
1086             pkgLevel = level;
1087             nPackages = buf.ebx & 0xff;
1088             if (nPackages == 0) {
1089                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1090                 return -1;
1091             }
1092         }
1093     }
1094     int depth = level;
1095 
1096     //
1097     // In the above loop, "level" was counted from the finest level (usually
1098     // thread) to the coarsest.  The caller expects that we will place the
1099     // labels in (*address2os)[].first.labels[] in the inverse order, so
1100     // we need to invert the vars saying which level means what.
1101     //
1102     if (threadLevel >= 0) {
1103         threadLevel = depth - threadLevel - 1;
1104     }
1105     if (coreLevel >= 0) {
1106         coreLevel = depth - coreLevel - 1;
1107     }
1108     KMP_DEBUG_ASSERT(pkgLevel >= 0);
1109     pkgLevel = depth - pkgLevel - 1;
1110 
1111     //
1112     // The algorithm used starts by setting the affinity to each available
1113     // thread and retrieving info from the cpuid instruction, so if we are
1114     // not capable of calling __kmp_get_system_affinity() and
1115     // _kmp_get_system_affinity(), then we need to do something else - use
1116     // the defaults that we calculated from issuing cpuid without binding
1117     // to each proc.
1118     //
1119     if (! KMP_AFFINITY_CAPABLE())
1120     {
1121         //
1122         // Hack to try and infer the machine topology using only the data
1123         // available from cpuid on the current thread, and __kmp_xproc.
1124         //
1125         KMP_ASSERT(__kmp_affinity_type == affinity_none);
1126 
1127         __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1128         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1129         if (__kmp_affinity_verbose) {
1130             KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1131             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1132             if (__kmp_affinity_uniform_topology()) {
1133                 KMP_INFORM(Uniform, "KMP_AFFINITY");
1134             } else {
1135                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1136             }
1137             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1138               __kmp_nThreadsPerCore, __kmp_ncores);
1139         }
1140         return 0;
1141     }
1142 
1143     //
1144     //
1145     // From here on, we can assume that it is safe to call
1146     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1147     // even if __kmp_affinity_type = affinity_none.
1148     //
1149 
1150     //
1151     // Save the affinity mask for the current thread.
1152     //
1153     kmp_affin_mask_t *oldMask;
1154     KMP_CPU_ALLOC(oldMask);
1155     __kmp_get_system_affinity(oldMask, TRUE);
1156 
1157     //
1158     // Allocate the data structure to be returned.
1159     //
1160     AddrUnsPair *retval = (AddrUnsPair *)
1161       __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1162 
1163     //
1164     // Run through each of the available contexts, binding the current thread
1165     // to it, and obtaining the pertinent information using the cpuid instr.
1166     //
1167     unsigned int proc;
1168     int nApics = 0;
1169     for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
1170         //
1171         // Skip this proc if it is not included in the machine model.
1172         //
1173         if (! KMP_CPU_ISSET(proc, fullMask)) {
1174             continue;
1175         }
1176         KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1177 
1178         __kmp_affinity_bind_thread(proc);
1179 
1180         //
1181         // Extrach the labels for each level in the machine topology map
1182         // from the Apic ID.
1183         //
1184         Address addr(depth);
1185         int prev_shift = 0;
1186 
1187         for (level = 0; level < depth; level++) {
1188             __kmp_x86_cpuid(11, level, &buf);
1189             unsigned apicId = buf.edx;
1190             if (buf.ebx == 0) {
1191                 if (level != depth - 1) {
1192                     KMP_CPU_FREE(oldMask);
1193                     *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1194                     return -1;
1195                 }
1196                 addr.labels[depth - level - 1] = apicId >> prev_shift;
1197                 level++;
1198                 break;
1199             }
1200             int shift = buf.eax & 0x1f;
1201             int mask = (1 << shift) - 1;
1202             addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1203             prev_shift = shift;
1204         }
1205         if (level != depth) {
1206             KMP_CPU_FREE(oldMask);
1207             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1208             return -1;
1209         }
1210 
1211         retval[nApics] = AddrUnsPair(addr, proc);
1212         nApics++;
1213     }
1214 
1215     //
1216     // We've collected all the info we need.
1217     // Restore the old affinity mask for this thread.
1218     //
1219     __kmp_set_system_affinity(oldMask, TRUE);
1220 
1221     //
1222     // If there's only one thread context to bind to, return now.
1223     //
1224     KMP_ASSERT(nApics > 0);
1225     if (nApics == 1) {
1226         __kmp_ncores = nPackages = 1;
1227         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1228         if (__kmp_affinity_verbose) {
1229             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1230             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1231 
1232             KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1233             if (__kmp_affinity_respect_mask) {
1234                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1235             } else {
1236                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1237             }
1238             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1239             KMP_INFORM(Uniform, "KMP_AFFINITY");
1240             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1241               __kmp_nThreadsPerCore, __kmp_ncores);
1242         }
1243 
1244         if (__kmp_affinity_type == affinity_none) {
1245             __kmp_free(retval);
1246             KMP_CPU_FREE(oldMask);
1247             return 0;
1248         }
1249 
1250         //
1251         // Form an Address object which only includes the package level.
1252         //
1253         Address addr(1);
1254         addr.labels[0] = retval[0].first.labels[pkgLevel];
1255         retval[0].first = addr;
1256 
1257         if (__kmp_affinity_gran_levels < 0) {
1258             __kmp_affinity_gran_levels = 0;
1259         }
1260 
1261         if (__kmp_affinity_verbose) {
1262             __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1263         }
1264 
1265         *address2os = retval;
1266         KMP_CPU_FREE(oldMask);
1267         return 1;
1268     }
1269 
1270     //
1271     // Sort the table by physical Id.
1272     //
1273     qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1274 
1275     //
1276     // Find the radix at each of the levels.
1277     //
1278     unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1279     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1280     unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1281     unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1282     for (level = 0; level < depth; level++) {
1283         totals[level] = 1;
1284         maxCt[level] = 1;
1285         counts[level] = 1;
1286         last[level] = retval[0].first.labels[level];
1287     }
1288 
1289     //
1290     // From here on, the iteration variable "level" runs from the finest
1291     // level to the coarsest, i.e. we iterate forward through
1292     // (*address2os)[].first.labels[] - in the previous loops, we iterated
1293     // backwards.
1294     //
1295     for (proc = 1; (int)proc < nApics; proc++) {
1296         int level;
1297         for (level = 0; level < depth; level++) {
1298             if (retval[proc].first.labels[level] != last[level]) {
1299                 int j;
1300                 for (j = level + 1; j < depth; j++) {
1301                     totals[j]++;
1302                     counts[j] = 1;
1303                     // The line below causes printing incorrect topology information
1304                     // in case the max value for some level (maxCt[level]) is encountered earlier than
1305                     // some less value while going through the array.
1306                     // For example, let pkg0 has 4 cores and pkg1 has 2 cores. Then maxCt[1] == 2
1307                     // whereas it must be 4.
1308                     // TODO!!! Check if it can be commented safely
1309                     //maxCt[j] = 1;
1310                     last[j] = retval[proc].first.labels[j];
1311                 }
1312                 totals[level]++;
1313                 counts[level]++;
1314                 if (counts[level] > maxCt[level]) {
1315                     maxCt[level] = counts[level];
1316                 }
1317                 last[level] = retval[proc].first.labels[level];
1318                 break;
1319             }
1320             else if (level == depth - 1) {
1321                 __kmp_free(last);
1322                 __kmp_free(maxCt);
1323                 __kmp_free(counts);
1324                 __kmp_free(totals);
1325                 __kmp_free(retval);
1326                 KMP_CPU_FREE(oldMask);
1327                 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1328                 return -1;
1329             }
1330         }
1331     }
1332 
1333     //
1334     // When affinity is off, this routine will still be called to set
1335     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1336     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1337     // correctly, and return if affinity is not enabled.
1338     //
1339     if (threadLevel >= 0) {
1340         __kmp_nThreadsPerCore = maxCt[threadLevel];
1341     }
1342     else {
1343         __kmp_nThreadsPerCore = 1;
1344     }
1345     nPackages = totals[pkgLevel];
1346 
1347     if (coreLevel >= 0) {
1348         __kmp_ncores = totals[coreLevel];
1349         nCoresPerPkg = maxCt[coreLevel];
1350     }
1351     else {
1352         __kmp_ncores = nPackages;
1353         nCoresPerPkg = 1;
1354     }
1355 
1356     //
1357     // Check to see if the machine topology is uniform
1358     //
1359     unsigned prod = maxCt[0];
1360     for (level = 1; level < depth; level++) {
1361        prod *= maxCt[level];
1362     }
1363     bool uniform = (prod == totals[level - 1]);
1364 
1365     //
1366     // Print the machine topology summary.
1367     //
1368     if (__kmp_affinity_verbose) {
1369         char mask[KMP_AFFIN_MASK_PRINT_LEN];
1370         __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1371 
1372         KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1373         if (__kmp_affinity_respect_mask) {
1374             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1375         } else {
1376             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1377         }
1378         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1379         if (uniform) {
1380             KMP_INFORM(Uniform, "KMP_AFFINITY");
1381         } else {
1382             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1383         }
1384 
1385         kmp_str_buf_t buf;
1386         __kmp_str_buf_init(&buf);
1387 
1388         __kmp_str_buf_print(&buf, "%d", totals[0]);
1389         for (level = 1; level <= pkgLevel; level++) {
1390             __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1391         }
1392         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1393           __kmp_nThreadsPerCore, __kmp_ncores);
1394 
1395         __kmp_str_buf_free(&buf);
1396     }
1397 
1398     if (__kmp_affinity_type == affinity_none) {
1399         __kmp_free(last);
1400         __kmp_free(maxCt);
1401         __kmp_free(counts);
1402         __kmp_free(totals);
1403         __kmp_free(retval);
1404         KMP_CPU_FREE(oldMask);
1405         return 0;
1406     }
1407 
1408     //
1409     // Find any levels with radiix 1, and remove them from the map
1410     // (except for the package level).
1411     //
1412     int new_depth = 0;
1413     for (level = 0; level < depth; level++) {
1414         if ((maxCt[level] == 1) && (level != pkgLevel)) {
1415            continue;
1416         }
1417         new_depth++;
1418     }
1419 
1420     //
1421     // If we are removing any levels, allocate a new vector to return,
1422     // and copy the relevant information to it.
1423     //
1424     if (new_depth != depth) {
1425         AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1426           sizeof(AddrUnsPair) * nApics);
1427         for (proc = 0; (int)proc < nApics; proc++) {
1428             Address addr(new_depth);
1429             new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1430         }
1431         int new_level = 0;
1432         int newPkgLevel = -1;
1433         int newCoreLevel = -1;
1434         int newThreadLevel = -1;
1435         int i;
1436         for (level = 0; level < depth; level++) {
1437             if ((maxCt[level] == 1)
1438               && (level != pkgLevel)) {
1439                 //
1440                 // Remove this level. Never remove the package level
1441                 //
1442                 continue;
1443             }
1444             if (level == pkgLevel) {
1445                 newPkgLevel = level;
1446             }
1447             if (level == coreLevel) {
1448                 newCoreLevel = level;
1449             }
1450             if (level == threadLevel) {
1451                 newThreadLevel = level;
1452             }
1453             for (proc = 0; (int)proc < nApics; proc++) {
1454                 new_retval[proc].first.labels[new_level]
1455                   = retval[proc].first.labels[level];
1456             }
1457             new_level++;
1458         }
1459 
1460         __kmp_free(retval);
1461         retval = new_retval;
1462         depth = new_depth;
1463         pkgLevel = newPkgLevel;
1464         coreLevel = newCoreLevel;
1465         threadLevel = newThreadLevel;
1466     }
1467 
1468     if (__kmp_affinity_gran_levels < 0) {
1469         //
1470         // Set the granularity level based on what levels are modeled
1471         // in the machine topology map.
1472         //
1473         __kmp_affinity_gran_levels = 0;
1474         if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1475             __kmp_affinity_gran_levels++;
1476         }
1477         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1478             __kmp_affinity_gran_levels++;
1479         }
1480         if (__kmp_affinity_gran > affinity_gran_package) {
1481             __kmp_affinity_gran_levels++;
1482         }
1483     }
1484 
1485     if (__kmp_affinity_verbose) {
1486         __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1487           coreLevel, threadLevel);
1488     }
1489 
1490     __kmp_free(last);
1491     __kmp_free(maxCt);
1492     __kmp_free(counts);
1493     __kmp_free(totals);
1494     KMP_CPU_FREE(oldMask);
1495     *address2os = retval;
1496     return depth;
1497 }
1498 
1499 
1500 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1501 
1502 
1503 #define osIdIndex       0
1504 #define threadIdIndex   1
1505 #define coreIdIndex     2
1506 #define pkgIdIndex      3
1507 #define nodeIdIndex     4
1508 
1509 typedef unsigned *ProcCpuInfo;
1510 static unsigned maxIndex = pkgIdIndex;
1511 
1512 
1513 static int
1514 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1515 {
1516     const unsigned *aa = (const unsigned *)a;
1517     const unsigned *bb = (const unsigned *)b;
1518     if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1519     if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1520     return 0;
1521 };
1522 
1523 
1524 static int
1525 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1526 {
1527     unsigned i;
1528     const unsigned *aa = *((const unsigned **)a);
1529     const unsigned *bb = *((const unsigned **)b);
1530     for (i = maxIndex; ; i--) {
1531         if (aa[i] < bb[i]) return -1;
1532         if (aa[i] > bb[i]) return 1;
1533         if (i == osIdIndex) break;
1534     }
1535     return 0;
1536 }
1537 
1538 
1539 //
1540 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1541 // affinity map.
1542 //
1543 static int
1544 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1545   kmp_i18n_id_t *const msg_id, FILE *f)
1546 {
1547     *address2os = NULL;
1548     *msg_id = kmp_i18n_null;
1549 
1550     //
1551     // Scan of the file, and count the number of "processor" (osId) fields,
1552     // and find the highest value of <n> for a node_<n> field.
1553     //
1554     char buf[256];
1555     unsigned num_records = 0;
1556     while (! feof(f)) {
1557         buf[sizeof(buf) - 1] = 1;
1558         if (! fgets(buf, sizeof(buf), f)) {
1559             //
1560             // Read errors presumably because of EOF
1561             //
1562             break;
1563         }
1564 
1565         char s1[] = "processor";
1566         if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1567             num_records++;
1568             continue;
1569         }
1570 
1571         //
1572         // FIXME - this will match "node_<n> <garbage>"
1573         //
1574         unsigned level;
1575         if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
1576             if (nodeIdIndex + level >= maxIndex) {
1577                 maxIndex = nodeIdIndex + level;
1578             }
1579             continue;
1580         }
1581     }
1582 
1583     //
1584     // Check for empty file / no valid processor records, or too many.
1585     // The number of records can't exceed the number of valid bits in the
1586     // affinity mask.
1587     //
1588     if (num_records == 0) {
1589         *line = 0;
1590         *msg_id = kmp_i18n_str_NoProcRecords;
1591         return -1;
1592     }
1593     if (num_records > (unsigned)__kmp_xproc) {
1594         *line = 0;
1595         *msg_id = kmp_i18n_str_TooManyProcRecords;
1596         return -1;
1597     }
1598 
1599     //
1600     // Set the file pointer back to the begginning, so that we can scan the
1601     // file again, this time performing a full parse of the data.
1602     // Allocate a vector of ProcCpuInfo object, where we will place the data.
1603     // Adding an extra element at the end allows us to remove a lot of extra
1604     // checks for termination conditions.
1605     //
1606     if (fseek(f, 0, SEEK_SET) != 0) {
1607         *line = 0;
1608         *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1609         return -1;
1610     }
1611 
1612     //
1613     // Allocate the array of records to store the proc info in.  The dummy
1614     // element at the end makes the logic in filling them out easier to code.
1615     //
1616     unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1617       * sizeof(unsigned *));
1618     unsigned i;
1619     for (i = 0; i <= num_records; i++) {
1620         threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1621           * sizeof(unsigned));
1622     }
1623 
1624 #define CLEANUP_THREAD_INFO \
1625     for (i = 0; i <= num_records; i++) {                                \
1626         __kmp_free(threadInfo[i]);                                      \
1627     }                                                                   \
1628     __kmp_free(threadInfo);
1629 
1630     //
1631     // A value of UINT_MAX means that we didn't find the field
1632     //
1633     unsigned __index;
1634 
1635 #define INIT_PROC_INFO(p) \
1636     for (__index = 0; __index <= maxIndex; __index++) {                 \
1637         (p)[__index] = UINT_MAX;                                        \
1638     }
1639 
1640     for (i = 0; i <= num_records; i++) {
1641         INIT_PROC_INFO(threadInfo[i]);
1642     }
1643 
1644     unsigned num_avail = 0;
1645     *line = 0;
1646     while (! feof(f)) {
1647         //
1648         // Create an inner scoping level, so that all the goto targets at the
1649         // end of the loop appear in an outer scoping level.  This avoids
1650         // warnings about jumping past an initialization to a target in the
1651         // same block.
1652         //
1653         {
1654             buf[sizeof(buf) - 1] = 1;
1655             bool long_line = false;
1656             if (! fgets(buf, sizeof(buf), f)) {
1657                 //
1658                 // Read errors presumably because of EOF
1659                 //
1660                 // If there is valid data in threadInfo[num_avail], then fake
1661                 // a blank line in ensure that the last address gets parsed.
1662                 //
1663                 bool valid = false;
1664                 for (i = 0; i <= maxIndex; i++) {
1665                     if (threadInfo[num_avail][i] != UINT_MAX) {
1666                         valid = true;
1667                     }
1668                 }
1669                 if (! valid) {
1670                     break;
1671                 }
1672                 buf[0] = 0;
1673             } else if (!buf[sizeof(buf) - 1]) {
1674                 //
1675                 // The line is longer than the buffer.  Set a flag and don't
1676                 // emit an error if we were going to ignore the line, anyway.
1677                 //
1678                 long_line = true;
1679 
1680 #define CHECK_LINE \
1681     if (long_line) {                                                    \
1682         CLEANUP_THREAD_INFO;                                            \
1683         *msg_id = kmp_i18n_str_LongLineCpuinfo;                         \
1684         return -1;                                                      \
1685     }
1686             }
1687             (*line)++;
1688 
1689             char s1[] = "processor";
1690             if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1691                 CHECK_LINE;
1692                 char *p = strchr(buf + sizeof(s1) - 1, ':');
1693                 unsigned val;
1694                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
1695                 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
1696                 threadInfo[num_avail][osIdIndex] = val;
1697 #if KMP_OS_LINUX && USE_SYSFS_INFO
1698                 char path[256];
1699                 KMP_SNPRINTF(path, sizeof(path),
1700                     "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
1701                     threadInfo[num_avail][osIdIndex]);
1702                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
1703 
1704                 KMP_SNPRINTF(path, sizeof(path),
1705                     "/sys/devices/system/cpu/cpu%u/topology/core_id",
1706                     threadInfo[num_avail][osIdIndex]);
1707                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
1708                 continue;
1709 #else
1710             }
1711             char s2[] = "physical id";
1712             if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
1713                 CHECK_LINE;
1714                 char *p = strchr(buf + sizeof(s2) - 1, ':');
1715                 unsigned val;
1716                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
1717                 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
1718                 threadInfo[num_avail][pkgIdIndex] = val;
1719                 continue;
1720             }
1721             char s3[] = "core id";
1722             if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
1723                 CHECK_LINE;
1724                 char *p = strchr(buf + sizeof(s3) - 1, ':');
1725                 unsigned val;
1726                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
1727                 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
1728                 threadInfo[num_avail][coreIdIndex] = val;
1729                 continue;
1730 #endif // KMP_OS_LINUX && USE_SYSFS_INFO
1731             }
1732             char s4[] = "thread id";
1733             if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
1734                 CHECK_LINE;
1735                 char *p = strchr(buf + sizeof(s4) - 1, ':');
1736                 unsigned val;
1737                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
1738                 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
1739                 threadInfo[num_avail][threadIdIndex] = val;
1740                 continue;
1741             }
1742             unsigned level;
1743             if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
1744                 CHECK_LINE;
1745                 char *p = strchr(buf + sizeof(s4) - 1, ':');
1746                 unsigned val;
1747                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
1748                 KMP_ASSERT(nodeIdIndex + level <= maxIndex);
1749                 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
1750                 threadInfo[num_avail][nodeIdIndex + level] = val;
1751                 continue;
1752             }
1753 
1754             //
1755             // We didn't recognize the leading token on the line.
1756             // There are lots of leading tokens that we don't recognize -
1757             // if the line isn't empty, go on to the next line.
1758             //
1759             if ((*buf != 0) && (*buf != '\n')) {
1760                 //
1761                 // If the line is longer than the buffer, read characters
1762                 // until we find a newline.
1763                 //
1764                 if (long_line) {
1765                     int ch;
1766                     while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
1767                 }
1768                 continue;
1769             }
1770 
1771             //
1772             // A newline has signalled the end of the processor record.
1773             // Check that there aren't too many procs specified.
1774             //
1775             if ((int)num_avail == __kmp_xproc) {
1776                 CLEANUP_THREAD_INFO;
1777                 *msg_id = kmp_i18n_str_TooManyEntries;
1778                 return -1;
1779             }
1780 
1781             //
1782             // Check for missing fields.  The osId field must be there, and we
1783             // currently require that the physical id field is specified, also.
1784             //
1785             if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
1786                 CLEANUP_THREAD_INFO;
1787                 *msg_id = kmp_i18n_str_MissingProcField;
1788                 return -1;
1789             }
1790             if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
1791                 CLEANUP_THREAD_INFO;
1792                 *msg_id = kmp_i18n_str_MissingPhysicalIDField;
1793                 return -1;
1794             }
1795 
1796             //
1797             // Skip this proc if it is not included in the machine model.
1798             //
1799             if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
1800                 INIT_PROC_INFO(threadInfo[num_avail]);
1801                 continue;
1802             }
1803 
1804             //
1805             // We have a successful parse of this proc's info.
1806             // Increment the counter, and prepare for the next proc.
1807             //
1808             num_avail++;
1809             KMP_ASSERT(num_avail <= num_records);
1810             INIT_PROC_INFO(threadInfo[num_avail]);
1811         }
1812         continue;
1813 
1814         no_val:
1815         CLEANUP_THREAD_INFO;
1816         *msg_id = kmp_i18n_str_MissingValCpuinfo;
1817         return -1;
1818 
1819         dup_field:
1820         CLEANUP_THREAD_INFO;
1821         *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
1822         return -1;
1823     }
1824     *line = 0;
1825 
1826 # if KMP_MIC && REDUCE_TEAM_SIZE
1827     unsigned teamSize = 0;
1828 # endif // KMP_MIC && REDUCE_TEAM_SIZE
1829 
1830     // check for num_records == __kmp_xproc ???
1831 
1832     //
1833     // If there's only one thread context to bind to, form an Address object
1834     // with depth 1 and return immediately (or, if affinity is off, set
1835     // address2os to NULL and return).
1836     //
1837     // If it is configured to omit the package level when there is only a
1838     // single package, the logic at the end of this routine won't work if
1839     // there is only a single thread - it would try to form an Address
1840     // object with depth 0.
1841     //
1842     KMP_ASSERT(num_avail > 0);
1843     KMP_ASSERT(num_avail <= num_records);
1844     if (num_avail == 1) {
1845         __kmp_ncores = 1;
1846         __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1847         if (__kmp_affinity_verbose) {
1848             if (! KMP_AFFINITY_CAPABLE()) {
1849                 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
1850                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1851                 KMP_INFORM(Uniform, "KMP_AFFINITY");
1852             }
1853             else {
1854                 char buf[KMP_AFFIN_MASK_PRINT_LEN];
1855                 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
1856                   fullMask);
1857                 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
1858                 if (__kmp_affinity_respect_mask) {
1859                     KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1860                 } else {
1861                     KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1862                 }
1863                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1864                 KMP_INFORM(Uniform, "KMP_AFFINITY");
1865             }
1866             int index;
1867             kmp_str_buf_t buf;
1868             __kmp_str_buf_init(&buf);
1869             __kmp_str_buf_print(&buf, "1");
1870             for (index = maxIndex - 1; index > pkgIdIndex; index--) {
1871                 __kmp_str_buf_print(&buf, " x 1");
1872             }
1873             KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
1874             __kmp_str_buf_free(&buf);
1875         }
1876 
1877         if (__kmp_affinity_type == affinity_none) {
1878             CLEANUP_THREAD_INFO;
1879             return 0;
1880         }
1881 
1882         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
1883         Address addr(1);
1884         addr.labels[0] = threadInfo[0][pkgIdIndex];
1885         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
1886 
1887         if (__kmp_affinity_gran_levels < 0) {
1888             __kmp_affinity_gran_levels = 0;
1889         }
1890 
1891         if (__kmp_affinity_verbose) {
1892             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1893         }
1894 
1895         CLEANUP_THREAD_INFO;
1896         return 1;
1897     }
1898 
1899     //
1900     // Sort the threadInfo table by physical Id.
1901     //
1902     qsort(threadInfo, num_avail, sizeof(*threadInfo),
1903       __kmp_affinity_cmp_ProcCpuInfo_phys_id);
1904 
1905     //
1906     // The table is now sorted by pkgId / coreId / threadId, but we really
1907     // don't know the radix of any of the fields.  pkgId's may be sparsely
1908     // assigned among the chips on a system.  Although coreId's are usually
1909     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1910     // [0..threadsPerCore-1], we don't want to make any such assumptions.
1911     //
1912     // For that matter, we don't know what coresPerPkg and threadsPerCore
1913     // (or the total # packages) are at this point - we want to determine
1914     // that now.  We only have an upper bound on the first two figures.
1915     //
1916     unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
1917       * sizeof(unsigned));
1918     unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
1919       * sizeof(unsigned));
1920     unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
1921       * sizeof(unsigned));
1922     unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
1923       * sizeof(unsigned));
1924 
1925     bool assign_thread_ids = false;
1926     unsigned threadIdCt;
1927     unsigned index;
1928 
1929     restart_radix_check:
1930     threadIdCt = 0;
1931 
1932     //
1933     // Initialize the counter arrays with data from threadInfo[0].
1934     //
1935     if (assign_thread_ids) {
1936         if (threadInfo[0][threadIdIndex] == UINT_MAX) {
1937             threadInfo[0][threadIdIndex] = threadIdCt++;
1938         }
1939         else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
1940             threadIdCt = threadInfo[0][threadIdIndex] + 1;
1941         }
1942     }
1943     for (index = 0; index <= maxIndex; index++) {
1944         counts[index] = 1;
1945         maxCt[index] = 1;
1946         totals[index] = 1;
1947         lastId[index] = threadInfo[0][index];;
1948     }
1949 
1950     //
1951     // Run through the rest of the OS procs.
1952     //
1953     for (i = 1; i < num_avail; i++) {
1954         //
1955         // Find the most significant index whose id differs
1956         // from the id for the previous OS proc.
1957         //
1958         for (index = maxIndex; index >= threadIdIndex; index--) {
1959             if (assign_thread_ids && (index == threadIdIndex)) {
1960                 //
1961                 // Auto-assign the thread id field if it wasn't specified.
1962                 //
1963                 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
1964                     threadInfo[i][threadIdIndex] = threadIdCt++;
1965                 }
1966 
1967                 //
1968                 // Aparrently the thread id field was specified for some
1969                 // entries and not others.  Start the thread id counter
1970                 // off at the next higher thread id.
1971                 //
1972                 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
1973                     threadIdCt = threadInfo[i][threadIdIndex] + 1;
1974                 }
1975             }
1976             if (threadInfo[i][index] != lastId[index]) {
1977                 //
1978                 // Run through all indices which are less significant,
1979                 // and reset the counts to 1.
1980                 //
1981                 // At all levels up to and including index, we need to
1982                 // increment the totals and record the last id.
1983                 //
1984                 unsigned index2;
1985                 for (index2 = threadIdIndex; index2 < index; index2++) {
1986                     totals[index2]++;
1987                     if (counts[index2] > maxCt[index2]) {
1988                         maxCt[index2] = counts[index2];
1989                     }
1990                     counts[index2] = 1;
1991                     lastId[index2] = threadInfo[i][index2];
1992                 }
1993                 counts[index]++;
1994                 totals[index]++;
1995                 lastId[index] = threadInfo[i][index];
1996 
1997                 if (assign_thread_ids && (index > threadIdIndex)) {
1998 
1999 # if KMP_MIC && REDUCE_TEAM_SIZE
2000                     //
2001                     // The default team size is the total #threads in the machine
2002                     // minus 1 thread for every core that has 3 or more threads.
2003                     //
2004                     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2005 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2006 
2007                     //
2008                     // Restart the thread counter, as we are on a new core.
2009                     //
2010                     threadIdCt = 0;
2011 
2012                     //
2013                     // Auto-assign the thread id field if it wasn't specified.
2014                     //
2015                     if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2016                         threadInfo[i][threadIdIndex] = threadIdCt++;
2017                     }
2018 
2019                     //
2020                     // Aparrently the thread id field was specified for some
2021                     // entries and not others.  Start the thread id counter
2022                     // off at the next higher thread id.
2023                     //
2024                     else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2025                         threadIdCt = threadInfo[i][threadIdIndex] + 1;
2026                     }
2027                 }
2028                 break;
2029             }
2030         }
2031         if (index < threadIdIndex) {
2032             //
2033             // If thread ids were specified, it is an error if they are not
2034             // unique.  Also, check that we waven't already restarted the
2035             // loop (to be safe - shouldn't need to).
2036             //
2037             if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2038               || assign_thread_ids) {
2039                 __kmp_free(lastId);
2040                 __kmp_free(totals);
2041                 __kmp_free(maxCt);
2042                 __kmp_free(counts);
2043                 CLEANUP_THREAD_INFO;
2044                 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2045                 return -1;
2046             }
2047 
2048             //
2049             // If the thread ids were not specified and we see entries
2050             // entries that are duplicates, start the loop over and
2051             // assign the thread ids manually.
2052             //
2053             assign_thread_ids = true;
2054             goto restart_radix_check;
2055         }
2056     }
2057 
2058 # if KMP_MIC && REDUCE_TEAM_SIZE
2059     //
2060     // The default team size is the total #threads in the machine
2061     // minus 1 thread for every core that has 3 or more threads.
2062     //
2063     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2064 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2065 
2066     for (index = threadIdIndex; index <= maxIndex; index++) {
2067         if (counts[index] > maxCt[index]) {
2068             maxCt[index] = counts[index];
2069         }
2070     }
2071 
2072     __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2073     nCoresPerPkg = maxCt[coreIdIndex];
2074     nPackages = totals[pkgIdIndex];
2075 
2076     //
2077     // Check to see if the machine topology is uniform
2078     //
2079     unsigned prod = totals[maxIndex];
2080     for (index = threadIdIndex; index < maxIndex; index++) {
2081        prod *= maxCt[index];
2082     }
2083     bool uniform = (prod == totals[threadIdIndex]);
2084 
2085     //
2086     // When affinity is off, this routine will still be called to set
2087     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
2088     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
2089     // correctly, and return now if affinity is not enabled.
2090     //
2091     __kmp_ncores = totals[coreIdIndex];
2092 
2093     if (__kmp_affinity_verbose) {
2094         if (! KMP_AFFINITY_CAPABLE()) {
2095                 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2096                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2097                 if (uniform) {
2098                     KMP_INFORM(Uniform, "KMP_AFFINITY");
2099                 } else {
2100                     KMP_INFORM(NonUniform, "KMP_AFFINITY");
2101                 }
2102         }
2103         else {
2104             char buf[KMP_AFFIN_MASK_PRINT_LEN];
2105             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
2106                 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2107                 if (__kmp_affinity_respect_mask) {
2108                     KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2109                 } else {
2110                     KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2111                 }
2112                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2113                 if (uniform) {
2114                     KMP_INFORM(Uniform, "KMP_AFFINITY");
2115                 } else {
2116                     KMP_INFORM(NonUniform, "KMP_AFFINITY");
2117                 }
2118         }
2119         kmp_str_buf_t buf;
2120         __kmp_str_buf_init(&buf);
2121 
2122         __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2123         for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2124             __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2125         }
2126         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str,  maxCt[coreIdIndex],
2127           maxCt[threadIdIndex], __kmp_ncores);
2128 
2129         __kmp_str_buf_free(&buf);
2130     }
2131 
2132 # if KMP_MIC && REDUCE_TEAM_SIZE
2133     //
2134     // Set the default team size.
2135     //
2136     if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2137         __kmp_dflt_team_nth = teamSize;
2138         KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2139           __kmp_dflt_team_nth));
2140     }
2141 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2142 
2143     if (__kmp_affinity_type == affinity_none) {
2144         __kmp_free(lastId);
2145         __kmp_free(totals);
2146         __kmp_free(maxCt);
2147         __kmp_free(counts);
2148         CLEANUP_THREAD_INFO;
2149         return 0;
2150     }
2151 
2152     //
2153     // Count the number of levels which have more nodes at that level than
2154     // at the parent's level (with there being an implicit root node of
2155     // the top level).  This is equivalent to saying that there is at least
2156     // one node at this level which has a sibling.  These levels are in the
2157     // map, and the package level is always in the map.
2158     //
2159     bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2160     int level = 0;
2161     for (index = threadIdIndex; index < maxIndex; index++) {
2162         KMP_ASSERT(totals[index] >= totals[index + 1]);
2163         inMap[index] = (totals[index] > totals[index + 1]);
2164     }
2165     inMap[maxIndex] = (totals[maxIndex] > 1);
2166     inMap[pkgIdIndex] = true;
2167 
2168     int depth = 0;
2169     for (index = threadIdIndex; index <= maxIndex; index++) {
2170         if (inMap[index]) {
2171             depth++;
2172         }
2173     }
2174     KMP_ASSERT(depth > 0);
2175 
2176     //
2177     // Construct the data structure that is to be returned.
2178     //
2179     *address2os = (AddrUnsPair*)
2180       __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2181     int pkgLevel = -1;
2182     int coreLevel = -1;
2183     int threadLevel = -1;
2184 
2185     for (i = 0; i < num_avail; ++i) {
2186         Address addr(depth);
2187         unsigned os = threadInfo[i][osIdIndex];
2188         int src_index;
2189         int dst_index = 0;
2190 
2191         for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2192             if (! inMap[src_index]) {
2193                 continue;
2194             }
2195             addr.labels[dst_index] = threadInfo[i][src_index];
2196             if (src_index == pkgIdIndex) {
2197                 pkgLevel = dst_index;
2198             }
2199             else if (src_index == coreIdIndex) {
2200                 coreLevel = dst_index;
2201             }
2202             else if (src_index == threadIdIndex) {
2203                 threadLevel = dst_index;
2204             }
2205             dst_index++;
2206         }
2207         (*address2os)[i] = AddrUnsPair(addr, os);
2208     }
2209 
2210     if (__kmp_affinity_gran_levels < 0) {
2211         //
2212         // Set the granularity level based on what levels are modeled
2213         // in the machine topology map.
2214         //
2215         unsigned src_index;
2216         __kmp_affinity_gran_levels = 0;
2217         for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2218             if (! inMap[src_index]) {
2219                 continue;
2220             }
2221             switch (src_index) {
2222                 case threadIdIndex:
2223                 if (__kmp_affinity_gran > affinity_gran_thread) {
2224                     __kmp_affinity_gran_levels++;
2225                 }
2226 
2227                 break;
2228                 case coreIdIndex:
2229                 if (__kmp_affinity_gran > affinity_gran_core) {
2230                     __kmp_affinity_gran_levels++;
2231                 }
2232                 break;
2233 
2234                 case pkgIdIndex:
2235                 if (__kmp_affinity_gran > affinity_gran_package) {
2236                     __kmp_affinity_gran_levels++;
2237                 }
2238                 break;
2239             }
2240         }
2241     }
2242 
2243     if (__kmp_affinity_verbose) {
2244         __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2245           coreLevel, threadLevel);
2246     }
2247 
2248     __kmp_free(inMap);
2249     __kmp_free(lastId);
2250     __kmp_free(totals);
2251     __kmp_free(maxCt);
2252     __kmp_free(counts);
2253     CLEANUP_THREAD_INFO;
2254     return depth;
2255 }
2256 
2257 
2258 //
2259 // Create and return a table of affinity masks, indexed by OS thread ID.
2260 // This routine handles OR'ing together all the affinity masks of threads
2261 // that are sufficiently close, if granularity > fine.
2262 //
2263 static kmp_affin_mask_t *
2264 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2265   AddrUnsPair *address2os, unsigned numAddrs)
2266 {
2267     //
2268     // First form a table of affinity masks in order of OS thread id.
2269     //
2270     unsigned depth;
2271     unsigned maxOsId;
2272     unsigned i;
2273 
2274     KMP_ASSERT(numAddrs > 0);
2275     depth = address2os[0].first.depth;
2276 
2277     maxOsId = 0;
2278     for (i = 0; i < numAddrs; i++) {
2279         unsigned osId = address2os[i].second;
2280         if (osId > maxOsId) {
2281             maxOsId = osId;
2282         }
2283     }
2284     kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
2285       (maxOsId + 1) * __kmp_affin_mask_size);
2286 
2287     //
2288     // Sort the address2os table according to physical order.  Doing so
2289     // will put all threads on the same core/package/node in consecutive
2290     // locations.
2291     //
2292     qsort(address2os, numAddrs, sizeof(*address2os),
2293       __kmp_affinity_cmp_Address_labels);
2294 
2295     KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2296     if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2297         KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY",  __kmp_affinity_gran_levels);
2298     }
2299     if (__kmp_affinity_gran_levels >= (int)depth) {
2300         if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2301           && (__kmp_affinity_type != affinity_none))) {
2302             KMP_WARNING(AffThreadsMayMigrate);
2303         }
2304     }
2305 
2306     //
2307     // Run through the table, forming the masks for all threads on each
2308     // core.  Threads on the same core will have identical "Address"
2309     // objects, not considering the last level, which must be the thread
2310     // id.  All threads on a core will appear consecutively.
2311     //
2312     unsigned unique = 0;
2313     unsigned j = 0;                             // index of 1st thread on core
2314     unsigned leader = 0;
2315     Address *leaderAddr = &(address2os[0].first);
2316     kmp_affin_mask_t *sum
2317       = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
2318     KMP_CPU_ZERO(sum);
2319     KMP_CPU_SET(address2os[0].second, sum);
2320     for (i = 1; i < numAddrs; i++) {
2321         //
2322         // If this thread is sufficiently close to the leader (within the
2323         // granularity setting), then set the bit for this os thread in the
2324         // affinity mask for this group, and go on to the next thread.
2325         //
2326         if (leaderAddr->isClose(address2os[i].first,
2327           __kmp_affinity_gran_levels)) {
2328             KMP_CPU_SET(address2os[i].second, sum);
2329             continue;
2330         }
2331 
2332         //
2333         // For every thread in this group, copy the mask to the thread's
2334         // entry in the osId2Mask table.  Mark the first address as a
2335         // leader.
2336         //
2337         for (; j < i; j++) {
2338             unsigned osId = address2os[j].second;
2339             KMP_DEBUG_ASSERT(osId <= maxOsId);
2340             kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2341             KMP_CPU_COPY(mask, sum);
2342             address2os[j].first.leader = (j == leader);
2343         }
2344         unique++;
2345 
2346         //
2347         // Start a new mask.
2348         //
2349         leader = i;
2350         leaderAddr = &(address2os[i].first);
2351         KMP_CPU_ZERO(sum);
2352         KMP_CPU_SET(address2os[i].second, sum);
2353     }
2354 
2355     //
2356     // For every thread in last group, copy the mask to the thread's
2357     // entry in the osId2Mask table.
2358     //
2359     for (; j < i; j++) {
2360         unsigned osId = address2os[j].second;
2361         KMP_DEBUG_ASSERT(osId <= maxOsId);
2362         kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2363         KMP_CPU_COPY(mask, sum);
2364         address2os[j].first.leader = (j == leader);
2365     }
2366     unique++;
2367 
2368     *maxIndex = maxOsId;
2369     *numUnique = unique;
2370     return osId2Mask;
2371 }
2372 
2373 
2374 //
2375 // Stuff for the affinity proclist parsers.  It's easier to declare these vars
2376 // as file-static than to try and pass them through the calling sequence of
2377 // the recursive-descent OMP_PLACES parser.
2378 //
2379 static kmp_affin_mask_t *newMasks;
2380 static int numNewMasks;
2381 static int nextNewMask;
2382 
2383 #define ADD_MASK(_mask) \
2384     {                                                                   \
2385         if (nextNewMask >= numNewMasks) {                               \
2386             numNewMasks *= 2;                                           \
2387             newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
2388               numNewMasks * __kmp_affin_mask_size);                     \
2389         }                                                               \
2390         KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));    \
2391         nextNewMask++;                                                  \
2392     }
2393 
2394 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2395     {                                                                   \
2396         if (((_osId) > _maxOsId) ||                                     \
2397           (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
2398             if (__kmp_affinity_verbose || (__kmp_affinity_warnings      \
2399               && (__kmp_affinity_type != affinity_none))) {             \
2400                 KMP_WARNING(AffIgnoreInvalidProcID, _osId);             \
2401             }                                                           \
2402         }                                                               \
2403         else {                                                          \
2404             ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));               \
2405         }                                                               \
2406     }
2407 
2408 
2409 //
2410 // Re-parse the proclist (for the explicit affinity type), and form the list
2411 // of affinity newMasks indexed by gtid.
2412 //
2413 static void
2414 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2415   unsigned int *out_numMasks, const char *proclist,
2416   kmp_affin_mask_t *osId2Mask, int maxOsId)
2417 {
2418     const char *scan = proclist;
2419     const char *next = proclist;
2420 
2421     //
2422     // We use malloc() for the temporary mask vector,
2423     // so that we can use realloc() to extend it.
2424     //
2425     numNewMasks = 2;
2426     newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2427       * __kmp_affin_mask_size);
2428     nextNewMask = 0;
2429     kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
2430       __kmp_affin_mask_size);
2431     int setSize = 0;
2432 
2433     for (;;) {
2434         int start, end, stride;
2435 
2436         SKIP_WS(scan);
2437         next = scan;
2438         if (*next == '\0') {
2439             break;
2440         }
2441 
2442         if (*next == '{') {
2443             int num;
2444             setSize = 0;
2445             next++;     // skip '{'
2446             SKIP_WS(next);
2447             scan = next;
2448 
2449             //
2450             // Read the first integer in the set.
2451             //
2452             KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2453               "bad proclist");
2454             SKIP_DIGITS(next);
2455             num = __kmp_str_to_int(scan, *next);
2456             KMP_ASSERT2(num >= 0, "bad explicit proc list");
2457 
2458             //
2459             // Copy the mask for that osId to the sum (union) mask.
2460             //
2461             if ((num > maxOsId) ||
2462               (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2463                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2464                   && (__kmp_affinity_type != affinity_none))) {
2465                     KMP_WARNING(AffIgnoreInvalidProcID, num);
2466                 }
2467                 KMP_CPU_ZERO(sumMask);
2468             }
2469             else {
2470                 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2471                 setSize = 1;
2472             }
2473 
2474             for (;;) {
2475                 //
2476                 // Check for end of set.
2477                 //
2478                 SKIP_WS(next);
2479                 if (*next == '}') {
2480                     next++;     // skip '}'
2481                     break;
2482                 }
2483 
2484                 //
2485                 // Skip optional comma.
2486                 //
2487                 if (*next == ',') {
2488                     next++;
2489                 }
2490                 SKIP_WS(next);
2491 
2492                 //
2493                 // Read the next integer in the set.
2494                 //
2495                 scan = next;
2496                 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2497                   "bad explicit proc list");
2498 
2499                 SKIP_DIGITS(next);
2500                 num = __kmp_str_to_int(scan, *next);
2501                 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2502 
2503                 //
2504                 // Add the mask for that osId to the sum mask.
2505                 //
2506                 if ((num > maxOsId) ||
2507                   (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2508                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2509                       && (__kmp_affinity_type != affinity_none))) {
2510                         KMP_WARNING(AffIgnoreInvalidProcID, num);
2511                     }
2512                 }
2513                 else {
2514                     KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2515                     setSize++;
2516                 }
2517             }
2518             if (setSize > 0) {
2519                 ADD_MASK(sumMask);
2520             }
2521 
2522             SKIP_WS(next);
2523             if (*next == ',') {
2524                 next++;
2525             }
2526             scan = next;
2527             continue;
2528         }
2529 
2530         //
2531         // Read the first integer.
2532         //
2533         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2534         SKIP_DIGITS(next);
2535         start = __kmp_str_to_int(scan, *next);
2536         KMP_ASSERT2(start >= 0, "bad explicit proc list");
2537         SKIP_WS(next);
2538 
2539         //
2540         // If this isn't a range, then add a mask to the list and go on.
2541         //
2542         if (*next != '-') {
2543             ADD_MASK_OSID(start, osId2Mask, maxOsId);
2544 
2545             //
2546             // Skip optional comma.
2547             //
2548             if (*next == ',') {
2549                 next++;
2550             }
2551             scan = next;
2552             continue;
2553         }
2554 
2555         //
2556         // This is a range.  Skip over the '-' and read in the 2nd int.
2557         //
2558         next++;         // skip '-'
2559         SKIP_WS(next);
2560         scan = next;
2561         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2562         SKIP_DIGITS(next);
2563         end = __kmp_str_to_int(scan, *next);
2564         KMP_ASSERT2(end >= 0, "bad explicit proc list");
2565 
2566         //
2567         // Check for a stride parameter
2568         //
2569         stride = 1;
2570         SKIP_WS(next);
2571         if (*next == ':') {
2572             //
2573             // A stride is specified.  Skip over the ':" and read the 3rd int.
2574             //
2575             int sign = +1;
2576             next++;         // skip ':'
2577             SKIP_WS(next);
2578             scan = next;
2579             if (*next == '-') {
2580                 sign = -1;
2581                 next++;
2582                 SKIP_WS(next);
2583                 scan = next;
2584             }
2585             KMP_ASSERT2((*next >=  '0') && (*next <= '9'),
2586               "bad explicit proc list");
2587             SKIP_DIGITS(next);
2588             stride = __kmp_str_to_int(scan, *next);
2589             KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2590             stride *= sign;
2591         }
2592 
2593         //
2594         // Do some range checks.
2595         //
2596         KMP_ASSERT2(stride != 0, "bad explicit proc list");
2597         if (stride > 0) {
2598             KMP_ASSERT2(start <= end, "bad explicit proc list");
2599         }
2600         else {
2601             KMP_ASSERT2(start >= end, "bad explicit proc list");
2602         }
2603         KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2604 
2605         //
2606         // Add the mask for each OS proc # to the list.
2607         //
2608         if (stride > 0) {
2609             do {
2610                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2611                 start += stride;
2612             } while (start <= end);
2613         }
2614         else {
2615             do {
2616                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2617                 start += stride;
2618             } while (start >= end);
2619         }
2620 
2621         //
2622         // Skip optional comma.
2623         //
2624         SKIP_WS(next);
2625         if (*next == ',') {
2626             next++;
2627         }
2628         scan = next;
2629     }
2630 
2631     *out_numMasks = nextNewMask;
2632     if (nextNewMask == 0) {
2633         *out_masks = NULL;
2634         KMP_INTERNAL_FREE(newMasks);
2635         return;
2636     }
2637     *out_masks
2638       = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
2639     KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
2640     __kmp_free(sumMask);
2641     KMP_INTERNAL_FREE(newMasks);
2642 }
2643 
2644 
2645 # if OMP_40_ENABLED
2646 
2647 /*-----------------------------------------------------------------------------
2648 
2649 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
2650 places.  Again, Here is the grammar:
2651 
2652 place_list := place
2653 place_list := place , place_list
2654 place := num
2655 place := place : num
2656 place := place : num : signed
2657 place := { subplacelist }
2658 place := ! place                  // (lowest priority)
2659 subplace_list := subplace
2660 subplace_list := subplace , subplace_list
2661 subplace := num
2662 subplace := num : num
2663 subplace := num : num : signed
2664 signed := num
2665 signed := + signed
2666 signed := - signed
2667 
2668 -----------------------------------------------------------------------------*/
2669 
2670 static void
2671 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
2672   int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
2673 {
2674     const char *next;
2675 
2676     for (;;) {
2677         int start, count, stride, i;
2678 
2679         //
2680         // Read in the starting proc id
2681         //
2682         SKIP_WS(*scan);
2683         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2684           "bad explicit places list");
2685         next = *scan;
2686         SKIP_DIGITS(next);
2687         start = __kmp_str_to_int(*scan, *next);
2688         KMP_ASSERT(start >= 0);
2689         *scan = next;
2690 
2691         //
2692         // valid follow sets are ',' ':' and '}'
2693         //
2694         SKIP_WS(*scan);
2695         if (**scan == '}' || **scan == ',') {
2696             if ((start > maxOsId) ||
2697               (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2698                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2699                   && (__kmp_affinity_type != affinity_none))) {
2700                     KMP_WARNING(AffIgnoreInvalidProcID, start);
2701                 }
2702             }
2703             else {
2704                 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2705                 (*setSize)++;
2706             }
2707             if (**scan == '}') {
2708                 break;
2709             }
2710             (*scan)++;  // skip ','
2711             continue;
2712         }
2713         KMP_ASSERT2(**scan == ':', "bad explicit places list");
2714         (*scan)++;      // skip ':'
2715 
2716         //
2717         // Read count parameter
2718         //
2719         SKIP_WS(*scan);
2720         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2721           "bad explicit places list");
2722         next = *scan;
2723         SKIP_DIGITS(next);
2724         count = __kmp_str_to_int(*scan, *next);
2725         KMP_ASSERT(count >= 0);
2726         *scan = next;
2727 
2728         //
2729         // valid follow sets are ',' ':' and '}'
2730         //
2731         SKIP_WS(*scan);
2732         if (**scan == '}' || **scan == ',') {
2733             for (i = 0; i < count; i++) {
2734                 if ((start > maxOsId) ||
2735                   (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2736                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2737                       && (__kmp_affinity_type != affinity_none))) {
2738                         KMP_WARNING(AffIgnoreInvalidProcID, start);
2739                     }
2740                     break;  // don't proliferate warnings for large count
2741                 }
2742                 else {
2743                     KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2744                     start++;
2745                     (*setSize)++;
2746                 }
2747             }
2748             if (**scan == '}') {
2749                 break;
2750             }
2751             (*scan)++;  // skip ','
2752             continue;
2753         }
2754         KMP_ASSERT2(**scan == ':', "bad explicit places list");
2755         (*scan)++;      // skip ':'
2756 
2757         //
2758         // Read stride parameter
2759         //
2760         int sign = +1;
2761         for (;;) {
2762             SKIP_WS(*scan);
2763             if (**scan == '+') {
2764                 (*scan)++; // skip '+'
2765                 continue;
2766             }
2767             if (**scan == '-') {
2768                 sign *= -1;
2769                 (*scan)++; // skip '-'
2770                 continue;
2771             }
2772             break;
2773         }
2774         SKIP_WS(*scan);
2775         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2776           "bad explicit places list");
2777         next = *scan;
2778         SKIP_DIGITS(next);
2779         stride = __kmp_str_to_int(*scan, *next);
2780         KMP_ASSERT(stride >= 0);
2781         *scan = next;
2782         stride *= sign;
2783 
2784         //
2785         // valid follow sets are ',' and '}'
2786         //
2787         SKIP_WS(*scan);
2788         if (**scan == '}' || **scan == ',') {
2789             for (i = 0; i < count; i++) {
2790                 if ((start > maxOsId) ||
2791                   (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2792                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2793                       && (__kmp_affinity_type != affinity_none))) {
2794                         KMP_WARNING(AffIgnoreInvalidProcID, start);
2795                     }
2796                     break;  // don't proliferate warnings for large count
2797                 }
2798                 else {
2799                     KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2800                     start += stride;
2801                     (*setSize)++;
2802                 }
2803             }
2804             if (**scan == '}') {
2805                 break;
2806             }
2807             (*scan)++;  // skip ','
2808             continue;
2809         }
2810 
2811         KMP_ASSERT2(0, "bad explicit places list");
2812     }
2813 }
2814 
2815 
2816 static void
2817 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
2818   int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
2819 {
2820     const char *next;
2821 
2822     //
2823     // valid follow sets are '{' '!' and num
2824     //
2825     SKIP_WS(*scan);
2826     if (**scan == '{') {
2827         (*scan)++;      // skip '{'
2828         __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
2829           setSize);
2830         KMP_ASSERT2(**scan == '}', "bad explicit places list");
2831         (*scan)++;      // skip '}'
2832     }
2833     else if (**scan == '!') {
2834         __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
2835         KMP_CPU_COMPLEMENT(tempMask);
2836         (*scan)++;      // skip '!'
2837     }
2838     else if ((**scan >= '0') && (**scan <= '9')) {
2839         next = *scan;
2840         SKIP_DIGITS(next);
2841         int num = __kmp_str_to_int(*scan, *next);
2842         KMP_ASSERT(num >= 0);
2843         if ((num > maxOsId) ||
2844           (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2845             if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2846               && (__kmp_affinity_type != affinity_none))) {
2847                 KMP_WARNING(AffIgnoreInvalidProcID, num);
2848             }
2849         }
2850         else {
2851             KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
2852             (*setSize)++;
2853         }
2854         *scan = next;  // skip num
2855     }
2856     else {
2857         KMP_ASSERT2(0, "bad explicit places list");
2858     }
2859 }
2860 
2861 
2862 //static void
2863 void
2864 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
2865   unsigned int *out_numMasks, const char *placelist,
2866   kmp_affin_mask_t *osId2Mask, int maxOsId)
2867 {
2868     const char *scan = placelist;
2869     const char *next = placelist;
2870 
2871     numNewMasks = 2;
2872     newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2873       * __kmp_affin_mask_size);
2874     nextNewMask = 0;
2875 
2876     kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
2877       __kmp_affin_mask_size);
2878     KMP_CPU_ZERO(tempMask);
2879     int setSize = 0;
2880 
2881     for (;;) {
2882         __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
2883 
2884         //
2885         // valid follow sets are ',' ':' and EOL
2886         //
2887         SKIP_WS(scan);
2888         if (*scan == '\0' || *scan == ',') {
2889             if (setSize > 0) {
2890                 ADD_MASK(tempMask);
2891             }
2892             KMP_CPU_ZERO(tempMask);
2893             setSize = 0;
2894             if (*scan == '\0') {
2895                 break;
2896             }
2897             scan++;     // skip ','
2898             continue;
2899         }
2900 
2901         KMP_ASSERT2(*scan == ':', "bad explicit places list");
2902         scan++;         // skip ':'
2903 
2904         //
2905         // Read count parameter
2906         //
2907         SKIP_WS(scan);
2908         KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
2909           "bad explicit places list");
2910         next = scan;
2911         SKIP_DIGITS(next);
2912         int count = __kmp_str_to_int(scan, *next);
2913         KMP_ASSERT(count >= 0);
2914         scan = next;
2915 
2916         //
2917         // valid follow sets are ',' ':' and EOL
2918         //
2919         SKIP_WS(scan);
2920         int stride;
2921         if (*scan == '\0' || *scan == ',') {
2922             stride = +1;
2923         }
2924         else {
2925             KMP_ASSERT2(*scan == ':', "bad explicit places list");
2926             scan++;         // skip ':'
2927 
2928             //
2929             // Read stride parameter
2930             //
2931             int sign = +1;
2932             for (;;) {
2933                 SKIP_WS(scan);
2934                 if (*scan == '+') {
2935                     scan++; // skip '+'
2936                     continue;
2937                 }
2938                 if (*scan == '-') {
2939                     sign *= -1;
2940                     scan++; // skip '-'
2941                     continue;
2942                 }
2943                 break;
2944             }
2945             SKIP_WS(scan);
2946             KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
2947               "bad explicit places list");
2948             next = scan;
2949             SKIP_DIGITS(next);
2950             stride = __kmp_str_to_int(scan, *next);
2951             KMP_DEBUG_ASSERT(stride >= 0);
2952             scan = next;
2953             stride *= sign;
2954         }
2955 
2956         if (stride > 0) {
2957             int i;
2958             for (i = 0; i < count; i++) {
2959                 int j;
2960                 if (setSize == 0) {
2961                     break;
2962                 }
2963                 ADD_MASK(tempMask);
2964                 setSize = 0;
2965                 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
2966                     if (! KMP_CPU_ISSET(j - stride, tempMask)) {
2967                         KMP_CPU_CLR(j, tempMask);
2968                     }
2969                     else if ((j > maxOsId) ||
2970                       (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
2971                         if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
2972                           && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
2973                             KMP_WARNING(AffIgnoreInvalidProcID, j);
2974                         }
2975                         KMP_CPU_CLR(j, tempMask);
2976                     }
2977                     else {
2978                         KMP_CPU_SET(j, tempMask);
2979                         setSize++;
2980                     }
2981                 }
2982                 for (; j >= 0; j--) {
2983                     KMP_CPU_CLR(j, tempMask);
2984                 }
2985             }
2986         }
2987         else {
2988             int i;
2989             for (i = 0; i < count; i++) {
2990                 int j;
2991                 if (setSize == 0) {
2992                     break;
2993                 }
2994                 ADD_MASK(tempMask);
2995                 setSize = 0;
2996                 for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride;
2997                   j++) {
2998                     if (! KMP_CPU_ISSET(j - stride, tempMask)) {
2999                         KMP_CPU_CLR(j, tempMask);
3000                     }
3001                     else if ((j > maxOsId) ||
3002                       (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3003                         if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3004                           && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
3005                             KMP_WARNING(AffIgnoreInvalidProcID, j);
3006                         }
3007                         KMP_CPU_CLR(j, tempMask);
3008                     }
3009                     else {
3010                         KMP_CPU_SET(j, tempMask);
3011                         setSize++;
3012                     }
3013                 }
3014                 for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
3015                     KMP_CPU_CLR(j, tempMask);
3016                 }
3017             }
3018         }
3019         KMP_CPU_ZERO(tempMask);
3020         setSize = 0;
3021 
3022         //
3023         // valid follow sets are ',' and EOL
3024         //
3025         SKIP_WS(scan);
3026         if (*scan == '\0') {
3027             break;
3028         }
3029         if (*scan == ',') {
3030             scan++;     // skip ','
3031             continue;
3032         }
3033 
3034         KMP_ASSERT2(0, "bad explicit places list");
3035     }
3036 
3037     *out_numMasks = nextNewMask;
3038     if (nextNewMask == 0) {
3039         *out_masks = NULL;
3040         KMP_INTERNAL_FREE(newMasks);
3041         return;
3042     }
3043     *out_masks
3044       = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
3045     KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
3046     __kmp_free(tempMask);
3047     KMP_INTERNAL_FREE(newMasks);
3048 }
3049 
3050 # endif /* OMP_40_ENABLED */
3051 
3052 #undef ADD_MASK
3053 #undef ADD_MASK_OSID
3054 
3055 static void
3056 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3057 {
3058     if ( __kmp_place_num_cores == 0 ) {
3059         if ( __kmp_place_num_threads_per_core == 0 ) {
3060             return;   // no cores limiting actions requested, exit
3061         }
3062         __kmp_place_num_cores = nCoresPerPkg;   // use all available cores
3063     }
3064     if ( !__kmp_affinity_uniform_topology() ) {
3065         KMP_WARNING( AffThrPlaceNonUniform );
3066         return; // don't support non-uniform topology
3067     }
3068     if ( depth != 3 ) {
3069         KMP_WARNING( AffThrPlaceNonThreeLevel );
3070         return; // don't support not-3-level topology
3071     }
3072     if ( __kmp_place_num_threads_per_core == 0 ) {
3073         __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore;  // use all HW contexts
3074     }
3075     if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
3076         KMP_WARNING( AffThrPlaceManyCores );
3077         return;
3078     }
3079 
3080     AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3081                             nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3082     int i, j, k, n_old = 0, n_new = 0;
3083     for ( i = 0; i < nPackages; ++i ) {
3084         for ( j = 0; j < nCoresPerPkg; ++j ) {
3085             if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) {
3086                 n_old += __kmp_nThreadsPerCore;   // skip not-requested core
3087             } else {
3088                 for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) {
3089                     if ( k < __kmp_place_num_threads_per_core ) {
3090                         newAddr[n_new] = (*pAddr)[n_old];   // copy requested core' data to new location
3091                         n_new++;
3092                     }
3093                     n_old++;
3094                 }
3095             }
3096         }
3097     }
3098     nCoresPerPkg = __kmp_place_num_cores;                     // correct nCoresPerPkg
3099     __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3100     __kmp_avail_proc = n_new;                                 // correct avail_proc
3101     __kmp_ncores = nPackages * __kmp_place_num_cores;         // correct ncores
3102 
3103     __kmp_free( *pAddr );
3104     *pAddr = newAddr;      // replace old topology with new one
3105 }
3106 
3107 
3108 static AddrUnsPair *address2os = NULL;
3109 static int           * procarr = NULL;
3110 static int     __kmp_aff_depth = 0;
3111 
3112 static void
3113 __kmp_aux_affinity_initialize(void)
3114 {
3115     if (__kmp_affinity_masks != NULL) {
3116         KMP_ASSERT(fullMask != NULL);
3117         return;
3118     }
3119 
3120     //
3121     // Create the "full" mask - this defines all of the processors that we
3122     // consider to be in the machine model.  If respect is set, then it is
3123     // the initialization thread's affinity mask.  Otherwise, it is all
3124     // processors that we know about on the machine.
3125     //
3126     if (fullMask == NULL) {
3127         fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
3128     }
3129     if (KMP_AFFINITY_CAPABLE()) {
3130         if (__kmp_affinity_respect_mask) {
3131             __kmp_get_system_affinity(fullMask, TRUE);
3132 
3133             //
3134             // Count the number of available processors.
3135             //
3136             unsigned i;
3137             __kmp_avail_proc = 0;
3138             for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
3139                 if (! KMP_CPU_ISSET(i, fullMask)) {
3140                     continue;
3141                 }
3142                 __kmp_avail_proc++;
3143             }
3144             if (__kmp_avail_proc > __kmp_xproc) {
3145                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3146                   && (__kmp_affinity_type != affinity_none))) {
3147                     KMP_WARNING(ErrorInitializeAffinity);
3148                 }
3149                 __kmp_affinity_type = affinity_none;
3150                 KMP_AFFINITY_DISABLE();
3151                 return;
3152             }
3153         }
3154         else {
3155             __kmp_affinity_entire_machine_mask(fullMask);
3156             __kmp_avail_proc = __kmp_xproc;
3157         }
3158     }
3159 
3160     int depth = -1;
3161     kmp_i18n_id_t msg_id = kmp_i18n_null;
3162 
3163     //
3164     // For backward compatibility, setting KMP_CPUINFO_FILE =>
3165     // KMP_TOPOLOGY_METHOD=cpuinfo
3166     //
3167     if ((__kmp_cpuinfo_file != NULL) &&
3168       (__kmp_affinity_top_method == affinity_top_method_all)) {
3169         __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3170     }
3171 
3172     if (__kmp_affinity_top_method == affinity_top_method_all) {
3173         //
3174         // In the default code path, errors are not fatal - we just try using
3175         // another method.  We only emit a warning message if affinity is on,
3176         // or the verbose flag is set, an the nowarnings flag was not set.
3177         //
3178         const char *file_name = NULL;
3179         int line = 0;
3180 
3181 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3182 
3183         if (__kmp_affinity_verbose) {
3184             KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3185         }
3186 
3187         file_name = NULL;
3188         depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3189         if (depth == 0) {
3190             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3191             KMP_ASSERT(address2os == NULL);
3192             return;
3193         }
3194 
3195         if (depth < 0) {
3196             if (__kmp_affinity_verbose) {
3197                 if (msg_id != kmp_i18n_null) {
3198                     KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3199                       KMP_I18N_STR(DecodingLegacyAPIC));
3200                 }
3201                 else {
3202                     KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
3203                 }
3204             }
3205 
3206             file_name = NULL;
3207             depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3208             if (depth == 0) {
3209                 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3210                 KMP_ASSERT(address2os == NULL);
3211                 return;
3212             }
3213         }
3214 
3215 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3216 
3217 # if KMP_OS_LINUX
3218 
3219         if (depth < 0) {
3220             if (__kmp_affinity_verbose) {
3221                 if (msg_id != kmp_i18n_null) {
3222                     KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3223                 }
3224                 else {
3225                     KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3226                 }
3227             }
3228 
3229             FILE *f = fopen("/proc/cpuinfo", "r");
3230             if (f == NULL) {
3231                 msg_id = kmp_i18n_str_CantOpenCpuinfo;
3232             }
3233             else {
3234                 file_name = "/proc/cpuinfo";
3235                 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3236                 fclose(f);
3237                 if (depth == 0) {
3238                     KMP_ASSERT(__kmp_affinity_type == affinity_none);
3239                     KMP_ASSERT(address2os == NULL);
3240                     return;
3241                 }
3242             }
3243         }
3244 
3245 # endif /* KMP_OS_LINUX */
3246 
3247 # if KMP_GROUP_AFFINITY
3248 
3249         if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
3250             if (__kmp_affinity_verbose) {
3251                 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3252             }
3253 
3254             depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3255             KMP_ASSERT(depth != 0);
3256         }
3257 
3258 # endif /* KMP_GROUP_AFFINITY */
3259 
3260         if (depth < 0) {
3261             if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
3262                 if (file_name == NULL) {
3263                     KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
3264                 }
3265                 else if (line == 0) {
3266                     KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
3267                 }
3268                 else {
3269                     KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
3270                 }
3271             }
3272             // FIXME - print msg if msg_id = kmp_i18n_null ???
3273 
3274             file_name = "";
3275             depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3276             if (depth == 0) {
3277                 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3278                 KMP_ASSERT(address2os == NULL);
3279                 return;
3280             }
3281             KMP_ASSERT(depth > 0);
3282             KMP_ASSERT(address2os != NULL);
3283         }
3284     }
3285 
3286     //
3287     // If the user has specified that a paricular topology discovery method
3288     // is to be used, then we abort if that method fails.  The exception is
3289     // group affinity, which might have been implicitly set.
3290     //
3291 
3292 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3293 
3294     else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3295         if (__kmp_affinity_verbose) {
3296             KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3297               KMP_I18N_STR(Decodingx2APIC));
3298         }
3299 
3300         depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3301         if (depth == 0) {
3302             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3303             KMP_ASSERT(address2os == NULL);
3304             return;
3305         }
3306         if (depth < 0) {
3307             KMP_ASSERT(msg_id != kmp_i18n_null);
3308             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3309         }
3310     }
3311     else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3312         if (__kmp_affinity_verbose) {
3313             KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3314               KMP_I18N_STR(DecodingLegacyAPIC));
3315         }
3316 
3317         depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3318         if (depth == 0) {
3319             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3320             KMP_ASSERT(address2os == NULL);
3321             return;
3322         }
3323         if (depth < 0) {
3324             KMP_ASSERT(msg_id != kmp_i18n_null);
3325             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3326         }
3327     }
3328 
3329 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3330 
3331     else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3332         const char *filename;
3333         if (__kmp_cpuinfo_file != NULL) {
3334             filename = __kmp_cpuinfo_file;
3335         }
3336         else {
3337             filename = "/proc/cpuinfo";
3338         }
3339 
3340         if (__kmp_affinity_verbose) {
3341             KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
3342         }
3343 
3344         FILE *f = fopen(filename, "r");
3345         if (f == NULL) {
3346             int code = errno;
3347             if (__kmp_cpuinfo_file != NULL) {
3348                 __kmp_msg(
3349                     kmp_ms_fatal,
3350                     KMP_MSG(CantOpenFileForReading, filename),
3351                     KMP_ERR(code),
3352                     KMP_HNT(NameComesFrom_CPUINFO_FILE),
3353                     __kmp_msg_null
3354                 );
3355             }
3356             else {
3357                 __kmp_msg(
3358                     kmp_ms_fatal,
3359                     KMP_MSG(CantOpenFileForReading, filename),
3360                     KMP_ERR(code),
3361                     __kmp_msg_null
3362                 );
3363             }
3364         }
3365         int line = 0;
3366         depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3367         fclose(f);
3368         if (depth < 0) {
3369             KMP_ASSERT(msg_id != kmp_i18n_null);
3370             if (line > 0) {
3371                 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
3372             }
3373             else {
3374                 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3375             }
3376         }
3377         if (__kmp_affinity_type == affinity_none) {
3378             KMP_ASSERT(depth == 0);
3379             KMP_ASSERT(address2os == NULL);
3380             return;
3381         }
3382     }
3383 
3384 # if KMP_GROUP_AFFINITY
3385 
3386     else if (__kmp_affinity_top_method == affinity_top_method_group) {
3387         if (__kmp_affinity_verbose) {
3388             KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3389         }
3390 
3391         depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3392         KMP_ASSERT(depth != 0);
3393         if (depth < 0) {
3394             KMP_ASSERT(msg_id != kmp_i18n_null);
3395             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3396         }
3397     }
3398 
3399 # endif /* KMP_GROUP_AFFINITY */
3400 
3401     else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3402         if (__kmp_affinity_verbose) {
3403             KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
3404         }
3405 
3406         depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3407         if (depth == 0) {
3408             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3409             KMP_ASSERT(address2os == NULL);
3410             return;
3411         }
3412         // should not fail
3413         KMP_ASSERT(depth > 0);
3414         KMP_ASSERT(address2os != NULL);
3415     }
3416 
3417     if (address2os == NULL) {
3418         if (KMP_AFFINITY_CAPABLE()
3419           && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3420           && (__kmp_affinity_type != affinity_none)))) {
3421             KMP_WARNING(ErrorInitializeAffinity);
3422         }
3423         __kmp_affinity_type = affinity_none;
3424         KMP_AFFINITY_DISABLE();
3425         return;
3426     }
3427 
3428     __kmp_apply_thread_places(&address2os, depth);
3429 
3430     //
3431     // Create the table of masks, indexed by thread Id.
3432     //
3433     unsigned maxIndex;
3434     unsigned numUnique;
3435     kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
3436       address2os, __kmp_avail_proc);
3437     if (__kmp_affinity_gran_levels == 0) {
3438         KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
3439     }
3440 
3441     //
3442     // Set the childNums vector in all Address objects.  This must be done
3443     // before we can sort using __kmp_affinity_cmp_Address_child_num(),
3444     // which takes into account the setting of __kmp_affinity_compact.
3445     //
3446     __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
3447 
3448     switch (__kmp_affinity_type) {
3449 
3450         case affinity_explicit:
3451         KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3452 # if OMP_40_ENABLED
3453         if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3454 # endif
3455         {
3456             __kmp_affinity_process_proclist(&__kmp_affinity_masks,
3457               &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3458               maxIndex);
3459         }
3460 # if OMP_40_ENABLED
3461         else {
3462             __kmp_affinity_process_placelist(&__kmp_affinity_masks,
3463               &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3464               maxIndex);
3465         }
3466 # endif
3467         if (__kmp_affinity_num_masks == 0) {
3468             if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3469               && (__kmp_affinity_type != affinity_none))) {
3470                 KMP_WARNING(AffNoValidProcID);
3471             }
3472             __kmp_affinity_type = affinity_none;
3473             return;
3474         }
3475         break;
3476 
3477         //
3478         // The other affinity types rely on sorting the Addresses according
3479         // to some permutation of the machine topology tree.  Set
3480         // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
3481         // then jump to a common code fragment to do the sort and create
3482         // the array of affinity masks.
3483         //
3484 
3485         case affinity_logical:
3486         __kmp_affinity_compact = 0;
3487         if (__kmp_affinity_offset) {
3488             __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3489               % __kmp_avail_proc;
3490         }
3491         goto sortAddresses;
3492 
3493         case affinity_physical:
3494         if (__kmp_nThreadsPerCore > 1) {
3495             __kmp_affinity_compact = 1;
3496             if (__kmp_affinity_compact >= depth) {
3497                 __kmp_affinity_compact = 0;
3498             }
3499         } else {
3500             __kmp_affinity_compact = 0;
3501         }
3502         if (__kmp_affinity_offset) {
3503             __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3504               % __kmp_avail_proc;
3505         }
3506         goto sortAddresses;
3507 
3508         case affinity_scatter:
3509         if (__kmp_affinity_compact >= depth) {
3510             __kmp_affinity_compact = 0;
3511         }
3512         else {
3513             __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
3514         }
3515         goto sortAddresses;
3516 
3517         case affinity_compact:
3518         if (__kmp_affinity_compact >= depth) {
3519             __kmp_affinity_compact = depth - 1;
3520         }
3521         goto sortAddresses;
3522 
3523         case affinity_balanced:
3524         // Balanced works only for the case of a single package
3525         if( nPackages > 1 ) {
3526             if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
3527                 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
3528             }
3529             __kmp_affinity_type = affinity_none;
3530             return;
3531         } else if( __kmp_affinity_uniform_topology() ) {
3532             break;
3533         } else { // Non-uniform topology
3534 
3535             // Save the depth for further usage
3536             __kmp_aff_depth = depth;
3537 
3538             // Number of hyper threads per core in HT machine
3539             int nth_per_core = __kmp_nThreadsPerCore;
3540 
3541             int core_level;
3542             if( nth_per_core > 1 ) {
3543                 core_level = depth - 2;
3544             } else {
3545                 core_level = depth - 1;
3546             }
3547             int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
3548             int nproc = nth_per_core * ncores;
3549 
3550             procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
3551             for( int i = 0; i < nproc; i++ ) {
3552                 procarr[ i ] = -1;
3553             }
3554 
3555             for( int i = 0; i < __kmp_avail_proc; i++ ) {
3556                 int proc = address2os[ i ].second;
3557                 // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
3558                 // If there is only one thread per core then depth == 2: level 0 - package,
3559                 // level 1 - core.
3560                 int level = depth - 1;
3561 
3562                 // __kmp_nth_per_core == 1
3563                 int thread = 0;
3564                 int core = address2os[ i ].first.labels[ level ];
3565                 // If the thread level exists, that is we have more than one thread context per core
3566                 if( nth_per_core > 1 ) {
3567                     thread = address2os[ i ].first.labels[ level ] % nth_per_core;
3568                     core = address2os[ i ].first.labels[ level - 1 ];
3569                 }
3570                 procarr[ core * nth_per_core + thread ] = proc;
3571             }
3572 
3573             break;
3574         }
3575 
3576         sortAddresses:
3577         //
3578         // Allocate the gtid->affinity mask table.
3579         //
3580         if (__kmp_affinity_dups) {
3581             __kmp_affinity_num_masks = __kmp_avail_proc;
3582         }
3583         else {
3584             __kmp_affinity_num_masks = numUnique;
3585         }
3586 
3587 # if OMP_40_ENABLED
3588         if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
3589           && ( __kmp_affinity_num_places > 0 )
3590           && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
3591             __kmp_affinity_num_masks = __kmp_affinity_num_places;
3592         }
3593 # endif
3594 
3595         __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
3596           __kmp_affinity_num_masks * __kmp_affin_mask_size);
3597 
3598         //
3599         // Sort the address2os table according to the current setting of
3600         // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
3601         //
3602         qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
3603           __kmp_affinity_cmp_Address_child_num);
3604         {
3605             int i;
3606             unsigned j;
3607             for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
3608                 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
3609                     continue;
3610                 }
3611                 unsigned osId = address2os[i].second;
3612                 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
3613                 kmp_affin_mask_t *dest
3614                   = KMP_CPU_INDEX(__kmp_affinity_masks, j);
3615                 KMP_ASSERT(KMP_CPU_ISSET(osId, src));
3616                 KMP_CPU_COPY(dest, src);
3617                 if (++j >= __kmp_affinity_num_masks) {
3618                     break;
3619                 }
3620             }
3621             KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
3622         }
3623         break;
3624 
3625         default:
3626         KMP_ASSERT2(0, "Unexpected affinity setting");
3627     }
3628 
3629     __kmp_free(osId2Mask);
3630     machine_hierarchy.init(address2os, __kmp_avail_proc);
3631 }
3632 
3633 
3634 void
3635 __kmp_affinity_initialize(void)
3636 {
3637     //
3638     // Much of the code above was written assumming that if a machine was not
3639     // affinity capable, then __kmp_affinity_type == affinity_none.  We now
3640     // explicitly represent this as __kmp_affinity_type == affinity_disabled.
3641     //
3642     // There are too many checks for __kmp_affinity_type == affinity_none
3643     // in this code.  Instead of trying to change them all, check if
3644     // __kmp_affinity_type == affinity_disabled, and if so, slam it with
3645     // affinity_none, call the real initialization routine, then restore
3646     // __kmp_affinity_type to affinity_disabled.
3647     //
3648     int disabled = (__kmp_affinity_type == affinity_disabled);
3649     if (! KMP_AFFINITY_CAPABLE()) {
3650         KMP_ASSERT(disabled);
3651     }
3652     if (disabled) {
3653         __kmp_affinity_type = affinity_none;
3654     }
3655     __kmp_aux_affinity_initialize();
3656     if (disabled) {
3657         __kmp_affinity_type = affinity_disabled;
3658     }
3659 }
3660 
3661 
3662 void
3663 __kmp_affinity_uninitialize(void)
3664 {
3665     if (__kmp_affinity_masks != NULL) {
3666         __kmp_free(__kmp_affinity_masks);
3667         __kmp_affinity_masks = NULL;
3668     }
3669     if (fullMask != NULL) {
3670         KMP_CPU_FREE(fullMask);
3671         fullMask = NULL;
3672     }
3673     __kmp_affinity_num_masks = 0;
3674 # if OMP_40_ENABLED
3675     __kmp_affinity_num_places = 0;
3676 # endif
3677     if (__kmp_affinity_proclist != NULL) {
3678         __kmp_free(__kmp_affinity_proclist);
3679         __kmp_affinity_proclist = NULL;
3680     }
3681     if( address2os != NULL ) {
3682         __kmp_free( address2os );
3683         address2os = NULL;
3684     }
3685     if( procarr != NULL ) {
3686         __kmp_free( procarr );
3687         procarr = NULL;
3688     }
3689 }
3690 
3691 
3692 void
3693 __kmp_affinity_set_init_mask(int gtid, int isa_root)
3694 {
3695     if (! KMP_AFFINITY_CAPABLE()) {
3696         return;
3697     }
3698 
3699     kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
3700     if (th->th.th_affin_mask == NULL) {
3701         KMP_CPU_ALLOC(th->th.th_affin_mask);
3702     }
3703     else {
3704         KMP_CPU_ZERO(th->th.th_affin_mask);
3705     }
3706 
3707     //
3708     // Copy the thread mask to the kmp_info_t strucuture.
3709     // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
3710     // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
3711     // is set, then the full mask is the same as the mask of the initialization
3712     // thread.
3713     //
3714     kmp_affin_mask_t *mask;
3715     int i;
3716 
3717 # if OMP_40_ENABLED
3718     if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3719 # endif
3720     {
3721         if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)
3722           ) {
3723 # if KMP_GROUP_AFFINITY
3724             if (__kmp_num_proc_groups > 1) {
3725                 return;
3726             }
3727 # endif
3728             KMP_ASSERT(fullMask != NULL);
3729             i = KMP_PLACE_ALL;
3730             mask = fullMask;
3731         }
3732         else {
3733             KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
3734             i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
3735             mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
3736         }
3737     }
3738 # if OMP_40_ENABLED
3739     else {
3740         if ((! isa_root)
3741           || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
3742 #  if KMP_GROUP_AFFINITY
3743             if (__kmp_num_proc_groups > 1) {
3744                 return;
3745             }
3746 #  endif
3747             KMP_ASSERT(fullMask != NULL);
3748             i = KMP_PLACE_ALL;
3749             mask = fullMask;
3750         }
3751         else {
3752             //
3753             // int i = some hash function or just a counter that doesn't
3754             // always start at 0.  Use gtid for now.
3755             //
3756             KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
3757             i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
3758             mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
3759         }
3760     }
3761 # endif
3762 
3763 # if OMP_40_ENABLED
3764     th->th.th_current_place = i;
3765     if (isa_root) {
3766         th->th.th_new_place = i;
3767         th->th.th_first_place = 0;
3768         th->th.th_last_place = __kmp_affinity_num_masks - 1;
3769     }
3770 
3771     if (i == KMP_PLACE_ALL) {
3772         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
3773           gtid));
3774     }
3775     else {
3776         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
3777           gtid, i));
3778     }
3779 # else
3780     if (i == -1) {
3781         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
3782           gtid));
3783     }
3784     else {
3785         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
3786           gtid, i));
3787     }
3788 # endif /* OMP_40_ENABLED */
3789 
3790     KMP_CPU_COPY(th->th.th_affin_mask, mask);
3791 
3792     if (__kmp_affinity_verbose) {
3793         char buf[KMP_AFFIN_MASK_PRINT_LEN];
3794         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
3795           th->th.th_affin_mask);
3796         KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
3797           buf);
3798     }
3799 
3800 # if KMP_OS_WINDOWS
3801     //
3802     // On Windows* OS, the process affinity mask might have changed.
3803     // If the user didn't request affinity and this call fails,
3804     // just continue silently.  See CQ171393.
3805     //
3806     if ( __kmp_affinity_type == affinity_none ) {
3807         __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
3808     }
3809     else
3810 # endif
3811     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
3812 }
3813 
3814 
3815 # if OMP_40_ENABLED
3816 
3817 void
3818 __kmp_affinity_set_place(int gtid)
3819 {
3820     int retval;
3821 
3822     if (! KMP_AFFINITY_CAPABLE()) {
3823         return;
3824     }
3825 
3826     kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
3827 
3828     KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
3829       gtid, th->th.th_new_place, th->th.th_current_place));
3830 
3831     //
3832     // Check that the new place is within this thread's partition.
3833     //
3834     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
3835     KMP_ASSERT(th->th.th_new_place >= 0);
3836     KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
3837     if (th->th.th_first_place <= th->th.th_last_place) {
3838         KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
3839          && (th->th.th_new_place <= th->th.th_last_place));
3840     }
3841     else {
3842         KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
3843          || (th->th.th_new_place >= th->th.th_last_place));
3844     }
3845 
3846     //
3847     // Copy the thread mask to the kmp_info_t strucuture,
3848     // and set this thread's affinity.
3849     //
3850     kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
3851       th->th.th_new_place);
3852     KMP_CPU_COPY(th->th.th_affin_mask, mask);
3853     th->th.th_current_place = th->th.th_new_place;
3854 
3855     if (__kmp_affinity_verbose) {
3856         char buf[KMP_AFFIN_MASK_PRINT_LEN];
3857         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
3858           th->th.th_affin_mask);
3859         KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
3860           gtid, buf);
3861     }
3862     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
3863 }
3864 
3865 # endif /* OMP_40_ENABLED */
3866 
3867 
3868 int
3869 __kmp_aux_set_affinity(void **mask)
3870 {
3871     int gtid;
3872     kmp_info_t *th;
3873     int retval;
3874 
3875     if (! KMP_AFFINITY_CAPABLE()) {
3876         return -1;
3877     }
3878 
3879     gtid = __kmp_entry_gtid();
3880     KA_TRACE(1000, ;{
3881         char buf[KMP_AFFIN_MASK_PRINT_LEN];
3882         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
3883           (kmp_affin_mask_t *)(*mask));
3884         __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
3885           gtid, buf);
3886     });
3887 
3888     if (__kmp_env_consistency_check) {
3889         if ((mask == NULL) || (*mask == NULL)) {
3890             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
3891         }
3892         else {
3893             unsigned proc;
3894             int num_procs = 0;
3895 
3896             for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
3897                 if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
3898                     continue;
3899                 }
3900                 num_procs++;
3901                 if (! KMP_CPU_ISSET(proc, fullMask)) {
3902                     KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
3903                     break;
3904                 }
3905             }
3906             if (num_procs == 0) {
3907                 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
3908             }
3909 
3910 # if KMP_GROUP_AFFINITY
3911             if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
3912                 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
3913             }
3914 # endif /* KMP_GROUP_AFFINITY */
3915 
3916         }
3917     }
3918 
3919     th = __kmp_threads[gtid];
3920     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
3921     retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
3922     if (retval == 0) {
3923         KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
3924     }
3925 
3926 # if OMP_40_ENABLED
3927     th->th.th_current_place = KMP_PLACE_UNDEFINED;
3928     th->th.th_new_place = KMP_PLACE_UNDEFINED;
3929     th->th.th_first_place = 0;
3930     th->th.th_last_place = __kmp_affinity_num_masks - 1;
3931 
3932     //
3933     // Turn off 4.0 affinity for the current tread at this parallel level.
3934     //
3935     th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
3936 # endif
3937 
3938     return retval;
3939 }
3940 
3941 
3942 int
3943 __kmp_aux_get_affinity(void **mask)
3944 {
3945     int gtid;
3946     int retval;
3947     kmp_info_t *th;
3948 
3949     if (! KMP_AFFINITY_CAPABLE()) {
3950         return -1;
3951     }
3952 
3953     gtid = __kmp_entry_gtid();
3954     th = __kmp_threads[gtid];
3955     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
3956 
3957     KA_TRACE(1000, ;{
3958         char buf[KMP_AFFIN_MASK_PRINT_LEN];
3959         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
3960           th->th.th_affin_mask);
3961         __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
3962     });
3963 
3964     if (__kmp_env_consistency_check) {
3965         if ((mask == NULL) || (*mask == NULL)) {
3966             KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
3967         }
3968     }
3969 
3970 # if !KMP_OS_WINDOWS
3971 
3972     retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
3973     KA_TRACE(1000, ;{
3974         char buf[KMP_AFFIN_MASK_PRINT_LEN];
3975         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
3976           (kmp_affin_mask_t *)(*mask));
3977         __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
3978     });
3979     return retval;
3980 
3981 # else
3982 
3983     KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
3984     return 0;
3985 
3986 # endif /* KMP_OS_WINDOWS */
3987 
3988 }
3989 
3990 int
3991 __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
3992 {
3993     int retval;
3994 
3995     if (! KMP_AFFINITY_CAPABLE()) {
3996         return -1;
3997     }
3998 
3999     KA_TRACE(1000, ;{
4000         int gtid = __kmp_entry_gtid();
4001         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4002         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4003           (kmp_affin_mask_t *)(*mask));
4004         __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4005           proc, gtid, buf);
4006     });
4007 
4008     if (__kmp_env_consistency_check) {
4009         if ((mask == NULL) || (*mask == NULL)) {
4010             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4011         }
4012     }
4013 
4014     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4015         return -1;
4016     }
4017     if (! KMP_CPU_ISSET(proc, fullMask)) {
4018         return -2;
4019     }
4020 
4021     KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4022     return 0;
4023 }
4024 
4025 
4026 int
4027 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4028 {
4029     int retval;
4030 
4031     if (! KMP_AFFINITY_CAPABLE()) {
4032         return -1;
4033     }
4034 
4035     KA_TRACE(1000, ;{
4036         int gtid = __kmp_entry_gtid();
4037         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4038         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4039           (kmp_affin_mask_t *)(*mask));
4040         __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4041           proc, gtid, buf);
4042     });
4043 
4044     if (__kmp_env_consistency_check) {
4045         if ((mask == NULL) || (*mask == NULL)) {
4046             KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4047         }
4048     }
4049 
4050     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4051         return -1;
4052     }
4053     if (! KMP_CPU_ISSET(proc, fullMask)) {
4054         return -2;
4055     }
4056 
4057     KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4058     return 0;
4059 }
4060 
4061 
4062 int
4063 __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4064 {
4065     int retval;
4066 
4067     if (! KMP_AFFINITY_CAPABLE()) {
4068         return -1;
4069     }
4070 
4071     KA_TRACE(1000, ;{
4072         int gtid = __kmp_entry_gtid();
4073         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4074         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4075           (kmp_affin_mask_t *)(*mask));
4076         __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4077           proc, gtid, buf);
4078     });
4079 
4080     if (__kmp_env_consistency_check) {
4081         if ((mask == NULL) || (*mask == NULL)) {
4082             KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
4083         }
4084     }
4085 
4086     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4087         return 0;
4088     }
4089     if (! KMP_CPU_ISSET(proc, fullMask)) {
4090         return 0;
4091     }
4092 
4093     return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4094 }
4095 
4096 
4097 // Dynamic affinity settings - Affinity balanced
4098 void __kmp_balanced_affinity( int tid, int nthreads )
4099 {
4100     if( __kmp_affinity_uniform_topology() ) {
4101         int coreID;
4102         int threadID;
4103         // Number of hyper threads per core in HT machine
4104         int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4105         // Number of cores
4106         int ncores = __kmp_ncores;
4107         // How many threads will be bound to each core
4108         int chunk = nthreads / ncores;
4109         // How many cores will have an additional thread bound to it - "big cores"
4110         int big_cores = nthreads % ncores;
4111         // Number of threads on the big cores
4112         int big_nth = ( chunk + 1 ) * big_cores;
4113         if( tid < big_nth ) {
4114             coreID = tid / (chunk + 1 );
4115             threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
4116         } else { //tid >= big_nth
4117             coreID = ( tid - big_cores ) / chunk;
4118             threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
4119         }
4120 
4121         KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4122           "Illegal set affinity operation when not capable");
4123 
4124         kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
4125         KMP_CPU_ZERO(mask);
4126 
4127         // Granularity == thread
4128         if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4129             int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
4130             KMP_CPU_SET( osID, mask);
4131         } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4132             for( int i = 0; i < __kmp_nth_per_core; i++ ) {
4133                 int osID;
4134                 osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
4135                 KMP_CPU_SET( osID, mask);
4136             }
4137         }
4138         if (__kmp_affinity_verbose) {
4139             char buf[KMP_AFFIN_MASK_PRINT_LEN];
4140             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4141             KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4142               tid, buf);
4143         }
4144         __kmp_set_system_affinity( mask, TRUE );
4145     } else { // Non-uniform topology
4146 
4147         kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
4148         KMP_CPU_ZERO(mask);
4149 
4150         // Number of hyper threads per core in HT machine
4151         int nth_per_core = __kmp_nThreadsPerCore;
4152         int core_level;
4153         if( nth_per_core > 1 ) {
4154             core_level = __kmp_aff_depth - 2;
4155         } else {
4156             core_level = __kmp_aff_depth - 1;
4157         }
4158 
4159         // Number of cores - maximum value; it does not count trail cores with 0 processors
4160         int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
4161 
4162         // For performance gain consider the special case nthreads == __kmp_avail_proc
4163         if( nthreads == __kmp_avail_proc ) {
4164             if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4165                 int osID = address2os[ tid ].second;
4166                 KMP_CPU_SET( osID, mask);
4167             } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4168                 int coreID = address2os[ tid ].first.labels[ core_level ];
4169                 // We'll count found osIDs for the current core; they can be not more than nth_per_core;
4170                 // since the address2os is sortied we can break when cnt==nth_per_core
4171                 int cnt = 0;
4172                 for( int i = 0; i < __kmp_avail_proc; i++ ) {
4173                     int osID = address2os[ i ].second;
4174                     int core = address2os[ i ].first.labels[ core_level ];
4175                     if( core == coreID ) {
4176                         KMP_CPU_SET( osID, mask);
4177                         cnt++;
4178                         if( cnt == nth_per_core ) {
4179                             break;
4180                         }
4181                     }
4182                 }
4183             }
4184         } else if( nthreads <= __kmp_ncores ) {
4185 
4186             int core = 0;
4187             for( int i = 0; i < ncores; i++ ) {
4188                 // Check if this core from procarr[] is in the mask
4189                 int in_mask = 0;
4190                 for( int j = 0; j < nth_per_core; j++ ) {
4191                     if( procarr[ i * nth_per_core + j ] != - 1 ) {
4192                         in_mask = 1;
4193                         break;
4194                     }
4195                 }
4196                 if( in_mask ) {
4197                     if( tid == core ) {
4198                         for( int j = 0; j < nth_per_core; j++ ) {
4199                             int osID = procarr[ i * nth_per_core + j ];
4200                             if( osID != -1 ) {
4201                                 KMP_CPU_SET( osID, mask );
4202                                 // For granularity=thread it is enough to set the first available osID for this core
4203                                 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4204                                     break;
4205                                 }
4206                             }
4207                         }
4208                         break;
4209                     } else {
4210                         core++;
4211                     }
4212                 }
4213             }
4214 
4215         } else { // nthreads > __kmp_ncores
4216 
4217             // Array to save the number of processors at each core
4218             int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores);
4219             // Array to save the number of cores with "x" available processors;
4220             int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
4221             // Array to save the number of cores with # procs from x to nth_per_core
4222             int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
4223 
4224             for( int i = 0; i <= nth_per_core; i++ ) {
4225                 ncores_with_x_procs[ i ] = 0;
4226                 ncores_with_x_to_max_procs[ i ] = 0;
4227             }
4228 
4229             for( int i = 0; i < ncores; i++ ) {
4230                 int cnt = 0;
4231                 for( int j = 0; j < nth_per_core; j++ ) {
4232                     if( procarr[ i * nth_per_core + j ] != -1 ) {
4233                         cnt++;
4234                     }
4235                 }
4236                 nproc_at_core[ i ] = cnt;
4237                 ncores_with_x_procs[ cnt ]++;
4238             }
4239 
4240             for( int i = 0; i <= nth_per_core; i++ ) {
4241                 for( int j = i; j <= nth_per_core; j++ ) {
4242                     ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4243                 }
4244             }
4245 
4246             // Max number of processors
4247             int nproc = nth_per_core * ncores;
4248             // An array to keep number of threads per each context
4249             int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4250             for( int i = 0; i < nproc; i++ ) {
4251                 newarr[ i ] = 0;
4252             }
4253 
4254             int nth = nthreads;
4255             int flag = 0;
4256             while( nth > 0 ) {
4257                 for( int j = 1; j <= nth_per_core; j++ ) {
4258                     int cnt = ncores_with_x_to_max_procs[ j ];
4259                     for( int i = 0; i < ncores; i++ ) {
4260                         // Skip the core with 0 processors
4261                         if( nproc_at_core[ i ] == 0 ) {
4262                             continue;
4263                         }
4264                         for( int k = 0; k < nth_per_core; k++ ) {
4265                             if( procarr[ i * nth_per_core + k ] != -1 ) {
4266                                 if( newarr[ i * nth_per_core + k ] == 0 ) {
4267                                     newarr[ i * nth_per_core + k ] = 1;
4268                                     cnt--;
4269                                     nth--;
4270                                     break;
4271                                 } else {
4272                                     if( flag != 0 ) {
4273                                         newarr[ i * nth_per_core + k ] ++;
4274                                         cnt--;
4275                                         nth--;
4276                                         break;
4277                                     }
4278                                 }
4279                             }
4280                         }
4281                         if( cnt == 0 || nth == 0 ) {
4282                             break;
4283                         }
4284                     }
4285                     if( nth == 0 ) {
4286                         break;
4287                     }
4288                 }
4289                 flag = 1;
4290             }
4291             int sum = 0;
4292             for( int i = 0; i < nproc; i++ ) {
4293                 sum += newarr[ i ];
4294                 if( sum > tid ) {
4295                     // Granularity == thread
4296                     if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4297                         int osID = procarr[ i ];
4298                         KMP_CPU_SET( osID, mask);
4299                     } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4300                         int coreID = i / nth_per_core;
4301                         for( int ii = 0; ii < nth_per_core; ii++ ) {
4302                             int osID = procarr[ coreID * nth_per_core + ii ];
4303                             if( osID != -1 ) {
4304                                 KMP_CPU_SET( osID, mask);
4305                             }
4306                         }
4307                     }
4308                     break;
4309                 }
4310             }
4311             __kmp_free( newarr );
4312         }
4313 
4314         if (__kmp_affinity_verbose) {
4315             char buf[KMP_AFFIN_MASK_PRINT_LEN];
4316             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4317             KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4318               tid, buf);
4319         }
4320         __kmp_set_system_affinity( mask, TRUE );
4321     }
4322 }
4323 
4324 #endif // KMP_AFFINITY_SUPPORTED
4325