1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_io.h"
19 #include "kmp_str.h"
20 #include "kmp_wrapper_getpid.h"
21 #include "kmp_affinity.h"
22 
23 // Store the real or imagined machine hierarchy here
24 static hierarchy_info machine_hierarchy;
25 
26 void __kmp_cleanup_hierarchy() {
27     machine_hierarchy.fini();
28 }
29 
30 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
31     kmp_uint32 depth;
32     // The test below is true if affinity is available, but set to "none". Need to init on first use of hierarchical barrier.
33     if (TCR_1(machine_hierarchy.uninitialized))
34         machine_hierarchy.init(NULL, nproc);
35 
36     // Adjust the hierarchy in case num threads exceeds original
37     if (nproc > machine_hierarchy.base_num_threads)
38         machine_hierarchy.resize(nproc);
39 
40     depth = machine_hierarchy.depth;
41     KMP_DEBUG_ASSERT(depth > 0);
42 
43     thr_bar->depth = depth;
44     thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
45     thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
46 }
47 
48 #if KMP_AFFINITY_SUPPORTED
49 
50 //
51 // Print the affinity mask to the character array in a pretty format.
52 //
53 char *
54 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
55 {
56     KMP_ASSERT(buf_len >= 40);
57     char *scan = buf;
58     char *end = buf + buf_len - 1;
59 
60     //
61     // Find first element / check for empty set.
62     //
63     size_t i;
64     for (i = 0; i < KMP_CPU_SETSIZE; i++) {
65         if (KMP_CPU_ISSET(i, mask)) {
66             break;
67         }
68     }
69     if (i == KMP_CPU_SETSIZE) {
70         KMP_SNPRINTF(scan, end-scan+1, "{<empty>}");
71         while (*scan != '\0') scan++;
72         KMP_ASSERT(scan <= end);
73         return buf;
74     }
75 
76     KMP_SNPRINTF(scan, end-scan+1, "{%ld", (long)i);
77     while (*scan != '\0') scan++;
78     i++;
79     for (; i < KMP_CPU_SETSIZE; i++) {
80         if (! KMP_CPU_ISSET(i, mask)) {
81             continue;
82         }
83 
84         //
85         // Check for buffer overflow.  A string of the form ",<n>" will have
86         // at most 10 characters, plus we want to leave room to print ",...}"
87         // if the set is too large to print for a total of 15 characters.
88         // We already left room for '\0' in setting end.
89         //
90         if (end - scan < 15) {
91            break;
92         }
93         KMP_SNPRINTF(scan, end-scan+1, ",%-ld", (long)i);
94         while (*scan != '\0') scan++;
95     }
96     if (i < KMP_CPU_SETSIZE) {
97         KMP_SNPRINTF(scan, end-scan+1,  ",...");
98         while (*scan != '\0') scan++;
99     }
100     KMP_SNPRINTF(scan, end-scan+1, "}");
101     while (*scan != '\0') scan++;
102     KMP_ASSERT(scan <= end);
103     return buf;
104 }
105 
106 
107 void
108 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
109 {
110     KMP_CPU_ZERO(mask);
111 
112 # if KMP_GROUP_AFFINITY
113 
114     if (__kmp_num_proc_groups > 1) {
115         int group;
116         KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
117         for (group = 0; group < __kmp_num_proc_groups; group++) {
118             int i;
119             int num = __kmp_GetActiveProcessorCount(group);
120             for (i = 0; i < num; i++) {
121                 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
122             }
123         }
124     }
125     else
126 
127 # endif /* KMP_GROUP_AFFINITY */
128 
129     {
130         int proc;
131         for (proc = 0; proc < __kmp_xproc; proc++) {
132             KMP_CPU_SET(proc, mask);
133         }
134     }
135 }
136 
137 //
138 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
139 // called to renumber the labels from [0..n] and place them into the child_num
140 // vector of the address object.  This is done in case the labels used for
141 // the children at one node of the hierarchy differ from those used for
142 // another node at the same level.  Example:  suppose the machine has 2 nodes
143 // with 2 packages each.  The first node contains packages 601 and 602, and
144 // second node contains packages 603 and 604.  If we try to sort the table
145 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
146 // because we are paying attention to the labels themselves, not the ordinal
147 // child numbers.  By using the child numbers in the sort, the result is
148 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
149 //
150 static void
151 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
152   int numAddrs)
153 {
154     KMP_DEBUG_ASSERT(numAddrs > 0);
155     int depth = address2os->first.depth;
156     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
157     unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
158       * sizeof(unsigned));
159     int labCt;
160     for (labCt = 0; labCt < depth; labCt++) {
161         address2os[0].first.childNums[labCt] = counts[labCt] = 0;
162         lastLabel[labCt] = address2os[0].first.labels[labCt];
163     }
164     int i;
165     for (i = 1; i < numAddrs; i++) {
166         for (labCt = 0; labCt < depth; labCt++) {
167             if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
168                 int labCt2;
169                 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
170                     counts[labCt2] = 0;
171                     lastLabel[labCt2] = address2os[i].first.labels[labCt2];
172                 }
173                 counts[labCt]++;
174                 lastLabel[labCt] = address2os[i].first.labels[labCt];
175                 break;
176             }
177         }
178         for (labCt = 0; labCt < depth; labCt++) {
179             address2os[i].first.childNums[labCt] = counts[labCt];
180         }
181         for (; labCt < (int)Address::maxDepth; labCt++) {
182             address2os[i].first.childNums[labCt] = 0;
183         }
184     }
185 }
186 
187 
188 //
189 // All of the __kmp_affinity_create_*_map() routines should set
190 // __kmp_affinity_masks to a vector of affinity mask objects of length
191 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
192 // return the number of levels in the machine topology tree (zero if
193 // __kmp_affinity_type == affinity_none).
194 //
195 // All of the __kmp_affinity_create_*_map() routines should set *fullMask
196 // to the affinity mask for the initialization thread.  They need to save and
197 // restore the mask, and it could be needed later, so saving it is just an
198 // optimization to avoid calling kmp_get_system_affinity() again.
199 //
200 static kmp_affin_mask_t *fullMask = NULL;
201 
202 kmp_affin_mask_t *
203 __kmp_affinity_get_fullMask() { return fullMask; }
204 
205 
206 static int nCoresPerPkg, nPackages;
207 static int __kmp_nThreadsPerCore;
208 #ifndef KMP_DFLT_NTH_CORES
209 static int __kmp_ncores;
210 #endif
211 
212 //
213 // __kmp_affinity_uniform_topology() doesn't work when called from
214 // places which support arbitrarily many levels in the machine topology
215 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
216 // __kmp_affinity_create_x2apicid_map().
217 //
218 inline static bool
219 __kmp_affinity_uniform_topology()
220 {
221     return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
222 }
223 
224 
225 //
226 // Print out the detailed machine topology map, i.e. the physical locations
227 // of each OS proc.
228 //
229 static void
230 __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
231   int pkgLevel, int coreLevel, int threadLevel)
232 {
233     int proc;
234 
235     KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
236     for (proc = 0; proc < len; proc++) {
237         int level;
238         kmp_str_buf_t buf;
239         __kmp_str_buf_init(&buf);
240         for (level = 0; level < depth; level++) {
241             if (level == threadLevel) {
242                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
243             }
244             else if (level == coreLevel) {
245                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
246             }
247             else if (level == pkgLevel) {
248                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
249             }
250             else if (level > pkgLevel) {
251                 __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
252                   level - pkgLevel - 1);
253             }
254             else {
255                 __kmp_str_buf_print(&buf, "L%d ", level);
256             }
257             __kmp_str_buf_print(&buf, "%d ",
258               address2os[proc].first.labels[level]);
259         }
260         KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
261           buf.str);
262         __kmp_str_buf_free(&buf);
263     }
264 }
265 
266 
267 //
268 // If we don't know how to retrieve the machine's processor topology, or
269 // encounter an error in doing so, this routine is called to form a "flat"
270 // mapping of os thread id's <-> processor id's.
271 //
272 static int
273 __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
274   kmp_i18n_id_t *const msg_id)
275 {
276     *address2os = NULL;
277     *msg_id = kmp_i18n_null;
278 
279     //
280     // Even if __kmp_affinity_type == affinity_none, this routine might still
281     // called to set __kmp_ncores, as well as
282     // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
283     //
284     if (! KMP_AFFINITY_CAPABLE()) {
285         KMP_ASSERT(__kmp_affinity_type == affinity_none);
286         __kmp_ncores = nPackages = __kmp_xproc;
287         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
288         if (__kmp_affinity_verbose) {
289             KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
290             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
291             KMP_INFORM(Uniform, "KMP_AFFINITY");
292             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
293               __kmp_nThreadsPerCore, __kmp_ncores);
294         }
295         return 0;
296     }
297 
298     //
299     // When affinity is off, this routine will still be called to set
300     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
301     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
302     //  correctly, and return now if affinity is not enabled.
303     //
304     __kmp_ncores = nPackages = __kmp_avail_proc;
305     __kmp_nThreadsPerCore = nCoresPerPkg = 1;
306     if (__kmp_affinity_verbose) {
307         char buf[KMP_AFFIN_MASK_PRINT_LEN];
308         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
309 
310         KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
311         if (__kmp_affinity_respect_mask) {
312             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
313         } else {
314             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
315         }
316         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
317         KMP_INFORM(Uniform, "KMP_AFFINITY");
318         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
319           __kmp_nThreadsPerCore, __kmp_ncores);
320     }
321     if (__kmp_affinity_type == affinity_none) {
322         return 0;
323     }
324 
325     //
326     // Contruct the data structure to be returned.
327     //
328     *address2os = (AddrUnsPair*)
329       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
330     int avail_ct = 0;
331     unsigned int i;
332     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
333         //
334         // Skip this proc if it is not included in the machine model.
335         //
336         if (! KMP_CPU_ISSET(i, fullMask)) {
337             continue;
338         }
339 
340         Address addr(1);
341         addr.labels[0] = i;
342         (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
343     }
344     if (__kmp_affinity_verbose) {
345         KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
346     }
347 
348     if (__kmp_affinity_gran_levels < 0) {
349         //
350         // Only the package level is modeled in the machine topology map,
351         // so the #levels of granularity is either 0 or 1.
352         //
353         if (__kmp_affinity_gran > affinity_gran_package) {
354             __kmp_affinity_gran_levels = 1;
355         }
356         else {
357             __kmp_affinity_gran_levels = 0;
358         }
359     }
360     return 1;
361 }
362 
363 
364 # if KMP_GROUP_AFFINITY
365 
366 //
367 // If multiple Windows* OS processor groups exist, we can create a 2-level
368 // topology map with the groups at level 0 and the individual procs at
369 // level 1.
370 //
371 // This facilitates letting the threads float among all procs in a group,
372 // if granularity=group (the default when there are multiple groups).
373 //
374 static int
375 __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
376   kmp_i18n_id_t *const msg_id)
377 {
378     *address2os = NULL;
379     *msg_id = kmp_i18n_null;
380 
381     //
382     // If we don't have multiple processor groups, return now.
383     // The flat mapping will be used.
384     //
385     if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
386         // FIXME set *msg_id
387         return -1;
388     }
389 
390     //
391     // Contruct the data structure to be returned.
392     //
393     *address2os = (AddrUnsPair*)
394       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
395     int avail_ct = 0;
396     int i;
397     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
398         //
399         // Skip this proc if it is not included in the machine model.
400         //
401         if (! KMP_CPU_ISSET(i, fullMask)) {
402             continue;
403         }
404 
405         Address addr(2);
406         addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
407         addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
408         (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
409 
410         if (__kmp_affinity_verbose) {
411             KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
412               addr.labels[1]);
413         }
414     }
415 
416     if (__kmp_affinity_gran_levels < 0) {
417         if (__kmp_affinity_gran == affinity_gran_group) {
418             __kmp_affinity_gran_levels = 1;
419         }
420         else if ((__kmp_affinity_gran == affinity_gran_fine)
421           || (__kmp_affinity_gran == affinity_gran_thread)) {
422             __kmp_affinity_gran_levels = 0;
423         }
424         else {
425             const char *gran_str = NULL;
426             if (__kmp_affinity_gran == affinity_gran_core) {
427                 gran_str = "core";
428             }
429             else if (__kmp_affinity_gran == affinity_gran_package) {
430                 gran_str = "package";
431             }
432             else if (__kmp_affinity_gran == affinity_gran_node) {
433                 gran_str = "node";
434             }
435             else {
436                 KMP_ASSERT(0);
437             }
438 
439             // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
440             __kmp_affinity_gran_levels = 0;
441         }
442     }
443     return 2;
444 }
445 
446 # endif /* KMP_GROUP_AFFINITY */
447 
448 
449 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
450 
451 static int
452 __kmp_cpuid_mask_width(int count) {
453     int r = 0;
454 
455     while((1<<r) < count)
456         ++r;
457     return r;
458 }
459 
460 
461 class apicThreadInfo {
462 public:
463     unsigned osId;              // param to __kmp_affinity_bind_thread
464     unsigned apicId;            // from cpuid after binding
465     unsigned maxCoresPerPkg;    //      ""
466     unsigned maxThreadsPerPkg;  //      ""
467     unsigned pkgId;             // inferred from above values
468     unsigned coreId;            //      ""
469     unsigned threadId;          //      ""
470 };
471 
472 
473 static int
474 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
475 {
476     const apicThreadInfo *aa = (const apicThreadInfo *)a;
477     const apicThreadInfo *bb = (const apicThreadInfo *)b;
478     if (aa->osId < bb->osId) return -1;
479     if (aa->osId > bb->osId) return 1;
480     return 0;
481 }
482 
483 
484 static int
485 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
486 {
487     const apicThreadInfo *aa = (const apicThreadInfo *)a;
488     const apicThreadInfo *bb = (const apicThreadInfo *)b;
489     if (aa->pkgId < bb->pkgId) return -1;
490     if (aa->pkgId > bb->pkgId) return 1;
491     if (aa->coreId < bb->coreId) return -1;
492     if (aa->coreId > bb->coreId) return 1;
493     if (aa->threadId < bb->threadId) return -1;
494     if (aa->threadId > bb->threadId) return 1;
495     return 0;
496 }
497 
498 
499 //
500 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
501 // an algorithm which cycles through the available os threads, setting
502 // the current thread's affinity mask to that thread, and then retrieves
503 // the Apic Id for each thread context using the cpuid instruction.
504 //
505 static int
506 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
507   kmp_i18n_id_t *const msg_id)
508 {
509     kmp_cpuid buf;
510     int rc;
511     *address2os = NULL;
512     *msg_id = kmp_i18n_null;
513 
514     //
515     // Check if cpuid leaf 4 is supported.
516     //
517         __kmp_x86_cpuid(0, 0, &buf);
518         if (buf.eax < 4) {
519             *msg_id = kmp_i18n_str_NoLeaf4Support;
520             return -1;
521         }
522 
523     //
524     // The algorithm used starts by setting the affinity to each available
525     // thread and retrieving info from the cpuid instruction, so if we are
526     // not capable of calling __kmp_get_system_affinity() and
527     // _kmp_get_system_affinity(), then we need to do something else - use
528     // the defaults that we calculated from issuing cpuid without binding
529     // to each proc.
530     //
531     if (! KMP_AFFINITY_CAPABLE()) {
532         //
533         // Hack to try and infer the machine topology using only the data
534         // available from cpuid on the current thread, and __kmp_xproc.
535         //
536         KMP_ASSERT(__kmp_affinity_type == affinity_none);
537 
538         //
539         // Get an upper bound on the number of threads per package using
540         // cpuid(1).
541         //
542         // On some OS/chps combinations where HT is supported by the chip
543         // but is disabled, this value will be 2 on a single core chip.
544         // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
545         //
546         __kmp_x86_cpuid(1, 0, &buf);
547         int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
548         if (maxThreadsPerPkg == 0) {
549             maxThreadsPerPkg = 1;
550         }
551 
552         //
553         // The num cores per pkg comes from cpuid(4).
554         // 1 must be added to the encoded value.
555         //
556         // The author of cpu_count.cpp treated this only an upper bound
557         // on the number of cores, but I haven't seen any cases where it
558         // was greater than the actual number of cores, so we will treat
559         // it as exact in this block of code.
560         //
561         // First, we need to check if cpuid(4) is supported on this chip.
562         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
563         // has the value n or greater.
564         //
565         __kmp_x86_cpuid(0, 0, &buf);
566         if (buf.eax >= 4) {
567             __kmp_x86_cpuid(4, 0, &buf);
568             nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
569         }
570         else {
571             nCoresPerPkg = 1;
572         }
573 
574         //
575         // There is no way to reliably tell if HT is enabled without issuing
576         // the cpuid instruction from every thread, can correlating the cpuid
577         // info, so if the machine is not affinity capable, we assume that HT
578         // is off.  We have seen quite a few machines where maxThreadsPerPkg
579         // is 2, yet the machine does not support HT.
580         //
581         // - Older OSes are usually found on machines with older chips, which
582         //   do not support HT.
583         //
584         // - The performance penalty for mistakenly identifying a machine as
585         //   HT when it isn't (which results in blocktime being incorrecly set
586         //   to 0) is greater than the penalty when for mistakenly identifying
587         //   a machine as being 1 thread/core when it is really HT enabled
588         //   (which results in blocktime being incorrectly set to a positive
589         //   value).
590         //
591         __kmp_ncores = __kmp_xproc;
592         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
593         __kmp_nThreadsPerCore = 1;
594         if (__kmp_affinity_verbose) {
595             KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
596             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
597             if (__kmp_affinity_uniform_topology()) {
598                 KMP_INFORM(Uniform, "KMP_AFFINITY");
599             } else {
600                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
601             }
602             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
603               __kmp_nThreadsPerCore, __kmp_ncores);
604         }
605         return 0;
606     }
607 
608     //
609     //
610     // From here on, we can assume that it is safe to call
611     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
612     // even if __kmp_affinity_type = affinity_none.
613     //
614 
615     //
616     // Save the affinity mask for the current thread.
617     //
618     kmp_affin_mask_t *oldMask;
619     KMP_CPU_ALLOC(oldMask);
620     KMP_ASSERT(oldMask != NULL);
621     __kmp_get_system_affinity(oldMask, TRUE);
622 
623     //
624     // Run through each of the available contexts, binding the current thread
625     // to it, and obtaining the pertinent information using the cpuid instr.
626     //
627     // The relevant information is:
628     //
629     // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
630     //    has a uniqie Apic Id, which is of the form pkg# : core# : thread#.
631     //
632     // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
633     //    value of this field determines the width of the core# + thread#
634     //    fields in the Apic Id.  It is also an upper bound on the number
635     //    of threads per package, but it has been verified that situations
636     //    happen were it is not exact.  In particular, on certain OS/chip
637     //    combinations where Intel(R) Hyper-Threading Technology is supported
638     //    by the chip but has
639     //    been disabled, the value of this field will be 2 (for a single core
640     //    chip).  On other OS/chip combinations supporting
641     //    Intel(R) Hyper-Threading Technology, the value of
642     //    this field will be 1 when Intel(R) Hyper-Threading Technology is
643     //    disabled and 2 when it is enabled.
644     //
645     // Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4).  The
646     //    value of this field (+1) determines the width of the core# field in
647     //    the Apic Id.  The comments in "cpucount.cpp" say that this value is
648     //    an upper bound, but the IA-32 architecture manual says that it is
649     //    exactly the number of cores per package, and I haven't seen any
650     //    case where it wasn't.
651     //
652     // From this information, deduce the package Id, core Id, and thread Id,
653     // and set the corresponding fields in the apicThreadInfo struct.
654     //
655     unsigned i;
656     apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
657       __kmp_avail_proc * sizeof(apicThreadInfo));
658     unsigned nApics = 0;
659     for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
660         //
661         // Skip this proc if it is not included in the machine model.
662         //
663         if (! KMP_CPU_ISSET(i, fullMask)) {
664             continue;
665         }
666         KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
667 
668         __kmp_affinity_bind_thread(i);
669         threadInfo[nApics].osId = i;
670 
671         //
672         // The apic id and max threads per pkg come from cpuid(1).
673         //
674         __kmp_x86_cpuid(1, 0, &buf);
675         if (! (buf.edx >> 9) & 1) {
676             __kmp_set_system_affinity(oldMask, TRUE);
677             __kmp_free(threadInfo);
678             KMP_CPU_FREE(oldMask);
679             *msg_id = kmp_i18n_str_ApicNotPresent;
680             return -1;
681         }
682         threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
683         threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
684         if (threadInfo[nApics].maxThreadsPerPkg == 0) {
685             threadInfo[nApics].maxThreadsPerPkg = 1;
686         }
687 
688         //
689         // Max cores per pkg comes from cpuid(4).
690         // 1 must be added to the encoded value.
691         //
692         // First, we need to check if cpuid(4) is supported on this chip.
693         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
694         // has the value n or greater.
695         //
696         __kmp_x86_cpuid(0, 0, &buf);
697         if (buf.eax >= 4) {
698             __kmp_x86_cpuid(4, 0, &buf);
699             threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
700         }
701         else {
702             threadInfo[nApics].maxCoresPerPkg = 1;
703         }
704 
705         //
706         // Infer the pkgId / coreId / threadId using only the info
707         // obtained locally.
708         //
709         int widthCT = __kmp_cpuid_mask_width(
710           threadInfo[nApics].maxThreadsPerPkg);
711         threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
712 
713         int widthC = __kmp_cpuid_mask_width(
714           threadInfo[nApics].maxCoresPerPkg);
715         int widthT = widthCT - widthC;
716         if (widthT < 0) {
717             //
718             // I've never seen this one happen, but I suppose it could, if
719             // the cpuid instruction on a chip was really screwed up.
720             // Make sure to restore the affinity mask before the tail call.
721             //
722             __kmp_set_system_affinity(oldMask, TRUE);
723             __kmp_free(threadInfo);
724             KMP_CPU_FREE(oldMask);
725             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
726             return -1;
727         }
728 
729         int maskC = (1 << widthC) - 1;
730         threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
731           &maskC;
732 
733         int maskT = (1 << widthT) - 1;
734         threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT;
735 
736         nApics++;
737     }
738 
739     //
740     // We've collected all the info we need.
741     // Restore the old affinity mask for this thread.
742     //
743     __kmp_set_system_affinity(oldMask, TRUE);
744 
745     //
746     // If there's only one thread context to bind to, form an Address object
747     // with depth 1 and return immediately (or, if affinity is off, set
748     // address2os to NULL and return).
749     //
750     // If it is configured to omit the package level when there is only a
751     // single package, the logic at the end of this routine won't work if
752     // there is only a single thread - it would try to form an Address
753     // object with depth 0.
754     //
755     KMP_ASSERT(nApics > 0);
756     if (nApics == 1) {
757         __kmp_ncores = nPackages = 1;
758         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
759         if (__kmp_affinity_verbose) {
760             char buf[KMP_AFFIN_MASK_PRINT_LEN];
761             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
762 
763             KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
764             if (__kmp_affinity_respect_mask) {
765                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
766             } else {
767                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
768             }
769             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
770             KMP_INFORM(Uniform, "KMP_AFFINITY");
771             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
772               __kmp_nThreadsPerCore, __kmp_ncores);
773         }
774 
775         if (__kmp_affinity_type == affinity_none) {
776             __kmp_free(threadInfo);
777             KMP_CPU_FREE(oldMask);
778             return 0;
779         }
780 
781         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
782         Address addr(1);
783         addr.labels[0] = threadInfo[0].pkgId;
784         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
785 
786         if (__kmp_affinity_gran_levels < 0) {
787             __kmp_affinity_gran_levels = 0;
788         }
789 
790         if (__kmp_affinity_verbose) {
791             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
792         }
793 
794         __kmp_free(threadInfo);
795         KMP_CPU_FREE(oldMask);
796         return 1;
797     }
798 
799     //
800     // Sort the threadInfo table by physical Id.
801     //
802     qsort(threadInfo, nApics, sizeof(*threadInfo),
803       __kmp_affinity_cmp_apicThreadInfo_phys_id);
804 
805     //
806     // The table is now sorted by pkgId / coreId / threadId, but we really
807     // don't know the radix of any of the fields.  pkgId's may be sparsely
808     // assigned among the chips on a system.  Although coreId's are usually
809     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
810     // [0..threadsPerCore-1], we don't want to make any such assumptions.
811     //
812     // For that matter, we don't know what coresPerPkg and threadsPerCore
813     // (or the total # packages) are at this point - we want to determine
814     // that now.  We only have an upper bound on the first two figures.
815     //
816     // We also perform a consistency check at this point: the values returned
817     // by the cpuid instruction for any thread bound to a given package had
818     // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
819     //
820     nPackages = 1;
821     nCoresPerPkg = 1;
822     __kmp_nThreadsPerCore = 1;
823     unsigned nCores = 1;
824 
825     unsigned pkgCt = 1;                         // to determine radii
826     unsigned lastPkgId = threadInfo[0].pkgId;
827     unsigned coreCt = 1;
828     unsigned lastCoreId = threadInfo[0].coreId;
829     unsigned threadCt = 1;
830     unsigned lastThreadId = threadInfo[0].threadId;
831 
832                                                 // intra-pkg consist checks
833     unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
834     unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
835 
836     for (i = 1; i < nApics; i++) {
837         if (threadInfo[i].pkgId != lastPkgId) {
838             nCores++;
839             pkgCt++;
840             lastPkgId = threadInfo[i].pkgId;
841             if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
842             coreCt = 1;
843             lastCoreId = threadInfo[i].coreId;
844             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
845             threadCt = 1;
846             lastThreadId = threadInfo[i].threadId;
847 
848             //
849             // This is a different package, so go on to the next iteration
850             // without doing any consistency checks.  Reset the consistency
851             // check vars, though.
852             //
853             prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
854             prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
855             continue;
856         }
857 
858         if (threadInfo[i].coreId != lastCoreId) {
859             nCores++;
860             coreCt++;
861             lastCoreId = threadInfo[i].coreId;
862             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
863             threadCt = 1;
864             lastThreadId = threadInfo[i].threadId;
865         }
866         else if (threadInfo[i].threadId != lastThreadId) {
867             threadCt++;
868             lastThreadId = threadInfo[i].threadId;
869         }
870         else {
871             __kmp_free(threadInfo);
872             KMP_CPU_FREE(oldMask);
873             *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
874             return -1;
875         }
876 
877         //
878         // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
879         // fields agree between all the threads bounds to a given package.
880         //
881         if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
882           || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
883             __kmp_free(threadInfo);
884             KMP_CPU_FREE(oldMask);
885             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
886             return -1;
887         }
888     }
889     nPackages = pkgCt;
890     if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
891     if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
892 
893     //
894     // When affinity is off, this routine will still be called to set
895     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
896     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
897     // correctly, and return now if affinity is not enabled.
898     //
899     __kmp_ncores = nCores;
900     if (__kmp_affinity_verbose) {
901         char buf[KMP_AFFIN_MASK_PRINT_LEN];
902         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
903 
904         KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
905         if (__kmp_affinity_respect_mask) {
906             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
907         } else {
908             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
909         }
910         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
911         if (__kmp_affinity_uniform_topology()) {
912             KMP_INFORM(Uniform, "KMP_AFFINITY");
913         } else {
914             KMP_INFORM(NonUniform, "KMP_AFFINITY");
915         }
916         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
917           __kmp_nThreadsPerCore, __kmp_ncores);
918 
919     }
920 
921     if (__kmp_affinity_type == affinity_none) {
922         __kmp_free(threadInfo);
923         KMP_CPU_FREE(oldMask);
924         return 0;
925     }
926 
927     //
928     // Now that we've determined the number of packages, the number of cores
929     // per package, and the number of threads per core, we can construct the
930     // data structure that is to be returned.
931     //
932     int pkgLevel = 0;
933     int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
934     int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
935     unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
936 
937     KMP_ASSERT(depth > 0);
938     *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
939 
940     for (i = 0; i < nApics; ++i) {
941         Address addr(depth);
942         unsigned os = threadInfo[i].osId;
943         int d = 0;
944 
945         if (pkgLevel >= 0) {
946             addr.labels[d++] = threadInfo[i].pkgId;
947         }
948         if (coreLevel >= 0) {
949             addr.labels[d++] = threadInfo[i].coreId;
950         }
951         if (threadLevel >= 0) {
952             addr.labels[d++] = threadInfo[i].threadId;
953         }
954         (*address2os)[i] = AddrUnsPair(addr, os);
955     }
956 
957     if (__kmp_affinity_gran_levels < 0) {
958         //
959         // Set the granularity level based on what levels are modeled
960         // in the machine topology map.
961         //
962         __kmp_affinity_gran_levels = 0;
963         if ((threadLevel >= 0)
964           && (__kmp_affinity_gran > affinity_gran_thread)) {
965             __kmp_affinity_gran_levels++;
966         }
967         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
968             __kmp_affinity_gran_levels++;
969         }
970         if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
971             __kmp_affinity_gran_levels++;
972         }
973     }
974 
975     if (__kmp_affinity_verbose) {
976         __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
977           coreLevel, threadLevel);
978     }
979 
980     __kmp_free(threadInfo);
981     KMP_CPU_FREE(oldMask);
982     return depth;
983 }
984 
985 
986 //
987 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
988 // architectures support a newer interface for specifying the x2APIC Ids,
989 // based on cpuid leaf 11.
990 //
991 static int
992 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
993   kmp_i18n_id_t *const msg_id)
994 {
995     kmp_cpuid buf;
996 
997     *address2os = NULL;
998     *msg_id = kmp_i18n_null;
999 
1000     //
1001     // Check to see if cpuid leaf 11 is supported.
1002     //
1003     __kmp_x86_cpuid(0, 0, &buf);
1004     if (buf.eax < 11) {
1005         *msg_id = kmp_i18n_str_NoLeaf11Support;
1006         return -1;
1007     }
1008     __kmp_x86_cpuid(11, 0, &buf);
1009     if (buf.ebx == 0) {
1010         *msg_id = kmp_i18n_str_NoLeaf11Support;
1011         return -1;
1012     }
1013 
1014     //
1015     // Find the number of levels in the machine topology.  While we're at it,
1016     // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.  We will
1017     // try to get more accurate values later by explicitly counting them,
1018     // but get reasonable defaults now, in case we return early.
1019     //
1020     int level;
1021     int threadLevel = -1;
1022     int coreLevel = -1;
1023     int pkgLevel = -1;
1024     __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1025 
1026     for (level = 0;; level++) {
1027         if (level > 31) {
1028             //
1029             // FIXME: Hack for DPD200163180
1030             //
1031             // If level is big then something went wrong -> exiting
1032             //
1033             // There could actually be 32 valid levels in the machine topology,
1034             // but so far, the only machine we have seen which does not exit
1035             // this loop before iteration 32 has fubar x2APIC settings.
1036             //
1037             // For now, just reject this case based upon loop trip count.
1038             //
1039             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1040             return -1;
1041         }
1042         __kmp_x86_cpuid(11, level, &buf);
1043         if (buf.ebx == 0) {
1044             if (pkgLevel < 0) {
1045                 //
1046                 // Will infer nPackages from __kmp_xproc
1047                 //
1048                 pkgLevel = level;
1049                 level++;
1050             }
1051             break;
1052         }
1053         int kind = (buf.ecx >> 8) & 0xff;
1054         if (kind == 1) {
1055             //
1056             // SMT level
1057             //
1058             threadLevel = level;
1059             coreLevel = -1;
1060             pkgLevel = -1;
1061             __kmp_nThreadsPerCore = buf.ebx & 0xff;
1062             if (__kmp_nThreadsPerCore == 0) {
1063                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1064                 return -1;
1065             }
1066         }
1067         else if (kind == 2) {
1068             //
1069             // core level
1070             //
1071             coreLevel = level;
1072             pkgLevel = -1;
1073             nCoresPerPkg = buf.ebx & 0xff;
1074             if (nCoresPerPkg == 0) {
1075                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1076                 return -1;
1077             }
1078         }
1079         else {
1080             if (level <= 0) {
1081                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1082                 return -1;
1083             }
1084             if (pkgLevel >= 0) {
1085                 continue;
1086             }
1087             pkgLevel = level;
1088             nPackages = buf.ebx & 0xff;
1089             if (nPackages == 0) {
1090                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1091                 return -1;
1092             }
1093         }
1094     }
1095     int depth = level;
1096 
1097     //
1098     // In the above loop, "level" was counted from the finest level (usually
1099     // thread) to the coarsest.  The caller expects that we will place the
1100     // labels in (*address2os)[].first.labels[] in the inverse order, so
1101     // we need to invert the vars saying which level means what.
1102     //
1103     if (threadLevel >= 0) {
1104         threadLevel = depth - threadLevel - 1;
1105     }
1106     if (coreLevel >= 0) {
1107         coreLevel = depth - coreLevel - 1;
1108     }
1109     KMP_DEBUG_ASSERT(pkgLevel >= 0);
1110     pkgLevel = depth - pkgLevel - 1;
1111 
1112     //
1113     // The algorithm used starts by setting the affinity to each available
1114     // thread and retrieving info from the cpuid instruction, so if we are
1115     // not capable of calling __kmp_get_system_affinity() and
1116     // _kmp_get_system_affinity(), then we need to do something else - use
1117     // the defaults that we calculated from issuing cpuid without binding
1118     // to each proc.
1119     //
1120     if (! KMP_AFFINITY_CAPABLE())
1121     {
1122         //
1123         // Hack to try and infer the machine topology using only the data
1124         // available from cpuid on the current thread, and __kmp_xproc.
1125         //
1126         KMP_ASSERT(__kmp_affinity_type == affinity_none);
1127 
1128         __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1129         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1130         if (__kmp_affinity_verbose) {
1131             KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1132             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1133             if (__kmp_affinity_uniform_topology()) {
1134                 KMP_INFORM(Uniform, "KMP_AFFINITY");
1135             } else {
1136                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1137             }
1138             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1139               __kmp_nThreadsPerCore, __kmp_ncores);
1140         }
1141         return 0;
1142     }
1143 
1144     //
1145     //
1146     // From here on, we can assume that it is safe to call
1147     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1148     // even if __kmp_affinity_type = affinity_none.
1149     //
1150 
1151     //
1152     // Save the affinity mask for the current thread.
1153     //
1154     kmp_affin_mask_t *oldMask;
1155     KMP_CPU_ALLOC(oldMask);
1156     __kmp_get_system_affinity(oldMask, TRUE);
1157 
1158     //
1159     // Allocate the data structure to be returned.
1160     //
1161     AddrUnsPair *retval = (AddrUnsPair *)
1162       __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1163 
1164     //
1165     // Run through each of the available contexts, binding the current thread
1166     // to it, and obtaining the pertinent information using the cpuid instr.
1167     //
1168     unsigned int proc;
1169     int nApics = 0;
1170     for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
1171         //
1172         // Skip this proc if it is not included in the machine model.
1173         //
1174         if (! KMP_CPU_ISSET(proc, fullMask)) {
1175             continue;
1176         }
1177         KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1178 
1179         __kmp_affinity_bind_thread(proc);
1180 
1181         //
1182         // Extrach the labels for each level in the machine topology map
1183         // from the Apic ID.
1184         //
1185         Address addr(depth);
1186         int prev_shift = 0;
1187 
1188         for (level = 0; level < depth; level++) {
1189             __kmp_x86_cpuid(11, level, &buf);
1190             unsigned apicId = buf.edx;
1191             if (buf.ebx == 0) {
1192                 if (level != depth - 1) {
1193                     KMP_CPU_FREE(oldMask);
1194                     *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1195                     return -1;
1196                 }
1197                 addr.labels[depth - level - 1] = apicId >> prev_shift;
1198                 level++;
1199                 break;
1200             }
1201             int shift = buf.eax & 0x1f;
1202             int mask = (1 << shift) - 1;
1203             addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1204             prev_shift = shift;
1205         }
1206         if (level != depth) {
1207             KMP_CPU_FREE(oldMask);
1208             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1209             return -1;
1210         }
1211 
1212         retval[nApics] = AddrUnsPair(addr, proc);
1213         nApics++;
1214     }
1215 
1216     //
1217     // We've collected all the info we need.
1218     // Restore the old affinity mask for this thread.
1219     //
1220     __kmp_set_system_affinity(oldMask, TRUE);
1221 
1222     //
1223     // If there's only one thread context to bind to, return now.
1224     //
1225     KMP_ASSERT(nApics > 0);
1226     if (nApics == 1) {
1227         __kmp_ncores = nPackages = 1;
1228         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1229         if (__kmp_affinity_verbose) {
1230             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1231             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1232 
1233             KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1234             if (__kmp_affinity_respect_mask) {
1235                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1236             } else {
1237                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1238             }
1239             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1240             KMP_INFORM(Uniform, "KMP_AFFINITY");
1241             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1242               __kmp_nThreadsPerCore, __kmp_ncores);
1243         }
1244 
1245         if (__kmp_affinity_type == affinity_none) {
1246             __kmp_free(retval);
1247             KMP_CPU_FREE(oldMask);
1248             return 0;
1249         }
1250 
1251         //
1252         // Form an Address object which only includes the package level.
1253         //
1254         Address addr(1);
1255         addr.labels[0] = retval[0].first.labels[pkgLevel];
1256         retval[0].first = addr;
1257 
1258         if (__kmp_affinity_gran_levels < 0) {
1259             __kmp_affinity_gran_levels = 0;
1260         }
1261 
1262         if (__kmp_affinity_verbose) {
1263             __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1264         }
1265 
1266         *address2os = retval;
1267         KMP_CPU_FREE(oldMask);
1268         return 1;
1269     }
1270 
1271     //
1272     // Sort the table by physical Id.
1273     //
1274     qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1275 
1276     //
1277     // Find the radix at each of the levels.
1278     //
1279     unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1280     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1281     unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1282     unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1283     for (level = 0; level < depth; level++) {
1284         totals[level] = 1;
1285         maxCt[level] = 1;
1286         counts[level] = 1;
1287         last[level] = retval[0].first.labels[level];
1288     }
1289 
1290     //
1291     // From here on, the iteration variable "level" runs from the finest
1292     // level to the coarsest, i.e. we iterate forward through
1293     // (*address2os)[].first.labels[] - in the previous loops, we iterated
1294     // backwards.
1295     //
1296     for (proc = 1; (int)proc < nApics; proc++) {
1297         int level;
1298         for (level = 0; level < depth; level++) {
1299             if (retval[proc].first.labels[level] != last[level]) {
1300                 int j;
1301                 for (j = level + 1; j < depth; j++) {
1302                     totals[j]++;
1303                     counts[j] = 1;
1304                     // The line below causes printing incorrect topology information
1305                     // in case the max value for some level (maxCt[level]) is encountered earlier than
1306                     // some less value while going through the array.
1307                     // For example, let pkg0 has 4 cores and pkg1 has 2 cores. Then maxCt[1] == 2
1308                     // whereas it must be 4.
1309                     // TODO!!! Check if it can be commented safely
1310                     //maxCt[j] = 1;
1311                     last[j] = retval[proc].first.labels[j];
1312                 }
1313                 totals[level]++;
1314                 counts[level]++;
1315                 if (counts[level] > maxCt[level]) {
1316                     maxCt[level] = counts[level];
1317                 }
1318                 last[level] = retval[proc].first.labels[level];
1319                 break;
1320             }
1321             else if (level == depth - 1) {
1322                 __kmp_free(last);
1323                 __kmp_free(maxCt);
1324                 __kmp_free(counts);
1325                 __kmp_free(totals);
1326                 __kmp_free(retval);
1327                 KMP_CPU_FREE(oldMask);
1328                 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1329                 return -1;
1330             }
1331         }
1332     }
1333 
1334     //
1335     // When affinity is off, this routine will still be called to set
1336     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1337     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1338     // correctly, and return if affinity is not enabled.
1339     //
1340     if (threadLevel >= 0) {
1341         __kmp_nThreadsPerCore = maxCt[threadLevel];
1342     }
1343     else {
1344         __kmp_nThreadsPerCore = 1;
1345     }
1346     nPackages = totals[pkgLevel];
1347 
1348     if (coreLevel >= 0) {
1349         __kmp_ncores = totals[coreLevel];
1350         nCoresPerPkg = maxCt[coreLevel];
1351     }
1352     else {
1353         __kmp_ncores = nPackages;
1354         nCoresPerPkg = 1;
1355     }
1356 
1357     //
1358     // Check to see if the machine topology is uniform
1359     //
1360     unsigned prod = maxCt[0];
1361     for (level = 1; level < depth; level++) {
1362        prod *= maxCt[level];
1363     }
1364     bool uniform = (prod == totals[level - 1]);
1365 
1366     //
1367     // Print the machine topology summary.
1368     //
1369     if (__kmp_affinity_verbose) {
1370         char mask[KMP_AFFIN_MASK_PRINT_LEN];
1371         __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1372 
1373         KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1374         if (__kmp_affinity_respect_mask) {
1375             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1376         } else {
1377             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1378         }
1379         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1380         if (uniform) {
1381             KMP_INFORM(Uniform, "KMP_AFFINITY");
1382         } else {
1383             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1384         }
1385 
1386         kmp_str_buf_t buf;
1387         __kmp_str_buf_init(&buf);
1388 
1389         __kmp_str_buf_print(&buf, "%d", totals[0]);
1390         for (level = 1; level <= pkgLevel; level++) {
1391             __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1392         }
1393         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1394           __kmp_nThreadsPerCore, __kmp_ncores);
1395 
1396         __kmp_str_buf_free(&buf);
1397     }
1398 
1399     if (__kmp_affinity_type == affinity_none) {
1400         __kmp_free(last);
1401         __kmp_free(maxCt);
1402         __kmp_free(counts);
1403         __kmp_free(totals);
1404         __kmp_free(retval);
1405         KMP_CPU_FREE(oldMask);
1406         return 0;
1407     }
1408 
1409     //
1410     // Find any levels with radiix 1, and remove them from the map
1411     // (except for the package level).
1412     //
1413     int new_depth = 0;
1414     for (level = 0; level < depth; level++) {
1415         if ((maxCt[level] == 1) && (level != pkgLevel)) {
1416            continue;
1417         }
1418         new_depth++;
1419     }
1420 
1421     //
1422     // If we are removing any levels, allocate a new vector to return,
1423     // and copy the relevant information to it.
1424     //
1425     if (new_depth != depth) {
1426         AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1427           sizeof(AddrUnsPair) * nApics);
1428         for (proc = 0; (int)proc < nApics; proc++) {
1429             Address addr(new_depth);
1430             new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1431         }
1432         int new_level = 0;
1433         int newPkgLevel = -1;
1434         int newCoreLevel = -1;
1435         int newThreadLevel = -1;
1436         int i;
1437         for (level = 0; level < depth; level++) {
1438             if ((maxCt[level] == 1)
1439               && (level != pkgLevel)) {
1440                 //
1441                 // Remove this level. Never remove the package level
1442                 //
1443                 continue;
1444             }
1445             if (level == pkgLevel) {
1446                 newPkgLevel = level;
1447             }
1448             if (level == coreLevel) {
1449                 newCoreLevel = level;
1450             }
1451             if (level == threadLevel) {
1452                 newThreadLevel = level;
1453             }
1454             for (proc = 0; (int)proc < nApics; proc++) {
1455                 new_retval[proc].first.labels[new_level]
1456                   = retval[proc].first.labels[level];
1457             }
1458             new_level++;
1459         }
1460 
1461         __kmp_free(retval);
1462         retval = new_retval;
1463         depth = new_depth;
1464         pkgLevel = newPkgLevel;
1465         coreLevel = newCoreLevel;
1466         threadLevel = newThreadLevel;
1467     }
1468 
1469     if (__kmp_affinity_gran_levels < 0) {
1470         //
1471         // Set the granularity level based on what levels are modeled
1472         // in the machine topology map.
1473         //
1474         __kmp_affinity_gran_levels = 0;
1475         if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1476             __kmp_affinity_gran_levels++;
1477         }
1478         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1479             __kmp_affinity_gran_levels++;
1480         }
1481         if (__kmp_affinity_gran > affinity_gran_package) {
1482             __kmp_affinity_gran_levels++;
1483         }
1484     }
1485 
1486     if (__kmp_affinity_verbose) {
1487         __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1488           coreLevel, threadLevel);
1489     }
1490 
1491     __kmp_free(last);
1492     __kmp_free(maxCt);
1493     __kmp_free(counts);
1494     __kmp_free(totals);
1495     KMP_CPU_FREE(oldMask);
1496     *address2os = retval;
1497     return depth;
1498 }
1499 
1500 
1501 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1502 
1503 
1504 #define osIdIndex       0
1505 #define threadIdIndex   1
1506 #define coreIdIndex     2
1507 #define pkgIdIndex      3
1508 #define nodeIdIndex     4
1509 
1510 typedef unsigned *ProcCpuInfo;
1511 static unsigned maxIndex = pkgIdIndex;
1512 
1513 
1514 static int
1515 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1516 {
1517     const unsigned *aa = (const unsigned *)a;
1518     const unsigned *bb = (const unsigned *)b;
1519     if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1520     if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1521     return 0;
1522 };
1523 
1524 
1525 static int
1526 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1527 {
1528     unsigned i;
1529     const unsigned *aa = *((const unsigned **)a);
1530     const unsigned *bb = *((const unsigned **)b);
1531     for (i = maxIndex; ; i--) {
1532         if (aa[i] < bb[i]) return -1;
1533         if (aa[i] > bb[i]) return 1;
1534         if (i == osIdIndex) break;
1535     }
1536     return 0;
1537 }
1538 
1539 
1540 //
1541 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1542 // affinity map.
1543 //
1544 static int
1545 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1546   kmp_i18n_id_t *const msg_id, FILE *f)
1547 {
1548     *address2os = NULL;
1549     *msg_id = kmp_i18n_null;
1550 
1551     //
1552     // Scan of the file, and count the number of "processor" (osId) fields,
1553     // and find the highest value of <n> for a node_<n> field.
1554     //
1555     char buf[256];
1556     unsigned num_records = 0;
1557     while (! feof(f)) {
1558         buf[sizeof(buf) - 1] = 1;
1559         if (! fgets(buf, sizeof(buf), f)) {
1560             //
1561             // Read errors presumably because of EOF
1562             //
1563             break;
1564         }
1565 
1566         char s1[] = "processor";
1567         if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1568             num_records++;
1569             continue;
1570         }
1571 
1572         //
1573         // FIXME - this will match "node_<n> <garbage>"
1574         //
1575         unsigned level;
1576         if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
1577             if (nodeIdIndex + level >= maxIndex) {
1578                 maxIndex = nodeIdIndex + level;
1579             }
1580             continue;
1581         }
1582     }
1583 
1584     //
1585     // Check for empty file / no valid processor records, or too many.
1586     // The number of records can't exceed the number of valid bits in the
1587     // affinity mask.
1588     //
1589     if (num_records == 0) {
1590         *line = 0;
1591         *msg_id = kmp_i18n_str_NoProcRecords;
1592         return -1;
1593     }
1594     if (num_records > (unsigned)__kmp_xproc) {
1595         *line = 0;
1596         *msg_id = kmp_i18n_str_TooManyProcRecords;
1597         return -1;
1598     }
1599 
1600     //
1601     // Set the file pointer back to the begginning, so that we can scan the
1602     // file again, this time performing a full parse of the data.
1603     // Allocate a vector of ProcCpuInfo object, where we will place the data.
1604     // Adding an extra element at the end allows us to remove a lot of extra
1605     // checks for termination conditions.
1606     //
1607     if (fseek(f, 0, SEEK_SET) != 0) {
1608         *line = 0;
1609         *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1610         return -1;
1611     }
1612 
1613     //
1614     // Allocate the array of records to store the proc info in.  The dummy
1615     // element at the end makes the logic in filling them out easier to code.
1616     //
1617     unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1618       * sizeof(unsigned *));
1619     unsigned i;
1620     for (i = 0; i <= num_records; i++) {
1621         threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1622           * sizeof(unsigned));
1623     }
1624 
1625 #define CLEANUP_THREAD_INFO \
1626     for (i = 0; i <= num_records; i++) {                                \
1627         __kmp_free(threadInfo[i]);                                      \
1628     }                                                                   \
1629     __kmp_free(threadInfo);
1630 
1631     //
1632     // A value of UINT_MAX means that we didn't find the field
1633     //
1634     unsigned __index;
1635 
1636 #define INIT_PROC_INFO(p) \
1637     for (__index = 0; __index <= maxIndex; __index++) {                 \
1638         (p)[__index] = UINT_MAX;                                        \
1639     }
1640 
1641     for (i = 0; i <= num_records; i++) {
1642         INIT_PROC_INFO(threadInfo[i]);
1643     }
1644 
1645     unsigned num_avail = 0;
1646     *line = 0;
1647     while (! feof(f)) {
1648         //
1649         // Create an inner scoping level, so that all the goto targets at the
1650         // end of the loop appear in an outer scoping level.  This avoids
1651         // warnings about jumping past an initialization to a target in the
1652         // same block.
1653         //
1654         {
1655             buf[sizeof(buf) - 1] = 1;
1656             bool long_line = false;
1657             if (! fgets(buf, sizeof(buf), f)) {
1658                 //
1659                 // Read errors presumably because of EOF
1660                 //
1661                 // If there is valid data in threadInfo[num_avail], then fake
1662                 // a blank line in ensure that the last address gets parsed.
1663                 //
1664                 bool valid = false;
1665                 for (i = 0; i <= maxIndex; i++) {
1666                     if (threadInfo[num_avail][i] != UINT_MAX) {
1667                         valid = true;
1668                     }
1669                 }
1670                 if (! valid) {
1671                     break;
1672                 }
1673                 buf[0] = 0;
1674             } else if (!buf[sizeof(buf) - 1]) {
1675                 //
1676                 // The line is longer than the buffer.  Set a flag and don't
1677                 // emit an error if we were going to ignore the line, anyway.
1678                 //
1679                 long_line = true;
1680 
1681 #define CHECK_LINE \
1682     if (long_line) {                                                    \
1683         CLEANUP_THREAD_INFO;                                            \
1684         *msg_id = kmp_i18n_str_LongLineCpuinfo;                         \
1685         return -1;                                                      \
1686     }
1687             }
1688             (*line)++;
1689 
1690             char s1[] = "processor";
1691             if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1692                 CHECK_LINE;
1693                 char *p = strchr(buf + sizeof(s1) - 1, ':');
1694                 unsigned val;
1695                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
1696                 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
1697                 threadInfo[num_avail][osIdIndex] = val;
1698 #if KMP_OS_LINUX && USE_SYSFS_INFO
1699                 char path[256];
1700                 KMP_SNPRINTF(path, sizeof(path),
1701                     "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
1702                     threadInfo[num_avail][osIdIndex]);
1703                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
1704 
1705                 KMP_SNPRINTF(path, sizeof(path),
1706                     "/sys/devices/system/cpu/cpu%u/topology/core_id",
1707                     threadInfo[num_avail][osIdIndex]);
1708                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
1709                 continue;
1710 #else
1711             }
1712             char s2[] = "physical id";
1713             if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
1714                 CHECK_LINE;
1715                 char *p = strchr(buf + sizeof(s2) - 1, ':');
1716                 unsigned val;
1717                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
1718                 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
1719                 threadInfo[num_avail][pkgIdIndex] = val;
1720                 continue;
1721             }
1722             char s3[] = "core id";
1723             if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
1724                 CHECK_LINE;
1725                 char *p = strchr(buf + sizeof(s3) - 1, ':');
1726                 unsigned val;
1727                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
1728                 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
1729                 threadInfo[num_avail][coreIdIndex] = val;
1730                 continue;
1731 #endif // KMP_OS_LINUX && USE_SYSFS_INFO
1732             }
1733             char s4[] = "thread id";
1734             if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
1735                 CHECK_LINE;
1736                 char *p = strchr(buf + sizeof(s4) - 1, ':');
1737                 unsigned val;
1738                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
1739                 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
1740                 threadInfo[num_avail][threadIdIndex] = val;
1741                 continue;
1742             }
1743             unsigned level;
1744             if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
1745                 CHECK_LINE;
1746                 char *p = strchr(buf + sizeof(s4) - 1, ':');
1747                 unsigned val;
1748                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
1749                 KMP_ASSERT(nodeIdIndex + level <= maxIndex);
1750                 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
1751                 threadInfo[num_avail][nodeIdIndex + level] = val;
1752                 continue;
1753             }
1754 
1755             //
1756             // We didn't recognize the leading token on the line.
1757             // There are lots of leading tokens that we don't recognize -
1758             // if the line isn't empty, go on to the next line.
1759             //
1760             if ((*buf != 0) && (*buf != '\n')) {
1761                 //
1762                 // If the line is longer than the buffer, read characters
1763                 // until we find a newline.
1764                 //
1765                 if (long_line) {
1766                     int ch;
1767                     while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
1768                 }
1769                 continue;
1770             }
1771 
1772             //
1773             // A newline has signalled the end of the processor record.
1774             // Check that there aren't too many procs specified.
1775             //
1776             if ((int)num_avail == __kmp_xproc) {
1777                 CLEANUP_THREAD_INFO;
1778                 *msg_id = kmp_i18n_str_TooManyEntries;
1779                 return -1;
1780             }
1781 
1782             //
1783             // Check for missing fields.  The osId field must be there, and we
1784             // currently require that the physical id field is specified, also.
1785             //
1786             if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
1787                 CLEANUP_THREAD_INFO;
1788                 *msg_id = kmp_i18n_str_MissingProcField;
1789                 return -1;
1790             }
1791             if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
1792                 CLEANUP_THREAD_INFO;
1793                 *msg_id = kmp_i18n_str_MissingPhysicalIDField;
1794                 return -1;
1795             }
1796 
1797             //
1798             // Skip this proc if it is not included in the machine model.
1799             //
1800             if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
1801                 INIT_PROC_INFO(threadInfo[num_avail]);
1802                 continue;
1803             }
1804 
1805             //
1806             // We have a successful parse of this proc's info.
1807             // Increment the counter, and prepare for the next proc.
1808             //
1809             num_avail++;
1810             KMP_ASSERT(num_avail <= num_records);
1811             INIT_PROC_INFO(threadInfo[num_avail]);
1812         }
1813         continue;
1814 
1815         no_val:
1816         CLEANUP_THREAD_INFO;
1817         *msg_id = kmp_i18n_str_MissingValCpuinfo;
1818         return -1;
1819 
1820         dup_field:
1821         CLEANUP_THREAD_INFO;
1822         *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
1823         return -1;
1824     }
1825     *line = 0;
1826 
1827 # if KMP_MIC && REDUCE_TEAM_SIZE
1828     unsigned teamSize = 0;
1829 # endif // KMP_MIC && REDUCE_TEAM_SIZE
1830 
1831     // check for num_records == __kmp_xproc ???
1832 
1833     //
1834     // If there's only one thread context to bind to, form an Address object
1835     // with depth 1 and return immediately (or, if affinity is off, set
1836     // address2os to NULL and return).
1837     //
1838     // If it is configured to omit the package level when there is only a
1839     // single package, the logic at the end of this routine won't work if
1840     // there is only a single thread - it would try to form an Address
1841     // object with depth 0.
1842     //
1843     KMP_ASSERT(num_avail > 0);
1844     KMP_ASSERT(num_avail <= num_records);
1845     if (num_avail == 1) {
1846         __kmp_ncores = 1;
1847         __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1848         if (__kmp_affinity_verbose) {
1849             if (! KMP_AFFINITY_CAPABLE()) {
1850                 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
1851                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1852                 KMP_INFORM(Uniform, "KMP_AFFINITY");
1853             }
1854             else {
1855                 char buf[KMP_AFFIN_MASK_PRINT_LEN];
1856                 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
1857                   fullMask);
1858                 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
1859                 if (__kmp_affinity_respect_mask) {
1860                     KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1861                 } else {
1862                     KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1863                 }
1864                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1865                 KMP_INFORM(Uniform, "KMP_AFFINITY");
1866             }
1867             int index;
1868             kmp_str_buf_t buf;
1869             __kmp_str_buf_init(&buf);
1870             __kmp_str_buf_print(&buf, "1");
1871             for (index = maxIndex - 1; index > pkgIdIndex; index--) {
1872                 __kmp_str_buf_print(&buf, " x 1");
1873             }
1874             KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
1875             __kmp_str_buf_free(&buf);
1876         }
1877 
1878         if (__kmp_affinity_type == affinity_none) {
1879             CLEANUP_THREAD_INFO;
1880             return 0;
1881         }
1882 
1883         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
1884         Address addr(1);
1885         addr.labels[0] = threadInfo[0][pkgIdIndex];
1886         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
1887 
1888         if (__kmp_affinity_gran_levels < 0) {
1889             __kmp_affinity_gran_levels = 0;
1890         }
1891 
1892         if (__kmp_affinity_verbose) {
1893             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1894         }
1895 
1896         CLEANUP_THREAD_INFO;
1897         return 1;
1898     }
1899 
1900     //
1901     // Sort the threadInfo table by physical Id.
1902     //
1903     qsort(threadInfo, num_avail, sizeof(*threadInfo),
1904       __kmp_affinity_cmp_ProcCpuInfo_phys_id);
1905 
1906     //
1907     // The table is now sorted by pkgId / coreId / threadId, but we really
1908     // don't know the radix of any of the fields.  pkgId's may be sparsely
1909     // assigned among the chips on a system.  Although coreId's are usually
1910     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1911     // [0..threadsPerCore-1], we don't want to make any such assumptions.
1912     //
1913     // For that matter, we don't know what coresPerPkg and threadsPerCore
1914     // (or the total # packages) are at this point - we want to determine
1915     // that now.  We only have an upper bound on the first two figures.
1916     //
1917     unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
1918       * sizeof(unsigned));
1919     unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
1920       * sizeof(unsigned));
1921     unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
1922       * sizeof(unsigned));
1923     unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
1924       * sizeof(unsigned));
1925 
1926     bool assign_thread_ids = false;
1927     unsigned threadIdCt;
1928     unsigned index;
1929 
1930     restart_radix_check:
1931     threadIdCt = 0;
1932 
1933     //
1934     // Initialize the counter arrays with data from threadInfo[0].
1935     //
1936     if (assign_thread_ids) {
1937         if (threadInfo[0][threadIdIndex] == UINT_MAX) {
1938             threadInfo[0][threadIdIndex] = threadIdCt++;
1939         }
1940         else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
1941             threadIdCt = threadInfo[0][threadIdIndex] + 1;
1942         }
1943     }
1944     for (index = 0; index <= maxIndex; index++) {
1945         counts[index] = 1;
1946         maxCt[index] = 1;
1947         totals[index] = 1;
1948         lastId[index] = threadInfo[0][index];;
1949     }
1950 
1951     //
1952     // Run through the rest of the OS procs.
1953     //
1954     for (i = 1; i < num_avail; i++) {
1955         //
1956         // Find the most significant index whose id differs
1957         // from the id for the previous OS proc.
1958         //
1959         for (index = maxIndex; index >= threadIdIndex; index--) {
1960             if (assign_thread_ids && (index == threadIdIndex)) {
1961                 //
1962                 // Auto-assign the thread id field if it wasn't specified.
1963                 //
1964                 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
1965                     threadInfo[i][threadIdIndex] = threadIdCt++;
1966                 }
1967 
1968                 //
1969                 // Aparrently the thread id field was specified for some
1970                 // entries and not others.  Start the thread id counter
1971                 // off at the next higher thread id.
1972                 //
1973                 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
1974                     threadIdCt = threadInfo[i][threadIdIndex] + 1;
1975                 }
1976             }
1977             if (threadInfo[i][index] != lastId[index]) {
1978                 //
1979                 // Run through all indices which are less significant,
1980                 // and reset the counts to 1.
1981                 //
1982                 // At all levels up to and including index, we need to
1983                 // increment the totals and record the last id.
1984                 //
1985                 unsigned index2;
1986                 for (index2 = threadIdIndex; index2 < index; index2++) {
1987                     totals[index2]++;
1988                     if (counts[index2] > maxCt[index2]) {
1989                         maxCt[index2] = counts[index2];
1990                     }
1991                     counts[index2] = 1;
1992                     lastId[index2] = threadInfo[i][index2];
1993                 }
1994                 counts[index]++;
1995                 totals[index]++;
1996                 lastId[index] = threadInfo[i][index];
1997 
1998                 if (assign_thread_ids && (index > threadIdIndex)) {
1999 
2000 # if KMP_MIC && REDUCE_TEAM_SIZE
2001                     //
2002                     // The default team size is the total #threads in the machine
2003                     // minus 1 thread for every core that has 3 or more threads.
2004                     //
2005                     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2006 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2007 
2008                     //
2009                     // Restart the thread counter, as we are on a new core.
2010                     //
2011                     threadIdCt = 0;
2012 
2013                     //
2014                     // Auto-assign the thread id field if it wasn't specified.
2015                     //
2016                     if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2017                         threadInfo[i][threadIdIndex] = threadIdCt++;
2018                     }
2019 
2020                     //
2021                     // Aparrently the thread id field was specified for some
2022                     // entries and not others.  Start the thread id counter
2023                     // off at the next higher thread id.
2024                     //
2025                     else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2026                         threadIdCt = threadInfo[i][threadIdIndex] + 1;
2027                     }
2028                 }
2029                 break;
2030             }
2031         }
2032         if (index < threadIdIndex) {
2033             //
2034             // If thread ids were specified, it is an error if they are not
2035             // unique.  Also, check that we waven't already restarted the
2036             // loop (to be safe - shouldn't need to).
2037             //
2038             if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2039               || assign_thread_ids) {
2040                 __kmp_free(lastId);
2041                 __kmp_free(totals);
2042                 __kmp_free(maxCt);
2043                 __kmp_free(counts);
2044                 CLEANUP_THREAD_INFO;
2045                 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2046                 return -1;
2047             }
2048 
2049             //
2050             // If the thread ids were not specified and we see entries
2051             // entries that are duplicates, start the loop over and
2052             // assign the thread ids manually.
2053             //
2054             assign_thread_ids = true;
2055             goto restart_radix_check;
2056         }
2057     }
2058 
2059 # if KMP_MIC && REDUCE_TEAM_SIZE
2060     //
2061     // The default team size is the total #threads in the machine
2062     // minus 1 thread for every core that has 3 or more threads.
2063     //
2064     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2065 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2066 
2067     for (index = threadIdIndex; index <= maxIndex; index++) {
2068         if (counts[index] > maxCt[index]) {
2069             maxCt[index] = counts[index];
2070         }
2071     }
2072 
2073     __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2074     nCoresPerPkg = maxCt[coreIdIndex];
2075     nPackages = totals[pkgIdIndex];
2076 
2077     //
2078     // Check to see if the machine topology is uniform
2079     //
2080     unsigned prod = totals[maxIndex];
2081     for (index = threadIdIndex; index < maxIndex; index++) {
2082        prod *= maxCt[index];
2083     }
2084     bool uniform = (prod == totals[threadIdIndex]);
2085 
2086     //
2087     // When affinity is off, this routine will still be called to set
2088     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
2089     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
2090     // correctly, and return now if affinity is not enabled.
2091     //
2092     __kmp_ncores = totals[coreIdIndex];
2093 
2094     if (__kmp_affinity_verbose) {
2095         if (! KMP_AFFINITY_CAPABLE()) {
2096                 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2097                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2098                 if (uniform) {
2099                     KMP_INFORM(Uniform, "KMP_AFFINITY");
2100                 } else {
2101                     KMP_INFORM(NonUniform, "KMP_AFFINITY");
2102                 }
2103         }
2104         else {
2105             char buf[KMP_AFFIN_MASK_PRINT_LEN];
2106             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
2107                 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2108                 if (__kmp_affinity_respect_mask) {
2109                     KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2110                 } else {
2111                     KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2112                 }
2113                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2114                 if (uniform) {
2115                     KMP_INFORM(Uniform, "KMP_AFFINITY");
2116                 } else {
2117                     KMP_INFORM(NonUniform, "KMP_AFFINITY");
2118                 }
2119         }
2120         kmp_str_buf_t buf;
2121         __kmp_str_buf_init(&buf);
2122 
2123         __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2124         for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2125             __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2126         }
2127         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str,  maxCt[coreIdIndex],
2128           maxCt[threadIdIndex], __kmp_ncores);
2129 
2130         __kmp_str_buf_free(&buf);
2131     }
2132 
2133 # if KMP_MIC && REDUCE_TEAM_SIZE
2134     //
2135     // Set the default team size.
2136     //
2137     if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2138         __kmp_dflt_team_nth = teamSize;
2139         KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2140           __kmp_dflt_team_nth));
2141     }
2142 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2143 
2144     if (__kmp_affinity_type == affinity_none) {
2145         __kmp_free(lastId);
2146         __kmp_free(totals);
2147         __kmp_free(maxCt);
2148         __kmp_free(counts);
2149         CLEANUP_THREAD_INFO;
2150         return 0;
2151     }
2152 
2153     //
2154     // Count the number of levels which have more nodes at that level than
2155     // at the parent's level (with there being an implicit root node of
2156     // the top level).  This is equivalent to saying that there is at least
2157     // one node at this level which has a sibling.  These levels are in the
2158     // map, and the package level is always in the map.
2159     //
2160     bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2161     int level = 0;
2162     for (index = threadIdIndex; index < maxIndex; index++) {
2163         KMP_ASSERT(totals[index] >= totals[index + 1]);
2164         inMap[index] = (totals[index] > totals[index + 1]);
2165     }
2166     inMap[maxIndex] = (totals[maxIndex] > 1);
2167     inMap[pkgIdIndex] = true;
2168 
2169     int depth = 0;
2170     for (index = threadIdIndex; index <= maxIndex; index++) {
2171         if (inMap[index]) {
2172             depth++;
2173         }
2174     }
2175     KMP_ASSERT(depth > 0);
2176 
2177     //
2178     // Construct the data structure that is to be returned.
2179     //
2180     *address2os = (AddrUnsPair*)
2181       __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2182     int pkgLevel = -1;
2183     int coreLevel = -1;
2184     int threadLevel = -1;
2185 
2186     for (i = 0; i < num_avail; ++i) {
2187         Address addr(depth);
2188         unsigned os = threadInfo[i][osIdIndex];
2189         int src_index;
2190         int dst_index = 0;
2191 
2192         for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2193             if (! inMap[src_index]) {
2194                 continue;
2195             }
2196             addr.labels[dst_index] = threadInfo[i][src_index];
2197             if (src_index == pkgIdIndex) {
2198                 pkgLevel = dst_index;
2199             }
2200             else if (src_index == coreIdIndex) {
2201                 coreLevel = dst_index;
2202             }
2203             else if (src_index == threadIdIndex) {
2204                 threadLevel = dst_index;
2205             }
2206             dst_index++;
2207         }
2208         (*address2os)[i] = AddrUnsPair(addr, os);
2209     }
2210 
2211     if (__kmp_affinity_gran_levels < 0) {
2212         //
2213         // Set the granularity level based on what levels are modeled
2214         // in the machine topology map.
2215         //
2216         unsigned src_index;
2217         __kmp_affinity_gran_levels = 0;
2218         for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2219             if (! inMap[src_index]) {
2220                 continue;
2221             }
2222             switch (src_index) {
2223                 case threadIdIndex:
2224                 if (__kmp_affinity_gran > affinity_gran_thread) {
2225                     __kmp_affinity_gran_levels++;
2226                 }
2227 
2228                 break;
2229                 case coreIdIndex:
2230                 if (__kmp_affinity_gran > affinity_gran_core) {
2231                     __kmp_affinity_gran_levels++;
2232                 }
2233                 break;
2234 
2235                 case pkgIdIndex:
2236                 if (__kmp_affinity_gran > affinity_gran_package) {
2237                     __kmp_affinity_gran_levels++;
2238                 }
2239                 break;
2240             }
2241         }
2242     }
2243 
2244     if (__kmp_affinity_verbose) {
2245         __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2246           coreLevel, threadLevel);
2247     }
2248 
2249     __kmp_free(inMap);
2250     __kmp_free(lastId);
2251     __kmp_free(totals);
2252     __kmp_free(maxCt);
2253     __kmp_free(counts);
2254     CLEANUP_THREAD_INFO;
2255     return depth;
2256 }
2257 
2258 
2259 //
2260 // Create and return a table of affinity masks, indexed by OS thread ID.
2261 // This routine handles OR'ing together all the affinity masks of threads
2262 // that are sufficiently close, if granularity > fine.
2263 //
2264 static kmp_affin_mask_t *
2265 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2266   AddrUnsPair *address2os, unsigned numAddrs)
2267 {
2268     //
2269     // First form a table of affinity masks in order of OS thread id.
2270     //
2271     unsigned depth;
2272     unsigned maxOsId;
2273     unsigned i;
2274 
2275     KMP_ASSERT(numAddrs > 0);
2276     depth = address2os[0].first.depth;
2277 
2278     maxOsId = 0;
2279     for (i = 0; i < numAddrs; i++) {
2280         unsigned osId = address2os[i].second;
2281         if (osId > maxOsId) {
2282             maxOsId = osId;
2283         }
2284     }
2285     kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
2286       (maxOsId + 1) * __kmp_affin_mask_size);
2287 
2288     //
2289     // Sort the address2os table according to physical order.  Doing so
2290     // will put all threads on the same core/package/node in consecutive
2291     // locations.
2292     //
2293     qsort(address2os, numAddrs, sizeof(*address2os),
2294       __kmp_affinity_cmp_Address_labels);
2295 
2296     KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2297     if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2298         KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY",  __kmp_affinity_gran_levels);
2299     }
2300     if (__kmp_affinity_gran_levels >= (int)depth) {
2301         if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2302           && (__kmp_affinity_type != affinity_none))) {
2303             KMP_WARNING(AffThreadsMayMigrate);
2304         }
2305     }
2306 
2307     //
2308     // Run through the table, forming the masks for all threads on each
2309     // core.  Threads on the same core will have identical "Address"
2310     // objects, not considering the last level, which must be the thread
2311     // id.  All threads on a core will appear consecutively.
2312     //
2313     unsigned unique = 0;
2314     unsigned j = 0;                             // index of 1st thread on core
2315     unsigned leader = 0;
2316     Address *leaderAddr = &(address2os[0].first);
2317     kmp_affin_mask_t *sum
2318       = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
2319     KMP_CPU_ZERO(sum);
2320     KMP_CPU_SET(address2os[0].second, sum);
2321     for (i = 1; i < numAddrs; i++) {
2322         //
2323         // If this thread is sufficiently close to the leader (within the
2324         // granularity setting), then set the bit for this os thread in the
2325         // affinity mask for this group, and go on to the next thread.
2326         //
2327         if (leaderAddr->isClose(address2os[i].first,
2328           __kmp_affinity_gran_levels)) {
2329             KMP_CPU_SET(address2os[i].second, sum);
2330             continue;
2331         }
2332 
2333         //
2334         // For every thread in this group, copy the mask to the thread's
2335         // entry in the osId2Mask table.  Mark the first address as a
2336         // leader.
2337         //
2338         for (; j < i; j++) {
2339             unsigned osId = address2os[j].second;
2340             KMP_DEBUG_ASSERT(osId <= maxOsId);
2341             kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2342             KMP_CPU_COPY(mask, sum);
2343             address2os[j].first.leader = (j == leader);
2344         }
2345         unique++;
2346 
2347         //
2348         // Start a new mask.
2349         //
2350         leader = i;
2351         leaderAddr = &(address2os[i].first);
2352         KMP_CPU_ZERO(sum);
2353         KMP_CPU_SET(address2os[i].second, sum);
2354     }
2355 
2356     //
2357     // For every thread in last group, copy the mask to the thread's
2358     // entry in the osId2Mask table.
2359     //
2360     for (; j < i; j++) {
2361         unsigned osId = address2os[j].second;
2362         KMP_DEBUG_ASSERT(osId <= maxOsId);
2363         kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2364         KMP_CPU_COPY(mask, sum);
2365         address2os[j].first.leader = (j == leader);
2366     }
2367     unique++;
2368 
2369     *maxIndex = maxOsId;
2370     *numUnique = unique;
2371     return osId2Mask;
2372 }
2373 
2374 
2375 //
2376 // Stuff for the affinity proclist parsers.  It's easier to declare these vars
2377 // as file-static than to try and pass them through the calling sequence of
2378 // the recursive-descent OMP_PLACES parser.
2379 //
2380 static kmp_affin_mask_t *newMasks;
2381 static int numNewMasks;
2382 static int nextNewMask;
2383 
2384 #define ADD_MASK(_mask) \
2385     {                                                                   \
2386         if (nextNewMask >= numNewMasks) {                               \
2387             numNewMasks *= 2;                                           \
2388             newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
2389               numNewMasks * __kmp_affin_mask_size);                     \
2390         }                                                               \
2391         KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));    \
2392         nextNewMask++;                                                  \
2393     }
2394 
2395 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2396     {                                                                   \
2397         if (((_osId) > _maxOsId) ||                                     \
2398           (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
2399             if (__kmp_affinity_verbose || (__kmp_affinity_warnings      \
2400               && (__kmp_affinity_type != affinity_none))) {             \
2401                 KMP_WARNING(AffIgnoreInvalidProcID, _osId);             \
2402             }                                                           \
2403         }                                                               \
2404         else {                                                          \
2405             ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));               \
2406         }                                                               \
2407     }
2408 
2409 
2410 //
2411 // Re-parse the proclist (for the explicit affinity type), and form the list
2412 // of affinity newMasks indexed by gtid.
2413 //
2414 static void
2415 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2416   unsigned int *out_numMasks, const char *proclist,
2417   kmp_affin_mask_t *osId2Mask, int maxOsId)
2418 {
2419     const char *scan = proclist;
2420     const char *next = proclist;
2421 
2422     //
2423     // We use malloc() for the temporary mask vector,
2424     // so that we can use realloc() to extend it.
2425     //
2426     numNewMasks = 2;
2427     newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2428       * __kmp_affin_mask_size);
2429     nextNewMask = 0;
2430     kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
2431       __kmp_affin_mask_size);
2432     int setSize = 0;
2433 
2434     for (;;) {
2435         int start, end, stride;
2436 
2437         SKIP_WS(scan);
2438         next = scan;
2439         if (*next == '\0') {
2440             break;
2441         }
2442 
2443         if (*next == '{') {
2444             int num;
2445             setSize = 0;
2446             next++;     // skip '{'
2447             SKIP_WS(next);
2448             scan = next;
2449 
2450             //
2451             // Read the first integer in the set.
2452             //
2453             KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2454               "bad proclist");
2455             SKIP_DIGITS(next);
2456             num = __kmp_str_to_int(scan, *next);
2457             KMP_ASSERT2(num >= 0, "bad explicit proc list");
2458 
2459             //
2460             // Copy the mask for that osId to the sum (union) mask.
2461             //
2462             if ((num > maxOsId) ||
2463               (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2464                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2465                   && (__kmp_affinity_type != affinity_none))) {
2466                     KMP_WARNING(AffIgnoreInvalidProcID, num);
2467                 }
2468                 KMP_CPU_ZERO(sumMask);
2469             }
2470             else {
2471                 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2472                 setSize = 1;
2473             }
2474 
2475             for (;;) {
2476                 //
2477                 // Check for end of set.
2478                 //
2479                 SKIP_WS(next);
2480                 if (*next == '}') {
2481                     next++;     // skip '}'
2482                     break;
2483                 }
2484 
2485                 //
2486                 // Skip optional comma.
2487                 //
2488                 if (*next == ',') {
2489                     next++;
2490                 }
2491                 SKIP_WS(next);
2492 
2493                 //
2494                 // Read the next integer in the set.
2495                 //
2496                 scan = next;
2497                 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2498                   "bad explicit proc list");
2499 
2500                 SKIP_DIGITS(next);
2501                 num = __kmp_str_to_int(scan, *next);
2502                 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2503 
2504                 //
2505                 // Add the mask for that osId to the sum mask.
2506                 //
2507                 if ((num > maxOsId) ||
2508                   (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2509                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2510                       && (__kmp_affinity_type != affinity_none))) {
2511                         KMP_WARNING(AffIgnoreInvalidProcID, num);
2512                     }
2513                 }
2514                 else {
2515                     KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2516                     setSize++;
2517                 }
2518             }
2519             if (setSize > 0) {
2520                 ADD_MASK(sumMask);
2521             }
2522 
2523             SKIP_WS(next);
2524             if (*next == ',') {
2525                 next++;
2526             }
2527             scan = next;
2528             continue;
2529         }
2530 
2531         //
2532         // Read the first integer.
2533         //
2534         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2535         SKIP_DIGITS(next);
2536         start = __kmp_str_to_int(scan, *next);
2537         KMP_ASSERT2(start >= 0, "bad explicit proc list");
2538         SKIP_WS(next);
2539 
2540         //
2541         // If this isn't a range, then add a mask to the list and go on.
2542         //
2543         if (*next != '-') {
2544             ADD_MASK_OSID(start, osId2Mask, maxOsId);
2545 
2546             //
2547             // Skip optional comma.
2548             //
2549             if (*next == ',') {
2550                 next++;
2551             }
2552             scan = next;
2553             continue;
2554         }
2555 
2556         //
2557         // This is a range.  Skip over the '-' and read in the 2nd int.
2558         //
2559         next++;         // skip '-'
2560         SKIP_WS(next);
2561         scan = next;
2562         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2563         SKIP_DIGITS(next);
2564         end = __kmp_str_to_int(scan, *next);
2565         KMP_ASSERT2(end >= 0, "bad explicit proc list");
2566 
2567         //
2568         // Check for a stride parameter
2569         //
2570         stride = 1;
2571         SKIP_WS(next);
2572         if (*next == ':') {
2573             //
2574             // A stride is specified.  Skip over the ':" and read the 3rd int.
2575             //
2576             int sign = +1;
2577             next++;         // skip ':'
2578             SKIP_WS(next);
2579             scan = next;
2580             if (*next == '-') {
2581                 sign = -1;
2582                 next++;
2583                 SKIP_WS(next);
2584                 scan = next;
2585             }
2586             KMP_ASSERT2((*next >=  '0') && (*next <= '9'),
2587               "bad explicit proc list");
2588             SKIP_DIGITS(next);
2589             stride = __kmp_str_to_int(scan, *next);
2590             KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2591             stride *= sign;
2592         }
2593 
2594         //
2595         // Do some range checks.
2596         //
2597         KMP_ASSERT2(stride != 0, "bad explicit proc list");
2598         if (stride > 0) {
2599             KMP_ASSERT2(start <= end, "bad explicit proc list");
2600         }
2601         else {
2602             KMP_ASSERT2(start >= end, "bad explicit proc list");
2603         }
2604         KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2605 
2606         //
2607         // Add the mask for each OS proc # to the list.
2608         //
2609         if (stride > 0) {
2610             do {
2611                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2612                 start += stride;
2613             } while (start <= end);
2614         }
2615         else {
2616             do {
2617                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2618                 start += stride;
2619             } while (start >= end);
2620         }
2621 
2622         //
2623         // Skip optional comma.
2624         //
2625         SKIP_WS(next);
2626         if (*next == ',') {
2627             next++;
2628         }
2629         scan = next;
2630     }
2631 
2632     *out_numMasks = nextNewMask;
2633     if (nextNewMask == 0) {
2634         *out_masks = NULL;
2635         KMP_INTERNAL_FREE(newMasks);
2636         return;
2637     }
2638     *out_masks
2639       = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
2640     KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
2641     __kmp_free(sumMask);
2642     KMP_INTERNAL_FREE(newMasks);
2643 }
2644 
2645 
2646 # if OMP_40_ENABLED
2647 
2648 /*-----------------------------------------------------------------------------
2649 
2650 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
2651 places.  Again, Here is the grammar:
2652 
2653 place_list := place
2654 place_list := place , place_list
2655 place := num
2656 place := place : num
2657 place := place : num : signed
2658 place := { subplacelist }
2659 place := ! place                  // (lowest priority)
2660 subplace_list := subplace
2661 subplace_list := subplace , subplace_list
2662 subplace := num
2663 subplace := num : num
2664 subplace := num : num : signed
2665 signed := num
2666 signed := + signed
2667 signed := - signed
2668 
2669 -----------------------------------------------------------------------------*/
2670 
2671 static void
2672 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
2673   int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
2674 {
2675     const char *next;
2676 
2677     for (;;) {
2678         int start, count, stride, i;
2679 
2680         //
2681         // Read in the starting proc id
2682         //
2683         SKIP_WS(*scan);
2684         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2685           "bad explicit places list");
2686         next = *scan;
2687         SKIP_DIGITS(next);
2688         start = __kmp_str_to_int(*scan, *next);
2689         KMP_ASSERT(start >= 0);
2690         *scan = next;
2691 
2692         //
2693         // valid follow sets are ',' ':' and '}'
2694         //
2695         SKIP_WS(*scan);
2696         if (**scan == '}' || **scan == ',') {
2697             if ((start > maxOsId) ||
2698               (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2699                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2700                   && (__kmp_affinity_type != affinity_none))) {
2701                     KMP_WARNING(AffIgnoreInvalidProcID, start);
2702                 }
2703             }
2704             else {
2705                 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2706                 (*setSize)++;
2707             }
2708             if (**scan == '}') {
2709                 break;
2710             }
2711             (*scan)++;  // skip ','
2712             continue;
2713         }
2714         KMP_ASSERT2(**scan == ':', "bad explicit places list");
2715         (*scan)++;      // skip ':'
2716 
2717         //
2718         // Read count parameter
2719         //
2720         SKIP_WS(*scan);
2721         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2722           "bad explicit places list");
2723         next = *scan;
2724         SKIP_DIGITS(next);
2725         count = __kmp_str_to_int(*scan, *next);
2726         KMP_ASSERT(count >= 0);
2727         *scan = next;
2728 
2729         //
2730         // valid follow sets are ',' ':' and '}'
2731         //
2732         SKIP_WS(*scan);
2733         if (**scan == '}' || **scan == ',') {
2734             for (i = 0; i < count; i++) {
2735                 if ((start > maxOsId) ||
2736                   (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2737                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2738                       && (__kmp_affinity_type != affinity_none))) {
2739                         KMP_WARNING(AffIgnoreInvalidProcID, start);
2740                     }
2741                     break;  // don't proliferate warnings for large count
2742                 }
2743                 else {
2744                     KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2745                     start++;
2746                     (*setSize)++;
2747                 }
2748             }
2749             if (**scan == '}') {
2750                 break;
2751             }
2752             (*scan)++;  // skip ','
2753             continue;
2754         }
2755         KMP_ASSERT2(**scan == ':', "bad explicit places list");
2756         (*scan)++;      // skip ':'
2757 
2758         //
2759         // Read stride parameter
2760         //
2761         int sign = +1;
2762         for (;;) {
2763             SKIP_WS(*scan);
2764             if (**scan == '+') {
2765                 (*scan)++; // skip '+'
2766                 continue;
2767             }
2768             if (**scan == '-') {
2769                 sign *= -1;
2770                 (*scan)++; // skip '-'
2771                 continue;
2772             }
2773             break;
2774         }
2775         SKIP_WS(*scan);
2776         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2777           "bad explicit places list");
2778         next = *scan;
2779         SKIP_DIGITS(next);
2780         stride = __kmp_str_to_int(*scan, *next);
2781         KMP_ASSERT(stride >= 0);
2782         *scan = next;
2783         stride *= sign;
2784 
2785         //
2786         // valid follow sets are ',' and '}'
2787         //
2788         SKIP_WS(*scan);
2789         if (**scan == '}' || **scan == ',') {
2790             for (i = 0; i < count; i++) {
2791                 if ((start > maxOsId) ||
2792                   (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2793                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2794                       && (__kmp_affinity_type != affinity_none))) {
2795                         KMP_WARNING(AffIgnoreInvalidProcID, start);
2796                     }
2797                     break;  // don't proliferate warnings for large count
2798                 }
2799                 else {
2800                     KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2801                     start += stride;
2802                     (*setSize)++;
2803                 }
2804             }
2805             if (**scan == '}') {
2806                 break;
2807             }
2808             (*scan)++;  // skip ','
2809             continue;
2810         }
2811 
2812         KMP_ASSERT2(0, "bad explicit places list");
2813     }
2814 }
2815 
2816 
2817 static void
2818 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
2819   int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
2820 {
2821     const char *next;
2822 
2823     //
2824     // valid follow sets are '{' '!' and num
2825     //
2826     SKIP_WS(*scan);
2827     if (**scan == '{') {
2828         (*scan)++;      // skip '{'
2829         __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
2830           setSize);
2831         KMP_ASSERT2(**scan == '}', "bad explicit places list");
2832         (*scan)++;      // skip '}'
2833     }
2834     else if (**scan == '!') {
2835         (*scan)++;      // skip '!'
2836         __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
2837         KMP_CPU_COMPLEMENT(tempMask);
2838     }
2839     else if ((**scan >= '0') && (**scan <= '9')) {
2840         next = *scan;
2841         SKIP_DIGITS(next);
2842         int num = __kmp_str_to_int(*scan, *next);
2843         KMP_ASSERT(num >= 0);
2844         if ((num > maxOsId) ||
2845           (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2846             if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2847               && (__kmp_affinity_type != affinity_none))) {
2848                 KMP_WARNING(AffIgnoreInvalidProcID, num);
2849             }
2850         }
2851         else {
2852             KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
2853             (*setSize)++;
2854         }
2855         *scan = next;  // skip num
2856     }
2857     else {
2858         KMP_ASSERT2(0, "bad explicit places list");
2859     }
2860 }
2861 
2862 
2863 //static void
2864 void
2865 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
2866   unsigned int *out_numMasks, const char *placelist,
2867   kmp_affin_mask_t *osId2Mask, int maxOsId)
2868 {
2869     const char *scan = placelist;
2870     const char *next = placelist;
2871 
2872     numNewMasks = 2;
2873     newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2874       * __kmp_affin_mask_size);
2875     nextNewMask = 0;
2876 
2877     kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
2878       __kmp_affin_mask_size);
2879     KMP_CPU_ZERO(tempMask);
2880     int setSize = 0;
2881 
2882     for (;;) {
2883         __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
2884 
2885         //
2886         // valid follow sets are ',' ':' and EOL
2887         //
2888         SKIP_WS(scan);
2889         if (*scan == '\0' || *scan == ',') {
2890             if (setSize > 0) {
2891                 ADD_MASK(tempMask);
2892             }
2893             KMP_CPU_ZERO(tempMask);
2894             setSize = 0;
2895             if (*scan == '\0') {
2896                 break;
2897             }
2898             scan++;     // skip ','
2899             continue;
2900         }
2901 
2902         KMP_ASSERT2(*scan == ':', "bad explicit places list");
2903         scan++;         // skip ':'
2904 
2905         //
2906         // Read count parameter
2907         //
2908         SKIP_WS(scan);
2909         KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
2910           "bad explicit places list");
2911         next = scan;
2912         SKIP_DIGITS(next);
2913         int count = __kmp_str_to_int(scan, *next);
2914         KMP_ASSERT(count >= 0);
2915         scan = next;
2916 
2917         //
2918         // valid follow sets are ',' ':' and EOL
2919         //
2920         SKIP_WS(scan);
2921         int stride;
2922         if (*scan == '\0' || *scan == ',') {
2923             stride = +1;
2924         }
2925         else {
2926             KMP_ASSERT2(*scan == ':', "bad explicit places list");
2927             scan++;         // skip ':'
2928 
2929             //
2930             // Read stride parameter
2931             //
2932             int sign = +1;
2933             for (;;) {
2934                 SKIP_WS(scan);
2935                 if (*scan == '+') {
2936                     scan++; // skip '+'
2937                     continue;
2938                 }
2939                 if (*scan == '-') {
2940                     sign *= -1;
2941                     scan++; // skip '-'
2942                     continue;
2943                 }
2944                 break;
2945             }
2946             SKIP_WS(scan);
2947             KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
2948               "bad explicit places list");
2949             next = scan;
2950             SKIP_DIGITS(next);
2951             stride = __kmp_str_to_int(scan, *next);
2952             KMP_DEBUG_ASSERT(stride >= 0);
2953             scan = next;
2954             stride *= sign;
2955         }
2956 
2957         if (stride > 0) {
2958             int i;
2959             for (i = 0; i < count; i++) {
2960                 int j;
2961                 if (setSize == 0) {
2962                     break;
2963                 }
2964                 ADD_MASK(tempMask);
2965                 setSize = 0;
2966                 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
2967                     if (! KMP_CPU_ISSET(j - stride, tempMask)) {
2968                         KMP_CPU_CLR(j, tempMask);
2969                     }
2970                     else if ((j > maxOsId) ||
2971                       (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
2972                         if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
2973                           && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
2974                             KMP_WARNING(AffIgnoreInvalidProcID, j);
2975                         }
2976                         KMP_CPU_CLR(j, tempMask);
2977                     }
2978                     else {
2979                         KMP_CPU_SET(j, tempMask);
2980                         setSize++;
2981                     }
2982                 }
2983                 for (; j >= 0; j--) {
2984                     KMP_CPU_CLR(j, tempMask);
2985                 }
2986             }
2987         }
2988         else {
2989             int i;
2990             for (i = 0; i < count; i++) {
2991                 int j;
2992                 if (setSize == 0) {
2993                     break;
2994                 }
2995                 ADD_MASK(tempMask);
2996                 setSize = 0;
2997                 for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride;
2998                   j++) {
2999                     if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3000                         KMP_CPU_CLR(j, tempMask);
3001                     }
3002                     else if ((j > maxOsId) ||
3003                       (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3004                         if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3005                           && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
3006                             KMP_WARNING(AffIgnoreInvalidProcID, j);
3007                         }
3008                         KMP_CPU_CLR(j, tempMask);
3009                     }
3010                     else {
3011                         KMP_CPU_SET(j, tempMask);
3012                         setSize++;
3013                     }
3014                 }
3015                 for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
3016                     KMP_CPU_CLR(j, tempMask);
3017                 }
3018             }
3019         }
3020         KMP_CPU_ZERO(tempMask);
3021         setSize = 0;
3022 
3023         //
3024         // valid follow sets are ',' and EOL
3025         //
3026         SKIP_WS(scan);
3027         if (*scan == '\0') {
3028             break;
3029         }
3030         if (*scan == ',') {
3031             scan++;     // skip ','
3032             continue;
3033         }
3034 
3035         KMP_ASSERT2(0, "bad explicit places list");
3036     }
3037 
3038     *out_numMasks = nextNewMask;
3039     if (nextNewMask == 0) {
3040         *out_masks = NULL;
3041         KMP_INTERNAL_FREE(newMasks);
3042         return;
3043     }
3044     *out_masks
3045       = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
3046     KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
3047     __kmp_free(tempMask);
3048     KMP_INTERNAL_FREE(newMasks);
3049 }
3050 
3051 # endif /* OMP_40_ENABLED */
3052 
3053 #undef ADD_MASK
3054 #undef ADD_MASK_OSID
3055 
3056 static void
3057 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3058 {
3059     if (__kmp_place_num_sockets == 0 &&
3060         __kmp_place_num_cores == 0 &&
3061         __kmp_place_num_threads_per_core == 0 )
3062         return;   // no topology limiting actions requested, exit
3063     if (__kmp_place_num_sockets == 0)
3064         __kmp_place_num_sockets = nPackages;    // use all available sockets
3065     if (__kmp_place_num_cores == 0)
3066         __kmp_place_num_cores = nCoresPerPkg;   // use all available cores
3067     if (__kmp_place_num_threads_per_core == 0 ||
3068         __kmp_place_num_threads_per_core > __kmp_nThreadsPerCore)
3069         __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts
3070 
3071     if ( !__kmp_affinity_uniform_topology() ) {
3072         KMP_WARNING( AffThrPlaceNonUniform );
3073         return; // don't support non-uniform topology
3074     }
3075     if ( depth != 3 ) {
3076         KMP_WARNING( AffThrPlaceNonThreeLevel );
3077         return; // don't support not-3-level topology
3078     }
3079     if (__kmp_place_socket_offset + __kmp_place_num_sockets > nPackages) {
3080         KMP_WARNING(AffThrPlaceManySockets);
3081         return;
3082     }
3083     if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
3084         KMP_WARNING( AffThrPlaceManyCores );
3085         return;
3086     }
3087 
3088     AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3089         __kmp_place_num_sockets * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3090 
3091     int i, j, k, n_old = 0, n_new = 0;
3092     for (i = 0; i < nPackages; ++i)
3093         if (i < __kmp_place_socket_offset ||
3094             i >= __kmp_place_socket_offset + __kmp_place_num_sockets)
3095             n_old += nCoresPerPkg * __kmp_nThreadsPerCore; // skip not-requested socket
3096         else
3097             for (j = 0; j < nCoresPerPkg; ++j) // walk through requested socket
3098                 if (j < __kmp_place_core_offset ||
3099                     j >= __kmp_place_core_offset + __kmp_place_num_cores)
3100                     n_old += __kmp_nThreadsPerCore; // skip not-requested core
3101                 else
3102                     for (k = 0; k < __kmp_nThreadsPerCore; ++k) { // walk through requested core
3103                         if (k < __kmp_place_num_threads_per_core) {
3104                             newAddr[n_new] = (*pAddr)[n_old]; // collect requested thread's data
3105                             n_new++;
3106                         }
3107                         n_old++;
3108                     }
3109     KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore);
3110     KMP_DEBUG_ASSERT(n_new == __kmp_place_num_sockets * __kmp_place_num_cores *
3111                      __kmp_place_num_threads_per_core);
3112 
3113     nPackages = __kmp_place_num_sockets;                      // correct nPackages
3114     nCoresPerPkg = __kmp_place_num_cores;                     // correct nCoresPerPkg
3115     __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3116     __kmp_avail_proc = n_new;                                 // correct avail_proc
3117     __kmp_ncores = nPackages * __kmp_place_num_cores;         // correct ncores
3118 
3119     __kmp_free( *pAddr );
3120     *pAddr = newAddr;      // replace old topology with new one
3121 }
3122 
3123 
3124 static AddrUnsPair *address2os = NULL;
3125 static int           * procarr = NULL;
3126 static int     __kmp_aff_depth = 0;
3127 
3128 static void
3129 __kmp_aux_affinity_initialize(void)
3130 {
3131     if (__kmp_affinity_masks != NULL) {
3132         KMP_ASSERT(fullMask != NULL);
3133         return;
3134     }
3135 
3136     //
3137     // Create the "full" mask - this defines all of the processors that we
3138     // consider to be in the machine model.  If respect is set, then it is
3139     // the initialization thread's affinity mask.  Otherwise, it is all
3140     // processors that we know about on the machine.
3141     //
3142     if (fullMask == NULL) {
3143         fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
3144     }
3145     if (KMP_AFFINITY_CAPABLE()) {
3146         if (__kmp_affinity_respect_mask) {
3147             __kmp_get_system_affinity(fullMask, TRUE);
3148 
3149             //
3150             // Count the number of available processors.
3151             //
3152             unsigned i;
3153             __kmp_avail_proc = 0;
3154             for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
3155                 if (! KMP_CPU_ISSET(i, fullMask)) {
3156                     continue;
3157                 }
3158                 __kmp_avail_proc++;
3159             }
3160             if (__kmp_avail_proc > __kmp_xproc) {
3161                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3162                   && (__kmp_affinity_type != affinity_none))) {
3163                     KMP_WARNING(ErrorInitializeAffinity);
3164                 }
3165                 __kmp_affinity_type = affinity_none;
3166                 KMP_AFFINITY_DISABLE();
3167                 return;
3168             }
3169         }
3170         else {
3171             __kmp_affinity_entire_machine_mask(fullMask);
3172             __kmp_avail_proc = __kmp_xproc;
3173         }
3174     }
3175 
3176     int depth = -1;
3177     kmp_i18n_id_t msg_id = kmp_i18n_null;
3178 
3179     //
3180     // For backward compatibility, setting KMP_CPUINFO_FILE =>
3181     // KMP_TOPOLOGY_METHOD=cpuinfo
3182     //
3183     if ((__kmp_cpuinfo_file != NULL) &&
3184       (__kmp_affinity_top_method == affinity_top_method_all)) {
3185         __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3186     }
3187 
3188     if (__kmp_affinity_top_method == affinity_top_method_all) {
3189         //
3190         // In the default code path, errors are not fatal - we just try using
3191         // another method.  We only emit a warning message if affinity is on,
3192         // or the verbose flag is set, an the nowarnings flag was not set.
3193         //
3194         const char *file_name = NULL;
3195         int line = 0;
3196 
3197 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3198 
3199         if (__kmp_affinity_verbose) {
3200             KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3201         }
3202 
3203         file_name = NULL;
3204         depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3205         if (depth == 0) {
3206             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3207             KMP_ASSERT(address2os == NULL);
3208             return;
3209         }
3210 
3211         if (depth < 0) {
3212             if (__kmp_affinity_verbose) {
3213                 if (msg_id != kmp_i18n_null) {
3214                     KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3215                       KMP_I18N_STR(DecodingLegacyAPIC));
3216                 }
3217                 else {
3218                     KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
3219                 }
3220             }
3221 
3222             file_name = NULL;
3223             depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3224             if (depth == 0) {
3225                 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3226                 KMP_ASSERT(address2os == NULL);
3227                 return;
3228             }
3229         }
3230 
3231 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3232 
3233 # if KMP_OS_LINUX
3234 
3235         if (depth < 0) {
3236             if (__kmp_affinity_verbose) {
3237                 if (msg_id != kmp_i18n_null) {
3238                     KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3239                 }
3240                 else {
3241                     KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3242                 }
3243             }
3244 
3245             FILE *f = fopen("/proc/cpuinfo", "r");
3246             if (f == NULL) {
3247                 msg_id = kmp_i18n_str_CantOpenCpuinfo;
3248             }
3249             else {
3250                 file_name = "/proc/cpuinfo";
3251                 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3252                 fclose(f);
3253                 if (depth == 0) {
3254                     KMP_ASSERT(__kmp_affinity_type == affinity_none);
3255                     KMP_ASSERT(address2os == NULL);
3256                     return;
3257                 }
3258             }
3259         }
3260 
3261 # endif /* KMP_OS_LINUX */
3262 
3263 # if KMP_GROUP_AFFINITY
3264 
3265         if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
3266             if (__kmp_affinity_verbose) {
3267                 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3268             }
3269 
3270             depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3271             KMP_ASSERT(depth != 0);
3272         }
3273 
3274 # endif /* KMP_GROUP_AFFINITY */
3275 
3276         if (depth < 0) {
3277             if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
3278                 if (file_name == NULL) {
3279                     KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
3280                 }
3281                 else if (line == 0) {
3282                     KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
3283                 }
3284                 else {
3285                     KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
3286                 }
3287             }
3288             // FIXME - print msg if msg_id = kmp_i18n_null ???
3289 
3290             file_name = "";
3291             depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3292             if (depth == 0) {
3293                 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3294                 KMP_ASSERT(address2os == NULL);
3295                 return;
3296             }
3297             KMP_ASSERT(depth > 0);
3298             KMP_ASSERT(address2os != NULL);
3299         }
3300     }
3301 
3302     //
3303     // If the user has specified that a paricular topology discovery method
3304     // is to be used, then we abort if that method fails.  The exception is
3305     // group affinity, which might have been implicitly set.
3306     //
3307 
3308 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3309 
3310     else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3311         if (__kmp_affinity_verbose) {
3312             KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3313               KMP_I18N_STR(Decodingx2APIC));
3314         }
3315 
3316         depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3317         if (depth == 0) {
3318             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3319             KMP_ASSERT(address2os == NULL);
3320             return;
3321         }
3322         if (depth < 0) {
3323             KMP_ASSERT(msg_id != kmp_i18n_null);
3324             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3325         }
3326     }
3327     else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3328         if (__kmp_affinity_verbose) {
3329             KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3330               KMP_I18N_STR(DecodingLegacyAPIC));
3331         }
3332 
3333         depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3334         if (depth == 0) {
3335             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3336             KMP_ASSERT(address2os == NULL);
3337             return;
3338         }
3339         if (depth < 0) {
3340             KMP_ASSERT(msg_id != kmp_i18n_null);
3341             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3342         }
3343     }
3344 
3345 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3346 
3347     else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3348         const char *filename;
3349         if (__kmp_cpuinfo_file != NULL) {
3350             filename = __kmp_cpuinfo_file;
3351         }
3352         else {
3353             filename = "/proc/cpuinfo";
3354         }
3355 
3356         if (__kmp_affinity_verbose) {
3357             KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
3358         }
3359 
3360         FILE *f = fopen(filename, "r");
3361         if (f == NULL) {
3362             int code = errno;
3363             if (__kmp_cpuinfo_file != NULL) {
3364                 __kmp_msg(
3365                     kmp_ms_fatal,
3366                     KMP_MSG(CantOpenFileForReading, filename),
3367                     KMP_ERR(code),
3368                     KMP_HNT(NameComesFrom_CPUINFO_FILE),
3369                     __kmp_msg_null
3370                 );
3371             }
3372             else {
3373                 __kmp_msg(
3374                     kmp_ms_fatal,
3375                     KMP_MSG(CantOpenFileForReading, filename),
3376                     KMP_ERR(code),
3377                     __kmp_msg_null
3378                 );
3379             }
3380         }
3381         int line = 0;
3382         depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3383         fclose(f);
3384         if (depth < 0) {
3385             KMP_ASSERT(msg_id != kmp_i18n_null);
3386             if (line > 0) {
3387                 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
3388             }
3389             else {
3390                 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3391             }
3392         }
3393         if (__kmp_affinity_type == affinity_none) {
3394             KMP_ASSERT(depth == 0);
3395             KMP_ASSERT(address2os == NULL);
3396             return;
3397         }
3398     }
3399 
3400 # if KMP_GROUP_AFFINITY
3401 
3402     else if (__kmp_affinity_top_method == affinity_top_method_group) {
3403         if (__kmp_affinity_verbose) {
3404             KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3405         }
3406 
3407         depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3408         KMP_ASSERT(depth != 0);
3409         if (depth < 0) {
3410             KMP_ASSERT(msg_id != kmp_i18n_null);
3411             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3412         }
3413     }
3414 
3415 # endif /* KMP_GROUP_AFFINITY */
3416 
3417     else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3418         if (__kmp_affinity_verbose) {
3419             KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
3420         }
3421 
3422         depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3423         if (depth == 0) {
3424             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3425             KMP_ASSERT(address2os == NULL);
3426             return;
3427         }
3428         // should not fail
3429         KMP_ASSERT(depth > 0);
3430         KMP_ASSERT(address2os != NULL);
3431     }
3432 
3433     if (address2os == NULL) {
3434         if (KMP_AFFINITY_CAPABLE()
3435           && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3436           && (__kmp_affinity_type != affinity_none)))) {
3437             KMP_WARNING(ErrorInitializeAffinity);
3438         }
3439         __kmp_affinity_type = affinity_none;
3440         KMP_AFFINITY_DISABLE();
3441         return;
3442     }
3443 
3444     __kmp_apply_thread_places(&address2os, depth);
3445 
3446     //
3447     // Create the table of masks, indexed by thread Id.
3448     //
3449     unsigned maxIndex;
3450     unsigned numUnique;
3451     kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
3452       address2os, __kmp_avail_proc);
3453     if (__kmp_affinity_gran_levels == 0) {
3454         KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
3455     }
3456 
3457     //
3458     // Set the childNums vector in all Address objects.  This must be done
3459     // before we can sort using __kmp_affinity_cmp_Address_child_num(),
3460     // which takes into account the setting of __kmp_affinity_compact.
3461     //
3462     __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
3463 
3464     switch (__kmp_affinity_type) {
3465 
3466         case affinity_explicit:
3467         KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3468 # if OMP_40_ENABLED
3469         if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3470 # endif
3471         {
3472             __kmp_affinity_process_proclist(&__kmp_affinity_masks,
3473               &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3474               maxIndex);
3475         }
3476 # if OMP_40_ENABLED
3477         else {
3478             __kmp_affinity_process_placelist(&__kmp_affinity_masks,
3479               &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3480               maxIndex);
3481         }
3482 # endif
3483         if (__kmp_affinity_num_masks == 0) {
3484             if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3485               && (__kmp_affinity_type != affinity_none))) {
3486                 KMP_WARNING(AffNoValidProcID);
3487             }
3488             __kmp_affinity_type = affinity_none;
3489             return;
3490         }
3491         break;
3492 
3493         //
3494         // The other affinity types rely on sorting the Addresses according
3495         // to some permutation of the machine topology tree.  Set
3496         // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
3497         // then jump to a common code fragment to do the sort and create
3498         // the array of affinity masks.
3499         //
3500 
3501         case affinity_logical:
3502         __kmp_affinity_compact = 0;
3503         if (__kmp_affinity_offset) {
3504             __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3505               % __kmp_avail_proc;
3506         }
3507         goto sortAddresses;
3508 
3509         case affinity_physical:
3510         if (__kmp_nThreadsPerCore > 1) {
3511             __kmp_affinity_compact = 1;
3512             if (__kmp_affinity_compact >= depth) {
3513                 __kmp_affinity_compact = 0;
3514             }
3515         } else {
3516             __kmp_affinity_compact = 0;
3517         }
3518         if (__kmp_affinity_offset) {
3519             __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3520               % __kmp_avail_proc;
3521         }
3522         goto sortAddresses;
3523 
3524         case affinity_scatter:
3525         if (__kmp_affinity_compact >= depth) {
3526             __kmp_affinity_compact = 0;
3527         }
3528         else {
3529             __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
3530         }
3531         goto sortAddresses;
3532 
3533         case affinity_compact:
3534         if (__kmp_affinity_compact >= depth) {
3535             __kmp_affinity_compact = depth - 1;
3536         }
3537         goto sortAddresses;
3538 
3539         case affinity_balanced:
3540         // Balanced works only for the case of a single package
3541         if( nPackages > 1 ) {
3542             if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
3543                 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
3544             }
3545             __kmp_affinity_type = affinity_none;
3546             return;
3547         } else if( __kmp_affinity_uniform_topology() ) {
3548             break;
3549         } else { // Non-uniform topology
3550 
3551             // Save the depth for further usage
3552             __kmp_aff_depth = depth;
3553 
3554             // Number of hyper threads per core in HT machine
3555             int nth_per_core = __kmp_nThreadsPerCore;
3556 
3557             int core_level;
3558             if( nth_per_core > 1 ) {
3559                 core_level = depth - 2;
3560             } else {
3561                 core_level = depth - 1;
3562             }
3563             int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
3564             int nproc = nth_per_core * ncores;
3565 
3566             procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
3567             for( int i = 0; i < nproc; i++ ) {
3568                 procarr[ i ] = -1;
3569             }
3570 
3571             for( int i = 0; i < __kmp_avail_proc; i++ ) {
3572                 int proc = address2os[ i ].second;
3573                 // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
3574                 // If there is only one thread per core then depth == 2: level 0 - package,
3575                 // level 1 - core.
3576                 int level = depth - 1;
3577 
3578                 // __kmp_nth_per_core == 1
3579                 int thread = 0;
3580                 int core = address2os[ i ].first.labels[ level ];
3581                 // If the thread level exists, that is we have more than one thread context per core
3582                 if( nth_per_core > 1 ) {
3583                     thread = address2os[ i ].first.labels[ level ] % nth_per_core;
3584                     core = address2os[ i ].first.labels[ level - 1 ];
3585                 }
3586                 procarr[ core * nth_per_core + thread ] = proc;
3587             }
3588 
3589             break;
3590         }
3591 
3592         sortAddresses:
3593         //
3594         // Allocate the gtid->affinity mask table.
3595         //
3596         if (__kmp_affinity_dups) {
3597             __kmp_affinity_num_masks = __kmp_avail_proc;
3598         }
3599         else {
3600             __kmp_affinity_num_masks = numUnique;
3601         }
3602 
3603 # if OMP_40_ENABLED
3604         if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
3605           && ( __kmp_affinity_num_places > 0 )
3606           && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
3607             __kmp_affinity_num_masks = __kmp_affinity_num_places;
3608         }
3609 # endif
3610 
3611         __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
3612           __kmp_affinity_num_masks * __kmp_affin_mask_size);
3613 
3614         //
3615         // Sort the address2os table according to the current setting of
3616         // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
3617         //
3618         qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
3619           __kmp_affinity_cmp_Address_child_num);
3620         {
3621             int i;
3622             unsigned j;
3623             for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
3624                 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
3625                     continue;
3626                 }
3627                 unsigned osId = address2os[i].second;
3628                 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
3629                 kmp_affin_mask_t *dest
3630                   = KMP_CPU_INDEX(__kmp_affinity_masks, j);
3631                 KMP_ASSERT(KMP_CPU_ISSET(osId, src));
3632                 KMP_CPU_COPY(dest, src);
3633                 if (++j >= __kmp_affinity_num_masks) {
3634                     break;
3635                 }
3636             }
3637             KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
3638         }
3639         break;
3640 
3641         default:
3642         KMP_ASSERT2(0, "Unexpected affinity setting");
3643     }
3644 
3645     __kmp_free(osId2Mask);
3646     machine_hierarchy.init(address2os, __kmp_avail_proc);
3647 }
3648 
3649 
3650 void
3651 __kmp_affinity_initialize(void)
3652 {
3653     //
3654     // Much of the code above was written assumming that if a machine was not
3655     // affinity capable, then __kmp_affinity_type == affinity_none.  We now
3656     // explicitly represent this as __kmp_affinity_type == affinity_disabled.
3657     //
3658     // There are too many checks for __kmp_affinity_type == affinity_none
3659     // in this code.  Instead of trying to change them all, check if
3660     // __kmp_affinity_type == affinity_disabled, and if so, slam it with
3661     // affinity_none, call the real initialization routine, then restore
3662     // __kmp_affinity_type to affinity_disabled.
3663     //
3664     int disabled = (__kmp_affinity_type == affinity_disabled);
3665     if (! KMP_AFFINITY_CAPABLE()) {
3666         KMP_ASSERT(disabled);
3667     }
3668     if (disabled) {
3669         __kmp_affinity_type = affinity_none;
3670     }
3671     __kmp_aux_affinity_initialize();
3672     if (disabled) {
3673         __kmp_affinity_type = affinity_disabled;
3674     }
3675 }
3676 
3677 
3678 void
3679 __kmp_affinity_uninitialize(void)
3680 {
3681     if (__kmp_affinity_masks != NULL) {
3682         __kmp_free(__kmp_affinity_masks);
3683         __kmp_affinity_masks = NULL;
3684     }
3685     if (fullMask != NULL) {
3686         KMP_CPU_FREE(fullMask);
3687         fullMask = NULL;
3688     }
3689     __kmp_affinity_num_masks = 0;
3690 # if OMP_40_ENABLED
3691     __kmp_affinity_num_places = 0;
3692 # endif
3693     if (__kmp_affinity_proclist != NULL) {
3694         __kmp_free(__kmp_affinity_proclist);
3695         __kmp_affinity_proclist = NULL;
3696     }
3697     if( address2os != NULL ) {
3698         __kmp_free( address2os );
3699         address2os = NULL;
3700     }
3701     if( procarr != NULL ) {
3702         __kmp_free( procarr );
3703         procarr = NULL;
3704     }
3705 }
3706 
3707 
3708 void
3709 __kmp_affinity_set_init_mask(int gtid, int isa_root)
3710 {
3711     if (! KMP_AFFINITY_CAPABLE()) {
3712         return;
3713     }
3714 
3715     kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
3716     if (th->th.th_affin_mask == NULL) {
3717         KMP_CPU_ALLOC(th->th.th_affin_mask);
3718     }
3719     else {
3720         KMP_CPU_ZERO(th->th.th_affin_mask);
3721     }
3722 
3723     //
3724     // Copy the thread mask to the kmp_info_t strucuture.
3725     // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
3726     // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
3727     // is set, then the full mask is the same as the mask of the initialization
3728     // thread.
3729     //
3730     kmp_affin_mask_t *mask;
3731     int i;
3732 
3733 # if OMP_40_ENABLED
3734     if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3735 # endif
3736     {
3737         if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)
3738           ) {
3739 # if KMP_GROUP_AFFINITY
3740             if (__kmp_num_proc_groups > 1) {
3741                 return;
3742             }
3743 # endif
3744             KMP_ASSERT(fullMask != NULL);
3745             i = KMP_PLACE_ALL;
3746             mask = fullMask;
3747         }
3748         else {
3749             KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
3750             i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
3751             mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
3752         }
3753     }
3754 # if OMP_40_ENABLED
3755     else {
3756         if ((! isa_root)
3757           || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
3758 #  if KMP_GROUP_AFFINITY
3759             if (__kmp_num_proc_groups > 1) {
3760                 return;
3761             }
3762 #  endif
3763             KMP_ASSERT(fullMask != NULL);
3764             i = KMP_PLACE_ALL;
3765             mask = fullMask;
3766         }
3767         else {
3768             //
3769             // int i = some hash function or just a counter that doesn't
3770             // always start at 0.  Use gtid for now.
3771             //
3772             KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
3773             i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
3774             mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
3775         }
3776     }
3777 # endif
3778 
3779 # if OMP_40_ENABLED
3780     th->th.th_current_place = i;
3781     if (isa_root) {
3782         th->th.th_new_place = i;
3783         th->th.th_first_place = 0;
3784         th->th.th_last_place = __kmp_affinity_num_masks - 1;
3785     }
3786 
3787     if (i == KMP_PLACE_ALL) {
3788         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
3789           gtid));
3790     }
3791     else {
3792         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
3793           gtid, i));
3794     }
3795 # else
3796     if (i == -1) {
3797         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
3798           gtid));
3799     }
3800     else {
3801         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
3802           gtid, i));
3803     }
3804 # endif /* OMP_40_ENABLED */
3805 
3806     KMP_CPU_COPY(th->th.th_affin_mask, mask);
3807 
3808     if (__kmp_affinity_verbose) {
3809         char buf[KMP_AFFIN_MASK_PRINT_LEN];
3810         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
3811           th->th.th_affin_mask);
3812         KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
3813           buf);
3814     }
3815 
3816 # if KMP_OS_WINDOWS
3817     //
3818     // On Windows* OS, the process affinity mask might have changed.
3819     // If the user didn't request affinity and this call fails,
3820     // just continue silently.  See CQ171393.
3821     //
3822     if ( __kmp_affinity_type == affinity_none ) {
3823         __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
3824     }
3825     else
3826 # endif
3827     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
3828 }
3829 
3830 
3831 # if OMP_40_ENABLED
3832 
3833 void
3834 __kmp_affinity_set_place(int gtid)
3835 {
3836     int retval;
3837 
3838     if (! KMP_AFFINITY_CAPABLE()) {
3839         return;
3840     }
3841 
3842     kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
3843 
3844     KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
3845       gtid, th->th.th_new_place, th->th.th_current_place));
3846 
3847     //
3848     // Check that the new place is within this thread's partition.
3849     //
3850     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
3851     KMP_ASSERT(th->th.th_new_place >= 0);
3852     KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
3853     if (th->th.th_first_place <= th->th.th_last_place) {
3854         KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
3855          && (th->th.th_new_place <= th->th.th_last_place));
3856     }
3857     else {
3858         KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
3859          || (th->th.th_new_place >= th->th.th_last_place));
3860     }
3861 
3862     //
3863     // Copy the thread mask to the kmp_info_t strucuture,
3864     // and set this thread's affinity.
3865     //
3866     kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
3867       th->th.th_new_place);
3868     KMP_CPU_COPY(th->th.th_affin_mask, mask);
3869     th->th.th_current_place = th->th.th_new_place;
3870 
3871     if (__kmp_affinity_verbose) {
3872         char buf[KMP_AFFIN_MASK_PRINT_LEN];
3873         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
3874           th->th.th_affin_mask);
3875         KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
3876           gtid, buf);
3877     }
3878     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
3879 }
3880 
3881 # endif /* OMP_40_ENABLED */
3882 
3883 
3884 int
3885 __kmp_aux_set_affinity(void **mask)
3886 {
3887     int gtid;
3888     kmp_info_t *th;
3889     int retval;
3890 
3891     if (! KMP_AFFINITY_CAPABLE()) {
3892         return -1;
3893     }
3894 
3895     gtid = __kmp_entry_gtid();
3896     KA_TRACE(1000, ;{
3897         char buf[KMP_AFFIN_MASK_PRINT_LEN];
3898         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
3899           (kmp_affin_mask_t *)(*mask));
3900         __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
3901           gtid, buf);
3902     });
3903 
3904     if (__kmp_env_consistency_check) {
3905         if ((mask == NULL) || (*mask == NULL)) {
3906             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
3907         }
3908         else {
3909             unsigned proc;
3910             int num_procs = 0;
3911 
3912             for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
3913                 if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
3914                     continue;
3915                 }
3916                 num_procs++;
3917                 if (! KMP_CPU_ISSET(proc, fullMask)) {
3918                     KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
3919                     break;
3920                 }
3921             }
3922             if (num_procs == 0) {
3923                 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
3924             }
3925 
3926 # if KMP_GROUP_AFFINITY
3927             if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
3928                 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
3929             }
3930 # endif /* KMP_GROUP_AFFINITY */
3931 
3932         }
3933     }
3934 
3935     th = __kmp_threads[gtid];
3936     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
3937     retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
3938     if (retval == 0) {
3939         KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
3940     }
3941 
3942 # if OMP_40_ENABLED
3943     th->th.th_current_place = KMP_PLACE_UNDEFINED;
3944     th->th.th_new_place = KMP_PLACE_UNDEFINED;
3945     th->th.th_first_place = 0;
3946     th->th.th_last_place = __kmp_affinity_num_masks - 1;
3947 
3948     //
3949     // Turn off 4.0 affinity for the current tread at this parallel level.
3950     //
3951     th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
3952 # endif
3953 
3954     return retval;
3955 }
3956 
3957 
3958 int
3959 __kmp_aux_get_affinity(void **mask)
3960 {
3961     int gtid;
3962     int retval;
3963     kmp_info_t *th;
3964 
3965     if (! KMP_AFFINITY_CAPABLE()) {
3966         return -1;
3967     }
3968 
3969     gtid = __kmp_entry_gtid();
3970     th = __kmp_threads[gtid];
3971     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
3972 
3973     KA_TRACE(1000, ;{
3974         char buf[KMP_AFFIN_MASK_PRINT_LEN];
3975         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
3976           th->th.th_affin_mask);
3977         __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
3978     });
3979 
3980     if (__kmp_env_consistency_check) {
3981         if ((mask == NULL) || (*mask == NULL)) {
3982             KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
3983         }
3984     }
3985 
3986 # if !KMP_OS_WINDOWS
3987 
3988     retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
3989     KA_TRACE(1000, ;{
3990         char buf[KMP_AFFIN_MASK_PRINT_LEN];
3991         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
3992           (kmp_affin_mask_t *)(*mask));
3993         __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
3994     });
3995     return retval;
3996 
3997 # else
3998 
3999     KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4000     return 0;
4001 
4002 # endif /* KMP_OS_WINDOWS */
4003 
4004 }
4005 
4006 int
4007 __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
4008 {
4009     int retval;
4010 
4011     if (! KMP_AFFINITY_CAPABLE()) {
4012         return -1;
4013     }
4014 
4015     KA_TRACE(1000, ;{
4016         int gtid = __kmp_entry_gtid();
4017         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4018         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4019           (kmp_affin_mask_t *)(*mask));
4020         __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4021           proc, gtid, buf);
4022     });
4023 
4024     if (__kmp_env_consistency_check) {
4025         if ((mask == NULL) || (*mask == NULL)) {
4026             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4027         }
4028     }
4029 
4030     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4031         return -1;
4032     }
4033     if (! KMP_CPU_ISSET(proc, fullMask)) {
4034         return -2;
4035     }
4036 
4037     KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4038     return 0;
4039 }
4040 
4041 
4042 int
4043 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4044 {
4045     int retval;
4046 
4047     if (! KMP_AFFINITY_CAPABLE()) {
4048         return -1;
4049     }
4050 
4051     KA_TRACE(1000, ;{
4052         int gtid = __kmp_entry_gtid();
4053         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4054         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4055           (kmp_affin_mask_t *)(*mask));
4056         __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4057           proc, gtid, buf);
4058     });
4059 
4060     if (__kmp_env_consistency_check) {
4061         if ((mask == NULL) || (*mask == NULL)) {
4062             KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4063         }
4064     }
4065 
4066     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4067         return -1;
4068     }
4069     if (! KMP_CPU_ISSET(proc, fullMask)) {
4070         return -2;
4071     }
4072 
4073     KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4074     return 0;
4075 }
4076 
4077 
4078 int
4079 __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4080 {
4081     int retval;
4082 
4083     if (! KMP_AFFINITY_CAPABLE()) {
4084         return -1;
4085     }
4086 
4087     KA_TRACE(1000, ;{
4088         int gtid = __kmp_entry_gtid();
4089         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4090         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4091           (kmp_affin_mask_t *)(*mask));
4092         __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4093           proc, gtid, buf);
4094     });
4095 
4096     if (__kmp_env_consistency_check) {
4097         if ((mask == NULL) || (*mask == NULL)) {
4098             KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
4099         }
4100     }
4101 
4102     if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4103         return 0;
4104     }
4105     if (! KMP_CPU_ISSET(proc, fullMask)) {
4106         return 0;
4107     }
4108 
4109     return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4110 }
4111 
4112 
4113 // Dynamic affinity settings - Affinity balanced
4114 void __kmp_balanced_affinity( int tid, int nthreads )
4115 {
4116     if( __kmp_affinity_uniform_topology() ) {
4117         int coreID;
4118         int threadID;
4119         // Number of hyper threads per core in HT machine
4120         int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4121         // Number of cores
4122         int ncores = __kmp_ncores;
4123         // How many threads will be bound to each core
4124         int chunk = nthreads / ncores;
4125         // How many cores will have an additional thread bound to it - "big cores"
4126         int big_cores = nthreads % ncores;
4127         // Number of threads on the big cores
4128         int big_nth = ( chunk + 1 ) * big_cores;
4129         if( tid < big_nth ) {
4130             coreID = tid / (chunk + 1 );
4131             threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
4132         } else { //tid >= big_nth
4133             coreID = ( tid - big_cores ) / chunk;
4134             threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
4135         }
4136 
4137         KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4138           "Illegal set affinity operation when not capable");
4139 
4140         kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
4141         KMP_CPU_ZERO(mask);
4142 
4143         // Granularity == thread
4144         if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4145             int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
4146             KMP_CPU_SET( osID, mask);
4147         } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4148             for( int i = 0; i < __kmp_nth_per_core; i++ ) {
4149                 int osID;
4150                 osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
4151                 KMP_CPU_SET( osID, mask);
4152             }
4153         }
4154         if (__kmp_affinity_verbose) {
4155             char buf[KMP_AFFIN_MASK_PRINT_LEN];
4156             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4157             KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4158               tid, buf);
4159         }
4160         __kmp_set_system_affinity( mask, TRUE );
4161     } else { // Non-uniform topology
4162 
4163         kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
4164         KMP_CPU_ZERO(mask);
4165 
4166         // Number of hyper threads per core in HT machine
4167         int nth_per_core = __kmp_nThreadsPerCore;
4168         int core_level;
4169         if( nth_per_core > 1 ) {
4170             core_level = __kmp_aff_depth - 2;
4171         } else {
4172             core_level = __kmp_aff_depth - 1;
4173         }
4174 
4175         // Number of cores - maximum value; it does not count trail cores with 0 processors
4176         int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
4177 
4178         // For performance gain consider the special case nthreads == __kmp_avail_proc
4179         if( nthreads == __kmp_avail_proc ) {
4180             if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4181                 int osID = address2os[ tid ].second;
4182                 KMP_CPU_SET( osID, mask);
4183             } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4184                 int coreID = address2os[ tid ].first.labels[ core_level ];
4185                 // We'll count found osIDs for the current core; they can be not more than nth_per_core;
4186                 // since the address2os is sortied we can break when cnt==nth_per_core
4187                 int cnt = 0;
4188                 for( int i = 0; i < __kmp_avail_proc; i++ ) {
4189                     int osID = address2os[ i ].second;
4190                     int core = address2os[ i ].first.labels[ core_level ];
4191                     if( core == coreID ) {
4192                         KMP_CPU_SET( osID, mask);
4193                         cnt++;
4194                         if( cnt == nth_per_core ) {
4195                             break;
4196                         }
4197                     }
4198                 }
4199             }
4200         } else if( nthreads <= __kmp_ncores ) {
4201 
4202             int core = 0;
4203             for( int i = 0; i < ncores; i++ ) {
4204                 // Check if this core from procarr[] is in the mask
4205                 int in_mask = 0;
4206                 for( int j = 0; j < nth_per_core; j++ ) {
4207                     if( procarr[ i * nth_per_core + j ] != - 1 ) {
4208                         in_mask = 1;
4209                         break;
4210                     }
4211                 }
4212                 if( in_mask ) {
4213                     if( tid == core ) {
4214                         for( int j = 0; j < nth_per_core; j++ ) {
4215                             int osID = procarr[ i * nth_per_core + j ];
4216                             if( osID != -1 ) {
4217                                 KMP_CPU_SET( osID, mask );
4218                                 // For granularity=thread it is enough to set the first available osID for this core
4219                                 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4220                                     break;
4221                                 }
4222                             }
4223                         }
4224                         break;
4225                     } else {
4226                         core++;
4227                     }
4228                 }
4229             }
4230 
4231         } else { // nthreads > __kmp_ncores
4232 
4233             // Array to save the number of processors at each core
4234             int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores);
4235             // Array to save the number of cores with "x" available processors;
4236             int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
4237             // Array to save the number of cores with # procs from x to nth_per_core
4238             int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
4239 
4240             for( int i = 0; i <= nth_per_core; i++ ) {
4241                 ncores_with_x_procs[ i ] = 0;
4242                 ncores_with_x_to_max_procs[ i ] = 0;
4243             }
4244 
4245             for( int i = 0; i < ncores; i++ ) {
4246                 int cnt = 0;
4247                 for( int j = 0; j < nth_per_core; j++ ) {
4248                     if( procarr[ i * nth_per_core + j ] != -1 ) {
4249                         cnt++;
4250                     }
4251                 }
4252                 nproc_at_core[ i ] = cnt;
4253                 ncores_with_x_procs[ cnt ]++;
4254             }
4255 
4256             for( int i = 0; i <= nth_per_core; i++ ) {
4257                 for( int j = i; j <= nth_per_core; j++ ) {
4258                     ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4259                 }
4260             }
4261 
4262             // Max number of processors
4263             int nproc = nth_per_core * ncores;
4264             // An array to keep number of threads per each context
4265             int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4266             for( int i = 0; i < nproc; i++ ) {
4267                 newarr[ i ] = 0;
4268             }
4269 
4270             int nth = nthreads;
4271             int flag = 0;
4272             while( nth > 0 ) {
4273                 for( int j = 1; j <= nth_per_core; j++ ) {
4274                     int cnt = ncores_with_x_to_max_procs[ j ];
4275                     for( int i = 0; i < ncores; i++ ) {
4276                         // Skip the core with 0 processors
4277                         if( nproc_at_core[ i ] == 0 ) {
4278                             continue;
4279                         }
4280                         for( int k = 0; k < nth_per_core; k++ ) {
4281                             if( procarr[ i * nth_per_core + k ] != -1 ) {
4282                                 if( newarr[ i * nth_per_core + k ] == 0 ) {
4283                                     newarr[ i * nth_per_core + k ] = 1;
4284                                     cnt--;
4285                                     nth--;
4286                                     break;
4287                                 } else {
4288                                     if( flag != 0 ) {
4289                                         newarr[ i * nth_per_core + k ] ++;
4290                                         cnt--;
4291                                         nth--;
4292                                         break;
4293                                     }
4294                                 }
4295                             }
4296                         }
4297                         if( cnt == 0 || nth == 0 ) {
4298                             break;
4299                         }
4300                     }
4301                     if( nth == 0 ) {
4302                         break;
4303                     }
4304                 }
4305                 flag = 1;
4306             }
4307             int sum = 0;
4308             for( int i = 0; i < nproc; i++ ) {
4309                 sum += newarr[ i ];
4310                 if( sum > tid ) {
4311                     // Granularity == thread
4312                     if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4313                         int osID = procarr[ i ];
4314                         KMP_CPU_SET( osID, mask);
4315                     } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4316                         int coreID = i / nth_per_core;
4317                         for( int ii = 0; ii < nth_per_core; ii++ ) {
4318                             int osID = procarr[ coreID * nth_per_core + ii ];
4319                             if( osID != -1 ) {
4320                                 KMP_CPU_SET( osID, mask);
4321                             }
4322                         }
4323                     }
4324                     break;
4325                 }
4326             }
4327             __kmp_free( newarr );
4328         }
4329 
4330         if (__kmp_affinity_verbose) {
4331             char buf[KMP_AFFIN_MASK_PRINT_LEN];
4332             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4333             KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4334               tid, buf);
4335         }
4336         __kmp_set_system_affinity( mask, TRUE );
4337     }
4338 }
4339 
4340 #endif // KMP_AFFINITY_SUPPORTED
4341