1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_io.h"
19 #include "kmp_str.h"
20 #include "kmp_wrapper_getpid.h"
21 #include "kmp_affinity.h"
22 
23 // Store the real or imagined machine hierarchy here
24 static hierarchy_info machine_hierarchy;
25 
26 void __kmp_cleanup_hierarchy() {
27     machine_hierarchy.fini();
28 }
29 
30 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
31     kmp_uint32 depth;
32     // The test below is true if affinity is available, but set to "none". Need to init on first use of hierarchical barrier.
33     if (TCR_1(machine_hierarchy.uninitialized))
34         machine_hierarchy.init(NULL, nproc);
35 
36     // Adjust the hierarchy in case num threads exceeds original
37     if (nproc > machine_hierarchy.base_num_threads)
38         machine_hierarchy.resize(nproc);
39 
40     depth = machine_hierarchy.depth;
41     KMP_DEBUG_ASSERT(depth > 0);
42 
43     thr_bar->depth = depth;
44     thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
45     thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
46 }
47 
48 #if KMP_AFFINITY_SUPPORTED
49 
50 //
51 // Print the affinity mask to the character array in a pretty format.
52 //
53 #if KMP_USE_HWLOC
54 char *
55 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
56 {
57     int num_chars_to_write, num_chars_written;
58     char* scan;
59     KMP_ASSERT(buf_len >= 40);
60 
61     // bufsize of 0 just retrieves the needed buffer size.
62     num_chars_to_write = hwloc_bitmap_list_snprintf(buf, 0, (hwloc_bitmap_t)mask);
63 
64     // need '{', "xxxxxxxx...xx", '}', '\0' = num_chars_to_write + 3 bytes
65     // * num_chars_to_write returned by hwloc_bitmap_list_snprintf does not
66     //   take into account the '\0' character.
67     if(hwloc_bitmap_iszero((hwloc_bitmap_t)mask)) {
68         KMP_SNPRINTF(buf, buf_len, "{<empty>}");
69     } else if(num_chars_to_write < buf_len - 3) {
70         // no problem fitting the mask into buf_len number of characters
71         buf[0] = '{';
72         // use buf_len-3 because we have the three characters: '{' '}' '\0' to add to the buffer
73         num_chars_written = hwloc_bitmap_list_snprintf(buf+1, buf_len-3, (hwloc_bitmap_t)mask);
74         buf[num_chars_written+1] = '}';
75         buf[num_chars_written+2] = '\0';
76     } else {
77         // Need to truncate the affinity mask string and add ellipsis.
78         // To do this, we first write out the '{' + str(mask)
79         buf[0] = '{';
80         hwloc_bitmap_list_snprintf(buf+1, buf_len-1, (hwloc_bitmap_t)mask);
81         // then, what we do here is go to the 7th to last character, then go backwards until we are NOT
82         // on a digit then write "...}\0".  This way it is a clean ellipsis addition and we don't
83         // overwrite part of an affinity number. i.e., we avoid something like { 45, 67, 8...} and get
84         // { 45, 67,...} instead.
85         scan = buf + buf_len - 7;
86         while(*scan >= '0' && *scan <= '9' && scan >= buf)
87             scan--;
88         *(scan+1) = '.';
89         *(scan+2) = '.';
90         *(scan+3) = '.';
91         *(scan+4) = '}';
92         *(scan+5) = '\0';
93     }
94     return buf;
95 }
96 #else
97 char *
98 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
99 {
100     KMP_ASSERT(buf_len >= 40);
101     char *scan = buf;
102     char *end = buf + buf_len - 1;
103 
104     //
105     // Find first element / check for empty set.
106     //
107     size_t i;
108     for (i = 0; i < KMP_CPU_SETSIZE; i++) {
109         if (KMP_CPU_ISSET(i, mask)) {
110             break;
111         }
112     }
113     if (i == KMP_CPU_SETSIZE) {
114         KMP_SNPRINTF(scan, end-scan+1, "{<empty>}");
115         while (*scan != '\0') scan++;
116         KMP_ASSERT(scan <= end);
117         return buf;
118     }
119 
120     KMP_SNPRINTF(scan, end-scan+1, "{%ld", (long)i);
121     while (*scan != '\0') scan++;
122     i++;
123     for (; i < KMP_CPU_SETSIZE; i++) {
124         if (! KMP_CPU_ISSET(i, mask)) {
125             continue;
126         }
127 
128         //
129         // Check for buffer overflow.  A string of the form ",<n>" will have
130         // at most 10 characters, plus we want to leave room to print ",...}"
131         // if the set is too large to print for a total of 15 characters.
132         // We already left room for '\0' in setting end.
133         //
134         if (end - scan < 15) {
135            break;
136         }
137         KMP_SNPRINTF(scan, end-scan+1, ",%-ld", (long)i);
138         while (*scan != '\0') scan++;
139     }
140     if (i < KMP_CPU_SETSIZE) {
141         KMP_SNPRINTF(scan, end-scan+1,  ",...");
142         while (*scan != '\0') scan++;
143     }
144     KMP_SNPRINTF(scan, end-scan+1, "}");
145     while (*scan != '\0') scan++;
146     KMP_ASSERT(scan <= end);
147     return buf;
148 }
149 #endif // KMP_USE_HWLOC
150 
151 
152 void
153 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
154 {
155     KMP_CPU_ZERO(mask);
156 
157 # if KMP_GROUP_AFFINITY
158 
159     if (__kmp_num_proc_groups > 1) {
160         int group;
161         KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
162         for (group = 0; group < __kmp_num_proc_groups; group++) {
163             int i;
164             int num = __kmp_GetActiveProcessorCount(group);
165             for (i = 0; i < num; i++) {
166                 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
167             }
168         }
169     }
170     else
171 
172 # endif /* KMP_GROUP_AFFINITY */
173 
174     {
175         int proc;
176         for (proc = 0; proc < __kmp_xproc; proc++) {
177             KMP_CPU_SET(proc, mask);
178         }
179     }
180 }
181 
182 //
183 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
184 // called to renumber the labels from [0..n] and place them into the child_num
185 // vector of the address object.  This is done in case the labels used for
186 // the children at one node of the hierarchy differ from those used for
187 // another node at the same level.  Example:  suppose the machine has 2 nodes
188 // with 2 packages each.  The first node contains packages 601 and 602, and
189 // second node contains packages 603 and 604.  If we try to sort the table
190 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
191 // because we are paying attention to the labels themselves, not the ordinal
192 // child numbers.  By using the child numbers in the sort, the result is
193 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
194 //
195 static void
196 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
197   int numAddrs)
198 {
199     KMP_DEBUG_ASSERT(numAddrs > 0);
200     int depth = address2os->first.depth;
201     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
202     unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
203       * sizeof(unsigned));
204     int labCt;
205     for (labCt = 0; labCt < depth; labCt++) {
206         address2os[0].first.childNums[labCt] = counts[labCt] = 0;
207         lastLabel[labCt] = address2os[0].first.labels[labCt];
208     }
209     int i;
210     for (i = 1; i < numAddrs; i++) {
211         for (labCt = 0; labCt < depth; labCt++) {
212             if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
213                 int labCt2;
214                 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
215                     counts[labCt2] = 0;
216                     lastLabel[labCt2] = address2os[i].first.labels[labCt2];
217                 }
218                 counts[labCt]++;
219                 lastLabel[labCt] = address2os[i].first.labels[labCt];
220                 break;
221             }
222         }
223         for (labCt = 0; labCt < depth; labCt++) {
224             address2os[i].first.childNums[labCt] = counts[labCt];
225         }
226         for (; labCt < (int)Address::maxDepth; labCt++) {
227             address2os[i].first.childNums[labCt] = 0;
228         }
229     }
230 }
231 
232 
233 //
234 // All of the __kmp_affinity_create_*_map() routines should set
235 // __kmp_affinity_masks to a vector of affinity mask objects of length
236 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
237 // return the number of levels in the machine topology tree (zero if
238 // __kmp_affinity_type == affinity_none).
239 //
240 // All of the __kmp_affinity_create_*_map() routines should set *__kmp_affin_fullMask
241 // to the affinity mask for the initialization thread.  They need to save and
242 // restore the mask, and it could be needed later, so saving it is just an
243 // optimization to avoid calling kmp_get_system_affinity() again.
244 //
245 kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
246 
247 static int nCoresPerPkg, nPackages;
248 static int __kmp_nThreadsPerCore;
249 #ifndef KMP_DFLT_NTH_CORES
250 static int __kmp_ncores;
251 #endif
252 
253 //
254 // __kmp_affinity_uniform_topology() doesn't work when called from
255 // places which support arbitrarily many levels in the machine topology
256 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
257 // __kmp_affinity_create_x2apicid_map().
258 //
259 inline static bool
260 __kmp_affinity_uniform_topology()
261 {
262     return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
263 }
264 
265 
266 //
267 // Print out the detailed machine topology map, i.e. the physical locations
268 // of each OS proc.
269 //
270 static void
271 __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
272   int pkgLevel, int coreLevel, int threadLevel)
273 {
274     int proc;
275 
276     KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
277     for (proc = 0; proc < len; proc++) {
278         int level;
279         kmp_str_buf_t buf;
280         __kmp_str_buf_init(&buf);
281         for (level = 0; level < depth; level++) {
282             if (level == threadLevel) {
283                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
284             }
285             else if (level == coreLevel) {
286                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
287             }
288             else if (level == pkgLevel) {
289                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
290             }
291             else if (level > pkgLevel) {
292                 __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
293                   level - pkgLevel - 1);
294             }
295             else {
296                 __kmp_str_buf_print(&buf, "L%d ", level);
297             }
298             __kmp_str_buf_print(&buf, "%d ",
299               address2os[proc].first.labels[level]);
300         }
301         KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
302           buf.str);
303         __kmp_str_buf_free(&buf);
304     }
305 }
306 
307 #if KMP_USE_HWLOC
308 
309 // This function removes the topology levels that are radix 1 and don't offer
310 // further information about the topology.  The most common example is when you
311 // have one thread context per core, we don't want the extra thread context
312 // level if it offers no unique labels.  So they are removed.
313 // return value: the new depth of address2os
314 static int
315 __kmp_affinity_remove_radix_one_levels(AddrUnsPair *address2os, int nActiveThreads, int depth, int* pkgLevel, int* coreLevel, int* threadLevel) {
316     int level;
317     int i;
318     int radix1_detected;
319 
320     for (level = depth-1; level >= 0; --level) {
321         // Always keep the package level
322         if (level == *pkgLevel)
323             continue;
324         // Detect if this level is radix 1
325         radix1_detected = 1;
326         for (i = 1; i < nActiveThreads; ++i) {
327             if (address2os[0].first.labels[level] != address2os[i].first.labels[level]) {
328                 // There are differing label values for this level so it stays
329                 radix1_detected = 0;
330                 break;
331             }
332         }
333         if (!radix1_detected)
334             continue;
335         // Radix 1 was detected
336         if (level == *threadLevel) {
337             // If only one thread per core, then just decrement
338             // the depth which removes the threadlevel from address2os
339             for (i = 0; i < nActiveThreads; ++i) {
340                 address2os[i].first.depth--;
341             }
342             *threadLevel = -1;
343         } else if (level == *coreLevel) {
344             // For core level, we move the thread labels over if they are still
345             // valid (*threadLevel != -1), and also reduce the depth another level
346             for (i = 0; i < nActiveThreads; ++i) {
347                 if (*threadLevel != -1) {
348                     address2os[i].first.labels[*coreLevel] = address2os[i].first.labels[*threadLevel];
349                 }
350                 address2os[i].first.depth--;
351             }
352             *coreLevel = -1;
353         }
354     }
355     return address2os[0].first.depth;
356 }
357 
358 // Returns the number of objects of type 'type' below 'obj' within the topology tree structure.
359 // e.g., if obj is a HWLOC_OBJ_SOCKET object, and type is HWLOC_OBJ_PU, then
360 //  this will return the number of PU's under the SOCKET object.
361 static int
362 __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj, hwloc_obj_type_t type) {
363     int retval = 0;
364     hwloc_obj_t first;
365     for(first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type, obj->logical_index, type, 0);
366         first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) == obj;
367         first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type, first))
368     {
369         ++retval;
370     }
371     return retval;
372 }
373 
374 static int
375 __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
376   kmp_i18n_id_t *const msg_id)
377 {
378     *address2os = NULL;
379     *msg_id = kmp_i18n_null;
380 
381     //
382     // Save the affinity mask for the current thread.
383     //
384     kmp_affin_mask_t *oldMask;
385     KMP_CPU_ALLOC(oldMask);
386     __kmp_get_system_affinity(oldMask, TRUE);
387 
388     int depth = 3;
389     int pkgLevel = 0;
390     int coreLevel = 1;
391     int threadLevel = 2;
392     nPackages = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_root_obj(__kmp_hwloc_topology), HWLOC_OBJ_SOCKET);
393     nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, 0), HWLOC_OBJ_CORE);
394     __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, 0), HWLOC_OBJ_PU);
395 
396     if (! KMP_AFFINITY_CAPABLE())
397     {
398         //
399         // Hack to try and infer the machine topology using only the data
400         // available from cpuid on the current thread, and __kmp_xproc.
401         //
402         KMP_ASSERT(__kmp_affinity_type == affinity_none);
403 
404         __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
405         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
406         if (__kmp_affinity_verbose) {
407             KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
408             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
409             if (__kmp_affinity_uniform_topology()) {
410                 KMP_INFORM(Uniform, "KMP_AFFINITY");
411             } else {
412                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
413             }
414             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
415               __kmp_nThreadsPerCore, __kmp_ncores);
416         }
417         return 0;
418     }
419 
420     //
421     // Allocate the data structure to be returned.
422     //
423     AddrUnsPair *retval = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
424 
425     hwloc_obj_t pu;
426     hwloc_obj_t core;
427     hwloc_obj_t socket;
428     int nActiveThreads = 0;
429     int socket_identifier = 0;
430     for(socket = hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, 0);
431         socket != NULL;
432         socket = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, socket),
433         socket_identifier++)
434     {
435         int core_identifier = 0;
436         for(core = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, socket->type, socket->logical_index, HWLOC_OBJ_CORE, 0);
437             core != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, socket->type, core) == socket;
438             core = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, core),
439             core_identifier++)
440         {
441             int pu_identifier = 0;
442             for(pu = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, core->type, core->logical_index, HWLOC_OBJ_PU, 0);
443                 pu != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, core->type, pu) == core;
444                 pu = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PU, pu),
445                 pu_identifier++)
446             {
447                 Address addr(3);
448                 if(! KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
449                     continue;
450                 KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
451                     socket->os_index, socket->logical_index, core->os_index, core->logical_index, pu->os_index,pu->logical_index));
452                 addr.labels[0] = socket_identifier; // package
453                 addr.labels[1] = core_identifier; // core
454                 addr.labels[2] = pu_identifier; // pu
455                 retval[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
456                 nActiveThreads++;
457             }
458         }
459     }
460 
461     //
462     // If there's only one thread context to bind to, return now.
463     //
464     KMP_ASSERT(nActiveThreads > 0);
465     if (nActiveThreads == 1) {
466         __kmp_ncores = nPackages = 1;
467         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
468         if (__kmp_affinity_verbose) {
469             char buf[KMP_AFFIN_MASK_PRINT_LEN];
470             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
471 
472             KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
473             if (__kmp_affinity_respect_mask) {
474                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
475             } else {
476                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
477             }
478             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
479             KMP_INFORM(Uniform, "KMP_AFFINITY");
480             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
481               __kmp_nThreadsPerCore, __kmp_ncores);
482         }
483 
484         if (__kmp_affinity_type == affinity_none) {
485             __kmp_free(retval);
486             KMP_CPU_FREE(oldMask);
487             return 0;
488         }
489 
490         //
491         // Form an Address object which only includes the package level.
492         //
493         Address addr(1);
494         addr.labels[0] = retval[0].first.labels[pkgLevel];
495         retval[0].first = addr;
496 
497         if (__kmp_affinity_gran_levels < 0) {
498             __kmp_affinity_gran_levels = 0;
499         }
500 
501         if (__kmp_affinity_verbose) {
502             __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
503         }
504 
505         *address2os = retval;
506         KMP_CPU_FREE(oldMask);
507         return 1;
508     }
509 
510     //
511     // Sort the table by physical Id.
512     //
513     qsort(retval, nActiveThreads, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
514 
515     //
516     // When affinity is off, this routine will still be called to set
517     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
518     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
519     // correctly, and return if affinity is not enabled.
520     //
521     __kmp_ncores = hwloc_get_nbobjs_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE);
522 
523     //
524     // Check to see if the machine topology is uniform
525     //
526     unsigned npackages = hwloc_get_nbobjs_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET);
527     unsigned ncores = __kmp_ncores;
528     unsigned nthreads = hwloc_get_nbobjs_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PU);
529     unsigned uniform = (npackages * nCoresPerPkg * __kmp_nThreadsPerCore == nthreads);
530 
531     //
532     // Print the machine topology summary.
533     //
534     if (__kmp_affinity_verbose) {
535         char mask[KMP_AFFIN_MASK_PRINT_LEN];
536         __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
537 
538         KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
539         if (__kmp_affinity_respect_mask) {
540             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
541         } else {
542             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
543         }
544         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
545         if (uniform) {
546             KMP_INFORM(Uniform, "KMP_AFFINITY");
547         } else {
548             KMP_INFORM(NonUniform, "KMP_AFFINITY");
549         }
550 
551         kmp_str_buf_t buf;
552         __kmp_str_buf_init(&buf);
553 
554         __kmp_str_buf_print(&buf, "%d", npackages);
555         //for (level = 1; level <= pkgLevel; level++) {
556         //    __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
557        // }
558         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
559           __kmp_nThreadsPerCore, __kmp_ncores);
560 
561         __kmp_str_buf_free(&buf);
562     }
563 
564     if (__kmp_affinity_type == affinity_none) {
565         KMP_CPU_FREE(oldMask);
566         return 0;
567     }
568 
569     //
570     // Find any levels with radiix 1, and remove them from the map
571     // (except for the package level).
572     //
573     depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth, &pkgLevel, &coreLevel, &threadLevel);
574 
575     if (__kmp_affinity_gran_levels < 0) {
576         //
577         // Set the granularity level based on what levels are modeled
578         // in the machine topology map.
579         //
580         __kmp_affinity_gran_levels = 0;
581         if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
582             __kmp_affinity_gran_levels++;
583         }
584         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
585             __kmp_affinity_gran_levels++;
586         }
587         if (__kmp_affinity_gran > affinity_gran_package) {
588             __kmp_affinity_gran_levels++;
589         }
590     }
591 
592     if (__kmp_affinity_verbose) {
593         __kmp_affinity_print_topology(retval, nActiveThreads, depth, pkgLevel,
594           coreLevel, threadLevel);
595     }
596 
597     KMP_CPU_FREE(oldMask);
598     *address2os = retval;
599     return depth;
600 }
601 #endif // KMP_USE_HWLOC
602 
603 //
604 // If we don't know how to retrieve the machine's processor topology, or
605 // encounter an error in doing so, this routine is called to form a "flat"
606 // mapping of os thread id's <-> processor id's.
607 //
608 static int
609 __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
610   kmp_i18n_id_t *const msg_id)
611 {
612     *address2os = NULL;
613     *msg_id = kmp_i18n_null;
614 
615     //
616     // Even if __kmp_affinity_type == affinity_none, this routine might still
617     // called to set __kmp_ncores, as well as
618     // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
619     //
620     if (! KMP_AFFINITY_CAPABLE()) {
621         KMP_ASSERT(__kmp_affinity_type == affinity_none);
622         __kmp_ncores = nPackages = __kmp_xproc;
623         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
624         if (__kmp_affinity_verbose) {
625             KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
626             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
627             KMP_INFORM(Uniform, "KMP_AFFINITY");
628             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
629               __kmp_nThreadsPerCore, __kmp_ncores);
630         }
631         return 0;
632     }
633 
634     //
635     // When affinity is off, this routine will still be called to set
636     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
637     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
638     //  correctly, and return now if affinity is not enabled.
639     //
640     __kmp_ncores = nPackages = __kmp_avail_proc;
641     __kmp_nThreadsPerCore = nCoresPerPkg = 1;
642     if (__kmp_affinity_verbose) {
643         char buf[KMP_AFFIN_MASK_PRINT_LEN];
644         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, __kmp_affin_fullMask);
645 
646         KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
647         if (__kmp_affinity_respect_mask) {
648             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
649         } else {
650             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
651         }
652         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
653         KMP_INFORM(Uniform, "KMP_AFFINITY");
654         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
655           __kmp_nThreadsPerCore, __kmp_ncores);
656     }
657     if (__kmp_affinity_type == affinity_none) {
658         return 0;
659     }
660 
661     //
662     // Contruct the data structure to be returned.
663     //
664     *address2os = (AddrUnsPair*)
665       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
666     int avail_ct = 0;
667     unsigned int i;
668     KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
669         //
670         // Skip this proc if it is not included in the machine model.
671         //
672         if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
673             continue;
674         }
675 
676         Address addr(1);
677         addr.labels[0] = i;
678         (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
679     }
680     if (__kmp_affinity_verbose) {
681         KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
682     }
683 
684     if (__kmp_affinity_gran_levels < 0) {
685         //
686         // Only the package level is modeled in the machine topology map,
687         // so the #levels of granularity is either 0 or 1.
688         //
689         if (__kmp_affinity_gran > affinity_gran_package) {
690             __kmp_affinity_gran_levels = 1;
691         }
692         else {
693             __kmp_affinity_gran_levels = 0;
694         }
695     }
696     return 1;
697 }
698 
699 
700 # if KMP_GROUP_AFFINITY
701 
702 //
703 // If multiple Windows* OS processor groups exist, we can create a 2-level
704 // topology map with the groups at level 0 and the individual procs at
705 // level 1.
706 //
707 // This facilitates letting the threads float among all procs in a group,
708 // if granularity=group (the default when there are multiple groups).
709 //
710 static int
711 __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
712   kmp_i18n_id_t *const msg_id)
713 {
714     *address2os = NULL;
715     *msg_id = kmp_i18n_null;
716 
717     //
718     // If we don't have multiple processor groups, return now.
719     // The flat mapping will be used.
720     //
721     if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(__kmp_affin_fullMask) >= 0)) {
722         // FIXME set *msg_id
723         return -1;
724     }
725 
726     //
727     // Contruct the data structure to be returned.
728     //
729     *address2os = (AddrUnsPair*)
730       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
731     int avail_ct = 0;
732     int i;
733     KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
734         //
735         // Skip this proc if it is not included in the machine model.
736         //
737         if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
738             continue;
739         }
740 
741         Address addr(2);
742         addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
743         addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
744         (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
745 
746         if (__kmp_affinity_verbose) {
747             KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
748               addr.labels[1]);
749         }
750     }
751 
752     if (__kmp_affinity_gran_levels < 0) {
753         if (__kmp_affinity_gran == affinity_gran_group) {
754             __kmp_affinity_gran_levels = 1;
755         }
756         else if ((__kmp_affinity_gran == affinity_gran_fine)
757           || (__kmp_affinity_gran == affinity_gran_thread)) {
758             __kmp_affinity_gran_levels = 0;
759         }
760         else {
761             const char *gran_str = NULL;
762             if (__kmp_affinity_gran == affinity_gran_core) {
763                 gran_str = "core";
764             }
765             else if (__kmp_affinity_gran == affinity_gran_package) {
766                 gran_str = "package";
767             }
768             else if (__kmp_affinity_gran == affinity_gran_node) {
769                 gran_str = "node";
770             }
771             else {
772                 KMP_ASSERT(0);
773             }
774 
775             // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
776             __kmp_affinity_gran_levels = 0;
777         }
778     }
779     return 2;
780 }
781 
782 # endif /* KMP_GROUP_AFFINITY */
783 
784 
785 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
786 
787 static int
788 __kmp_cpuid_mask_width(int count) {
789     int r = 0;
790 
791     while((1<<r) < count)
792         ++r;
793     return r;
794 }
795 
796 
797 class apicThreadInfo {
798 public:
799     unsigned osId;              // param to __kmp_affinity_bind_thread
800     unsigned apicId;            // from cpuid after binding
801     unsigned maxCoresPerPkg;    //      ""
802     unsigned maxThreadsPerPkg;  //      ""
803     unsigned pkgId;             // inferred from above values
804     unsigned coreId;            //      ""
805     unsigned threadId;          //      ""
806 };
807 
808 
809 static int
810 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
811 {
812     const apicThreadInfo *aa = (const apicThreadInfo *)a;
813     const apicThreadInfo *bb = (const apicThreadInfo *)b;
814     if (aa->osId < bb->osId) return -1;
815     if (aa->osId > bb->osId) return 1;
816     return 0;
817 }
818 
819 
820 static int
821 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
822 {
823     const apicThreadInfo *aa = (const apicThreadInfo *)a;
824     const apicThreadInfo *bb = (const apicThreadInfo *)b;
825     if (aa->pkgId < bb->pkgId) return -1;
826     if (aa->pkgId > bb->pkgId) return 1;
827     if (aa->coreId < bb->coreId) return -1;
828     if (aa->coreId > bb->coreId) return 1;
829     if (aa->threadId < bb->threadId) return -1;
830     if (aa->threadId > bb->threadId) return 1;
831     return 0;
832 }
833 
834 
835 //
836 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
837 // an algorithm which cycles through the available os threads, setting
838 // the current thread's affinity mask to that thread, and then retrieves
839 // the Apic Id for each thread context using the cpuid instruction.
840 //
841 static int
842 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
843   kmp_i18n_id_t *const msg_id)
844 {
845     kmp_cpuid buf;
846     int rc;
847     *address2os = NULL;
848     *msg_id = kmp_i18n_null;
849 
850     //
851     // Check if cpuid leaf 4 is supported.
852     //
853         __kmp_x86_cpuid(0, 0, &buf);
854         if (buf.eax < 4) {
855             *msg_id = kmp_i18n_str_NoLeaf4Support;
856             return -1;
857         }
858 
859     //
860     // The algorithm used starts by setting the affinity to each available
861     // thread and retrieving info from the cpuid instruction, so if we are
862     // not capable of calling __kmp_get_system_affinity() and
863     // _kmp_get_system_affinity(), then we need to do something else - use
864     // the defaults that we calculated from issuing cpuid without binding
865     // to each proc.
866     //
867     if (! KMP_AFFINITY_CAPABLE()) {
868         //
869         // Hack to try and infer the machine topology using only the data
870         // available from cpuid on the current thread, and __kmp_xproc.
871         //
872         KMP_ASSERT(__kmp_affinity_type == affinity_none);
873 
874         //
875         // Get an upper bound on the number of threads per package using
876         // cpuid(1).
877         //
878         // On some OS/chps combinations where HT is supported by the chip
879         // but is disabled, this value will be 2 on a single core chip.
880         // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
881         //
882         __kmp_x86_cpuid(1, 0, &buf);
883         int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
884         if (maxThreadsPerPkg == 0) {
885             maxThreadsPerPkg = 1;
886         }
887 
888         //
889         // The num cores per pkg comes from cpuid(4).
890         // 1 must be added to the encoded value.
891         //
892         // The author of cpu_count.cpp treated this only an upper bound
893         // on the number of cores, but I haven't seen any cases where it
894         // was greater than the actual number of cores, so we will treat
895         // it as exact in this block of code.
896         //
897         // First, we need to check if cpuid(4) is supported on this chip.
898         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
899         // has the value n or greater.
900         //
901         __kmp_x86_cpuid(0, 0, &buf);
902         if (buf.eax >= 4) {
903             __kmp_x86_cpuid(4, 0, &buf);
904             nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
905         }
906         else {
907             nCoresPerPkg = 1;
908         }
909 
910         //
911         // There is no way to reliably tell if HT is enabled without issuing
912         // the cpuid instruction from every thread, can correlating the cpuid
913         // info, so if the machine is not affinity capable, we assume that HT
914         // is off.  We have seen quite a few machines where maxThreadsPerPkg
915         // is 2, yet the machine does not support HT.
916         //
917         // - Older OSes are usually found on machines with older chips, which
918         //   do not support HT.
919         //
920         // - The performance penalty for mistakenly identifying a machine as
921         //   HT when it isn't (which results in blocktime being incorrecly set
922         //   to 0) is greater than the penalty when for mistakenly identifying
923         //   a machine as being 1 thread/core when it is really HT enabled
924         //   (which results in blocktime being incorrectly set to a positive
925         //   value).
926         //
927         __kmp_ncores = __kmp_xproc;
928         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
929         __kmp_nThreadsPerCore = 1;
930         if (__kmp_affinity_verbose) {
931             KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
932             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
933             if (__kmp_affinity_uniform_topology()) {
934                 KMP_INFORM(Uniform, "KMP_AFFINITY");
935             } else {
936                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
937             }
938             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
939               __kmp_nThreadsPerCore, __kmp_ncores);
940         }
941         return 0;
942     }
943 
944     //
945     //
946     // From here on, we can assume that it is safe to call
947     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
948     // even if __kmp_affinity_type = affinity_none.
949     //
950 
951     //
952     // Save the affinity mask for the current thread.
953     //
954     kmp_affin_mask_t *oldMask;
955     KMP_CPU_ALLOC(oldMask);
956     KMP_ASSERT(oldMask != NULL);
957     __kmp_get_system_affinity(oldMask, TRUE);
958 
959     //
960     // Run through each of the available contexts, binding the current thread
961     // to it, and obtaining the pertinent information using the cpuid instr.
962     //
963     // The relevant information is:
964     //
965     // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
966     //    has a uniqie Apic Id, which is of the form pkg# : core# : thread#.
967     //
968     // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
969     //    value of this field determines the width of the core# + thread#
970     //    fields in the Apic Id.  It is also an upper bound on the number
971     //    of threads per package, but it has been verified that situations
972     //    happen were it is not exact.  In particular, on certain OS/chip
973     //    combinations where Intel(R) Hyper-Threading Technology is supported
974     //    by the chip but has
975     //    been disabled, the value of this field will be 2 (for a single core
976     //    chip).  On other OS/chip combinations supporting
977     //    Intel(R) Hyper-Threading Technology, the value of
978     //    this field will be 1 when Intel(R) Hyper-Threading Technology is
979     //    disabled and 2 when it is enabled.
980     //
981     // Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4).  The
982     //    value of this field (+1) determines the width of the core# field in
983     //    the Apic Id.  The comments in "cpucount.cpp" say that this value is
984     //    an upper bound, but the IA-32 architecture manual says that it is
985     //    exactly the number of cores per package, and I haven't seen any
986     //    case where it wasn't.
987     //
988     // From this information, deduce the package Id, core Id, and thread Id,
989     // and set the corresponding fields in the apicThreadInfo struct.
990     //
991     unsigned i;
992     apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
993       __kmp_avail_proc * sizeof(apicThreadInfo));
994     unsigned nApics = 0;
995     KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
996         //
997         // Skip this proc if it is not included in the machine model.
998         //
999         if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
1000             continue;
1001         }
1002         KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
1003 
1004         __kmp_affinity_bind_thread(i);
1005         threadInfo[nApics].osId = i;
1006 
1007         //
1008         // The apic id and max threads per pkg come from cpuid(1).
1009         //
1010         __kmp_x86_cpuid(1, 0, &buf);
1011         if (! (buf.edx >> 9) & 1) {
1012             __kmp_set_system_affinity(oldMask, TRUE);
1013             __kmp_free(threadInfo);
1014             KMP_CPU_FREE(oldMask);
1015             *msg_id = kmp_i18n_str_ApicNotPresent;
1016             return -1;
1017         }
1018         threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
1019         threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
1020         if (threadInfo[nApics].maxThreadsPerPkg == 0) {
1021             threadInfo[nApics].maxThreadsPerPkg = 1;
1022         }
1023 
1024         //
1025         // Max cores per pkg comes from cpuid(4).
1026         // 1 must be added to the encoded value.
1027         //
1028         // First, we need to check if cpuid(4) is supported on this chip.
1029         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
1030         // has the value n or greater.
1031         //
1032         __kmp_x86_cpuid(0, 0, &buf);
1033         if (buf.eax >= 4) {
1034             __kmp_x86_cpuid(4, 0, &buf);
1035             threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1036         }
1037         else {
1038             threadInfo[nApics].maxCoresPerPkg = 1;
1039         }
1040 
1041         //
1042         // Infer the pkgId / coreId / threadId using only the info
1043         // obtained locally.
1044         //
1045         int widthCT = __kmp_cpuid_mask_width(
1046           threadInfo[nApics].maxThreadsPerPkg);
1047         threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
1048 
1049         int widthC = __kmp_cpuid_mask_width(
1050           threadInfo[nApics].maxCoresPerPkg);
1051         int widthT = widthCT - widthC;
1052         if (widthT < 0) {
1053             //
1054             // I've never seen this one happen, but I suppose it could, if
1055             // the cpuid instruction on a chip was really screwed up.
1056             // Make sure to restore the affinity mask before the tail call.
1057             //
1058             __kmp_set_system_affinity(oldMask, TRUE);
1059             __kmp_free(threadInfo);
1060             KMP_CPU_FREE(oldMask);
1061             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1062             return -1;
1063         }
1064 
1065         int maskC = (1 << widthC) - 1;
1066         threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
1067           &maskC;
1068 
1069         int maskT = (1 << widthT) - 1;
1070         threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT;
1071 
1072         nApics++;
1073     }
1074 
1075     //
1076     // We've collected all the info we need.
1077     // Restore the old affinity mask for this thread.
1078     //
1079     __kmp_set_system_affinity(oldMask, TRUE);
1080 
1081     //
1082     // If there's only one thread context to bind to, form an Address object
1083     // with depth 1 and return immediately (or, if affinity is off, set
1084     // address2os to NULL and return).
1085     //
1086     // If it is configured to omit the package level when there is only a
1087     // single package, the logic at the end of this routine won't work if
1088     // there is only a single thread - it would try to form an Address
1089     // object with depth 0.
1090     //
1091     KMP_ASSERT(nApics > 0);
1092     if (nApics == 1) {
1093         __kmp_ncores = nPackages = 1;
1094         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1095         if (__kmp_affinity_verbose) {
1096             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1097             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1098 
1099             KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1100             if (__kmp_affinity_respect_mask) {
1101                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1102             } else {
1103                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1104             }
1105             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1106             KMP_INFORM(Uniform, "KMP_AFFINITY");
1107             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1108               __kmp_nThreadsPerCore, __kmp_ncores);
1109         }
1110 
1111         if (__kmp_affinity_type == affinity_none) {
1112             __kmp_free(threadInfo);
1113             KMP_CPU_FREE(oldMask);
1114             return 0;
1115         }
1116 
1117         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
1118         Address addr(1);
1119         addr.labels[0] = threadInfo[0].pkgId;
1120         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
1121 
1122         if (__kmp_affinity_gran_levels < 0) {
1123             __kmp_affinity_gran_levels = 0;
1124         }
1125 
1126         if (__kmp_affinity_verbose) {
1127             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1128         }
1129 
1130         __kmp_free(threadInfo);
1131         KMP_CPU_FREE(oldMask);
1132         return 1;
1133     }
1134 
1135     //
1136     // Sort the threadInfo table by physical Id.
1137     //
1138     qsort(threadInfo, nApics, sizeof(*threadInfo),
1139       __kmp_affinity_cmp_apicThreadInfo_phys_id);
1140 
1141     //
1142     // The table is now sorted by pkgId / coreId / threadId, but we really
1143     // don't know the radix of any of the fields.  pkgId's may be sparsely
1144     // assigned among the chips on a system.  Although coreId's are usually
1145     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1146     // [0..threadsPerCore-1], we don't want to make any such assumptions.
1147     //
1148     // For that matter, we don't know what coresPerPkg and threadsPerCore
1149     // (or the total # packages) are at this point - we want to determine
1150     // that now.  We only have an upper bound on the first two figures.
1151     //
1152     // We also perform a consistency check at this point: the values returned
1153     // by the cpuid instruction for any thread bound to a given package had
1154     // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1155     //
1156     nPackages = 1;
1157     nCoresPerPkg = 1;
1158     __kmp_nThreadsPerCore = 1;
1159     unsigned nCores = 1;
1160 
1161     unsigned pkgCt = 1;                         // to determine radii
1162     unsigned lastPkgId = threadInfo[0].pkgId;
1163     unsigned coreCt = 1;
1164     unsigned lastCoreId = threadInfo[0].coreId;
1165     unsigned threadCt = 1;
1166     unsigned lastThreadId = threadInfo[0].threadId;
1167 
1168                                                 // intra-pkg consist checks
1169     unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1170     unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1171 
1172     for (i = 1; i < nApics; i++) {
1173         if (threadInfo[i].pkgId != lastPkgId) {
1174             nCores++;
1175             pkgCt++;
1176             lastPkgId = threadInfo[i].pkgId;
1177             if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1178             coreCt = 1;
1179             lastCoreId = threadInfo[i].coreId;
1180             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1181             threadCt = 1;
1182             lastThreadId = threadInfo[i].threadId;
1183 
1184             //
1185             // This is a different package, so go on to the next iteration
1186             // without doing any consistency checks.  Reset the consistency
1187             // check vars, though.
1188             //
1189             prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1190             prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1191             continue;
1192         }
1193 
1194         if (threadInfo[i].coreId != lastCoreId) {
1195             nCores++;
1196             coreCt++;
1197             lastCoreId = threadInfo[i].coreId;
1198             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1199             threadCt = 1;
1200             lastThreadId = threadInfo[i].threadId;
1201         }
1202         else if (threadInfo[i].threadId != lastThreadId) {
1203             threadCt++;
1204             lastThreadId = threadInfo[i].threadId;
1205         }
1206         else {
1207             __kmp_free(threadInfo);
1208             KMP_CPU_FREE(oldMask);
1209             *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1210             return -1;
1211         }
1212 
1213         //
1214         // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
1215         // fields agree between all the threads bounds to a given package.
1216         //
1217         if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
1218           || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1219             __kmp_free(threadInfo);
1220             KMP_CPU_FREE(oldMask);
1221             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1222             return -1;
1223         }
1224     }
1225     nPackages = pkgCt;
1226     if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1227     if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1228 
1229     //
1230     // When affinity is off, this routine will still be called to set
1231     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1232     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1233     // correctly, and return now if affinity is not enabled.
1234     //
1235     __kmp_ncores = nCores;
1236     if (__kmp_affinity_verbose) {
1237         char buf[KMP_AFFIN_MASK_PRINT_LEN];
1238         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1239 
1240         KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1241         if (__kmp_affinity_respect_mask) {
1242             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1243         } else {
1244             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1245         }
1246         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1247         if (__kmp_affinity_uniform_topology()) {
1248             KMP_INFORM(Uniform, "KMP_AFFINITY");
1249         } else {
1250             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1251         }
1252         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1253           __kmp_nThreadsPerCore, __kmp_ncores);
1254 
1255     }
1256 
1257     if (__kmp_affinity_type == affinity_none) {
1258         __kmp_free(threadInfo);
1259         KMP_CPU_FREE(oldMask);
1260         return 0;
1261     }
1262 
1263     //
1264     // Now that we've determined the number of packages, the number of cores
1265     // per package, and the number of threads per core, we can construct the
1266     // data structure that is to be returned.
1267     //
1268     int pkgLevel = 0;
1269     int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1270     int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1271     unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
1272 
1273     KMP_ASSERT(depth > 0);
1274     *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1275 
1276     for (i = 0; i < nApics; ++i) {
1277         Address addr(depth);
1278         unsigned os = threadInfo[i].osId;
1279         int d = 0;
1280 
1281         if (pkgLevel >= 0) {
1282             addr.labels[d++] = threadInfo[i].pkgId;
1283         }
1284         if (coreLevel >= 0) {
1285             addr.labels[d++] = threadInfo[i].coreId;
1286         }
1287         if (threadLevel >= 0) {
1288             addr.labels[d++] = threadInfo[i].threadId;
1289         }
1290         (*address2os)[i] = AddrUnsPair(addr, os);
1291     }
1292 
1293     if (__kmp_affinity_gran_levels < 0) {
1294         //
1295         // Set the granularity level based on what levels are modeled
1296         // in the machine topology map.
1297         //
1298         __kmp_affinity_gran_levels = 0;
1299         if ((threadLevel >= 0)
1300           && (__kmp_affinity_gran > affinity_gran_thread)) {
1301             __kmp_affinity_gran_levels++;
1302         }
1303         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1304             __kmp_affinity_gran_levels++;
1305         }
1306         if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1307             __kmp_affinity_gran_levels++;
1308         }
1309     }
1310 
1311     if (__kmp_affinity_verbose) {
1312         __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1313           coreLevel, threadLevel);
1314     }
1315 
1316     __kmp_free(threadInfo);
1317     KMP_CPU_FREE(oldMask);
1318     return depth;
1319 }
1320 
1321 
1322 //
1323 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1324 // architectures support a newer interface for specifying the x2APIC Ids,
1325 // based on cpuid leaf 11.
1326 //
1327 static int
1328 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1329   kmp_i18n_id_t *const msg_id)
1330 {
1331     kmp_cpuid buf;
1332 
1333     *address2os = NULL;
1334     *msg_id = kmp_i18n_null;
1335 
1336     //
1337     // Check to see if cpuid leaf 11 is supported.
1338     //
1339     __kmp_x86_cpuid(0, 0, &buf);
1340     if (buf.eax < 11) {
1341         *msg_id = kmp_i18n_str_NoLeaf11Support;
1342         return -1;
1343     }
1344     __kmp_x86_cpuid(11, 0, &buf);
1345     if (buf.ebx == 0) {
1346         *msg_id = kmp_i18n_str_NoLeaf11Support;
1347         return -1;
1348     }
1349 
1350     //
1351     // Find the number of levels in the machine topology.  While we're at it,
1352     // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.  We will
1353     // try to get more accurate values later by explicitly counting them,
1354     // but get reasonable defaults now, in case we return early.
1355     //
1356     int level;
1357     int threadLevel = -1;
1358     int coreLevel = -1;
1359     int pkgLevel = -1;
1360     __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1361 
1362     for (level = 0;; level++) {
1363         if (level > 31) {
1364             //
1365             // FIXME: Hack for DPD200163180
1366             //
1367             // If level is big then something went wrong -> exiting
1368             //
1369             // There could actually be 32 valid levels in the machine topology,
1370             // but so far, the only machine we have seen which does not exit
1371             // this loop before iteration 32 has fubar x2APIC settings.
1372             //
1373             // For now, just reject this case based upon loop trip count.
1374             //
1375             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1376             return -1;
1377         }
1378         __kmp_x86_cpuid(11, level, &buf);
1379         if (buf.ebx == 0) {
1380             if (pkgLevel < 0) {
1381                 //
1382                 // Will infer nPackages from __kmp_xproc
1383                 //
1384                 pkgLevel = level;
1385                 level++;
1386             }
1387             break;
1388         }
1389         int kind = (buf.ecx >> 8) & 0xff;
1390         if (kind == 1) {
1391             //
1392             // SMT level
1393             //
1394             threadLevel = level;
1395             coreLevel = -1;
1396             pkgLevel = -1;
1397             __kmp_nThreadsPerCore = buf.ebx & 0xff;
1398             if (__kmp_nThreadsPerCore == 0) {
1399                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1400                 return -1;
1401             }
1402         }
1403         else if (kind == 2) {
1404             //
1405             // core level
1406             //
1407             coreLevel = level;
1408             pkgLevel = -1;
1409             nCoresPerPkg = buf.ebx & 0xff;
1410             if (nCoresPerPkg == 0) {
1411                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1412                 return -1;
1413             }
1414         }
1415         else {
1416             if (level <= 0) {
1417                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1418                 return -1;
1419             }
1420             if (pkgLevel >= 0) {
1421                 continue;
1422             }
1423             pkgLevel = level;
1424             nPackages = buf.ebx & 0xff;
1425             if (nPackages == 0) {
1426                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1427                 return -1;
1428             }
1429         }
1430     }
1431     int depth = level;
1432 
1433     //
1434     // In the above loop, "level" was counted from the finest level (usually
1435     // thread) to the coarsest.  The caller expects that we will place the
1436     // labels in (*address2os)[].first.labels[] in the inverse order, so
1437     // we need to invert the vars saying which level means what.
1438     //
1439     if (threadLevel >= 0) {
1440         threadLevel = depth - threadLevel - 1;
1441     }
1442     if (coreLevel >= 0) {
1443         coreLevel = depth - coreLevel - 1;
1444     }
1445     KMP_DEBUG_ASSERT(pkgLevel >= 0);
1446     pkgLevel = depth - pkgLevel - 1;
1447 
1448     //
1449     // The algorithm used starts by setting the affinity to each available
1450     // thread and retrieving info from the cpuid instruction, so if we are
1451     // not capable of calling __kmp_get_system_affinity() and
1452     // _kmp_get_system_affinity(), then we need to do something else - use
1453     // the defaults that we calculated from issuing cpuid without binding
1454     // to each proc.
1455     //
1456     if (! KMP_AFFINITY_CAPABLE())
1457     {
1458         //
1459         // Hack to try and infer the machine topology using only the data
1460         // available from cpuid on the current thread, and __kmp_xproc.
1461         //
1462         KMP_ASSERT(__kmp_affinity_type == affinity_none);
1463 
1464         __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1465         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1466         if (__kmp_affinity_verbose) {
1467             KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1468             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1469             if (__kmp_affinity_uniform_topology()) {
1470                 KMP_INFORM(Uniform, "KMP_AFFINITY");
1471             } else {
1472                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1473             }
1474             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1475               __kmp_nThreadsPerCore, __kmp_ncores);
1476         }
1477         return 0;
1478     }
1479 
1480     //
1481     //
1482     // From here on, we can assume that it is safe to call
1483     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1484     // even if __kmp_affinity_type = affinity_none.
1485     //
1486 
1487     //
1488     // Save the affinity mask for the current thread.
1489     //
1490     kmp_affin_mask_t *oldMask;
1491     KMP_CPU_ALLOC(oldMask);
1492     __kmp_get_system_affinity(oldMask, TRUE);
1493 
1494     //
1495     // Allocate the data structure to be returned.
1496     //
1497     AddrUnsPair *retval = (AddrUnsPair *)
1498       __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1499 
1500     //
1501     // Run through each of the available contexts, binding the current thread
1502     // to it, and obtaining the pertinent information using the cpuid instr.
1503     //
1504     unsigned int proc;
1505     int nApics = 0;
1506     KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
1507         //
1508         // Skip this proc if it is not included in the machine model.
1509         //
1510         if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
1511             continue;
1512         }
1513         KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1514 
1515         __kmp_affinity_bind_thread(proc);
1516 
1517         //
1518         // Extrach the labels for each level in the machine topology map
1519         // from the Apic ID.
1520         //
1521         Address addr(depth);
1522         int prev_shift = 0;
1523 
1524         for (level = 0; level < depth; level++) {
1525             __kmp_x86_cpuid(11, level, &buf);
1526             unsigned apicId = buf.edx;
1527             if (buf.ebx == 0) {
1528                 if (level != depth - 1) {
1529                     KMP_CPU_FREE(oldMask);
1530                     *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1531                     return -1;
1532                 }
1533                 addr.labels[depth - level - 1] = apicId >> prev_shift;
1534                 level++;
1535                 break;
1536             }
1537             int shift = buf.eax & 0x1f;
1538             int mask = (1 << shift) - 1;
1539             addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1540             prev_shift = shift;
1541         }
1542         if (level != depth) {
1543             KMP_CPU_FREE(oldMask);
1544             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1545             return -1;
1546         }
1547 
1548         retval[nApics] = AddrUnsPair(addr, proc);
1549         nApics++;
1550     }
1551 
1552     //
1553     // We've collected all the info we need.
1554     // Restore the old affinity mask for this thread.
1555     //
1556     __kmp_set_system_affinity(oldMask, TRUE);
1557 
1558     //
1559     // If there's only one thread context to bind to, return now.
1560     //
1561     KMP_ASSERT(nApics > 0);
1562     if (nApics == 1) {
1563         __kmp_ncores = nPackages = 1;
1564         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1565         if (__kmp_affinity_verbose) {
1566             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1567             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1568 
1569             KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1570             if (__kmp_affinity_respect_mask) {
1571                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1572             } else {
1573                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1574             }
1575             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1576             KMP_INFORM(Uniform, "KMP_AFFINITY");
1577             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1578               __kmp_nThreadsPerCore, __kmp_ncores);
1579         }
1580 
1581         if (__kmp_affinity_type == affinity_none) {
1582             __kmp_free(retval);
1583             KMP_CPU_FREE(oldMask);
1584             return 0;
1585         }
1586 
1587         //
1588         // Form an Address object which only includes the package level.
1589         //
1590         Address addr(1);
1591         addr.labels[0] = retval[0].first.labels[pkgLevel];
1592         retval[0].first = addr;
1593 
1594         if (__kmp_affinity_gran_levels < 0) {
1595             __kmp_affinity_gran_levels = 0;
1596         }
1597 
1598         if (__kmp_affinity_verbose) {
1599             __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1600         }
1601 
1602         *address2os = retval;
1603         KMP_CPU_FREE(oldMask);
1604         return 1;
1605     }
1606 
1607     //
1608     // Sort the table by physical Id.
1609     //
1610     qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1611 
1612     //
1613     // Find the radix at each of the levels.
1614     //
1615     unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1616     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1617     unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1618     unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1619     for (level = 0; level < depth; level++) {
1620         totals[level] = 1;
1621         maxCt[level] = 1;
1622         counts[level] = 1;
1623         last[level] = retval[0].first.labels[level];
1624     }
1625 
1626     //
1627     // From here on, the iteration variable "level" runs from the finest
1628     // level to the coarsest, i.e. we iterate forward through
1629     // (*address2os)[].first.labels[] - in the previous loops, we iterated
1630     // backwards.
1631     //
1632     for (proc = 1; (int)proc < nApics; proc++) {
1633         int level;
1634         for (level = 0; level < depth; level++) {
1635             if (retval[proc].first.labels[level] != last[level]) {
1636                 int j;
1637                 for (j = level + 1; j < depth; j++) {
1638                     totals[j]++;
1639                     counts[j] = 1;
1640                     // The line below causes printing incorrect topology information
1641                     // in case the max value for some level (maxCt[level]) is encountered earlier than
1642                     // some less value while going through the array.
1643                     // For example, let pkg0 has 4 cores and pkg1 has 2 cores. Then maxCt[1] == 2
1644                     // whereas it must be 4.
1645                     // TODO!!! Check if it can be commented safely
1646                     //maxCt[j] = 1;
1647                     last[j] = retval[proc].first.labels[j];
1648                 }
1649                 totals[level]++;
1650                 counts[level]++;
1651                 if (counts[level] > maxCt[level]) {
1652                     maxCt[level] = counts[level];
1653                 }
1654                 last[level] = retval[proc].first.labels[level];
1655                 break;
1656             }
1657             else if (level == depth - 1) {
1658                 __kmp_free(last);
1659                 __kmp_free(maxCt);
1660                 __kmp_free(counts);
1661                 __kmp_free(totals);
1662                 __kmp_free(retval);
1663                 KMP_CPU_FREE(oldMask);
1664                 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1665                 return -1;
1666             }
1667         }
1668     }
1669 
1670     //
1671     // When affinity is off, this routine will still be called to set
1672     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1673     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1674     // correctly, and return if affinity is not enabled.
1675     //
1676     if (threadLevel >= 0) {
1677         __kmp_nThreadsPerCore = maxCt[threadLevel];
1678     }
1679     else {
1680         __kmp_nThreadsPerCore = 1;
1681     }
1682     nPackages = totals[pkgLevel];
1683 
1684     if (coreLevel >= 0) {
1685         __kmp_ncores = totals[coreLevel];
1686         nCoresPerPkg = maxCt[coreLevel];
1687     }
1688     else {
1689         __kmp_ncores = nPackages;
1690         nCoresPerPkg = 1;
1691     }
1692 
1693     //
1694     // Check to see if the machine topology is uniform
1695     //
1696     unsigned prod = maxCt[0];
1697     for (level = 1; level < depth; level++) {
1698        prod *= maxCt[level];
1699     }
1700     bool uniform = (prod == totals[level - 1]);
1701 
1702     //
1703     // Print the machine topology summary.
1704     //
1705     if (__kmp_affinity_verbose) {
1706         char mask[KMP_AFFIN_MASK_PRINT_LEN];
1707         __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1708 
1709         KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1710         if (__kmp_affinity_respect_mask) {
1711             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1712         } else {
1713             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1714         }
1715         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1716         if (uniform) {
1717             KMP_INFORM(Uniform, "KMP_AFFINITY");
1718         } else {
1719             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1720         }
1721 
1722         kmp_str_buf_t buf;
1723         __kmp_str_buf_init(&buf);
1724 
1725         __kmp_str_buf_print(&buf, "%d", totals[0]);
1726         for (level = 1; level <= pkgLevel; level++) {
1727             __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1728         }
1729         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1730           __kmp_nThreadsPerCore, __kmp_ncores);
1731 
1732         __kmp_str_buf_free(&buf);
1733     }
1734 
1735     if (__kmp_affinity_type == affinity_none) {
1736         __kmp_free(last);
1737         __kmp_free(maxCt);
1738         __kmp_free(counts);
1739         __kmp_free(totals);
1740         __kmp_free(retval);
1741         KMP_CPU_FREE(oldMask);
1742         return 0;
1743     }
1744 
1745     //
1746     // Find any levels with radiix 1, and remove them from the map
1747     // (except for the package level).
1748     //
1749     int new_depth = 0;
1750     for (level = 0; level < depth; level++) {
1751         if ((maxCt[level] == 1) && (level != pkgLevel)) {
1752            continue;
1753         }
1754         new_depth++;
1755     }
1756 
1757     //
1758     // If we are removing any levels, allocate a new vector to return,
1759     // and copy the relevant information to it.
1760     //
1761     if (new_depth != depth) {
1762         AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1763           sizeof(AddrUnsPair) * nApics);
1764         for (proc = 0; (int)proc < nApics; proc++) {
1765             Address addr(new_depth);
1766             new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1767         }
1768         int new_level = 0;
1769         int newPkgLevel = -1;
1770         int newCoreLevel = -1;
1771         int newThreadLevel = -1;
1772         int i;
1773         for (level = 0; level < depth; level++) {
1774             if ((maxCt[level] == 1)
1775               && (level != pkgLevel)) {
1776                 //
1777                 // Remove this level. Never remove the package level
1778                 //
1779                 continue;
1780             }
1781             if (level == pkgLevel) {
1782                 newPkgLevel = level;
1783             }
1784             if (level == coreLevel) {
1785                 newCoreLevel = level;
1786             }
1787             if (level == threadLevel) {
1788                 newThreadLevel = level;
1789             }
1790             for (proc = 0; (int)proc < nApics; proc++) {
1791                 new_retval[proc].first.labels[new_level]
1792                   = retval[proc].first.labels[level];
1793             }
1794             new_level++;
1795         }
1796 
1797         __kmp_free(retval);
1798         retval = new_retval;
1799         depth = new_depth;
1800         pkgLevel = newPkgLevel;
1801         coreLevel = newCoreLevel;
1802         threadLevel = newThreadLevel;
1803     }
1804 
1805     if (__kmp_affinity_gran_levels < 0) {
1806         //
1807         // Set the granularity level based on what levels are modeled
1808         // in the machine topology map.
1809         //
1810         __kmp_affinity_gran_levels = 0;
1811         if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1812             __kmp_affinity_gran_levels++;
1813         }
1814         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1815             __kmp_affinity_gran_levels++;
1816         }
1817         if (__kmp_affinity_gran > affinity_gran_package) {
1818             __kmp_affinity_gran_levels++;
1819         }
1820     }
1821 
1822     if (__kmp_affinity_verbose) {
1823         __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1824           coreLevel, threadLevel);
1825     }
1826 
1827     __kmp_free(last);
1828     __kmp_free(maxCt);
1829     __kmp_free(counts);
1830     __kmp_free(totals);
1831     KMP_CPU_FREE(oldMask);
1832     *address2os = retval;
1833     return depth;
1834 }
1835 
1836 
1837 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1838 
1839 
1840 #define osIdIndex       0
1841 #define threadIdIndex   1
1842 #define coreIdIndex     2
1843 #define pkgIdIndex      3
1844 #define nodeIdIndex     4
1845 
1846 typedef unsigned *ProcCpuInfo;
1847 static unsigned maxIndex = pkgIdIndex;
1848 
1849 
1850 static int
1851 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1852 {
1853     const unsigned *aa = (const unsigned *)a;
1854     const unsigned *bb = (const unsigned *)b;
1855     if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1856     if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1857     return 0;
1858 };
1859 
1860 
1861 static int
1862 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1863 {
1864     unsigned i;
1865     const unsigned *aa = *((const unsigned **)a);
1866     const unsigned *bb = *((const unsigned **)b);
1867     for (i = maxIndex; ; i--) {
1868         if (aa[i] < bb[i]) return -1;
1869         if (aa[i] > bb[i]) return 1;
1870         if (i == osIdIndex) break;
1871     }
1872     return 0;
1873 }
1874 
1875 
1876 //
1877 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1878 // affinity map.
1879 //
1880 static int
1881 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1882   kmp_i18n_id_t *const msg_id, FILE *f)
1883 {
1884     *address2os = NULL;
1885     *msg_id = kmp_i18n_null;
1886 
1887     //
1888     // Scan of the file, and count the number of "processor" (osId) fields,
1889     // and find the highest value of <n> for a node_<n> field.
1890     //
1891     char buf[256];
1892     unsigned num_records = 0;
1893     while (! feof(f)) {
1894         buf[sizeof(buf) - 1] = 1;
1895         if (! fgets(buf, sizeof(buf), f)) {
1896             //
1897             // Read errors presumably because of EOF
1898             //
1899             break;
1900         }
1901 
1902         char s1[] = "processor";
1903         if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1904             num_records++;
1905             continue;
1906         }
1907 
1908         //
1909         // FIXME - this will match "node_<n> <garbage>"
1910         //
1911         unsigned level;
1912         if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
1913             if (nodeIdIndex + level >= maxIndex) {
1914                 maxIndex = nodeIdIndex + level;
1915             }
1916             continue;
1917         }
1918     }
1919 
1920     //
1921     // Check for empty file / no valid processor records, or too many.
1922     // The number of records can't exceed the number of valid bits in the
1923     // affinity mask.
1924     //
1925     if (num_records == 0) {
1926         *line = 0;
1927         *msg_id = kmp_i18n_str_NoProcRecords;
1928         return -1;
1929     }
1930     if (num_records > (unsigned)__kmp_xproc) {
1931         *line = 0;
1932         *msg_id = kmp_i18n_str_TooManyProcRecords;
1933         return -1;
1934     }
1935 
1936     //
1937     // Set the file pointer back to the begginning, so that we can scan the
1938     // file again, this time performing a full parse of the data.
1939     // Allocate a vector of ProcCpuInfo object, where we will place the data.
1940     // Adding an extra element at the end allows us to remove a lot of extra
1941     // checks for termination conditions.
1942     //
1943     if (fseek(f, 0, SEEK_SET) != 0) {
1944         *line = 0;
1945         *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1946         return -1;
1947     }
1948 
1949     //
1950     // Allocate the array of records to store the proc info in.  The dummy
1951     // element at the end makes the logic in filling them out easier to code.
1952     //
1953     unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1954       * sizeof(unsigned *));
1955     unsigned i;
1956     for (i = 0; i <= num_records; i++) {
1957         threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1958           * sizeof(unsigned));
1959     }
1960 
1961 #define CLEANUP_THREAD_INFO \
1962     for (i = 0; i <= num_records; i++) {                                \
1963         __kmp_free(threadInfo[i]);                                      \
1964     }                                                                   \
1965     __kmp_free(threadInfo);
1966 
1967     //
1968     // A value of UINT_MAX means that we didn't find the field
1969     //
1970     unsigned __index;
1971 
1972 #define INIT_PROC_INFO(p) \
1973     for (__index = 0; __index <= maxIndex; __index++) {                 \
1974         (p)[__index] = UINT_MAX;                                        \
1975     }
1976 
1977     for (i = 0; i <= num_records; i++) {
1978         INIT_PROC_INFO(threadInfo[i]);
1979     }
1980 
1981     unsigned num_avail = 0;
1982     *line = 0;
1983     while (! feof(f)) {
1984         //
1985         // Create an inner scoping level, so that all the goto targets at the
1986         // end of the loop appear in an outer scoping level.  This avoids
1987         // warnings about jumping past an initialization to a target in the
1988         // same block.
1989         //
1990         {
1991             buf[sizeof(buf) - 1] = 1;
1992             bool long_line = false;
1993             if (! fgets(buf, sizeof(buf), f)) {
1994                 //
1995                 // Read errors presumably because of EOF
1996                 //
1997                 // If there is valid data in threadInfo[num_avail], then fake
1998                 // a blank line in ensure that the last address gets parsed.
1999                 //
2000                 bool valid = false;
2001                 for (i = 0; i <= maxIndex; i++) {
2002                     if (threadInfo[num_avail][i] != UINT_MAX) {
2003                         valid = true;
2004                     }
2005                 }
2006                 if (! valid) {
2007                     break;
2008                 }
2009                 buf[0] = 0;
2010             } else if (!buf[sizeof(buf) - 1]) {
2011                 //
2012                 // The line is longer than the buffer.  Set a flag and don't
2013                 // emit an error if we were going to ignore the line, anyway.
2014                 //
2015                 long_line = true;
2016 
2017 #define CHECK_LINE \
2018     if (long_line) {                                                    \
2019         CLEANUP_THREAD_INFO;                                            \
2020         *msg_id = kmp_i18n_str_LongLineCpuinfo;                         \
2021         return -1;                                                      \
2022     }
2023             }
2024             (*line)++;
2025 
2026             char s1[] = "processor";
2027             if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2028                 CHECK_LINE;
2029                 char *p = strchr(buf + sizeof(s1) - 1, ':');
2030                 unsigned val;
2031                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2032                 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
2033                 threadInfo[num_avail][osIdIndex] = val;
2034 #if KMP_OS_LINUX && USE_SYSFS_INFO
2035                 char path[256];
2036                 KMP_SNPRINTF(path, sizeof(path),
2037                     "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
2038                     threadInfo[num_avail][osIdIndex]);
2039                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
2040 
2041                 KMP_SNPRINTF(path, sizeof(path),
2042                     "/sys/devices/system/cpu/cpu%u/topology/core_id",
2043                     threadInfo[num_avail][osIdIndex]);
2044                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
2045                 continue;
2046 #else
2047             }
2048             char s2[] = "physical id";
2049             if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
2050                 CHECK_LINE;
2051                 char *p = strchr(buf + sizeof(s2) - 1, ':');
2052                 unsigned val;
2053                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2054                 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
2055                 threadInfo[num_avail][pkgIdIndex] = val;
2056                 continue;
2057             }
2058             char s3[] = "core id";
2059             if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2060                 CHECK_LINE;
2061                 char *p = strchr(buf + sizeof(s3) - 1, ':');
2062                 unsigned val;
2063                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2064                 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
2065                 threadInfo[num_avail][coreIdIndex] = val;
2066                 continue;
2067 #endif // KMP_OS_LINUX && USE_SYSFS_INFO
2068             }
2069             char s4[] = "thread id";
2070             if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2071                 CHECK_LINE;
2072                 char *p = strchr(buf + sizeof(s4) - 1, ':');
2073                 unsigned val;
2074                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2075                 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
2076                 threadInfo[num_avail][threadIdIndex] = val;
2077                 continue;
2078             }
2079             unsigned level;
2080             if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
2081                 CHECK_LINE;
2082                 char *p = strchr(buf + sizeof(s4) - 1, ':');
2083                 unsigned val;
2084                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2085                 KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2086                 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
2087                 threadInfo[num_avail][nodeIdIndex + level] = val;
2088                 continue;
2089             }
2090 
2091             //
2092             // We didn't recognize the leading token on the line.
2093             // There are lots of leading tokens that we don't recognize -
2094             // if the line isn't empty, go on to the next line.
2095             //
2096             if ((*buf != 0) && (*buf != '\n')) {
2097                 //
2098                 // If the line is longer than the buffer, read characters
2099                 // until we find a newline.
2100                 //
2101                 if (long_line) {
2102                     int ch;
2103                     while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
2104                 }
2105                 continue;
2106             }
2107 
2108             //
2109             // A newline has signalled the end of the processor record.
2110             // Check that there aren't too many procs specified.
2111             //
2112             if ((int)num_avail == __kmp_xproc) {
2113                 CLEANUP_THREAD_INFO;
2114                 *msg_id = kmp_i18n_str_TooManyEntries;
2115                 return -1;
2116             }
2117 
2118             //
2119             // Check for missing fields.  The osId field must be there, and we
2120             // currently require that the physical id field is specified, also.
2121             //
2122             if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2123                 CLEANUP_THREAD_INFO;
2124                 *msg_id = kmp_i18n_str_MissingProcField;
2125                 return -1;
2126             }
2127             if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2128                 CLEANUP_THREAD_INFO;
2129                 *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2130                 return -1;
2131             }
2132 
2133             //
2134             // Skip this proc if it is not included in the machine model.
2135             //
2136             if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], __kmp_affin_fullMask)) {
2137                 INIT_PROC_INFO(threadInfo[num_avail]);
2138                 continue;
2139             }
2140 
2141             //
2142             // We have a successful parse of this proc's info.
2143             // Increment the counter, and prepare for the next proc.
2144             //
2145             num_avail++;
2146             KMP_ASSERT(num_avail <= num_records);
2147             INIT_PROC_INFO(threadInfo[num_avail]);
2148         }
2149         continue;
2150 
2151         no_val:
2152         CLEANUP_THREAD_INFO;
2153         *msg_id = kmp_i18n_str_MissingValCpuinfo;
2154         return -1;
2155 
2156         dup_field:
2157         CLEANUP_THREAD_INFO;
2158         *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2159         return -1;
2160     }
2161     *line = 0;
2162 
2163 # if KMP_MIC && REDUCE_TEAM_SIZE
2164     unsigned teamSize = 0;
2165 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2166 
2167     // check for num_records == __kmp_xproc ???
2168 
2169     //
2170     // If there's only one thread context to bind to, form an Address object
2171     // with depth 1 and return immediately (or, if affinity is off, set
2172     // address2os to NULL and return).
2173     //
2174     // If it is configured to omit the package level when there is only a
2175     // single package, the logic at the end of this routine won't work if
2176     // there is only a single thread - it would try to form an Address
2177     // object with depth 0.
2178     //
2179     KMP_ASSERT(num_avail > 0);
2180     KMP_ASSERT(num_avail <= num_records);
2181     if (num_avail == 1) {
2182         __kmp_ncores = 1;
2183         __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2184         if (__kmp_affinity_verbose) {
2185             if (! KMP_AFFINITY_CAPABLE()) {
2186                 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2187                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2188                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2189             }
2190             else {
2191                 char buf[KMP_AFFIN_MASK_PRINT_LEN];
2192                 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2193                   __kmp_affin_fullMask);
2194                 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2195                 if (__kmp_affinity_respect_mask) {
2196                     KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2197                 } else {
2198                     KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2199                 }
2200                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2201                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2202             }
2203             int index;
2204             kmp_str_buf_t buf;
2205             __kmp_str_buf_init(&buf);
2206             __kmp_str_buf_print(&buf, "1");
2207             for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2208                 __kmp_str_buf_print(&buf, " x 1");
2209             }
2210             KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2211             __kmp_str_buf_free(&buf);
2212         }
2213 
2214         if (__kmp_affinity_type == affinity_none) {
2215             CLEANUP_THREAD_INFO;
2216             return 0;
2217         }
2218 
2219         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2220         Address addr(1);
2221         addr.labels[0] = threadInfo[0][pkgIdIndex];
2222         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2223 
2224         if (__kmp_affinity_gran_levels < 0) {
2225             __kmp_affinity_gran_levels = 0;
2226         }
2227 
2228         if (__kmp_affinity_verbose) {
2229             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2230         }
2231 
2232         CLEANUP_THREAD_INFO;
2233         return 1;
2234     }
2235 
2236     //
2237     // Sort the threadInfo table by physical Id.
2238     //
2239     qsort(threadInfo, num_avail, sizeof(*threadInfo),
2240       __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2241 
2242     //
2243     // The table is now sorted by pkgId / coreId / threadId, but we really
2244     // don't know the radix of any of the fields.  pkgId's may be sparsely
2245     // assigned among the chips on a system.  Although coreId's are usually
2246     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2247     // [0..threadsPerCore-1], we don't want to make any such assumptions.
2248     //
2249     // For that matter, we don't know what coresPerPkg and threadsPerCore
2250     // (or the total # packages) are at this point - we want to determine
2251     // that now.  We only have an upper bound on the first two figures.
2252     //
2253     unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2254       * sizeof(unsigned));
2255     unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2256       * sizeof(unsigned));
2257     unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2258       * sizeof(unsigned));
2259     unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2260       * sizeof(unsigned));
2261 
2262     bool assign_thread_ids = false;
2263     unsigned threadIdCt;
2264     unsigned index;
2265 
2266     restart_radix_check:
2267     threadIdCt = 0;
2268 
2269     //
2270     // Initialize the counter arrays with data from threadInfo[0].
2271     //
2272     if (assign_thread_ids) {
2273         if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2274             threadInfo[0][threadIdIndex] = threadIdCt++;
2275         }
2276         else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2277             threadIdCt = threadInfo[0][threadIdIndex] + 1;
2278         }
2279     }
2280     for (index = 0; index <= maxIndex; index++) {
2281         counts[index] = 1;
2282         maxCt[index] = 1;
2283         totals[index] = 1;
2284         lastId[index] = threadInfo[0][index];;
2285     }
2286 
2287     //
2288     // Run through the rest of the OS procs.
2289     //
2290     for (i = 1; i < num_avail; i++) {
2291         //
2292         // Find the most significant index whose id differs
2293         // from the id for the previous OS proc.
2294         //
2295         for (index = maxIndex; index >= threadIdIndex; index--) {
2296             if (assign_thread_ids && (index == threadIdIndex)) {
2297                 //
2298                 // Auto-assign the thread id field if it wasn't specified.
2299                 //
2300                 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2301                     threadInfo[i][threadIdIndex] = threadIdCt++;
2302                 }
2303 
2304                 //
2305                 // Aparrently the thread id field was specified for some
2306                 // entries and not others.  Start the thread id counter
2307                 // off at the next higher thread id.
2308                 //
2309                 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2310                     threadIdCt = threadInfo[i][threadIdIndex] + 1;
2311                 }
2312             }
2313             if (threadInfo[i][index] != lastId[index]) {
2314                 //
2315                 // Run through all indices which are less significant,
2316                 // and reset the counts to 1.
2317                 //
2318                 // At all levels up to and including index, we need to
2319                 // increment the totals and record the last id.
2320                 //
2321                 unsigned index2;
2322                 for (index2 = threadIdIndex; index2 < index; index2++) {
2323                     totals[index2]++;
2324                     if (counts[index2] > maxCt[index2]) {
2325                         maxCt[index2] = counts[index2];
2326                     }
2327                     counts[index2] = 1;
2328                     lastId[index2] = threadInfo[i][index2];
2329                 }
2330                 counts[index]++;
2331                 totals[index]++;
2332                 lastId[index] = threadInfo[i][index];
2333 
2334                 if (assign_thread_ids && (index > threadIdIndex)) {
2335 
2336 # if KMP_MIC && REDUCE_TEAM_SIZE
2337                     //
2338                     // The default team size is the total #threads in the machine
2339                     // minus 1 thread for every core that has 3 or more threads.
2340                     //
2341                     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2342 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2343 
2344                     //
2345                     // Restart the thread counter, as we are on a new core.
2346                     //
2347                     threadIdCt = 0;
2348 
2349                     //
2350                     // Auto-assign the thread id field if it wasn't specified.
2351                     //
2352                     if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2353                         threadInfo[i][threadIdIndex] = threadIdCt++;
2354                     }
2355 
2356                     //
2357                     // Aparrently the thread id field was specified for some
2358                     // entries and not others.  Start the thread id counter
2359                     // off at the next higher thread id.
2360                     //
2361                     else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2362                         threadIdCt = threadInfo[i][threadIdIndex] + 1;
2363                     }
2364                 }
2365                 break;
2366             }
2367         }
2368         if (index < threadIdIndex) {
2369             //
2370             // If thread ids were specified, it is an error if they are not
2371             // unique.  Also, check that we waven't already restarted the
2372             // loop (to be safe - shouldn't need to).
2373             //
2374             if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2375               || assign_thread_ids) {
2376                 __kmp_free(lastId);
2377                 __kmp_free(totals);
2378                 __kmp_free(maxCt);
2379                 __kmp_free(counts);
2380                 CLEANUP_THREAD_INFO;
2381                 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2382                 return -1;
2383             }
2384 
2385             //
2386             // If the thread ids were not specified and we see entries
2387             // entries that are duplicates, start the loop over and
2388             // assign the thread ids manually.
2389             //
2390             assign_thread_ids = true;
2391             goto restart_radix_check;
2392         }
2393     }
2394 
2395 # if KMP_MIC && REDUCE_TEAM_SIZE
2396     //
2397     // The default team size is the total #threads in the machine
2398     // minus 1 thread for every core that has 3 or more threads.
2399     //
2400     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2401 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2402 
2403     for (index = threadIdIndex; index <= maxIndex; index++) {
2404         if (counts[index] > maxCt[index]) {
2405             maxCt[index] = counts[index];
2406         }
2407     }
2408 
2409     __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2410     nCoresPerPkg = maxCt[coreIdIndex];
2411     nPackages = totals[pkgIdIndex];
2412 
2413     //
2414     // Check to see if the machine topology is uniform
2415     //
2416     unsigned prod = totals[maxIndex];
2417     for (index = threadIdIndex; index < maxIndex; index++) {
2418        prod *= maxCt[index];
2419     }
2420     bool uniform = (prod == totals[threadIdIndex]);
2421 
2422     //
2423     // When affinity is off, this routine will still be called to set
2424     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
2425     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
2426     // correctly, and return now if affinity is not enabled.
2427     //
2428     __kmp_ncores = totals[coreIdIndex];
2429 
2430     if (__kmp_affinity_verbose) {
2431         if (! KMP_AFFINITY_CAPABLE()) {
2432                 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2433                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2434                 if (uniform) {
2435                     KMP_INFORM(Uniform, "KMP_AFFINITY");
2436                 } else {
2437                     KMP_INFORM(NonUniform, "KMP_AFFINITY");
2438                 }
2439         }
2440         else {
2441             char buf[KMP_AFFIN_MASK_PRINT_LEN];
2442             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, __kmp_affin_fullMask);
2443                 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2444                 if (__kmp_affinity_respect_mask) {
2445                     KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2446                 } else {
2447                     KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2448                 }
2449                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2450                 if (uniform) {
2451                     KMP_INFORM(Uniform, "KMP_AFFINITY");
2452                 } else {
2453                     KMP_INFORM(NonUniform, "KMP_AFFINITY");
2454                 }
2455         }
2456         kmp_str_buf_t buf;
2457         __kmp_str_buf_init(&buf);
2458 
2459         __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2460         for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2461             __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2462         }
2463         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str,  maxCt[coreIdIndex],
2464           maxCt[threadIdIndex], __kmp_ncores);
2465 
2466         __kmp_str_buf_free(&buf);
2467     }
2468 
2469 # if KMP_MIC && REDUCE_TEAM_SIZE
2470     //
2471     // Set the default team size.
2472     //
2473     if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2474         __kmp_dflt_team_nth = teamSize;
2475         KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2476           __kmp_dflt_team_nth));
2477     }
2478 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2479 
2480     if (__kmp_affinity_type == affinity_none) {
2481         __kmp_free(lastId);
2482         __kmp_free(totals);
2483         __kmp_free(maxCt);
2484         __kmp_free(counts);
2485         CLEANUP_THREAD_INFO;
2486         return 0;
2487     }
2488 
2489     //
2490     // Count the number of levels which have more nodes at that level than
2491     // at the parent's level (with there being an implicit root node of
2492     // the top level).  This is equivalent to saying that there is at least
2493     // one node at this level which has a sibling.  These levels are in the
2494     // map, and the package level is always in the map.
2495     //
2496     bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2497     int level = 0;
2498     for (index = threadIdIndex; index < maxIndex; index++) {
2499         KMP_ASSERT(totals[index] >= totals[index + 1]);
2500         inMap[index] = (totals[index] > totals[index + 1]);
2501     }
2502     inMap[maxIndex] = (totals[maxIndex] > 1);
2503     inMap[pkgIdIndex] = true;
2504 
2505     int depth = 0;
2506     for (index = threadIdIndex; index <= maxIndex; index++) {
2507         if (inMap[index]) {
2508             depth++;
2509         }
2510     }
2511     KMP_ASSERT(depth > 0);
2512 
2513     //
2514     // Construct the data structure that is to be returned.
2515     //
2516     *address2os = (AddrUnsPair*)
2517       __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2518     int pkgLevel = -1;
2519     int coreLevel = -1;
2520     int threadLevel = -1;
2521 
2522     for (i = 0; i < num_avail; ++i) {
2523         Address addr(depth);
2524         unsigned os = threadInfo[i][osIdIndex];
2525         int src_index;
2526         int dst_index = 0;
2527 
2528         for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2529             if (! inMap[src_index]) {
2530                 continue;
2531             }
2532             addr.labels[dst_index] = threadInfo[i][src_index];
2533             if (src_index == pkgIdIndex) {
2534                 pkgLevel = dst_index;
2535             }
2536             else if (src_index == coreIdIndex) {
2537                 coreLevel = dst_index;
2538             }
2539             else if (src_index == threadIdIndex) {
2540                 threadLevel = dst_index;
2541             }
2542             dst_index++;
2543         }
2544         (*address2os)[i] = AddrUnsPair(addr, os);
2545     }
2546 
2547     if (__kmp_affinity_gran_levels < 0) {
2548         //
2549         // Set the granularity level based on what levels are modeled
2550         // in the machine topology map.
2551         //
2552         unsigned src_index;
2553         __kmp_affinity_gran_levels = 0;
2554         for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2555             if (! inMap[src_index]) {
2556                 continue;
2557             }
2558             switch (src_index) {
2559                 case threadIdIndex:
2560                 if (__kmp_affinity_gran > affinity_gran_thread) {
2561                     __kmp_affinity_gran_levels++;
2562                 }
2563 
2564                 break;
2565                 case coreIdIndex:
2566                 if (__kmp_affinity_gran > affinity_gran_core) {
2567                     __kmp_affinity_gran_levels++;
2568                 }
2569                 break;
2570 
2571                 case pkgIdIndex:
2572                 if (__kmp_affinity_gran > affinity_gran_package) {
2573                     __kmp_affinity_gran_levels++;
2574                 }
2575                 break;
2576             }
2577         }
2578     }
2579 
2580     if (__kmp_affinity_verbose) {
2581         __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2582           coreLevel, threadLevel);
2583     }
2584 
2585     __kmp_free(inMap);
2586     __kmp_free(lastId);
2587     __kmp_free(totals);
2588     __kmp_free(maxCt);
2589     __kmp_free(counts);
2590     CLEANUP_THREAD_INFO;
2591     return depth;
2592 }
2593 
2594 
2595 //
2596 // Create and return a table of affinity masks, indexed by OS thread ID.
2597 // This routine handles OR'ing together all the affinity masks of threads
2598 // that are sufficiently close, if granularity > fine.
2599 //
2600 static kmp_affin_mask_t *
2601 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2602   AddrUnsPair *address2os, unsigned numAddrs)
2603 {
2604     //
2605     // First form a table of affinity masks in order of OS thread id.
2606     //
2607     unsigned depth;
2608     unsigned maxOsId;
2609     unsigned i;
2610 
2611     KMP_ASSERT(numAddrs > 0);
2612     depth = address2os[0].first.depth;
2613 
2614     maxOsId = 0;
2615     for (i = 0; i < numAddrs; i++) {
2616         unsigned osId = address2os[i].second;
2617         if (osId > maxOsId) {
2618             maxOsId = osId;
2619         }
2620     }
2621     kmp_affin_mask_t *osId2Mask;
2622     KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId+1));
2623 
2624     //
2625     // Sort the address2os table according to physical order.  Doing so
2626     // will put all threads on the same core/package/node in consecutive
2627     // locations.
2628     //
2629     qsort(address2os, numAddrs, sizeof(*address2os),
2630       __kmp_affinity_cmp_Address_labels);
2631 
2632     KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2633     if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2634         KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY",  __kmp_affinity_gran_levels);
2635     }
2636     if (__kmp_affinity_gran_levels >= (int)depth) {
2637         if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2638           && (__kmp_affinity_type != affinity_none))) {
2639             KMP_WARNING(AffThreadsMayMigrate);
2640         }
2641     }
2642 
2643     //
2644     // Run through the table, forming the masks for all threads on each
2645     // core.  Threads on the same core will have identical "Address"
2646     // objects, not considering the last level, which must be the thread
2647     // id.  All threads on a core will appear consecutively.
2648     //
2649     unsigned unique = 0;
2650     unsigned j = 0;                             // index of 1st thread on core
2651     unsigned leader = 0;
2652     Address *leaderAddr = &(address2os[0].first);
2653     kmp_affin_mask_t *sum;
2654     KMP_CPU_ALLOC_ON_STACK(sum);
2655     KMP_CPU_ZERO(sum);
2656     KMP_CPU_SET(address2os[0].second, sum);
2657     for (i = 1; i < numAddrs; i++) {
2658         //
2659         // If this thread is sufficiently close to the leader (within the
2660         // granularity setting), then set the bit for this os thread in the
2661         // affinity mask for this group, and go on to the next thread.
2662         //
2663         if (leaderAddr->isClose(address2os[i].first,
2664           __kmp_affinity_gran_levels)) {
2665             KMP_CPU_SET(address2os[i].second, sum);
2666             continue;
2667         }
2668 
2669         //
2670         // For every thread in this group, copy the mask to the thread's
2671         // entry in the osId2Mask table.  Mark the first address as a
2672         // leader.
2673         //
2674         for (; j < i; j++) {
2675             unsigned osId = address2os[j].second;
2676             KMP_DEBUG_ASSERT(osId <= maxOsId);
2677             kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2678             KMP_CPU_COPY(mask, sum);
2679             address2os[j].first.leader = (j == leader);
2680         }
2681         unique++;
2682 
2683         //
2684         // Start a new mask.
2685         //
2686         leader = i;
2687         leaderAddr = &(address2os[i].first);
2688         KMP_CPU_ZERO(sum);
2689         KMP_CPU_SET(address2os[i].second, sum);
2690     }
2691 
2692     //
2693     // For every thread in last group, copy the mask to the thread's
2694     // entry in the osId2Mask table.
2695     //
2696     for (; j < i; j++) {
2697         unsigned osId = address2os[j].second;
2698         KMP_DEBUG_ASSERT(osId <= maxOsId);
2699         kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2700         KMP_CPU_COPY(mask, sum);
2701         address2os[j].first.leader = (j == leader);
2702     }
2703     unique++;
2704     KMP_CPU_FREE_FROM_STACK(sum);
2705 
2706     *maxIndex = maxOsId;
2707     *numUnique = unique;
2708     return osId2Mask;
2709 }
2710 
2711 
2712 //
2713 // Stuff for the affinity proclist parsers.  It's easier to declare these vars
2714 // as file-static than to try and pass them through the calling sequence of
2715 // the recursive-descent OMP_PLACES parser.
2716 //
2717 static kmp_affin_mask_t *newMasks;
2718 static int numNewMasks;
2719 static int nextNewMask;
2720 
2721 #define ADD_MASK(_mask) \
2722     {                                                                   \
2723         if (nextNewMask >= numNewMasks) {                               \
2724             int i;                                                      \
2725             numNewMasks *= 2;                                           \
2726             kmp_affin_mask_t* temp;                                     \
2727             KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks);            \
2728             for(i=0;i<numNewMasks/2;i++) {                              \
2729                 kmp_affin_mask_t* src  = KMP_CPU_INDEX(newMasks, i);    \
2730                 kmp_affin_mask_t* dest = KMP_CPU_INDEX(temp, i);        \
2731                 KMP_CPU_COPY(dest, src);                                \
2732             }                                                           \
2733             KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks/2);       \
2734             newMasks = temp;                                            \
2735         }                                                               \
2736         KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));    \
2737         nextNewMask++;                                                  \
2738     }
2739 
2740 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2741     {                                                                   \
2742         if (((_osId) > _maxOsId) ||                                     \
2743           (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
2744             if (__kmp_affinity_verbose || (__kmp_affinity_warnings      \
2745               && (__kmp_affinity_type != affinity_none))) {             \
2746                 KMP_WARNING(AffIgnoreInvalidProcID, _osId);             \
2747             }                                                           \
2748         }                                                               \
2749         else {                                                          \
2750             ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));               \
2751         }                                                               \
2752     }
2753 
2754 
2755 //
2756 // Re-parse the proclist (for the explicit affinity type), and form the list
2757 // of affinity newMasks indexed by gtid.
2758 //
2759 static void
2760 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2761   unsigned int *out_numMasks, const char *proclist,
2762   kmp_affin_mask_t *osId2Mask, int maxOsId)
2763 {
2764     int i;
2765     const char *scan = proclist;
2766     const char *next = proclist;
2767 
2768     //
2769     // We use malloc() for the temporary mask vector,
2770     // so that we can use realloc() to extend it.
2771     //
2772     numNewMasks = 2;
2773     KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
2774     nextNewMask = 0;
2775     kmp_affin_mask_t *sumMask;
2776     KMP_CPU_ALLOC(sumMask);
2777     int setSize = 0;
2778 
2779     for (;;) {
2780         int start, end, stride;
2781 
2782         SKIP_WS(scan);
2783         next = scan;
2784         if (*next == '\0') {
2785             break;
2786         }
2787 
2788         if (*next == '{') {
2789             int num;
2790             setSize = 0;
2791             next++;     // skip '{'
2792             SKIP_WS(next);
2793             scan = next;
2794 
2795             //
2796             // Read the first integer in the set.
2797             //
2798             KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2799               "bad proclist");
2800             SKIP_DIGITS(next);
2801             num = __kmp_str_to_int(scan, *next);
2802             KMP_ASSERT2(num >= 0, "bad explicit proc list");
2803 
2804             //
2805             // Copy the mask for that osId to the sum (union) mask.
2806             //
2807             if ((num > maxOsId) ||
2808               (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2809                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2810                   && (__kmp_affinity_type != affinity_none))) {
2811                     KMP_WARNING(AffIgnoreInvalidProcID, num);
2812                 }
2813                 KMP_CPU_ZERO(sumMask);
2814             }
2815             else {
2816                 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2817                 setSize = 1;
2818             }
2819 
2820             for (;;) {
2821                 //
2822                 // Check for end of set.
2823                 //
2824                 SKIP_WS(next);
2825                 if (*next == '}') {
2826                     next++;     // skip '}'
2827                     break;
2828                 }
2829 
2830                 //
2831                 // Skip optional comma.
2832                 //
2833                 if (*next == ',') {
2834                     next++;
2835                 }
2836                 SKIP_WS(next);
2837 
2838                 //
2839                 // Read the next integer in the set.
2840                 //
2841                 scan = next;
2842                 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2843                   "bad explicit proc list");
2844 
2845                 SKIP_DIGITS(next);
2846                 num = __kmp_str_to_int(scan, *next);
2847                 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2848 
2849                 //
2850                 // Add the mask for that osId to the sum mask.
2851                 //
2852                 if ((num > maxOsId) ||
2853                   (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2854                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2855                       && (__kmp_affinity_type != affinity_none))) {
2856                         KMP_WARNING(AffIgnoreInvalidProcID, num);
2857                     }
2858                 }
2859                 else {
2860                     KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2861                     setSize++;
2862                 }
2863             }
2864             if (setSize > 0) {
2865                 ADD_MASK(sumMask);
2866             }
2867 
2868             SKIP_WS(next);
2869             if (*next == ',') {
2870                 next++;
2871             }
2872             scan = next;
2873             continue;
2874         }
2875 
2876         //
2877         // Read the first integer.
2878         //
2879         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2880         SKIP_DIGITS(next);
2881         start = __kmp_str_to_int(scan, *next);
2882         KMP_ASSERT2(start >= 0, "bad explicit proc list");
2883         SKIP_WS(next);
2884 
2885         //
2886         // If this isn't a range, then add a mask to the list and go on.
2887         //
2888         if (*next != '-') {
2889             ADD_MASK_OSID(start, osId2Mask, maxOsId);
2890 
2891             //
2892             // Skip optional comma.
2893             //
2894             if (*next == ',') {
2895                 next++;
2896             }
2897             scan = next;
2898             continue;
2899         }
2900 
2901         //
2902         // This is a range.  Skip over the '-' and read in the 2nd int.
2903         //
2904         next++;         // skip '-'
2905         SKIP_WS(next);
2906         scan = next;
2907         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2908         SKIP_DIGITS(next);
2909         end = __kmp_str_to_int(scan, *next);
2910         KMP_ASSERT2(end >= 0, "bad explicit proc list");
2911 
2912         //
2913         // Check for a stride parameter
2914         //
2915         stride = 1;
2916         SKIP_WS(next);
2917         if (*next == ':') {
2918             //
2919             // A stride is specified.  Skip over the ':" and read the 3rd int.
2920             //
2921             int sign = +1;
2922             next++;         // skip ':'
2923             SKIP_WS(next);
2924             scan = next;
2925             if (*next == '-') {
2926                 sign = -1;
2927                 next++;
2928                 SKIP_WS(next);
2929                 scan = next;
2930             }
2931             KMP_ASSERT2((*next >=  '0') && (*next <= '9'),
2932               "bad explicit proc list");
2933             SKIP_DIGITS(next);
2934             stride = __kmp_str_to_int(scan, *next);
2935             KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2936             stride *= sign;
2937         }
2938 
2939         //
2940         // Do some range checks.
2941         //
2942         KMP_ASSERT2(stride != 0, "bad explicit proc list");
2943         if (stride > 0) {
2944             KMP_ASSERT2(start <= end, "bad explicit proc list");
2945         }
2946         else {
2947             KMP_ASSERT2(start >= end, "bad explicit proc list");
2948         }
2949         KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2950 
2951         //
2952         // Add the mask for each OS proc # to the list.
2953         //
2954         if (stride > 0) {
2955             do {
2956                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2957                 start += stride;
2958             } while (start <= end);
2959         }
2960         else {
2961             do {
2962                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2963                 start += stride;
2964             } while (start >= end);
2965         }
2966 
2967         //
2968         // Skip optional comma.
2969         //
2970         SKIP_WS(next);
2971         if (*next == ',') {
2972             next++;
2973         }
2974         scan = next;
2975     }
2976 
2977     *out_numMasks = nextNewMask;
2978     if (nextNewMask == 0) {
2979         *out_masks = NULL;
2980         KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
2981         return;
2982     }
2983     KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
2984     for(i = 0; i < nextNewMask; i++) {
2985         kmp_affin_mask_t* src  = KMP_CPU_INDEX(newMasks, i);
2986         kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i);
2987         KMP_CPU_COPY(dest, src);
2988     }
2989     KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
2990     KMP_CPU_FREE(sumMask);
2991 }
2992 
2993 
2994 # if OMP_40_ENABLED
2995 
2996 /*-----------------------------------------------------------------------------
2997 
2998 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
2999 places.  Again, Here is the grammar:
3000 
3001 place_list := place
3002 place_list := place , place_list
3003 place := num
3004 place := place : num
3005 place := place : num : signed
3006 place := { subplacelist }
3007 place := ! place                  // (lowest priority)
3008 subplace_list := subplace
3009 subplace_list := subplace , subplace_list
3010 subplace := num
3011 subplace := num : num
3012 subplace := num : num : signed
3013 signed := num
3014 signed := + signed
3015 signed := - signed
3016 
3017 -----------------------------------------------------------------------------*/
3018 
3019 static void
3020 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
3021   int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3022 {
3023     const char *next;
3024 
3025     for (;;) {
3026         int start, count, stride, i;
3027 
3028         //
3029         // Read in the starting proc id
3030         //
3031         SKIP_WS(*scan);
3032         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3033           "bad explicit places list");
3034         next = *scan;
3035         SKIP_DIGITS(next);
3036         start = __kmp_str_to_int(*scan, *next);
3037         KMP_ASSERT(start >= 0);
3038         *scan = next;
3039 
3040         //
3041         // valid follow sets are ',' ':' and '}'
3042         //
3043         SKIP_WS(*scan);
3044         if (**scan == '}' || **scan == ',') {
3045             if ((start > maxOsId) ||
3046               (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3047                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3048                   && (__kmp_affinity_type != affinity_none))) {
3049                     KMP_WARNING(AffIgnoreInvalidProcID, start);
3050                 }
3051             }
3052             else {
3053                 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3054                 (*setSize)++;
3055             }
3056             if (**scan == '}') {
3057                 break;
3058             }
3059             (*scan)++;  // skip ','
3060             continue;
3061         }
3062         KMP_ASSERT2(**scan == ':', "bad explicit places list");
3063         (*scan)++;      // skip ':'
3064 
3065         //
3066         // Read count parameter
3067         //
3068         SKIP_WS(*scan);
3069         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3070           "bad explicit places list");
3071         next = *scan;
3072         SKIP_DIGITS(next);
3073         count = __kmp_str_to_int(*scan, *next);
3074         KMP_ASSERT(count >= 0);
3075         *scan = next;
3076 
3077         //
3078         // valid follow sets are ',' ':' and '}'
3079         //
3080         SKIP_WS(*scan);
3081         if (**scan == '}' || **scan == ',') {
3082             for (i = 0; i < count; i++) {
3083                 if ((start > maxOsId) ||
3084                   (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3085                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3086                       && (__kmp_affinity_type != affinity_none))) {
3087                         KMP_WARNING(AffIgnoreInvalidProcID, start);
3088                     }
3089                     break;  // don't proliferate warnings for large count
3090                 }
3091                 else {
3092                     KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3093                     start++;
3094                     (*setSize)++;
3095                 }
3096             }
3097             if (**scan == '}') {
3098                 break;
3099             }
3100             (*scan)++;  // skip ','
3101             continue;
3102         }
3103         KMP_ASSERT2(**scan == ':', "bad explicit places list");
3104         (*scan)++;      // skip ':'
3105 
3106         //
3107         // Read stride parameter
3108         //
3109         int sign = +1;
3110         for (;;) {
3111             SKIP_WS(*scan);
3112             if (**scan == '+') {
3113                 (*scan)++; // skip '+'
3114                 continue;
3115             }
3116             if (**scan == '-') {
3117                 sign *= -1;
3118                 (*scan)++; // skip '-'
3119                 continue;
3120             }
3121             break;
3122         }
3123         SKIP_WS(*scan);
3124         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3125           "bad explicit places list");
3126         next = *scan;
3127         SKIP_DIGITS(next);
3128         stride = __kmp_str_to_int(*scan, *next);
3129         KMP_ASSERT(stride >= 0);
3130         *scan = next;
3131         stride *= sign;
3132 
3133         //
3134         // valid follow sets are ',' and '}'
3135         //
3136         SKIP_WS(*scan);
3137         if (**scan == '}' || **scan == ',') {
3138             for (i = 0; i < count; i++) {
3139                 if ((start > maxOsId) ||
3140                   (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3141                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3142                       && (__kmp_affinity_type != affinity_none))) {
3143                         KMP_WARNING(AffIgnoreInvalidProcID, start);
3144                     }
3145                     break;  // don't proliferate warnings for large count
3146                 }
3147                 else {
3148                     KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3149                     start += stride;
3150                     (*setSize)++;
3151                 }
3152             }
3153             if (**scan == '}') {
3154                 break;
3155             }
3156             (*scan)++;  // skip ','
3157             continue;
3158         }
3159 
3160         KMP_ASSERT2(0, "bad explicit places list");
3161     }
3162 }
3163 
3164 
3165 static void
3166 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3167   int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3168 {
3169     const char *next;
3170 
3171     //
3172     // valid follow sets are '{' '!' and num
3173     //
3174     SKIP_WS(*scan);
3175     if (**scan == '{') {
3176         (*scan)++;      // skip '{'
3177         __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
3178           setSize);
3179         KMP_ASSERT2(**scan == '}', "bad explicit places list");
3180         (*scan)++;      // skip '}'
3181     }
3182     else if (**scan == '!') {
3183         (*scan)++;      // skip '!'
3184         __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3185         KMP_CPU_COMPLEMENT(maxOsId, tempMask);
3186     }
3187     else if ((**scan >= '0') && (**scan <= '9')) {
3188         next = *scan;
3189         SKIP_DIGITS(next);
3190         int num = __kmp_str_to_int(*scan, *next);
3191         KMP_ASSERT(num >= 0);
3192         if ((num > maxOsId) ||
3193           (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3194             if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3195               && (__kmp_affinity_type != affinity_none))) {
3196                 KMP_WARNING(AffIgnoreInvalidProcID, num);
3197             }
3198         }
3199         else {
3200             KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3201             (*setSize)++;
3202         }
3203         *scan = next;  // skip num
3204     }
3205     else {
3206         KMP_ASSERT2(0, "bad explicit places list");
3207     }
3208 }
3209 
3210 
3211 //static void
3212 void
3213 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3214   unsigned int *out_numMasks, const char *placelist,
3215   kmp_affin_mask_t *osId2Mask, int maxOsId)
3216 {
3217     int i,j,count,stride,sign;
3218     const char *scan = placelist;
3219     const char *next = placelist;
3220 
3221     numNewMasks = 2;
3222     KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
3223     nextNewMask = 0;
3224 
3225     // tempMask is modified based on the previous or initial
3226     //   place to form the current place
3227     // previousMask contains the previous place
3228     kmp_affin_mask_t *tempMask;
3229     kmp_affin_mask_t *previousMask;
3230     KMP_CPU_ALLOC(tempMask);
3231     KMP_CPU_ZERO(tempMask);
3232     KMP_CPU_ALLOC(previousMask);
3233     KMP_CPU_ZERO(previousMask);
3234     int setSize = 0;
3235 
3236     for (;;) {
3237         __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3238 
3239         //
3240         // valid follow sets are ',' ':' and EOL
3241         //
3242         SKIP_WS(scan);
3243         if (*scan == '\0' || *scan == ',') {
3244             if (setSize > 0) {
3245                 ADD_MASK(tempMask);
3246             }
3247             KMP_CPU_ZERO(tempMask);
3248             setSize = 0;
3249             if (*scan == '\0') {
3250                 break;
3251             }
3252             scan++;     // skip ','
3253             continue;
3254         }
3255 
3256         KMP_ASSERT2(*scan == ':', "bad explicit places list");
3257         scan++;         // skip ':'
3258 
3259         //
3260         // Read count parameter
3261         //
3262         SKIP_WS(scan);
3263         KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3264           "bad explicit places list");
3265         next = scan;
3266         SKIP_DIGITS(next);
3267         count = __kmp_str_to_int(scan, *next);
3268         KMP_ASSERT(count >= 0);
3269         scan = next;
3270 
3271         //
3272         // valid follow sets are ',' ':' and EOL
3273         //
3274         SKIP_WS(scan);
3275         if (*scan == '\0' || *scan == ',') {
3276             stride = +1;
3277         }
3278         else {
3279             KMP_ASSERT2(*scan == ':', "bad explicit places list");
3280             scan++;         // skip ':'
3281 
3282             //
3283             // Read stride parameter
3284             //
3285             sign = +1;
3286             for (;;) {
3287                 SKIP_WS(scan);
3288                 if (*scan == '+') {
3289                     scan++; // skip '+'
3290                     continue;
3291                 }
3292                 if (*scan == '-') {
3293                     sign *= -1;
3294                     scan++; // skip '-'
3295                     continue;
3296                 }
3297                 break;
3298             }
3299             SKIP_WS(scan);
3300             KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3301               "bad explicit places list");
3302             next = scan;
3303             SKIP_DIGITS(next);
3304             stride = __kmp_str_to_int(scan, *next);
3305             KMP_DEBUG_ASSERT(stride >= 0);
3306             scan = next;
3307             stride *= sign;
3308         }
3309 
3310         // Add places determined by initial_place : count : stride
3311         for (i = 0; i < count; i++) {
3312             if (setSize == 0) {
3313                 break;
3314             }
3315             // Add the current place, then build the next place (tempMask) from that
3316             KMP_CPU_COPY(previousMask, tempMask);
3317             ADD_MASK(previousMask);
3318             KMP_CPU_ZERO(tempMask);
3319             setSize = 0;
3320             KMP_CPU_SET_ITERATE(j, previousMask) {
3321                 if (! KMP_CPU_ISSET(j, previousMask)) {
3322                     continue;
3323                 }
3324                 if ((j+stride > maxOsId) || (j+stride < 0) ||
3325                   (! KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
3326                   (! KMP_CPU_ISSET(j+stride, KMP_CPU_INDEX(osId2Mask, j+stride)))) {
3327                     if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3328                       && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
3329                         KMP_WARNING(AffIgnoreInvalidProcID, j+stride);
3330                     }
3331                     continue;
3332                 }
3333                 KMP_CPU_SET(j+stride, tempMask);
3334                 setSize++;
3335             }
3336         }
3337         KMP_CPU_ZERO(tempMask);
3338         setSize = 0;
3339 
3340         //
3341         // valid follow sets are ',' and EOL
3342         //
3343         SKIP_WS(scan);
3344         if (*scan == '\0') {
3345             break;
3346         }
3347         if (*scan == ',') {
3348             scan++;     // skip ','
3349             continue;
3350         }
3351 
3352         KMP_ASSERT2(0, "bad explicit places list");
3353     }
3354 
3355     *out_numMasks = nextNewMask;
3356     if (nextNewMask == 0) {
3357         *out_masks = NULL;
3358         KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3359         return;
3360     }
3361     KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3362     KMP_CPU_FREE(tempMask);
3363     KMP_CPU_FREE(previousMask);
3364     for(i = 0; i < nextNewMask; i++) {
3365         kmp_affin_mask_t* src  = KMP_CPU_INDEX(newMasks, i);
3366         kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i);
3367         KMP_CPU_COPY(dest, src);
3368     }
3369     KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3370 }
3371 
3372 # endif /* OMP_40_ENABLED */
3373 
3374 #undef ADD_MASK
3375 #undef ADD_MASK_OSID
3376 
3377 static void
3378 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3379 {
3380     if (__kmp_place_num_sockets == 0 &&
3381         __kmp_place_num_cores == 0 &&
3382         __kmp_place_num_threads_per_core == 0 )
3383         return;   // no topology limiting actions requested, exit
3384     if (__kmp_place_num_sockets == 0)
3385         __kmp_place_num_sockets = nPackages;    // use all available sockets
3386     if (__kmp_place_num_cores == 0)
3387         __kmp_place_num_cores = nCoresPerPkg;   // use all available cores
3388     if (__kmp_place_num_threads_per_core == 0 ||
3389         __kmp_place_num_threads_per_core > __kmp_nThreadsPerCore)
3390         __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts
3391 
3392     if ( !__kmp_affinity_uniform_topology() ) {
3393         KMP_WARNING( AffThrPlaceNonUniform );
3394         return; // don't support non-uniform topology
3395     }
3396     if ( depth != 3 ) {
3397         KMP_WARNING( AffThrPlaceNonThreeLevel );
3398         return; // don't support not-3-level topology
3399     }
3400     if (__kmp_place_socket_offset + __kmp_place_num_sockets > nPackages) {
3401         KMP_WARNING(AffThrPlaceManySockets);
3402         return;
3403     }
3404     if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
3405         KMP_WARNING( AffThrPlaceManyCores );
3406         return;
3407     }
3408 
3409     AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3410         __kmp_place_num_sockets * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3411 
3412     int i, j, k, n_old = 0, n_new = 0;
3413     for (i = 0; i < nPackages; ++i)
3414         if (i < __kmp_place_socket_offset ||
3415             i >= __kmp_place_socket_offset + __kmp_place_num_sockets)
3416             n_old += nCoresPerPkg * __kmp_nThreadsPerCore; // skip not-requested socket
3417         else
3418             for (j = 0; j < nCoresPerPkg; ++j) // walk through requested socket
3419                 if (j < __kmp_place_core_offset ||
3420                     j >= __kmp_place_core_offset + __kmp_place_num_cores)
3421                     n_old += __kmp_nThreadsPerCore; // skip not-requested core
3422                 else
3423                     for (k = 0; k < __kmp_nThreadsPerCore; ++k) { // walk through requested core
3424                         if (k < __kmp_place_num_threads_per_core) {
3425                             newAddr[n_new] = (*pAddr)[n_old]; // collect requested thread's data
3426                             n_new++;
3427                         }
3428                         n_old++;
3429                     }
3430     KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore);
3431     KMP_DEBUG_ASSERT(n_new == __kmp_place_num_sockets * __kmp_place_num_cores *
3432                      __kmp_place_num_threads_per_core);
3433 
3434     nPackages = __kmp_place_num_sockets;                      // correct nPackages
3435     nCoresPerPkg = __kmp_place_num_cores;                     // correct nCoresPerPkg
3436     __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3437     __kmp_avail_proc = n_new;                                 // correct avail_proc
3438     __kmp_ncores = nPackages * __kmp_place_num_cores;         // correct ncores
3439 
3440     __kmp_free( *pAddr );
3441     *pAddr = newAddr;      // replace old topology with new one
3442 }
3443 
3444 
3445 static AddrUnsPair *address2os = NULL;
3446 static int           * procarr = NULL;
3447 static int     __kmp_aff_depth = 0;
3448 
3449 static void
3450 __kmp_aux_affinity_initialize(void)
3451 {
3452     if (__kmp_affinity_masks != NULL) {
3453         KMP_ASSERT(__kmp_affin_fullMask != NULL);
3454         return;
3455     }
3456 
3457     //
3458     // Create the "full" mask - this defines all of the processors that we
3459     // consider to be in the machine model.  If respect is set, then it is
3460     // the initialization thread's affinity mask.  Otherwise, it is all
3461     // processors that we know about on the machine.
3462     //
3463     if (__kmp_affin_fullMask == NULL) {
3464         KMP_CPU_ALLOC(__kmp_affin_fullMask);
3465     }
3466     if (KMP_AFFINITY_CAPABLE()) {
3467         if (__kmp_affinity_respect_mask) {
3468             __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
3469 
3470             //
3471             // Count the number of available processors.
3472             //
3473             unsigned i;
3474             __kmp_avail_proc = 0;
3475             KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
3476                 if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
3477                     continue;
3478                 }
3479                 __kmp_avail_proc++;
3480             }
3481             if (__kmp_avail_proc > __kmp_xproc) {
3482                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3483                   && (__kmp_affinity_type != affinity_none))) {
3484                     KMP_WARNING(ErrorInitializeAffinity);
3485                 }
3486                 __kmp_affinity_type = affinity_none;
3487                 KMP_AFFINITY_DISABLE();
3488                 return;
3489             }
3490         }
3491         else {
3492             __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
3493             __kmp_avail_proc = __kmp_xproc;
3494         }
3495     }
3496 
3497     int depth = -1;
3498     kmp_i18n_id_t msg_id = kmp_i18n_null;
3499 
3500     //
3501     // For backward compatibility, setting KMP_CPUINFO_FILE =>
3502     // KMP_TOPOLOGY_METHOD=cpuinfo
3503     //
3504     if ((__kmp_cpuinfo_file != NULL) &&
3505       (__kmp_affinity_top_method == affinity_top_method_all)) {
3506         __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3507     }
3508 
3509     if (__kmp_affinity_top_method == affinity_top_method_all) {
3510         //
3511         // In the default code path, errors are not fatal - we just try using
3512         // another method.  We only emit a warning message if affinity is on,
3513         // or the verbose flag is set, an the nowarnings flag was not set.
3514         //
3515         const char *file_name = NULL;
3516         int line = 0;
3517 # if KMP_USE_HWLOC
3518         if (depth < 0) {
3519             if (__kmp_affinity_verbose) {
3520                 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
3521             }
3522             if(!__kmp_hwloc_error) {
3523                 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
3524                 if (depth == 0) {
3525                     KMP_ASSERT(__kmp_affinity_type == affinity_none);
3526                     KMP_ASSERT(address2os == NULL);
3527                     return;
3528                 } else if(depth < 0 && __kmp_affinity_verbose) {
3529                     KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
3530                 }
3531             } else if(__kmp_affinity_verbose) {
3532                 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
3533             }
3534         }
3535 # endif
3536 
3537 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3538 
3539         if (depth < 0) {
3540             if (__kmp_affinity_verbose) {
3541                 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3542             }
3543 
3544             file_name = NULL;
3545             depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3546             if (depth == 0) {
3547                 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3548                 KMP_ASSERT(address2os == NULL);
3549                 return;
3550             }
3551 
3552             if (depth < 0) {
3553                 if (__kmp_affinity_verbose) {
3554                     if (msg_id != kmp_i18n_null) {
3555                         KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3556                           KMP_I18N_STR(DecodingLegacyAPIC));
3557                     }
3558                     else {
3559                         KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
3560                     }
3561                 }
3562 
3563                 file_name = NULL;
3564                 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3565                 if (depth == 0) {
3566                     KMP_ASSERT(__kmp_affinity_type == affinity_none);
3567                     KMP_ASSERT(address2os == NULL);
3568                     return;
3569                 }
3570             }
3571         }
3572 
3573 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3574 
3575 # if KMP_OS_LINUX
3576 
3577         if (depth < 0) {
3578             if (__kmp_affinity_verbose) {
3579                 if (msg_id != kmp_i18n_null) {
3580                     KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3581                 }
3582                 else {
3583                     KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3584                 }
3585             }
3586 
3587             FILE *f = fopen("/proc/cpuinfo", "r");
3588             if (f == NULL) {
3589                 msg_id = kmp_i18n_str_CantOpenCpuinfo;
3590             }
3591             else {
3592                 file_name = "/proc/cpuinfo";
3593                 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3594                 fclose(f);
3595                 if (depth == 0) {
3596                     KMP_ASSERT(__kmp_affinity_type == affinity_none);
3597                     KMP_ASSERT(address2os == NULL);
3598                     return;
3599                 }
3600             }
3601         }
3602 
3603 # endif /* KMP_OS_LINUX */
3604 
3605 # if KMP_GROUP_AFFINITY
3606 
3607         if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
3608             if (__kmp_affinity_verbose) {
3609                 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3610             }
3611 
3612             depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3613             KMP_ASSERT(depth != 0);
3614         }
3615 
3616 # endif /* KMP_GROUP_AFFINITY */
3617 
3618         if (depth < 0) {
3619             if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
3620                 if (file_name == NULL) {
3621                     KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
3622                 }
3623                 else if (line == 0) {
3624                     KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
3625                 }
3626                 else {
3627                     KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
3628                 }
3629             }
3630             // FIXME - print msg if msg_id = kmp_i18n_null ???
3631 
3632             file_name = "";
3633             depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3634             if (depth == 0) {
3635                 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3636                 KMP_ASSERT(address2os == NULL);
3637                 return;
3638             }
3639             KMP_ASSERT(depth > 0);
3640             KMP_ASSERT(address2os != NULL);
3641         }
3642     }
3643 
3644     //
3645     // If the user has specified that a paricular topology discovery method
3646     // is to be used, then we abort if that method fails.  The exception is
3647     // group affinity, which might have been implicitly set.
3648     //
3649 
3650 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3651 
3652     else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3653         if (__kmp_affinity_verbose) {
3654             KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3655               KMP_I18N_STR(Decodingx2APIC));
3656         }
3657 
3658         depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3659         if (depth == 0) {
3660             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3661             KMP_ASSERT(address2os == NULL);
3662             return;
3663         }
3664         if (depth < 0) {
3665             KMP_ASSERT(msg_id != kmp_i18n_null);
3666             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3667         }
3668     }
3669     else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3670         if (__kmp_affinity_verbose) {
3671             KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3672               KMP_I18N_STR(DecodingLegacyAPIC));
3673         }
3674 
3675         depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3676         if (depth == 0) {
3677             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3678             KMP_ASSERT(address2os == NULL);
3679             return;
3680         }
3681         if (depth < 0) {
3682             KMP_ASSERT(msg_id != kmp_i18n_null);
3683             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3684         }
3685     }
3686 
3687 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3688 
3689     else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3690         const char *filename;
3691         if (__kmp_cpuinfo_file != NULL) {
3692             filename = __kmp_cpuinfo_file;
3693         }
3694         else {
3695             filename = "/proc/cpuinfo";
3696         }
3697 
3698         if (__kmp_affinity_verbose) {
3699             KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
3700         }
3701 
3702         FILE *f = fopen(filename, "r");
3703         if (f == NULL) {
3704             int code = errno;
3705             if (__kmp_cpuinfo_file != NULL) {
3706                 __kmp_msg(
3707                     kmp_ms_fatal,
3708                     KMP_MSG(CantOpenFileForReading, filename),
3709                     KMP_ERR(code),
3710                     KMP_HNT(NameComesFrom_CPUINFO_FILE),
3711                     __kmp_msg_null
3712                 );
3713             }
3714             else {
3715                 __kmp_msg(
3716                     kmp_ms_fatal,
3717                     KMP_MSG(CantOpenFileForReading, filename),
3718                     KMP_ERR(code),
3719                     __kmp_msg_null
3720                 );
3721             }
3722         }
3723         int line = 0;
3724         depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3725         fclose(f);
3726         if (depth < 0) {
3727             KMP_ASSERT(msg_id != kmp_i18n_null);
3728             if (line > 0) {
3729                 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
3730             }
3731             else {
3732                 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3733             }
3734         }
3735         if (__kmp_affinity_type == affinity_none) {
3736             KMP_ASSERT(depth == 0);
3737             KMP_ASSERT(address2os == NULL);
3738             return;
3739         }
3740     }
3741 
3742 # if KMP_GROUP_AFFINITY
3743 
3744     else if (__kmp_affinity_top_method == affinity_top_method_group) {
3745         if (__kmp_affinity_verbose) {
3746             KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3747         }
3748 
3749         depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3750         KMP_ASSERT(depth != 0);
3751         if (depth < 0) {
3752             KMP_ASSERT(msg_id != kmp_i18n_null);
3753             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3754         }
3755     }
3756 
3757 # endif /* KMP_GROUP_AFFINITY */
3758 
3759     else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3760         if (__kmp_affinity_verbose) {
3761             KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
3762         }
3763 
3764         depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3765         if (depth == 0) {
3766             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3767             KMP_ASSERT(address2os == NULL);
3768             return;
3769         }
3770         // should not fail
3771         KMP_ASSERT(depth > 0);
3772         KMP_ASSERT(address2os != NULL);
3773     }
3774 
3775 # if KMP_USE_HWLOC
3776     else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
3777         if (__kmp_affinity_verbose) {
3778             KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
3779         }
3780         depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
3781         if (depth == 0) {
3782             KMP_ASSERT(__kmp_affinity_type == affinity_none);
3783             KMP_ASSERT(address2os == NULL);
3784             return;
3785         }
3786     }
3787 # endif // KMP_USE_HWLOC
3788 
3789     if (address2os == NULL) {
3790         if (KMP_AFFINITY_CAPABLE()
3791           && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3792           && (__kmp_affinity_type != affinity_none)))) {
3793             KMP_WARNING(ErrorInitializeAffinity);
3794         }
3795         __kmp_affinity_type = affinity_none;
3796         KMP_AFFINITY_DISABLE();
3797         return;
3798     }
3799 
3800     __kmp_apply_thread_places(&address2os, depth);
3801 
3802     //
3803     // Create the table of masks, indexed by thread Id.
3804     //
3805     unsigned maxIndex;
3806     unsigned numUnique;
3807     kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
3808       address2os, __kmp_avail_proc);
3809     if (__kmp_affinity_gran_levels == 0) {
3810         KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
3811     }
3812 
3813     //
3814     // Set the childNums vector in all Address objects.  This must be done
3815     // before we can sort using __kmp_affinity_cmp_Address_child_num(),
3816     // which takes into account the setting of __kmp_affinity_compact.
3817     //
3818     __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
3819 
3820     switch (__kmp_affinity_type) {
3821 
3822         case affinity_explicit:
3823         KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3824 # if OMP_40_ENABLED
3825         if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3826 # endif
3827         {
3828             __kmp_affinity_process_proclist(&__kmp_affinity_masks,
3829               &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3830               maxIndex);
3831         }
3832 # if OMP_40_ENABLED
3833         else {
3834             __kmp_affinity_process_placelist(&__kmp_affinity_masks,
3835               &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3836               maxIndex);
3837         }
3838 # endif
3839         if (__kmp_affinity_num_masks == 0) {
3840             if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3841               && (__kmp_affinity_type != affinity_none))) {
3842                 KMP_WARNING(AffNoValidProcID);
3843             }
3844             __kmp_affinity_type = affinity_none;
3845             return;
3846         }
3847         break;
3848 
3849         //
3850         // The other affinity types rely on sorting the Addresses according
3851         // to some permutation of the machine topology tree.  Set
3852         // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
3853         // then jump to a common code fragment to do the sort and create
3854         // the array of affinity masks.
3855         //
3856 
3857         case affinity_logical:
3858         __kmp_affinity_compact = 0;
3859         if (__kmp_affinity_offset) {
3860             __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3861               % __kmp_avail_proc;
3862         }
3863         goto sortAddresses;
3864 
3865         case affinity_physical:
3866         if (__kmp_nThreadsPerCore > 1) {
3867             __kmp_affinity_compact = 1;
3868             if (__kmp_affinity_compact >= depth) {
3869                 __kmp_affinity_compact = 0;
3870             }
3871         } else {
3872             __kmp_affinity_compact = 0;
3873         }
3874         if (__kmp_affinity_offset) {
3875             __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3876               % __kmp_avail_proc;
3877         }
3878         goto sortAddresses;
3879 
3880         case affinity_scatter:
3881         if (__kmp_affinity_compact >= depth) {
3882             __kmp_affinity_compact = 0;
3883         }
3884         else {
3885             __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
3886         }
3887         goto sortAddresses;
3888 
3889         case affinity_compact:
3890         if (__kmp_affinity_compact >= depth) {
3891             __kmp_affinity_compact = depth - 1;
3892         }
3893         goto sortAddresses;
3894 
3895         case affinity_balanced:
3896         // Balanced works only for the case of a single package
3897         if( nPackages > 1 ) {
3898             if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
3899                 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
3900             }
3901             __kmp_affinity_type = affinity_none;
3902             return;
3903         } else if( __kmp_affinity_uniform_topology() ) {
3904             break;
3905         } else { // Non-uniform topology
3906 
3907             // Save the depth for further usage
3908             __kmp_aff_depth = depth;
3909 
3910             // Number of hyper threads per core in HT machine
3911             int nth_per_core = __kmp_nThreadsPerCore;
3912 
3913             int core_level;
3914             if( nth_per_core > 1 ) {
3915                 core_level = depth - 2;
3916             } else {
3917                 core_level = depth - 1;
3918             }
3919             int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
3920             int nproc = nth_per_core * ncores;
3921 
3922             procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
3923             for( int i = 0; i < nproc; i++ ) {
3924                 procarr[ i ] = -1;
3925             }
3926 
3927             for( int i = 0; i < __kmp_avail_proc; i++ ) {
3928                 int proc = address2os[ i ].second;
3929                 // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
3930                 // If there is only one thread per core then depth == 2: level 0 - package,
3931                 // level 1 - core.
3932                 int level = depth - 1;
3933 
3934                 // __kmp_nth_per_core == 1
3935                 int thread = 0;
3936                 int core = address2os[ i ].first.labels[ level ];
3937                 // If the thread level exists, that is we have more than one thread context per core
3938                 if( nth_per_core > 1 ) {
3939                     thread = address2os[ i ].first.labels[ level ] % nth_per_core;
3940                     core = address2os[ i ].first.labels[ level - 1 ];
3941                 }
3942                 procarr[ core * nth_per_core + thread ] = proc;
3943             }
3944 
3945             break;
3946         }
3947 
3948         sortAddresses:
3949         //
3950         // Allocate the gtid->affinity mask table.
3951         //
3952         if (__kmp_affinity_dups) {
3953             __kmp_affinity_num_masks = __kmp_avail_proc;
3954         }
3955         else {
3956             __kmp_affinity_num_masks = numUnique;
3957         }
3958 
3959 # if OMP_40_ENABLED
3960         if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
3961           && ( __kmp_affinity_num_places > 0 )
3962           && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
3963             __kmp_affinity_num_masks = __kmp_affinity_num_places;
3964         }
3965 # endif
3966 
3967         KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
3968 
3969         //
3970         // Sort the address2os table according to the current setting of
3971         // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
3972         //
3973         qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
3974           __kmp_affinity_cmp_Address_child_num);
3975         {
3976             int i;
3977             unsigned j;
3978             for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
3979                 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
3980                     continue;
3981                 }
3982                 unsigned osId = address2os[i].second;
3983                 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
3984                 kmp_affin_mask_t *dest
3985                   = KMP_CPU_INDEX(__kmp_affinity_masks, j);
3986                 KMP_ASSERT(KMP_CPU_ISSET(osId, src));
3987                 KMP_CPU_COPY(dest, src);
3988                 if (++j >= __kmp_affinity_num_masks) {
3989                     break;
3990                 }
3991             }
3992             KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
3993         }
3994         break;
3995 
3996         default:
3997         KMP_ASSERT2(0, "Unexpected affinity setting");
3998     }
3999 
4000     __kmp_free(osId2Mask);
4001     machine_hierarchy.init(address2os, __kmp_avail_proc);
4002 }
4003 
4004 
4005 void
4006 __kmp_affinity_initialize(void)
4007 {
4008     //
4009     // Much of the code above was written assumming that if a machine was not
4010     // affinity capable, then __kmp_affinity_type == affinity_none.  We now
4011     // explicitly represent this as __kmp_affinity_type == affinity_disabled.
4012     //
4013     // There are too many checks for __kmp_affinity_type == affinity_none
4014     // in this code.  Instead of trying to change them all, check if
4015     // __kmp_affinity_type == affinity_disabled, and if so, slam it with
4016     // affinity_none, call the real initialization routine, then restore
4017     // __kmp_affinity_type to affinity_disabled.
4018     //
4019     int disabled = (__kmp_affinity_type == affinity_disabled);
4020     if (! KMP_AFFINITY_CAPABLE()) {
4021         KMP_ASSERT(disabled);
4022     }
4023     if (disabled) {
4024         __kmp_affinity_type = affinity_none;
4025     }
4026     __kmp_aux_affinity_initialize();
4027     if (disabled) {
4028         __kmp_affinity_type = affinity_disabled;
4029     }
4030 }
4031 
4032 
4033 void
4034 __kmp_affinity_uninitialize(void)
4035 {
4036     if (__kmp_affinity_masks != NULL) {
4037         KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
4038         __kmp_affinity_masks = NULL;
4039     }
4040     if (__kmp_affin_fullMask != NULL) {
4041         KMP_CPU_FREE(__kmp_affin_fullMask);
4042         __kmp_affin_fullMask = NULL;
4043     }
4044     __kmp_affinity_num_masks = 0;
4045 # if OMP_40_ENABLED
4046     __kmp_affinity_num_places = 0;
4047 # endif
4048     if (__kmp_affinity_proclist != NULL) {
4049         __kmp_free(__kmp_affinity_proclist);
4050         __kmp_affinity_proclist = NULL;
4051     }
4052     if( address2os != NULL ) {
4053         __kmp_free( address2os );
4054         address2os = NULL;
4055     }
4056     if( procarr != NULL ) {
4057         __kmp_free( procarr );
4058         procarr = NULL;
4059     }
4060 # if KMP_USE_HWLOC
4061     if (__kmp_hwloc_topology != NULL) {
4062         hwloc_topology_destroy(__kmp_hwloc_topology);
4063         __kmp_hwloc_topology = NULL;
4064     }
4065 # endif
4066 }
4067 
4068 
4069 void
4070 __kmp_affinity_set_init_mask(int gtid, int isa_root)
4071 {
4072     if (! KMP_AFFINITY_CAPABLE()) {
4073         return;
4074     }
4075 
4076     kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4077     if (th->th.th_affin_mask == NULL) {
4078         KMP_CPU_ALLOC(th->th.th_affin_mask);
4079     }
4080     else {
4081         KMP_CPU_ZERO(th->th.th_affin_mask);
4082     }
4083 
4084     //
4085     // Copy the thread mask to the kmp_info_t strucuture.
4086     // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
4087     // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
4088     // is set, then the full mask is the same as the mask of the initialization
4089     // thread.
4090     //
4091     kmp_affin_mask_t *mask;
4092     int i;
4093 
4094 # if OMP_40_ENABLED
4095     if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
4096 # endif
4097     {
4098         if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)
4099           ) {
4100 # if KMP_GROUP_AFFINITY
4101             if (__kmp_num_proc_groups > 1) {
4102                 return;
4103             }
4104 # endif
4105             KMP_ASSERT(__kmp_affin_fullMask != NULL);
4106             i = KMP_PLACE_ALL;
4107             mask = __kmp_affin_fullMask;
4108         }
4109         else {
4110             KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4111             i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4112             mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4113         }
4114     }
4115 # if OMP_40_ENABLED
4116     else {
4117         if ((! isa_root)
4118           || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
4119 #  if KMP_GROUP_AFFINITY
4120             if (__kmp_num_proc_groups > 1) {
4121                 return;
4122             }
4123 #  endif
4124             KMP_ASSERT(__kmp_affin_fullMask != NULL);
4125             i = KMP_PLACE_ALL;
4126             mask = __kmp_affin_fullMask;
4127         }
4128         else {
4129             //
4130             // int i = some hash function or just a counter that doesn't
4131             // always start at 0.  Use gtid for now.
4132             //
4133             KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4134             i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4135             mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4136         }
4137     }
4138 # endif
4139 
4140 # if OMP_40_ENABLED
4141     th->th.th_current_place = i;
4142     if (isa_root) {
4143         th->th.th_new_place = i;
4144         th->th.th_first_place = 0;
4145         th->th.th_last_place = __kmp_affinity_num_masks - 1;
4146     }
4147 
4148     if (i == KMP_PLACE_ALL) {
4149         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4150           gtid));
4151     }
4152     else {
4153         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4154           gtid, i));
4155     }
4156 # else
4157     if (i == -1) {
4158         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to __kmp_affin_fullMask\n",
4159           gtid));
4160     }
4161     else {
4162         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
4163           gtid, i));
4164     }
4165 # endif /* OMP_40_ENABLED */
4166 
4167     KMP_CPU_COPY(th->th.th_affin_mask, mask);
4168 
4169     if (__kmp_affinity_verbose) {
4170         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4171         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4172           th->th.th_affin_mask);
4173         KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
4174           buf);
4175     }
4176 
4177 # if KMP_OS_WINDOWS
4178     //
4179     // On Windows* OS, the process affinity mask might have changed.
4180     // If the user didn't request affinity and this call fails,
4181     // just continue silently.  See CQ171393.
4182     //
4183     if ( __kmp_affinity_type == affinity_none ) {
4184         __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4185     }
4186     else
4187 # endif
4188     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4189 }
4190 
4191 
4192 # if OMP_40_ENABLED
4193 
4194 void
4195 __kmp_affinity_set_place(int gtid)
4196 {
4197     int retval;
4198 
4199     if (! KMP_AFFINITY_CAPABLE()) {
4200         return;
4201     }
4202 
4203     kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4204 
4205     KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
4206       gtid, th->th.th_new_place, th->th.th_current_place));
4207 
4208     //
4209     // Check that the new place is within this thread's partition.
4210     //
4211     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4212     KMP_ASSERT(th->th.th_new_place >= 0);
4213     KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4214     if (th->th.th_first_place <= th->th.th_last_place) {
4215         KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
4216          && (th->th.th_new_place <= th->th.th_last_place));
4217     }
4218     else {
4219         KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
4220          || (th->th.th_new_place >= th->th.th_last_place));
4221     }
4222 
4223     //
4224     // Copy the thread mask to the kmp_info_t strucuture,
4225     // and set this thread's affinity.
4226     //
4227     kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
4228       th->th.th_new_place);
4229     KMP_CPU_COPY(th->th.th_affin_mask, mask);
4230     th->th.th_current_place = th->th.th_new_place;
4231 
4232     if (__kmp_affinity_verbose) {
4233         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4234         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4235           th->th.th_affin_mask);
4236         KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4237           gtid, buf);
4238     }
4239     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4240 }
4241 
4242 # endif /* OMP_40_ENABLED */
4243 
4244 
4245 int
4246 __kmp_aux_set_affinity(void **mask)
4247 {
4248     int gtid;
4249     kmp_info_t *th;
4250     int retval;
4251 
4252     if (! KMP_AFFINITY_CAPABLE()) {
4253         return -1;
4254     }
4255 
4256     gtid = __kmp_entry_gtid();
4257     KA_TRACE(1000, ;{
4258         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4259         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4260           (kmp_affin_mask_t *)(*mask));
4261         __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4262           gtid, buf);
4263     });
4264 
4265     if (__kmp_env_consistency_check) {
4266         if ((mask == NULL) || (*mask == NULL)) {
4267             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4268         }
4269         else {
4270             unsigned proc;
4271             int num_procs = 0;
4272 
4273             KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t*)(*mask))) {
4274                 if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4275                     KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4276                 }
4277                 if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4278                     continue;
4279                 }
4280                 num_procs++;
4281             }
4282             if (num_procs == 0) {
4283                 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4284             }
4285 
4286 # if KMP_GROUP_AFFINITY
4287             if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4288                 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4289             }
4290 # endif /* KMP_GROUP_AFFINITY */
4291 
4292         }
4293     }
4294 
4295     th = __kmp_threads[gtid];
4296     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4297     retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4298     if (retval == 0) {
4299         KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4300     }
4301 
4302 # if OMP_40_ENABLED
4303     th->th.th_current_place = KMP_PLACE_UNDEFINED;
4304     th->th.th_new_place = KMP_PLACE_UNDEFINED;
4305     th->th.th_first_place = 0;
4306     th->th.th_last_place = __kmp_affinity_num_masks - 1;
4307 
4308     //
4309     // Turn off 4.0 affinity for the current tread at this parallel level.
4310     //
4311     th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
4312 # endif
4313 
4314     return retval;
4315 }
4316 
4317 
4318 int
4319 __kmp_aux_get_affinity(void **mask)
4320 {
4321     int gtid;
4322     int retval;
4323     kmp_info_t *th;
4324 
4325     if (! KMP_AFFINITY_CAPABLE()) {
4326         return -1;
4327     }
4328 
4329     gtid = __kmp_entry_gtid();
4330     th = __kmp_threads[gtid];
4331     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4332 
4333     KA_TRACE(1000, ;{
4334         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4335         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4336           th->th.th_affin_mask);
4337         __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
4338     });
4339 
4340     if (__kmp_env_consistency_check) {
4341         if ((mask == NULL) || (*mask == NULL)) {
4342             KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4343         }
4344     }
4345 
4346 # if !KMP_OS_WINDOWS
4347 
4348     retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4349     KA_TRACE(1000, ;{
4350         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4351         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4352           (kmp_affin_mask_t *)(*mask));
4353         __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
4354     });
4355     return retval;
4356 
4357 # else
4358 
4359     KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4360     return 0;
4361 
4362 # endif /* KMP_OS_WINDOWS */
4363 
4364 }
4365 
4366 int
4367 __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
4368 {
4369     int retval;
4370 
4371     if (! KMP_AFFINITY_CAPABLE()) {
4372         return -1;
4373     }
4374 
4375     KA_TRACE(1000, ;{
4376         int gtid = __kmp_entry_gtid();
4377         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4378         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4379           (kmp_affin_mask_t *)(*mask));
4380         __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4381           proc, gtid, buf);
4382     });
4383 
4384     if (__kmp_env_consistency_check) {
4385         if ((mask == NULL) || (*mask == NULL)) {
4386             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4387         }
4388     }
4389 
4390     if ((proc < 0)
4391 # if !KMP_USE_HWLOC
4392          || ((unsigned)proc >= KMP_CPU_SETSIZE)
4393 # endif
4394        ) {
4395         return -1;
4396     }
4397     if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4398         return -2;
4399     }
4400 
4401     KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4402     return 0;
4403 }
4404 
4405 
4406 int
4407 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4408 {
4409     int retval;
4410 
4411     if (! KMP_AFFINITY_CAPABLE()) {
4412         return -1;
4413     }
4414 
4415     KA_TRACE(1000, ;{
4416         int gtid = __kmp_entry_gtid();
4417         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4418         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4419           (kmp_affin_mask_t *)(*mask));
4420         __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4421           proc, gtid, buf);
4422     });
4423 
4424     if (__kmp_env_consistency_check) {
4425         if ((mask == NULL) || (*mask == NULL)) {
4426             KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4427         }
4428     }
4429 
4430     if ((proc < 0)
4431 # if !KMP_USE_HWLOC
4432          || ((unsigned)proc >= KMP_CPU_SETSIZE)
4433 # endif
4434        ) {
4435         return -1;
4436     }
4437     if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4438         return -2;
4439     }
4440 
4441     KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4442     return 0;
4443 }
4444 
4445 
4446 int
4447 __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4448 {
4449     int retval;
4450 
4451     if (! KMP_AFFINITY_CAPABLE()) {
4452         return -1;
4453     }
4454 
4455     KA_TRACE(1000, ;{
4456         int gtid = __kmp_entry_gtid();
4457         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4458         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4459           (kmp_affin_mask_t *)(*mask));
4460         __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4461           proc, gtid, buf);
4462     });
4463 
4464     if (__kmp_env_consistency_check) {
4465         if ((mask == NULL) || (*mask == NULL)) {
4466             KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
4467         }
4468     }
4469 
4470     if ((proc < 0)
4471 # if !KMP_USE_HWLOC
4472          || ((unsigned)proc >= KMP_CPU_SETSIZE)
4473 # endif
4474        ) {
4475         return -1;
4476     }
4477     if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4478         return 0;
4479     }
4480 
4481     return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4482 }
4483 
4484 
4485 // Dynamic affinity settings - Affinity balanced
4486 void __kmp_balanced_affinity( int tid, int nthreads )
4487 {
4488     if( __kmp_affinity_uniform_topology() ) {
4489         int coreID;
4490         int threadID;
4491         // Number of hyper threads per core in HT machine
4492         int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4493         // Number of cores
4494         int ncores = __kmp_ncores;
4495         // How many threads will be bound to each core
4496         int chunk = nthreads / ncores;
4497         // How many cores will have an additional thread bound to it - "big cores"
4498         int big_cores = nthreads % ncores;
4499         // Number of threads on the big cores
4500         int big_nth = ( chunk + 1 ) * big_cores;
4501         if( tid < big_nth ) {
4502             coreID = tid / (chunk + 1 );
4503             threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
4504         } else { //tid >= big_nth
4505             coreID = ( tid - big_cores ) / chunk;
4506             threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
4507         }
4508 
4509         KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4510           "Illegal set affinity operation when not capable");
4511 
4512         kmp_affin_mask_t *mask;
4513         KMP_CPU_ALLOC_ON_STACK(mask);
4514         KMP_CPU_ZERO(mask);
4515 
4516         // Granularity == thread
4517         if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4518             int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
4519             KMP_CPU_SET( osID, mask);
4520         } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4521             for( int i = 0; i < __kmp_nth_per_core; i++ ) {
4522                 int osID;
4523                 osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
4524                 KMP_CPU_SET( osID, mask);
4525             }
4526         }
4527         if (__kmp_affinity_verbose) {
4528             char buf[KMP_AFFIN_MASK_PRINT_LEN];
4529             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4530             KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4531               tid, buf);
4532         }
4533         __kmp_set_system_affinity( mask, TRUE );
4534         KMP_CPU_FREE_FROM_STACK(mask);
4535     } else { // Non-uniform topology
4536 
4537         kmp_affin_mask_t *mask;
4538         KMP_CPU_ALLOC_ON_STACK(mask);
4539         KMP_CPU_ZERO(mask);
4540 
4541         // Number of hyper threads per core in HT machine
4542         int nth_per_core = __kmp_nThreadsPerCore;
4543         int core_level;
4544         if( nth_per_core > 1 ) {
4545             core_level = __kmp_aff_depth - 2;
4546         } else {
4547             core_level = __kmp_aff_depth - 1;
4548         }
4549 
4550         // Number of cores - maximum value; it does not count trail cores with 0 processors
4551         int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
4552 
4553         // For performance gain consider the special case nthreads == __kmp_avail_proc
4554         if( nthreads == __kmp_avail_proc ) {
4555             if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4556                 int osID = address2os[ tid ].second;
4557                 KMP_CPU_SET( osID, mask);
4558             } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4559                 int coreID = address2os[ tid ].first.labels[ core_level ];
4560                 // We'll count found osIDs for the current core; they can be not more than nth_per_core;
4561                 // since the address2os is sortied we can break when cnt==nth_per_core
4562                 int cnt = 0;
4563                 for( int i = 0; i < __kmp_avail_proc; i++ ) {
4564                     int osID = address2os[ i ].second;
4565                     int core = address2os[ i ].first.labels[ core_level ];
4566                     if( core == coreID ) {
4567                         KMP_CPU_SET( osID, mask);
4568                         cnt++;
4569                         if( cnt == nth_per_core ) {
4570                             break;
4571                         }
4572                     }
4573                 }
4574             }
4575         } else if( nthreads <= __kmp_ncores ) {
4576 
4577             int core = 0;
4578             for( int i = 0; i < ncores; i++ ) {
4579                 // Check if this core from procarr[] is in the mask
4580                 int in_mask = 0;
4581                 for( int j = 0; j < nth_per_core; j++ ) {
4582                     if( procarr[ i * nth_per_core + j ] != - 1 ) {
4583                         in_mask = 1;
4584                         break;
4585                     }
4586                 }
4587                 if( in_mask ) {
4588                     if( tid == core ) {
4589                         for( int j = 0; j < nth_per_core; j++ ) {
4590                             int osID = procarr[ i * nth_per_core + j ];
4591                             if( osID != -1 ) {
4592                                 KMP_CPU_SET( osID, mask );
4593                                 // For granularity=thread it is enough to set the first available osID for this core
4594                                 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4595                                     break;
4596                                 }
4597                             }
4598                         }
4599                         break;
4600                     } else {
4601                         core++;
4602                     }
4603                 }
4604             }
4605 
4606         } else { // nthreads > __kmp_ncores
4607 
4608             // Array to save the number of processors at each core
4609             int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores);
4610             // Array to save the number of cores with "x" available processors;
4611             int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
4612             // Array to save the number of cores with # procs from x to nth_per_core
4613             int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
4614 
4615             for( int i = 0; i <= nth_per_core; i++ ) {
4616                 ncores_with_x_procs[ i ] = 0;
4617                 ncores_with_x_to_max_procs[ i ] = 0;
4618             }
4619 
4620             for( int i = 0; i < ncores; i++ ) {
4621                 int cnt = 0;
4622                 for( int j = 0; j < nth_per_core; j++ ) {
4623                     if( procarr[ i * nth_per_core + j ] != -1 ) {
4624                         cnt++;
4625                     }
4626                 }
4627                 nproc_at_core[ i ] = cnt;
4628                 ncores_with_x_procs[ cnt ]++;
4629             }
4630 
4631             for( int i = 0; i <= nth_per_core; i++ ) {
4632                 for( int j = i; j <= nth_per_core; j++ ) {
4633                     ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4634                 }
4635             }
4636 
4637             // Max number of processors
4638             int nproc = nth_per_core * ncores;
4639             // An array to keep number of threads per each context
4640             int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4641             for( int i = 0; i < nproc; i++ ) {
4642                 newarr[ i ] = 0;
4643             }
4644 
4645             int nth = nthreads;
4646             int flag = 0;
4647             while( nth > 0 ) {
4648                 for( int j = 1; j <= nth_per_core; j++ ) {
4649                     int cnt = ncores_with_x_to_max_procs[ j ];
4650                     for( int i = 0; i < ncores; i++ ) {
4651                         // Skip the core with 0 processors
4652                         if( nproc_at_core[ i ] == 0 ) {
4653                             continue;
4654                         }
4655                         for( int k = 0; k < nth_per_core; k++ ) {
4656                             if( procarr[ i * nth_per_core + k ] != -1 ) {
4657                                 if( newarr[ i * nth_per_core + k ] == 0 ) {
4658                                     newarr[ i * nth_per_core + k ] = 1;
4659                                     cnt--;
4660                                     nth--;
4661                                     break;
4662                                 } else {
4663                                     if( flag != 0 ) {
4664                                         newarr[ i * nth_per_core + k ] ++;
4665                                         cnt--;
4666                                         nth--;
4667                                         break;
4668                                     }
4669                                 }
4670                             }
4671                         }
4672                         if( cnt == 0 || nth == 0 ) {
4673                             break;
4674                         }
4675                     }
4676                     if( nth == 0 ) {
4677                         break;
4678                     }
4679                 }
4680                 flag = 1;
4681             }
4682             int sum = 0;
4683             for( int i = 0; i < nproc; i++ ) {
4684                 sum += newarr[ i ];
4685                 if( sum > tid ) {
4686                     // Granularity == thread
4687                     if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4688                         int osID = procarr[ i ];
4689                         KMP_CPU_SET( osID, mask);
4690                     } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4691                         int coreID = i / nth_per_core;
4692                         for( int ii = 0; ii < nth_per_core; ii++ ) {
4693                             int osID = procarr[ coreID * nth_per_core + ii ];
4694                             if( osID != -1 ) {
4695                                 KMP_CPU_SET( osID, mask);
4696                             }
4697                         }
4698                     }
4699                     break;
4700                 }
4701             }
4702             __kmp_free( newarr );
4703         }
4704 
4705         if (__kmp_affinity_verbose) {
4706             char buf[KMP_AFFIN_MASK_PRINT_LEN];
4707             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4708             KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4709               tid, buf);
4710         }
4711         __kmp_set_system_affinity( mask, TRUE );
4712         KMP_CPU_FREE_FROM_STACK(mask);
4713     }
4714 }
4715 
4716 #if KMP_OS_LINUX
4717 // We don't need this entry for Windows because
4718 // there is GetProcessAffinityMask() api
4719 //
4720 // The intended usage is indicated by these steps:
4721 // 1) The user gets the current affinity mask
4722 // 2) Then sets the affinity by calling this function
4723 // 3) Error check the return value
4724 // 4) Use non-OpenMP parallelization
4725 // 5) Reset the affinity to what was stored in step 1)
4726 #ifdef __cplusplus
4727 extern "C"
4728 #endif
4729 int
4730 kmp_set_thread_affinity_mask_initial()
4731 // the function returns 0 on success,
4732 //   -1 if we cannot bind thread
4733 //   >0 (errno) if an error happened during binding
4734 {
4735     int gtid = __kmp_get_gtid();
4736     if (gtid < 0) {
4737         // Do not touch non-omp threads
4738         KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
4739             "non-omp thread, returning\n"));
4740         return -1;
4741     }
4742     if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
4743         KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
4744             "affinity not initialized, returning\n"));
4745         return -1;
4746     }
4747     KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
4748         "set full mask for thread %d\n", gtid));
4749     KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
4750     return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
4751 }
4752 #endif
4753 
4754 #endif // KMP_AFFINITY_SUPPORTED
4755