1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_io.h"
19 #include "kmp_str.h"
20 #include "kmp_wrapper_getpid.h"
21 #include "kmp_affinity.h"
22 
23 // Store the real or imagined machine hierarchy here
24 static hierarchy_info machine_hierarchy;
25 
26 void __kmp_cleanup_hierarchy() {
27     machine_hierarchy.fini();
28 }
29 
30 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
31     kmp_uint32 depth;
    // The test below is true if affinity is available, but set to "none".
    // Need to init on first use of hierarchical barrier.
33     if (TCR_1(machine_hierarchy.uninitialized))
34         machine_hierarchy.init(NULL, nproc);
35 
36     // Adjust the hierarchy in case num threads exceeds original
37     if (nproc > machine_hierarchy.base_num_threads)
38         machine_hierarchy.resize(nproc);
39 
40     depth = machine_hierarchy.depth;
41     KMP_DEBUG_ASSERT(depth > 0);
42 
43     thr_bar->depth = depth;
44     thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
45     thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
46 }
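
// For example (illustrative, not from the original source): if the leaf level
// of the hierarchy groups threads in sets of 4 (machine_hierarchy.numPerLevel[0]
// == 4), each leaf parent in the hierarchical barrier is given
// base_leaf_kids == 3.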
47 
48 #if KMP_AFFINITY_SUPPORTED
49 
50 //
51 // Print the affinity mask to the character array in a pretty format.
52 //
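// For example, a mask with OS procs 0 through 7 set prints as
// "{0,1,2,3,4,5,6,7}" (or, in the hwloc build below, using hwloc's list
// syntax, e.g. "{0-7}"), and an empty mask prints as "{<empty>}".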
53 #if KMP_USE_HWLOC
54 char *
55 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
56 {
57     int num_chars_to_write, num_chars_written;
58     char* scan;
59     KMP_ASSERT(buf_len >= 40);
60 
61     // bufsize of 0 just retrieves the needed buffer size.
62     num_chars_to_write = hwloc_bitmap_list_snprintf(buf, 0, (hwloc_bitmap_t)mask);
63 
64     // need '{', "xxxxxxxx...xx", '}', '\0' = num_chars_to_write + 3 bytes
65     // * num_chars_to_write returned by hwloc_bitmap_list_snprintf does not
66     //   take into account the '\0' character.
67     if(hwloc_bitmap_iszero((hwloc_bitmap_t)mask)) {
68         KMP_SNPRINTF(buf, buf_len, "{<empty>}");
69     } else if(num_chars_to_write < buf_len - 3) {
70         // no problem fitting the mask into buf_len number of characters
71         buf[0] = '{';
72         // use buf_len-3 because we have the three characters: '{' '}' '\0' to add to the buffer
73         num_chars_written = hwloc_bitmap_list_snprintf(buf+1, buf_len-3, (hwloc_bitmap_t)mask);
74         buf[num_chars_written+1] = '}';
75         buf[num_chars_written+2] = '\0';
76     } else {
77         // Need to truncate the affinity mask string and add ellipsis.
78         // To do this, we first write out the '{' + str(mask)
79         buf[0] = '{';
80         hwloc_bitmap_list_snprintf(buf+1, buf_len-1, (hwloc_bitmap_t)mask);
        // Then go to the 7th-from-last character and scan backwards until we
        // are NOT on a digit, and write "...}\0" there.  This gives a clean
        // ellipsis and avoids overwriting part of an affinity number; i.e., we
        // avoid something like { 45, 67, 8...} and get { 45, 67,...} instead.
        // Check scan >= buf before dereferencing so we never read before the
        // start of the buffer.
        scan = buf + buf_len - 7;
        while(scan >= buf && *scan >= '0' && *scan <= '9')
            scan--;
88         *(scan+1) = '.';
89         *(scan+2) = '.';
90         *(scan+3) = '.';
91         *(scan+4) = '}';
92         *(scan+5) = '\0';
93     }
94     return buf;
95 }
96 #else
97 char *
98 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
99 {
100     KMP_ASSERT(buf_len >= 40);
101     char *scan = buf;
102     char *end = buf + buf_len - 1;
103 
104     //
105     // Find first element / check for empty set.
106     //
107     size_t i;
108     for (i = 0; i < KMP_CPU_SETSIZE; i++) {
109         if (KMP_CPU_ISSET(i, mask)) {
110             break;
111         }
112     }
113     if (i == KMP_CPU_SETSIZE) {
114         KMP_SNPRINTF(scan, end-scan+1, "{<empty>}");
115         while (*scan != '\0') scan++;
116         KMP_ASSERT(scan <= end);
117         return buf;
118     }
119 
120     KMP_SNPRINTF(scan, end-scan+1, "{%ld", (long)i);
121     while (*scan != '\0') scan++;
122     i++;
123     for (; i < KMP_CPU_SETSIZE; i++) {
124         if (! KMP_CPU_ISSET(i, mask)) {
125             continue;
126         }
127 
128         //
129         // Check for buffer overflow.  A string of the form ",<n>" will have
130         // at most 10 characters, plus we want to leave room to print ",...}"
        // if the set is too large to print, for a total of 15 characters.
132         // We already left room for '\0' in setting end.
133         //
134         if (end - scan < 15) {
135            break;
136         }
137         KMP_SNPRINTF(scan, end-scan+1, ",%-ld", (long)i);
138         while (*scan != '\0') scan++;
139     }
140     if (i < KMP_CPU_SETSIZE) {
141         KMP_SNPRINTF(scan, end-scan+1,  ",...");
142         while (*scan != '\0') scan++;
143     }
144     KMP_SNPRINTF(scan, end-scan+1, "}");
145     while (*scan != '\0') scan++;
146     KMP_ASSERT(scan <= end);
147     return buf;
148 }
149 #endif // KMP_USE_HWLOC
150 
151 
152 void
153 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
154 {
155     KMP_CPU_ZERO(mask);
156 
157 # if KMP_GROUP_AFFINITY
158 
159     if (__kmp_num_proc_groups > 1) {
160         int group;
161         KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
162         for (group = 0; group < __kmp_num_proc_groups; group++) {
163             int i;
164             int num = __kmp_GetActiveProcessorCount(group);
165             for (i = 0; i < num; i++) {
166                 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
167             }
168         }
169     }
170     else
171 
172 # endif /* KMP_GROUP_AFFINITY */
173 
174     {
175         int proc;
176         for (proc = 0; proc < __kmp_xproc; proc++) {
177             KMP_CPU_SET(proc, mask);
178         }
179     }
180 }
181 
182 //
183 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
184 // called to renumber the labels from [0..n] and place them into the child_num
185 // vector of the address object.  This is done in case the labels used for
186 // the children at one node of the hierarchy differ from those used for
187 // another node at the same level.  Example:  suppose the machine has 2 nodes
188 // with 2 packages each.  The first node contains packages 601 and 602, and
189 // second node contains packages 603 and 604.  If we try to sort the table
190 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
191 // because we are paying attention to the labels themselves, not the ordinal
192 // child numbers.  By using the child numbers in the sort, the result is
193 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
194 //
195 static void
196 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
197   int numAddrs)
198 {
199     KMP_DEBUG_ASSERT(numAddrs > 0);
200     int depth = address2os->first.depth;
201     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
202     unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
203       * sizeof(unsigned));
204     int labCt;
205     for (labCt = 0; labCt < depth; labCt++) {
206         address2os[0].first.childNums[labCt] = counts[labCt] = 0;
207         lastLabel[labCt] = address2os[0].first.labels[labCt];
208     }
209     int i;
210     for (i = 1; i < numAddrs; i++) {
211         for (labCt = 0; labCt < depth; labCt++) {
212             if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
213                 int labCt2;
214                 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
215                     counts[labCt2] = 0;
216                     lastLabel[labCt2] = address2os[i].first.labels[labCt2];
217                 }
218                 counts[labCt]++;
219                 lastLabel[labCt] = address2os[i].first.labels[labCt];
220                 break;
221             }
222         }
223         for (labCt = 0; labCt < depth; labCt++) {
224             address2os[i].first.childNums[labCt] = counts[labCt];
225         }
226         for (; labCt < (int)Address::maxDepth; labCt++) {
227             address2os[i].first.childNums[labCt] = 0;
228         }
229     }
230     __kmp_free(lastLabel);
231     __kmp_free(counts);
232 }
233 
234 
235 //
236 // All of the __kmp_affinity_create_*_map() routines should set
237 // __kmp_affinity_masks to a vector of affinity mask objects of length
238 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
239 // return the number of levels in the machine topology tree (zero if
240 // __kmp_affinity_type == affinity_none).
241 //
// All of the __kmp_affinity_create_*_map() routines should set
// *__kmp_affin_fullMask to the affinity mask for the initialization thread.
// They need to save and restore the mask, and it could be needed later, so
// saving it is just an optimization to avoid calling
// __kmp_get_system_affinity() again.
246 //
247 kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
248 
249 static int nCoresPerPkg, nPackages;
250 static int __kmp_nThreadsPerCore;
251 #ifndef KMP_DFLT_NTH_CORES
252 static int __kmp_ncores;
253 #endif
254 static int *__kmp_pu_os_idx = NULL;
255 
256 //
257 // __kmp_affinity_uniform_topology() doesn't work when called from
258 // places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// and __kmp_affinity_create_x2apicid_map().
261 //
262 inline static bool
263 __kmp_affinity_uniform_topology()
264 {
265     return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
266 }
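
// For example (illustrative values): on a machine with 2 packages, 8 cores per
// package, and 2 threads per core, the topology is treated as uniform iff all
// 2 * 8 * 2 == 32 logical procs are available (__kmp_avail_proc == 32); if the
// initial affinity mask excludes some of them, the test above reports
// non-uniform.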
267 
268 
269 //
270 // Print out the detailed machine topology map, i.e. the physical locations
271 // of each OS proc.
272 //
273 static void
274 __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
275   int pkgLevel, int coreLevel, int threadLevel)
276 {
277     int proc;
278 
279     KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
280     for (proc = 0; proc < len; proc++) {
281         int level;
282         kmp_str_buf_t buf;
283         __kmp_str_buf_init(&buf);
284         for (level = 0; level < depth; level++) {
285             if (level == threadLevel) {
286                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
287             }
288             else if (level == coreLevel) {
289                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
290             }
291             else if (level == pkgLevel) {
292                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
293             }
294             else if (level > pkgLevel) {
295                 __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
296                   level - pkgLevel - 1);
297             }
298             else {
299                 __kmp_str_buf_print(&buf, "L%d ", level);
300             }
301             __kmp_str_buf_print(&buf, "%d ",
302               address2os[proc].first.labels[level]);
303         }
304         KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
305           buf.str);
306         __kmp_str_buf_free(&buf);
307     }
308 }
309 
310 #if KMP_USE_HWLOC
311 
312 // This function removes the topology levels that are radix 1 and don't offer
// further information about the topology.  The most common example is when
// you have one thread context per core: we don't want the extra thread
// context level if it offers no unique labels, so it is removed.
316 // return value: the new depth of address2os
317 static int
318 __kmp_affinity_remove_radix_one_levels(AddrUnsPair *address2os, int nActiveThreads, int depth, int* pkgLevel, int* coreLevel, int* threadLevel) {
319     int level;
320     int i;
321     int radix1_detected;
322 
323     for (level = depth-1; level >= 0; --level) {
324         // Always keep the package level
325         if (level == *pkgLevel)
326             continue;
327         // Detect if this level is radix 1
328         radix1_detected = 1;
329         for (i = 1; i < nActiveThreads; ++i) {
330             if (address2os[0].first.labels[level] != address2os[i].first.labels[level]) {
331                 // There are differing label values for this level so it stays
332                 radix1_detected = 0;
333                 break;
334             }
335         }
336         if (!radix1_detected)
337             continue;
338         // Radix 1 was detected
339         if (level == *threadLevel) {
340             // If only one thread per core, then just decrement
341             // the depth which removes the threadlevel from address2os
342             for (i = 0; i < nActiveThreads; ++i) {
343                 address2os[i].first.depth--;
344             }
345             *threadLevel = -1;
346         } else if (level == *coreLevel) {
347             // For core level, we move the thread labels over if they are still
348             // valid (*threadLevel != -1), and also reduce the depth another level
349             for (i = 0; i < nActiveThreads; ++i) {
350                 if (*threadLevel != -1) {
351                     address2os[i].first.labels[*coreLevel] = address2os[i].first.labels[*threadLevel];
352                 }
353                 address2os[i].first.depth--;
354             }
355             *coreLevel = -1;
356         }
357     }
358     return address2os[0].first.depth;
359 }
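
// For example (illustrative): with depth == 3 and exactly one thread context
// per core, every Address carries the same label at the thread level, so that
// level is removed and the function above returns a new depth of 2
// (package and core only).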
360 
361 // Returns the number of objects of type 'type' below 'obj' within the topology tree structure.
362 // e.g., if obj is a HWLOC_OBJ_SOCKET object, and type is HWLOC_OBJ_PU, then
363 //  this will return the number of PU's under the SOCKET object.
364 static int
365 __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj, hwloc_obj_type_t type) {
366     int retval = 0;
367     hwloc_obj_t first;
368     for(first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type, obj->logical_index, type, 0);
369         first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) == obj;
370         first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type, first))
371     {
372         ++retval;
373     }
374     return retval;
375 }
376 
377 static int
378 __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
379   kmp_i18n_id_t *const msg_id)
380 {
381     *address2os = NULL;
382     *msg_id = kmp_i18n_null;
383 
384     //
385     // Save the affinity mask for the current thread.
386     //
387     kmp_affin_mask_t *oldMask;
388     KMP_CPU_ALLOC(oldMask);
389     __kmp_get_system_affinity(oldMask, TRUE);
390 
391     int depth = 3;
392     int pkgLevel = 0;
393     int coreLevel = 1;
394     int threadLevel = 2;
395 
396     if (! KMP_AFFINITY_CAPABLE())
397     {
398         //
        // Hack to try and infer the machine topology using only the data
        // available from hwloc on the current thread, and __kmp_xproc.
401         //
402         KMP_ASSERT(__kmp_affinity_type == affinity_none);
403 
404         nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, 0), HWLOC_OBJ_CORE);
405         __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, 0), HWLOC_OBJ_PU);
406         __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
407         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
408         if (__kmp_affinity_verbose) {
409             KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
410             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
411             if (__kmp_affinity_uniform_topology()) {
412                 KMP_INFORM(Uniform, "KMP_AFFINITY");
413             } else {
414                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
415             }
416             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
417               __kmp_nThreadsPerCore, __kmp_ncores);
418         }
419         KMP_CPU_FREE(oldMask);
420         return 0;
421     }
422 
423     //
424     // Allocate the data structure to be returned.
425     //
426     AddrUnsPair *retval = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
427     __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
428 
429     //
430     // When affinity is off, this routine will still be called to set
431     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
432     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
433     // correctly, and return if affinity is not enabled.
434     //
435 
436     hwloc_obj_t pu;
437     hwloc_obj_t core;
438     hwloc_obj_t socket;
439     int nActiveThreads = 0;
440     int socket_identifier = 0;
441     // re-calculate globals to count only accessible resources
442     __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
443     for(socket = hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, 0);
444         socket != NULL;
445         socket = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, socket),
446         socket_identifier++)
447     {
448         int core_identifier = 0;
449         int num_active_cores = 0;
450         for(core = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, socket->type, socket->logical_index, HWLOC_OBJ_CORE, 0);
451             core != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, socket->type, core) == socket;
452             core = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, core),
453             core_identifier++)
454         {
455             int pu_identifier = 0;
456             int num_active_threads = 0;
457             for(pu = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, core->type, core->logical_index, HWLOC_OBJ_PU, 0);
458                 pu != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, core->type, pu) == core;
459                 pu = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PU, pu),
460                 pu_identifier++)
461             {
462                 Address addr(3);
463                 if(! KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
464                     continue;         // skip inactive (inaccessible) unit
465                 KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
466                     socket->os_index, socket->logical_index, core->os_index, core->logical_index, pu->os_index,pu->logical_index));
467                 addr.labels[0] = socket_identifier; // package
468                 addr.labels[1] = core_identifier; // core
469                 addr.labels[2] = pu_identifier; // pu
470                 retval[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
471                 __kmp_pu_os_idx[nActiveThreads] = pu->os_index; // keep os index for each active pu
472                 nActiveThreads++;
473                 ++num_active_threads; // count active threads per core
474             }
475             if (num_active_threads) { // were there any active threads on the core?
476                 ++__kmp_ncores;       // count total active cores
477                 ++num_active_cores;   // count active cores per socket
478                 if (num_active_threads > __kmp_nThreadsPerCore)
479                     __kmp_nThreadsPerCore = num_active_threads; // calc maximum
480             }
481         }
482         if (num_active_cores) {       // were there any active cores on the socket?
483             ++nPackages;              // count total active packages
484             if (num_active_cores > nCoresPerPkg)
485                 nCoresPerPkg = num_active_cores; // calc maximum
486         }
487     }
488 
489     //
490     // If there's only one thread context to bind to, return now.
491     //
492     KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
493     KMP_ASSERT(nActiveThreads > 0);
494     if (nActiveThreads == 1) {
495         __kmp_ncores = nPackages = 1;
496         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
497         if (__kmp_affinity_verbose) {
498             char buf[KMP_AFFIN_MASK_PRINT_LEN];
499             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
500 
501             KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
502             if (__kmp_affinity_respect_mask) {
503                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
504             } else {
505                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
506             }
507             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
508             KMP_INFORM(Uniform, "KMP_AFFINITY");
509             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
510               __kmp_nThreadsPerCore, __kmp_ncores);
511         }
512 
513         if (__kmp_affinity_type == affinity_none) {
514             __kmp_free(retval);
515             KMP_CPU_FREE(oldMask);
516             return 0;
517         }
518 
519         //
520         // Form an Address object which only includes the package level.
521         //
522         Address addr(1);
523         addr.labels[0] = retval[0].first.labels[pkgLevel];
524         retval[0].first = addr;
525 
526         if (__kmp_affinity_gran_levels < 0) {
527             __kmp_affinity_gran_levels = 0;
528         }
529 
530         if (__kmp_affinity_verbose) {
531             __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
532         }
533 
534         *address2os = retval;
535         KMP_CPU_FREE(oldMask);
536         return 1;
537     }
538 
539     //
540     // Sort the table by physical Id.
541     //
542     qsort(retval, nActiveThreads, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
543 
544     //
545     // Check to see if the machine topology is uniform
546     //
547     unsigned uniform = (nPackages * nCoresPerPkg * __kmp_nThreadsPerCore == nActiveThreads);
548 
549     //
550     // Print the machine topology summary.
551     //
552     if (__kmp_affinity_verbose) {
553         char mask[KMP_AFFIN_MASK_PRINT_LEN];
554         __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
555 
556         KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
557         if (__kmp_affinity_respect_mask) {
558             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
559         } else {
560             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
561         }
562         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
563         if (uniform) {
564             KMP_INFORM(Uniform, "KMP_AFFINITY");
565         } else {
566             KMP_INFORM(NonUniform, "KMP_AFFINITY");
567         }
568 
569         kmp_str_buf_t buf;
570         __kmp_str_buf_init(&buf);
571 
572         __kmp_str_buf_print(&buf, "%d", nPackages);
        //for (level = 1; level <= pkgLevel; level++) {
        //    __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
        //}
576         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
577           __kmp_nThreadsPerCore, __kmp_ncores);
578 
579         __kmp_str_buf_free(&buf);
580     }
581 
582     if (__kmp_affinity_type == affinity_none) {
583         __kmp_free(retval);
584         KMP_CPU_FREE(oldMask);
585         return 0;
586     }
587 
588     //
    // Find any levels with radix 1, and remove them from the map
590     // (except for the package level).
591     //
592     depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth, &pkgLevel, &coreLevel, &threadLevel);
593 
594     if (__kmp_affinity_gran_levels < 0) {
595         //
596         // Set the granularity level based on what levels are modeled
597         // in the machine topology map.
598         //
599         __kmp_affinity_gran_levels = 0;
600         if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
601             __kmp_affinity_gran_levels++;
602         }
603         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
604             __kmp_affinity_gran_levels++;
605         }
606         if (__kmp_affinity_gran > affinity_gran_package) {
607             __kmp_affinity_gran_levels++;
608         }
609     }
610 
611     if (__kmp_affinity_verbose) {
612         __kmp_affinity_print_topology(retval, nActiveThreads, depth, pkgLevel,
613           coreLevel, threadLevel);
614     }
615 
616     KMP_CPU_FREE(oldMask);
617     *address2os = retval;
618     return depth;
619 }
620 #endif // KMP_USE_HWLOC
621 
622 //
623 // If we don't know how to retrieve the machine's processor topology, or
624 // encounter an error in doing so, this routine is called to form a "flat"
625 // mapping of os thread id's <-> processor id's.
626 //
627 static int
628 __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
629   kmp_i18n_id_t *const msg_id)
630 {
631     *address2os = NULL;
632     *msg_id = kmp_i18n_null;
633 
634     //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ncores, as well as
637     // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
638     //
639     if (! KMP_AFFINITY_CAPABLE()) {
640         KMP_ASSERT(__kmp_affinity_type == affinity_none);
641         __kmp_ncores = nPackages = __kmp_xproc;
642         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
643         if (__kmp_affinity_verbose) {
644             KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
645             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
646             KMP_INFORM(Uniform, "KMP_AFFINITY");
647             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
648               __kmp_nThreadsPerCore, __kmp_ncores);
649         }
650         return 0;
651     }
652 
653     //
654     // When affinity is off, this routine will still be called to set
655     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
656     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
658     //
659     __kmp_ncores = nPackages = __kmp_avail_proc;
660     __kmp_nThreadsPerCore = nCoresPerPkg = 1;
661     if (__kmp_affinity_verbose) {
662         char buf[KMP_AFFIN_MASK_PRINT_LEN];
663         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, __kmp_affin_fullMask);
664 
665         KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
666         if (__kmp_affinity_respect_mask) {
667             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
668         } else {
669             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
670         }
671         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
672         KMP_INFORM(Uniform, "KMP_AFFINITY");
673         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
674           __kmp_nThreadsPerCore, __kmp_ncores);
675     }
676     KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
677     __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
678     if (__kmp_affinity_type == affinity_none) {
679         int avail_ct = 0;
680         unsigned int i;
681         KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
682             if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask))
683                 continue;
684             __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
685         }
686         return 0;
687     }
688 
689     //
    // Construct the data structure to be returned.
691     //
692     *address2os = (AddrUnsPair*)
693       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
694     int avail_ct = 0;
695     unsigned int i;
696     KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
697         //
698         // Skip this proc if it is not included in the machine model.
699         //
700         if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
701             continue;
702         }
703         __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
704         Address addr(1);
705         addr.labels[0] = i;
706         (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
707     }
708     if (__kmp_affinity_verbose) {
709         KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
710     }
711 
712     if (__kmp_affinity_gran_levels < 0) {
713         //
714         // Only the package level is modeled in the machine topology map,
715         // so the #levels of granularity is either 0 or 1.
716         //
717         if (__kmp_affinity_gran > affinity_gran_package) {
718             __kmp_affinity_gran_levels = 1;
719         }
720         else {
721             __kmp_affinity_gran_levels = 0;
722         }
723     }
724     return 1;
725 }
726 
727 
728 # if KMP_GROUP_AFFINITY
729 
730 //
731 // If multiple Windows* OS processor groups exist, we can create a 2-level
732 // topology map with the groups at level 0 and the individual procs at
733 // level 1.
734 //
735 // This facilitates letting the threads float among all procs in a group,
736 // if granularity=group (the default when there are multiple groups).
737 //
738 static int
739 __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
740   kmp_i18n_id_t *const msg_id)
741 {
742     *address2os = NULL;
743     *msg_id = kmp_i18n_null;
744 
745     //
746     // If we don't have multiple processor groups, return now.
747     // The flat mapping will be used.
748     //
749     if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(__kmp_affin_fullMask) >= 0)) {
750         // FIXME set *msg_id
751         return -1;
752     }
753 
754     //
    // Construct the data structure to be returned.
756     //
757     *address2os = (AddrUnsPair*)
758       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
759     KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
760     __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
761     int avail_ct = 0;
762     int i;
763     KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
764         //
765         // Skip this proc if it is not included in the machine model.
766         //
767         if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
768             continue;
769         }
770         __kmp_pu_os_idx[avail_ct] = i;  // suppose indices are flat
771         Address addr(2);
772         addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
773         addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
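        // e.g., with a 64-bit DWORD_PTR (CHAR_BIT * sizeof(DWORD_PTR) == 64),
        // OS proc 71 gets labels {1, 7}: group 71 / 64 == 1, bit 71 % 64 == 7.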
774         (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
775 
776         if (__kmp_affinity_verbose) {
777             KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
778               addr.labels[1]);
779         }
780     }
781 
782     if (__kmp_affinity_gran_levels < 0) {
783         if (__kmp_affinity_gran == affinity_gran_group) {
784             __kmp_affinity_gran_levels = 1;
785         }
786         else if ((__kmp_affinity_gran == affinity_gran_fine)
787           || (__kmp_affinity_gran == affinity_gran_thread)) {
788             __kmp_affinity_gran_levels = 0;
789         }
790         else {
791             const char *gran_str = NULL;
792             if (__kmp_affinity_gran == affinity_gran_core) {
793                 gran_str = "core";
794             }
795             else if (__kmp_affinity_gran == affinity_gran_package) {
796                 gran_str = "package";
797             }
798             else if (__kmp_affinity_gran == affinity_gran_node) {
799                 gran_str = "node";
800             }
801             else {
802                 KMP_ASSERT(0);
803             }
804 
            // Warning: can't use affinity granularity "gran" with group topology method, using "thread"
806             __kmp_affinity_gran_levels = 0;
807         }
808     }
809     return 2;
810 }
811 
812 # endif /* KMP_GROUP_AFFINITY */
813 
814 
815 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
816 
817 static int
818 __kmp_cpuid_mask_width(int count) {
819     int r = 0;
820 
821     while((1<<r) < count)
822         ++r;
823     return r;
824 }
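
// Illustrative values (not from the original source): __kmp_cpuid_mask_width()
// returns the smallest r such that (1 << r) >= count, i.e. the number of bits
// needed to encode 'count' distinct ids:
//
//   __kmp_cpuid_mask_width(1) == 0
//   __kmp_cpuid_mask_width(2) == 1
//   __kmp_cpuid_mask_width(6) == 3    // 1<<2 == 4 < 6 <= 1<<3 == 8
//   __kmp_cpuid_mask_width(8) == 3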
825 
826 
827 class apicThreadInfo {
828 public:
829     unsigned osId;              // param to __kmp_affinity_bind_thread
830     unsigned apicId;            // from cpuid after binding
831     unsigned maxCoresPerPkg;    //      ""
832     unsigned maxThreadsPerPkg;  //      ""
833     unsigned pkgId;             // inferred from above values
834     unsigned coreId;            //      ""
835     unsigned threadId;          //      ""
836 };
837 
838 
839 static int
840 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
841 {
842     const apicThreadInfo *aa = (const apicThreadInfo *)a;
843     const apicThreadInfo *bb = (const apicThreadInfo *)b;
844     if (aa->osId < bb->osId) return -1;
845     if (aa->osId > bb->osId) return 1;
846     return 0;
847 }
848 
849 
850 static int
851 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
852 {
853     const apicThreadInfo *aa = (const apicThreadInfo *)a;
854     const apicThreadInfo *bb = (const apicThreadInfo *)b;
855     if (aa->pkgId < bb->pkgId) return -1;
856     if (aa->pkgId > bb->pkgId) return 1;
857     if (aa->coreId < bb->coreId) return -1;
858     if (aa->coreId > bb->coreId) return 1;
859     if (aa->threadId < bb->threadId) return -1;
860     if (aa->threadId > bb->threadId) return 1;
861     return 0;
862 }
863 
864 
865 //
866 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
867 // an algorithm which cycles through the available os threads, setting
868 // the current thread's affinity mask to that thread, and then retrieves
869 // the Apic Id for each thread context using the cpuid instruction.
870 //
871 static int
872 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
873   kmp_i18n_id_t *const msg_id)
874 {
875     kmp_cpuid buf;
876     int rc;
877     *address2os = NULL;
878     *msg_id = kmp_i18n_null;
879 
880     //
881     // Check if cpuid leaf 4 is supported.
882     //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 4) {
        *msg_id = kmp_i18n_str_NoLeaf4Support;
        return -1;
    }
888 
889     //
890     // The algorithm used starts by setting the affinity to each available
891     // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
894     // the defaults that we calculated from issuing cpuid without binding
895     // to each proc.
896     //
897     if (! KMP_AFFINITY_CAPABLE()) {
898         //
899         // Hack to try and infer the machine topology using only the data
900         // available from cpuid on the current thread, and __kmp_xproc.
901         //
902         KMP_ASSERT(__kmp_affinity_type == affinity_none);
903 
904         //
905         // Get an upper bound on the number of threads per package using
906         // cpuid(1).
907         //
        // On some OS/chip combinations where HT is supported by the chip
909         // but is disabled, this value will be 2 on a single core chip.
910         // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
911         //
912         __kmp_x86_cpuid(1, 0, &buf);
913         int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
914         if (maxThreadsPerPkg == 0) {
915             maxThreadsPerPkg = 1;
916         }
917 
918         //
919         // The num cores per pkg comes from cpuid(4).
920         // 1 must be added to the encoded value.
921         //
        // The author of cpu_count.cpp treated this as only an upper bound
923         // on the number of cores, but I haven't seen any cases where it
924         // was greater than the actual number of cores, so we will treat
925         // it as exact in this block of code.
926         //
927         // First, we need to check if cpuid(4) is supported on this chip.
928         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
929         // has the value n or greater.
930         //
931         __kmp_x86_cpuid(0, 0, &buf);
932         if (buf.eax >= 4) {
933             __kmp_x86_cpuid(4, 0, &buf);
934             nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
935         }
936         else {
937             nCoresPerPkg = 1;
938         }
939 
940         //
941         // There is no way to reliably tell if HT is enabled without issuing
        // the cpuid instruction from every thread, and correlating the cpuid
943         // info, so if the machine is not affinity capable, we assume that HT
944         // is off.  We have seen quite a few machines where maxThreadsPerPkg
945         // is 2, yet the machine does not support HT.
946         //
947         // - Older OSes are usually found on machines with older chips, which
948         //   do not support HT.
949         //
950         // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly
        //   set to 0) is greater than the penalty for mistakenly identifying
953         //   a machine as being 1 thread/core when it is really HT enabled
954         //   (which results in blocktime being incorrectly set to a positive
955         //   value).
956         //
957         __kmp_ncores = __kmp_xproc;
958         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
959         __kmp_nThreadsPerCore = 1;
960         if (__kmp_affinity_verbose) {
961             KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
962             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
963             if (__kmp_affinity_uniform_topology()) {
964                 KMP_INFORM(Uniform, "KMP_AFFINITY");
965             } else {
966                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
967             }
968             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
969               __kmp_nThreadsPerCore, __kmp_ncores);
970         }
971         return 0;
972     }
973 
    //
976     // From here on, we can assume that it is safe to call
977     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
978     // even if __kmp_affinity_type = affinity_none.
979     //
980 
981     //
982     // Save the affinity mask for the current thread.
983     //
984     kmp_affin_mask_t *oldMask;
985     KMP_CPU_ALLOC(oldMask);
986     KMP_ASSERT(oldMask != NULL);
987     __kmp_get_system_affinity(oldMask, TRUE);
988 
989     //
990     // Run through each of the available contexts, binding the current thread
991     // to it, and obtaining the pertinent information using the cpuid instr.
992     //
993     // The relevant information is:
994     //
995     // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //    has a unique Apic Id, which is of the form pkg# : core# : thread#.
997     //
998     // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
999     //    value of this field determines the width of the core# + thread#
1000     //    fields in the Apic Id.  It is also an upper bound on the number
1001     //    of threads per package, but it has been verified that situations
    //    happen where it is not exact.  In particular, on certain OS/chip
    //    combinations where Intel(R) Hyper-Threading Technology is supported
    //    by the chip but has been disabled, the value of this field will be
    //    2 (for a single core chip).  On other OS/chip combinations
    //    supporting Intel(R) Hyper-Threading Technology, the value of this
    //    field will be 1 when Intel(R) Hyper-Threading Technology is
    //    disabled and 2 when it is enabled.
1010     //
1011     // Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4).  The
1012     //    value of this field (+1) determines the width of the core# field in
1013     //    the Apic Id.  The comments in "cpucount.cpp" say that this value is
1014     //    an upper bound, but the IA-32 architecture manual says that it is
1015     //    exactly the number of cores per package, and I haven't seen any
1016     //    case where it wasn't.
1017     //
1018     // From this information, deduce the package Id, core Id, and thread Id,
1019     // and set the corresponding fields in the apicThreadInfo struct.
1020     //
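    // A worked example with hypothetical values: suppose cpuid(1) reports
    // apicId == 0x35 and maxThreadsPerPkg == 16, and cpuid(4) reports
    // maxCoresPerPkg == 8.  Then widthCT == __kmp_cpuid_mask_width(16) == 4,
    // so pkgId == 0x35 >> 4 == 3; widthC == __kmp_cpuid_mask_width(8) == 3
    // and widthT == 4 - 3 == 1, so coreId == (0x35 >> 1) & 0x7 == 2 and
    // threadId == 0x35 & 0x1 == 1.
    //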
1021     unsigned i;
1022     apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
1023       __kmp_avail_proc * sizeof(apicThreadInfo));
1024     unsigned nApics = 0;
1025     KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
1026         //
1027         // Skip this proc if it is not included in the machine model.
1028         //
1029         if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
1030             continue;
1031         }
1032         KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
1033 
1034         __kmp_affinity_bind_thread(i);
1035         threadInfo[nApics].osId = i;
1036 
1037         //
1038         // The apic id and max threads per pkg come from cpuid(1).
1039         //
1040         __kmp_x86_cpuid(1, 0, &buf);
        if (!((buf.edx >> 9) & 1)) {
1042             __kmp_set_system_affinity(oldMask, TRUE);
1043             __kmp_free(threadInfo);
1044             KMP_CPU_FREE(oldMask);
1045             *msg_id = kmp_i18n_str_ApicNotPresent;
1046             return -1;
1047         }
1048         threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
1049         threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
1050         if (threadInfo[nApics].maxThreadsPerPkg == 0) {
1051             threadInfo[nApics].maxThreadsPerPkg = 1;
1052         }
1053 
1054         //
1055         // Max cores per pkg comes from cpuid(4).
1056         // 1 must be added to the encoded value.
1057         //
1058         // First, we need to check if cpuid(4) is supported on this chip.
1059         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
1060         // has the value n or greater.
1061         //
1062         __kmp_x86_cpuid(0, 0, &buf);
1063         if (buf.eax >= 4) {
1064             __kmp_x86_cpuid(4, 0, &buf);
1065             threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1066         }
1067         else {
1068             threadInfo[nApics].maxCoresPerPkg = 1;
1069         }
1070 
1071         //
1072         // Infer the pkgId / coreId / threadId using only the info
1073         // obtained locally.
1074         //
1075         int widthCT = __kmp_cpuid_mask_width(
1076           threadInfo[nApics].maxThreadsPerPkg);
1077         threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
1078 
1079         int widthC = __kmp_cpuid_mask_width(
1080           threadInfo[nApics].maxCoresPerPkg);
1081         int widthT = widthCT - widthC;
1082         if (widthT < 0) {
1083             //
1084             // I've never seen this one happen, but I suppose it could, if
1085             // the cpuid instruction on a chip was really screwed up.
1086             // Make sure to restore the affinity mask before the tail call.
1087             //
1088             __kmp_set_system_affinity(oldMask, TRUE);
1089             __kmp_free(threadInfo);
1090             KMP_CPU_FREE(oldMask);
1091             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1092             return -1;
1093         }
1094 
        int maskC = (1 << widthC) - 1;
        threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
          & maskC;

        int maskT = (1 << widthT) - 1;
        threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;
1101 
1102         nApics++;
1103     }
1104 
1105     //
1106     // We've collected all the info we need.
1107     // Restore the old affinity mask for this thread.
1108     //
1109     __kmp_set_system_affinity(oldMask, TRUE);
1110 
1111     //
1112     // If there's only one thread context to bind to, form an Address object
1113     // with depth 1 and return immediately (or, if affinity is off, set
1114     // address2os to NULL and return).
1115     //
1116     // If it is configured to omit the package level when there is only a
1117     // single package, the logic at the end of this routine won't work if
1118     // there is only a single thread - it would try to form an Address
1119     // object with depth 0.
1120     //
1121     KMP_ASSERT(nApics > 0);
1122     if (nApics == 1) {
1123         __kmp_ncores = nPackages = 1;
1124         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1125         if (__kmp_affinity_verbose) {
1126             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1127             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1128 
1129             KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1130             if (__kmp_affinity_respect_mask) {
1131                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1132             } else {
1133                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1134             }
1135             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1136             KMP_INFORM(Uniform, "KMP_AFFINITY");
1137             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1138               __kmp_nThreadsPerCore, __kmp_ncores);
1139         }
1140 
1141         if (__kmp_affinity_type == affinity_none) {
1142             __kmp_free(threadInfo);
1143             KMP_CPU_FREE(oldMask);
1144             return 0;
1145         }
1146 
1147         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
1148         Address addr(1);
1149         addr.labels[0] = threadInfo[0].pkgId;
1150         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
1151 
1152         if (__kmp_affinity_gran_levels < 0) {
1153             __kmp_affinity_gran_levels = 0;
1154         }
1155 
1156         if (__kmp_affinity_verbose) {
1157             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1158         }
1159 
1160         __kmp_free(threadInfo);
1161         KMP_CPU_FREE(oldMask);
1162         return 1;
1163     }
1164 
1165     //
1166     // Sort the threadInfo table by physical Id.
1167     //
1168     qsort(threadInfo, nApics, sizeof(*threadInfo),
1169       __kmp_affinity_cmp_apicThreadInfo_phys_id);
1170 
1171     //
1172     // The table is now sorted by pkgId / coreId / threadId, but we really
1173     // don't know the radix of any of the fields.  pkgId's may be sparsely
1174     // assigned among the chips on a system.  Although coreId's are usually
1175     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1176     // [0..threadsPerCore-1], we don't want to make any such assumptions.
1177     //
1178     // For that matter, we don't know what coresPerPkg and threadsPerCore
1179     // (or the total # packages) are at this point - we want to determine
1180     // that now.  We only have an upper bound on the first two figures.
1181     //
1182     // We also perform a consistency check at this point: the values returned
1183     // by the cpuid instruction for any thread bound to a given package had
1184     // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1185     //
1186     nPackages = 1;
1187     nCoresPerPkg = 1;
1188     __kmp_nThreadsPerCore = 1;
1189     unsigned nCores = 1;
1190 
1191     unsigned pkgCt = 1;                         // to determine radii
1192     unsigned lastPkgId = threadInfo[0].pkgId;
1193     unsigned coreCt = 1;
1194     unsigned lastCoreId = threadInfo[0].coreId;
1195     unsigned threadCt = 1;
1196     unsigned lastThreadId = threadInfo[0].threadId;
1197 
                                                // intra-pkg consistency checks
1199     unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1200     unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1201 
1202     for (i = 1; i < nApics; i++) {
1203         if (threadInfo[i].pkgId != lastPkgId) {
1204             nCores++;
1205             pkgCt++;
1206             lastPkgId = threadInfo[i].pkgId;
1207             if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1208             coreCt = 1;
1209             lastCoreId = threadInfo[i].coreId;
1210             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1211             threadCt = 1;
1212             lastThreadId = threadInfo[i].threadId;
1213 
1214             //
1215             // This is a different package, so go on to the next iteration
1216             // without doing any consistency checks.  Reset the consistency
1217             // check vars, though.
1218             //
1219             prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1220             prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1221             continue;
1222         }
1223 
1224         if (threadInfo[i].coreId != lastCoreId) {
1225             nCores++;
1226             coreCt++;
1227             lastCoreId = threadInfo[i].coreId;
1228             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1229             threadCt = 1;
1230             lastThreadId = threadInfo[i].threadId;
1231         }
1232         else if (threadInfo[i].threadId != lastThreadId) {
1233             threadCt++;
1234             lastThreadId = threadInfo[i].threadId;
1235         }
1236         else {
1237             __kmp_free(threadInfo);
1238             KMP_CPU_FREE(oldMask);
1239             *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1240             return -1;
1241         }
1242 
1243         //
1244         // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
        // fields agree between all the threads bound to a given package.
1246         //
1247         if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
1248           || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1249             __kmp_free(threadInfo);
1250             KMP_CPU_FREE(oldMask);
1251             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1252             return -1;
1253         }
1254     }
1255     nPackages = pkgCt;
1256     if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1257     if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1258 
1259     //
1260     // When affinity is off, this routine will still be called to set
1261     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1262     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1263     // correctly, and return now if affinity is not enabled.
1264     //
1265     __kmp_ncores = nCores;
1266     if (__kmp_affinity_verbose) {
1267         char buf[KMP_AFFIN_MASK_PRINT_LEN];
1268         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1269 
1270         KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1271         if (__kmp_affinity_respect_mask) {
1272             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1273         } else {
1274             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1275         }
1276         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1277         if (__kmp_affinity_uniform_topology()) {
1278             KMP_INFORM(Uniform, "KMP_AFFINITY");
1279         } else {
1280             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1281         }
1282         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1283           __kmp_nThreadsPerCore, __kmp_ncores);
1284 
1285     }
1286     KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
1287     KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
1288     __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
1289     for (i = 0; i < nApics; ++i) {
1290         __kmp_pu_os_idx[i] = threadInfo[i].osId;
1291     }
1292     if (__kmp_affinity_type == affinity_none) {
1293         __kmp_free(threadInfo);
1294         KMP_CPU_FREE(oldMask);
1295         return 0;
1296     }
1297 
1298     //
1299     // Now that we've determined the number of packages, the number of cores
1300     // per package, and the number of threads per core, we can construct the
1301     // data structure that is to be returned.
1302     //
1303     int pkgLevel = 0;
1304     int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1305     int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1306     unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
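    // For example (illustrative): on a 2-package machine with several cores
    // per package and 1 thread context per core, coreLevel == 1 and
    // threadLevel == -1, giving depth == 2 and Address labels of the form
    // {pkgId, coreId}.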
1307 
1308     KMP_ASSERT(depth > 0);
1309     *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1310 
1311     for (i = 0; i < nApics; ++i) {
1312         Address addr(depth);
1313         unsigned os = threadInfo[i].osId;
1314         int d = 0;
1315 
1316         if (pkgLevel >= 0) {
1317             addr.labels[d++] = threadInfo[i].pkgId;
1318         }
1319         if (coreLevel >= 0) {
1320             addr.labels[d++] = threadInfo[i].coreId;
1321         }
1322         if (threadLevel >= 0) {
1323             addr.labels[d++] = threadInfo[i].threadId;
1324         }
1325         (*address2os)[i] = AddrUnsPair(addr, os);
1326     }
1327 
1328     if (__kmp_affinity_gran_levels < 0) {
1329         //
1330         // Set the granularity level based on what levels are modeled
1331         // in the machine topology map.
1332         //
1333         __kmp_affinity_gran_levels = 0;
1334         if ((threadLevel >= 0)
1335           && (__kmp_affinity_gran > affinity_gran_thread)) {
1336             __kmp_affinity_gran_levels++;
1337         }
1338         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1339             __kmp_affinity_gran_levels++;
1340         }
1341         if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1342             __kmp_affinity_gran_levels++;
1343         }
1344     }
1345 
1346     if (__kmp_affinity_verbose) {
1347         __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1348           coreLevel, threadLevel);
1349     }
1350 
1351     __kmp_free(threadInfo);
1352     KMP_CPU_FREE(oldMask);
1353     return depth;
1354 }
1355 
1356 
1357 //
1358 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1359 // architectures support a newer interface for specifying the x2APIC Ids,
1360 // based on cpuid leaf 11.
1361 //
1362 static int
1363 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1364   kmp_i18n_id_t *const msg_id)
1365 {
1366     kmp_cpuid buf;
1367 
1368     *address2os = NULL;
1369     *msg_id = kmp_i18n_null;
1370 
1371     //
1372     // Check to see if cpuid leaf 11 is supported.
1373     //
1374     __kmp_x86_cpuid(0, 0, &buf);
1375     if (buf.eax < 11) {
1376         *msg_id = kmp_i18n_str_NoLeaf11Support;
1377         return -1;
1378     }
1379     __kmp_x86_cpuid(11, 0, &buf);
1380     if (buf.ebx == 0) {
1381         *msg_id = kmp_i18n_str_NoLeaf11Support;
1382         return -1;
1383     }
1384 
1385     //
1386     // Find the number of levels in the machine topology.  While we're at it,
1387     // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.  We will
1388     // try to get more accurate values later by explicitly counting them,
1389     // but get reasonable defaults now, in case we return early.
1390     //
1391     int level;
1392     int threadLevel = -1;
1393     int coreLevel = -1;
1394     int pkgLevel = -1;
1395     __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1396 
1397     for (level = 0;; level++) {
1398         if (level > 31) {
1399             //
1400             // FIXME: Hack for DPD200163180
1401             //
1402             // If level is big then something went wrong -> exiting
1403             //
1404             // There could actually be 32 valid levels in the machine topology,
1405             // but so far, the only machine we have seen which does not exit
1406             // this loop before iteration 32 has fubar x2APIC settings.
1407             //
1408             // For now, just reject this case based upon loop trip count.
1409             //
1410             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1411             return -1;
1412         }
1413         __kmp_x86_cpuid(11, level, &buf);
1414         if (buf.ebx == 0) {
1415             if (pkgLevel < 0) {
1416                 //
1417                 // Will infer nPackages from __kmp_xproc
1418                 //
1419                 pkgLevel = level;
1420                 level++;
1421             }
1422             break;
1423         }
1424         int kind = (buf.ecx >> 8) & 0xff;
1425         if (kind == 1) {
1426             //
1427             // SMT level
1428             //
1429             threadLevel = level;
1430             coreLevel = -1;
1431             pkgLevel = -1;
1432             __kmp_nThreadsPerCore = buf.ebx & 0xff;
1433             if (__kmp_nThreadsPerCore == 0) {
1434                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1435                 return -1;
1436             }
1437         }
1438         else if (kind == 2) {
1439             //
1440             // core level
1441             //
1442             coreLevel = level;
1443             pkgLevel = -1;
1444             nCoresPerPkg = buf.ebx & 0xff;
1445             if (nCoresPerPkg == 0) {
1446                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1447                 return -1;
1448             }
1449         }
1450         else {
1451             if (level <= 0) {
1452                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1453                 return -1;
1454             }
1455             if (pkgLevel >= 0) {
1456                 continue;
1457             }
1458             pkgLevel = level;
1459             nPackages = buf.ebx & 0xff;
1460             if (nPackages == 0) {
1461                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1462                 return -1;
1463             }
1464         }
1465     }
1466     int depth = level;
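
    //
    // As a hypothetical example, on a 2-socket machine with 2-way SMT the
    // loop above would see sub-leaf 0 with kind 1 (SMT) and sub-leaf 1 with
    // kind 2 (core), then stop at sub-leaf 2 where ebx == 0, inferring the
    // package level there - giving depth == 3 with threadLevel == 0,
    // coreLevel == 1, and pkgLevel == 2.
    //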
1467 
1468     //
1469     // In the above loop, "level" was counted from the finest level (usually
1470     // thread) to the coarsest.  The caller expects that we will place the
1471     // labels in (*address2os)[].first.labels[] in the inverse order, so
1472     // we need to invert the vars saying which level means what.
1473     //
1474     if (threadLevel >= 0) {
1475         threadLevel = depth - threadLevel - 1;
1476     }
1477     if (coreLevel >= 0) {
1478         coreLevel = depth - coreLevel - 1;
1479     }
1480     KMP_DEBUG_ASSERT(pkgLevel >= 0);
1481     pkgLevel = depth - pkgLevel - 1;
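
    //
    // Continuing the hypothetical example above, with depth == 3 the
    // inversion yields pkgLevel == 0, coreLevel == 1, and threadLevel == 2,
    // matching the coarsest-to-finest order of the labels.
    //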
1482 
1483     //
1484     // The algorithm used starts by setting the affinity to each available
1485     // thread and retrieving info from the cpuid instruction, so if we are
1486     // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
1488     // the defaults that we calculated from issuing cpuid without binding
1489     // to each proc.
1490     //
1491     if (! KMP_AFFINITY_CAPABLE())
1492     {
1493         //
1494         // Hack to try and infer the machine topology using only the data
1495         // available from cpuid on the current thread, and __kmp_xproc.
1496         //
1497         KMP_ASSERT(__kmp_affinity_type == affinity_none);
1498 
1499         __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1500         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
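        // As a hypothetical example, with __kmp_xproc == 12,
        // __kmp_nThreadsPerCore == 2, and nCoresPerPkg == 8, this estimates
        // __kmp_ncores == 6 and nPackages == 2 (the division rounds up).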
1501         if (__kmp_affinity_verbose) {
1502             KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1503             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1504             if (__kmp_affinity_uniform_topology()) {
1505                 KMP_INFORM(Uniform, "KMP_AFFINITY");
1506             } else {
1507                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1508             }
1509             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1510               __kmp_nThreadsPerCore, __kmp_ncores);
1511         }
1512         return 0;
1513     }
1514 
1515     //
1516     //
1517     // From here on, we can assume that it is safe to call
1518     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1519     // even if __kmp_affinity_type = affinity_none.
1520     //
1521 
1522     //
1523     // Save the affinity mask for the current thread.
1524     //
1525     kmp_affin_mask_t *oldMask;
1526     KMP_CPU_ALLOC(oldMask);
1527     __kmp_get_system_affinity(oldMask, TRUE);
1528 
1529     //
1530     // Allocate the data structure to be returned.
1531     //
1532     AddrUnsPair *retval = (AddrUnsPair *)
1533       __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1534 
1535     //
1536     // Run through each of the available contexts, binding the current thread
1537     // to it, and obtaining the pertinent information using the cpuid instr.
1538     //
1539     unsigned int proc;
1540     int nApics = 0;
1541     KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
1542         //
1543         // Skip this proc if it is not included in the machine model.
1544         //
1545         if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
1546             continue;
1547         }
1548         KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1549 
1550         __kmp_affinity_bind_thread(proc);
1551 
1552         //
        // Extract the labels for each level in the machine topology map
        // from the APIC ID.
1555         //
1556         Address addr(depth);
1557         int prev_shift = 0;
1558 
1559         for (level = 0; level < depth; level++) {
1560             __kmp_x86_cpuid(11, level, &buf);
1561             unsigned apicId = buf.edx;
1562             if (buf.ebx == 0) {
1563                 if (level != depth - 1) {
1564                     KMP_CPU_FREE(oldMask);
1565                     *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1566                     return -1;
1567                 }
1568                 addr.labels[depth - level - 1] = apicId >> prev_shift;
1569                 level++;
1570                 break;
1571             }
1572             int shift = buf.eax & 0x1f;
1573             int mask = (1 << shift) - 1;
1574             addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1575             prev_shift = shift;
1576         }
1577         if (level != depth) {
1578             KMP_CPU_FREE(oldMask);
1579             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1580             return -1;
1581         }
1582 
1583         retval[nApics] = AddrUnsPair(addr, proc);
1584         nApics++;
1585     }
1586 
1587     //
1588     // We've collected all the info we need.
1589     // Restore the old affinity mask for this thread.
1590     //
1591     __kmp_set_system_affinity(oldMask, TRUE);
1592 
1593     //
1594     // If there's only one thread context to bind to, return now.
1595     //
1596     KMP_ASSERT(nApics > 0);
1597     if (nApics == 1) {
1598         __kmp_ncores = nPackages = 1;
1599         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1600         if (__kmp_affinity_verbose) {
1601             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1602             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1603 
1604             KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1605             if (__kmp_affinity_respect_mask) {
1606                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1607             } else {
1608                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1609             }
1610             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1611             KMP_INFORM(Uniform, "KMP_AFFINITY");
1612             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1613               __kmp_nThreadsPerCore, __kmp_ncores);
1614         }
1615 
1616         if (__kmp_affinity_type == affinity_none) {
1617             __kmp_free(retval);
1618             KMP_CPU_FREE(oldMask);
1619             return 0;
1620         }
1621 
1622         //
1623         // Form an Address object which only includes the package level.
1624         //
1625         Address addr(1);
1626         addr.labels[0] = retval[0].first.labels[pkgLevel];
1627         retval[0].first = addr;
1628 
1629         if (__kmp_affinity_gran_levels < 0) {
1630             __kmp_affinity_gran_levels = 0;
1631         }
1632 
1633         if (__kmp_affinity_verbose) {
1634             __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1635         }
1636 
1637         *address2os = retval;
1638         KMP_CPU_FREE(oldMask);
1639         return 1;
1640     }
1641 
1642     //
1643     // Sort the table by physical Id.
1644     //
1645     qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1646 
1647     //
1648     // Find the radix at each of the levels.
1649     //
1650     unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1651     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1652     unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1653     unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1654     for (level = 0; level < depth; level++) {
1655         totals[level] = 1;
1656         maxCt[level] = 1;
1657         counts[level] = 1;
1658         last[level] = retval[0].first.labels[level];
1659     }
1660 
1661     //
1662     // From here on, the iteration variable "level" runs from the finest
1663     // level to the coarsest, i.e. we iterate forward through
1664     // (*address2os)[].first.labels[] - in the previous loops, we iterated
1665     // backwards.
1666     //
1667     for (proc = 1; (int)proc < nApics; proc++) {
1668         int level;
1669         for (level = 0; level < depth; level++) {
1670             if (retval[proc].first.labels[level] != last[level]) {
1671                 int j;
1672                 for (j = level + 1; j < depth; j++) {
1673                     totals[j]++;
1674                     counts[j] = 1;
                    // The line below caused incorrect topology information
                    // to be printed when the max value for some level
                    // (maxCt[level]) was encountered in the array earlier
                    // than a smaller value. For example, if pkg0 has 4 cores
                    // and pkg1 has 2 cores, then maxCt[1] would end up as 2,
                    // whereas it must be 4.
                    // TODO!!! Check if it can be commented safely
                    //maxCt[j] = 1;
1682                     last[j] = retval[proc].first.labels[j];
1683                 }
1684                 totals[level]++;
1685                 counts[level]++;
1686                 if (counts[level] > maxCt[level]) {
1687                     maxCt[level] = counts[level];
1688                 }
1689                 last[level] = retval[proc].first.labels[level];
1690                 break;
1691             }
1692             else if (level == depth - 1) {
1693                 __kmp_free(last);
1694                 __kmp_free(maxCt);
1695                 __kmp_free(counts);
1696                 __kmp_free(totals);
1697                 __kmp_free(retval);
1698                 KMP_CPU_FREE(oldMask);
1699                 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1700                 return -1;
1701             }
1702         }
1703     }
1704 
1705     //
1706     // When affinity is off, this routine will still be called to set
1707     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1708     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1709     // correctly, and return if affinity is not enabled.
1710     //
1711     if (threadLevel >= 0) {
1712         __kmp_nThreadsPerCore = maxCt[threadLevel];
1713     }
1714     else {
1715         __kmp_nThreadsPerCore = 1;
1716     }
1717     nPackages = totals[pkgLevel];
1718 
1719     if (coreLevel >= 0) {
1720         __kmp_ncores = totals[coreLevel];
1721         nCoresPerPkg = maxCt[coreLevel];
1722     }
1723     else {
1724         __kmp_ncores = nPackages;
1725         nCoresPerPkg = 1;
1726     }
1727 
1728     //
1729     // Check to see if the machine topology is uniform
1730     //
1731     unsigned prod = maxCt[0];
1732     for (level = 1; level < depth; level++) {
1733        prod *= maxCt[level];
1734     }
1735     bool uniform = (prod == totals[level - 1]);
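    // For the hypothetical 2 x 2 x 2 machine above, prod == 8 and
    // totals[level - 1] (i.e. totals[depth - 1], the total thread count) is
    // also 8, so the topology is uniform.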
1736 
1737     //
1738     // Print the machine topology summary.
1739     //
1740     if (__kmp_affinity_verbose) {
1741         char mask[KMP_AFFIN_MASK_PRINT_LEN];
1742         __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1743 
1744         KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1745         if (__kmp_affinity_respect_mask) {
1746             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1747         } else {
1748             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1749         }
1750         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1751         if (uniform) {
1752             KMP_INFORM(Uniform, "KMP_AFFINITY");
1753         } else {
1754             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1755         }
1756 
1757         kmp_str_buf_t buf;
1758         __kmp_str_buf_init(&buf);
1759 
1760         __kmp_str_buf_print(&buf, "%d", totals[0]);
1761         for (level = 1; level <= pkgLevel; level++) {
1762             __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1763         }
1764         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1765           __kmp_nThreadsPerCore, __kmp_ncores);
1766 
1767         __kmp_str_buf_free(&buf);
1768     }
1769     KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
1770     KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
1771     __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
1772     for (proc = 0; (int)proc < nApics; ++proc) {
1773         __kmp_pu_os_idx[proc] = retval[proc].second;
1774     }
1775     if (__kmp_affinity_type == affinity_none) {
1776         __kmp_free(last);
1777         __kmp_free(maxCt);
1778         __kmp_free(counts);
1779         __kmp_free(totals);
1780         __kmp_free(retval);
1781         KMP_CPU_FREE(oldMask);
1782         return 0;
1783     }
1784 
1785     //
    // Find any levels with radix 1, and remove them from the map
    // (except for the package level).
1788     //
1789     int new_depth = 0;
1790     for (level = 0; level < depth; level++) {
1791         if ((maxCt[level] == 1) && (level != pkgLevel)) {
1792            continue;
1793         }
1794         new_depth++;
1795     }
1796 
1797     //
1798     // If we are removing any levels, allocate a new vector to return,
1799     // and copy the relevant information to it.
1800     //
1801     if (new_depth != depth) {
1802         AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1803           sizeof(AddrUnsPair) * nApics);
1804         for (proc = 0; (int)proc < nApics; proc++) {
1805             Address addr(new_depth);
1806             new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1807         }
1808         int new_level = 0;
1809         int newPkgLevel = -1;
1810         int newCoreLevel = -1;
1811         int newThreadLevel = -1;
1812         int i;
1813         for (level = 0; level < depth; level++) {
1814             if ((maxCt[level] == 1)
1815               && (level != pkgLevel)) {
1816                 //
                // Remove this level, but never remove the package level.
1818                 //
1819                 continue;
1820             }
1821             if (level == pkgLevel) {
1822                 newPkgLevel = level;
1823             }
1824             if (level == coreLevel) {
1825                 newCoreLevel = level;
1826             }
1827             if (level == threadLevel) {
1828                 newThreadLevel = level;
1829             }
1830             for (proc = 0; (int)proc < nApics; proc++) {
1831                 new_retval[proc].first.labels[new_level]
1832                   = retval[proc].first.labels[level];
1833             }
1834             new_level++;
1835         }
1836 
1837         __kmp_free(retval);
1838         retval = new_retval;
1839         depth = new_depth;
1840         pkgLevel = newPkgLevel;
1841         coreLevel = newCoreLevel;
1842         threadLevel = newThreadLevel;
1843     }
1844 
1845     if (__kmp_affinity_gran_levels < 0) {
1846         //
1847         // Set the granularity level based on what levels are modeled
1848         // in the machine topology map.
1849         //
1850         __kmp_affinity_gran_levels = 0;
1851         if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1852             __kmp_affinity_gran_levels++;
1853         }
1854         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1855             __kmp_affinity_gran_levels++;
1856         }
1857         if (__kmp_affinity_gran > affinity_gran_package) {
1858             __kmp_affinity_gran_levels++;
1859         }
1860     }
1861 
1862     if (__kmp_affinity_verbose) {
1863         __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1864           coreLevel, threadLevel);
1865     }
1866 
1867     __kmp_free(last);
1868     __kmp_free(maxCt);
1869     __kmp_free(counts);
1870     __kmp_free(totals);
1871     KMP_CPU_FREE(oldMask);
1872     *address2os = retval;
1873     return depth;
1874 }
1875 
1876 
1877 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1878 
1879 
1880 #define osIdIndex       0
1881 #define threadIdIndex   1
1882 #define coreIdIndex     2
1883 #define pkgIdIndex      3
1884 #define nodeIdIndex     4
1885 
1886 typedef unsigned *ProcCpuInfo;
1887 static unsigned maxIndex = pkgIdIndex;
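
//
// For reference, a /proc/cpuinfo record looks roughly like the following
// (illustrative; the exact set of fields varies by platform):
//
//     processor       : 0
//     physical id     : 0
//     core id         : 0
//
// A blank line terminates each record; the parsers below key off the
// leading token of each line.
//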
1888 
1889 
1890 static int
1891 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1892 {
1893     const unsigned *aa = (const unsigned *)a;
1894     const unsigned *bb = (const unsigned *)b;
1895     if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1896     if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1897     return 0;
1898 };
1899 
1900 
1901 static int
1902 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1903 {
1904     unsigned i;
1905     const unsigned *aa = *((const unsigned **)a);
1906     const unsigned *bb = *((const unsigned **)b);
1907     for (i = maxIndex; ; i--) {
1908         if (aa[i] < bb[i]) return -1;
1909         if (aa[i] > bb[i]) return 1;
1910         if (i == osIdIndex) break;
1911     }
1912     return 0;
1913 }
1914 
1915 
1916 //
1917 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1918 // affinity map.
1919 //
1920 static int
1921 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1922   kmp_i18n_id_t *const msg_id, FILE *f)
1923 {
1924     *address2os = NULL;
1925     *msg_id = kmp_i18n_null;
1926 
1927     //
    // Scan the file, counting the number of "processor" (osId) fields,
    // and find the highest value of <n> for any node_<n> field.
1930     //
1931     char buf[256];
1932     unsigned num_records = 0;
1933     while (! feof(f)) {
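        // Plant a non-zero sentinel in the last byte of buf.  fgets()
        // stores a '\0' there only when a line fills the entire buffer;
        // the second scan below uses the same trick to detect over-long
        // lines.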
1934         buf[sizeof(buf) - 1] = 1;
1935         if (! fgets(buf, sizeof(buf), f)) {
1936             //
1937             // Read errors presumably because of EOF
1938             //
1939             break;
1940         }
1941 
1942         char s1[] = "processor";
1943         if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1944             num_records++;
1945             continue;
1946         }
1947 
1948         //
1949         // FIXME - this will match "node_<n> <garbage>"
1950         //
1951         unsigned level;
1952         if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
1953             if (nodeIdIndex + level >= maxIndex) {
1954                 maxIndex = nodeIdIndex + level;
1955             }
1956             continue;
1957         }
1958     }
1959 
1960     //
1961     // Check for empty file / no valid processor records, or too many.
1962     // The number of records can't exceed the number of valid bits in the
1963     // affinity mask.
1964     //
1965     if (num_records == 0) {
1966         *line = 0;
1967         *msg_id = kmp_i18n_str_NoProcRecords;
1968         return -1;
1969     }
1970     if (num_records > (unsigned)__kmp_xproc) {
1971         *line = 0;
1972         *msg_id = kmp_i18n_str_TooManyProcRecords;
1973         return -1;
1974     }
1975 
1976     //
    // Set the file pointer back to the beginning, so that we can scan the
    // file again, this time performing a full parse of the data.
    // Allocate a vector of ProcCpuInfo objects, where we will place the data.
    // Adding an extra element at the end allows us to remove a lot of extra
    // checks for termination conditions.
1982     //
1983     if (fseek(f, 0, SEEK_SET) != 0) {
1984         *line = 0;
1985         *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1986         return -1;
1987     }
1988 
1989     //
1990     // Allocate the array of records to store the proc info in.  The dummy
1991     // element at the end makes the logic in filling them out easier to code.
1992     //
1993     unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1994       * sizeof(unsigned *));
1995     unsigned i;
1996     for (i = 0; i <= num_records; i++) {
1997         threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1998           * sizeof(unsigned));
1999     }
2000 
2001 #define CLEANUP_THREAD_INFO \
2002     for (i = 0; i <= num_records; i++) {                                \
2003         __kmp_free(threadInfo[i]);                                      \
2004     }                                                                   \
2005     __kmp_free(threadInfo);
2006 
2007     //
2008     // A value of UINT_MAX means that we didn't find the field
2009     //
2010     unsigned __index;
2011 
2012 #define INIT_PROC_INFO(p) \
2013     for (__index = 0; __index <= maxIndex; __index++) {                 \
2014         (p)[__index] = UINT_MAX;                                        \
2015     }
2016 
2017     for (i = 0; i <= num_records; i++) {
2018         INIT_PROC_INFO(threadInfo[i]);
2019     }
2020 
2021     unsigned num_avail = 0;
2022     *line = 0;
2023     while (! feof(f)) {
2024         //
2025         // Create an inner scoping level, so that all the goto targets at the
2026         // end of the loop appear in an outer scoping level.  This avoids
2027         // warnings about jumping past an initialization to a target in the
2028         // same block.
2029         //
2030         {
2031             buf[sizeof(buf) - 1] = 1;
2032             bool long_line = false;
2033             if (! fgets(buf, sizeof(buf), f)) {
2034                 //
2035                 // Read errors presumably because of EOF
2036                 //
                // If there is valid data in threadInfo[num_avail], then fake
                // a blank line to ensure that the last address gets parsed.
2039                 //
2040                 bool valid = false;
2041                 for (i = 0; i <= maxIndex; i++) {
2042                     if (threadInfo[num_avail][i] != UINT_MAX) {
2043                         valid = true;
2044                     }
2045                 }
2046                 if (! valid) {
2047                     break;
2048                 }
2049                 buf[0] = 0;
2050             } else if (!buf[sizeof(buf) - 1]) {
2051                 //
                // The line is longer than the buffer.  Set a flag, but don't
                // emit an error if we were going to ignore the line anyway.
2054                 //
2055                 long_line = true;
2056 
2057 #define CHECK_LINE \
2058     if (long_line) {                                                    \
2059         CLEANUP_THREAD_INFO;                                            \
2060         *msg_id = kmp_i18n_str_LongLineCpuinfo;                         \
2061         return -1;                                                      \
2062     }
2063             }
2064             (*line)++;
2065 
2066             char s1[] = "processor";
2067             if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2068                 CHECK_LINE;
2069                 char *p = strchr(buf + sizeof(s1) - 1, ':');
2070                 unsigned val;
2071                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2072                 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
2073                 threadInfo[num_avail][osIdIndex] = val;
2074 #if KMP_OS_LINUX && USE_SYSFS_INFO
2075                 char path[256];
2076                 KMP_SNPRINTF(path, sizeof(path),
2077                     "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
2078                     threadInfo[num_avail][osIdIndex]);
2079                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
2080 
2081                 KMP_SNPRINTF(path, sizeof(path),
2082                     "/sys/devices/system/cpu/cpu%u/topology/core_id",
2083                     threadInfo[num_avail][osIdIndex]);
2084                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
2085                 continue;
2086 #else
2087             }
2088             char s2[] = "physical id";
2089             if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
2090                 CHECK_LINE;
2091                 char *p = strchr(buf + sizeof(s2) - 1, ':');
2092                 unsigned val;
2093                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2094                 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
2095                 threadInfo[num_avail][pkgIdIndex] = val;
2096                 continue;
2097             }
2098             char s3[] = "core id";
2099             if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2100                 CHECK_LINE;
2101                 char *p = strchr(buf + sizeof(s3) - 1, ':');
2102                 unsigned val;
2103                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2104                 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
2105                 threadInfo[num_avail][coreIdIndex] = val;
2106                 continue;
2107 #endif // KMP_OS_LINUX && USE_SYSFS_INFO
2108             }
2109             char s4[] = "thread id";
2110             if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2111                 CHECK_LINE;
2112                 char *p = strchr(buf + sizeof(s4) - 1, ':');
2113                 unsigned val;
2114                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2115                 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
2116                 threadInfo[num_avail][threadIdIndex] = val;
2117                 continue;
2118             }
2119             unsigned level;
2120             if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
2121                 CHECK_LINE;
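                // Note: sizeof(s4) - 1 is merely a starting offset for the
                // ':' search; it need not match the length of the
                // "node_<n> id" prefix exactly.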
2122                 char *p = strchr(buf + sizeof(s4) - 1, ':');
2123                 unsigned val;
2124                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2125                 KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2126                 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
2127                 threadInfo[num_avail][nodeIdIndex + level] = val;
2128                 continue;
2129             }
2130 
2131             //
2132             // We didn't recognize the leading token on the line.
2133             // There are lots of leading tokens that we don't recognize -
2134             // if the line isn't empty, go on to the next line.
2135             //
2136             if ((*buf != 0) && (*buf != '\n')) {
2137                 //
2138                 // If the line is longer than the buffer, read characters
2139                 // until we find a newline.
2140                 //
2141                 if (long_line) {
2142                     int ch;
2143                     while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
2144                 }
2145                 continue;
2146             }
2147 
2148             //
2149             // A newline has signalled the end of the processor record.
2150             // Check that there aren't too many procs specified.
2151             //
2152             if ((int)num_avail == __kmp_xproc) {
2153                 CLEANUP_THREAD_INFO;
2154                 *msg_id = kmp_i18n_str_TooManyEntries;
2155                 return -1;
2156             }
2157 
2158             //
            // Check for missing fields.  The osId field must be there, and
            // we currently require the physical id field as well.
2161             //
2162             if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2163                 CLEANUP_THREAD_INFO;
2164                 *msg_id = kmp_i18n_str_MissingProcField;
2165                 return -1;
2166             }
2167             if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2168                 CLEANUP_THREAD_INFO;
2169                 *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2170                 return -1;
2171             }
2172 
2173             //
2174             // Skip this proc if it is not included in the machine model.
2175             //
2176             if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], __kmp_affin_fullMask)) {
2177                 INIT_PROC_INFO(threadInfo[num_avail]);
2178                 continue;
2179             }
2180 
2181             //
2182             // We have a successful parse of this proc's info.
2183             // Increment the counter, and prepare for the next proc.
2184             //
2185             num_avail++;
2186             KMP_ASSERT(num_avail <= num_records);
2187             INIT_PROC_INFO(threadInfo[num_avail]);
2188         }
2189         continue;
2190 
2191         no_val:
2192         CLEANUP_THREAD_INFO;
2193         *msg_id = kmp_i18n_str_MissingValCpuinfo;
2194         return -1;
2195 
2196         dup_field:
2197         CLEANUP_THREAD_INFO;
2198         *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2199         return -1;
2200     }
2201     *line = 0;
2202 
2203 # if KMP_MIC && REDUCE_TEAM_SIZE
2204     unsigned teamSize = 0;
2205 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2206 
2207     // check for num_records == __kmp_xproc ???
2208 
2209     //
2210     // If there's only one thread context to bind to, form an Address object
2211     // with depth 1 and return immediately (or, if affinity is off, set
2212     // address2os to NULL and return).
2213     //
2214     // If it is configured to omit the package level when there is only a
2215     // single package, the logic at the end of this routine won't work if
2216     // there is only a single thread - it would try to form an Address
2217     // object with depth 0.
2218     //
2219     KMP_ASSERT(num_avail > 0);
2220     KMP_ASSERT(num_avail <= num_records);
2221     if (num_avail == 1) {
2222         __kmp_ncores = 1;
2223         __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2224         if (__kmp_affinity_verbose) {
2225             if (! KMP_AFFINITY_CAPABLE()) {
2226                 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2227                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2228                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2229             }
2230             else {
2231                 char buf[KMP_AFFIN_MASK_PRINT_LEN];
2232                 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2233                   __kmp_affin_fullMask);
2234                 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2235                 if (__kmp_affinity_respect_mask) {
2236                     KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2237                 } else {
2238                     KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2239                 }
2240                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2241                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2242             }
2243             int index;
2244             kmp_str_buf_t buf;
2245             __kmp_str_buf_init(&buf);
2246             __kmp_str_buf_print(&buf, "1");
2247             for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2248                 __kmp_str_buf_print(&buf, " x 1");
2249             }
2250             KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2251             __kmp_str_buf_free(&buf);
2252         }
2253 
2254         if (__kmp_affinity_type == affinity_none) {
2255             CLEANUP_THREAD_INFO;
2256             return 0;
2257         }
2258 
2259         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2260         Address addr(1);
2261         addr.labels[0] = threadInfo[0][pkgIdIndex];
2262         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2263 
2264         if (__kmp_affinity_gran_levels < 0) {
2265             __kmp_affinity_gran_levels = 0;
2266         }
2267 
2268         if (__kmp_affinity_verbose) {
2269             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2270         }
2271 
2272         CLEANUP_THREAD_INFO;
2273         return 1;
2274     }
2275 
2276     //
2277     // Sort the threadInfo table by physical Id.
2278     //
2279     qsort(threadInfo, num_avail, sizeof(*threadInfo),
2280       __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2281 
2282     //
2283     // The table is now sorted by pkgId / coreId / threadId, but we really
2284     // don't know the radix of any of the fields.  pkgId's may be sparsely
2285     // assigned among the chips on a system.  Although coreId's are usually
2286     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2287     // [0..threadsPerCore-1], we don't want to make any such assumptions.
2288     //
2289     // For that matter, we don't know what coresPerPkg and threadsPerCore
2290     // (or the total # packages) are at this point - we want to determine
2291     // that now.  We only have an upper bound on the first two figures.
2292     //
2293     unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2294       * sizeof(unsigned));
2295     unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2296       * sizeof(unsigned));
2297     unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2298       * sizeof(unsigned));
2299     unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2300       * sizeof(unsigned));
2301 
2302     bool assign_thread_ids = false;
2303     unsigned threadIdCt;
2304     unsigned index;
2305 
2306     restart_radix_check:
2307     threadIdCt = 0;
2308 
2309     //
2310     // Initialize the counter arrays with data from threadInfo[0].
2311     //
2312     if (assign_thread_ids) {
2313         if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2314             threadInfo[0][threadIdIndex] = threadIdCt++;
2315         }
2316         else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2317             threadIdCt = threadInfo[0][threadIdIndex] + 1;
2318         }
2319     }
2320     for (index = 0; index <= maxIndex; index++) {
2321         counts[index] = 1;
2322         maxCt[index] = 1;
2323         totals[index] = 1;
        lastId[index] = threadInfo[0][index];
2325     }
2326 
2327     //
2328     // Run through the rest of the OS procs.
2329     //
2330     for (i = 1; i < num_avail; i++) {
2331         //
2332         // Find the most significant index whose id differs
2333         // from the id for the previous OS proc.
2334         //
2335         for (index = maxIndex; index >= threadIdIndex; index--) {
2336             if (assign_thread_ids && (index == threadIdIndex)) {
2337                 //
2338                 // Auto-assign the thread id field if it wasn't specified.
2339                 //
2340                 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2341                     threadInfo[i][threadIdIndex] = threadIdCt++;
2342                 }
2343 
2344                 //
                // Apparently the thread id field was specified for some
2346                 // entries and not others.  Start the thread id counter
2347                 // off at the next higher thread id.
2348                 //
2349                 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2350                     threadIdCt = threadInfo[i][threadIdIndex] + 1;
2351                 }
2352             }
2353             if (threadInfo[i][index] != lastId[index]) {
2354                 //
2355                 // Run through all indices which are less significant,
2356                 // and reset the counts to 1.
2357                 //
2358                 // At all levels up to and including index, we need to
2359                 // increment the totals and record the last id.
2360                 //
2361                 unsigned index2;
2362                 for (index2 = threadIdIndex; index2 < index; index2++) {
2363                     totals[index2]++;
2364                     if (counts[index2] > maxCt[index2]) {
2365                         maxCt[index2] = counts[index2];
2366                     }
2367                     counts[index2] = 1;
2368                     lastId[index2] = threadInfo[i][index2];
2369                 }
2370                 counts[index]++;
2371                 totals[index]++;
2372                 lastId[index] = threadInfo[i][index];
2373 
2374                 if (assign_thread_ids && (index > threadIdIndex)) {
2375 
2376 # if KMP_MIC && REDUCE_TEAM_SIZE
2377                     //
2378                     // The default team size is the total #threads in the machine
2379                     // minus 1 thread for every core that has 3 or more threads.
2380                     //
2381                     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2382 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2383 
2384                     //
2385                     // Restart the thread counter, as we are on a new core.
2386                     //
2387                     threadIdCt = 0;
2388 
2389                     //
2390                     // Auto-assign the thread id field if it wasn't specified.
2391                     //
2392                     if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2393                         threadInfo[i][threadIdIndex] = threadIdCt++;
2394                     }
2395 
2396                     //
                    // Apparently the thread id field was specified for some
2398                     // entries and not others.  Start the thread id counter
2399                     // off at the next higher thread id.
2400                     //
2401                     else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2402                         threadIdCt = threadInfo[i][threadIdIndex] + 1;
2403                     }
2404                 }
2405                 break;
2406             }
2407         }
2408         if (index < threadIdIndex) {
2409             //
2410             // If thread ids were specified, it is an error if they are not
            // unique.  Also, check that we haven't already restarted the
2412             // loop (to be safe - shouldn't need to).
2413             //
2414             if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2415               || assign_thread_ids) {
2416                 __kmp_free(lastId);
2417                 __kmp_free(totals);
2418                 __kmp_free(maxCt);
2419                 __kmp_free(counts);
2420                 CLEANUP_THREAD_INFO;
2421                 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2422                 return -1;
2423             }
2424 
2425             //
            // If the thread ids were not specified and we see entries
            // that are duplicates, start the loop over and
2428             // assign the thread ids manually.
2429             //
2430             assign_thread_ids = true;
2431             goto restart_radix_check;
2432         }
2433     }
2434 
2435 # if KMP_MIC && REDUCE_TEAM_SIZE
2436     //
2437     // The default team size is the total #threads in the machine
2438     // minus 1 thread for every core that has 3 or more threads.
2439     //
2440     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2441 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2442 
2443     for (index = threadIdIndex; index <= maxIndex; index++) {
2444         if (counts[index] > maxCt[index]) {
2445             maxCt[index] = counts[index];
2446         }
2447     }
2448 
2449     __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2450     nCoresPerPkg = maxCt[coreIdIndex];
2451     nPackages = totals[pkgIdIndex];
2452 
2453     //
2454     // Check to see if the machine topology is uniform
2455     //
2456     unsigned prod = totals[maxIndex];
2457     for (index = threadIdIndex; index < maxIndex; index++) {
2458        prod *= maxCt[index];
2459     }
2460     bool uniform = (prod == totals[threadIdIndex]);
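
    //
    // For a hypothetical machine with 2 packages, 4 cores per package, and
    // 2 threads per core (and no node_<n> fields, so maxIndex ==
    // pkgIdIndex): maxCt[threadIdIndex] == 2, maxCt[coreIdIndex] == 4, and
    // totals[pkgIdIndex] == 2, so prod == 2 * 2 * 4 == 16, which matches
    // totals[threadIdIndex] == 16, and the topology is uniform.
    //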
2461 
2462     //
2463     // When affinity is off, this routine will still be called to set
2464     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
2465     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
2466     // correctly, and return now if affinity is not enabled.
2467     //
2468     __kmp_ncores = totals[coreIdIndex];
2469 
    if (__kmp_affinity_verbose) {
        if (! KMP_AFFINITY_CAPABLE()) {
            KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (uniform) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
        }
        else {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
              __kmp_affin_fullMask);
            KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (uniform) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
        }
2496         kmp_str_buf_t buf;
2497         __kmp_str_buf_init(&buf);
2498 
2499         __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2500         for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2501             __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2502         }
2503         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str,  maxCt[coreIdIndex],
2504           maxCt[threadIdIndex], __kmp_ncores);
2505 
2506         __kmp_str_buf_free(&buf);
2507     }
2508 
2509 # if KMP_MIC && REDUCE_TEAM_SIZE
2510     //
2511     // Set the default team size.
2512     //
2513     if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2514         __kmp_dflt_team_nth = teamSize;
2515         KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2516           __kmp_dflt_team_nth));
2517     }
2518 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2519 
2520     KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
2521     KMP_DEBUG_ASSERT(num_avail == __kmp_avail_proc);
2522     __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
2523     for (i = 0; i < num_avail; ++i) { // fill the os indices
2524         __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex];
2525     }
2526 
2527     if (__kmp_affinity_type == affinity_none) {
2528         __kmp_free(lastId);
2529         __kmp_free(totals);
2530         __kmp_free(maxCt);
2531         __kmp_free(counts);
2532         CLEANUP_THREAD_INFO;
2533         return 0;
2534     }
2535 
2536     //
    // Count the number of levels which have more nodes at that level than
    // at the parent's level (with an implicit root node above the top
    // level).  This is equivalent to saying that there is at least one
    // node at this level which has a sibling.  These levels are in the
    // map, and the package level is always in the map.
2542     //
2543     bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2544     int level = 0;
2545     for (index = threadIdIndex; index < maxIndex; index++) {
2546         KMP_ASSERT(totals[index] >= totals[index + 1]);
2547         inMap[index] = (totals[index] > totals[index + 1]);
2548     }
2549     inMap[maxIndex] = (totals[maxIndex] > 1);
2550     inMap[pkgIdIndex] = true;
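
    //
    // For the hypothetical 2 x 4 x 2 machine above, totals would be
    // { 16, 8, 2 } at { threadIdIndex, coreIdIndex, pkgIdIndex }, so every
    // level has a node with a sibling, inMap is true at each index, and
    // depth == 3.
    //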
2551 
2552     int depth = 0;
2553     for (index = threadIdIndex; index <= maxIndex; index++) {
2554         if (inMap[index]) {
2555             depth++;
2556         }
2557     }
2558     KMP_ASSERT(depth > 0);
2559 
2560     //
2561     // Construct the data structure that is to be returned.
2562     //
2563     *address2os = (AddrUnsPair*)
2564       __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2565     int pkgLevel = -1;
2566     int coreLevel = -1;
2567     int threadLevel = -1;
2568 
2569     for (i = 0; i < num_avail; ++i) {
2570         Address addr(depth);
2571         unsigned os = threadInfo[i][osIdIndex];
2572         int src_index;
2573         int dst_index = 0;
2574 
2575         for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2576             if (! inMap[src_index]) {
2577                 continue;
2578             }
2579             addr.labels[dst_index] = threadInfo[i][src_index];
2580             if (src_index == pkgIdIndex) {
2581                 pkgLevel = dst_index;
2582             }
2583             else if (src_index == coreIdIndex) {
2584                 coreLevel = dst_index;
2585             }
2586             else if (src_index == threadIdIndex) {
2587                 threadLevel = dst_index;
2588             }
2589             dst_index++;
2590         }
2591         (*address2os)[i] = AddrUnsPair(addr, os);
2592     }
2593 
2594     if (__kmp_affinity_gran_levels < 0) {
2595         //
2596         // Set the granularity level based on what levels are modeled
2597         // in the machine topology map.
2598         //
2599         unsigned src_index;
2600         __kmp_affinity_gran_levels = 0;
2601         for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2602             if (! inMap[src_index]) {
2603                 continue;
2604             }
2605             switch (src_index) {
2606                 case threadIdIndex:
2607                 if (__kmp_affinity_gran > affinity_gran_thread) {
2608                     __kmp_affinity_gran_levels++;
2609                 }
2610 
2611                 break;
2612                 case coreIdIndex:
2613                 if (__kmp_affinity_gran > affinity_gran_core) {
2614                     __kmp_affinity_gran_levels++;
2615                 }
2616                 break;
2617 
2618                 case pkgIdIndex:
2619                 if (__kmp_affinity_gran > affinity_gran_package) {
2620                     __kmp_affinity_gran_levels++;
2621                 }
2622                 break;
2623             }
2624         }
2625     }
2626 
2627     if (__kmp_affinity_verbose) {
2628         __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2629           coreLevel, threadLevel);
2630     }
2631 
2632     __kmp_free(inMap);
2633     __kmp_free(lastId);
2634     __kmp_free(totals);
2635     __kmp_free(maxCt);
2636     __kmp_free(counts);
2637     CLEANUP_THREAD_INFO;
2638     return depth;
2639 }
2640 
2641 
2642 //
2643 // Create and return a table of affinity masks, indexed by OS thread ID.
2644 // This routine handles OR'ing together all the affinity masks of threads
2645 // that are sufficiently close, if granularity > fine.
2646 //
2647 static kmp_affin_mask_t *
2648 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2649   AddrUnsPair *address2os, unsigned numAddrs)
2650 {
2651     //
2652     // First form a table of affinity masks in order of OS thread id.
2653     //
2654     unsigned depth;
2655     unsigned maxOsId;
2656     unsigned i;
2657 
2658     KMP_ASSERT(numAddrs > 0);
2659     depth = address2os[0].first.depth;
2660 
2661     maxOsId = 0;
2662     for (i = 0; i < numAddrs; i++) {
2663         unsigned osId = address2os[i].second;
2664         if (osId > maxOsId) {
2665             maxOsId = osId;
2666         }
2667     }
2668     kmp_affin_mask_t *osId2Mask;
2669     KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId+1));
2670 
2671     //
2672     // Sort the address2os table according to physical order.  Doing so
2673     // will put all threads on the same core/package/node in consecutive
2674     // locations.
2675     //
2676     qsort(address2os, numAddrs, sizeof(*address2os),
2677       __kmp_affinity_cmp_Address_labels);
2678 
2679     KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2680     if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2681         KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY",  __kmp_affinity_gran_levels);
2682     }
2683     if (__kmp_affinity_gran_levels >= (int)depth) {
2684         if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2685           && (__kmp_affinity_type != affinity_none))) {
2686             KMP_WARNING(AffThreadsMayMigrate);
2687         }
2688     }
2689 
2690     //
2691     // Run through the table, forming the masks for all threads on each
2692     // core.  Threads on the same core will have identical "Address"
2693     // objects, not considering the last level, which must be the thread
2694     // id.  All threads on a core will appear consecutively.
2695     //
2696     unsigned unique = 0;
2697     unsigned j = 0;                             // index of 1st thread on core
2698     unsigned leader = 0;
2699     Address *leaderAddr = &(address2os[0].first);
2700     kmp_affin_mask_t *sum;
2701     KMP_CPU_ALLOC_ON_STACK(sum);
2702     KMP_CPU_ZERO(sum);
2703     KMP_CPU_SET(address2os[0].second, sum);
2704     for (i = 1; i < numAddrs; i++) {
2705         //
2706         // If this thread is sufficiently close to the leader (within the
2707         // granularity setting), then set the bit for this os thread in the
2708         // affinity mask for this group, and go on to the next thread.
2709         //
2710         if (leaderAddr->isClose(address2os[i].first,
2711           __kmp_affinity_gran_levels)) {
2712             KMP_CPU_SET(address2os[i].second, sum);
2713             continue;
2714         }
2715 
2716         //
2717         // For every thread in this group, copy the mask to the thread's
2718         // entry in the osId2Mask table.  Mark the first address as a
2719         // leader.
2720         //
2721         for (; j < i; j++) {
2722             unsigned osId = address2os[j].second;
2723             KMP_DEBUG_ASSERT(osId <= maxOsId);
2724             kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2725             KMP_CPU_COPY(mask, sum);
2726             address2os[j].first.leader = (j == leader);
2727         }
2728         unique++;
2729 
2730         //
2731         // Start a new mask.
2732         //
2733         leader = i;
2734         leaderAddr = &(address2os[i].first);
2735         KMP_CPU_ZERO(sum);
2736         KMP_CPU_SET(address2os[i].second, sum);
2737     }
2738 
2739     //
2740     // For every thread in last group, copy the mask to the thread's
2741     // entry in the osId2Mask table.
2742     //
2743     for (; j < i; j++) {
2744         unsigned osId = address2os[j].second;
2745         KMP_DEBUG_ASSERT(osId <= maxOsId);
2746         kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2747         KMP_CPU_COPY(mask, sum);
2748         address2os[j].first.leader = (j == leader);
2749     }
2750     unique++;
2751     KMP_CPU_FREE_FROM_STACK(sum);
2752 
2753     *maxIndex = maxOsId;
2754     *numUnique = unique;
2755     return osId2Mask;
2756 }
2757 
2758 
2759 //
2760 // Stuff for the affinity proclist parsers.  It's easier to declare these vars
2761 // as file-static than to try and pass them through the calling sequence of
2762 // the recursive-descent OMP_PLACES parser.
2763 //
2764 static kmp_affin_mask_t *newMasks;
2765 static int numNewMasks;
2766 static int nextNewMask;
2767 
2768 #define ADD_MASK(_mask) \
2769     {                                                                   \
2770         if (nextNewMask >= numNewMasks) {                               \
2771             int i;                                                      \
2772             numNewMasks *= 2;                                           \
2773             kmp_affin_mask_t* temp;                                     \
2774             KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks);            \
2775             for(i=0;i<numNewMasks/2;i++) {                              \
2776                 kmp_affin_mask_t* src  = KMP_CPU_INDEX(newMasks, i);    \
2777                 kmp_affin_mask_t* dest = KMP_CPU_INDEX(temp, i);        \
2778                 KMP_CPU_COPY(dest, src);                                \
2779             }                                                           \
2780             KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks/2);       \
2781             newMasks = temp;                                            \
2782         }                                                               \
2783         KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));    \
2784         nextNewMask++;                                                  \
2785     }
2786 
2787 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2788     {                                                                   \
2789         if (((_osId) > _maxOsId) ||                                     \
2790           (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
2791             if (__kmp_affinity_verbose || (__kmp_affinity_warnings      \
2792               && (__kmp_affinity_type != affinity_none))) {             \
2793                 KMP_WARNING(AffIgnoreInvalidProcID, _osId);             \
2794             }                                                           \
2795         }                                                               \
2796         else {                                                          \
2797             ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));               \
2798         }                                                               \
2799     }
2800 
2801 
2802 //
2803 // Re-parse the proclist (for the explicit affinity type), and form the list
2804 // of affinity newMasks indexed by gtid.
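//
// For example, a proclist such as "0,3-5,{6,7}" (hypothetical) yields one
// mask for OS proc 0, one mask each for procs 3, 4, and 5, and a single
// mask covering procs 6 and 7.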
2805 //
2806 static void
2807 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2808   unsigned int *out_numMasks, const char *proclist,
2809   kmp_affin_mask_t *osId2Mask, int maxOsId)
2810 {
2811     int i;
2812     const char *scan = proclist;
2813     const char *next = proclist;
2814 
2815     //
    // We use an internally-allocated vector for the temporary masks, so
    // that the ADD_MASK macro can extend it (by doubling) as needed.
2818     //
2819     numNewMasks = 2;
2820     KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
2821     nextNewMask = 0;
2822     kmp_affin_mask_t *sumMask;
2823     KMP_CPU_ALLOC(sumMask);
2824     int setSize = 0;
2825 
2826     for (;;) {
2827         int start, end, stride;
2828 
2829         SKIP_WS(scan);
2830         next = scan;
2831         if (*next == '\0') {
2832             break;
2833         }
2834 
2835         if (*next == '{') {
2836             int num;
2837             setSize = 0;
2838             next++;     // skip '{'
2839             SKIP_WS(next);
2840             scan = next;
2841 
2842             //
2843             // Read the first integer in the set.
2844             //
2845             KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2846               "bad proclist");
2847             SKIP_DIGITS(next);
2848             num = __kmp_str_to_int(scan, *next);
2849             KMP_ASSERT2(num >= 0, "bad explicit proc list");
2850 
2851             //
2852             // Copy the mask for that osId to the sum (union) mask.
2853             //
2854             if ((num > maxOsId) ||
2855               (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2856                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2857                   && (__kmp_affinity_type != affinity_none))) {
2858                     KMP_WARNING(AffIgnoreInvalidProcID, num);
2859                 }
2860                 KMP_CPU_ZERO(sumMask);
2861             }
2862             else {
2863                 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2864                 setSize = 1;
2865             }
2866 
2867             for (;;) {
2868                 //
2869                 // Check for end of set.
2870                 //
2871                 SKIP_WS(next);
2872                 if (*next == '}') {
2873                     next++;     // skip '}'
2874                     break;
2875                 }
2876 
2877                 //
2878                 // Skip optional comma.
2879                 //
2880                 if (*next == ',') {
2881                     next++;
2882                 }
2883                 SKIP_WS(next);
2884 
2885                 //
2886                 // Read the next integer in the set.
2887                 //
2888                 scan = next;
2889                 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2890                   "bad explicit proc list");
2891 
2892                 SKIP_DIGITS(next);
2893                 num = __kmp_str_to_int(scan, *next);
2894                 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2895 
2896                 //
2897                 // Add the mask for that osId to the sum mask.
2898                 //
2899                 if ((num > maxOsId) ||
2900                   (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2901                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2902                       && (__kmp_affinity_type != affinity_none))) {
2903                         KMP_WARNING(AffIgnoreInvalidProcID, num);
2904                     }
2905                 }
2906                 else {
2907                     KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2908                     setSize++;
2909                 }
2910             }
2911             if (setSize > 0) {
2912                 ADD_MASK(sumMask);
2913             }
2914 
2915             SKIP_WS(next);
2916             if (*next == ',') {
2917                 next++;
2918             }
2919             scan = next;
2920             continue;
2921         }
2922 
2923         //
2924         // Read the first integer.
2925         //
2926         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2927         SKIP_DIGITS(next);
2928         start = __kmp_str_to_int(scan, *next);
2929         KMP_ASSERT2(start >= 0, "bad explicit proc list");
2930         SKIP_WS(next);
2931 
2932         //
2933         // If this isn't a range, then add a mask to the list and go on.
2934         //
2935         if (*next != '-') {
2936             ADD_MASK_OSID(start, osId2Mask, maxOsId);
2937 
2938             //
2939             // Skip optional comma.
2940             //
2941             if (*next == ',') {
2942                 next++;
2943             }
2944             scan = next;
2945             continue;
2946         }
2947 
2948         //
2949         // This is a range.  Skip over the '-' and read in the 2nd int.
2950         //
2951         next++;         // skip '-'
2952         SKIP_WS(next);
2953         scan = next;
2954         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2955         SKIP_DIGITS(next);
2956         end = __kmp_str_to_int(scan, *next);
2957         KMP_ASSERT2(end >= 0, "bad explicit proc list");
2958 
2959         //
2960         // Check for a stride parameter
2961         //
2962         stride = 1;
2963         SKIP_WS(next);
2964         if (*next == ':') {
2965             //
            // A stride is specified.  Skip over the ':' and read the 3rd int.
2967             //
2968             int sign = +1;
2969             next++;         // skip ':'
2970             SKIP_WS(next);
2971             scan = next;
2972             if (*next == '-') {
2973                 sign = -1;
2974                 next++;
2975                 SKIP_WS(next);
2976                 scan = next;
2977             }
2978             KMP_ASSERT2((*next >=  '0') && (*next <= '9'),
2979               "bad explicit proc list");
2980             SKIP_DIGITS(next);
2981             stride = __kmp_str_to_int(scan, *next);
2982             KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2983             stride *= sign;
2984         }
2985 
2986         //
2987         // Do some range checks.
2988         //
2989         KMP_ASSERT2(stride != 0, "bad explicit proc list");
2990         if (stride > 0) {
2991             KMP_ASSERT2(start <= end, "bad explicit proc list");
2992         }
2993         else {
2994             KMP_ASSERT2(start >= end, "bad explicit proc list");
2995         }
2996         KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2997 
2998         //
2999         // Add the mask for each OS proc # to the list.
3000         //
3001         if (stride > 0) {
3002             do {
3003                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
3004                 start += stride;
3005             } while (start <= end);
3006         }
3007         else {
3008             do {
3009                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
3010                 start += stride;
3011             } while (start >= end);
3012         }
3013 
3014         //
3015         // Skip optional comma.
3016         //
3017         SKIP_WS(next);
3018         if (*next == ',') {
3019             next++;
3020         }
3021         scan = next;
3022     }
3023 
3024     *out_numMasks = nextNewMask;
    if (nextNewMask == 0) {
        *out_masks = NULL;
        KMP_CPU_FREE(sumMask);
        KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
        return;
    }
3030     KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3031     for(i = 0; i < nextNewMask; i++) {
3032         kmp_affin_mask_t* src  = KMP_CPU_INDEX(newMasks, i);
3033         kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i);
3034         KMP_CPU_COPY(dest, src);
3035     }
3036     KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3037     KMP_CPU_FREE(sumMask);
3038 }
3039 
3040 
3041 # if OMP_40_ENABLED
3042 
3043 /*-----------------------------------------------------------------------------
3044 
3045 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
places.  Again, here is the grammar:
3047 
3048 place_list := place
3049 place_list := place , place_list
3050 place := num
3051 place := place : num
3052 place := place : num : signed
place := { subplace_list }
3054 place := ! place                  // (lowest priority)
3055 subplace_list := subplace
3056 subplace_list := subplace , subplace_list
3057 subplace := num
3058 subplace := num : num
3059 subplace := num : num : signed
3060 signed := num
3061 signed := + signed
3062 signed := - signed
3063 
3064 -----------------------------------------------------------------------------*/
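
// For example (assuming OS procs 0-7 are all available), the place list
//
//     "{0,1}:4:2"
//
// is parsed as place {0,1} with count 4 and stride 2, and expands to the
// four places {0,1}, {2,3}, {4,5}, {6,7}.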
3065 
3066 static void
3067 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
3068   int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3069 {
3070     const char *next;
3071 
3072     for (;;) {
3073         int start, count, stride, i;
3074 
3075         //
3076         // Read in the starting proc id
3077         //
3078         SKIP_WS(*scan);
3079         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3080           "bad explicit places list");
3081         next = *scan;
3082         SKIP_DIGITS(next);
3083         start = __kmp_str_to_int(*scan, *next);
3084         KMP_ASSERT(start >= 0);
3085         *scan = next;
3086 
3087         //
3088         // valid follow sets are ',' ':' and '}'
3089         //
3090         SKIP_WS(*scan);
3091         if (**scan == '}' || **scan == ',') {
3092             if ((start > maxOsId) ||
3093               (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3094                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3095                   && (__kmp_affinity_type != affinity_none))) {
3096                     KMP_WARNING(AffIgnoreInvalidProcID, start);
3097                 }
3098             }
3099             else {
3100                 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3101                 (*setSize)++;
3102             }
3103             if (**scan == '}') {
3104                 break;
3105             }
3106             (*scan)++;  // skip ','
3107             continue;
3108         }
3109         KMP_ASSERT2(**scan == ':', "bad explicit places list");
3110         (*scan)++;      // skip ':'
3111 
3112         //
3113         // Read count parameter
3114         //
3115         SKIP_WS(*scan);
3116         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3117           "bad explicit places list");
3118         next = *scan;
3119         SKIP_DIGITS(next);
3120         count = __kmp_str_to_int(*scan, *next);
3121         KMP_ASSERT(count >= 0);
3122         *scan = next;
3123 
3124         //
3125         // valid follow sets are ',' ':' and '}'
3126         //
3127         SKIP_WS(*scan);
3128         if (**scan == '}' || **scan == ',') {
3129             for (i = 0; i < count; i++) {
3130                 if ((start > maxOsId) ||
3131                   (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3132                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3133                       && (__kmp_affinity_type != affinity_none))) {
3134                         KMP_WARNING(AffIgnoreInvalidProcID, start);
3135                     }
3136                     break;  // don't proliferate warnings for large count
3137                 }
3138                 else {
3139                     KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3140                     start++;
3141                     (*setSize)++;
3142                 }
3143             }
3144             if (**scan == '}') {
3145                 break;
3146             }
3147             (*scan)++;  // skip ','
3148             continue;
3149         }
3150         KMP_ASSERT2(**scan == ':', "bad explicit places list");
3151         (*scan)++;      // skip ':'
3152 
3153         //
3154         // Read stride parameter
3155         //
3156         int sign = +1;
3157         for (;;) {
3158             SKIP_WS(*scan);
3159             if (**scan == '+') {
3160                 (*scan)++; // skip '+'
3161                 continue;
3162             }
3163             if (**scan == '-') {
3164                 sign *= -1;
3165                 (*scan)++; // skip '-'
3166                 continue;
3167             }
3168             break;
3169         }
3170         SKIP_WS(*scan);
3171         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3172           "bad explicit places list");
3173         next = *scan;
3174         SKIP_DIGITS(next);
3175         stride = __kmp_str_to_int(*scan, *next);
3176         KMP_ASSERT(stride >= 0);
3177         *scan = next;
3178         stride *= sign;
3179 
3180         //
3181         // valid follow sets are ',' and '}'
3182         //
3183         SKIP_WS(*scan);
3184         if (**scan == '}' || **scan == ',') {
3185             for (i = 0; i < count; i++) {
3186                 if ((start > maxOsId) ||
3187                   (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3188                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3189                       && (__kmp_affinity_type != affinity_none))) {
3190                         KMP_WARNING(AffIgnoreInvalidProcID, start);
3191                     }
3192                     break;  // don't proliferate warnings for large count
3193                 }
3194                 else {
3195                     KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3196                     start += stride;
3197                     (*setSize)++;
3198                 }
3199             }
3200             if (**scan == '}') {
3201                 break;
3202             }
3203             (*scan)++;  // skip ','
3204             continue;
3205         }
3206 
3207         KMP_ASSERT2(0, "bad explicit places list");
3208     }
3209 }
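
// For example, __kmp_process_subplace_list above turns the subplace
// "4:3:2" (start 4, count 3, stride 2) into the union of OS procs
// 4, 6 and 8 in tempMask, assuming all three are valid proc ids.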
3210 
3211 
3212 static void
3213 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3214   int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3215 {
3216     const char *next;
3217 
3218     //
3219     // valid follow sets are '{' '!' and num
3220     //
3221     SKIP_WS(*scan);
3222     if (**scan == '{') {
3223         (*scan)++;      // skip '{'
        __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask,
3225           setSize);
3226         KMP_ASSERT2(**scan == '}', "bad explicit places list");
3227         (*scan)++;      // skip '}'
3228     }
3229     else if (**scan == '!') {
3230         (*scan)++;      // skip '!'
3231         __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3232         KMP_CPU_COMPLEMENT(maxOsId, tempMask);
3233     }
3234     else if ((**scan >= '0') && (**scan <= '9')) {
3235         next = *scan;
3236         SKIP_DIGITS(next);
3237         int num = __kmp_str_to_int(*scan, *next);
3238         KMP_ASSERT(num >= 0);
3239         if ((num > maxOsId) ||
3240           (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3241             if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3242               && (__kmp_affinity_type != affinity_none))) {
3243                 KMP_WARNING(AffIgnoreInvalidProcID, num);
3244             }
3245         }
3246         else {
3247             KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3248             (*setSize)++;
3249         }
3250         *scan = next;  // skip num
3251     }
3252     else {
3253         KMP_ASSERT2(0, "bad explicit places list");
3254     }
3255 }
3256 
3257 
3259 void
3260 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3261   unsigned int *out_numMasks, const char *placelist,
3262   kmp_affin_mask_t *osId2Mask, int maxOsId)
3263 {
3264     int i,j,count,stride,sign;
3265     const char *scan = placelist;
3266     const char *next = placelist;
3267 
3268     numNewMasks = 2;
3269     KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
3270     nextNewMask = 0;
3271 
3272     // tempMask is modified based on the previous or initial
3273     //   place to form the current place
3274     // previousMask contains the previous place
3275     kmp_affin_mask_t *tempMask;
3276     kmp_affin_mask_t *previousMask;
3277     KMP_CPU_ALLOC(tempMask);
3278     KMP_CPU_ZERO(tempMask);
3279     KMP_CPU_ALLOC(previousMask);
3280     KMP_CPU_ZERO(previousMask);
3281     int setSize = 0;
3282 
3283     for (;;) {
3284         __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3285 
3286         //
3287         // valid follow sets are ',' ':' and EOL
3288         //
3289         SKIP_WS(scan);
3290         if (*scan == '\0' || *scan == ',') {
3291             if (setSize > 0) {
3292                 ADD_MASK(tempMask);
3293             }
3294             KMP_CPU_ZERO(tempMask);
3295             setSize = 0;
3296             if (*scan == '\0') {
3297                 break;
3298             }
3299             scan++;     // skip ','
3300             continue;
3301         }
3302 
3303         KMP_ASSERT2(*scan == ':', "bad explicit places list");
3304         scan++;         // skip ':'
3305 
3306         //
3307         // Read count parameter
3308         //
3309         SKIP_WS(scan);
3310         KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3311           "bad explicit places list");
3312         next = scan;
3313         SKIP_DIGITS(next);
3314         count = __kmp_str_to_int(scan, *next);
3315         KMP_ASSERT(count >= 0);
3316         scan = next;
3317 
3318         //
3319         // valid follow sets are ',' ':' and EOL
3320         //
3321         SKIP_WS(scan);
3322         if (*scan == '\0' || *scan == ',') {
3323             stride = +1;
3324         }
3325         else {
3326             KMP_ASSERT2(*scan == ':', "bad explicit places list");
3327             scan++;         // skip ':'
3328 
3329             //
3330             // Read stride parameter
3331             //
3332             sign = +1;
3333             for (;;) {
3334                 SKIP_WS(scan);
3335                 if (*scan == '+') {
3336                     scan++; // skip '+'
3337                     continue;
3338                 }
3339                 if (*scan == '-') {
3340                     sign *= -1;
3341                     scan++; // skip '-'
3342                     continue;
3343                 }
3344                 break;
3345             }
3346             SKIP_WS(scan);
3347             KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3348               "bad explicit places list");
3349             next = scan;
3350             SKIP_DIGITS(next);
3351             stride = __kmp_str_to_int(scan, *next);
3352             KMP_DEBUG_ASSERT(stride >= 0);
3353             scan = next;
3354             stride *= sign;
3355         }
3356 
3357         // Add places determined by initial_place : count : stride
3358         for (i = 0; i < count; i++) {
3359             if (setSize == 0) {
3360                 break;
3361             }
3362             // Add the current place, then build the next place (tempMask) from that
3363             KMP_CPU_COPY(previousMask, tempMask);
3364             ADD_MASK(previousMask);
3365             KMP_CPU_ZERO(tempMask);
3366             setSize = 0;
3367             KMP_CPU_SET_ITERATE(j, previousMask) {
3368                 if (! KMP_CPU_ISSET(j, previousMask)) {
3369                     continue;
3370                 }
3371                 if ((j+stride > maxOsId) || (j+stride < 0) ||
3372                   (! KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
3373                   (! KMP_CPU_ISSET(j+stride, KMP_CPU_INDEX(osId2Mask, j+stride)))) {
3374                     if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3375                       && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
3376                         KMP_WARNING(AffIgnoreInvalidProcID, j+stride);
3377                     }
3378                     continue;
3379                 }
3380                 KMP_CPU_SET(j+stride, tempMask);
3381                 setSize++;
3382             }
3383         }
3384         KMP_CPU_ZERO(tempMask);
3385         setSize = 0;
3386 
3387         //
3388         // valid follow sets are ',' and EOL
3389         //
3390         SKIP_WS(scan);
3391         if (*scan == '\0') {
3392             break;
3393         }
3394         if (*scan == ',') {
3395             scan++;     // skip ','
3396             continue;
3397         }
3398 
3399         KMP_ASSERT2(0, "bad explicit places list");
3400     }
3401 
3402     *out_numMasks = nextNewMask;
    if (nextNewMask == 0) {
        *out_masks = NULL;
        KMP_CPU_FREE(tempMask);
        KMP_CPU_FREE(previousMask);
        KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
        return;
    }
3408     KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3409     KMP_CPU_FREE(tempMask);
3410     KMP_CPU_FREE(previousMask);
3411     for(i = 0; i < nextNewMask; i++) {
3412         kmp_affin_mask_t* src  = KMP_CPU_INDEX(newMasks, i);
3413         kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i);
3414         KMP_CPU_COPY(dest, src);
3415     }
3416     KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3417 }
3418 
3419 # endif /* OMP_40_ENABLED */
3420 
3421 #undef ADD_MASK
3422 #undef ADD_MASK_OSID
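
//
// Trim the machine topology to the requested subset of sockets, cores and
// threads per core (the __kmp_place_* globals).  For example, on a uniform
// 2-socket x 8-core x 2-thread machine, requesting 1 socket, 4 cores and
// 1 thread per core keeps 4 OS procs in the machine model and clears the
// remaining 28 from __kmp_affin_fullMask (when the OS proc index table is
// available).
//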
3423 
3424 static void
3425 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3426 {
3427     int i, j, k, n_old = 0, n_new = 0, proc_num = 0;
3428     if (__kmp_place_num_sockets == 0 &&
3429         __kmp_place_num_cores == 0 &&
3430         __kmp_place_num_threads_per_core == 0 )
3431         goto _exit;   // no topology limiting actions requested, exit
3432     if (__kmp_place_num_sockets == 0)
3433         __kmp_place_num_sockets = nPackages;    // use all available sockets
3434     if (__kmp_place_num_cores == 0)
3435         __kmp_place_num_cores = nCoresPerPkg;   // use all available cores
3436     if (__kmp_place_num_threads_per_core == 0 ||
3437         __kmp_place_num_threads_per_core > __kmp_nThreadsPerCore)
3438         __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts
3439 
3440     if ( !__kmp_affinity_uniform_topology() ) {
3441         KMP_WARNING( AffHWSubsetNonUniform );
3442         goto _exit; // don't support non-uniform topology
3443     }
3444     if ( depth > 3 ) {
3445         KMP_WARNING( AffHWSubsetNonThreeLevel );
        goto _exit; // only 3-level (package/core/thread) topologies are supported
3447     }
3448     if (__kmp_place_socket_offset + __kmp_place_num_sockets > nPackages) {
3449         KMP_WARNING(AffHWSubsetManySockets);
3450         goto _exit;
3451     }
3452     if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
3453         KMP_WARNING( AffHWSubsetManyCores );
3454         goto _exit;
3455     }
3456 
3457     AddrUnsPair *newAddr;
3458     if (pAddr) // pAddr is NULL in case of affinity_none
3459         newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3460             __kmp_place_num_sockets * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3461 
3462     for (i = 0; i < nPackages; ++i) {
3463         if (i < __kmp_place_socket_offset ||
3464             i >= __kmp_place_socket_offset + __kmp_place_num_sockets) {
3465             n_old += nCoresPerPkg * __kmp_nThreadsPerCore; // skip not-requested socket
3466             if (__kmp_pu_os_idx != NULL) {
3467                 for (j = 0; j < nCoresPerPkg; ++j) { // walk through skipped socket
3468                     for (k = 0; k < __kmp_nThreadsPerCore; ++k) {
3469                         KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3470                         ++proc_num;
3471                     }
3472                 }
3473             }
3474         } else {
3475             for (j = 0; j < nCoresPerPkg; ++j) { // walk through requested socket
3476                 if (j < __kmp_place_core_offset ||
3477                     j >= __kmp_place_core_offset + __kmp_place_num_cores) {
3478                     n_old += __kmp_nThreadsPerCore; // skip not-requested core
3479                     if (__kmp_pu_os_idx != NULL) {
3480                         for (k = 0; k < __kmp_nThreadsPerCore; ++k) { // walk through skipped core
3481                             KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3482                             ++proc_num;
3483                         }
3484                     }
3485                 } else {
3486                     for (k = 0; k < __kmp_nThreadsPerCore; ++k) { // walk through requested core
3487                         if (k < __kmp_place_num_threads_per_core) {
3488                             if (pAddr)
3489                                 newAddr[n_new] = (*pAddr)[n_old]; // collect requested thread's data
3490                             n_new++;
3491                         } else {
3492                             if (__kmp_pu_os_idx != NULL)
3493                                 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3494                         }
3495                         n_old++;
3496                         ++proc_num;
3497                     }
3498                 }
3499             }
3500         }
3501     }
3502     KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore);
3503     KMP_DEBUG_ASSERT(n_new == __kmp_place_num_sockets * __kmp_place_num_cores *
3504                      __kmp_place_num_threads_per_core);
3505 
3506     nPackages = __kmp_place_num_sockets;                      // correct nPackages
3507     nCoresPerPkg = __kmp_place_num_cores;                     // correct nCoresPerPkg
3508     __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3509     __kmp_avail_proc = n_new;                                 // correct avail_proc
3510     __kmp_ncores = nPackages * __kmp_place_num_cores;         // correct ncores
3511 
3512     if (pAddr) {
3513         __kmp_free( *pAddr );
3514         *pAddr = newAddr;      // replace old topology with new one
3515     }
3516 _exit:
3517     if (__kmp_pu_os_idx != NULL) {
3518         __kmp_free(__kmp_pu_os_idx);
3519         __kmp_pu_os_idx = NULL;
3520     }
3521 }
3522 
3523 
3524 static AddrUnsPair *address2os = NULL;
3525 static int           * procarr = NULL;
3526 static int     __kmp_aff_depth = 0;
3527 
3528 #define KMP_EXIT_AFF_NONE                             \
3529     KMP_ASSERT(__kmp_affinity_type == affinity_none); \
3530     KMP_ASSERT(address2os == NULL);                   \
3531     __kmp_apply_thread_places(NULL, 0);               \
3532     return;
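
// The topology-detection routines return depth == 0 when
// __kmp_affinity_type == affinity_none and no address2os table was built;
// this macro sanity-checks that state, applies any requested thread-places
// trimming to the full mask, and returns early.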
3533 
3534 static void
3535 __kmp_aux_affinity_initialize(void)
3536 {
3537     if (__kmp_affinity_masks != NULL) {
3538         KMP_ASSERT(__kmp_affin_fullMask != NULL);
3539         return;
3540     }
3541 
3542     //
3543     // Create the "full" mask - this defines all of the processors that we
3544     // consider to be in the machine model.  If respect is set, then it is
3545     // the initialization thread's affinity mask.  Otherwise, it is all
3546     // processors that we know about on the machine.
3547     //
3548     if (__kmp_affin_fullMask == NULL) {
3549         KMP_CPU_ALLOC(__kmp_affin_fullMask);
3550     }
3551     if (KMP_AFFINITY_CAPABLE()) {
3552         if (__kmp_affinity_respect_mask) {
3553             __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
3554 
3555             //
3556             // Count the number of available processors.
3557             //
3558             unsigned i;
3559             __kmp_avail_proc = 0;
3560             KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
3561                 if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
3562                     continue;
3563                 }
3564                 __kmp_avail_proc++;
3565             }
3566             if (__kmp_avail_proc > __kmp_xproc) {
3567                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3568                   && (__kmp_affinity_type != affinity_none))) {
3569                     KMP_WARNING(ErrorInitializeAffinity);
3570                 }
3571                 __kmp_affinity_type = affinity_none;
3572                 KMP_AFFINITY_DISABLE();
3573                 return;
3574             }
3575         }
3576         else {
3577             __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
3578             __kmp_avail_proc = __kmp_xproc;
3579         }
3580     }
3581 
3582     int depth = -1;
3583     kmp_i18n_id_t msg_id = kmp_i18n_null;
3584 
3585     //
3586     // For backward compatibility, setting KMP_CPUINFO_FILE =>
3587     // KMP_TOPOLOGY_METHOD=cpuinfo
3588     //
3589     if ((__kmp_cpuinfo_file != NULL) &&
3590       (__kmp_affinity_top_method == affinity_top_method_all)) {
3591         __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3592     }
3593 
3594     if (__kmp_affinity_top_method == affinity_top_method_all) {
3595         //
3596         // In the default code path, errors are not fatal - we just try using
3597         // another method.  We only emit a warning message if affinity is on,
        // or the verbose flag is set, and the nowarnings flag was not set.
3599         //
3600         const char *file_name = NULL;
3601         int line = 0;
3602 # if KMP_USE_HWLOC
3603         if (depth < 0) {
3604             if (__kmp_affinity_verbose) {
3605                 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
3606             }
3607             if(!__kmp_hwloc_error) {
3608                 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
3609                 if (depth == 0) {
3610                     KMP_EXIT_AFF_NONE;
3611                 } else if(depth < 0 && __kmp_affinity_verbose) {
3612                     KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
3613                 }
3614             } else if(__kmp_affinity_verbose) {
3615                 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
3616             }
3617         }
3618 # endif
3619 
3620 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3621 
3622         if (depth < 0) {
3623             if (__kmp_affinity_verbose) {
3624                 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3625             }
3626 
3627             file_name = NULL;
3628             depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3629             if (depth == 0) {
3630                 KMP_EXIT_AFF_NONE;
3631             }
3632 
3633             if (depth < 0) {
3634                 if (__kmp_affinity_verbose) {
3635                     if (msg_id != kmp_i18n_null) {
3636                         KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3637                           KMP_I18N_STR(DecodingLegacyAPIC));
3638                     }
3639                     else {
3640                         KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
3641                     }
3642                 }
3643 
3644                 file_name = NULL;
3645                 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3646                 if (depth == 0) {
3647                     KMP_EXIT_AFF_NONE;
3648                 }
3649             }
3650         }
3651 
3652 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3653 
3654 # if KMP_OS_LINUX
3655 
3656         if (depth < 0) {
3657             if (__kmp_affinity_verbose) {
3658                 if (msg_id != kmp_i18n_null) {
3659                     KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3660                 }
3661                 else {
3662                     KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3663                 }
3664             }
3665 
3666             FILE *f = fopen("/proc/cpuinfo", "r");
3667             if (f == NULL) {
3668                 msg_id = kmp_i18n_str_CantOpenCpuinfo;
3669             }
3670             else {
3671                 file_name = "/proc/cpuinfo";
3672                 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3673                 fclose(f);
3674                 if (depth == 0) {
3675                     KMP_EXIT_AFF_NONE;
3676                 }
3677             }
3678         }
3679 
3680 # endif /* KMP_OS_LINUX */
3681 
3682 # if KMP_GROUP_AFFINITY
3683 
3684         if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
3685             if (__kmp_affinity_verbose) {
3686                 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3687             }
3688 
3689             depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3690             KMP_ASSERT(depth != 0);
3691         }
3692 
3693 # endif /* KMP_GROUP_AFFINITY */
3694 
3695         if (depth < 0) {
3696             if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
3697                 if (file_name == NULL) {
3698                     KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
3699                 }
3700                 else if (line == 0) {
3701                     KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
3702                 }
3703                 else {
3704                     KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
3705                 }
3706             }
            // FIXME - print msg if msg_id == kmp_i18n_null ???
3708 
3709             file_name = "";
3710             depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3711             if (depth == 0) {
3712                 KMP_EXIT_AFF_NONE;
3713             }
3714             KMP_ASSERT(depth > 0);
3715             KMP_ASSERT(address2os != NULL);
3716         }
3717     }
3718 
3719     //
    // If the user has specified that a particular topology discovery method
3721     // is to be used, then we abort if that method fails.  The exception is
3722     // group affinity, which might have been implicitly set.
3723     //
3724 
3725 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3726 
3727     else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3728         if (__kmp_affinity_verbose) {
3729             KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3730               KMP_I18N_STR(Decodingx2APIC));
3731         }
3732 
3733         depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3734         if (depth == 0) {
3735             KMP_EXIT_AFF_NONE;
3736         }
3737         if (depth < 0) {
3738             KMP_ASSERT(msg_id != kmp_i18n_null);
3739             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3740         }
3741     }
3742     else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3743         if (__kmp_affinity_verbose) {
3744             KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3745               KMP_I18N_STR(DecodingLegacyAPIC));
3746         }
3747 
3748         depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3749         if (depth == 0) {
3750             KMP_EXIT_AFF_NONE;
3751         }
3752         if (depth < 0) {
3753             KMP_ASSERT(msg_id != kmp_i18n_null);
3754             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3755         }
3756     }
3757 
3758 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3759 
3760     else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3761         const char *filename;
3762         if (__kmp_cpuinfo_file != NULL) {
3763             filename = __kmp_cpuinfo_file;
3764         }
3765         else {
3766             filename = "/proc/cpuinfo";
3767         }
3768 
3769         if (__kmp_affinity_verbose) {
3770             KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
3771         }
3772 
3773         FILE *f = fopen(filename, "r");
3774         if (f == NULL) {
3775             int code = errno;
3776             if (__kmp_cpuinfo_file != NULL) {
3777                 __kmp_msg(
3778                     kmp_ms_fatal,
3779                     KMP_MSG(CantOpenFileForReading, filename),
3780                     KMP_ERR(code),
3781                     KMP_HNT(NameComesFrom_CPUINFO_FILE),
3782                     __kmp_msg_null
3783                 );
3784             }
3785             else {
3786                 __kmp_msg(
3787                     kmp_ms_fatal,
3788                     KMP_MSG(CantOpenFileForReading, filename),
3789                     KMP_ERR(code),
3790                     __kmp_msg_null
3791                 );
3792             }
3793         }
3794         int line = 0;
3795         depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3796         fclose(f);
3797         if (depth < 0) {
3798             KMP_ASSERT(msg_id != kmp_i18n_null);
3799             if (line > 0) {
3800                 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
3801             }
3802             else {
3803                 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3804             }
3805         }
3806         if (__kmp_affinity_type == affinity_none) {
3807             KMP_ASSERT(depth == 0);
3808             KMP_EXIT_AFF_NONE;
3809         }
3810     }
3811 
3812 # if KMP_GROUP_AFFINITY
3813 
3814     else if (__kmp_affinity_top_method == affinity_top_method_group) {
3815         if (__kmp_affinity_verbose) {
3816             KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3817         }
3818 
3819         depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3820         KMP_ASSERT(depth != 0);
3821         if (depth < 0) {
3822             KMP_ASSERT(msg_id != kmp_i18n_null);
3823             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3824         }
3825     }
3826 
3827 # endif /* KMP_GROUP_AFFINITY */
3828 
3829     else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3830         if (__kmp_affinity_verbose) {
3831             KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
3832         }
3833 
3834         depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3835         if (depth == 0) {
3836             KMP_EXIT_AFF_NONE;
3837         }
3838         // should not fail
3839         KMP_ASSERT(depth > 0);
3840         KMP_ASSERT(address2os != NULL);
3841     }
3842 
3843 # if KMP_USE_HWLOC
3844     else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
3845         if (__kmp_affinity_verbose) {
3846             KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
3847         }
3848         depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
3849         if (depth == 0) {
3850             KMP_EXIT_AFF_NONE;
3851         }
3852     }
3853 # endif // KMP_USE_HWLOC
3854 
3855     if (address2os == NULL) {
3856         if (KMP_AFFINITY_CAPABLE()
3857           && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3858           && (__kmp_affinity_type != affinity_none)))) {
3859             KMP_WARNING(ErrorInitializeAffinity);
3860         }
3861         __kmp_affinity_type = affinity_none;
3862         KMP_AFFINITY_DISABLE();
3863         return;
3864     }
3865 
3866     __kmp_apply_thread_places(&address2os, depth);
3867 
3868     //
3869     // Create the table of masks, indexed by thread Id.
3870     //
3871     unsigned maxIndex;
3872     unsigned numUnique;
3873     kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
3874       address2os, __kmp_avail_proc);
3875     if (__kmp_affinity_gran_levels == 0) {
3876         KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
3877     }
3878 
3879     //
3880     // Set the childNums vector in all Address objects.  This must be done
3881     // before we can sort using __kmp_affinity_cmp_Address_child_num(),
3882     // which takes into account the setting of __kmp_affinity_compact.
3883     //
3884     __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
3885 
3886     switch (__kmp_affinity_type) {
3887 
3888         case affinity_explicit:
3889         KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3890 # if OMP_40_ENABLED
3891         if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3892 # endif
3893         {
3894             __kmp_affinity_process_proclist(&__kmp_affinity_masks,
3895               &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3896               maxIndex);
3897         }
3898 # if OMP_40_ENABLED
3899         else {
3900             __kmp_affinity_process_placelist(&__kmp_affinity_masks,
3901               &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3902               maxIndex);
3903         }
3904 # endif
3905         if (__kmp_affinity_num_masks == 0) {
3906             if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3907               && (__kmp_affinity_type != affinity_none))) {
3908                 KMP_WARNING(AffNoValidProcID);
3909             }
3910             __kmp_affinity_type = affinity_none;
3911             return;
3912         }
3913         break;
3914 
3915         //
3916         // The other affinity types rely on sorting the Addresses according
3917         // to some permutation of the machine topology tree.  Set
3918         // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
3919         // then jump to a common code fragment to do the sort and create
3920         // the array of affinity masks.
3921         //
3922 
3923         case affinity_logical:
3924         __kmp_affinity_compact = 0;
3925         if (__kmp_affinity_offset) {
3926             __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3927               % __kmp_avail_proc;
3928         }
3929         goto sortAddresses;
3930 
3931         case affinity_physical:
3932         if (__kmp_nThreadsPerCore > 1) {
3933             __kmp_affinity_compact = 1;
3934             if (__kmp_affinity_compact >= depth) {
3935                 __kmp_affinity_compact = 0;
3936             }
3937         } else {
3938             __kmp_affinity_compact = 0;
3939         }
3940         if (__kmp_affinity_offset) {
3941             __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3942               % __kmp_avail_proc;
3943         }
3944         goto sortAddresses;
3945 
3946         case affinity_scatter:
3947         if (__kmp_affinity_compact >= depth) {
3948             __kmp_affinity_compact = 0;
3949         }
3950         else {
3951             __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
3952         }
3953         goto sortAddresses;
3954 
3955         case affinity_compact:
3956         if (__kmp_affinity_compact >= depth) {
3957             __kmp_affinity_compact = depth - 1;
3958         }
3959         goto sortAddresses;
3960 
3961         case affinity_balanced:
3962         // Balanced works only for the case of a single package
3963         if( nPackages > 1 ) {
3964             if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
3965                 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
3966             }
3967             __kmp_affinity_type = affinity_none;
3968             return;
3969         } else if( __kmp_affinity_uniform_topology() ) {
3970             break;
3971         } else { // Non-uniform topology
3972 
3973             // Save the depth for further usage
3974             __kmp_aff_depth = depth;
3975 
            // Number of hardware thread contexts (hyper-threads) per core
3977             int nth_per_core = __kmp_nThreadsPerCore;
3978 
3979             int core_level;
3980             if( nth_per_core > 1 ) {
3981                 core_level = depth - 2;
3982             } else {
3983                 core_level = depth - 1;
3984             }
3985             int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
3986             int nproc = nth_per_core * ncores;
3987 
3988             procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
3989             for( int i = 0; i < nproc; i++ ) {
3990                 procarr[ i ] = -1;
3991             }
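
            // procarr maps (core, thread-context) slots to OS proc ids:
            // procarr[core * nth_per_core + thread] holds the OS proc bound
            // to that slot, or -1 if the slot has no available proc.  E.g.,
            // on a 4-core, 2-thread package, procarr[2*2+1] is the OS proc
            // on core 2, thread context 1.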
3992 
3993             for( int i = 0; i < __kmp_avail_proc; i++ ) {
3994                 int proc = address2os[ i ].second;
3995                 // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
3996                 // If there is only one thread per core then depth == 2: level 0 - package,
3997                 // level 1 - core.
3998                 int level = depth - 1;
3999 
                // Defaults for the nth_per_core == 1 case (no separate thread level)
4001                 int thread = 0;
4002                 int core = address2os[ i ].first.labels[ level ];
4003                 // If the thread level exists, that is we have more than one thread context per core
4004                 if( nth_per_core > 1 ) {
4005                     thread = address2os[ i ].first.labels[ level ] % nth_per_core;
4006                     core = address2os[ i ].first.labels[ level - 1 ];
4007                 }
4008                 procarr[ core * nth_per_core + thread ] = proc;
4009             }
4010 
4011             break;
4012         }
4013 
4014         sortAddresses:
4015         //
4016         // Allocate the gtid->affinity mask table.
4017         //
4018         if (__kmp_affinity_dups) {
4019             __kmp_affinity_num_masks = __kmp_avail_proc;
4020         }
4021         else {
4022             __kmp_affinity_num_masks = numUnique;
4023         }
4024 
4025 # if OMP_40_ENABLED
4026         if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
4027           && ( __kmp_affinity_num_places > 0 )
4028           && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
4029             __kmp_affinity_num_masks = __kmp_affinity_num_places;
4030         }
4031 # endif
4032 
4033         KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
4034 
4035         //
4036         // Sort the address2os table according to the current setting of
4037         // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
4038         //
4039         qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
4040           __kmp_affinity_cmp_Address_child_num);
4041         {
4042             int i;
4043             unsigned j;
4044             for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
4045                 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
4046                     continue;
4047                 }
4048                 unsigned osId = address2os[i].second;
4049                 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
4050                 kmp_affin_mask_t *dest
4051                   = KMP_CPU_INDEX(__kmp_affinity_masks, j);
4052                 KMP_ASSERT(KMP_CPU_ISSET(osId, src));
4053                 KMP_CPU_COPY(dest, src);
4054                 if (++j >= __kmp_affinity_num_masks) {
4055                     break;
4056                 }
4057             }
4058             KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
4059         }
4060         break;
4061 
4062         default:
4063         KMP_ASSERT2(0, "Unexpected affinity setting");
4064     }
4065 
4066     __kmp_free(osId2Mask);
4067     machine_hierarchy.init(address2os, __kmp_avail_proc);
4068 }
4069 #undef KMP_EXIT_AFF_NONE
4070 
4071 
4072 void
4073 __kmp_affinity_initialize(void)
4074 {
4075     //
    // Much of the code above was written assuming that if a machine was not
4077     // affinity capable, then __kmp_affinity_type == affinity_none.  We now
4078     // explicitly represent this as __kmp_affinity_type == affinity_disabled.
4079     //
4080     // There are too many checks for __kmp_affinity_type == affinity_none
4081     // in this code.  Instead of trying to change them all, check if
4082     // __kmp_affinity_type == affinity_disabled, and if so, slam it with
4083     // affinity_none, call the real initialization routine, then restore
4084     // __kmp_affinity_type to affinity_disabled.
4085     //
4086     int disabled = (__kmp_affinity_type == affinity_disabled);
4087     if (! KMP_AFFINITY_CAPABLE()) {
4088         KMP_ASSERT(disabled);
4089     }
4090     if (disabled) {
4091         __kmp_affinity_type = affinity_none;
4092     }
4093     __kmp_aux_affinity_initialize();
4094     if (disabled) {
4095         __kmp_affinity_type = affinity_disabled;
4096     }
4097 }
4098 
4099 
4100 void
4101 __kmp_affinity_uninitialize(void)
4102 {
4103     if (__kmp_affinity_masks != NULL) {
4104         KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
4105         __kmp_affinity_masks = NULL;
4106     }
4107     if (__kmp_affin_fullMask != NULL) {
4108         KMP_CPU_FREE(__kmp_affin_fullMask);
4109         __kmp_affin_fullMask = NULL;
4110     }
4111     __kmp_affinity_num_masks = 0;
4112 # if OMP_40_ENABLED
4113     __kmp_affinity_num_places = 0;
4114 # endif
4115     if (__kmp_affinity_proclist != NULL) {
4116         __kmp_free(__kmp_affinity_proclist);
4117         __kmp_affinity_proclist = NULL;
4118     }
4119     if( address2os != NULL ) {
4120         __kmp_free( address2os );
4121         address2os = NULL;
4122     }
4123     if( procarr != NULL ) {
4124         __kmp_free( procarr );
4125         procarr = NULL;
4126     }
4127 # if KMP_USE_HWLOC
4128     if (__kmp_hwloc_topology != NULL) {
4129         hwloc_topology_destroy(__kmp_hwloc_topology);
4130         __kmp_hwloc_topology = NULL;
4131     }
4132 # endif
4133 }
4134 
4135 
4136 void
4137 __kmp_affinity_set_init_mask(int gtid, int isa_root)
4138 {
4139     if (! KMP_AFFINITY_CAPABLE()) {
4140         return;
4141     }
4142 
4143     kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4144     if (th->th.th_affin_mask == NULL) {
4145         KMP_CPU_ALLOC(th->th.th_affin_mask);
4146     }
4147     else {
4148         KMP_CPU_ZERO(th->th.th_affin_mask);
4149     }
4150 
4151     //
    // Copy the thread mask to the kmp_info_t structure.
    // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
    // that has all of the OS proc ids set; if __kmp_affinity_respect_mask is
    // also set, the full mask is the same as the mask of the initialization
    // thread.
4157     //
4158     kmp_affin_mask_t *mask;
4159     int i;
4160 
4161 # if OMP_40_ENABLED
4162     if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
4163 # endif
4164     {
        if ((__kmp_affinity_type == affinity_none) ||
          (__kmp_affinity_type == affinity_balanced)) {
4167 # if KMP_GROUP_AFFINITY
4168             if (__kmp_num_proc_groups > 1) {
4169                 return;
4170             }
4171 # endif
4172             KMP_ASSERT(__kmp_affin_fullMask != NULL);
4173             i = KMP_PLACE_ALL;
4174             mask = __kmp_affin_fullMask;
4175         }
4176         else {
4177             KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
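            // Round-robin assignment: e.g. with 4 masks and offset 1,
            // gtids 0..3 are bound to masks 1, 2, 3, 0.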
4178             i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4179             mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4180         }
4181     }
4182 # if OMP_40_ENABLED
4183     else {
4184         if ((! isa_root)
4185           || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
4186 #  if KMP_GROUP_AFFINITY
4187             if (__kmp_num_proc_groups > 1) {
4188                 return;
4189             }
4190 #  endif
4191             KMP_ASSERT(__kmp_affin_fullMask != NULL);
4192             i = KMP_PLACE_ALL;
4193             mask = __kmp_affin_fullMask;
4194         }
4195         else {
4196             //
4197             // int i = some hash function or just a counter that doesn't
4198             // always start at 0.  Use gtid for now.
4199             //
4200             KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4201             i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4202             mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4203         }
4204     }
4205 # endif
4206 
4207 # if OMP_40_ENABLED
4208     th->th.th_current_place = i;
4209     if (isa_root) {
4210         th->th.th_new_place = i;
4211         th->th.th_first_place = 0;
4212         th->th.th_last_place = __kmp_affinity_num_masks - 1;
4213     }
4214 
4215     if (i == KMP_PLACE_ALL) {
4216         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4217           gtid));
4218     }
4219     else {
4220         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4221           gtid, i));
4222     }
4223 # else
4224     if (i == -1) {
4225         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to __kmp_affin_fullMask\n",
4226           gtid));
4227     }
4228     else {
4229         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
4230           gtid, i));
4231     }
4232 # endif /* OMP_40_ENABLED */
4233 
4234     KMP_CPU_COPY(th->th.th_affin_mask, mask);
4235 
4236     if (__kmp_affinity_verbose) {
4237         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4238         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4239           th->th.th_affin_mask);
4240         KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
4241           buf);
4242     }
4243 
4244 # if KMP_OS_WINDOWS
4245     //
4246     // On Windows* OS, the process affinity mask might have changed.
4247     // If the user didn't request affinity and this call fails,
4248     // just continue silently.  See CQ171393.
4249     //
4250     if ( __kmp_affinity_type == affinity_none ) {
4251         __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4252     }
4253     else
4254 # endif
4255     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4256 }
4257 
4258 
4259 # if OMP_40_ENABLED
4260 
4261 void
4262 __kmp_affinity_set_place(int gtid)
4263 {
4265 
4266     if (! KMP_AFFINITY_CAPABLE()) {
4267         return;
4268     }
4269 
4270     kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4271 
4272     KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
4273       gtid, th->th.th_new_place, th->th.th_current_place));
4274 
4275     //
4276     // Check that the new place is within this thread's partition.
4277     //
4278     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4279     KMP_ASSERT(th->th.th_new_place >= 0);
4280     KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4281     if (th->th.th_first_place <= th->th.th_last_place) {
4282         KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
4283          && (th->th.th_new_place <= th->th.th_last_place));
4284     }
4285     else {
4286         KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
4287          || (th->th.th_new_place >= th->th.th_last_place));
4288     }
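
    // Note: when th_first_place > th_last_place the place partition wraps
    // around; e.g. first_place=6, last_place=1 with 8 places denotes the
    // place set {6, 7, 0, 1}.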
4289 
4290     //
    // Copy the thread mask to the kmp_info_t structure,
4292     // and set this thread's affinity.
4293     //
4294     kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
4295       th->th.th_new_place);
4296     KMP_CPU_COPY(th->th.th_affin_mask, mask);
4297     th->th.th_current_place = th->th.th_new_place;
4298 
4299     if (__kmp_affinity_verbose) {
4300         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4301         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4302           th->th.th_affin_mask);
4303         KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4304           gtid, buf);
4305     }
4306     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4307 }
4308 
4309 # endif /* OMP_40_ENABLED */
4310 
4311 
4312 int
4313 __kmp_aux_set_affinity(void **mask)
4314 {
4315     int gtid;
4316     kmp_info_t *th;
4317     int retval;
4318 
4319     if (! KMP_AFFINITY_CAPABLE()) {
4320         return -1;
4321     }
4322 
4323     gtid = __kmp_entry_gtid();
4324     KA_TRACE(1000, ;{
4325         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4326         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4327           (kmp_affin_mask_t *)(*mask));
4328         __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4329           gtid, buf);
4330     });
4331 
4332     if (__kmp_env_consistency_check) {
4333         if ((mask == NULL) || (*mask == NULL)) {
4334             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4335         }
4336         else {
4337             unsigned proc;
4338             int num_procs = 0;
4339 
4340             KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t*)(*mask))) {
4341                 if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4342                     KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4343                 }
4344                 if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4345                     continue;
4346                 }
4347                 num_procs++;
4348             }
4349             if (num_procs == 0) {
4350                 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4351             }
4352 
4353 # if KMP_GROUP_AFFINITY
4354             if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4355                 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4356             }
4357 # endif /* KMP_GROUP_AFFINITY */
4358 
4359         }
4360     }
4361 
4362     th = __kmp_threads[gtid];
4363     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4364     retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4365     if (retval == 0) {
4366         KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4367     }
4368 
4369 # if OMP_40_ENABLED
4370     th->th.th_current_place = KMP_PLACE_UNDEFINED;
4371     th->th.th_new_place = KMP_PLACE_UNDEFINED;
4372     th->th.th_first_place = 0;
4373     th->th.th_last_place = __kmp_affinity_num_masks - 1;
4374 
4375     //
    // Turn off 4.0 affinity for the current thread at this parallel level.
4377     //
4378     th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
4379 # endif
4380 
4381     return retval;
4382 }
4383 
4384 
4385 int
4386 __kmp_aux_get_affinity(void **mask)
4387 {
4388     int gtid;
4389     int retval;
4390     kmp_info_t *th;
4391 
4392     if (! KMP_AFFINITY_CAPABLE()) {
4393         return -1;
4394     }
4395 
4396     gtid = __kmp_entry_gtid();
4397     th = __kmp_threads[gtid];
4398     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4399 
4400     KA_TRACE(1000, ;{
4401         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4402         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4403           th->th.th_affin_mask);
4404         __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
4405     });
4406 
4407     if (__kmp_env_consistency_check) {
4408         if ((mask == NULL) || (*mask == NULL)) {
4409             KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4410         }
4411     }
4412 
4413 # if !KMP_OS_WINDOWS
4414 
4415     retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4416     KA_TRACE(1000, ;{
4417         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4418         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4419           (kmp_affin_mask_t *)(*mask));
4420         __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
4421     });
4422     return retval;
4423 
4424 # else
4425 
4426     KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4427     return 0;
4428 
4429 # endif /* KMP_OS_WINDOWS */
4430 
4431 }
4432 
4433 int
4434 __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
4435 {
4437 
4438     if (! KMP_AFFINITY_CAPABLE()) {
4439         return -1;
4440     }
4441 
4442     KA_TRACE(1000, ;{
4443         int gtid = __kmp_entry_gtid();
4444         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4445         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4446           (kmp_affin_mask_t *)(*mask));
4447         __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4448           proc, gtid, buf);
4449     });
4450 
4451     if (__kmp_env_consistency_check) {
4452         if ((mask == NULL) || (*mask == NULL)) {
4453             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4454         }
4455     }
4456 
4457     if ((proc < 0)
4458 # if !KMP_USE_HWLOC
4459          || ((unsigned)proc >= KMP_CPU_SETSIZE)
4460 # endif
4461        ) {
4462         return -1;
4463     }
4464     if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4465         return -2;
4466     }
4467 
4468     KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4469     return 0;
4470 }
4471 
4472 
4473 int
4474 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4475 {
4477 
4478     if (! KMP_AFFINITY_CAPABLE()) {
4479         return -1;
4480     }
4481 
4482     KA_TRACE(1000, ;{
4483         int gtid = __kmp_entry_gtid();
4484         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4485         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4486           (kmp_affin_mask_t *)(*mask));
4487         __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4488           proc, gtid, buf);
4489     });
4490 
4491     if (__kmp_env_consistency_check) {
4492         if ((mask == NULL) || (*mask == NULL)) {
4493             KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4494         }
4495     }
4496 
4497     if ((proc < 0)
4498 # if !KMP_USE_HWLOC
4499          || ((unsigned)proc >= KMP_CPU_SETSIZE)
4500 # endif
4501        ) {
4502         return -1;
4503     }
4504     if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4505         return -2;
4506     }
4507 
4508     KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4509     return 0;
4510 }
4511 
4512 
4513 int
4514 __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4515 {
4517 
4518     if (! KMP_AFFINITY_CAPABLE()) {
4519         return -1;
4520     }
4521 
4522     KA_TRACE(1000, ;{
4523         int gtid = __kmp_entry_gtid();
4524         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4525         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4526           (kmp_affin_mask_t *)(*mask));
4527         __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4528           proc, gtid, buf);
4529     });
4530 
4531     if (__kmp_env_consistency_check) {
4532         if ((mask == NULL) || (*mask == NULL)) {
4533             KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
4534         }
4535     }
4536 
4537     if ((proc < 0)
4538 # if !KMP_USE_HWLOC
4539          || ((unsigned)proc >= KMP_CPU_SETSIZE)
4540 # endif
4541        ) {
4542         return -1;
4543     }
4544     if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4545         return 0;
4546     }
4547 
4548     return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4549 }
4550 
4551 
4552 // Dynamic affinity settings - Affinity balanced
4553 void __kmp_balanced_affinity( int tid, int nthreads )
4554 {
4555     if( __kmp_affinity_uniform_topology() ) {
4556         int coreID;
4557         int threadID;
        // Number of hyper threads per core on an HT machine
4559         int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4560         // Number of cores
4561         int ncores = __kmp_ncores;
4562         // How many threads will be bound to each core
4563         int chunk = nthreads / ncores;
        // How many cores will have an additional thread bound to them - the "big" cores
4565         int big_cores = nthreads % ncores;
4566         // Number of threads on the big cores
4567         int big_nth = ( chunk + 1 ) * big_cores;
4568         if( tid < big_nth ) {
4569             coreID = tid / (chunk + 1 );
4570             threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
4571         } else { //tid >= big_nth
4572             coreID = ( tid - big_cores ) / chunk;
4573             threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
4574         }
4575 
4576         KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4577           "Illegal set affinity operation when not capable");
4578 
4579         kmp_affin_mask_t *mask;
4580         KMP_CPU_ALLOC_ON_STACK(mask);
4581         KMP_CPU_ZERO(mask);
4582 
4583         // Granularity == thread
4584         if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4585             int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
4586             KMP_CPU_SET( osID, mask);
4587         } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4588             for( int i = 0; i < __kmp_nth_per_core; i++ ) {
4589                 int osID;
4590                 osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
4591                 KMP_CPU_SET( osID, mask);
4592             }
4593         }
4594         if (__kmp_affinity_verbose) {
4595             char buf[KMP_AFFIN_MASK_PRINT_LEN];
4596             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4597             KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4598               tid, buf);
4599         }
4600         __kmp_set_system_affinity( mask, TRUE );
4601         KMP_CPU_FREE_FROM_STACK(mask);
4602     } else { // Non-uniform topology
4603 
4604         kmp_affin_mask_t *mask;
4605         KMP_CPU_ALLOC_ON_STACK(mask);
4606         KMP_CPU_ZERO(mask);
4607 
        // Number of hyper threads per core on an HT machine
4609         int nth_per_core = __kmp_nThreadsPerCore;
4610         int core_level;
4611         if( nth_per_core > 1 ) {
4612             core_level = __kmp_aff_depth - 2;
4613         } else {
4614             core_level = __kmp_aff_depth - 1;
4615         }
4616 
        // Number of cores - maximum value; it does not count trailing cores with 0 processors
4618         int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
4619 
        // As a performance optimization, handle the special case nthreads == __kmp_avail_proc separately
4621         if( nthreads == __kmp_avail_proc ) {
4622             if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4623                 int osID = address2os[ tid ].second;
4624                 KMP_CPU_SET( osID, mask);
4625             } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4626                 int coreID = address2os[ tid ].first.labels[ core_level ];
                // Count the osIDs found for the current core; there can be at most
                // nth_per_core of them, and since address2os is sorted we can break
                // out once cnt == nth_per_core.
4629                 int cnt = 0;
4630                 for( int i = 0; i < __kmp_avail_proc; i++ ) {
4631                     int osID = address2os[ i ].second;
4632                     int core = address2os[ i ].first.labels[ core_level ];
4633                     if( core == coreID ) {
4634                         KMP_CPU_SET( osID, mask);
4635                         cnt++;
4636                         if( cnt == nth_per_core ) {
4637                             break;
4638                         }
4639                     }
4640                 }
4641             }
4642         } else if( nthreads <= __kmp_ncores ) {
4643 
4644             int core = 0;
4645             for( int i = 0; i < ncores; i++ ) {
                // Check whether this core has any available processor in procarr[]
4647                 int in_mask = 0;
4648                 for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
4650                         in_mask = 1;
4651                         break;
4652                     }
4653                 }
4654                 if( in_mask ) {
4655                     if( tid == core ) {
4656                         for( int j = 0; j < nth_per_core; j++ ) {
4657                             int osID = procarr[ i * nth_per_core + j ];
4658                             if( osID != -1 ) {
4659                                 KMP_CPU_SET( osID, mask );
4660                                 // For granularity=thread it is enough to set the first available osID for this core
4661                                 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4662                                     break;
4663                                 }
4664                             }
4665                         }
4666                         break;
4667                     } else {
4668                         core++;
4669                     }
4670                 }
4671             }
4672 
4673         } else { // nthreads > __kmp_ncores
4674 
            // Array to save the number of available processors on each core
            int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores);
            // Array to save the number of cores with exactly "x" available processors
            int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
            // Array to save the number of cores with between "x" and nth_per_core available processors
            int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
4681 
4682             for( int i = 0; i <= nth_per_core; i++ ) {
4683                 ncores_with_x_procs[ i ] = 0;
4684                 ncores_with_x_to_max_procs[ i ] = 0;
4685             }
4686 
4687             for( int i = 0; i < ncores; i++ ) {
4688                 int cnt = 0;
4689                 for( int j = 0; j < nth_per_core; j++ ) {
4690                     if( procarr[ i * nth_per_core + j ] != -1 ) {
4691                         cnt++;
4692                     }
4693                 }
4694                 nproc_at_core[ i ] = cnt;
4695                 ncores_with_x_procs[ cnt ]++;
4696             }
4697 
4698             for( int i = 0; i <= nth_per_core; i++ ) {
4699                 for( int j = i; j <= nth_per_core; j++ ) {
4700                     ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4701                 }
4702             }
4703 
            // Maximum number of processor slots (including empty contexts)
            int nproc = nth_per_core * ncores;
            // An array to keep the number of threads bound to each context
4707             int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4708             for( int i = 0; i < nproc; i++ ) {
4709                 newarr[ i ] = 0;
4710             }
4711 
4712             int nth = nthreads;
4713             int flag = 0;
4714             while( nth > 0 ) {
4715                 for( int j = 1; j <= nth_per_core; j++ ) {
4716                     int cnt = ncores_with_x_to_max_procs[ j ];
4717                     for( int i = 0; i < ncores; i++ ) {
                        // Skip cores with 0 available processors
4719                         if( nproc_at_core[ i ] == 0 ) {
4720                             continue;
4721                         }
4722                         for( int k = 0; k < nth_per_core; k++ ) {
4723                             if( procarr[ i * nth_per_core + k ] != -1 ) {
4724                                 if( newarr[ i * nth_per_core + k ] == 0 ) {
4725                                     newarr[ i * nth_per_core + k ] = 1;
4726                                     cnt--;
4727                                     nth--;
4728                                     break;
4729                                 } else {
4730                                     if( flag != 0 ) {
4731                                         newarr[ i * nth_per_core + k ] ++;
4732                                         cnt--;
4733                                         nth--;
4734                                         break;
4735                                     }
4736                                 }
4737                             }
4738                         }
4739                         if( cnt == 0 || nth == 0 ) {
4740                             break;
4741                         }
4742                     }
4743                     if( nth == 0 ) {
4744                         break;
4745                     }
4746                 }
4747                 flag = 1;
4748             }
4749             int sum = 0;
4750             for( int i = 0; i < nproc; i++ ) {
4751                 sum += newarr[ i ];
4752                 if( sum > tid ) {
4753                     // Granularity == thread
4754                     if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4755                         int osID = procarr[ i ];
4756                         KMP_CPU_SET( osID, mask);
4757                     } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4758                         int coreID = i / nth_per_core;
4759                         for( int ii = 0; ii < nth_per_core; ii++ ) {
4760                             int osID = procarr[ coreID * nth_per_core + ii ];
4761                             if( osID != -1 ) {
4762                                 KMP_CPU_SET( osID, mask);
4763                             }
4764                         }
4765                     }
4766                     break;
4767                 }
4768             }
4769             __kmp_free( newarr );
4770         }
4771 
4772         if (__kmp_affinity_verbose) {
4773             char buf[KMP_AFFIN_MASK_PRINT_LEN];
4774             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4775             KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4776               tid, buf);
4777         }
4778         __kmp_set_system_affinity( mask, TRUE );
4779         KMP_CPU_FREE_FROM_STACK(mask);
4780     }
4781 }
4782 
4783 #if KMP_OS_LINUX
// We don't need this entry point on Windows because
// the GetProcessAffinityMask() API is available there.
4786 //
4787 // The intended usage is indicated by these steps:
4788 // 1) The user gets the current affinity mask
4789 // 2) Then sets the affinity by calling this function
4790 // 3) Error check the return value
4791 // 4) Use non-OpenMP parallelization
// 5) Reset the affinity to what was stored in step 1)
// A usage sketch of these steps follows the function below.
4793 #ifdef __cplusplus
4794 extern "C"
4795 #endif
4796 int
4797 kmp_set_thread_affinity_mask_initial()
// The function returns 0 on success,
//   -1 if we cannot bind the thread,
//   >0 (errno) if an error happened during binding
4801 {
4802     int gtid = __kmp_get_gtid();
4803     if (gtid < 0) {
4804         // Do not touch non-omp threads
4805         KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
4806             "non-omp thread, returning\n"));
4807         return -1;
4808     }
4809     if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
4810         KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
4811             "affinity not initialized, returning\n"));
4812         return -1;
4813     }
4814     KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
4815         "set full mask for thread %d\n", gtid));
4816     KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
4817     return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
4818 }
4819 #endif
4820 
4821 #endif // KMP_AFFINITY_SUPPORTED
4822