1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_io.h"
19 #include "kmp_str.h"
20 #include "kmp_wrapper_getpid.h"
21 #include "kmp_affinity.h"
22 
23 // Store the real or imagined machine hierarchy here
24 static hierarchy_info machine_hierarchy;
25 
26 void __kmp_cleanup_hierarchy() {
27     machine_hierarchy.fini();
28 }
29 
30 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
31     kmp_uint32 depth;
    // The test below is true if affinity is available, but set to "none".
    // Need to init on first use of hierarchical barrier.
33     if (TCR_1(machine_hierarchy.uninitialized))
34         machine_hierarchy.init(NULL, nproc);
35 
    // Adjust the hierarchy in case the number of threads exceeds the original maximum
37     if (nproc > machine_hierarchy.base_num_threads)
38         machine_hierarchy.resize(nproc);
39 
40     depth = machine_hierarchy.depth;
41     KMP_DEBUG_ASSERT(depth > 0);
42 
43     thr_bar->depth = depth;
44     thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
45     thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
46 }
47 
48 #if KMP_AFFINITY_SUPPORTED
49 
50 //
51 // Print the affinity mask to the character array in a pretty format.
52 //
53 #if KMP_USE_HWLOC
54 char *
55 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
56 {
57     int num_chars_to_write, num_chars_written;
58     char* scan;
59     KMP_ASSERT(buf_len >= 40);
60 
61     // bufsize of 0 just retrieves the needed buffer size.
62     num_chars_to_write = hwloc_bitmap_list_snprintf(buf, 0, (hwloc_bitmap_t)mask);
63 
64     // need '{', "xxxxxxxx...xx", '}', '\0' = num_chars_to_write + 3 bytes
65     // * num_chars_to_write returned by hwloc_bitmap_list_snprintf does not
66     //   take into account the '\0' character.
67     if(hwloc_bitmap_iszero((hwloc_bitmap_t)mask)) {
68         KMP_SNPRINTF(buf, buf_len, "{<empty>}");
69     } else if(num_chars_to_write < buf_len - 3) {
70         // no problem fitting the mask into buf_len number of characters
71         buf[0] = '{';
72         // use buf_len-3 because we have the three characters: '{' '}' '\0' to add to the buffer
73         num_chars_written = hwloc_bitmap_list_snprintf(buf+1, buf_len-3, (hwloc_bitmap_t)mask);
74         buf[num_chars_written+1] = '}';
75         buf[num_chars_written+2] = '\0';
76     } else {
77         // Need to truncate the affinity mask string and add ellipsis.
78         // To do this, we first write out the '{' + str(mask)
79         buf[0] = '{';
80         hwloc_bitmap_list_snprintf(buf+1, buf_len-1, (hwloc_bitmap_t)mask);
        // Then, starting at the 7th-to-last character, scan backwards until we
        // are NOT on a digit and write "...}\0" there.  This gives a clean
        // ellipsis and avoids overwriting part of an affinity number, i.e., we
        // get { 45, 67,...} instead of something like { 45, 67, 8...}.
85         scan = buf + buf_len - 7;
        while(scan >= buf && *scan >= '0' && *scan <= '9')
87             scan--;
88         *(scan+1) = '.';
89         *(scan+2) = '.';
90         *(scan+3) = '.';
91         *(scan+4) = '}';
92         *(scan+5) = '\0';
93     }
94     return buf;
95 }
96 #else
97 char *
98 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
99 {
100     KMP_ASSERT(buf_len >= 40);
101     char *scan = buf;
102     char *end = buf + buf_len - 1;
103 
104     //
105     // Find first element / check for empty set.
106     //
107     size_t i;
108     for (i = 0; i < KMP_CPU_SETSIZE; i++) {
109         if (KMP_CPU_ISSET(i, mask)) {
110             break;
111         }
112     }
113     if (i == KMP_CPU_SETSIZE) {
114         KMP_SNPRINTF(scan, end-scan+1, "{<empty>}");
115         while (*scan != '\0') scan++;
116         KMP_ASSERT(scan <= end);
117         return buf;
118     }
119 
120     KMP_SNPRINTF(scan, end-scan+1, "{%ld", (long)i);
121     while (*scan != '\0') scan++;
122     i++;
123     for (; i < KMP_CPU_SETSIZE; i++) {
124         if (! KMP_CPU_ISSET(i, mask)) {
125             continue;
126         }
127 
128         //
129         // Check for buffer overflow.  A string of the form ",<n>" will have
130         // at most 10 characters, plus we want to leave room to print ",...}"
131         // if the set is too large to print for a total of 15 characters.
132         // We already left room for '\0' in setting end.
133         //
134         if (end - scan < 15) {
135            break;
136         }
137         KMP_SNPRINTF(scan, end-scan+1, ",%-ld", (long)i);
138         while (*scan != '\0') scan++;
139     }
140     if (i < KMP_CPU_SETSIZE) {
141         KMP_SNPRINTF(scan, end-scan+1,  ",...");
142         while (*scan != '\0') scan++;
143     }
144     KMP_SNPRINTF(scan, end-scan+1, "}");
145     while (*scan != '\0') scan++;
146     KMP_ASSERT(scan <= end);
147     return buf;
148 }
149 #endif // KMP_USE_HWLOC
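
//
// For illustration: the fallback printer above renders a mask containing
// procs 0, 1, and 2 as "{0,1,2}" and an empty mask as "{<empty>}"; the hwloc
// variant may instead emit list ranges (e.g. "{0-2}").  Either way, a set
// too large for the buffer is truncated and ends with "...}".
//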
150 
151 
152 void
153 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
154 {
155     KMP_CPU_ZERO(mask);
156 
157 # if KMP_GROUP_AFFINITY
158 
159     if (__kmp_num_proc_groups > 1) {
160         int group;
161         KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
162         for (group = 0; group < __kmp_num_proc_groups; group++) {
163             int i;
164             int num = __kmp_GetActiveProcessorCount(group);
165             for (i = 0; i < num; i++) {
166                 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
167             }
168         }
169     }
170     else
171 
172 # endif /* KMP_GROUP_AFFINITY */
173 
174     {
175         int proc;
176         for (proc = 0; proc < __kmp_xproc; proc++) {
177             KMP_CPU_SET(proc, mask);
178         }
179     }
180 }
181 
182 //
183 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
184 // called to renumber the labels from [0..n] and place them into the child_num
185 // vector of the address object.  This is done in case the labels used for
186 // the children at one node of the hierarchy differ from those used for
187 // another node at the same level.  Example:  suppose the machine has 2 nodes
188 // with 2 packages each.  The first node contains packages 601 and 602, and
189 // second node contains packages 603 and 604.  If we try to sort the table
190 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
191 // because we are paying attention to the labels themselves, not the ordinal
192 // child numbers.  By using the child numbers in the sort, the result is
193 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
194 //
195 static void
196 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
197   int numAddrs)
198 {
199     KMP_DEBUG_ASSERT(numAddrs > 0);
200     int depth = address2os->first.depth;
201     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
202     unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
203       * sizeof(unsigned));
204     int labCt;
205     for (labCt = 0; labCt < depth; labCt++) {
206         address2os[0].first.childNums[labCt] = counts[labCt] = 0;
207         lastLabel[labCt] = address2os[0].first.labels[labCt];
208     }
209     int i;
210     for (i = 1; i < numAddrs; i++) {
211         for (labCt = 0; labCt < depth; labCt++) {
212             if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
213                 int labCt2;
214                 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
215                     counts[labCt2] = 0;
216                     lastLabel[labCt2] = address2os[i].first.labels[labCt2];
217                 }
218                 counts[labCt]++;
219                 lastLabel[labCt] = address2os[i].first.labels[labCt];
220                 break;
221             }
222         }
223         for (labCt = 0; labCt < depth; labCt++) {
224             address2os[i].first.childNums[labCt] = counts[labCt];
225         }
226         for (; labCt < (int)Address::maxDepth; labCt++) {
227             address2os[i].first.childNums[labCt] = 0;
228         }
229     }
230     __kmp_free(lastLabel);
231     __kmp_free(counts);
232 }
233 
234 
235 //
236 // All of the __kmp_affinity_create_*_map() routines should set
237 // __kmp_affinity_masks to a vector of affinity mask objects of length
238 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
239 // return the number of levels in the machine topology tree (zero if
240 // __kmp_affinity_type == affinity_none).
241 //
// All of the __kmp_affinity_create_*_map() routines should set
// *__kmp_affin_fullMask to the affinity mask for the initialization thread.
// They need to save and restore the mask anyway, and it could be needed
// later, so saving it here is just an optimization to avoid calling
// __kmp_get_system_affinity() again.
//
247 kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
248 
249 static int nCoresPerPkg, nPackages;
250 static int __kmp_nThreadsPerCore;
251 #ifndef KMP_DFLT_NTH_CORES
252 static int __kmp_ncores;
253 #endif
254 static int *__kmp_pu_os_idx = NULL;
255 
256 //
257 // __kmp_affinity_uniform_topology() doesn't work when called from
258 // places which support arbitrarily many levels in the machine topology
259 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
260 // __kmp_affinity_create_x2apicid_map().
261 //
262 inline static bool
263 __kmp_affinity_uniform_topology()
264 {
265     return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
266 }
267 
268 
269 //
270 // Print out the detailed machine topology map, i.e. the physical locations
271 // of each OS proc.
272 //
273 static void
274 __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
275   int pkgLevel, int coreLevel, int threadLevel)
276 {
277     int proc;
278 
279     KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
280     for (proc = 0; proc < len; proc++) {
281         int level;
282         kmp_str_buf_t buf;
283         __kmp_str_buf_init(&buf);
284         for (level = 0; level < depth; level++) {
285             if (level == threadLevel) {
286                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
287             }
288             else if (level == coreLevel) {
289                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
290             }
291             else if (level == pkgLevel) {
292                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
293             }
294             else if (level > pkgLevel) {
295                 __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
296                   level - pkgLevel - 1);
297             }
298             else {
299                 __kmp_str_buf_print(&buf, "L%d ", level);
300             }
301             __kmp_str_buf_print(&buf, "%d ",
302               address2os[proc].first.labels[level]);
303         }
304         KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
305           buf.str);
306         __kmp_str_buf_free(&buf);
307     }
308 }
309 
310 #if KMP_USE_HWLOC
311 
// This function removes the topology levels that are radix 1 and offer no
// further information about the topology.  The most common example is one
// thread context per core: the extra thread-context level provides no unique
// labels, so it is removed.
// return value: the new depth of address2os
317 static int
318 __kmp_affinity_remove_radix_one_levels(AddrUnsPair *address2os, int nActiveThreads, int depth, int* pkgLevel, int* coreLevel, int* threadLevel) {
319     int level;
320     int i;
321     int radix1_detected;
322 
323     for (level = depth-1; level >= 0; --level) {
324         // Always keep the package level
325         if (level == *pkgLevel)
326             continue;
327         // Detect if this level is radix 1
328         radix1_detected = 1;
329         for (i = 1; i < nActiveThreads; ++i) {
330             if (address2os[0].first.labels[level] != address2os[i].first.labels[level]) {
331                 // There are differing label values for this level so it stays
332                 radix1_detected = 0;
333                 break;
334             }
335         }
336         if (!radix1_detected)
337             continue;
338         // Radix 1 was detected
339         if (level == *threadLevel) {
340             // If only one thread per core, then just decrement
341             // the depth which removes the threadlevel from address2os
342             for (i = 0; i < nActiveThreads; ++i) {
343                 address2os[i].first.depth--;
344             }
345             *threadLevel = -1;
346         } else if (level == *coreLevel) {
347             // For core level, we move the thread labels over if they are still
348             // valid (*threadLevel != -1), and also reduce the depth another level
349             for (i = 0; i < nActiveThreads; ++i) {
350                 if (*threadLevel != -1) {
351                     address2os[i].first.labels[*coreLevel] = address2os[i].first.labels[*threadLevel];
352                 }
353                 address2os[i].first.depth--;
354             }
355             *coreLevel = -1;
356         }
357     }
358     return address2os[0].first.depth;
359 }
360 
361 // Returns the number of objects of type 'type' below 'obj' within the topology tree structure.
362 // e.g., if obj is a HWLOC_OBJ_SOCKET object, and type is HWLOC_OBJ_PU, then
363 //  this will return the number of PU's under the SOCKET object.
364 static int
365 __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj, hwloc_obj_type_t type) {
366     int retval = 0;
367     hwloc_obj_t first;
368     for(first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type, obj->logical_index, type, 0);
369         first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) == obj;
370         first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type, first))
371     {
372         ++retval;
373     }
374     return retval;
375 }
376 
377 static int
378 __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
379   kmp_i18n_id_t *const msg_id)
380 {
381     *address2os = NULL;
382     *msg_id = kmp_i18n_null;
383 
384     //
385     // Save the affinity mask for the current thread.
386     //
387     kmp_affin_mask_t *oldMask;
388     KMP_CPU_ALLOC(oldMask);
389     __kmp_get_system_affinity(oldMask, TRUE);
390 
391     int depth = 3;
392     int pkgLevel = 0;
393     int coreLevel = 1;
394     int threadLevel = 2;
395 
396     if (! KMP_AFFINITY_CAPABLE())
397     {
398         //
399         // Hack to try and infer the machine topology using only the data
400         // available from cpuid on the current thread, and __kmp_xproc.
401         //
402         KMP_ASSERT(__kmp_affinity_type == affinity_none);
403 
404         nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, 0), HWLOC_OBJ_CORE);
405         __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, 0), HWLOC_OBJ_PU);
406         __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
407         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
408         if (__kmp_affinity_verbose) {
409             KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
410             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
411             if (__kmp_affinity_uniform_topology()) {
412                 KMP_INFORM(Uniform, "KMP_AFFINITY");
413             } else {
414                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
415             }
416             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
417               __kmp_nThreadsPerCore, __kmp_ncores);
418         }
419         KMP_CPU_FREE(oldMask);
420         return 0;
421     }
422 
423     //
424     // Allocate the data structure to be returned.
425     //
426     AddrUnsPair *retval = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
427     __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
428 
429     //
430     // When affinity is off, this routine will still be called to set
431     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
432     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
433     // correctly, and return if affinity is not enabled.
434     //
435 
436     hwloc_obj_t pu;
437     hwloc_obj_t core;
438     hwloc_obj_t socket;
439     int nActiveThreads = 0;
440     int socket_identifier = 0;
441     // re-calculate globals to count only accessible resources
442     __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
443     for(socket = hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, 0);
444         socket != NULL;
445         socket = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, socket),
446         socket_identifier++)
447     {
448         int core_identifier = 0;
449         int num_active_cores = 0;
450         for(core = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, socket->type, socket->logical_index, HWLOC_OBJ_CORE, 0);
451             core != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, socket->type, core) == socket;
452             core = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, core),
453             core_identifier++)
454         {
455             int pu_identifier = 0;
456             int num_active_threads = 0;
457             for(pu = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, core->type, core->logical_index, HWLOC_OBJ_PU, 0);
458                 pu != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, core->type, pu) == core;
459                 pu = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PU, pu),
460                 pu_identifier++)
461             {
462                 Address addr(3);
463                 if(! KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
464                     continue;         // skip inactive (inaccessible) unit
465                 KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
466                     socket->os_index, socket->logical_index, core->os_index, core->logical_index, pu->os_index,pu->logical_index));
467                 addr.labels[0] = socket_identifier; // package
468                 addr.labels[1] = core_identifier; // core
469                 addr.labels[2] = pu_identifier; // pu
470                 retval[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
471                 __kmp_pu_os_idx[nActiveThreads] = pu->os_index; // keep os index for each active pu
472                 nActiveThreads++;
473                 ++num_active_threads; // count active threads per core
474             }
475             if (num_active_threads) { // were there any active threads on the core?
476                 ++__kmp_ncores;       // count total active cores
477                 ++num_active_cores;   // count active cores per socket
478                 if (num_active_threads > __kmp_nThreadsPerCore)
479                     __kmp_nThreadsPerCore = num_active_threads; // calc maximum
480             }
481         }
482         if (num_active_cores) {       // were there any active cores on the socket?
483             ++nPackages;              // count total active packages
484             if (num_active_cores > nCoresPerPkg)
485                 nCoresPerPkg = num_active_cores; // calc maximum
486         }
487     }
488 
489     //
490     // If there's only one thread context to bind to, return now.
491     //
492     KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
493     KMP_ASSERT(nActiveThreads > 0);
494     if (nActiveThreads == 1) {
495         __kmp_ncores = nPackages = 1;
496         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
497         if (__kmp_affinity_verbose) {
498             char buf[KMP_AFFIN_MASK_PRINT_LEN];
499             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
500 
501             KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
502             if (__kmp_affinity_respect_mask) {
503                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
504             } else {
505                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
506             }
507             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
508             KMP_INFORM(Uniform, "KMP_AFFINITY");
509             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
510               __kmp_nThreadsPerCore, __kmp_ncores);
511         }
512 
513         if (__kmp_affinity_type == affinity_none) {
514             __kmp_free(retval);
515             KMP_CPU_FREE(oldMask);
516             return 0;
517         }
518 
519         //
520         // Form an Address object which only includes the package level.
521         //
522         Address addr(1);
523         addr.labels[0] = retval[0].first.labels[pkgLevel];
524         retval[0].first = addr;
525 
526         if (__kmp_affinity_gran_levels < 0) {
527             __kmp_affinity_gran_levels = 0;
528         }
529 
530         if (__kmp_affinity_verbose) {
531             __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
532         }
533 
534         *address2os = retval;
535         KMP_CPU_FREE(oldMask);
536         return 1;
537     }
538 
539     //
540     // Sort the table by physical Id.
541     //
542     qsort(retval, nActiveThreads, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
543 
544     //
545     // Check to see if the machine topology is uniform
546     //
547     unsigned uniform = (nPackages * nCoresPerPkg * __kmp_nThreadsPerCore == nActiveThreads);
548 
549     //
550     // Print the machine topology summary.
551     //
552     if (__kmp_affinity_verbose) {
553         char mask[KMP_AFFIN_MASK_PRINT_LEN];
554         __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
555 
556         KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
557         if (__kmp_affinity_respect_mask) {
558             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
559         } else {
560             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
561         }
562         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
563         if (uniform) {
564             KMP_INFORM(Uniform, "KMP_AFFINITY");
565         } else {
566             KMP_INFORM(NonUniform, "KMP_AFFINITY");
567         }
568 
569         kmp_str_buf_t buf;
570         __kmp_str_buf_init(&buf);
571 
572         __kmp_str_buf_print(&buf, "%d", nPackages);
        //for (level = 1; level <= pkgLevel; level++) {
        //    __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
        //}
576         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
577           __kmp_nThreadsPerCore, __kmp_ncores);
578 
579         __kmp_str_buf_free(&buf);
580     }
581 
582     if (__kmp_affinity_type == affinity_none) {
583         __kmp_free(retval);
584         KMP_CPU_FREE(oldMask);
585         return 0;
586     }
587 
588     //
    // Find any levels with radix 1, and remove them from the map
590     // (except for the package level).
591     //
592     depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth, &pkgLevel, &coreLevel, &threadLevel);
593 
594     if (__kmp_affinity_gran_levels < 0) {
595         //
596         // Set the granularity level based on what levels are modeled
597         // in the machine topology map.
598         //
599         __kmp_affinity_gran_levels = 0;
600         if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
601             __kmp_affinity_gran_levels++;
602         }
603         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
604             __kmp_affinity_gran_levels++;
605         }
606         if (__kmp_affinity_gran > affinity_gran_package) {
607             __kmp_affinity_gran_levels++;
608         }
609     }
610 
611     if (__kmp_affinity_verbose) {
612         __kmp_affinity_print_topology(retval, nActiveThreads, depth, pkgLevel,
613           coreLevel, threadLevel);
614     }
615 
616     KMP_CPU_FREE(oldMask);
617     *address2os = retval;
618     return depth;
619 }
620 #endif // KMP_USE_HWLOC
621 
622 //
623 // If we don't know how to retrieve the machine's processor topology, or
624 // encounter an error in doing so, this routine is called to form a "flat"
625 // mapping of os thread id's <-> processor id's.
626 //
627 static int
628 __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
629   kmp_i18n_id_t *const msg_id)
630 {
631     *address2os = NULL;
632     *msg_id = kmp_i18n_null;
633 
634     //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
638     //
639     if (! KMP_AFFINITY_CAPABLE()) {
640         KMP_ASSERT(__kmp_affinity_type == affinity_none);
641         __kmp_ncores = nPackages = __kmp_xproc;
642         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
643         if (__kmp_affinity_verbose) {
644             KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
645             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
646             KMP_INFORM(Uniform, "KMP_AFFINITY");
647             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
648               __kmp_nThreadsPerCore, __kmp_ncores);
649         }
650         return 0;
651     }
652 
653     //
654     // When affinity is off, this routine will still be called to set
655     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
656     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
657     //  correctly, and return now if affinity is not enabled.
658     //
659     __kmp_ncores = nPackages = __kmp_avail_proc;
660     __kmp_nThreadsPerCore = nCoresPerPkg = 1;
661     if (__kmp_affinity_verbose) {
662         char buf[KMP_AFFIN_MASK_PRINT_LEN];
663         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, __kmp_affin_fullMask);
664 
665         KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
666         if (__kmp_affinity_respect_mask) {
667             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
668         } else {
669             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
670         }
671         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
672         KMP_INFORM(Uniform, "KMP_AFFINITY");
673         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
674           __kmp_nThreadsPerCore, __kmp_ncores);
675     }
676     KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
677     __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
678     if (__kmp_affinity_type == affinity_none) {
679         int avail_ct = 0;
680         unsigned int i;
681         KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
682             if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask))
683                 continue;
684             __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
685         }
686         return 0;
687     }
688 
689     //
    // Construct the data structure to be returned.
691     //
692     *address2os = (AddrUnsPair*)
693       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
694     int avail_ct = 0;
695     unsigned int i;
696     KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
697         //
698         // Skip this proc if it is not included in the machine model.
699         //
700         if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
701             continue;
702         }
703         __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
704         Address addr(1);
705         addr.labels[0] = i;
706         (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
707     }
708     if (__kmp_affinity_verbose) {
709         KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
710     }
711 
712     if (__kmp_affinity_gran_levels < 0) {
713         //
714         // Only the package level is modeled in the machine topology map,
715         // so the #levels of granularity is either 0 or 1.
716         //
717         if (__kmp_affinity_gran > affinity_gran_package) {
718             __kmp_affinity_gran_levels = 1;
719         }
720         else {
721             __kmp_affinity_gran_levels = 0;
722         }
723     }
724     return 1;
725 }
726 
727 
728 # if KMP_GROUP_AFFINITY
729 
730 //
731 // If multiple Windows* OS processor groups exist, we can create a 2-level
732 // topology map with the groups at level 0 and the individual procs at
733 // level 1.
734 //
735 // This facilitates letting the threads float among all procs in a group,
736 // if granularity=group (the default when there are multiple groups).
737 //
738 static int
739 __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
740   kmp_i18n_id_t *const msg_id)
741 {
742     *address2os = NULL;
743     *msg_id = kmp_i18n_null;
744 
745     //
746     // If we don't have multiple processor groups, return now.
747     // The flat mapping will be used.
748     //
749     if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(__kmp_affin_fullMask) >= 0)) {
750         // FIXME set *msg_id
751         return -1;
752     }
753 
754     //
    // Construct the data structure to be returned.
756     //
757     *address2os = (AddrUnsPair*)
758       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
759     KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
760     __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
761     int avail_ct = 0;
762     int i;
763     KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
764         //
765         // Skip this proc if it is not included in the machine model.
766         //
767         if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
768             continue;
769         }
770         __kmp_pu_os_idx[avail_ct] = i;  // suppose indices are flat
771         Address addr(2);
772         addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
773         addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
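        // For example, on a hypothetical Windows system with 64-bit group
        // masks (CHAR_BIT * sizeof(DWORD_PTR) == 64), os proc 70 maps to
        // group 70 / 64 == 1 and in-group proc 70 % 64 == 6.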
774         (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
775 
776         if (__kmp_affinity_verbose) {
777             KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
778               addr.labels[1]);
779         }
780     }
781 
782     if (__kmp_affinity_gran_levels < 0) {
783         if (__kmp_affinity_gran == affinity_gran_group) {
784             __kmp_affinity_gran_levels = 1;
785         }
786         else if ((__kmp_affinity_gran == affinity_gran_fine)
787           || (__kmp_affinity_gran == affinity_gran_thread)) {
788             __kmp_affinity_gran_levels = 0;
789         }
790         else {
791             const char *gran_str = NULL;
792             if (__kmp_affinity_gran == affinity_gran_core) {
793                 gran_str = "core";
794             }
795             else if (__kmp_affinity_gran == affinity_gran_package) {
796                 gran_str = "package";
797             }
798             else if (__kmp_affinity_gran == affinity_gran_node) {
799                 gran_str = "node";
800             }
801             else {
802                 KMP_ASSERT(0);
803             }
804 
            // Warning: can't use affinity granularity "gran" with group topology method, using "thread"
806             __kmp_affinity_gran_levels = 0;
807         }
808     }
809     return 2;
810 }
811 
812 # endif /* KMP_GROUP_AFFINITY */
813 
814 
815 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
816 
817 static int
818 __kmp_cpuid_mask_width(int count) {
819     int r = 0;
820 
821     while((1<<r) < count)
822         ++r;
823     return r;
824 }
825 
826 
827 class apicThreadInfo {
828 public:
829     unsigned osId;              // param to __kmp_affinity_bind_thread
830     unsigned apicId;            // from cpuid after binding
831     unsigned maxCoresPerPkg;    //      ""
832     unsigned maxThreadsPerPkg;  //      ""
833     unsigned pkgId;             // inferred from above values
834     unsigned coreId;            //      ""
835     unsigned threadId;          //      ""
836 };
837 
838 
839 static int
840 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
841 {
842     const apicThreadInfo *aa = (const apicThreadInfo *)a;
843     const apicThreadInfo *bb = (const apicThreadInfo *)b;
844     if (aa->osId < bb->osId) return -1;
845     if (aa->osId > bb->osId) return 1;
846     return 0;
847 }
848 
849 
850 static int
851 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
852 {
853     const apicThreadInfo *aa = (const apicThreadInfo *)a;
854     const apicThreadInfo *bb = (const apicThreadInfo *)b;
855     if (aa->pkgId < bb->pkgId) return -1;
856     if (aa->pkgId > bb->pkgId) return 1;
857     if (aa->coreId < bb->coreId) return -1;
858     if (aa->coreId > bb->coreId) return 1;
859     if (aa->threadId < bb->threadId) return -1;
860     if (aa->threadId > bb->threadId) return 1;
861     return 0;
862 }
863 
864 
865 //
866 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
867 // an algorithm which cycles through the available os threads, setting
868 // the current thread's affinity mask to that thread, and then retrieves
869 // the Apic Id for each thread context using the cpuid instruction.
870 //
871 static int
872 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
873   kmp_i18n_id_t *const msg_id)
874 {
875     kmp_cpuid buf;
876     int rc;
877     *address2os = NULL;
878     *msg_id = kmp_i18n_null;
879 
880     //
881     // Check if cpuid leaf 4 is supported.
882     //
883         __kmp_x86_cpuid(0, 0, &buf);
884         if (buf.eax < 4) {
885             *msg_id = kmp_i18n_str_NoLeaf4Support;
886             return -1;
887         }
888 
889     //
890     // The algorithm used starts by setting the affinity to each available
891     // thread and retrieving info from the cpuid instruction, so if we are
892     // not capable of calling __kmp_get_system_affinity() and
893     // _kmp_get_system_affinity(), then we need to do something else - use
894     // the defaults that we calculated from issuing cpuid without binding
895     // to each proc.
896     //
897     if (! KMP_AFFINITY_CAPABLE()) {
898         //
899         // Hack to try and infer the machine topology using only the data
900         // available from cpuid on the current thread, and __kmp_xproc.
901         //
902         KMP_ASSERT(__kmp_affinity_type == affinity_none);
903 
904         //
905         // Get an upper bound on the number of threads per package using
906         // cpuid(1).
907         //
        // On some OS/chip combinations where HT is supported by the chip
        // but is disabled, this value will be 2 on a single core chip.
        // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
911         //
912         __kmp_x86_cpuid(1, 0, &buf);
913         int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
914         if (maxThreadsPerPkg == 0) {
915             maxThreadsPerPkg = 1;
916         }
917 
918         //
919         // The num cores per pkg comes from cpuid(4).
920         // 1 must be added to the encoded value.
921         //
        // The author of cpu_count.cpp treated this as only an upper bound
        // on the number of cores, but I haven't seen any cases where it
        // was greater than the actual number of cores, so we will treat
        // it as exact in this block of code.
926         //
927         // First, we need to check if cpuid(4) is supported on this chip.
928         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
929         // has the value n or greater.
930         //
931         __kmp_x86_cpuid(0, 0, &buf);
932         if (buf.eax >= 4) {
933             __kmp_x86_cpuid(4, 0, &buf);
934             nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
935         }
936         else {
937             nCoresPerPkg = 1;
938         }
939 
940         //
        // There is no way to reliably tell if HT is enabled without issuing
        // the cpuid instruction from every thread and correlating the cpuid
        // info, so if the machine is not affinity capable, we assume that HT
        // is off.  We have seen quite a few machines where maxThreadsPerPkg
        // is 2, yet the machine does not support HT.
        //
        // - Older OSes are usually found on machines with older chips, which
        //   do not support HT.
        //
        // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly
        //   set to 0) is greater than the penalty for mistakenly identifying
        //   a machine as being 1 thread/core when it is really HT enabled
        //   (which results in blocktime being incorrectly set to a positive
        //   value).
956         //
957         __kmp_ncores = __kmp_xproc;
958         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
959         __kmp_nThreadsPerCore = 1;
960         if (__kmp_affinity_verbose) {
961             KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
962             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
963             if (__kmp_affinity_uniform_topology()) {
964                 KMP_INFORM(Uniform, "KMP_AFFINITY");
965             } else {
966                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
967             }
968             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
969               __kmp_nThreadsPerCore, __kmp_ncores);
970         }
971         return 0;
972     }
973 
974     //
975     //
976     // From here on, we can assume that it is safe to call
977     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
978     // even if __kmp_affinity_type = affinity_none.
979     //
980 
981     //
982     // Save the affinity mask for the current thread.
983     //
984     kmp_affin_mask_t *oldMask;
985     KMP_CPU_ALLOC(oldMask);
986     KMP_ASSERT(oldMask != NULL);
987     __kmp_get_system_affinity(oldMask, TRUE);
988 
989     //
990     // Run through each of the available contexts, binding the current thread
991     // to it, and obtaining the pertinent information using the cpuid instr.
992     //
993     // The relevant information is:
994     //
    // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //    has a unique Apic Id, which is of the form pkg# : core# : thread#.
997     //
    // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
    //    value of this field determines the width of the core# + thread#
    //    fields in the Apic Id.  It is also an upper bound on the number
    //    of threads per package, but it has been verified that situations
    //    happen where it is not exact.  In particular, on certain OS/chip
    //    combinations where Intel(R) Hyper-Threading Technology is supported
    //    by the chip but has been disabled, the value of this field will be 2
    //    (for a single core chip).  On other OS/chip combinations supporting
    //    Intel(R) Hyper-Threading Technology, the value of this field will be
    //    1 when Intel(R) Hyper-Threading Technology is disabled and 2 when it
    //    is enabled.
1010     //
1011     // Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4).  The
1012     //    value of this field (+1) determines the width of the core# field in
1013     //    the Apic Id.  The comments in "cpucount.cpp" say that this value is
1014     //    an upper bound, but the IA-32 architecture manual says that it is
1015     //    exactly the number of cores per package, and I haven't seen any
1016     //    case where it wasn't.
1017     //
1018     // From this information, deduce the package Id, core Id, and thread Id,
1019     // and set the corresponding fields in the apicThreadInfo struct.
1020     //
1021     unsigned i;
1022     apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
1023       __kmp_avail_proc * sizeof(apicThreadInfo));
1024     unsigned nApics = 0;
1025     KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
1026         //
1027         // Skip this proc if it is not included in the machine model.
1028         //
1029         if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
1030             continue;
1031         }
1032         KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
1033 
1034         __kmp_affinity_bind_thread(i);
1035         threadInfo[nApics].osId = i;
1036 
1037         //
1038         // The apic id and max threads per pkg come from cpuid(1).
1039         //
1040         __kmp_x86_cpuid(1, 0, &buf);
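        // cpuid(1) edx bit 9 is the APIC on-chip feature flag; if it is
        // clear, no Apic Id can be read at all.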
        if (!((buf.edx >> 9) & 1)) {
1042             __kmp_set_system_affinity(oldMask, TRUE);
1043             __kmp_free(threadInfo);
1044             KMP_CPU_FREE(oldMask);
1045             *msg_id = kmp_i18n_str_ApicNotPresent;
1046             return -1;
1047         }
1048         threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
1049         threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
1050         if (threadInfo[nApics].maxThreadsPerPkg == 0) {
1051             threadInfo[nApics].maxThreadsPerPkg = 1;
1052         }
1053 
1054         //
1055         // Max cores per pkg comes from cpuid(4).
1056         // 1 must be added to the encoded value.
1057         //
1058         // First, we need to check if cpuid(4) is supported on this chip.
1059         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
1060         // has the value n or greater.
1061         //
1062         __kmp_x86_cpuid(0, 0, &buf);
1063         if (buf.eax >= 4) {
1064             __kmp_x86_cpuid(4, 0, &buf);
1065             threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1066         }
1067         else {
1068             threadInfo[nApics].maxCoresPerPkg = 1;
1069         }
1070 
1071         //
1072         // Infer the pkgId / coreId / threadId using only the info
1073         // obtained locally.
1074         //
1075         int widthCT = __kmp_cpuid_mask_width(
1076           threadInfo[nApics].maxThreadsPerPkg);
1077         threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
1078 
1079         int widthC = __kmp_cpuid_mask_width(
1080           threadInfo[nApics].maxCoresPerPkg);
1081         int widthT = widthCT - widthC;
1082         if (widthT < 0) {
1083             //
1084             // I've never seen this one happen, but I suppose it could, if
1085             // the cpuid instruction on a chip was really screwed up.
1086             // Make sure to restore the affinity mask before the tail call.
1087             //
1088             __kmp_set_system_affinity(oldMask, TRUE);
1089             __kmp_free(threadInfo);
1090             KMP_CPU_FREE(oldMask);
1091             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1092             return -1;
1093         }
1094 
        int maskC = (1 << widthC) - 1;
        threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
          & maskC;

        int maskT = (1 << widthT) - 1;
        threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;
1101 
1102         nApics++;
1103     }
1104 
1105     //
1106     // We've collected all the info we need.
1107     // Restore the old affinity mask for this thread.
1108     //
1109     __kmp_set_system_affinity(oldMask, TRUE);
1110 
1111     //
1112     // If there's only one thread context to bind to, form an Address object
1113     // with depth 1 and return immediately (or, if affinity is off, set
1114     // address2os to NULL and return).
1115     //
1116     // If it is configured to omit the package level when there is only a
1117     // single package, the logic at the end of this routine won't work if
1118     // there is only a single thread - it would try to form an Address
1119     // object with depth 0.
1120     //
1121     KMP_ASSERT(nApics > 0);
1122     if (nApics == 1) {
1123         __kmp_ncores = nPackages = 1;
1124         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1125         if (__kmp_affinity_verbose) {
1126             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1127             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1128 
1129             KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1130             if (__kmp_affinity_respect_mask) {
1131                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1132             } else {
1133                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1134             }
1135             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1136             KMP_INFORM(Uniform, "KMP_AFFINITY");
1137             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1138               __kmp_nThreadsPerCore, __kmp_ncores);
1139         }
1140 
1141         if (__kmp_affinity_type == affinity_none) {
1142             __kmp_free(threadInfo);
1143             KMP_CPU_FREE(oldMask);
1144             return 0;
1145         }
1146 
1147         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
1148         Address addr(1);
1149         addr.labels[0] = threadInfo[0].pkgId;
1150         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
1151 
1152         if (__kmp_affinity_gran_levels < 0) {
1153             __kmp_affinity_gran_levels = 0;
1154         }
1155 
1156         if (__kmp_affinity_verbose) {
1157             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1158         }
1159 
1160         __kmp_free(threadInfo);
1161         KMP_CPU_FREE(oldMask);
1162         return 1;
1163     }
1164 
1165     //
1166     // Sort the threadInfo table by physical Id.
1167     //
1168     qsort(threadInfo, nApics, sizeof(*threadInfo),
1169       __kmp_affinity_cmp_apicThreadInfo_phys_id);
1170 
1171     //
1172     // The table is now sorted by pkgId / coreId / threadId, but we really
1173     // don't know the radix of any of the fields.  pkgId's may be sparsely
1174     // assigned among the chips on a system.  Although coreId's are usually
1175     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1176     // [0..threadsPerCore-1], we don't want to make any such assumptions.
1177     //
1178     // For that matter, we don't know what coresPerPkg and threadsPerCore
1179     // (or the total # packages) are at this point - we want to determine
1180     // that now.  We only have an upper bound on the first two figures.
1181     //
1182     // We also perform a consistency check at this point: the values returned
1183     // by the cpuid instruction for any thread bound to a given package had
1184     // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1185     //
1186     nPackages = 1;
1187     nCoresPerPkg = 1;
1188     __kmp_nThreadsPerCore = 1;
1189     unsigned nCores = 1;
1190 
1191     unsigned pkgCt = 1;                         // to determine radii
1192     unsigned lastPkgId = threadInfo[0].pkgId;
1193     unsigned coreCt = 1;
1194     unsigned lastCoreId = threadInfo[0].coreId;
1195     unsigned threadCt = 1;
1196     unsigned lastThreadId = threadInfo[0].threadId;
1197 
                                                // intra-pkg consistency checks
1199     unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1200     unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1201 
1202     for (i = 1; i < nApics; i++) {
1203         if (threadInfo[i].pkgId != lastPkgId) {
1204             nCores++;
1205             pkgCt++;
1206             lastPkgId = threadInfo[i].pkgId;
1207             if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1208             coreCt = 1;
1209             lastCoreId = threadInfo[i].coreId;
1210             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1211             threadCt = 1;
1212             lastThreadId = threadInfo[i].threadId;
1213 
1214             //
1215             // This is a different package, so go on to the next iteration
1216             // without doing any consistency checks.  Reset the consistency
1217             // check vars, though.
1218             //
1219             prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1220             prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1221             continue;
1222         }
1223 
1224         if (threadInfo[i].coreId != lastCoreId) {
1225             nCores++;
1226             coreCt++;
1227             lastCoreId = threadInfo[i].coreId;
1228             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1229             threadCt = 1;
1230             lastThreadId = threadInfo[i].threadId;
1231         }
1232         else if (threadInfo[i].threadId != lastThreadId) {
1233             threadCt++;
1234             lastThreadId = threadInfo[i].threadId;
1235         }
1236         else {
1237             __kmp_free(threadInfo);
1238             KMP_CPU_FREE(oldMask);
1239             *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1240             return -1;
1241         }
1242 
1243         //
1244         // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
1245         // fields agree between all the threads bounds to a given package.
1246         //
1247         if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
1248           || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1249             __kmp_free(threadInfo);
1250             KMP_CPU_FREE(oldMask);
1251             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1252             return -1;
1253         }
1254     }
1255     nPackages = pkgCt;
1256     if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1257     if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1258 
1259     //
1260     // When affinity is off, this routine will still be called to set
1261     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1262     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1263     // correctly, and return now if affinity is not enabled.
1264     //
1265     __kmp_ncores = nCores;
1266     if (__kmp_affinity_verbose) {
1267         char buf[KMP_AFFIN_MASK_PRINT_LEN];
1268         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1269 
1270         KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1271         if (__kmp_affinity_respect_mask) {
1272             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1273         } else {
1274             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1275         }
1276         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1277         if (__kmp_affinity_uniform_topology()) {
1278             KMP_INFORM(Uniform, "KMP_AFFINITY");
1279         } else {
1280             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1281         }
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }
1286     KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
1287     KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
1288     __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
1289     for (i = 0; i < nApics; ++i) {
1290         __kmp_pu_os_idx[i] = threadInfo[i].osId;
1291     }
1292     if (__kmp_affinity_type == affinity_none) {
1293         __kmp_free(threadInfo);
1294         KMP_CPU_FREE(oldMask);
1295         return 0;
1296     }
1297 
1298     //
1299     // Now that we've determined the number of packages, the number of cores
1300     // per package, and the number of threads per core, we can construct the
1301     // data structure that is to be returned.
1302     //
1303     int pkgLevel = 0;
1304     int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1305     int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1306     unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
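
    // For example (hypothetical counts): a machine detected as 2 packages x
    // 4 cores x 2 threads gives pkgLevel == 0, coreLevel == 1, and
    // threadLevel == 2, so depth == 3; a single-core machine without HT
    // collapses to coreLevel == threadLevel == -1 and depth == 1.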
1307 
1308     KMP_ASSERT(depth > 0);
1309     *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1310 
1311     for (i = 0; i < nApics; ++i) {
1312         Address addr(depth);
1313         unsigned os = threadInfo[i].osId;
1314         int d = 0;
1315 
1316         if (pkgLevel >= 0) {
1317             addr.labels[d++] = threadInfo[i].pkgId;
1318         }
1319         if (coreLevel >= 0) {
1320             addr.labels[d++] = threadInfo[i].coreId;
1321         }
1322         if (threadLevel >= 0) {
1323             addr.labels[d++] = threadInfo[i].threadId;
1324         }
1325         (*address2os)[i] = AddrUnsPair(addr, os);
1326     }
1327 
1328     if (__kmp_affinity_gran_levels < 0) {
1329         //
1330         // Set the granularity level based on what levels are modeled
1331         // in the machine topology map.
1332         //
1333         __kmp_affinity_gran_levels = 0;
1334         if ((threadLevel >= 0)
1335           && (__kmp_affinity_gran > affinity_gran_thread)) {
1336             __kmp_affinity_gran_levels++;
1337         }
1338         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1339             __kmp_affinity_gran_levels++;
1340         }
1341         if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1342             __kmp_affinity_gran_levels++;
1343         }
1344     }
1345 
1346     if (__kmp_affinity_verbose) {
1347         __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1348           coreLevel, threadLevel);
1349     }
1350 
1351     __kmp_free(threadInfo);
1352     KMP_CPU_FREE(oldMask);
1353     return depth;
1354 }
1355 
1356 
1357 //
1358 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1359 // architectures support a newer interface for specifying the x2APIC Ids,
1360 // based on cpuid leaf 11.
1361 //
1362 static int
1363 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1364   kmp_i18n_id_t *const msg_id)
1365 {
1366     kmp_cpuid buf;
1367 
1368     *address2os = NULL;
1369     *msg_id = kmp_i18n_null;
1370 
1371     //
1372     // Check to see if cpuid leaf 11 is supported.
1373     //
1374     __kmp_x86_cpuid(0, 0, &buf);
1375     if (buf.eax < 11) {
1376         *msg_id = kmp_i18n_str_NoLeaf11Support;
1377         return -1;
1378     }
1379     __kmp_x86_cpuid(11, 0, &buf);
1380     if (buf.ebx == 0) {
1381         *msg_id = kmp_i18n_str_NoLeaf11Support;
1382         return -1;
1383     }
1384 
1385     //
1386     // Find the number of levels in the machine topology.  While we're at it,
1387     // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.  We will
1388     // try to get more accurate values later by explicitly counting them,
1389     // but get reasonable defaults now, in case we return early.
1390     //
1391     int level;
1392     int threadLevel = -1;
1393     int coreLevel = -1;
1394     int pkgLevel = -1;
1395     __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1396 
1397     for (level = 0;; level++) {
1398         if (level > 31) {
1399             //
1400             // FIXME: Hack for DPD200163180
1401             //
1402             // If level is big then something went wrong -> exiting
1403             //
1404             // There could actually be 32 valid levels in the machine topology,
1405             // but so far, the only machine we have seen which does not exit
1406             // this loop before iteration 32 has fubar x2APIC settings.
1407             //
1408             // For now, just reject this case based upon loop trip count.
1409             //
1410             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1411             return -1;
1412         }
1413         __kmp_x86_cpuid(11, level, &buf);
1414         if (buf.ebx == 0) {
1415             if (pkgLevel < 0) {
1416                 //
1417                 // Will infer nPackages from __kmp_xproc
1418                 //
1419                 pkgLevel = level;
1420                 level++;
1421             }
1422             break;
1423         }
1424         int kind = (buf.ecx >> 8) & 0xff;
1425         if (kind == 1) {
1426             //
1427             // SMT level
1428             //
1429             threadLevel = level;
1430             coreLevel = -1;
1431             pkgLevel = -1;
1432             __kmp_nThreadsPerCore = buf.ebx & 0xffff;
1433             if (__kmp_nThreadsPerCore == 0) {
1434                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1435                 return -1;
1436             }
1437         }
1438         else if (kind == 2) {
1439             //
1440             // core level
1441             //
1442             coreLevel = level;
1443             pkgLevel = -1;
1444             nCoresPerPkg = buf.ebx & 0xffff;
1445             if (nCoresPerPkg == 0) {
1446                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1447                 return -1;
1448             }
1449         }
1450         else {
1451             if (level <= 0) {
1452                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1453                 return -1;
1454             }
1455             if (pkgLevel >= 0) {
1456                 continue;
1457             }
1458             pkgLevel = level;
1459             nPackages = buf.ebx & 0xffff;
1460             if (nPackages == 0) {
1461                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1462                 return -1;
1463             }
1464         }
1465     }
1466     int depth = level;
1467 
1468     //
1469     // In the above loop, "level" was counted from the finest level (usually
1470     // thread) to the coarsest.  The caller expects that we will place the
1471     // labels in (*address2os)[].first.labels[] in the inverse order, so
1472     // we need to invert the vars saying which level means what.
1473     //
1474     if (threadLevel >= 0) {
1475         threadLevel = depth - threadLevel - 1;
1476     }
1477     if (coreLevel >= 0) {
1478         coreLevel = depth - coreLevel - 1;
1479     }
1480     KMP_DEBUG_ASSERT(pkgLevel >= 0);
1481     pkgLevel = depth - pkgLevel - 1;
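    //
    // For example, with depth == 3 and the enumeration order
    // threadLevel == 0, coreLevel == 1, pkgLevel == 2, the inverted values
    // are threadLevel == 2, coreLevel == 1, pkgLevel == 0, so labels[0]
    // holds the package id and labels[depth - 1] holds the thread id.
    //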
1482 
1483     //
1484     // The algorithm used starts by setting the affinity to each available
1485     // thread and retrieving info from the cpuid instruction, so if we are
1486     // not capable of calling __kmp_get_system_affinity() and
1487     // _kmp_get_system_affinity(), then we need to do something else - use
1488     // the defaults that we calculated from issuing cpuid without binding
1489     // to each proc.
1490     //
1491     if (! KMP_AFFINITY_CAPABLE())
1492     {
1493         //
1494         // Hack to try and infer the machine topology using only the data
1495         // available from cpuid on the current thread, and __kmp_xproc.
1496         //
1497         KMP_ASSERT(__kmp_affinity_type == affinity_none);
1498 
1499         __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1500         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1501         if (__kmp_affinity_verbose) {
1502             KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1503             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1504             if (__kmp_affinity_uniform_topology()) {
1505                 KMP_INFORM(Uniform, "KMP_AFFINITY");
1506             } else {
1507                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1508             }
1509             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1510               __kmp_nThreadsPerCore, __kmp_ncores);
1511         }
1512         return 0;
1513     }
1514 
1515     //
1516     //
1517     // From here on, we can assume that it is safe to call
1518     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1519     // even if __kmp_affinity_type = affinity_none.
1520     //
1521 
1522     //
1523     // Save the affinity mask for the current thread.
1524     //
1525     kmp_affin_mask_t *oldMask;
1526     KMP_CPU_ALLOC(oldMask);
1527     __kmp_get_system_affinity(oldMask, TRUE);
1528 
1529     //
1530     // Allocate the data structure to be returned.
1531     //
1532     AddrUnsPair *retval = (AddrUnsPair *)
1533       __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1534 
1535     //
1536     // Run through each of the available contexts, binding the current thread
1537     // to it, and obtaining the pertinent information using the cpuid instr.
1538     //
1539     unsigned int proc;
1540     int nApics = 0;
1541     KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
1542         //
1543         // Skip this proc if it is not included in the machine model.
1544         //
1545         if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
1546             continue;
1547         }
1548         KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1549 
1550         __kmp_affinity_bind_thread(proc);
1551 
1552         //
        // Extract the labels for each level in the machine topology map
        // from the APIC ID.
1555         //
1556         Address addr(depth);
1557         int prev_shift = 0;
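        //
        // Each label is the slice of the x2APIC ID between the previous
        // level's shift and this level's shift.  For example, if the SMT
        // sub-leaf reports shift == 1 and the core sub-leaf reports
        // shift == 4, then for a given x2APIC ID:
        //   thread id = apicId & 0x1
        //   core id   = (apicId & 0xf) >> 1
        //   pkg id    = apicId >> 4
        //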
1558 
1559         for (level = 0; level < depth; level++) {
1560             __kmp_x86_cpuid(11, level, &buf);
1561             unsigned apicId = buf.edx;
1562             if (buf.ebx == 0) {
1563                 if (level != depth - 1) {
1564                     KMP_CPU_FREE(oldMask);
1565                     *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1566                     return -1;
1567                 }
1568                 addr.labels[depth - level - 1] = apicId >> prev_shift;
1569                 level++;
1570                 break;
1571             }
1572             int shift = buf.eax & 0x1f;
1573             int mask = (1 << shift) - 1;
1574             addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1575             prev_shift = shift;
1576         }
1577         if (level != depth) {
1578             KMP_CPU_FREE(oldMask);
1579             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1580             return -1;
1581         }
1582 
1583         retval[nApics] = AddrUnsPair(addr, proc);
1584         nApics++;
1585     }
1586 
1587     //
1588     // We've collected all the info we need.
1589     // Restore the old affinity mask for this thread.
1590     //
1591     __kmp_set_system_affinity(oldMask, TRUE);
1592 
1593     //
1594     // If there's only one thread context to bind to, return now.
1595     //
1596     KMP_ASSERT(nApics > 0);
1597     if (nApics == 1) {
1598         __kmp_ncores = nPackages = 1;
1599         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1600         if (__kmp_affinity_verbose) {
1601             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1602             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1603 
1604             KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1605             if (__kmp_affinity_respect_mask) {
1606                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1607             } else {
1608                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1609             }
1610             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1611             KMP_INFORM(Uniform, "KMP_AFFINITY");
1612             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1613               __kmp_nThreadsPerCore, __kmp_ncores);
1614         }
1615 
1616         if (__kmp_affinity_type == affinity_none) {
1617             __kmp_free(retval);
1618             KMP_CPU_FREE(oldMask);
1619             return 0;
1620         }
1621 
1622         //
1623         // Form an Address object which only includes the package level.
1624         //
1625         Address addr(1);
1626         addr.labels[0] = retval[0].first.labels[pkgLevel];
1627         retval[0].first = addr;
1628 
1629         if (__kmp_affinity_gran_levels < 0) {
1630             __kmp_affinity_gran_levels = 0;
1631         }
1632 
1633         if (__kmp_affinity_verbose) {
1634             __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1635         }
1636 
1637         *address2os = retval;
1638         KMP_CPU_FREE(oldMask);
1639         return 1;
1640     }
1641 
1642     //
1643     // Sort the table by physical Id.
1644     //
1645     qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1646 
1647     //
1648     // Find the radix at each of the levels.
1649     //
1650     unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1651     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1652     unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1653     unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1654     for (level = 0; level < depth; level++) {
1655         totals[level] = 1;
1656         maxCt[level] = 1;
1657         counts[level] = 1;
1658         last[level] = retval[0].first.labels[level];
1659     }
1660 
1661     //
1662     // From here on, the iteration variable "level" runs from the finest
1663     // level to the coarsest, i.e. we iterate forward through
1664     // (*address2os)[].first.labels[] - in the previous loops, we iterated
1665     // backwards.
1666     //
1667     for (proc = 1; (int)proc < nApics; proc++) {
1668         int level;
1669         for (level = 0; level < depth; level++) {
1670             if (retval[proc].first.labels[level] != last[level]) {
1671                 int j;
1672                 for (j = level + 1; j < depth; j++) {
1673                     totals[j]++;
1674                     counts[j] = 1;
                    // The line below causes incorrect topology information
                    // to be printed in case the maximum value for some level
                    // (maxCt[level]) is encountered before a smaller value
                    // while going through the array.
                    // For example, if pkg0 has 4 cores and pkg1 has 2 cores,
                    // then maxCt[1] == 2 whereas it should be 4.
                    // TODO!!! Check whether it can stay commented out safely.
1681                     //maxCt[j] = 1;
1682                     last[j] = retval[proc].first.labels[j];
1683                 }
1684                 totals[level]++;
1685                 counts[level]++;
1686                 if (counts[level] > maxCt[level]) {
1687                     maxCt[level] = counts[level];
1688                 }
1689                 last[level] = retval[proc].first.labels[level];
1690                 break;
1691             }
1692             else if (level == depth - 1) {
1693                 __kmp_free(last);
1694                 __kmp_free(maxCt);
1695                 __kmp_free(counts);
1696                 __kmp_free(totals);
1697                 __kmp_free(retval);
1698                 KMP_CPU_FREE(oldMask);
1699                 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1700                 return -1;
1701             }
1702         }
1703     }
1704 
1705     //
1706     // When affinity is off, this routine will still be called to set
1707     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1708     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1709     // correctly, and return if affinity is not enabled.
1710     //
1711     if (threadLevel >= 0) {
1712         __kmp_nThreadsPerCore = maxCt[threadLevel];
1713     }
1714     else {
1715         __kmp_nThreadsPerCore = 1;
1716     }
1717     nPackages = totals[pkgLevel];
1718 
1719     if (coreLevel >= 0) {
1720         __kmp_ncores = totals[coreLevel];
1721         nCoresPerPkg = maxCt[coreLevel];
1722     }
1723     else {
1724         __kmp_ncores = nPackages;
1725         nCoresPerPkg = 1;
1726     }
1727 
1728     //
1729     // Check to see if the machine topology is uniform
1730     //
1731     unsigned prod = maxCt[0];
1732     for (level = 1; level < depth; level++) {
1733        prod *= maxCt[level];
1734     }
1735     bool uniform = (prod == totals[level - 1]);
1736 
1737     //
1738     // Print the machine topology summary.
1739     //
1740     if (__kmp_affinity_verbose) {
1741         char mask[KMP_AFFIN_MASK_PRINT_LEN];
1742         __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1743 
1744         KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1745         if (__kmp_affinity_respect_mask) {
1746             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1747         } else {
1748             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1749         }
1750         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1751         if (uniform) {
1752             KMP_INFORM(Uniform, "KMP_AFFINITY");
1753         } else {
1754             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1755         }
1756 
1757         kmp_str_buf_t buf;
1758         __kmp_str_buf_init(&buf);
1759 
1760         __kmp_str_buf_print(&buf, "%d", totals[0]);
1761         for (level = 1; level <= pkgLevel; level++) {
1762             __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1763         }
1764         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1765           __kmp_nThreadsPerCore, __kmp_ncores);
1766 
1767         __kmp_str_buf_free(&buf);
1768     }
1769     KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
1770     KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
1771     __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
1772     for (proc = 0; (int)proc < nApics; ++proc) {
1773         __kmp_pu_os_idx[proc] = retval[proc].second;
1774     }
1775     if (__kmp_affinity_type == affinity_none) {
1776         __kmp_free(last);
1777         __kmp_free(maxCt);
1778         __kmp_free(counts);
1779         __kmp_free(totals);
1780         __kmp_free(retval);
1781         KMP_CPU_FREE(oldMask);
1782         return 0;
1783     }
1784 
1785     //
    // Find any levels with radix 1, and remove them from the map
1787     // (except for the package level).
1788     //
1789     int new_depth = 0;
1790     for (level = 0; level < depth; level++) {
1791         if ((maxCt[level] == 1) && (level != pkgLevel)) {
1792            continue;
1793         }
1794         new_depth++;
1795     }
1796 
1797     //
1798     // If we are removing any levels, allocate a new vector to return,
1799     // and copy the relevant information to it.
1800     //
1801     if (new_depth != depth) {
1802         AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1803           sizeof(AddrUnsPair) * nApics);
1804         for (proc = 0; (int)proc < nApics; proc++) {
1805             Address addr(new_depth);
1806             new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1807         }
1808         int new_level = 0;
1809         int newPkgLevel = -1;
1810         int newCoreLevel = -1;
1811         int newThreadLevel = -1;
1812         int i;
1813         for (level = 0; level < depth; level++) {
1814             if ((maxCt[level] == 1)
1815               && (level != pkgLevel)) {
1816                 //
1817                 // Remove this level. Never remove the package level
1818                 //
1819                 continue;
1820             }
1821             if (level == pkgLevel) {
1822                 newPkgLevel = level;
1823             }
1824             if (level == coreLevel) {
1825                 newCoreLevel = level;
1826             }
1827             if (level == threadLevel) {
1828                 newThreadLevel = level;
1829             }
1830             for (proc = 0; (int)proc < nApics; proc++) {
1831                 new_retval[proc].first.labels[new_level]
1832                   = retval[proc].first.labels[level];
1833             }
1834             new_level++;
1835         }
1836 
1837         __kmp_free(retval);
1838         retval = new_retval;
1839         depth = new_depth;
1840         pkgLevel = newPkgLevel;
1841         coreLevel = newCoreLevel;
1842         threadLevel = newThreadLevel;
1843     }
1844 
1845     if (__kmp_affinity_gran_levels < 0) {
1846         //
1847         // Set the granularity level based on what levels are modeled
1848         // in the machine topology map.
1849         //
1850         __kmp_affinity_gran_levels = 0;
1851         if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1852             __kmp_affinity_gran_levels++;
1853         }
1854         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1855             __kmp_affinity_gran_levels++;
1856         }
1857         if (__kmp_affinity_gran > affinity_gran_package) {
1858             __kmp_affinity_gran_levels++;
1859         }
1860     }
1861 
1862     if (__kmp_affinity_verbose) {
1863         __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1864           coreLevel, threadLevel);
1865     }
1866 
1867     __kmp_free(last);
1868     __kmp_free(maxCt);
1869     __kmp_free(counts);
1870     __kmp_free(totals);
1871     KMP_CPU_FREE(oldMask);
1872     *address2os = retval;
1873     return depth;
1874 }
1875 
1876 
1877 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1878 
1879 
1880 #define osIdIndex       0
1881 #define threadIdIndex   1
1882 #define coreIdIndex     2
1883 #define pkgIdIndex      3
1884 #define nodeIdIndex     4
1885 
1886 typedef unsigned *ProcCpuInfo;
1887 static unsigned maxIndex = pkgIdIndex;
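//
// The fields are ordered from least significant (osId) to most significant
// (node), so __kmp_affinity_cmp_ProcCpuInfo_phys_id() below can compare two
// records by walking from maxIndex down toward osIdIndex.
//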
1888 
1889 
1890 static int
1891 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1892 {
1893     const unsigned *aa = (const unsigned *)a;
1894     const unsigned *bb = (const unsigned *)b;
1895     if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1896     if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1897     return 0;
1898 };
1899 
1900 
1901 static int
1902 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1903 {
1904     unsigned i;
1905     const unsigned *aa = *((const unsigned **)a);
1906     const unsigned *bb = *((const unsigned **)b);
1907     for (i = maxIndex; ; i--) {
1908         if (aa[i] < bb[i]) return -1;
1909         if (aa[i] > bb[i]) return 1;
1910         if (i == osIdIndex) break;
1911     }
1912     return 0;
1913 }
1914 
1915 
1916 //
1917 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1918 // affinity map.
1919 //
1920 static int
1921 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1922   kmp_i18n_id_t *const msg_id, FILE *f)
1923 {
1924     *address2os = NULL;
1925     *msg_id = kmp_i18n_null;
1926 
1927     //
    // Scan the file once, counting the number of "processor" (osId) fields,
    // and finding the highest value of <n> for any node_<n> field.
1930     //
1931     char buf[256];
1932     unsigned num_records = 0;
1933     while (! feof(f)) {
1934         buf[sizeof(buf) - 1] = 1;
1935         if (! fgets(buf, sizeof(buf), f)) {
1936             //
1937             // Read errors presumably because of EOF
1938             //
1939             break;
1940         }
1941 
1942         char s1[] = "processor";
1943         if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1944             num_records++;
1945             continue;
1946         }
1947 
1948         //
1949         // FIXME - this will match "node_<n> <garbage>"
1950         //
1951         unsigned level;
1952         if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
1953             if (nodeIdIndex + level >= maxIndex) {
1954                 maxIndex = nodeIdIndex + level;
1955             }
1956             continue;
1957         }
1958     }
1959 
1960     //
1961     // Check for empty file / no valid processor records, or too many.
1962     // The number of records can't exceed the number of valid bits in the
1963     // affinity mask.
1964     //
1965     if (num_records == 0) {
1966         *line = 0;
1967         *msg_id = kmp_i18n_str_NoProcRecords;
1968         return -1;
1969     }
1970     if (num_records > (unsigned)__kmp_xproc) {
1971         *line = 0;
1972         *msg_id = kmp_i18n_str_TooManyProcRecords;
1973         return -1;
1974     }
1975 
1976     //
    // Set the file pointer back to the beginning, so that we can scan the
1978     // file again, this time performing a full parse of the data.
    // Allocate a vector of ProcCpuInfo objects, where we will place the data.
1980     // Adding an extra element at the end allows us to remove a lot of extra
1981     // checks for termination conditions.
1982     //
1983     if (fseek(f, 0, SEEK_SET) != 0) {
1984         *line = 0;
1985         *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1986         return -1;
1987     }
1988 
1989     //
1990     // Allocate the array of records to store the proc info in.  The dummy
1991     // element at the end makes the logic in filling them out easier to code.
1992     //
1993     unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1994       * sizeof(unsigned *));
1995     unsigned i;
1996     for (i = 0; i <= num_records; i++) {
1997         threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1998           * sizeof(unsigned));
1999     }
2000 
2001 #define CLEANUP_THREAD_INFO \
2002     for (i = 0; i <= num_records; i++) {                                \
2003         __kmp_free(threadInfo[i]);                                      \
2004     }                                                                   \
2005     __kmp_free(threadInfo);
2006 
2007     //
2008     // A value of UINT_MAX means that we didn't find the field
2009     //
2010     unsigned __index;
2011 
2012 #define INIT_PROC_INFO(p) \
2013     for (__index = 0; __index <= maxIndex; __index++) {                 \
2014         (p)[__index] = UINT_MAX;                                        \
2015     }
2016 
2017     for (i = 0; i <= num_records; i++) {
2018         INIT_PROC_INFO(threadInfo[i]);
2019     }
2020 
2021     unsigned num_avail = 0;
2022     *line = 0;
2023     while (! feof(f)) {
2024         //
2025         // Create an inner scoping level, so that all the goto targets at the
2026         // end of the loop appear in an outer scoping level.  This avoids
2027         // warnings about jumping past an initialization to a target in the
2028         // same block.
2029         //
2030         {
2031             buf[sizeof(buf) - 1] = 1;
2032             bool long_line = false;
2033             if (! fgets(buf, sizeof(buf), f)) {
2034                 //
2035                 // Read errors presumably because of EOF
2036                 //
2037                 // If there is valid data in threadInfo[num_avail], then fake
                // a blank line to ensure that the last address gets parsed.
2039                 //
2040                 bool valid = false;
2041                 for (i = 0; i <= maxIndex; i++) {
2042                     if (threadInfo[num_avail][i] != UINT_MAX) {
2043                         valid = true;
2044                     }
2045                 }
2046                 if (! valid) {
2047                     break;
2048                 }
2049                 buf[0] = 0;
2050             } else if (!buf[sizeof(buf) - 1]) {
2051                 //
                // The line is longer than the buffer.  Set a flag, but don't
                // emit an error yet - we only complain if the line turns out
                // to be one we needed to parse.
2054                 //
2055                 long_line = true;
2056 
2057 #define CHECK_LINE \
2058     if (long_line) {                                                    \
2059         CLEANUP_THREAD_INFO;                                            \
2060         *msg_id = kmp_i18n_str_LongLineCpuinfo;                         \
2061         return -1;                                                      \
2062     }
2063             }
2064             (*line)++;
2065 
2066             char s1[] = "processor";
2067             if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2068                 CHECK_LINE;
2069                 char *p = strchr(buf + sizeof(s1) - 1, ':');
2070                 unsigned val;
2071                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2072                 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
2073                 threadInfo[num_avail][osIdIndex] = val;
2074 #if KMP_OS_LINUX && USE_SYSFS_INFO
2075                 char path[256];
2076                 KMP_SNPRINTF(path, sizeof(path),
2077                     "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
2078                     threadInfo[num_avail][osIdIndex]);
2079                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
2080 
2081                 KMP_SNPRINTF(path, sizeof(path),
2082                     "/sys/devices/system/cpu/cpu%u/topology/core_id",
2083                     threadInfo[num_avail][osIdIndex]);
2084                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
2085                 continue;
2086 #else
2087             }
2088             char s2[] = "physical id";
2089             if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
2090                 CHECK_LINE;
2091                 char *p = strchr(buf + sizeof(s2) - 1, ':');
2092                 unsigned val;
2093                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2094                 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
2095                 threadInfo[num_avail][pkgIdIndex] = val;
2096                 continue;
2097             }
2098             char s3[] = "core id";
2099             if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2100                 CHECK_LINE;
2101                 char *p = strchr(buf + sizeof(s3) - 1, ':');
2102                 unsigned val;
2103                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2104                 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
2105                 threadInfo[num_avail][coreIdIndex] = val;
2106                 continue;
2107 #endif // KMP_OS_LINUX && USE_SYSFS_INFO
2108             }
2109             char s4[] = "thread id";
2110             if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2111                 CHECK_LINE;
2112                 char *p = strchr(buf + sizeof(s4) - 1, ':');
2113                 unsigned val;
2114                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2115                 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
2116                 threadInfo[num_avail][threadIdIndex] = val;
2117                 continue;
2118             }
2119             unsigned level;
            if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
2121                 CHECK_LINE;
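                // Note: we reuse sizeof(s4) - 1 ("thread id") as the search
                // offset; strchr() scans forward from there, so this still
                // finds the ':' even though the "node_<n> id" token length
                // differs.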
2122                 char *p = strchr(buf + sizeof(s4) - 1, ':');
2123                 unsigned val;
2124                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2125                 KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2126                 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
2127                 threadInfo[num_avail][nodeIdIndex + level] = val;
2128                 continue;
2129             }
2130 
2131             //
2132             // We didn't recognize the leading token on the line.
2133             // There are lots of leading tokens that we don't recognize -
2134             // if the line isn't empty, go on to the next line.
2135             //
2136             if ((*buf != 0) && (*buf != '\n')) {
2137                 //
2138                 // If the line is longer than the buffer, read characters
2139                 // until we find a newline.
2140                 //
2141                 if (long_line) {
2142                     int ch;
2143                     while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
2144                 }
2145                 continue;
2146             }
2147 
2148             //
2149             // A newline has signalled the end of the processor record.
2150             // Check that there aren't too many procs specified.
2151             //
2152             if ((int)num_avail == __kmp_xproc) {
2153                 CLEANUP_THREAD_INFO;
2154                 *msg_id = kmp_i18n_str_TooManyEntries;
2155                 return -1;
2156             }
2157 
2158             //
2159             // Check for missing fields.  The osId field must be there, and we
            // currently require that the physical id field be specified as
            // well.
2161             //
2162             if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2163                 CLEANUP_THREAD_INFO;
2164                 *msg_id = kmp_i18n_str_MissingProcField;
2165                 return -1;
2166             }
2167             if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2168                 CLEANUP_THREAD_INFO;
2169                 *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2170                 return -1;
2171             }
2172 
2173             //
2174             // Skip this proc if it is not included in the machine model.
2175             //
2176             if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], __kmp_affin_fullMask)) {
2177                 INIT_PROC_INFO(threadInfo[num_avail]);
2178                 continue;
2179             }
2180 
2181             //
2182             // We have a successful parse of this proc's info.
2183             // Increment the counter, and prepare for the next proc.
2184             //
2185             num_avail++;
2186             KMP_ASSERT(num_avail <= num_records);
2187             INIT_PROC_INFO(threadInfo[num_avail]);
2188         }
2189         continue;
2190 
2191         no_val:
2192         CLEANUP_THREAD_INFO;
2193         *msg_id = kmp_i18n_str_MissingValCpuinfo;
2194         return -1;
2195 
2196         dup_field:
2197         CLEANUP_THREAD_INFO;
2198         *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2199         return -1;
2200     }
2201     *line = 0;
2202 
2203 # if KMP_MIC && REDUCE_TEAM_SIZE
2204     unsigned teamSize = 0;
2205 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2206 
2207     // check for num_records == __kmp_xproc ???
2208 
2209     //
2210     // If there's only one thread context to bind to, form an Address object
2211     // with depth 1 and return immediately (or, if affinity is off, set
2212     // address2os to NULL and return).
2213     //
2214     // If it is configured to omit the package level when there is only a
2215     // single package, the logic at the end of this routine won't work if
2216     // there is only a single thread - it would try to form an Address
2217     // object with depth 0.
2218     //
2219     KMP_ASSERT(num_avail > 0);
2220     KMP_ASSERT(num_avail <= num_records);
2221     if (num_avail == 1) {
2222         __kmp_ncores = 1;
2223         __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2224         if (__kmp_affinity_verbose) {
2225             if (! KMP_AFFINITY_CAPABLE()) {
2226                 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2227                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2228                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2229             }
2230             else {
2231                 char buf[KMP_AFFIN_MASK_PRINT_LEN];
2232                 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2233                   __kmp_affin_fullMask);
2234                 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2235                 if (__kmp_affinity_respect_mask) {
2236                     KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2237                 } else {
2238                     KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2239                 }
2240                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2241                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2242             }
2243             int index;
2244             kmp_str_buf_t buf;
2245             __kmp_str_buf_init(&buf);
2246             __kmp_str_buf_print(&buf, "1");
2247             for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2248                 __kmp_str_buf_print(&buf, " x 1");
2249             }
2250             KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2251             __kmp_str_buf_free(&buf);
2252         }
2253 
2254         if (__kmp_affinity_type == affinity_none) {
2255             CLEANUP_THREAD_INFO;
2256             return 0;
2257         }
2258 
2259         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2260         Address addr(1);
2261         addr.labels[0] = threadInfo[0][pkgIdIndex];
2262         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2263 
2264         if (__kmp_affinity_gran_levels < 0) {
2265             __kmp_affinity_gran_levels = 0;
2266         }
2267 
2268         if (__kmp_affinity_verbose) {
2269             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2270         }
2271 
2272         CLEANUP_THREAD_INFO;
2273         return 1;
2274     }
2275 
2276     //
2277     // Sort the threadInfo table by physical Id.
2278     //
2279     qsort(threadInfo, num_avail, sizeof(*threadInfo),
2280       __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2281 
2282     //
2283     // The table is now sorted by pkgId / coreId / threadId, but we really
2284     // don't know the radix of any of the fields.  pkgId's may be sparsely
2285     // assigned among the chips on a system.  Although coreId's are usually
2286     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2287     // [0..threadsPerCore-1], we don't want to make any such assumptions.
2288     //
2289     // For that matter, we don't know what coresPerPkg and threadsPerCore
2290     // (or the total # packages) are at this point - we want to determine
2291     // that now.  We only have an upper bound on the first two figures.
2292     //
2293     unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2294       * sizeof(unsigned));
2295     unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2296       * sizeof(unsigned));
2297     unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2298       * sizeof(unsigned));
2299     unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2300       * sizeof(unsigned));
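
    //
    // counts[index] - number of ids seen at this level under the current
    //                 parent,
    // maxCt[index]  - maximum of counts[index] over all parents (the radix
    //                 of this level),
    // totals[index] - total number of distinct nodes at this level,
    // lastId[index] - id seen at this level in the previous record.
    //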
2301 
2302     bool assign_thread_ids = false;
2303     unsigned threadIdCt;
2304     unsigned index;
2305 
2306     restart_radix_check:
2307     threadIdCt = 0;
2308 
2309     //
2310     // Initialize the counter arrays with data from threadInfo[0].
2311     //
2312     if (assign_thread_ids) {
2313         if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2314             threadInfo[0][threadIdIndex] = threadIdCt++;
2315         }
2316         else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2317             threadIdCt = threadInfo[0][threadIdIndex] + 1;
2318         }
2319     }
2320     for (index = 0; index <= maxIndex; index++) {
2321         counts[index] = 1;
2322         maxCt[index] = 1;
2323         totals[index] = 1;
        lastId[index] = threadInfo[0][index];
2325     }
2326 
2327     //
2328     // Run through the rest of the OS procs.
2329     //
2330     for (i = 1; i < num_avail; i++) {
2331         //
2332         // Find the most significant index whose id differs
2333         // from the id for the previous OS proc.
2334         //
2335         for (index = maxIndex; index >= threadIdIndex; index--) {
2336             if (assign_thread_ids && (index == threadIdIndex)) {
2337                 //
2338                 // Auto-assign the thread id field if it wasn't specified.
2339                 //
2340                 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2341                     threadInfo[i][threadIdIndex] = threadIdCt++;
2342                 }
2343 
2344                 //
                // Apparently the thread id field was specified for some
2346                 // entries and not others.  Start the thread id counter
2347                 // off at the next higher thread id.
2348                 //
2349                 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2350                     threadIdCt = threadInfo[i][threadIdIndex] + 1;
2351                 }
2352             }
2353             if (threadInfo[i][index] != lastId[index]) {
2354                 //
2355                 // Run through all indices which are less significant,
2356                 // and reset the counts to 1.
2357                 //
2358                 // At all levels up to and including index, we need to
2359                 // increment the totals and record the last id.
2360                 //
2361                 unsigned index2;
2362                 for (index2 = threadIdIndex; index2 < index; index2++) {
2363                     totals[index2]++;
2364                     if (counts[index2] > maxCt[index2]) {
2365                         maxCt[index2] = counts[index2];
2366                     }
2367                     counts[index2] = 1;
2368                     lastId[index2] = threadInfo[i][index2];
2369                 }
2370                 counts[index]++;
2371                 totals[index]++;
2372                 lastId[index] = threadInfo[i][index];
2373 
2374                 if (assign_thread_ids && (index > threadIdIndex)) {
2375 
2376 # if KMP_MIC && REDUCE_TEAM_SIZE
2377                     //
2378                     // The default team size is the total #threads in the machine
2379                     // minus 1 thread for every core that has 3 or more threads.
2380                     //
2381                     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2382 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2383 
2384                     //
2385                     // Restart the thread counter, as we are on a new core.
2386                     //
2387                     threadIdCt = 0;
2388 
2389                     //
2390                     // Auto-assign the thread id field if it wasn't specified.
2391                     //
2392                     if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2393                         threadInfo[i][threadIdIndex] = threadIdCt++;
2394                     }
2395 
2396                     //
                    // Apparently the thread id field was specified for some
2398                     // entries and not others.  Start the thread id counter
2399                     // off at the next higher thread id.
2400                     //
2401                     else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2402                         threadIdCt = threadInfo[i][threadIdIndex] + 1;
2403                     }
2404                 }
2405                 break;
2406             }
2407         }
2408         if (index < threadIdIndex) {
2409             //
2410             // If thread ids were specified, it is an error if they are not
            // unique.  Also, check that we haven't already restarted the
2412             // loop (to be safe - shouldn't need to).
2413             //
2414             if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2415               || assign_thread_ids) {
2416                 __kmp_free(lastId);
2417                 __kmp_free(totals);
2418                 __kmp_free(maxCt);
2419                 __kmp_free(counts);
2420                 CLEANUP_THREAD_INFO;
2421                 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2422                 return -1;
2423             }
2424 
2425             //
            // If the thread ids were not specified and we see entries
            // that are duplicates, start the loop over and assign the
            // thread ids manually.
2429             //
2430             assign_thread_ids = true;
2431             goto restart_radix_check;
2432         }
2433     }
2434 
2435 # if KMP_MIC && REDUCE_TEAM_SIZE
2436     //
2437     // The default team size is the total #threads in the machine
2438     // minus 1 thread for every core that has 3 or more threads.
2439     //
2440     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2441 # endif // KMP_MIC && REDUCE_TEAM_SIZE
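    // E.g., when thread ids are auto-assigned, a core with 4 threads
    // contributes 3 to teamSize, while cores with 1 or 2 threads
    // contribute all of them.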
2442 
2443     for (index = threadIdIndex; index <= maxIndex; index++) {
2444         if (counts[index] > maxCt[index]) {
2445             maxCt[index] = counts[index];
2446         }
2447     }
2448 
2449     __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2450     nCoresPerPkg = maxCt[coreIdIndex];
2451     nPackages = totals[pkgIdIndex];
2452 
2453     //
2454     // Check to see if the machine topology is uniform
2455     //
2456     unsigned prod = totals[maxIndex];
2457     for (index = threadIdIndex; index < maxIndex; index++) {
2458        prod *= maxCt[index];
2459     }
2460     bool uniform = (prod == totals[threadIdIndex]);
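    //
    // E.g., for 2 packages x 2 cores x 2 threads (with maxIndex ==
    // pkgIdIndex), totals[threadIdIndex] is 8 and the product
    // totals[maxIndex] * maxCt[coreIdIndex] * maxCt[threadIdIndex] is
    // 2 * 2 * 2 == 8, so the topology is uniform; any asymmetry makes
    // the product exceed the total.
    //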
2461 
2462     //
2463     // When affinity is off, this routine will still be called to set
2464     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
2465     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
2466     // correctly, and return now if affinity is not enabled.
2467     //
2468     __kmp_ncores = totals[coreIdIndex];
2469 
    if (__kmp_affinity_verbose) {
        if (! KMP_AFFINITY_CAPABLE()) {
            KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (uniform) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
        }
        else {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
              __kmp_affin_fullMask);
            KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (uniform) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
        }
2496         kmp_str_buf_t buf;
2497         __kmp_str_buf_init(&buf);
2498 
2499         __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2500         for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2501             __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2502         }
2503         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str,  maxCt[coreIdIndex],
2504           maxCt[threadIdIndex], __kmp_ncores);
2505 
2506         __kmp_str_buf_free(&buf);
2507     }
2508 
2509 # if KMP_MIC && REDUCE_TEAM_SIZE
2510     //
2511     // Set the default team size.
2512     //
2513     if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2514         __kmp_dflt_team_nth = teamSize;
2515         KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2516           __kmp_dflt_team_nth));
2517     }
2518 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2519 
2520     KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
2521     KMP_DEBUG_ASSERT(num_avail == __kmp_avail_proc);
2522     __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
2523     for (i = 0; i < num_avail; ++i) { // fill the os indices
2524         __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex];
2525     }
2526 
2527     if (__kmp_affinity_type == affinity_none) {
2528         __kmp_free(lastId);
2529         __kmp_free(totals);
2530         __kmp_free(maxCt);
2531         __kmp_free(counts);
2532         CLEANUP_THREAD_INFO;
2533         return 0;
2534     }
2535 
2536     //
2537     // Count the number of levels which have more nodes at that level than
    // at the parent's level (the top level has an implicit root node).
    // This is equivalent to saying that there is at least
2540     // one node at this level which has a sibling.  These levels are in the
2541     // map, and the package level is always in the map.
2542     //
2543     bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2544     int level = 0;
2545     for (index = threadIdIndex; index < maxIndex; index++) {
2546         KMP_ASSERT(totals[index] >= totals[index + 1]);
2547         inMap[index] = (totals[index] > totals[index + 1]);
2548     }
2549     inMap[maxIndex] = (totals[maxIndex] > 1);
2550     inMap[pkgIdIndex] = true;
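    //
    // For example, on a single-package machine with 4 cores and no SMT,
    // totals[threadIdIndex] == totals[coreIdIndex] == 4 and
    // totals[pkgIdIndex] == 1, so the thread level is left out of the map
    // (no core has a sibling thread), and the resulting depth is 2
    // (package and core).
    //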
2551 
2552     int depth = 0;
2553     for (index = threadIdIndex; index <= maxIndex; index++) {
2554         if (inMap[index]) {
2555             depth++;
2556         }
2557     }
2558     KMP_ASSERT(depth > 0);
2559 
2560     //
2561     // Construct the data structure that is to be returned.
2562     //
2563     *address2os = (AddrUnsPair*)
2564       __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2565     int pkgLevel = -1;
2566     int coreLevel = -1;
2567     int threadLevel = -1;
2568 
2569     for (i = 0; i < num_avail; ++i) {
2570         Address addr(depth);
2571         unsigned os = threadInfo[i][osIdIndex];
2572         int src_index;
2573         int dst_index = 0;
2574 
2575         for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2576             if (! inMap[src_index]) {
2577                 continue;
2578             }
2579             addr.labels[dst_index] = threadInfo[i][src_index];
2580             if (src_index == pkgIdIndex) {
2581                 pkgLevel = dst_index;
2582             }
2583             else if (src_index == coreIdIndex) {
2584                 coreLevel = dst_index;
2585             }
2586             else if (src_index == threadIdIndex) {
2587                 threadLevel = dst_index;
2588             }
2589             dst_index++;
2590         }
2591         (*address2os)[i] = AddrUnsPair(addr, os);
2592     }
2593 
2594     if (__kmp_affinity_gran_levels < 0) {
2595         //
2596         // Set the granularity level based on what levels are modeled
2597         // in the machine topology map.
2598         //
2599         unsigned src_index;
2600         __kmp_affinity_gran_levels = 0;
2601         for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2602             if (! inMap[src_index]) {
2603                 continue;
2604             }
2605             switch (src_index) {
2606                 case threadIdIndex:
2607                 if (__kmp_affinity_gran > affinity_gran_thread) {
2608                     __kmp_affinity_gran_levels++;
2609                 }
2610 
2611                 break;
2612                 case coreIdIndex:
2613                 if (__kmp_affinity_gran > affinity_gran_core) {
2614                     __kmp_affinity_gran_levels++;
2615                 }
2616                 break;
2617 
2618                 case pkgIdIndex:
2619                 if (__kmp_affinity_gran > affinity_gran_package) {
2620                     __kmp_affinity_gran_levels++;
2621                 }
2622                 break;
2623             }
2624         }
2625     }
2626 
2627     if (__kmp_affinity_verbose) {
2628         __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2629           coreLevel, threadLevel);
2630     }
2631 
2632     __kmp_free(inMap);
2633     __kmp_free(lastId);
2634     __kmp_free(totals);
2635     __kmp_free(maxCt);
2636     __kmp_free(counts);
2637     CLEANUP_THREAD_INFO;
2638     return depth;
2639 }
2640 
2641 
2642 //
2643 // Create and return a table of affinity masks, indexed by OS thread ID.
2644 // This routine handles OR'ing together all the affinity masks of threads
2645 // that are sufficiently close, if granularity > fine.
2646 //
2647 static kmp_affin_mask_t *
2648 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2649   AddrUnsPair *address2os, unsigned numAddrs)
2650 {
2651     //
2652     // First form a table of affinity masks in order of OS thread id.
2653     //
2654     unsigned depth;
2655     unsigned maxOsId;
2656     unsigned i;
2657 
2658     KMP_ASSERT(numAddrs > 0);
2659     depth = address2os[0].first.depth;
2660 
2661     maxOsId = 0;
2662     for (i = 0; i < numAddrs; i++) {
2663         unsigned osId = address2os[i].second;
2664         if (osId > maxOsId) {
2665             maxOsId = osId;
2666         }
2667     }
2668     kmp_affin_mask_t *osId2Mask;
2669     KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId+1));
2670 
2671     //
2672     // Sort the address2os table according to physical order.  Doing so
2673     // will put all threads on the same core/package/node in consecutive
2674     // locations.
2675     //
2676     qsort(address2os, numAddrs, sizeof(*address2os),
2677       __kmp_affinity_cmp_Address_labels);
2678 
2679     KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2680     if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2681         KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY",  __kmp_affinity_gran_levels);
2682     }
2683     if (__kmp_affinity_gran_levels >= (int)depth) {
2684         if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2685           && (__kmp_affinity_type != affinity_none))) {
2686             KMP_WARNING(AffThreadsMayMigrate);
2687         }
2688     }
2689 
2690     //
2691     // Run through the table, forming the masks for all threads on each
2692     // core.  Threads on the same core will have identical "Address"
2693     // objects, not considering the last level, which must be the thread
2694     // id.  All threads on a core will appear consecutively.
2695     //
2696     unsigned unique = 0;
2697     unsigned j = 0;                             // index of 1st thread on core
2698     unsigned leader = 0;
2699     Address *leaderAddr = &(address2os[0].first);
2700     kmp_affin_mask_t *sum;
2701     KMP_CPU_ALLOC_ON_STACK(sum);
2702     KMP_CPU_ZERO(sum);
2703     KMP_CPU_SET(address2os[0].second, sum);
2704     for (i = 1; i < numAddrs; i++) {
2705         //
2706         // If this thread is sufficiently close to the leader (within the
2707         // granularity setting), then set the bit for this os thread in the
2708         // affinity mask for this group, and go on to the next thread.
2709         //
2710         if (leaderAddr->isClose(address2os[i].first,
2711           __kmp_affinity_gran_levels)) {
2712             KMP_CPU_SET(address2os[i].second, sum);
2713             continue;
2714         }
2715 
2716         //
2717         // For every thread in this group, copy the mask to the thread's
2718         // entry in the osId2Mask table.  Mark the first address as a
2719         // leader.
2720         //
2721         for (; j < i; j++) {
2722             unsigned osId = address2os[j].second;
2723             KMP_DEBUG_ASSERT(osId <= maxOsId);
2724             kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2725             KMP_CPU_COPY(mask, sum);
2726             address2os[j].first.leader = (j == leader);
2727         }
2728         unique++;
2729 
2730         //
2731         // Start a new mask.
2732         //
2733         leader = i;
2734         leaderAddr = &(address2os[i].first);
2735         KMP_CPU_ZERO(sum);
2736         KMP_CPU_SET(address2os[i].second, sum);
2737     }
2738 
2739     //
2740     // For every thread in last group, copy the mask to the thread's
2741     // entry in the osId2Mask table.
2742     //
2743     for (; j < i; j++) {
2744         unsigned osId = address2os[j].second;
2745         KMP_DEBUG_ASSERT(osId <= maxOsId);
2746         kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2747         KMP_CPU_COPY(mask, sum);
2748         address2os[j].first.leader = (j == leader);
2749     }
2750     unique++;
2751     KMP_CPU_FREE_FROM_STACK(sum);
2752 
2753     *maxIndex = maxOsId;
2754     *numUnique = unique;
2755     return osId2Mask;
2756 }
2757 
2758 
2759 //
2760 // Stuff for the affinity proclist parsers.  It's easier to declare these vars
// as file-static than to try to pass them through the calling sequence of
2762 // the recursive-descent OMP_PLACES parser.
2763 //
2764 static kmp_affin_mask_t *newMasks;
2765 static int numNewMasks;
2766 static int nextNewMask;
2767 
2768 #define ADD_MASK(_mask) \
2769     {                                                                   \
2770         if (nextNewMask >= numNewMasks) {                               \
2771             int i;                                                      \
2772             numNewMasks *= 2;                                           \
2773             kmp_affin_mask_t* temp;                                     \
2774             KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks);            \
2775             for(i=0;i<numNewMasks/2;i++) {                              \
2776                 kmp_affin_mask_t* src  = KMP_CPU_INDEX(newMasks, i);    \
2777                 kmp_affin_mask_t* dest = KMP_CPU_INDEX(temp, i);        \
2778                 KMP_CPU_COPY(dest, src);                                \
2779             }                                                           \
2780             KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks/2);       \
2781             newMasks = temp;                                            \
2782         }                                                               \
2783         KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));    \
2784         nextNewMask++;                                                  \
2785     }
2786 
2787 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2788     {                                                                   \
2789         if (((_osId) > _maxOsId) ||                                     \
2790           (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
2791             if (__kmp_affinity_verbose || (__kmp_affinity_warnings      \
2792               && (__kmp_affinity_type != affinity_none))) {             \
2793                 KMP_WARNING(AffIgnoreInvalidProcID, _osId);             \
2794             }                                                           \
2795         }                                                               \
2796         else {                                                          \
2797             ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));               \
2798         }                                                               \
2799     }
2800 
2801 
2802 //
2803 // Re-parse the proclist (for the explicit affinity type), and form the list
2804 // of affinity newMasks indexed by gtid.
2805 //
2806 static void
2807 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2808   unsigned int *out_numMasks, const char *proclist,
2809   kmp_affin_mask_t *osId2Mask, int maxOsId)
2810 {
2811     int i;
2812     const char *scan = proclist;
2813     const char *next = proclist;
2814 
2815     //
    // We use a growable temporary mask vector; the ADD_MASK macro doubles
    // its size whenever it fills up.
2818     //
2819     numNewMasks = 2;
2820     KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
2821     nextNewMask = 0;
2822     kmp_affin_mask_t *sumMask;
2823     KMP_CPU_ALLOC(sumMask);
2824     int setSize = 0;
2825 
2826     for (;;) {
2827         int start, end, stride;
2828 
2829         SKIP_WS(scan);
2830         next = scan;
2831         if (*next == '\0') {
2832             break;
2833         }
2834 
2835         if (*next == '{') {
2836             int num;
2837             setSize = 0;
2838             next++;     // skip '{'
2839             SKIP_WS(next);
2840             scan = next;
2841 
2842             //
2843             // Read the first integer in the set.
2844             //
2845             KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2846               "bad proclist");
2847             SKIP_DIGITS(next);
2848             num = __kmp_str_to_int(scan, *next);
2849             KMP_ASSERT2(num >= 0, "bad explicit proc list");
2850 
2851             //
2852             // Copy the mask for that osId to the sum (union) mask.
2853             //
2854             if ((num > maxOsId) ||
2855               (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2856                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2857                   && (__kmp_affinity_type != affinity_none))) {
2858                     KMP_WARNING(AffIgnoreInvalidProcID, num);
2859                 }
2860                 KMP_CPU_ZERO(sumMask);
2861             }
2862             else {
2863                 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2864                 setSize = 1;
2865             }
2866 
2867             for (;;) {
2868                 //
2869                 // Check for end of set.
2870                 //
2871                 SKIP_WS(next);
2872                 if (*next == '}') {
2873                     next++;     // skip '}'
2874                     break;
2875                 }
2876 
2877                 //
2878                 // Skip optional comma.
2879                 //
2880                 if (*next == ',') {
2881                     next++;
2882                 }
2883                 SKIP_WS(next);
2884 
2885                 //
2886                 // Read the next integer in the set.
2887                 //
2888                 scan = next;
2889                 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2890                   "bad explicit proc list");
2891 
2892                 SKIP_DIGITS(next);
2893                 num = __kmp_str_to_int(scan, *next);
2894                 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2895 
2896                 //
2897                 // Add the mask for that osId to the sum mask.
2898                 //
2899                 if ((num > maxOsId) ||
2900                   (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2901                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2902                       && (__kmp_affinity_type != affinity_none))) {
2903                         KMP_WARNING(AffIgnoreInvalidProcID, num);
2904                     }
2905                 }
2906                 else {
2907                     KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2908                     setSize++;
2909                 }
2910             }
2911             if (setSize > 0) {
2912                 ADD_MASK(sumMask);
2913             }
2914 
2915             SKIP_WS(next);
2916             if (*next == ',') {
2917                 next++;
2918             }
2919             scan = next;
2920             continue;
2921         }
2922 
2923         //
2924         // Read the first integer.
2925         //
2926         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2927         SKIP_DIGITS(next);
2928         start = __kmp_str_to_int(scan, *next);
2929         KMP_ASSERT2(start >= 0, "bad explicit proc list");
2930         SKIP_WS(next);
2931 
2932         //
2933         // If this isn't a range, then add a mask to the list and go on.
2934         //
2935         if (*next != '-') {
2936             ADD_MASK_OSID(start, osId2Mask, maxOsId);
2937 
2938             //
2939             // Skip optional comma.
2940             //
2941             if (*next == ',') {
2942                 next++;
2943             }
2944             scan = next;
2945             continue;
2946         }
2947 
2948         //
2949         // This is a range.  Skip over the '-' and read in the 2nd int.
2950         //
2951         next++;         // skip '-'
2952         SKIP_WS(next);
2953         scan = next;
2954         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2955         SKIP_DIGITS(next);
2956         end = __kmp_str_to_int(scan, *next);
2957         KMP_ASSERT2(end >= 0, "bad explicit proc list");
2958 
2959         //
2960         // Check for a stride parameter
2961         //
2962         stride = 1;
2963         SKIP_WS(next);
2964         if (*next == ':') {
2965             //
            // A stride is specified.  Skip over the ':' and read the 3rd int.
2967             //
2968             int sign = +1;
2969             next++;         // skip ':'
2970             SKIP_WS(next);
2971             scan = next;
2972             if (*next == '-') {
2973                 sign = -1;
2974                 next++;
2975                 SKIP_WS(next);
2976                 scan = next;
2977             }
2978             KMP_ASSERT2((*next >=  '0') && (*next <= '9'),
2979               "bad explicit proc list");
2980             SKIP_DIGITS(next);
2981             stride = __kmp_str_to_int(scan, *next);
2982             KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2983             stride *= sign;
2984         }
2985 
2986         //
2987         // Do some range checks.
2988         //
2989         KMP_ASSERT2(stride != 0, "bad explicit proc list");
2990         if (stride > 0) {
2991             KMP_ASSERT2(start <= end, "bad explicit proc list");
2992         }
2993         else {
2994             KMP_ASSERT2(start >= end, "bad explicit proc list");
2995         }
2996         KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2997 
2998         //
2999         // Add the mask for each OS proc # to the list.
3000         //
3001         if (stride > 0) {
3002             do {
3003                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
3004                 start += stride;
3005             } while (start <= end);
3006         }
3007         else {
3008             do {
3009                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
3010                 start += stride;
3011             } while (start >= end);
3012         }
3013 
3014         //
3015         // Skip optional comma.
3016         //
3017         SKIP_WS(next);
3018         if (*next == ',') {
3019             next++;
3020         }
3021         scan = next;
3022     }
3023 
3024     *out_numMasks = nextNewMask;
    if (nextNewMask == 0) {
        *out_masks = NULL;
        KMP_CPU_FREE(sumMask);
        KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
        return;
    }
3030     KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3031     for(i = 0; i < nextNewMask; i++) {
3032         kmp_affin_mask_t* src  = KMP_CPU_INDEX(newMasks, i);
3033         kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i);
3034         KMP_CPU_COPY(dest, src);
3035     }
3036     KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3037     KMP_CPU_FREE(sumMask);
3038 }
3039 
3040 
3041 # if OMP_40_ENABLED
3042 
3043 /*-----------------------------------------------------------------------------
3044 
Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
places.  Here is the grammar:
3047 
3048 place_list := place
3049 place_list := place , place_list
3050 place := num
3051 place := place : num
3052 place := place : num : signed
place := { subplace_list }
3054 place := ! place                  // (lowest priority)
3055 subplace_list := subplace
3056 subplace_list := subplace , subplace_list
3057 subplace := num
3058 subplace := num : num
3059 subplace := num : num : signed
3060 signed := num
3061 signed := + signed
3062 signed := - signed
3063 
3064 -----------------------------------------------------------------------------*/
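/*
For illustration only (proc ids are machine-dependent): under this grammar,
"{0,1,2,3}:2:4" names the place {0,1,2,3} plus one more place obtained by
shifting each proc id by the stride 4, i.e. {4,5,6,7}; "!{2}" names the
complement of {2} over the known OS proc ids.
*/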
3065 
3066 static void
3067 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
3068   int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3069 {
3070     const char *next;
3071 
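    //
    // A subplace "s:c:t" contributes the proc ids s, s+t, ..., s+(c-1)*t to
    // tempMask; "s:c" uses a stride of 1, and a bare "s" contributes just s.
    //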
3072     for (;;) {
3073         int start, count, stride, i;
3074 
3075         //
3076         // Read in the starting proc id
3077         //
3078         SKIP_WS(*scan);
3079         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3080           "bad explicit places list");
3081         next = *scan;
3082         SKIP_DIGITS(next);
3083         start = __kmp_str_to_int(*scan, *next);
3084         KMP_ASSERT(start >= 0);
3085         *scan = next;
3086 
3087         //
3088         // valid follow sets are ',' ':' and '}'
3089         //
3090         SKIP_WS(*scan);
3091         if (**scan == '}' || **scan == ',') {
3092             if ((start > maxOsId) ||
3093               (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3094                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3095                   && (__kmp_affinity_type != affinity_none))) {
3096                     KMP_WARNING(AffIgnoreInvalidProcID, start);
3097                 }
3098             }
3099             else {
3100                 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3101                 (*setSize)++;
3102             }
3103             if (**scan == '}') {
3104                 break;
3105             }
3106             (*scan)++;  // skip ','
3107             continue;
3108         }
3109         KMP_ASSERT2(**scan == ':', "bad explicit places list");
3110         (*scan)++;      // skip ':'
3111 
3112         //
3113         // Read count parameter
3114         //
3115         SKIP_WS(*scan);
3116         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3117           "bad explicit places list");
3118         next = *scan;
3119         SKIP_DIGITS(next);
3120         count = __kmp_str_to_int(*scan, *next);
3121         KMP_ASSERT(count >= 0);
3122         *scan = next;
3123 
3124         //
3125         // valid follow sets are ',' ':' and '}'
3126         //
3127         SKIP_WS(*scan);
3128         if (**scan == '}' || **scan == ',') {
3129             for (i = 0; i < count; i++) {
3130                 if ((start > maxOsId) ||
3131                   (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3132                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3133                       && (__kmp_affinity_type != affinity_none))) {
3134                         KMP_WARNING(AffIgnoreInvalidProcID, start);
3135                     }
3136                     break;  // don't proliferate warnings for large count
3137                 }
3138                 else {
3139                     KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3140                     start++;
3141                     (*setSize)++;
3142                 }
3143             }
3144             if (**scan == '}') {
3145                 break;
3146             }
3147             (*scan)++;  // skip ','
3148             continue;
3149         }
3150         KMP_ASSERT2(**scan == ':', "bad explicit places list");
3151         (*scan)++;      // skip ':'
3152 
3153         //
3154         // Read stride parameter
3155         //
3156         int sign = +1;
3157         for (;;) {
3158             SKIP_WS(*scan);
3159             if (**scan == '+') {
3160                 (*scan)++; // skip '+'
3161                 continue;
3162             }
3163             if (**scan == '-') {
3164                 sign *= -1;
3165                 (*scan)++; // skip '-'
3166                 continue;
3167             }
3168             break;
3169         }
3170         SKIP_WS(*scan);
3171         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3172           "bad explicit places list");
3173         next = *scan;
3174         SKIP_DIGITS(next);
3175         stride = __kmp_str_to_int(*scan, *next);
3176         KMP_ASSERT(stride >= 0);
3177         *scan = next;
3178         stride *= sign;
3179 
3180         //
3181         // valid follow sets are ',' and '}'
3182         //
3183         SKIP_WS(*scan);
3184         if (**scan == '}' || **scan == ',') {
3185             for (i = 0; i < count; i++) {
3186                 if ((start > maxOsId) ||
3187                   (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3188                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3189                       && (__kmp_affinity_type != affinity_none))) {
3190                         KMP_WARNING(AffIgnoreInvalidProcID, start);
3191                     }
3192                     break;  // don't proliferate warnings for large count
3193                 }
3194                 else {
3195                     KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3196                     start += stride;
3197                     (*setSize)++;
3198                 }
3199             }
3200             if (**scan == '}') {
3201                 break;
3202             }
3203             (*scan)++;  // skip ','
3204             continue;
3205         }
3206 
3207         KMP_ASSERT2(0, "bad explicit places list");
3208     }
3209 }
3210 
3211 
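//
// Process a single place: a brace-enclosed subplace list, the complement
// ('!') of a place, or a bare proc id.  The procs are accumulated into
// tempMask, and *setSize counts how many were added.
//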
3212 static void
3213 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3214   int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3215 {
3216     const char *next;
3217 
3218     //
3219     // valid follow sets are '{' '!' and num
3220     //
3221     SKIP_WS(*scan);
3222     if (**scan == '{') {
3223         (*scan)++;      // skip '{'
3224         __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
3225           setSize);
3226         KMP_ASSERT2(**scan == '}', "bad explicit places list");
3227         (*scan)++;      // skip '}'
3228     }
3229     else if (**scan == '!') {
3230         (*scan)++;      // skip '!'
3231         __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3232         KMP_CPU_COMPLEMENT(maxOsId, tempMask);
3233     }
3234     else if ((**scan >= '0') && (**scan <= '9')) {
3235         next = *scan;
3236         SKIP_DIGITS(next);
3237         int num = __kmp_str_to_int(*scan, *next);
3238         KMP_ASSERT(num >= 0);
3239         if ((num > maxOsId) ||
3240           (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3241             if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3242               && (__kmp_affinity_type != affinity_none))) {
3243                 KMP_WARNING(AffIgnoreInvalidProcID, num);
3244             }
3245         }
3246         else {
3247             KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3248             (*setSize)++;
3249         }
3250         *scan = next;  // skip num
3251     }
3252     else {
3253         KMP_ASSERT2(0, "bad explicit places list");
3254     }
3255 }
3256 
3257 
3259 void
3260 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3261   unsigned int *out_numMasks, const char *placelist,
3262   kmp_affin_mask_t *osId2Mask, int maxOsId)
3263 {
3264     int i,j,count,stride,sign;
3265     const char *scan = placelist;
3266     const char *next = placelist;
3267 
3268     numNewMasks = 2;
3269     KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
3270     nextNewMask = 0;
3271 
3272     // tempMask is modified based on the previous or initial
3273     //   place to form the current place
3274     // previousMask contains the previous place
3275     kmp_affin_mask_t *tempMask;
3276     kmp_affin_mask_t *previousMask;
3277     KMP_CPU_ALLOC(tempMask);
3278     KMP_CPU_ZERO(tempMask);
3279     KMP_CPU_ALLOC(previousMask);
3280     KMP_CPU_ZERO(previousMask);
3281     int setSize = 0;
3282 
3283     for (;;) {
3284         __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3285 
3286         //
3287         // valid follow sets are ',' ':' and EOL
3288         //
3289         SKIP_WS(scan);
3290         if (*scan == '\0' || *scan == ',') {
3291             if (setSize > 0) {
3292                 ADD_MASK(tempMask);
3293             }
3294             KMP_CPU_ZERO(tempMask);
3295             setSize = 0;
3296             if (*scan == '\0') {
3297                 break;
3298             }
3299             scan++;     // skip ','
3300             continue;
3301         }
3302 
3303         KMP_ASSERT2(*scan == ':', "bad explicit places list");
3304         scan++;         // skip ':'
3305 
3306         //
3307         // Read count parameter
3308         //
3309         SKIP_WS(scan);
3310         KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3311           "bad explicit places list");
3312         next = scan;
3313         SKIP_DIGITS(next);
3314         count = __kmp_str_to_int(scan, *next);
3315         KMP_ASSERT(count >= 0);
3316         scan = next;
3317 
3318         //
3319         // valid follow sets are ',' ':' and EOL
3320         //
3321         SKIP_WS(scan);
3322         if (*scan == '\0' || *scan == ',') {
3323             stride = +1;
3324         }
3325         else {
3326             KMP_ASSERT2(*scan == ':', "bad explicit places list");
3327             scan++;         // skip ':'
3328 
3329             //
3330             // Read stride parameter
3331             //
3332             sign = +1;
3333             for (;;) {
3334                 SKIP_WS(scan);
3335                 if (*scan == '+') {
3336                     scan++; // skip '+'
3337                     continue;
3338                 }
3339                 if (*scan == '-') {
3340                     sign *= -1;
3341                     scan++; // skip '-'
3342                     continue;
3343                 }
3344                 break;
3345             }
3346             SKIP_WS(scan);
3347             KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3348               "bad explicit places list");
3349             next = scan;
3350             SKIP_DIGITS(next);
3351             stride = __kmp_str_to_int(scan, *next);
3352             KMP_DEBUG_ASSERT(stride >= 0);
3353             scan = next;
3354             stride *= sign;
3355         }
3356 
3357         // Add places determined by initial_place : count : stride
3358         for (i = 0; i < count; i++) {
3359             if (setSize == 0) {
3360                 break;
3361             }
3362             // Add the current place, then build the next place (tempMask) from that
3363             KMP_CPU_COPY(previousMask, tempMask);
3364             ADD_MASK(previousMask);
3365             KMP_CPU_ZERO(tempMask);
3366             setSize = 0;
3367             KMP_CPU_SET_ITERATE(j, previousMask) {
3368                 if (! KMP_CPU_ISSET(j, previousMask)) {
3369                     continue;
3370                 }
3371                 if ((j+stride > maxOsId) || (j+stride < 0) ||
3372                   (! KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
3373                   (! KMP_CPU_ISSET(j+stride, KMP_CPU_INDEX(osId2Mask, j+stride)))) {
3374                     if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3375                       && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
3376                         KMP_WARNING(AffIgnoreInvalidProcID, j+stride);
3377                     }
3378                     continue;
3379                 }
3380                 KMP_CPU_SET(j+stride, tempMask);
3381                 setSize++;
3382             }
3383         }
3384         KMP_CPU_ZERO(tempMask);
3385         setSize = 0;
3386 
3387         //
3388         // valid follow sets are ',' and EOL
3389         //
3390         SKIP_WS(scan);
3391         if (*scan == '\0') {
3392             break;
3393         }
3394         if (*scan == ',') {
3395             scan++;     // skip ','
3396             continue;
3397         }
3398 
3399         KMP_ASSERT2(0, "bad explicit places list");
3400     }
3401 
3402     *out_numMasks = nextNewMask;
    if (nextNewMask == 0) {
        *out_masks = NULL;
        KMP_CPU_FREE(tempMask);
        KMP_CPU_FREE(previousMask);
        KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
        return;
    }
3408     KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3409     KMP_CPU_FREE(tempMask);
3410     KMP_CPU_FREE(previousMask);
3411     for(i = 0; i < nextNewMask; i++) {
3412         kmp_affin_mask_t* src  = KMP_CPU_INDEX(newMasks, i);
3413         kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i);
3414         KMP_CPU_COPY(dest, src);
3415     }
3416     KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3417 }
3418 
3419 # endif /* OMP_40_ENABLED */
3420 
3421 #undef ADD_MASK
3422 #undef ADD_MASK_OSID
3423 
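//
// Restrict the discovered topology to the subset of sockets, cores, and
// threads-per-core requested through the __kmp_place_* globals (the
// user-requested hardware subset, e.g. 2 sockets x 4 cores x 1 thread per
// core).  Procs outside the subset are cleared from __kmp_affin_fullMask.
//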
3424 static void
3425 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3426 {
3427     int i, j, k, n_old = 0, n_new = 0, proc_num = 0;
3428     if (__kmp_place_num_sockets == 0 &&
3429         __kmp_place_num_cores == 0 &&
3430         __kmp_place_num_threads_per_core == 0 )
3431         goto _exit;   // no topology limiting actions requested, exit
3432     if (__kmp_place_num_sockets == 0)
3433         __kmp_place_num_sockets = nPackages;    // use all available sockets
3434     if (__kmp_place_num_cores == 0)
3435         __kmp_place_num_cores = nCoresPerPkg;   // use all available cores
3436     if (__kmp_place_num_threads_per_core == 0 ||
3437         __kmp_place_num_threads_per_core > __kmp_nThreadsPerCore)
3438         __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts
3439 
3440     if ( !__kmp_affinity_uniform_topology() ) {
3441         KMP_WARNING( AffHWSubsetNonUniform );
3442         goto _exit; // don't support non-uniform topology
3443     }
3444     if ( depth > 3 ) {
3445         KMP_WARNING( AffHWSubsetNonThreeLevel );
        goto _exit; // only 3-level (socket/core/thread) topologies are supported
3447     }
3448     if (__kmp_place_socket_offset + __kmp_place_num_sockets > nPackages) {
3449         KMP_WARNING(AffHWSubsetManySockets);
3450         goto _exit;
3451     }
3452     if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
3453         KMP_WARNING( AffHWSubsetManyCores );
3454         goto _exit;
3455     }
3456 
3457     AddrUnsPair *newAddr;
3458     if (pAddr) // pAddr is NULL in case of affinity_none
3459         newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3460             __kmp_place_num_sockets * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3461 
3462     for (i = 0; i < nPackages; ++i) {
3463         if (i < __kmp_place_socket_offset ||
3464             i >= __kmp_place_socket_offset + __kmp_place_num_sockets) {
3465             n_old += nCoresPerPkg * __kmp_nThreadsPerCore; // skip not-requested socket
3466             if (__kmp_pu_os_idx != NULL) {
3467                 for (j = 0; j < nCoresPerPkg; ++j) { // walk through skipped socket
3468                     for (k = 0; k < __kmp_nThreadsPerCore; ++k) {
3469                         KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3470                         ++proc_num;
3471                     }
3472                 }
3473             }
3474         } else {
3475             for (j = 0; j < nCoresPerPkg; ++j) { // walk through requested socket
3476                 if (j < __kmp_place_core_offset ||
3477                     j >= __kmp_place_core_offset + __kmp_place_num_cores) {
3478                     n_old += __kmp_nThreadsPerCore; // skip not-requested core
3479                     if (__kmp_pu_os_idx != NULL) {
3480                         for (k = 0; k < __kmp_nThreadsPerCore; ++k) { // walk through skipped core
3481                             KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3482                             ++proc_num;
3483                         }
3484                     }
3485                 } else {
3486                     for (k = 0; k < __kmp_nThreadsPerCore; ++k) { // walk through requested core
3487                         if (k < __kmp_place_num_threads_per_core) {
3488                             if (pAddr)
3489                                 newAddr[n_new] = (*pAddr)[n_old]; // collect requested thread's data
3490                             n_new++;
3491                         } else {
3492                             if (__kmp_pu_os_idx != NULL)
3493                                 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3494                         }
3495                         n_old++;
3496                         ++proc_num;
3497                     }
3498                 }
3499             }
3500         }
3501     }
3502     KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore);
3503     KMP_DEBUG_ASSERT(n_new == __kmp_place_num_sockets * __kmp_place_num_cores *
3504                      __kmp_place_num_threads_per_core);
3505 
3506     nPackages = __kmp_place_num_sockets;                      // correct nPackages
3507     nCoresPerPkg = __kmp_place_num_cores;                     // correct nCoresPerPkg
3508     __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3509     __kmp_avail_proc = n_new;                                 // correct avail_proc
3510     __kmp_ncores = nPackages * __kmp_place_num_cores;         // correct ncores
3511 
3512     if (pAddr) {
3513         __kmp_free( *pAddr );
3514         *pAddr = newAddr;      // replace old topology with new one
3515     }
3516 _exit:
3517     if (__kmp_pu_os_idx != NULL) {
3518         __kmp_free(__kmp_pu_os_idx);
3519         __kmp_pu_os_idx = NULL;
3520     }
3521 }
3522 
3523 //
3524 // This function figures out the deepest level at which there is at least one cluster/core
3525 // with more than one processing unit bound to it.
3526 //
3527 static int
3528 __kmp_affinity_find_core_level(const AddrUnsPair *address2os, int nprocs, int bottom_level)
3529 {
3530     int core_level = 0;
3531 
3532     for( int i = 0; i < nprocs; i++ ) {
3533         for( int j = bottom_level; j > 0; j-- ) {
3534             if( address2os[i].first.labels[j] > 0 ) {
3535                 if( core_level < ( j - 1 ) ) {
3536                     core_level = j - 1;
3537                 }
3538             }
3539         }
3540     }
3541     return core_level;
3542 }
3543 
3544 //
// This function counts the number of clusters/cores at the given level.
3546 //
3547 static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os, int nprocs, int bottom_level, int core_level)
3548 {
3549     int ncores = 0;
3550     int i, j;
3551 
3552     j = bottom_level;
3553     for( i = 0; i < nprocs; i++ ) {
3554         for ( j = bottom_level; j > core_level; j-- ) {
3555             if( ( i + 1 ) < nprocs ) {
3556                 if( address2os[i + 1].first.labels[j] > 0 ) {
3557                     break;
3558                 }
3559             }
3560         }
3561         if( j == core_level ) {
3562             ncores++;
3563         }
3564     }
3565     if( j > core_level ) {
3566         //
        // In case of ( nprocs < __kmp_avail_proc ) we may end up too deep and
        // miss one core.  This may occur when called from __kmp_affinity_find_core().
3569         //
3570         ncores++;
3571     }
3572     return ncores;
3573 }
3574 
3575 //
// This function finds which cluster/core the given processing unit is bound to.
3577 //
3578 static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc, int bottom_level, int core_level)
3579 {
3580     return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level, core_level) - 1;
3581 }
3582 
3583 //
// This function finds the maximal number of processing units bound to a cluster/core at the given level.
3585 //
3586 static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os, int nprocs, int bottom_level, int core_level)
3587 {
3588     int maxprocpercore = 0;
3589 
3590     if( core_level < bottom_level ) {
3591         for( int i = 0; i < nprocs; i++ ) {
3592             int percore = address2os[i].first.labels[core_level + 1] + 1;
3593 
3594             if( percore > maxprocpercore ) {
3595                 maxprocpercore = percore;
3596             }
3597        }
3598     } else {
3599         maxprocpercore = 1;
3600     }
3601     return maxprocpercore;
3602 }
3603 
3604 static AddrUnsPair *address2os = NULL;
3605 static int           * procarr = NULL;
3606 static int     __kmp_aff_depth = 0;
3607 
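//
// Bail out of topology discovery with affinity left off; only legal when the
// affinity type is already affinity_none and no topology table exists.
//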
3608 #define KMP_EXIT_AFF_NONE                             \
3609     KMP_ASSERT(__kmp_affinity_type == affinity_none); \
3610     KMP_ASSERT(address2os == NULL);                   \
3611     __kmp_apply_thread_places(NULL, 0);               \
3612     return;
3613 
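//
// qsort() comparator for Addresses: compare the innermost
// __kmp_affinity_compact levels of childNums first, then the remaining outer
// levels.  This produces the compact/scatter permutations of the machine
// topology tree.
//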
3614 static int
3615 __kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
3616 {
3617     const Address *aa = (const Address *)&(((AddrUnsPair *)a)
3618       ->first);
3619     const Address *bb = (const Address *)&(((AddrUnsPair *)b)
3620       ->first);
3621     unsigned depth = aa->depth;
3622     unsigned i;
3623     KMP_DEBUG_ASSERT(depth == bb->depth);
3624     KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
3625     KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
3626     for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
3627         int j = depth - i - 1;
3628         if (aa->childNums[j] < bb->childNums[j]) return -1;
3629         if (aa->childNums[j] > bb->childNums[j]) return 1;
3630     }
3631     for (; i < depth; i++) {
3632         int j = i - __kmp_affinity_compact;
3633         if (aa->childNums[j] < bb->childNums[j]) return -1;
3634         if (aa->childNums[j] > bb->childNums[j]) return 1;
3635     }
3636     return 0;
3637 }
3638 
3639 static void
3640 __kmp_aux_affinity_initialize(void)
3641 {
3642     if (__kmp_affinity_masks != NULL) {
3643         KMP_ASSERT(__kmp_affin_fullMask != NULL);
3644         return;
3645     }
3646 
3647     //
3648     // Create the "full" mask - this defines all of the processors that we
3649     // consider to be in the machine model.  If respect is set, then it is
3650     // the initialization thread's affinity mask.  Otherwise, it is all
3651     // processors that we know about on the machine.
3652     //
3653     if (__kmp_affin_fullMask == NULL) {
3654         KMP_CPU_ALLOC(__kmp_affin_fullMask);
3655     }
3656     if (KMP_AFFINITY_CAPABLE()) {
3657         if (__kmp_affinity_respect_mask) {
3658             __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
3659 
3660             //
3661             // Count the number of available processors.
3662             //
3663             unsigned i;
3664             __kmp_avail_proc = 0;
3665             KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
3666                 if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
3667                     continue;
3668                 }
3669                 __kmp_avail_proc++;
3670             }
3671             if (__kmp_avail_proc > __kmp_xproc) {
3672                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3673                   && (__kmp_affinity_type != affinity_none))) {
3674                     KMP_WARNING(ErrorInitializeAffinity);
3675                 }
3676                 __kmp_affinity_type = affinity_none;
3677                 KMP_AFFINITY_DISABLE();
3678                 return;
3679             }
3680         }
3681         else {
3682             __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
3683             __kmp_avail_proc = __kmp_xproc;
3684         }
3685     }
3686 
3687     int depth = -1;
3688     kmp_i18n_id_t msg_id = kmp_i18n_null;
3689 
3690     //
3691     // For backward compatibility, setting KMP_CPUINFO_FILE =>
3692     // KMP_TOPOLOGY_METHOD=cpuinfo
3693     //
3694     if ((__kmp_cpuinfo_file != NULL) &&
3695       (__kmp_affinity_top_method == affinity_top_method_all)) {
3696         __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3697     }
3698 
3699     if (__kmp_affinity_top_method == affinity_top_method_all) {
3700         //
3701         // In the default code path, errors are not fatal - we just try using
        // another method.  We only emit a warning message if affinity is on,
        // or the verbose flag is set, and the nowarnings flag was not set.
3704         //
3705         const char *file_name = NULL;
3706         int line = 0;
3707 # if KMP_USE_HWLOC
3708         if (depth < 0) {
3709             if (__kmp_affinity_verbose) {
3710                 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
3711             }
3712             if(!__kmp_hwloc_error) {
3713                 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
3714                 if (depth == 0) {
3715                     KMP_EXIT_AFF_NONE;
3716                 } else if(depth < 0 && __kmp_affinity_verbose) {
3717                     KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
3718                 }
3719             } else if(__kmp_affinity_verbose) {
3720                 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
3721             }
3722         }
3723 # endif
3724 
3725 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3726 
3727         if (depth < 0) {
3728             if (__kmp_affinity_verbose) {
3729                 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3730             }
3731 
3732             file_name = NULL;
3733             depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3734             if (depth == 0) {
3735                 KMP_EXIT_AFF_NONE;
3736             }
3737 
3738             if (depth < 0) {
3739                 if (__kmp_affinity_verbose) {
3740                     if (msg_id != kmp_i18n_null) {
3741                         KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3742                           KMP_I18N_STR(DecodingLegacyAPIC));
3743                     }
3744                     else {
3745                         KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
3746                     }
3747                 }
3748 
3749                 file_name = NULL;
3750                 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3751                 if (depth == 0) {
3752                     KMP_EXIT_AFF_NONE;
3753                 }
3754             }
3755         }
3756 
3757 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3758 
3759 # if KMP_OS_LINUX
3760 
3761         if (depth < 0) {
3762             if (__kmp_affinity_verbose) {
3763                 if (msg_id != kmp_i18n_null) {
3764                     KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3765                 }
3766                 else {
3767                     KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3768                 }
3769             }
3770 
3771             FILE *f = fopen("/proc/cpuinfo", "r");
3772             if (f == NULL) {
3773                 msg_id = kmp_i18n_str_CantOpenCpuinfo;
3774             }
3775             else {
3776                 file_name = "/proc/cpuinfo";
3777                 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3778                 fclose(f);
3779                 if (depth == 0) {
3780                     KMP_EXIT_AFF_NONE;
3781                 }
3782             }
3783         }
3784 
3785 # endif /* KMP_OS_LINUX */
3786 
3787 # if KMP_GROUP_AFFINITY
3788 
3789         if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
3790             if (__kmp_affinity_verbose) {
3791                 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3792             }
3793 
3794             depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3795             KMP_ASSERT(depth != 0);
3796         }
3797 
3798 # endif /* KMP_GROUP_AFFINITY */
3799 
3800         if (depth < 0) {
3801             if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
3802                 if (file_name == NULL) {
3803                     KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
3804                 }
3805                 else if (line == 0) {
3806                     KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
3807                 }
3808                 else {
3809                     KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
3810                 }
3811             }
            // FIXME - print msg if msg_id == kmp_i18n_null ???
3813 
3814             file_name = "";
3815             depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3816             if (depth == 0) {
3817                 KMP_EXIT_AFF_NONE;
3818             }
3819             KMP_ASSERT(depth > 0);
3820             KMP_ASSERT(address2os != NULL);
3821         }
3822     }
3823 
3824     //
    // If the user has specified that a particular topology discovery method
3826     // is to be used, then we abort if that method fails.  The exception is
3827     // group affinity, which might have been implicitly set.
3828     //
3829 
3830 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3831 
3832     else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3833         if (__kmp_affinity_verbose) {
3834             KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3835               KMP_I18N_STR(Decodingx2APIC));
3836         }
3837 
3838         depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3839         if (depth == 0) {
3840             KMP_EXIT_AFF_NONE;
3841         }
3842         if (depth < 0) {
3843             KMP_ASSERT(msg_id != kmp_i18n_null);
3844             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3845         }
3846     }
3847     else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3848         if (__kmp_affinity_verbose) {
3849             KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3850               KMP_I18N_STR(DecodingLegacyAPIC));
3851         }
3852 
3853         depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3854         if (depth == 0) {
3855             KMP_EXIT_AFF_NONE;
3856         }
3857         if (depth < 0) {
3858             KMP_ASSERT(msg_id != kmp_i18n_null);
3859             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3860         }
3861     }
3862 
3863 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3864 
3865     else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3866         const char *filename;
3867         if (__kmp_cpuinfo_file != NULL) {
3868             filename = __kmp_cpuinfo_file;
3869         }
3870         else {
3871             filename = "/proc/cpuinfo";
3872         }
3873 
3874         if (__kmp_affinity_verbose) {
3875             KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
3876         }
3877 
3878         FILE *f = fopen(filename, "r");
3879         if (f == NULL) {
3880             int code = errno;
3881             if (__kmp_cpuinfo_file != NULL) {
3882                 __kmp_msg(
3883                     kmp_ms_fatal,
3884                     KMP_MSG(CantOpenFileForReading, filename),
3885                     KMP_ERR(code),
3886                     KMP_HNT(NameComesFrom_CPUINFO_FILE),
3887                     __kmp_msg_null
3888                 );
3889             }
3890             else {
3891                 __kmp_msg(
3892                     kmp_ms_fatal,
3893                     KMP_MSG(CantOpenFileForReading, filename),
3894                     KMP_ERR(code),
3895                     __kmp_msg_null
3896                 );
3897             }
3898         }
3899         int line = 0;
3900         depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3901         fclose(f);
3902         if (depth < 0) {
3903             KMP_ASSERT(msg_id != kmp_i18n_null);
3904             if (line > 0) {
3905                 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
3906             }
3907             else {
3908                 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3909             }
3910         }
3911         if (__kmp_affinity_type == affinity_none) {
3912             KMP_ASSERT(depth == 0);
3913             KMP_EXIT_AFF_NONE;
3914         }
3915     }
3916 
3917 # if KMP_GROUP_AFFINITY
3918 
3919     else if (__kmp_affinity_top_method == affinity_top_method_group) {
3920         if (__kmp_affinity_verbose) {
3921             KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3922         }
3923 
3924         depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3925         KMP_ASSERT(depth != 0);
3926         if (depth < 0) {
3927             KMP_ASSERT(msg_id != kmp_i18n_null);
3928             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3929         }
3930     }
3931 
3932 # endif /* KMP_GROUP_AFFINITY */
3933 
3934     else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3935         if (__kmp_affinity_verbose) {
3936             KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
3937         }
3938 
3939         depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3940         if (depth == 0) {
3941             KMP_EXIT_AFF_NONE;
3942         }
3943         // should not fail
3944         KMP_ASSERT(depth > 0);
3945         KMP_ASSERT(address2os != NULL);
3946     }
3947 
3948 # if KMP_USE_HWLOC
3949     else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
3950         if (__kmp_affinity_verbose) {
3951             KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
3952         }
3953         depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
3954         if (depth == 0) {
3955             KMP_EXIT_AFF_NONE;
3956         }
3957     }
3958 # endif // KMP_USE_HWLOC
3959 
3960     if (address2os == NULL) {
3961         if (KMP_AFFINITY_CAPABLE()
3962           && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3963           && (__kmp_affinity_type != affinity_none)))) {
3964             KMP_WARNING(ErrorInitializeAffinity);
3965         }
3966         __kmp_affinity_type = affinity_none;
3967         KMP_AFFINITY_DISABLE();
3968         return;
3969     }
3970 
3971     __kmp_apply_thread_places(&address2os, depth);
3972 
3973     //
3974     // Create the table of masks, indexed by thread Id.
3975     //
3976     unsigned maxIndex;
3977     unsigned numUnique;
3978     kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
3979       address2os, __kmp_avail_proc);
3980     if (__kmp_affinity_gran_levels == 0) {
3981         KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
3982     }
3983 
3984     //
3985     // Set the childNums vector in all Address objects.  This must be done
3986     // before we can sort using __kmp_affinity_cmp_Address_child_num(),
3987     // which takes into account the setting of __kmp_affinity_compact.
3988     //
3989     __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
3990 
3991     switch (__kmp_affinity_type) {
3992 
3993         case affinity_explicit:
3994         KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3995 # if OMP_40_ENABLED
3996         if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3997 # endif
3998         {
3999             __kmp_affinity_process_proclist(&__kmp_affinity_masks,
4000               &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
4001               maxIndex);
4002         }
4003 # if OMP_40_ENABLED
4004         else {
4005             __kmp_affinity_process_placelist(&__kmp_affinity_masks,
4006               &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
4007               maxIndex);
4008         }
4009 # endif
4010         if (__kmp_affinity_num_masks == 0) {
4011             if (__kmp_affinity_verbose || (__kmp_affinity_warnings
4012               && (__kmp_affinity_type != affinity_none))) {
4013                 KMP_WARNING(AffNoValidProcID);
4014             }
4015             __kmp_affinity_type = affinity_none;
4016             return;
4017         }
4018         break;
4019 
4020         //
4021         // The other affinity types rely on sorting the Addresses according
4022         // to some permutation of the machine topology tree.  Set
4023         // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
4024         // then jump to a common code fragment to do the sort and create
4025         // the array of affinity masks.
4026         //
4027 
4028         case affinity_logical:
4029         __kmp_affinity_compact = 0;
4030         if (__kmp_affinity_offset) {
4031             __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
4032               % __kmp_avail_proc;
4033         }
4034         goto sortAddresses;
4035 
4036         case affinity_physical:
4037         if (__kmp_nThreadsPerCore > 1) {
4038             __kmp_affinity_compact = 1;
4039             if (__kmp_affinity_compact >= depth) {
4040                 __kmp_affinity_compact = 0;
4041             }
4042         } else {
4043             __kmp_affinity_compact = 0;
4044         }
4045         if (__kmp_affinity_offset) {
4046             __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
4047               % __kmp_avail_proc;
4048         }
4049         goto sortAddresses;
4050 
4051         case affinity_scatter:
4052         if (__kmp_affinity_compact >= depth) {
4053             __kmp_affinity_compact = 0;
4054         }
4055         else {
4056             __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
4057         }
4058         goto sortAddresses;
4059 
4060         case affinity_compact:
4061         if (__kmp_affinity_compact >= depth) {
4062             __kmp_affinity_compact = depth - 1;
4063         }
4064         goto sortAddresses;
4065 
4066         case affinity_balanced:
4067         if( depth <= 1 ) {
4068             if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
4069                 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
4070             }
4071             __kmp_affinity_type = affinity_none;
4072             return;
4073         } else if( __kmp_affinity_uniform_topology() ) {
4074             break;
4075         } else { // Non-uniform topology
4076 
4077             // Save the depth for further usage
4078             __kmp_aff_depth = depth;
4079 
4080             int core_level = __kmp_affinity_find_core_level(address2os, __kmp_avail_proc, depth - 1);
4081             int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, depth - 1, core_level);
4082             int maxprocpercore = __kmp_affinity_max_proc_per_core(address2os, __kmp_avail_proc, depth - 1, core_level);
4083 
4084             int nproc = ncores * maxprocpercore;
4085             if( ( nproc < 2 ) || ( nproc < __kmp_avail_proc ) ) {
4086                 if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
4087                     KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
4088                 }
4089                 __kmp_affinity_type = affinity_none;
4090                 return;
4091             }
4092 
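            // procarr is an (ncores x maxprocpercore) matrix flattened row by
            // row: procarr[core * maxprocpercore + k] is the k-th OS proc id
            // bound to that core, with unused slots left as -1.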
4093             procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4094             for( int i = 0; i < nproc; i++ ) {
4095                 procarr[ i ] = -1;
4096             }
4097 
4098             int lastcore = -1;
4099             int inlastcore = 0;
4100             for( int i = 0; i < __kmp_avail_proc; i++ ) {
4101                 int proc = address2os[ i ].second;
4102                 int core = __kmp_affinity_find_core(address2os, i, depth - 1, core_level);
4103 
4104                 if ( core == lastcore ) {
4105                     inlastcore++;
4106                 } else {
4107                     inlastcore = 0;
4108                 }
4109                 lastcore = core;
4110 
4111                 procarr[ core * maxprocpercore + inlastcore ] = proc;
4112             }
4113 
4114             break;
4115         }
4116 
4117         sortAddresses:
4118         //
4119         // Allocate the gtid->affinity mask table.
4120         //
4121         if (__kmp_affinity_dups) {
4122             __kmp_affinity_num_masks = __kmp_avail_proc;
4123         }
4124         else {
4125             __kmp_affinity_num_masks = numUnique;
4126         }
4127 
4128 # if OMP_40_ENABLED
4129         if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
4130           && ( __kmp_affinity_num_places > 0 )
4131           && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
4132             __kmp_affinity_num_masks = __kmp_affinity_num_places;
4133         }
4134 # endif
4135 
4136         KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
4137 
4138         //
4139         // Sort the address2os table according to the current setting of
4140         // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
4141         //
4142         qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
4143           __kmp_affinity_cmp_Address_child_num);
4144         {
4145             int i;
4146             unsigned j;
4147             for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
4148                 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
4149                     continue;
4150                 }
4151                 unsigned osId = address2os[i].second;
4152                 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
4153                 kmp_affin_mask_t *dest
4154                   = KMP_CPU_INDEX(__kmp_affinity_masks, j);
4155                 KMP_ASSERT(KMP_CPU_ISSET(osId, src));
4156                 KMP_CPU_COPY(dest, src);
4157                 if (++j >= __kmp_affinity_num_masks) {
4158                     break;
4159                 }
4160             }
4161             KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
4162         }
4163         break;
4164 
4165         default:
4166         KMP_ASSERT2(0, "Unexpected affinity setting");
4167     }
4168 
4169     KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex+1);
4170     machine_hierarchy.init(address2os, __kmp_avail_proc);
4171 }
4172 #undef KMP_EXIT_AFF_NONE
4173 
4174 
4175 void
4176 __kmp_affinity_initialize(void)
4177 {
4178     //
    // Much of the code above was written assuming that if a machine was not
4180     // affinity capable, then __kmp_affinity_type == affinity_none.  We now
4181     // explicitly represent this as __kmp_affinity_type == affinity_disabled.
4182     //
4183     // There are too many checks for __kmp_affinity_type == affinity_none
4184     // in this code.  Instead of trying to change them all, check if
4185     // __kmp_affinity_type == affinity_disabled, and if so, slam it with
4186     // affinity_none, call the real initialization routine, then restore
4187     // __kmp_affinity_type to affinity_disabled.
4188     //
4189     int disabled = (__kmp_affinity_type == affinity_disabled);
4190     if (! KMP_AFFINITY_CAPABLE()) {
4191         KMP_ASSERT(disabled);
4192     }
4193     if (disabled) {
4194         __kmp_affinity_type = affinity_none;
4195     }
4196     __kmp_aux_affinity_initialize();
4197     if (disabled) {
4198         __kmp_affinity_type = affinity_disabled;
4199     }
4200 }
4201 
4202 
4203 void
4204 __kmp_affinity_uninitialize(void)
4205 {
4206     if (__kmp_affinity_masks != NULL) {
4207         KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
4208         __kmp_affinity_masks = NULL;
4209     }
4210     if (__kmp_affin_fullMask != NULL) {
4211         KMP_CPU_FREE(__kmp_affin_fullMask);
4212         __kmp_affin_fullMask = NULL;
4213     }
4214     __kmp_affinity_num_masks = 0;
4215 # if OMP_40_ENABLED
4216     __kmp_affinity_num_places = 0;
4217 # endif
4218     if (__kmp_affinity_proclist != NULL) {
4219         __kmp_free(__kmp_affinity_proclist);
4220         __kmp_affinity_proclist = NULL;
4221     }
4222     if( address2os != NULL ) {
4223         __kmp_free( address2os );
4224         address2os = NULL;
4225     }
4226     if( procarr != NULL ) {
4227         __kmp_free( procarr );
4228         procarr = NULL;
4229     }
4230 # if KMP_USE_HWLOC
4231     if (__kmp_hwloc_topology != NULL) {
4232         hwloc_topology_destroy(__kmp_hwloc_topology);
4233         __kmp_hwloc_topology = NULL;
4234     }
4235 # endif
4236 }
4237 
4238 
4239 void
4240 __kmp_affinity_set_init_mask(int gtid, int isa_root)
4241 {
4242     if (! KMP_AFFINITY_CAPABLE()) {
4243         return;
4244     }
4245 
4246     kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4247     if (th->th.th_affin_mask == NULL) {
4248         KMP_CPU_ALLOC(th->th.th_affin_mask);
4249     }
4250     else {
4251         KMP_CPU_ZERO(th->th.th_affin_mask);
4252     }
4253 
4254     //
    // Copy the thread mask to the kmp_info_t structure.
    // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
    // that has all of the OS proc ids set; if __kmp_affinity_respect_mask
    // is set, the full mask is the same as the mask of the initialization
    // thread.
4260     //
4261     kmp_affin_mask_t *mask;
4262     int i;
4263 
4264 # if OMP_40_ENABLED
4265     if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
4266 # endif
4267     {
        if ((__kmp_affinity_type == affinity_none) ||
          (__kmp_affinity_type == affinity_balanced)) {
4270 # if KMP_GROUP_AFFINITY
4271             if (__kmp_num_proc_groups > 1) {
4272                 return;
4273             }
4274 # endif
4275             KMP_ASSERT(__kmp_affin_fullMask != NULL);
4276             i = KMP_PLACE_ALL;
4277             mask = __kmp_affin_fullMask;
4278         }
4279         else {
4280             KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
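            // Assign masks to threads round-robin, rotated by the
            // user-specified affinity offset.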
4281             i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4282             mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4283         }
4284     }
4285 # if OMP_40_ENABLED
4286     else {
4287         if ((! isa_root)
4288           || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
4289 #  if KMP_GROUP_AFFINITY
4290             if (__kmp_num_proc_groups > 1) {
4291                 return;
4292             }
4293 #  endif
4294             KMP_ASSERT(__kmp_affin_fullMask != NULL);
4295             i = KMP_PLACE_ALL;
4296             mask = __kmp_affin_fullMask;
4297         }
4298         else {
4299             //
4300             // int i = some hash function or just a counter that doesn't
4301             // always start at 0.  Use gtid for now.
4302             //
4303             KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4304             i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4305             mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4306         }
4307     }
4308 # endif
4309 
4310 # if OMP_40_ENABLED
4311     th->th.th_current_place = i;
4312     if (isa_root) {
4313         th->th.th_new_place = i;
4314         th->th.th_first_place = 0;
4315         th->th.th_last_place = __kmp_affinity_num_masks - 1;
4316     }
4317 
4318     if (i == KMP_PLACE_ALL) {
4319         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4320           gtid));
4321     }
4322     else {
4323         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4324           gtid, i));
4325     }
4326 # else
4327     if (i == -1) {
4328         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to __kmp_affin_fullMask\n",
4329           gtid));
4330     }
4331     else {
4332         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
4333           gtid, i));
4334     }
4335 # endif /* OMP_40_ENABLED */
4336 
4337     KMP_CPU_COPY(th->th.th_affin_mask, mask);
4338 
4339     if (__kmp_affinity_verbose) {
4340         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4341         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4342           th->th.th_affin_mask);
4343         KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
4344           buf);
4345     }
4346 
4347 # if KMP_OS_WINDOWS
4348     //
4349     // On Windows* OS, the process affinity mask might have changed.
4350     // If the user didn't request affinity and this call fails,
4351     // just continue silently.  See CQ171393.
4352     //
4353     if ( __kmp_affinity_type == affinity_none ) {
4354         __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4355     }
4356     else
4357 # endif
4358     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4359 }
4360 
4361 
4362 # if OMP_40_ENABLED
4363 
4364 void
4365 __kmp_affinity_set_place(int gtid)
4366 {
4368 
4369     if (! KMP_AFFINITY_CAPABLE()) {
4370         return;
4371     }
4372 
4373     kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4374 
4375     KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
4376       gtid, th->th.th_new_place, th->th.th_current_place));
4377 
4378     //
4379     // Check that the new place is within this thread's partition.
4380     //
4381     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4382     KMP_ASSERT(th->th.th_new_place >= 0);
4383     KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4384     if (th->th.th_first_place <= th->th.th_last_place) {
4385         KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
4386          && (th->th.th_new_place <= th->th.th_last_place));
4387     }
4388     else {
4389         KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
4390          || (th->th.th_new_place >= th->th.th_last_place));
4391     }
4392 
4393     //
4394     // Copy the thread mask to the kmp_info_t strucuture,
4395     // and set this thread's affinity.
4396     //
4397     kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
4398       th->th.th_new_place);
4399     KMP_CPU_COPY(th->th.th_affin_mask, mask);
4400     th->th.th_current_place = th->th.th_new_place;
4401 
4402     if (__kmp_affinity_verbose) {
4403         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4404         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4405           th->th.th_affin_mask);
4406         KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4407           gtid, buf);
4408     }
4409     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4410 }
4411 
4412 # endif /* OMP_40_ENABLED */
4413 
4414 
4415 int
4416 __kmp_aux_set_affinity(void **mask)
4417 {
4418     int gtid;
4419     kmp_info_t *th;
4420     int retval;
4421 
4422     if (! KMP_AFFINITY_CAPABLE()) {
4423         return -1;
4424     }
4425 
4426     gtid = __kmp_entry_gtid();
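    // Note: the leading ';' in the KA_TRACE argument below turns the macro's
    // usual printf call into a no-op statement, so the braced block can do its
    // own mask formatting (debug builds only; see KA_TRACE in kmp_debug.h).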
4427     KA_TRACE(1000, ;{
4428         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4429         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4430           (kmp_affin_mask_t *)(*mask));
4431         __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4432           gtid, buf);
4433     });
4434 
4435     if (__kmp_env_consistency_check) {
4436         if ((mask == NULL) || (*mask == NULL)) {
4437             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4438         }
4439         else {
4440             unsigned proc;
4441             int num_procs = 0;
4442 
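            // Every proc set in the user's mask must also be present in the
            // machine's full mask, and at least one proc must be set.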
4443             KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t*)(*mask))) {
4444                 if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4445                     KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4446                 }
4447                 if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4448                     continue;
4449                 }
4450                 num_procs++;
4451             }
4452             if (num_procs == 0) {
4453                 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4454             }
4455 
4456 # if KMP_GROUP_AFFINITY
4457             if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4458                 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4459             }
4460 # endif /* KMP_GROUP_AFFINITY */
4461 
4462         }
4463     }
4464 
4465     th = __kmp_threads[gtid];
4466     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4467     retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4468     if (retval == 0) {
4469         KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4470     }
4471 
4472 # if OMP_40_ENABLED
4473     th->th.th_current_place = KMP_PLACE_UNDEFINED;
4474     th->th.th_new_place = KMP_PLACE_UNDEFINED;
4475     th->th.th_first_place = 0;
4476     th->th.th_last_place = __kmp_affinity_num_masks - 1;
4477 
4478     //
4479     // Turn off 4.0 affinity for the current tread at this parallel level.
4480     //
4481     th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
4482 # endif
4483 
4484     return retval;
4485 }
4486 
4487 
4488 int
4489 __kmp_aux_get_affinity(void **mask)
4490 {
4491     int gtid;
4492     int retval;
4493     kmp_info_t *th;
4494 
4495     if (! KMP_AFFINITY_CAPABLE()) {
4496         return -1;
4497     }
4498 
4499     gtid = __kmp_entry_gtid();
4500     th = __kmp_threads[gtid];
4501     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4502 
4503     KA_TRACE(1000, ;{
4504         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4505         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4506           th->th.th_affin_mask);
4507         __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
4508     });
4509 
4510     if (__kmp_env_consistency_check) {
4511         if ((mask == NULL) || (*mask == NULL)) {
4512             KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4513         }
4514     }
4515 
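    //
    // On non-Windows systems, report the mask as the OS currently sees it;
    // on Windows, hand back the runtime's stored copy of the mask.
    //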
4516 # if !KMP_OS_WINDOWS
4517 
4518     retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4519     KA_TRACE(1000, ;{
4520         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4521         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4522           (kmp_affin_mask_t *)(*mask));
4523         __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
4524     });
4525     return retval;
4526 
4527 # else
4528 
4529     KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4530     return 0;
4531 
4532 # endif /* KMP_OS_WINDOWS */
4533 
4534 }
4535 
4536 int
__kmp_aux_get_affinity_max_proc()
{
    if (! KMP_AFFINITY_CAPABLE()) {
4539         return 0;
4540     }
4541 #if KMP_GROUP_AFFINITY
4542     if ( __kmp_num_proc_groups > 1 ) {
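        // e.g., two processor groups on 64-bit Windows:
        // 2 * sizeof(DWORD_PTR) * CHAR_BIT = 2 * 8 * 8 = 128 procs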
4543         return (int)(__kmp_num_proc_groups*sizeof(DWORD_PTR)*CHAR_BIT);
4544     }
4545 #endif
4546     return __kmp_xproc;
4547 }
4548 
4549 int
4550 __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
4551 {
4554     if (! KMP_AFFINITY_CAPABLE()) {
4555         return -1;
4556     }
4557 
4558     KA_TRACE(1000, ;{
4559         int gtid = __kmp_entry_gtid();
4560         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4561         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4562           (kmp_affin_mask_t *)(*mask));
4563         __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4564           proc, gtid, buf);
4565     });
4566 
4567     if (__kmp_env_consistency_check) {
4568         if ((mask == NULL) || (*mask == NULL)) {
4569             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4570         }
4571     }
4572 
4573     if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
4574         return -1;
4575     }
4576     if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4577         return -2;
4578     }
4579 
4580     KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4581     return 0;
4582 }
4583 
4584 
4585 int
4586 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4587 {
4590     if (! KMP_AFFINITY_CAPABLE()) {
4591         return -1;
4592     }
4593 
4594     KA_TRACE(1000, ;{
4595         int gtid = __kmp_entry_gtid();
4596         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4597         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4598           (kmp_affin_mask_t *)(*mask));
4599         __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4600           proc, gtid, buf);
4601     });
4602 
4603     if (__kmp_env_consistency_check) {
4604         if ((mask == NULL) || (*mask == NULL)) {
4605             KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4606         }
4607     }
4608 
4609     if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
4610         return -1;
4611     }
4612     if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4613         return -2;
4614     }
4615 
4616     KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4617     return 0;
4618 }
4619 
4620 
4621 int
4622 __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4623 {
4626     if (! KMP_AFFINITY_CAPABLE()) {
4627         return -1;
4628     }
4629 
4630     KA_TRACE(1000, ;{
4631         int gtid = __kmp_entry_gtid();
4632         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4633         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4634           (kmp_affin_mask_t *)(*mask));
4635         __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4636           proc, gtid, buf);
4637     });
4638 
4639     if (__kmp_env_consistency_check) {
4640         if ((mask == NULL) || (*mask == NULL)) {
4641             KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
4642         }
4643     }
4644 
4645     if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
4646         return -1;
4647     }
4648     if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4649         return 0;
4650     }
4651 
4652     return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4653 }
4654 
4655 
4656 // Dynamic affinity settings - Affinity balanced
4657 void __kmp_balanced_affinity( int tid, int nthreads )
4658 {
4659     bool fine_gran = true;
4660 
4661     switch (__kmp_affinity_gran) {
4662         case affinity_gran_fine:
4663         case affinity_gran_thread:
4664             break;
4665         case affinity_gran_core:
4666             if( __kmp_nThreadsPerCore > 1) {
4667                 fine_gran = false;
4668             }
4669             break;
4670         case affinity_gran_package:
4671             if( nCoresPerPkg > 1) {
4672                 fine_gran = false;
4673             }
4674             break;
4675         default:
4676             fine_gran = false;
4677     }
4678 
4679     if( __kmp_affinity_uniform_topology() ) {
4680         int coreID;
4681         int threadID;
        // Number of hardware threads per core on a hyper-threading machine
4683         int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4684         // Number of cores
4685         int ncores = __kmp_ncores;
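        // With multiple packages but no hyper-threading, balance across
        // packages instead of individual cores.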
4686         if( ( nPackages > 1 ) && ( __kmp_nth_per_core <= 1 ) ) {
4687             __kmp_nth_per_core = __kmp_avail_proc / nPackages;
4688             ncores = nPackages;
4689         }
4690         // How many threads will be bound to each core
4691         int chunk = nthreads / ncores;
        // How many cores will have an additional thread bound to them - the "big" cores
4693         int big_cores = nthreads % ncores;
4694         // Number of threads on the big cores
4695         int big_nth = ( chunk + 1 ) * big_cores;
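        // Worked example: nthreads = 10, ncores = 4, __kmp_nth_per_core = 2
        // gives chunk = 2, big_cores = 2, big_nth = 6, so tids 0-2 land on
        // core 0, tids 3-5 on core 1, tids 6-7 on core 2, tids 8-9 on core 3.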
4696         if( tid < big_nth ) {
4697             coreID = tid / (chunk + 1 );
4698             threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
4699         } else { //tid >= big_nth
4700             coreID = ( tid - big_cores ) / chunk;
4701             threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
4702         }
4703 
4704         KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4705           "Illegal set affinity operation when not capable");
4706 
4707         kmp_affin_mask_t *mask;
4708         KMP_CPU_ALLOC_ON_STACK(mask);
4709         KMP_CPU_ZERO(mask);
4710 
4711         if( fine_gran ) {
4712             int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
4713             KMP_CPU_SET( osID, mask);
4714         } else {
4715             for( int i = 0; i < __kmp_nth_per_core; i++ ) {
4716                 int osID;
4717                 osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
4718                 KMP_CPU_SET( osID, mask);
4719             }
4720         }
4721         if (__kmp_affinity_verbose) {
4722             char buf[KMP_AFFIN_MASK_PRINT_LEN];
4723             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4724             KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4725               tid, buf);
4726         }
4727         __kmp_set_system_affinity( mask, TRUE );
4728         KMP_CPU_FREE_FROM_STACK(mask);
4729     } else { // Non-uniform topology
4730 
4731         kmp_affin_mask_t *mask;
4732         KMP_CPU_ALLOC_ON_STACK(mask);
4733         KMP_CPU_ZERO(mask);
4734 
4735         int core_level = __kmp_affinity_find_core_level(address2os, __kmp_avail_proc, __kmp_aff_depth - 1);
4736         int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
4737         int nth_per_core = __kmp_affinity_max_proc_per_core(address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
4738 
        // As a performance optimization, handle the special case nthreads == __kmp_avail_proc separately
4740         if( nthreads == __kmp_avail_proc ) {
4741             if( fine_gran ) {
4742                 int osID = address2os[ tid ].second;
4743                 KMP_CPU_SET( osID, mask);
4744             } else {
4745                 int core = __kmp_affinity_find_core(address2os, tid, __kmp_aff_depth - 1, core_level);
4746                 for( int i = 0; i < __kmp_avail_proc; i++ ) {
4747                     int osID = address2os[ i ].second;
4748                     if( __kmp_affinity_find_core(address2os, i,  __kmp_aff_depth - 1, core_level) == core ) {
4749                         KMP_CPU_SET( osID, mask);
4750                     }
4751                 }
4752             }
4753         } else if( nthreads <= ncores ) {
4754 
4755             int core = 0;
4756             for( int i = 0; i < ncores; i++ ) {
                // Check whether this core has any available processors in procarr[]
4758                 int in_mask = 0;
4759                 for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
4761                         in_mask = 1;
4762                         break;
4763                     }
4764                 }
4765                 if( in_mask ) {
4766                     if( tid == core ) {
4767                         for( int j = 0; j < nth_per_core; j++ ) {
4768                             int osID = procarr[ i * nth_per_core + j ];
4769                             if( osID != -1 ) {
4770                                 KMP_CPU_SET( osID, mask );
4771                                 // For fine granularity it is enough to set the first available osID for this core
4772                                 if( fine_gran) {
4773                                     break;
4774                                 }
4775                             }
4776                         }
4777                         break;
4778                     } else {
4779                         core++;
4780                     }
4781                 }
4782             }
4783 
4784         } else { // nthreads > ncores
4785 
4786             // Array to save the number of processors at each core
4787             int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores);
            // Array to save the number of cores with exactly "x" available processors
            int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
            // Array to save the number of cores with at least "x" available processors
            int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
4792 
4793             for( int i = 0; i <= nth_per_core; i++ ) {
4794                 ncores_with_x_procs[ i ] = 0;
4795                 ncores_with_x_to_max_procs[ i ] = 0;
4796             }
4797 
4798             for( int i = 0; i < ncores; i++ ) {
4799                 int cnt = 0;
4800                 for( int j = 0; j < nth_per_core; j++ ) {
4801                     if( procarr[ i * nth_per_core + j ] != -1 ) {
4802                         cnt++;
4803                     }
4804                 }
4805                 nproc_at_core[ i ] = cnt;
4806                 ncores_with_x_procs[ cnt ]++;
4807             }
4808 
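            // Suffix sums: ncores_with_x_to_max_procs[i] counts the cores that
            // have at least i available procs, e.g. for nth_per_core = 2 and
            // ncores_with_x_procs = {1, 2, 3} the result is {6, 5, 3}.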
4809             for( int i = 0; i <= nth_per_core; i++ ) {
4810                 for( int j = i; j <= nth_per_core; j++ ) {
4811                     ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4812                 }
4813             }
4814 
4815             // Max number of processors
4816             int nproc = nth_per_core * ncores;
            // An array to keep the number of threads assigned to each context
4818             int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4819             for( int i = 0; i < nproc; i++ ) {
4820                 newarr[ i ] = 0;
4821             }
4822 
4823             int nth = nthreads;
4824             int flag = 0;
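            // Distribute the threads over the available contexts: the first
            // pass (flag == 0) places at most one thread on each context;
            // later passes stack additional threads onto already-used ones.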
4825             while( nth > 0 ) {
4826                 for( int j = 1; j <= nth_per_core; j++ ) {
4827                     int cnt = ncores_with_x_to_max_procs[ j ];
4828                     for( int i = 0; i < ncores; i++ ) {
                        // Skip cores with no available processors
4830                         if( nproc_at_core[ i ] == 0 ) {
4831                             continue;
4832                         }
4833                         for( int k = 0; k < nth_per_core; k++ ) {
4834                             if( procarr[ i * nth_per_core + k ] != -1 ) {
4835                                 if( newarr[ i * nth_per_core + k ] == 0 ) {
4836                                     newarr[ i * nth_per_core + k ] = 1;
4837                                     cnt--;
4838                                     nth--;
4839                                     break;
4840                                 } else {
4841                                     if( flag != 0 ) {
4842                                         newarr[ i * nth_per_core + k ] ++;
4843                                         cnt--;
4844                                         nth--;
4845                                         break;
4846                                     }
4847                                 }
4848                             }
4849                         }
4850                         if( cnt == 0 || nth == 0 ) {
4851                             break;
4852                         }
4853                     }
4854                     if( nth == 0 ) {
4855                         break;
4856                     }
4857                 }
4858                 flag = 1;
4859             }
4860             int sum = 0;
4861             for( int i = 0; i < nproc; i++ ) {
4862                 sum += newarr[ i ];
4863                 if( sum > tid ) {
4864                     if( fine_gran) {
4865                         int osID = procarr[ i ];
4866                         KMP_CPU_SET( osID, mask);
4867                     } else {
4868                         int coreID = i / nth_per_core;
4869                         for( int ii = 0; ii < nth_per_core; ii++ ) {
4870                             int osID = procarr[ coreID * nth_per_core + ii ];
4871                             if( osID != -1 ) {
4872                                 KMP_CPU_SET( osID, mask);
4873                             }
4874                         }
4875                     }
4876                     break;
4877                 }
4878             }
4879             __kmp_free( newarr );
4880         }
4881 
4882         if (__kmp_affinity_verbose) {
4883             char buf[KMP_AFFIN_MASK_PRINT_LEN];
4884             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4885             KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4886               tid, buf);
4887         }
4888         __kmp_set_system_affinity( mask, TRUE );
4889         KMP_CPU_FREE_FROM_STACK(mask);
4890     }
4891 }
4892 
4893 #if KMP_OS_LINUX
// We don't need this entry for Windows because
// the GetProcessAffinityMask() API is available there.
//
// The intended usage is indicated by these steps
// (an illustrative sketch of the flow follows below):
// 1) The user gets the current affinity mask
// 2) Then sets the affinity by calling this function
// 3) Error checks the return value
// 4) Uses non-OpenMP parallelization
// 5) Resets the affinity to what was stored in step 1)
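//
// Sketch (hypothetical caller; kmp_get_affinity, kmp_set_affinity and the
// mask helpers are the kmp_* affinity API declared in omp.h):
//
//   kmp_affinity_mask_t saved;
//   kmp_create_affinity_mask(&saved);
//   kmp_get_affinity(&saved);                         // 1) save current mask
//   if (kmp_set_thread_affinity_mask_initial() != 0)  // 2) widen to full mask
//       handle_bind_error();                          // 3) hypothetical handler
//   run_non_openmp_work();                            // 4) hypothetical non-OpenMP code
//   kmp_set_affinity(&saved);                         // 5) restore saved mask
//   kmp_destroy_affinity_mask(&saved);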
4903 #ifdef __cplusplus
4904 extern "C"
4905 #endif
4906 int
4907 kmp_set_thread_affinity_mask_initial()
4908 // the function returns 0 on success,
4909 //   -1 if we cannot bind thread
4910 //   >0 (errno) if an error happened during binding
4911 {
4912     int gtid = __kmp_get_gtid();
4913     if (gtid < 0) {
4914         // Do not touch non-omp threads
4915         KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
4916             "non-omp thread, returning\n"));
4917         return -1;
4918     }
4919     if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
4920         KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
4921             "affinity not initialized, returning\n"));
4922         return -1;
4923     }
4924     KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
4925         "set full mask for thread %d\n", gtid));
4926     KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
4927     return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
4928 }
4929 #endif
4930 
4931 #endif // KMP_AFFINITY_SUPPORTED
4932