1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_io.h"
19 #include "kmp_str.h"
20 #include "kmp_wrapper_getpid.h"
21 #include "kmp_affinity.h"
22 
23 // Store the real or imagined machine hierarchy here
24 static hierarchy_info machine_hierarchy;
25 
26 void __kmp_cleanup_hierarchy() {
27     machine_hierarchy.fini();
28 }
29 
30 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
31     kmp_uint32 depth;
    // The test below is true if affinity is available, but set to "none".
    // We need to initialize the hierarchy on first use of a hierarchical barrier.
33     if (TCR_1(machine_hierarchy.uninitialized))
34         machine_hierarchy.init(NULL, nproc);
35 
36     // Adjust the hierarchy in case num threads exceeds original
37     if (nproc > machine_hierarchy.base_num_threads)
38         machine_hierarchy.resize(nproc);
39 
40     depth = machine_hierarchy.depth;
41     KMP_DEBUG_ASSERT(depth > 0);
42 
43     thr_bar->depth = depth;
44     thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
45     thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
46 }
47 
48 #if KMP_AFFINITY_SUPPORTED
49 
50 //
51 // Print the affinity mask to the character array in a pretty format.
52 //
53 #if KMP_USE_HWLOC
54 char *
55 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
56 {
57     int num_chars_to_write, num_chars_written;
58     char* scan;
59     KMP_ASSERT(buf_len >= 40);
60 
61     // bufsize of 0 just retrieves the needed buffer size.
62     num_chars_to_write = hwloc_bitmap_list_snprintf(buf, 0, (hwloc_bitmap_t)mask);
63 
64     // need '{', "xxxxxxxx...xx", '}', '\0' = num_chars_to_write + 3 bytes
65     // * num_chars_to_write returned by hwloc_bitmap_list_snprintf does not
66     //   take into account the '\0' character.
67     if(hwloc_bitmap_iszero((hwloc_bitmap_t)mask)) {
68         KMP_SNPRINTF(buf, buf_len, "{<empty>}");
69     } else if(num_chars_to_write < buf_len - 3) {
70         // no problem fitting the mask into buf_len number of characters
71         buf[0] = '{';
72         // use buf_len-3 because we have the three characters: '{' '}' '\0' to add to the buffer
73         num_chars_written = hwloc_bitmap_list_snprintf(buf+1, buf_len-3, (hwloc_bitmap_t)mask);
74         buf[num_chars_written+1] = '}';
75         buf[num_chars_written+2] = '\0';
76     } else {
77         // Need to truncate the affinity mask string and add ellipsis.
78         // To do this, we first write out the '{' + str(mask)
79         buf[0] = '{';
80         hwloc_bitmap_list_snprintf(buf+1, buf_len-1, (hwloc_bitmap_t)mask);
        // Then, go to the 7th-to-last character and walk backwards until we
        // are NOT on a digit, and write "...}\0" there.  This keeps the
        // ellipsis clean and avoids overwriting part of a proc number,
        // i.e. we produce "{ 45, 67,...}" rather than "{ 45, 67, 8...}".
85         scan = buf + buf_len - 7;
        while(scan >= buf && *scan >= '0' && *scan <= '9')
            scan--;
88         *(scan+1) = '.';
89         *(scan+2) = '.';
90         *(scan+3) = '.';
91         *(scan+4) = '}';
92         *(scan+5) = '\0';
93     }
94     return buf;
95 }
96 #else
97 char *
98 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
99 {
100     KMP_ASSERT(buf_len >= 40);
101     char *scan = buf;
102     char *end = buf + buf_len - 1;
103 
104     //
105     // Find first element / check for empty set.
106     //
107     size_t i;
108     for (i = 0; i < KMP_CPU_SETSIZE; i++) {
109         if (KMP_CPU_ISSET(i, mask)) {
110             break;
111         }
112     }
113     if (i == KMP_CPU_SETSIZE) {
114         KMP_SNPRINTF(scan, end-scan+1, "{<empty>}");
115         while (*scan != '\0') scan++;
116         KMP_ASSERT(scan <= end);
117         return buf;
118     }
119 
120     KMP_SNPRINTF(scan, end-scan+1, "{%ld", (long)i);
121     while (*scan != '\0') scan++;
122     i++;
123     for (; i < KMP_CPU_SETSIZE; i++) {
124         if (! KMP_CPU_ISSET(i, mask)) {
125             continue;
126         }
127 
128         //
129         // Check for buffer overflow.  A string of the form ",<n>" will have
130         // at most 10 characters, plus we want to leave room to print ",...}"
131         // if the set is too large to print for a total of 15 characters.
132         // We already left room for '\0' in setting end.
133         //
134         if (end - scan < 15) {
135            break;
136         }
137         KMP_SNPRINTF(scan, end-scan+1, ",%-ld", (long)i);
138         while (*scan != '\0') scan++;
139     }
140     if (i < KMP_CPU_SETSIZE) {
141         KMP_SNPRINTF(scan, end-scan+1,  ",...");
142         while (*scan != '\0') scan++;
143     }
144     KMP_SNPRINTF(scan, end-scan+1, "}");
145     while (*scan != '\0') scan++;
146     KMP_ASSERT(scan <= end);
147     return buf;
148 }
149 #endif // KMP_USE_HWLOC
150 
151 
152 void
153 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
154 {
155     KMP_CPU_ZERO(mask);
156 
157 # if KMP_GROUP_AFFINITY
158 
159     if (__kmp_num_proc_groups > 1) {
160         int group;
161         KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
162         for (group = 0; group < __kmp_num_proc_groups; group++) {
163             int i;
164             int num = __kmp_GetActiveProcessorCount(group);
165             for (i = 0; i < num; i++) {
166                 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
167             }
168         }
169     }
170     else
171 
172 # endif /* KMP_GROUP_AFFINITY */
173 
174     {
175         int proc;
176         for (proc = 0; proc < __kmp_xproc; proc++) {
177             KMP_CPU_SET(proc, mask);
178         }
179     }
180 }
181 
182 //
183 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
184 // called to renumber the labels from [0..n] and place them into the child_num
185 // vector of the address object.  This is done in case the labels used for
186 // the children at one node of the hierarchy differ from those used for
187 // another node at the same level.  Example:  suppose the machine has 2 nodes
188 // with 2 packages each.  The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604.  If we try to sort the table
190 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
191 // because we are paying attention to the labels themselves, not the ordinal
192 // child numbers.  By using the child numbers in the sort, the result is
193 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
194 //
195 static void
196 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
197   int numAddrs)
198 {
199     KMP_DEBUG_ASSERT(numAddrs > 0);
200     int depth = address2os->first.depth;
201     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
202     unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
203       * sizeof(unsigned));
204     int labCt;
205     for (labCt = 0; labCt < depth; labCt++) {
206         address2os[0].first.childNums[labCt] = counts[labCt] = 0;
207         lastLabel[labCt] = address2os[0].first.labels[labCt];
208     }
209     int i;
210     for (i = 1; i < numAddrs; i++) {
211         for (labCt = 0; labCt < depth; labCt++) {
212             if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
213                 int labCt2;
214                 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
215                     counts[labCt2] = 0;
216                     lastLabel[labCt2] = address2os[i].first.labels[labCt2];
217                 }
218                 counts[labCt]++;
219                 lastLabel[labCt] = address2os[i].first.labels[labCt];
220                 break;
221             }
222         }
223         for (labCt = 0; labCt < depth; labCt++) {
224             address2os[i].first.childNums[labCt] = counts[labCt];
225         }
226         for (; labCt < (int)Address::maxDepth; labCt++) {
227             address2os[i].first.childNums[labCt] = 0;
228         }
229     }
230 }
231 
232 
233 //
234 // All of the __kmp_affinity_create_*_map() routines should set
235 // __kmp_affinity_masks to a vector of affinity mask objects of length
236 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
237 // return the number of levels in the machine topology tree (zero if
238 // __kmp_affinity_type == affinity_none).
239 //
240 // All of the __kmp_affinity_create_*_map() routines should set *__kmp_affin_fullMask
241 // to the affinity mask for the initialization thread.  They need to save and
242 // restore the mask, and it could be needed later, so saving it is just an
// optimization to avoid calling __kmp_get_system_affinity() again.
244 //
245 kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
246 
247 static int nCoresPerPkg, nPackages;
248 static int __kmp_nThreadsPerCore;
249 #ifndef KMP_DFLT_NTH_CORES
250 static int __kmp_ncores;
251 #endif
252 static int *__kmp_pu_os_idx = NULL;
253 
254 //
255 // __kmp_affinity_uniform_topology() doesn't work when called from
256 // places which support arbitrarily many levels in the machine topology
257 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// and __kmp_affinity_create_x2apicid_map().
259 //
260 inline static bool
261 __kmp_affinity_uniform_topology()
262 {
263     return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
264 }
265 
266 
267 //
268 // Print out the detailed machine topology map, i.e. the physical locations
269 // of each OS proc.
270 //
271 static void
272 __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
273   int pkgLevel, int coreLevel, int threadLevel)
274 {
275     int proc;
276 
277     KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
278     for (proc = 0; proc < len; proc++) {
279         int level;
280         kmp_str_buf_t buf;
281         __kmp_str_buf_init(&buf);
282         for (level = 0; level < depth; level++) {
283             if (level == threadLevel) {
284                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
285             }
286             else if (level == coreLevel) {
287                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
288             }
289             else if (level == pkgLevel) {
290                 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
291             }
292             else if (level > pkgLevel) {
293                 __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
294                   level - pkgLevel - 1);
295             }
296             else {
297                 __kmp_str_buf_print(&buf, "L%d ", level);
298             }
299             __kmp_str_buf_print(&buf, "%d ",
300               address2os[proc].first.labels[level]);
301         }
302         KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
303           buf.str);
304         __kmp_str_buf_free(&buf);
305     }
306 }
307 
308 #if KMP_USE_HWLOC
309 
310 // This function removes the topology levels that are radix 1 and don't offer
311 // further information about the topology.  The most common example is when you
312 // have one thread context per core, we don't want the extra thread context
313 // level if it offers no unique labels.  So they are removed.
314 // return value: the new depth of address2os
315 static int
316 __kmp_affinity_remove_radix_one_levels(AddrUnsPair *address2os, int nActiveThreads, int depth, int* pkgLevel, int* coreLevel, int* threadLevel) {
317     int level;
318     int i;
319     int radix1_detected;
320 
321     for (level = depth-1; level >= 0; --level) {
322         // Always keep the package level
323         if (level == *pkgLevel)
324             continue;
325         // Detect if this level is radix 1
326         radix1_detected = 1;
327         for (i = 1; i < nActiveThreads; ++i) {
328             if (address2os[0].first.labels[level] != address2os[i].first.labels[level]) {
329                 // There are differing label values for this level so it stays
330                 radix1_detected = 0;
331                 break;
332             }
333         }
334         if (!radix1_detected)
335             continue;
336         // Radix 1 was detected
337         if (level == *threadLevel) {
338             // If only one thread per core, then just decrement
339             // the depth which removes the threadlevel from address2os
340             for (i = 0; i < nActiveThreads; ++i) {
341                 address2os[i].first.depth--;
342             }
343             *threadLevel = -1;
344         } else if (level == *coreLevel) {
345             // For core level, we move the thread labels over if they are still
346             // valid (*threadLevel != -1), and also reduce the depth another level
347             for (i = 0; i < nActiveThreads; ++i) {
348                 if (*threadLevel != -1) {
349                     address2os[i].first.labels[*coreLevel] = address2os[i].first.labels[*threadLevel];
350                 }
351                 address2os[i].first.depth--;
352             }
353             *coreLevel = -1;
354         }
355     }
356     return address2os[0].first.depth;
357 }
358 
359 // Returns the number of objects of type 'type' below 'obj' within the topology tree structure.
360 // e.g., if obj is a HWLOC_OBJ_SOCKET object, and type is HWLOC_OBJ_PU, then
361 //  this will return the number of PU's under the SOCKET object.
362 static int
363 __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj, hwloc_obj_type_t type) {
364     int retval = 0;
365     hwloc_obj_t first;
366     for(first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type, obj->logical_index, type, 0);
367         first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) == obj;
368         first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type, first))
369     {
370         ++retval;
371     }
372     return retval;
373 }
374 
375 static int
376 __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
377   kmp_i18n_id_t *const msg_id)
378 {
379     *address2os = NULL;
380     *msg_id = kmp_i18n_null;
381 
382     //
383     // Save the affinity mask for the current thread.
384     //
385     kmp_affin_mask_t *oldMask;
386     KMP_CPU_ALLOC(oldMask);
387     __kmp_get_system_affinity(oldMask, TRUE);
388 
389     int depth = 3;
390     int pkgLevel = 0;
391     int coreLevel = 1;
392     int threadLevel = 2;
393 
394     if (! KMP_AFFINITY_CAPABLE())
395     {
396         //
397         // Hack to try and infer the machine topology using only the data
        // available from hwloc on the current thread, and __kmp_xproc.
399         //
400         KMP_ASSERT(__kmp_affinity_type == affinity_none);
401 
402         nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, 0), HWLOC_OBJ_CORE);
403         __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, 0), HWLOC_OBJ_PU);
404         __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
405         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
406         if (__kmp_affinity_verbose) {
407             KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
408             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
409             if (__kmp_affinity_uniform_topology()) {
410                 KMP_INFORM(Uniform, "KMP_AFFINITY");
411             } else {
412                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
413             }
414             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
415               __kmp_nThreadsPerCore, __kmp_ncores);
416         }
417         KMP_CPU_FREE(oldMask);
418         return 0;
419     }
420 
421     //
422     // Allocate the data structure to be returned.
423     //
424     AddrUnsPair *retval = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
425     __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
426 
427     //
428     // When affinity is off, this routine will still be called to set
429     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
430     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
431     // correctly, and return if affinity is not enabled.
432     //
433 
434     hwloc_obj_t pu;
435     hwloc_obj_t core;
436     hwloc_obj_t socket;
437     int nActiveThreads = 0;
438     int socket_identifier = 0;
439     // re-calculate globals to count only accessible resources
440     __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
441     for(socket = hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, 0);
442         socket != NULL;
443         socket = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, socket),
444         socket_identifier++)
445     {
446         int core_identifier = 0;
447         int num_active_cores = 0;
448         for(core = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, socket->type, socket->logical_index, HWLOC_OBJ_CORE, 0);
449             core != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, socket->type, core) == socket;
450             core = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, core),
451             core_identifier++)
452         {
453             int pu_identifier = 0;
454             int num_active_threads = 0;
455             for(pu = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, core->type, core->logical_index, HWLOC_OBJ_PU, 0);
456                 pu != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, core->type, pu) == core;
457                 pu = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PU, pu),
458                 pu_identifier++)
459             {
460                 Address addr(3);
461                 if(! KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
462                     continue;         // skip inactive (inaccessible) unit
463                 KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
464                     socket->os_index, socket->logical_index, core->os_index, core->logical_index, pu->os_index,pu->logical_index));
465                 addr.labels[0] = socket_identifier; // package
466                 addr.labels[1] = core_identifier; // core
467                 addr.labels[2] = pu_identifier; // pu
468                 retval[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
469                 __kmp_pu_os_idx[nActiveThreads] = pu->os_index; // keep os index for each active pu
470                 nActiveThreads++;
471                 ++num_active_threads; // count active threads per core
472             }
473             if (num_active_threads) { // were there any active threads on the core?
474                 ++__kmp_ncores;       // count total active cores
475                 ++num_active_cores;   // count active cores per socket
476                 if (num_active_threads > __kmp_nThreadsPerCore)
477                     __kmp_nThreadsPerCore = num_active_threads; // calc maximum
478             }
479         }
480         if (num_active_cores) {       // were there any active cores on the socket?
481             ++nPackages;              // count total active packages
482             if (num_active_cores > nCoresPerPkg)
483                 nCoresPerPkg = num_active_cores; // calc maximum
484         }
485     }
486 
487     //
488     // If there's only one thread context to bind to, return now.
489     //
490     KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
491     KMP_ASSERT(nActiveThreads > 0);
492     if (nActiveThreads == 1) {
493         __kmp_ncores = nPackages = 1;
494         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
495         if (__kmp_affinity_verbose) {
496             char buf[KMP_AFFIN_MASK_PRINT_LEN];
497             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
498 
499             KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
500             if (__kmp_affinity_respect_mask) {
501                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
502             } else {
503                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
504             }
505             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
506             KMP_INFORM(Uniform, "KMP_AFFINITY");
507             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
508               __kmp_nThreadsPerCore, __kmp_ncores);
509         }
510 
511         if (__kmp_affinity_type == affinity_none) {
512             __kmp_free(retval);
513             KMP_CPU_FREE(oldMask);
514             return 0;
515         }
516 
517         //
518         // Form an Address object which only includes the package level.
519         //
520         Address addr(1);
521         addr.labels[0] = retval[0].first.labels[pkgLevel];
522         retval[0].first = addr;
523 
524         if (__kmp_affinity_gran_levels < 0) {
525             __kmp_affinity_gran_levels = 0;
526         }
527 
528         if (__kmp_affinity_verbose) {
529             __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
530         }
531 
532         *address2os = retval;
533         KMP_CPU_FREE(oldMask);
534         return 1;
535     }
536 
537     //
538     // Sort the table by physical Id.
539     //
540     qsort(retval, nActiveThreads, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
541 
542     //
543     // Check to see if the machine topology is uniform
544     //
545     unsigned uniform = (nPackages * nCoresPerPkg * __kmp_nThreadsPerCore == nActiveThreads);
546 
547     //
548     // Print the machine topology summary.
549     //
550     if (__kmp_affinity_verbose) {
551         char mask[KMP_AFFIN_MASK_PRINT_LEN];
552         __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
553 
554         KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
555         if (__kmp_affinity_respect_mask) {
556             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
557         } else {
558             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
559         }
560         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
561         if (uniform) {
562             KMP_INFORM(Uniform, "KMP_AFFINITY");
563         } else {
564             KMP_INFORM(NonUniform, "KMP_AFFINITY");
565         }
566 
567         kmp_str_buf_t buf;
568         __kmp_str_buf_init(&buf);
569 
570         __kmp_str_buf_print(&buf, "%d", nPackages);
574         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
575           __kmp_nThreadsPerCore, __kmp_ncores);
576 
577         __kmp_str_buf_free(&buf);
578     }
579 
580     if (__kmp_affinity_type == affinity_none) {
581         __kmp_free(retval);
582         KMP_CPU_FREE(oldMask);
583         return 0;
584     }
585 
586     //
    // Find any levels with radix 1, and remove them from the map
588     // (except for the package level).
589     //
590     depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth, &pkgLevel, &coreLevel, &threadLevel);
591 
592     if (__kmp_affinity_gran_levels < 0) {
593         //
594         // Set the granularity level based on what levels are modeled
595         // in the machine topology map.
596         //
597         __kmp_affinity_gran_levels = 0;
598         if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
599             __kmp_affinity_gran_levels++;
600         }
601         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
602             __kmp_affinity_gran_levels++;
603         }
604         if (__kmp_affinity_gran > affinity_gran_package) {
605             __kmp_affinity_gran_levels++;
606         }
607     }
608 
609     if (__kmp_affinity_verbose) {
610         __kmp_affinity_print_topology(retval, nActiveThreads, depth, pkgLevel,
611           coreLevel, threadLevel);
612     }
613 
614     KMP_CPU_FREE(oldMask);
615     *address2os = retval;
616     return depth;
617 }
618 #endif // KMP_USE_HWLOC
619 
620 //
621 // If we don't know how to retrieve the machine's processor topology, or
622 // encounter an error in doing so, this routine is called to form a "flat"
623 // mapping of os thread id's <-> processor id's.
624 //
625 static int
626 __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
627   kmp_i18n_id_t *const msg_id)
628 {
629     *address2os = NULL;
630     *msg_id = kmp_i18n_null;
631 
632     //
633     // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ncores, as well as
635     // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
636     //
637     if (! KMP_AFFINITY_CAPABLE()) {
638         KMP_ASSERT(__kmp_affinity_type == affinity_none);
639         __kmp_ncores = nPackages = __kmp_xproc;
640         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
641         if (__kmp_affinity_verbose) {
642             KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
643             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
644             KMP_INFORM(Uniform, "KMP_AFFINITY");
645             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
646               __kmp_nThreadsPerCore, __kmp_ncores);
647         }
648         return 0;
649     }
650 
651     //
652     // When affinity is off, this routine will still be called to set
653     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
654     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
655     //  correctly, and return now if affinity is not enabled.
656     //
657     __kmp_ncores = nPackages = __kmp_avail_proc;
658     __kmp_nThreadsPerCore = nCoresPerPkg = 1;
659     if (__kmp_affinity_verbose) {
660         char buf[KMP_AFFIN_MASK_PRINT_LEN];
661         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, __kmp_affin_fullMask);
662 
663         KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
664         if (__kmp_affinity_respect_mask) {
665             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
666         } else {
667             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
668         }
669         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
670         KMP_INFORM(Uniform, "KMP_AFFINITY");
671         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
672           __kmp_nThreadsPerCore, __kmp_ncores);
673     }
674     KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
675     __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
676     if (__kmp_affinity_type == affinity_none) {
677         int avail_ct = 0;
678         unsigned int i;
679         KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
680             if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask))
681                 continue;
682             __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
683         }
684         return 0;
685     }
686 
687     //
    // Construct the data structure to be returned.
689     //
690     *address2os = (AddrUnsPair*)
691       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
692     int avail_ct = 0;
693     unsigned int i;
694     KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
695         //
696         // Skip this proc if it is not included in the machine model.
697         //
698         if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
699             continue;
700         }
701         __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
702         Address addr(1);
703         addr.labels[0] = i;
704         (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
705     }
706     if (__kmp_affinity_verbose) {
707         KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
708     }
709 
710     if (__kmp_affinity_gran_levels < 0) {
711         //
712         // Only the package level is modeled in the machine topology map,
713         // so the #levels of granularity is either 0 or 1.
714         //
715         if (__kmp_affinity_gran > affinity_gran_package) {
716             __kmp_affinity_gran_levels = 1;
717         }
718         else {
719             __kmp_affinity_gran_levels = 0;
720         }
721     }
722     return 1;
723 }
724 
725 
726 # if KMP_GROUP_AFFINITY
727 
728 //
729 // If multiple Windows* OS processor groups exist, we can create a 2-level
730 // topology map with the groups at level 0 and the individual procs at
731 // level 1.
732 //
733 // This facilitates letting the threads float among all procs in a group,
734 // if granularity=group (the default when there are multiple groups).
735 //
736 static int
737 __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
738   kmp_i18n_id_t *const msg_id)
739 {
740     *address2os = NULL;
741     *msg_id = kmp_i18n_null;
742 
743     //
744     // If we don't have multiple processor groups, return now.
745     // The flat mapping will be used.
746     //
747     if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(__kmp_affin_fullMask) >= 0)) {
748         // FIXME set *msg_id
749         return -1;
750     }
751 
752     //
    // Construct the data structure to be returned.
754     //
755     *address2os = (AddrUnsPair*)
756       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
757     KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
758     __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
759     int avail_ct = 0;
760     int i;
761     KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
762         //
763         // Skip this proc if it is not included in the machine model.
764         //
765         if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
766             continue;
767         }
768         __kmp_pu_os_idx[avail_ct] = i;  // suppose indices are flat
769         Address addr(2);
770         addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
771         addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
772         (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
773 
774         if (__kmp_affinity_verbose) {
775             KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
776               addr.labels[1]);
777         }
778     }
779 
780     if (__kmp_affinity_gran_levels < 0) {
781         if (__kmp_affinity_gran == affinity_gran_group) {
782             __kmp_affinity_gran_levels = 1;
783         }
784         else if ((__kmp_affinity_gran == affinity_gran_fine)
785           || (__kmp_affinity_gran == affinity_gran_thread)) {
786             __kmp_affinity_gran_levels = 0;
787         }
788         else {
789             const char *gran_str = NULL;
790             if (__kmp_affinity_gran == affinity_gran_core) {
791                 gran_str = "core";
792             }
793             else if (__kmp_affinity_gran == affinity_gran_package) {
794                 gran_str = "package";
795             }
796             else if (__kmp_affinity_gran == affinity_gran_node) {
797                 gran_str = "node";
798             }
799             else {
800                 KMP_ASSERT(0);
801             }
802 
            // Warning: can't use affinity granularity "gran" with group topology method, using "thread"
804             __kmp_affinity_gran_levels = 0;
805         }
806     }
807     return 2;
808 }
809 
810 # endif /* KMP_GROUP_AFFINITY */
811 
812 
813 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
814 
815 static int
816 __kmp_cpuid_mask_width(int count) {
817     int r = 0;
818 
819     while((1<<r) < count)
820         ++r;
821     return r;
822 }
823 
824 
825 class apicThreadInfo {
826 public:
827     unsigned osId;              // param to __kmp_affinity_bind_thread
828     unsigned apicId;            // from cpuid after binding
829     unsigned maxCoresPerPkg;    //      ""
830     unsigned maxThreadsPerPkg;  //      ""
831     unsigned pkgId;             // inferred from above values
832     unsigned coreId;            //      ""
833     unsigned threadId;          //      ""
834 };
835 
836 
837 static int
838 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
839 {
840     const apicThreadInfo *aa = (const apicThreadInfo *)a;
841     const apicThreadInfo *bb = (const apicThreadInfo *)b;
842     if (aa->osId < bb->osId) return -1;
843     if (aa->osId > bb->osId) return 1;
844     return 0;
845 }
846 
847 
848 static int
849 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
850 {
851     const apicThreadInfo *aa = (const apicThreadInfo *)a;
852     const apicThreadInfo *bb = (const apicThreadInfo *)b;
853     if (aa->pkgId < bb->pkgId) return -1;
854     if (aa->pkgId > bb->pkgId) return 1;
855     if (aa->coreId < bb->coreId) return -1;
856     if (aa->coreId > bb->coreId) return 1;
857     if (aa->threadId < bb->threadId) return -1;
858     if (aa->threadId > bb->threadId) return 1;
859     return 0;
860 }
861 
862 
863 //
864 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
865 // an algorithm which cycles through the available os threads, setting
866 // the current thread's affinity mask to that thread, and then retrieves
867 // the Apic Id for each thread context using the cpuid instruction.
868 //
869 static int
870 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
871   kmp_i18n_id_t *const msg_id)
872 {
873     kmp_cpuid buf;
874     int rc;
875     *address2os = NULL;
876     *msg_id = kmp_i18n_null;
877 
878     //
879     // Check if cpuid leaf 4 is supported.
880     //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 4) {
        *msg_id = kmp_i18n_str_NoLeaf4Support;
        return -1;
    }
886 
887     //
888     // The algorithm used starts by setting the affinity to each available
889     // thread and retrieving info from the cpuid instruction, so if we are
890     // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
892     // the defaults that we calculated from issuing cpuid without binding
893     // to each proc.
894     //
895     if (! KMP_AFFINITY_CAPABLE()) {
896         //
897         // Hack to try and infer the machine topology using only the data
898         // available from cpuid on the current thread, and __kmp_xproc.
899         //
900         KMP_ASSERT(__kmp_affinity_type == affinity_none);
901 
902         //
903         // Get an upper bound on the number of threads per package using
904         // cpuid(1).
905         //
        // On some OS/chip combinations where HT is supported by the chip
907         // but is disabled, this value will be 2 on a single core chip.
908         // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
909         //
910         __kmp_x86_cpuid(1, 0, &buf);
911         int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
912         if (maxThreadsPerPkg == 0) {
913             maxThreadsPerPkg = 1;
914         }
915 
916         //
917         // The num cores per pkg comes from cpuid(4).
918         // 1 must be added to the encoded value.
919         //
        // The author of cpu_count.cpp treated this as only an upper bound
921         // on the number of cores, but I haven't seen any cases where it
922         // was greater than the actual number of cores, so we will treat
923         // it as exact in this block of code.
924         //
925         // First, we need to check if cpuid(4) is supported on this chip.
926         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
927         // has the value n or greater.
928         //
929         __kmp_x86_cpuid(0, 0, &buf);
930         if (buf.eax >= 4) {
931             __kmp_x86_cpuid(4, 0, &buf);
932             nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
933         }
934         else {
935             nCoresPerPkg = 1;
936         }
937 
938         //
939         // There is no way to reliably tell if HT is enabled without issuing
        // the cpuid instruction from every thread, and correlating the cpuid
941         // info, so if the machine is not affinity capable, we assume that HT
942         // is off.  We have seen quite a few machines where maxThreadsPerPkg
943         // is 2, yet the machine does not support HT.
944         //
945         // - Older OSes are usually found on machines with older chips, which
946         //   do not support HT.
947         //
948         // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly set
        //   to 0) is greater than the penalty for mistakenly identifying
951         //   a machine as being 1 thread/core when it is really HT enabled
952         //   (which results in blocktime being incorrectly set to a positive
953         //   value).
954         //
955         __kmp_ncores = __kmp_xproc;
956         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
957         __kmp_nThreadsPerCore = 1;
958         if (__kmp_affinity_verbose) {
959             KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
960             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
961             if (__kmp_affinity_uniform_topology()) {
962                 KMP_INFORM(Uniform, "KMP_AFFINITY");
963             } else {
964                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
965             }
966             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
967               __kmp_nThreadsPerCore, __kmp_ncores);
968         }
969         return 0;
970     }
971 
972     //
973     //
974     // From here on, we can assume that it is safe to call
975     // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
976     // even if __kmp_affinity_type = affinity_none.
977     //
978 
979     //
980     // Save the affinity mask for the current thread.
981     //
982     kmp_affin_mask_t *oldMask;
983     KMP_CPU_ALLOC(oldMask);
984     KMP_ASSERT(oldMask != NULL);
985     __kmp_get_system_affinity(oldMask, TRUE);
986 
987     //
988     // Run through each of the available contexts, binding the current thread
989     // to it, and obtaining the pertinent information using the cpuid instr.
990     //
991     // The relevant information is:
992     //
993     // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //    has a unique Apic Id, which is of the form pkg# : core# : thread#.
995     //
996     // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
997     //    value of this field determines the width of the core# + thread#
998     //    fields in the Apic Id.  It is also an upper bound on the number
999     //    of threads per package, but it has been verified that situations
    //    happen where it is not exact.  In particular, on certain OS/chip
1001     //    combinations where Intel(R) Hyper-Threading Technology is supported
1002     //    by the chip but has
1003     //    been disabled, the value of this field will be 2 (for a single core
1004     //    chip).  On other OS/chip combinations supporting
1005     //    Intel(R) Hyper-Threading Technology, the value of
1006     //    this field will be 1 when Intel(R) Hyper-Threading Technology is
1007     //    disabled and 2 when it is enabled.
1008     //
1009     // Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4).  The
1010     //    value of this field (+1) determines the width of the core# field in
1011     //    the Apic Id.  The comments in "cpucount.cpp" say that this value is
1012     //    an upper bound, but the IA-32 architecture manual says that it is
1013     //    exactly the number of cores per package, and I haven't seen any
1014     //    case where it wasn't.
1015     //
1016     // From this information, deduce the package Id, core Id, and thread Id,
1017     // and set the corresponding fields in the apicThreadInfo struct.
1018     //
1019     unsigned i;
1020     apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
1021       __kmp_avail_proc * sizeof(apicThreadInfo));
1022     unsigned nApics = 0;
1023     KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
1024         //
1025         // Skip this proc if it is not included in the machine model.
1026         //
1027         if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
1028             continue;
1029         }
1030         KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
1031 
1032         __kmp_affinity_bind_thread(i);
1033         threadInfo[nApics].osId = i;
1034 
1035         //
1036         // The apic id and max threads per pkg come from cpuid(1).
1037         //
1038         __kmp_x86_cpuid(1, 0, &buf);
        if (!((buf.edx >> 9) & 1)) {
1040             __kmp_set_system_affinity(oldMask, TRUE);
1041             __kmp_free(threadInfo);
1042             KMP_CPU_FREE(oldMask);
1043             *msg_id = kmp_i18n_str_ApicNotPresent;
1044             return -1;
1045         }
1046         threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
1047         threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
1048         if (threadInfo[nApics].maxThreadsPerPkg == 0) {
1049             threadInfo[nApics].maxThreadsPerPkg = 1;
1050         }
1051 
1052         //
1053         // Max cores per pkg comes from cpuid(4).
1054         // 1 must be added to the encoded value.
1055         //
1056         // First, we need to check if cpuid(4) is supported on this chip.
1057         // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
1058         // has the value n or greater.
1059         //
1060         __kmp_x86_cpuid(0, 0, &buf);
1061         if (buf.eax >= 4) {
1062             __kmp_x86_cpuid(4, 0, &buf);
1063             threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1064         }
1065         else {
1066             threadInfo[nApics].maxCoresPerPkg = 1;
1067         }
1068 
1069         //
1070         // Infer the pkgId / coreId / threadId using only the info
1071         // obtained locally.
1072         //
1073         int widthCT = __kmp_cpuid_mask_width(
1074           threadInfo[nApics].maxThreadsPerPkg);
1075         threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
1076 
1077         int widthC = __kmp_cpuid_mask_width(
1078           threadInfo[nApics].maxCoresPerPkg);
1079         int widthT = widthCT - widthC;
1080         if (widthT < 0) {
1081             //
1082             // I've never seen this one happen, but I suppose it could, if
1083             // the cpuid instruction on a chip was really screwed up.
1084             // Make sure to restore the affinity mask before the tail call.
1085             //
1086             __kmp_set_system_affinity(oldMask, TRUE);
1087             __kmp_free(threadInfo);
1088             KMP_CPU_FREE(oldMask);
1089             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1090             return -1;
1091         }
1092 
1093         int maskC = (1 << widthC) - 1;
        threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
          & maskC;

        int maskT = (1 << widthT) - 1;
        threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;
1099 
1100         nApics++;
1101     }
1102 
1103     //
1104     // We've collected all the info we need.
1105     // Restore the old affinity mask for this thread.
1106     //
1107     __kmp_set_system_affinity(oldMask, TRUE);
1108 
1109     //
1110     // If there's only one thread context to bind to, form an Address object
1111     // with depth 1 and return immediately (or, if affinity is off, set
1112     // address2os to NULL and return).
1113     //
1114     // If it is configured to omit the package level when there is only a
1115     // single package, the logic at the end of this routine won't work if
1116     // there is only a single thread - it would try to form an Address
1117     // object with depth 0.
1118     //
1119     KMP_ASSERT(nApics > 0);
1120     if (nApics == 1) {
1121         __kmp_ncores = nPackages = 1;
1122         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1123         if (__kmp_affinity_verbose) {
1124             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1125             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1126 
1127             KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1128             if (__kmp_affinity_respect_mask) {
1129                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1130             } else {
1131                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1132             }
1133             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1134             KMP_INFORM(Uniform, "KMP_AFFINITY");
1135             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1136               __kmp_nThreadsPerCore, __kmp_ncores);
1137         }
1138 
1139         if (__kmp_affinity_type == affinity_none) {
1140             __kmp_free(threadInfo);
1141             KMP_CPU_FREE(oldMask);
1142             return 0;
1143         }
1144 
1145         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
1146         Address addr(1);
1147         addr.labels[0] = threadInfo[0].pkgId;
1148         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
1149 
1150         if (__kmp_affinity_gran_levels < 0) {
1151             __kmp_affinity_gran_levels = 0;
1152         }
1153 
1154         if (__kmp_affinity_verbose) {
1155             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1156         }
1157 
1158         __kmp_free(threadInfo);
1159         KMP_CPU_FREE(oldMask);
1160         return 1;
1161     }
1162 
1163     //
1164     // Sort the threadInfo table by physical Id.
1165     //
1166     qsort(threadInfo, nApics, sizeof(*threadInfo),
1167       __kmp_affinity_cmp_apicThreadInfo_phys_id);
1168 
1169     //
1170     // The table is now sorted by pkgId / coreId / threadId, but we really
1171     // don't know the radix of any of the fields.  pkgId's may be sparsely
1172     // assigned among the chips on a system.  Although coreId's are usually
1173     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1174     // [0..threadsPerCore-1], we don't want to make any such assumptions.
1175     //
1176     // For that matter, we don't know what coresPerPkg and threadsPerCore
1177     // (or the total # packages) are at this point - we want to determine
1178     // that now.  We only have an upper bound on the first two figures.
1179     //
1180     // We also perform a consistency check at this point: the values returned
1181     // by the cpuid instruction for any thread bound to a given package had
1182     // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1183     //
1184     nPackages = 1;
1185     nCoresPerPkg = 1;
1186     __kmp_nThreadsPerCore = 1;
1187     unsigned nCores = 1;
1188 
1189     unsigned pkgCt = 1;                         // to determine radii
1190     unsigned lastPkgId = threadInfo[0].pkgId;
1191     unsigned coreCt = 1;
1192     unsigned lastCoreId = threadInfo[0].coreId;
1193     unsigned threadCt = 1;
1194     unsigned lastThreadId = threadInfo[0].threadId;
1195 
                                                // intra-pkg consistency checks
1197     unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1198     unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1199 
1200     for (i = 1; i < nApics; i++) {
1201         if (threadInfo[i].pkgId != lastPkgId) {
1202             nCores++;
1203             pkgCt++;
1204             lastPkgId = threadInfo[i].pkgId;
1205             if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1206             coreCt = 1;
1207             lastCoreId = threadInfo[i].coreId;
1208             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1209             threadCt = 1;
1210             lastThreadId = threadInfo[i].threadId;
1211 
1212             //
1213             // This is a different package, so go on to the next iteration
1214             // without doing any consistency checks.  Reset the consistency
1215             // check vars, though.
1216             //
1217             prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1218             prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1219             continue;
1220         }
1221 
1222         if (threadInfo[i].coreId != lastCoreId) {
1223             nCores++;
1224             coreCt++;
1225             lastCoreId = threadInfo[i].coreId;
1226             if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1227             threadCt = 1;
1228             lastThreadId = threadInfo[i].threadId;
1229         }
1230         else if (threadInfo[i].threadId != lastThreadId) {
1231             threadCt++;
1232             lastThreadId = threadInfo[i].threadId;
1233         }
1234         else {
1235             __kmp_free(threadInfo);
1236             KMP_CPU_FREE(oldMask);
1237             *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1238             return -1;
1239         }
1240 
1241         //
1242         // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
    // fields agree between all the threads bound to a given package.
1244         //
1245         if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
1246           || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1247             __kmp_free(threadInfo);
1248             KMP_CPU_FREE(oldMask);
1249             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1250             return -1;
1251         }
1252     }
1253     nPackages = pkgCt;
1254     if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1255     if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1256 
1257     //
1258     // When affinity is off, this routine will still be called to set
1259     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1260     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1261     // correctly, and return now if affinity is not enabled.
1262     //
1263     __kmp_ncores = nCores;
1264     if (__kmp_affinity_verbose) {
1265         char buf[KMP_AFFIN_MASK_PRINT_LEN];
1266         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1267 
1268         KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1269         if (__kmp_affinity_respect_mask) {
1270             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1271         } else {
1272             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1273         }
1274         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1275         if (__kmp_affinity_uniform_topology()) {
1276             KMP_INFORM(Uniform, "KMP_AFFINITY");
1277         } else {
1278             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1279         }
1280         KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1281           __kmp_nThreadsPerCore, __kmp_ncores);
1282 
1283     }
1284     KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
1285     KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
1286     __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
1287     for (i = 0; i < nApics; ++i) {
1288         __kmp_pu_os_idx[i] = threadInfo[i].osId;
1289     }
1290     if (__kmp_affinity_type == affinity_none) {
1291         __kmp_free(threadInfo);
1292         KMP_CPU_FREE(oldMask);
1293         return 0;
1294     }
1295 
1296     //
1297     // Now that we've determined the number of packages, the number of cores
1298     // per package, and the number of threads per core, we can construct the
1299     // data structure that is to be returned.
1300     //
1301     int pkgLevel = 0;
1302     int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1303     int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1304     unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
1305 
1306     KMP_ASSERT(depth > 0);
1307     *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1308 
1309     for (i = 0; i < nApics; ++i) {
1310         Address addr(depth);
1311         unsigned os = threadInfo[i].osId;
1312         int d = 0;
1313 
1314         if (pkgLevel >= 0) {
1315             addr.labels[d++] = threadInfo[i].pkgId;
1316         }
1317         if (coreLevel >= 0) {
1318             addr.labels[d++] = threadInfo[i].coreId;
1319         }
1320         if (threadLevel >= 0) {
1321             addr.labels[d++] = threadInfo[i].threadId;
1322         }
1323         (*address2os)[i] = AddrUnsPair(addr, os);
1324     }
1325 
1326     if (__kmp_affinity_gran_levels < 0) {
1327         //
1328         // Set the granularity level based on what levels are modeled
1329         // in the machine topology map.
1330         //
1331         __kmp_affinity_gran_levels = 0;
1332         if ((threadLevel >= 0)
1333           && (__kmp_affinity_gran > affinity_gran_thread)) {
1334             __kmp_affinity_gran_levels++;
1335         }
1336         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1337             __kmp_affinity_gran_levels++;
1338         }
1339         if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1340             __kmp_affinity_gran_levels++;
1341         }
1342     }
1343 
1344     if (__kmp_affinity_verbose) {
1345         __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1346           coreLevel, threadLevel);
1347     }
1348 
1349     __kmp_free(threadInfo);
1350     KMP_CPU_FREE(oldMask);
1351     return depth;
1352 }
1353 
1354 
1355 //
1356 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1357 // architectures support a newer interface for specifying the x2APIC Ids,
1358 // based on cpuid leaf 11.
1359 //
1360 static int
1361 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1362   kmp_i18n_id_t *const msg_id)
1363 {
1364     kmp_cpuid buf;
1365 
1366     *address2os = NULL;
1367     *msg_id = kmp_i18n_null;
1368 
1369     //
1370     // Check to see if cpuid leaf 11 is supported.
1371     //
1372     __kmp_x86_cpuid(0, 0, &buf);
1373     if (buf.eax < 11) {
1374         *msg_id = kmp_i18n_str_NoLeaf11Support;
1375         return -1;
1376     }
1377     __kmp_x86_cpuid(11, 0, &buf);
1378     if (buf.ebx == 0) {
1379         *msg_id = kmp_i18n_str_NoLeaf11Support;
1380         return -1;
1381     }
1382 
1383     //
1384     // Find the number of levels in the machine topology.  While we're at it,
1385     // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.  We will
1386     // try to get more accurate values later by explicitly counting them,
1387     // but get reasonable defaults now, in case we return early.
1388     //
1389     int level;
1390     int threadLevel = -1;
1391     int coreLevel = -1;
1392     int pkgLevel = -1;
1393     __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1394 
1395     for (level = 0;; level++) {
1396         if (level > 31) {
1397             //
1398             // FIXME: Hack for DPD200163180
1399             //
1400             // If level is big then something went wrong -> exiting
1401             //
1402             // There could actually be 32 valid levels in the machine topology,
1403             // but so far, the only machine we have seen which does not exit
1404             // this loop before iteration 32 has fubar x2APIC settings.
1405             //
1406             // For now, just reject this case based upon loop trip count.
1407             //
1408             *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1409             return -1;
1410         }
1411         __kmp_x86_cpuid(11, level, &buf);
1412         if (buf.ebx == 0) {
1413             if (pkgLevel < 0) {
1414                 //
1415                 // Will infer nPackages from __kmp_xproc
1416                 //
1417                 pkgLevel = level;
1418                 level++;
1419             }
1420             break;
1421         }
1422         int kind = (buf.ecx >> 8) & 0xff;
1423         if (kind == 1) {
1424             //
1425             // SMT level
1426             //
1427             threadLevel = level;
1428             coreLevel = -1;
1429             pkgLevel = -1;
1430             __kmp_nThreadsPerCore = buf.ebx & 0xff;
1431             if (__kmp_nThreadsPerCore == 0) {
1432                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1433                 return -1;
1434             }
1435         }
1436         else if (kind == 2) {
1437             //
1438             // core level
1439             //
1440             coreLevel = level;
1441             pkgLevel = -1;
1442             nCoresPerPkg = buf.ebx & 0xff;
1443             if (nCoresPerPkg == 0) {
1444                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1445                 return -1;
1446             }
1447         }
1448         else {
1449             if (level <= 0) {
1450                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1451                 return -1;
1452             }
1453             if (pkgLevel >= 0) {
1454                 continue;
1455             }
1456             pkgLevel = level;
1457             nPackages = buf.ebx & 0xff;
1458             if (nPackages == 0) {
1459                 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1460                 return -1;
1461             }
1462         }
1463     }
1464     int depth = level;
1465 
    //
    // In the above loop, "level" was counted from the finest level (usually
    // thread) to the coarsest.  The caller expects that we will place the
    // labels in (*address2os)[].first.labels[] in the reverse order, so
    // we need to invert the variables that record which level means what.
    //
1472     if (threadLevel >= 0) {
1473         threadLevel = depth - threadLevel - 1;
1474     }
1475     if (coreLevel >= 0) {
1476         coreLevel = depth - coreLevel - 1;
1477     }
1478     KMP_DEBUG_ASSERT(pkgLevel >= 0);
1479     pkgLevel = depth - pkgLevel - 1;
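    // For example, if the loop above found threadLevel == 0, coreLevel == 1
    // and pkgLevel == 2 with depth == 3, then after the inversion we have
    // pkgLevel == 0, coreLevel == 1 and threadLevel == 2, so labels[0]
    // holds the (coarsest) package id.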
1480 
    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
1489     if (! KMP_AFFINITY_CAPABLE())
1490     {
        //
        // Hack to try to infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
1495         KMP_ASSERT(__kmp_affinity_type == affinity_none);
1496 
1497         __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1498         nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1499         if (__kmp_affinity_verbose) {
1500             KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1501             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1502             if (__kmp_affinity_uniform_topology()) {
1503                 KMP_INFORM(Uniform, "KMP_AFFINITY");
1504             } else {
1505                 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1506             }
1507             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1508               __kmp_nThreadsPerCore, __kmp_ncores);
1509         }
1510         return 0;
1511     }
1512 
    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //
1519 
1520     //
1521     // Save the affinity mask for the current thread.
1522     //
1523     kmp_affin_mask_t *oldMask;
1524     KMP_CPU_ALLOC(oldMask);
1525     __kmp_get_system_affinity(oldMask, TRUE);
1526 
1527     //
1528     // Allocate the data structure to be returned.
1529     //
1530     AddrUnsPair *retval = (AddrUnsPair *)
1531       __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1532 
1533     //
1534     // Run through each of the available contexts, binding the current thread
1535     // to it, and obtaining the pertinent information using the cpuid instr.
1536     //
1537     unsigned int proc;
1538     int nApics = 0;
1539     KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
1540         //
1541         // Skip this proc if it is not included in the machine model.
1542         //
1543         if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
1544             continue;
1545         }
1546         KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1547 
1548         __kmp_affinity_bind_thread(proc);
1549 
        //
        // Extract the labels for each level in the machine topology map
        // from the APIC ID.
        //
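        // Illustrative example (hypothetical values): suppose depth == 3,
        // and cpuid reported shift == 1 at the SMT subleaf and shift == 4
        // at the core subleaf.  Then for apicId == 37 (0b100101):
        //   thread label = 37 & 0x1        == 1  -> labels[2]
        //   core   label = (37 & 0xf) >> 1 == 2  -> labels[1]
        //   pkg    label = 37 >> 4         == 2  -> labels[0]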
1554         Address addr(depth);
1555         int prev_shift = 0;
1556 
1557         for (level = 0; level < depth; level++) {
1558             __kmp_x86_cpuid(11, level, &buf);
1559             unsigned apicId = buf.edx;
1560             if (buf.ebx == 0) {
1561                 if (level != depth - 1) {
1562                     KMP_CPU_FREE(oldMask);
1563                     *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1564                     return -1;
1565                 }
1566                 addr.labels[depth - level - 1] = apicId >> prev_shift;
1567                 level++;
1568                 break;
1569             }
1570             int shift = buf.eax & 0x1f;
1571             int mask = (1 << shift) - 1;
1572             addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1573             prev_shift = shift;
1574         }
1575         if (level != depth) {
1576             KMP_CPU_FREE(oldMask);
1577             *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1578             return -1;
1579         }
1580 
1581         retval[nApics] = AddrUnsPair(addr, proc);
1582         nApics++;
1583     }
1584 
1585     //
1586     // We've collected all the info we need.
1587     // Restore the old affinity mask for this thread.
1588     //
1589     __kmp_set_system_affinity(oldMask, TRUE);
1590 
1591     //
1592     // If there's only one thread context to bind to, return now.
1593     //
1594     KMP_ASSERT(nApics > 0);
1595     if (nApics == 1) {
1596         __kmp_ncores = nPackages = 1;
1597         __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1598         if (__kmp_affinity_verbose) {
1599             char buf[KMP_AFFIN_MASK_PRINT_LEN];
1600             __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1601 
1602             KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1603             if (__kmp_affinity_respect_mask) {
1604                 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1605             } else {
1606                 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1607             }
1608             KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1609             KMP_INFORM(Uniform, "KMP_AFFINITY");
1610             KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1611               __kmp_nThreadsPerCore, __kmp_ncores);
1612         }
1613 
1614         if (__kmp_affinity_type == affinity_none) {
1615             __kmp_free(retval);
1616             KMP_CPU_FREE(oldMask);
1617             return 0;
1618         }
1619 
1620         //
1621         // Form an Address object which only includes the package level.
1622         //
1623         Address addr(1);
1624         addr.labels[0] = retval[0].first.labels[pkgLevel];
1625         retval[0].first = addr;
1626 
1627         if (__kmp_affinity_gran_levels < 0) {
1628             __kmp_affinity_gran_levels = 0;
1629         }
1630 
1631         if (__kmp_affinity_verbose) {
1632             __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1633         }
1634 
1635         *address2os = retval;
1636         KMP_CPU_FREE(oldMask);
1637         return 1;
1638     }
1639 
1640     //
1641     // Sort the table by physical Id.
1642     //
1643     qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1644 
1645     //
1646     // Find the radix at each of the levels.
1647     //
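    // counts[level] tracks the number of distinct labels seen at "level"
    // under the current parent, maxCt[level] the largest such count (the
    // radix of that level), totals[level] the total number of nodes at that
    // level, and last[level] the label seen in the previous entry.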
1648     unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1649     unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1650     unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1651     unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1652     for (level = 0; level < depth; level++) {
1653         totals[level] = 1;
1654         maxCt[level] = 1;
1655         counts[level] = 1;
1656         last[level] = retval[0].first.labels[level];
1657     }
1658 
1659     //
1660     // From here on, the iteration variable "level" runs from the finest
1661     // level to the coarsest, i.e. we iterate forward through
1662     // (*address2os)[].first.labels[] - in the previous loops, we iterated
1663     // backwards.
1664     //
1665     for (proc = 1; (int)proc < nApics; proc++) {
1666         int level;
1667         for (level = 0; level < depth; level++) {
1668             if (retval[proc].first.labels[level] != last[level]) {
1669                 int j;
1670                 for (j = level + 1; j < depth; j++) {
1671                     totals[j]++;
1672                     counts[j] = 1;
                    // The commented-out line below caused incorrect topology
                    // information to be printed when the maximum value for
                    // some level (maxCt[level]) was encountered earlier in
                    // the array than a smaller value.  For example, if pkg0
                    // has 4 cores and pkg1 has 2 cores, maxCt[1] would end
                    // up as 2, whereas it must be 4.
                    // TODO!!! Check if it can be removed safely.
                    //maxCt[j] = 1;
1680                     last[j] = retval[proc].first.labels[j];
1681                 }
1682                 totals[level]++;
1683                 counts[level]++;
1684                 if (counts[level] > maxCt[level]) {
1685                     maxCt[level] = counts[level];
1686                 }
1687                 last[level] = retval[proc].first.labels[level];
1688                 break;
1689             }
1690             else if (level == depth - 1) {
1691                 __kmp_free(last);
1692                 __kmp_free(maxCt);
1693                 __kmp_free(counts);
1694                 __kmp_free(totals);
1695                 __kmp_free(retval);
1696                 KMP_CPU_FREE(oldMask);
1697                 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1698                 return -1;
1699             }
1700         }
1701     }
1702 
1703     //
1704     // When affinity is off, this routine will still be called to set
1705     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1706     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
1707     // correctly, and return if affinity is not enabled.
1708     //
1709     if (threadLevel >= 0) {
1710         __kmp_nThreadsPerCore = maxCt[threadLevel];
1711     }
1712     else {
1713         __kmp_nThreadsPerCore = 1;
1714     }
1715     nPackages = totals[pkgLevel];
1716 
1717     if (coreLevel >= 0) {
1718         __kmp_ncores = totals[coreLevel];
1719         nCoresPerPkg = maxCt[coreLevel];
1720     }
1721     else {
1722         __kmp_ncores = nPackages;
1723         nCoresPerPkg = 1;
1724     }
1725 
1726     //
1727     // Check to see if the machine topology is uniform
1728     //
1729     unsigned prod = maxCt[0];
1730     for (level = 1; level < depth; level++) {
1731        prod *= maxCt[level];
1732     }
1733     bool uniform = (prod == totals[level - 1]);
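    // After the loop, level == depth, so totals[level - 1] is the total
    // number of thread contexts found.  For example, 2 pkgs x 4 cores x
    // 2 threads gives prod == 16 == totals[depth - 1], hence uniform; if
    // pkg1 had only 2 cores, prod would still be 16 but only 12 contexts
    // would exist, hence non-uniform.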
1734 
1735     //
1736     // Print the machine topology summary.
1737     //
1738     if (__kmp_affinity_verbose) {
1739         char mask[KMP_AFFIN_MASK_PRINT_LEN];
1740         __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1741 
1742         KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1743         if (__kmp_affinity_respect_mask) {
1744             KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1745         } else {
1746             KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1747         }
1748         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1749         if (uniform) {
1750             KMP_INFORM(Uniform, "KMP_AFFINITY");
1751         } else {
1752             KMP_INFORM(NonUniform, "KMP_AFFINITY");
1753         }
1754 
1755         kmp_str_buf_t buf;
1756         __kmp_str_buf_init(&buf);
1757 
1758         __kmp_str_buf_print(&buf, "%d", totals[0]);
1759         for (level = 1; level <= pkgLevel; level++) {
1760             __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1761         }
1762         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1763           __kmp_nThreadsPerCore, __kmp_ncores);
1764 
1765         __kmp_str_buf_free(&buf);
1766     }
1767     KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
1768     KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
1769     __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
1770     for (proc = 0; (int)proc < nApics; ++proc) {
1771         __kmp_pu_os_idx[proc] = retval[proc].second;
1772     }
1773     if (__kmp_affinity_type == affinity_none) {
1774         __kmp_free(last);
1775         __kmp_free(maxCt);
1776         __kmp_free(counts);
1777         __kmp_free(totals);
1778         __kmp_free(retval);
1779         KMP_CPU_FREE(oldMask);
1780         return 0;
1781     }
1782 
    //
    // Find any levels with radix 1, and remove them from the map
    // (except for the package level).
    //
1787     int new_depth = 0;
1788     for (level = 0; level < depth; level++) {
1789         if ((maxCt[level] == 1) && (level != pkgLevel)) {
1790            continue;
1791         }
1792         new_depth++;
1793     }
1794 
1795     //
1796     // If we are removing any levels, allocate a new vector to return,
1797     // and copy the relevant information to it.
1798     //
1799     if (new_depth != depth) {
1800         AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1801           sizeof(AddrUnsPair) * nApics);
1802         for (proc = 0; (int)proc < nApics; proc++) {
1803             Address addr(new_depth);
1804             new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1805         }
1806         int new_level = 0;
1807         int newPkgLevel = -1;
1808         int newCoreLevel = -1;
1809         int newThreadLevel = -1;
1810         int i;
1811         for (level = 0; level < depth; level++) {
1812             if ((maxCt[level] == 1)
1813               && (level != pkgLevel)) {
1814                 //
1815                 // Remove this level. Never remove the package level
1816                 //
1817                 continue;
1818             }
            if (level == pkgLevel) {
                newPkgLevel = new_level;
            }
            if (level == coreLevel) {
                newCoreLevel = new_level;
            }
            if (level == threadLevel) {
                newThreadLevel = new_level;
            }
1828             for (proc = 0; (int)proc < nApics; proc++) {
1829                 new_retval[proc].first.labels[new_level]
1830                   = retval[proc].first.labels[level];
1831             }
1832             new_level++;
1833         }
1834 
1835         __kmp_free(retval);
1836         retval = new_retval;
1837         depth = new_depth;
1838         pkgLevel = newPkgLevel;
1839         coreLevel = newCoreLevel;
1840         threadLevel = newThreadLevel;
1841     }
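    // For example, on a single-socket machine with hyper-threading disabled,
    // the thread level has maxCt == 1 and is removed here, reducing the
    // depth from 3 to 2 (package and core levels only).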
1842 
1843     if (__kmp_affinity_gran_levels < 0) {
1844         //
1845         // Set the granularity level based on what levels are modeled
1846         // in the machine topology map.
1847         //
1848         __kmp_affinity_gran_levels = 0;
1849         if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1850             __kmp_affinity_gran_levels++;
1851         }
1852         if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1853             __kmp_affinity_gran_levels++;
1854         }
1855         if (__kmp_affinity_gran > affinity_gran_package) {
1856             __kmp_affinity_gran_levels++;
1857         }
1858     }
1859 
1860     if (__kmp_affinity_verbose) {
1861         __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1862           coreLevel, threadLevel);
1863     }
1864 
1865     __kmp_free(last);
1866     __kmp_free(maxCt);
1867     __kmp_free(counts);
1868     __kmp_free(totals);
1869     KMP_CPU_FREE(oldMask);
1870     *address2os = retval;
1871     return depth;
1872 }
1873 
1874 
1875 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1876 
1877 
1878 #define osIdIndex       0
1879 #define threadIdIndex   1
1880 #define coreIdIndex     2
1881 #define pkgIdIndex      3
1882 #define nodeIdIndex     4
1883 
1884 typedef unsigned *ProcCpuInfo;
1885 static unsigned maxIndex = pkgIdIndex;
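// Illustrative mapping (hypothetical record) of one cpuinfo entry into a
// row of the threadInfo table parsed below:
//
//     processor   : 3      ->  row[osIdIndex]   == 3
//     physical id : 0      ->  row[pkgIdIndex]  == 0
//     core id     : 1      ->  row[coreIdIndex] == 1
//
// "thread id" and "node_<n> id" fields are also accepted, e.g. from an
// alternate input file (KMP_CPUINFO_FILE), even though stock Linux
// /proc/cpuinfo does not emit them.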
1886 
1887 
1888 static int
1889 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1890 {
1891     const unsigned *aa = (const unsigned *)a;
1892     const unsigned *bb = (const unsigned *)b;
1893     if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1894     if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1895     return 0;
}
1897 
1898 
1899 static int
1900 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1901 {
1902     unsigned i;
1903     const unsigned *aa = *((const unsigned **)a);
1904     const unsigned *bb = *((const unsigned **)b);
1905     for (i = maxIndex; ; i--) {
1906         if (aa[i] < bb[i]) return -1;
1907         if (aa[i] > bb[i]) return 1;
1908         if (i == osIdIndex) break;
1909     }
1910     return 0;
1911 }
1912 
1913 
1914 //
1915 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1916 // affinity map.
1917 //
1918 static int
1919 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1920   kmp_i18n_id_t *const msg_id, FILE *f)
1921 {
1922     *address2os = NULL;
1923     *msg_id = kmp_i18n_null;
1924 
    //
    // Scan the file once, counting the number of "processor" (osId) fields
    // and finding the highest value of <n> for any node_<n> field.
    //
1929     char buf[256];
1930     unsigned num_records = 0;
1931     while (! feof(f)) {
1932         buf[sizeof(buf) - 1] = 1;
1933         if (! fgets(buf, sizeof(buf), f)) {
            //
            // Read error, presumably because of EOF.
            //
1937             break;
1938         }
1939 
1940         char s1[] = "processor";
1941         if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1942             num_records++;
1943             continue;
1944         }
1945 
1946         //
1947         // FIXME - this will match "node_<n> <garbage>"
1948         //
1949         unsigned level;
        if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
1951             if (nodeIdIndex + level >= maxIndex) {
1952                 maxIndex = nodeIdIndex + level;
1953             }
1954             continue;
1955         }
1956     }
1957 
1958     //
1959     // Check for empty file / no valid processor records, or too many.
1960     // The number of records can't exceed the number of valid bits in the
1961     // affinity mask.
1962     //
1963     if (num_records == 0) {
1964         *line = 0;
1965         *msg_id = kmp_i18n_str_NoProcRecords;
1966         return -1;
1967     }
1968     if (num_records > (unsigned)__kmp_xproc) {
1969         *line = 0;
1970         *msg_id = kmp_i18n_str_TooManyProcRecords;
1971         return -1;
1972     }
1973 
    //
    // Set the file pointer back to the beginning, so that we can scan the
    // file again, this time performing a full parse of the data.
    // Allocate a vector of ProcCpuInfo objects, where we will place the
    // data.  Adding an extra element at the end allows us to remove a lot
    // of extra checks for termination conditions.
    //
1981     if (fseek(f, 0, SEEK_SET) != 0) {
1982         *line = 0;
1983         *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1984         return -1;
1985     }
1986 
1987     //
1988     // Allocate the array of records to store the proc info in.  The dummy
1989     // element at the end makes the logic in filling them out easier to code.
1990     //
1991     unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1992       * sizeof(unsigned *));
1993     unsigned i;
1994     for (i = 0; i <= num_records; i++) {
1995         threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1996           * sizeof(unsigned));
1997     }
1998 
1999 #define CLEANUP_THREAD_INFO \
2000     for (i = 0; i <= num_records; i++) {                                \
2001         __kmp_free(threadInfo[i]);                                      \
2002     }                                                                   \
2003     __kmp_free(threadInfo);
2004 
2005     //
2006     // A value of UINT_MAX means that we didn't find the field
2007     //
2008     unsigned __index;
2009 
2010 #define INIT_PROC_INFO(p) \
2011     for (__index = 0; __index <= maxIndex; __index++) {                 \
2012         (p)[__index] = UINT_MAX;                                        \
2013     }
2014 
2015     for (i = 0; i <= num_records; i++) {
2016         INIT_PROC_INFO(threadInfo[i]);
2017     }
2018 
2019     unsigned num_avail = 0;
2020     *line = 0;
2021     while (! feof(f)) {
2022         //
2023         // Create an inner scoping level, so that all the goto targets at the
2024         // end of the loop appear in an outer scoping level.  This avoids
2025         // warnings about jumping past an initialization to a target in the
2026         // same block.
2027         //
2028         {
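            // Sentinel byte: fgets() NUL-terminates what it reads, so if a
            // line fills the buffer completely, this byte is overwritten
            // with 0 and the truncation is detected below.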
2029             buf[sizeof(buf) - 1] = 1;
2030             bool long_line = false;
2031             if (! fgets(buf, sizeof(buf), f)) {
                //
                // Read error, presumably because of EOF.
                //
                // If there is valid data in threadInfo[num_avail], then fake
                // a blank line to ensure that the last address gets parsed.
                //
2038                 bool valid = false;
2039                 for (i = 0; i <= maxIndex; i++) {
2040                     if (threadInfo[num_avail][i] != UINT_MAX) {
2041                         valid = true;
2042                     }
2043                 }
2044                 if (! valid) {
2045                     break;
2046                 }
2047                 buf[0] = 0;
2048             } else if (!buf[sizeof(buf) - 1]) {
2049                 //
2050                 // The line is longer than the buffer.  Set a flag and don't
2051                 // emit an error if we were going to ignore the line, anyway.
2052                 //
2053                 long_line = true;
2054 
2055 #define CHECK_LINE \
2056     if (long_line) {                                                    \
2057         CLEANUP_THREAD_INFO;                                            \
2058         *msg_id = kmp_i18n_str_LongLineCpuinfo;                         \
2059         return -1;                                                      \
2060     }
2061             }
2062             (*line)++;
2063 
2064             char s1[] = "processor";
2065             if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2066                 CHECK_LINE;
2067                 char *p = strchr(buf + sizeof(s1) - 1, ':');
2068                 unsigned val;
2069                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2070                 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
2071                 threadInfo[num_avail][osIdIndex] = val;
2072 #if KMP_OS_LINUX && USE_SYSFS_INFO
2073                 char path[256];
2074                 KMP_SNPRINTF(path, sizeof(path),
2075                     "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
2076                     threadInfo[num_avail][osIdIndex]);
2077                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
2078 
2079                 KMP_SNPRINTF(path, sizeof(path),
2080                     "/sys/devices/system/cpu/cpu%u/topology/core_id",
2081                     threadInfo[num_avail][osIdIndex]);
2082                 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
2083                 continue;
2084 #else
2085             }
2086             char s2[] = "physical id";
2087             if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
2088                 CHECK_LINE;
2089                 char *p = strchr(buf + sizeof(s2) - 1, ':');
2090                 unsigned val;
2091                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2092                 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
2093                 threadInfo[num_avail][pkgIdIndex] = val;
2094                 continue;
2095             }
2096             char s3[] = "core id";
2097             if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2098                 CHECK_LINE;
2099                 char *p = strchr(buf + sizeof(s3) - 1, ':');
2100                 unsigned val;
2101                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2102                 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
2103                 threadInfo[num_avail][coreIdIndex] = val;
2104                 continue;
2105 #endif // KMP_OS_LINUX && USE_SYSFS_INFO
2106             }
2107             char s4[] = "thread id";
2108             if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2109                 CHECK_LINE;
2110                 char *p = strchr(buf + sizeof(s4) - 1, ':');
2111                 unsigned val;
2112                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2113                 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
2114                 threadInfo[num_avail][threadIdIndex] = val;
2115                 continue;
2116             }
2117             unsigned level;
            if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
2119                 CHECK_LINE;
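                // Reusing sizeof(s4) ("thread id") as the search offset is
                // safe here: "node_<n> id" is at least as long, so strchr()
                // starts at or before the ':' separator.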
2120                 char *p = strchr(buf + sizeof(s4) - 1, ':');
2121                 unsigned val;
2122                 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2123                 KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2124                 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
2125                 threadInfo[num_avail][nodeIdIndex + level] = val;
2126                 continue;
2127             }
2128 
2129             //
2130             // We didn't recognize the leading token on the line.
2131             // There are lots of leading tokens that we don't recognize -
2132             // if the line isn't empty, go on to the next line.
2133             //
2134             if ((*buf != 0) && (*buf != '\n')) {
2135                 //
2136                 // If the line is longer than the buffer, read characters
2137                 // until we find a newline.
2138                 //
2139                 if (long_line) {
2140                     int ch;
2141                     while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
2142                 }
2143                 continue;
2144             }
2145 
2146             //
2147             // A newline has signalled the end of the processor record.
2148             // Check that there aren't too many procs specified.
2149             //
2150             if ((int)num_avail == __kmp_xproc) {
2151                 CLEANUP_THREAD_INFO;
2152                 *msg_id = kmp_i18n_str_TooManyEntries;
2153                 return -1;
2154             }
2155 
2156             //
2157             // Check for missing fields.  The osId field must be there, and we
2158             // currently require that the physical id field is specified, also.
2159             //
2160             if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2161                 CLEANUP_THREAD_INFO;
2162                 *msg_id = kmp_i18n_str_MissingProcField;
2163                 return -1;
2164             }
2165             if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2166                 CLEANUP_THREAD_INFO;
2167                 *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2168                 return -1;
2169             }
2170 
2171             //
2172             // Skip this proc if it is not included in the machine model.
2173             //
2174             if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], __kmp_affin_fullMask)) {
2175                 INIT_PROC_INFO(threadInfo[num_avail]);
2176                 continue;
2177             }
2178 
2179             //
2180             // We have a successful parse of this proc's info.
2181             // Increment the counter, and prepare for the next proc.
2182             //
2183             num_avail++;
2184             KMP_ASSERT(num_avail <= num_records);
2185             INIT_PROC_INFO(threadInfo[num_avail]);
2186         }
2187         continue;
2188 
2189         no_val:
2190         CLEANUP_THREAD_INFO;
2191         *msg_id = kmp_i18n_str_MissingValCpuinfo;
2192         return -1;
2193 
2194         dup_field:
2195         CLEANUP_THREAD_INFO;
2196         *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2197         return -1;
2198     }
2199     *line = 0;
2200 
2201 # if KMP_MIC && REDUCE_TEAM_SIZE
2202     unsigned teamSize = 0;
2203 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2204 
2205     // check for num_records == __kmp_xproc ???
2206 
2207     //
2208     // If there's only one thread context to bind to, form an Address object
2209     // with depth 1 and return immediately (or, if affinity is off, set
2210     // address2os to NULL and return).
2211     //
    // If we were configured to omit the package level even when there is
    // only a single package, the logic at the end of this routine would
    // not work when there is only a single thread - it would try to form
    // an Address object with depth 0.
2216     //
2217     KMP_ASSERT(num_avail > 0);
2218     KMP_ASSERT(num_avail <= num_records);
2219     if (num_avail == 1) {
2220         __kmp_ncores = 1;
2221         __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2222         if (__kmp_affinity_verbose) {
2223             if (! KMP_AFFINITY_CAPABLE()) {
2224                 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2225                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2226                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2227             }
2228             else {
2229                 char buf[KMP_AFFIN_MASK_PRINT_LEN];
2230                 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2231                   __kmp_affin_fullMask);
2232                 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2233                 if (__kmp_affinity_respect_mask) {
2234                     KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2235                 } else {
2236                     KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2237                 }
2238                 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2239                 KMP_INFORM(Uniform, "KMP_AFFINITY");
2240             }
2241             int index;
2242             kmp_str_buf_t buf;
2243             __kmp_str_buf_init(&buf);
2244             __kmp_str_buf_print(&buf, "1");
2245             for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2246                 __kmp_str_buf_print(&buf, " x 1");
2247             }
2248             KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2249             __kmp_str_buf_free(&buf);
2250         }
2251 
2252         if (__kmp_affinity_type == affinity_none) {
2253             CLEANUP_THREAD_INFO;
2254             return 0;
2255         }
2256 
2257         *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2258         Address addr(1);
2259         addr.labels[0] = threadInfo[0][pkgIdIndex];
2260         (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2261 
2262         if (__kmp_affinity_gran_levels < 0) {
2263             __kmp_affinity_gran_levels = 0;
2264         }
2265 
2266         if (__kmp_affinity_verbose) {
2267             __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2268         }
2269 
2270         CLEANUP_THREAD_INFO;
2271         return 1;
2272     }
2273 
2274     //
2275     // Sort the threadInfo table by physical Id.
2276     //
2277     qsort(threadInfo, num_avail, sizeof(*threadInfo),
2278       __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2279 
2280     //
2281     // The table is now sorted by pkgId / coreId / threadId, but we really
2282     // don't know the radix of any of the fields.  pkgId's may be sparsely
2283     // assigned among the chips on a system.  Although coreId's are usually
2284     // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2285     // [0..threadsPerCore-1], we don't want to make any such assumptions.
2286     //
2287     // For that matter, we don't know what coresPerPkg and threadsPerCore
2288     // (or the total # packages) are at this point - we want to determine
2289     // that now.  We only have an upper bound on the first two figures.
2290     //
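    // As the sorted table is scanned below, counts[index] is the number of
    // distinct ids seen at level "index" under the current parent node,
    // maxCt[index] is the largest such count (the radix of that level),
    // totals[index] is the total number of nodes at that level, and
    // lastId[index] is the id seen in the previous record.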
2291     unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2292       * sizeof(unsigned));
2293     unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2294       * sizeof(unsigned));
2295     unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2296       * sizeof(unsigned));
2297     unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2298       * sizeof(unsigned));
2299 
2300     bool assign_thread_ids = false;
2301     unsigned threadIdCt;
2302     unsigned index;
2303 
2304     restart_radix_check:
2305     threadIdCt = 0;
2306 
2307     //
2308     // Initialize the counter arrays with data from threadInfo[0].
2309     //
2310     if (assign_thread_ids) {
2311         if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2312             threadInfo[0][threadIdIndex] = threadIdCt++;
2313         }
2314         else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2315             threadIdCt = threadInfo[0][threadIdIndex] + 1;
2316         }
2317     }
2318     for (index = 0; index <= maxIndex; index++) {
2319         counts[index] = 1;
2320         maxCt[index] = 1;
2321         totals[index] = 1;
        lastId[index] = threadInfo[0][index];
2323     }
2324 
2325     //
2326     // Run through the rest of the OS procs.
2327     //
2328     for (i = 1; i < num_avail; i++) {
2329         //
2330         // Find the most significant index whose id differs
2331         // from the id for the previous OS proc.
2332         //
2333         for (index = maxIndex; index >= threadIdIndex; index--) {
2334             if (assign_thread_ids && (index == threadIdIndex)) {
2335                 //
2336                 // Auto-assign the thread id field if it wasn't specified.
2337                 //
2338                 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2339                     threadInfo[i][threadIdIndex] = threadIdCt++;
2340                 }
2341 
                //
                // Apparently the thread id field was specified for some
                // entries and not others.  Start the thread id counter
                // off at the next higher thread id.
                //
2347                 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2348                     threadIdCt = threadInfo[i][threadIdIndex] + 1;
2349                 }
2350             }
2351             if (threadInfo[i][index] != lastId[index]) {
2352                 //
2353                 // Run through all indices which are less significant,
2354                 // and reset the counts to 1.
2355                 //
2356                 // At all levels up to and including index, we need to
2357                 // increment the totals and record the last id.
2358                 //
2359                 unsigned index2;
2360                 for (index2 = threadIdIndex; index2 < index; index2++) {
2361                     totals[index2]++;
2362                     if (counts[index2] > maxCt[index2]) {
2363                         maxCt[index2] = counts[index2];
2364                     }
2365                     counts[index2] = 1;
2366                     lastId[index2] = threadInfo[i][index2];
2367                 }
2368                 counts[index]++;
2369                 totals[index]++;
2370                 lastId[index] = threadInfo[i][index];
2371 
2372                 if (assign_thread_ids && (index > threadIdIndex)) {
2373 
2374 # if KMP_MIC && REDUCE_TEAM_SIZE
2375                     //
2376                     // The default team size is the total #threads in the machine
2377                     // minus 1 thread for every core that has 3 or more threads.
2378                     //
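                    // e.g. a core with 4 threads contributes 3 to teamSize,
                    // and a core with 2 threads contributes 2.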
2379                     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2380 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2381 
2382                     //
2383                     // Restart the thread counter, as we are on a new core.
2384                     //
2385                     threadIdCt = 0;
2386 
2387                     //
2388                     // Auto-assign the thread id field if it wasn't specified.
2389                     //
2390                     if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2391                         threadInfo[i][threadIdIndex] = threadIdCt++;
2392                     }
2393 
                    //
                    // Apparently the thread id field was specified for some
                    // entries and not others.  Start the thread id counter
                    // off at the next higher thread id.
                    //
2399                     else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2400                         threadIdCt = threadInfo[i][threadIdIndex] + 1;
2401                     }
2402                 }
2403                 break;
2404             }
2405         }
2406         if (index < threadIdIndex) {
            //
            // If thread ids were specified, it is an error if they are not
            // unique.  Also, check that we haven't already restarted the
            // loop (to be safe - shouldn't need to).
            //
2412             if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2413               || assign_thread_ids) {
2414                 __kmp_free(lastId);
2415                 __kmp_free(totals);
2416                 __kmp_free(maxCt);
2417                 __kmp_free(counts);
2418                 CLEANUP_THREAD_INFO;
2419                 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2420                 return -1;
2421             }
2422 
            //
            // If the thread ids were not specified and we see entries
            // that are duplicates, start the loop over and assign the
            // thread ids manually.
            //
2428             assign_thread_ids = true;
2429             goto restart_radix_check;
2430         }
2431     }
2432 
2433 # if KMP_MIC && REDUCE_TEAM_SIZE
2434     //
2435     // The default team size is the total #threads in the machine
2436     // minus 1 thread for every core that has 3 or more threads.
2437     //
2438     teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2439 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2440 
2441     for (index = threadIdIndex; index <= maxIndex; index++) {
2442         if (counts[index] > maxCt[index]) {
2443             maxCt[index] = counts[index];
2444         }
2445     }
2446 
2447     __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2448     nCoresPerPkg = maxCt[coreIdIndex];
2449     nPackages = totals[pkgIdIndex];
2450 
2451     //
2452     // Check to see if the machine topology is uniform
2453     //
2454     unsigned prod = totals[maxIndex];
2455     for (index = threadIdIndex; index < maxIndex; index++) {
2456        prod *= maxCt[index];
2457     }
2458     bool uniform = (prod == totals[threadIdIndex]);
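    // As in the x2apicid code above: prod is the thread count the machine
    // would have if every node at each level had the radix (maxCt) number
    // of children, and totals[threadIdIndex] is the actual count, e.g.
    // 2 pkgs x 4 cores x 2 threads gives prod == 16 == totals[threadIdIndex]
    // on a uniform machine.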
2459 
2460     //
2461     // When affinity is off, this routine will still be called to set
2462     // __kmp_ncores, as well as __kmp_nThreadsPerCore,
2463     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
2464     // correctly, and return now if affinity is not enabled.
2465     //
2466     __kmp_ncores = totals[coreIdIndex];
2467 
    if (__kmp_affinity_verbose) {
        if (! KMP_AFFINITY_CAPABLE()) {
            KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (uniform) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
        }
        else {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
              __kmp_affin_fullMask);
            KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (uniform) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
        }
2494         kmp_str_buf_t buf;
2495         __kmp_str_buf_init(&buf);
2496 
2497         __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2498         for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2499             __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2500         }
        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
          maxCt[threadIdIndex], __kmp_ncores);
2503 
2504         __kmp_str_buf_free(&buf);
2505     }
2506 
2507 # if KMP_MIC && REDUCE_TEAM_SIZE
2508     //
2509     // Set the default team size.
2510     //
2511     if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2512         __kmp_dflt_team_nth = teamSize;
2513         KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2514           __kmp_dflt_team_nth));
2515     }
2516 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2517 
2518     KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
2519     KMP_DEBUG_ASSERT(num_avail == __kmp_avail_proc);
2520     __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
2521     for (i = 0; i < num_avail; ++i) { // fill the os indices
2522         __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex];
2523     }
2524 
2525     if (__kmp_affinity_type == affinity_none) {
2526         __kmp_free(lastId);
2527         __kmp_free(totals);
2528         __kmp_free(maxCt);
2529         __kmp_free(counts);
2530         CLEANUP_THREAD_INFO;
2531         return 0;
2532     }
2533 
    //
    // Count the number of levels which have more nodes at that level than
    // at the parent's level (with an implicit root node above the top
    // level).  This is equivalent to saying that there is at least one
    // node at this level which has a sibling.  These levels are in the
    // map, and the package level is always in the map.
    //
2541     bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2542     int level = 0;
2543     for (index = threadIdIndex; index < maxIndex; index++) {
2544         KMP_ASSERT(totals[index] >= totals[index + 1]);
2545         inMap[index] = (totals[index] > totals[index + 1]);
2546     }
2547     inMap[maxIndex] = (totals[maxIndex] > 1);
2548     inMap[pkgIdIndex] = true;
2549 
2550     int depth = 0;
2551     for (index = threadIdIndex; index <= maxIndex; index++) {
2552         if (inMap[index]) {
2553             depth++;
2554         }
2555     }
2556     KMP_ASSERT(depth > 0);
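    // Example: a single package with 4 cores and 1 thread per core gives
    // totals[threadIdIndex] == 4, totals[coreIdIndex] == 4 and
    // totals[pkgIdIndex] == 1, so the thread level is dropped (4 > 4 is
    // false), the core level is kept (4 > 1), and the package level is
    // kept unconditionally: depth == 2.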
2557 
2558     //
2559     // Construct the data structure that is to be returned.
2560     //
2561     *address2os = (AddrUnsPair*)
2562       __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2563     int pkgLevel = -1;
2564     int coreLevel = -1;
2565     int threadLevel = -1;
2566 
2567     for (i = 0; i < num_avail; ++i) {
2568         Address addr(depth);
2569         unsigned os = threadInfo[i][osIdIndex];
2570         int src_index;
2571         int dst_index = 0;
2572 
2573         for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2574             if (! inMap[src_index]) {
2575                 continue;
2576             }
2577             addr.labels[dst_index] = threadInfo[i][src_index];
2578             if (src_index == pkgIdIndex) {
2579                 pkgLevel = dst_index;
2580             }
2581             else if (src_index == coreIdIndex) {
2582                 coreLevel = dst_index;
2583             }
2584             else if (src_index == threadIdIndex) {
2585                 threadLevel = dst_index;
2586             }
2587             dst_index++;
2588         }
2589         (*address2os)[i] = AddrUnsPair(addr, os);
2590     }
2591 
2592     if (__kmp_affinity_gran_levels < 0) {
2593         //
2594         // Set the granularity level based on what levels are modeled
2595         // in the machine topology map.
2596         //
2597         unsigned src_index;
2598         __kmp_affinity_gran_levels = 0;
2599         for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2600             if (! inMap[src_index]) {
2601                 continue;
2602             }
2603             switch (src_index) {
2604                 case threadIdIndex:
2605                 if (__kmp_affinity_gran > affinity_gran_thread) {
2606                     __kmp_affinity_gran_levels++;
2607                 }
2608 
2609                 break;
2610                 case coreIdIndex:
2611                 if (__kmp_affinity_gran > affinity_gran_core) {
2612                     __kmp_affinity_gran_levels++;
2613                 }
2614                 break;
2615 
2616                 case pkgIdIndex:
2617                 if (__kmp_affinity_gran > affinity_gran_package) {
2618                     __kmp_affinity_gran_levels++;
2619                 }
2620                 break;
2621             }
2622         }
2623     }
2624 
2625     if (__kmp_affinity_verbose) {
2626         __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2627           coreLevel, threadLevel);
2628     }
2629 
2630     __kmp_free(inMap);
2631     __kmp_free(lastId);
2632     __kmp_free(totals);
2633     __kmp_free(maxCt);
2634     __kmp_free(counts);
2635     CLEANUP_THREAD_INFO;
2636     return depth;
2637 }
2638 
2639 
2640 //
2641 // Create and return a table of affinity masks, indexed by OS thread ID.
2642 // This routine handles OR'ing together all the affinity masks of threads
2643 // that are sufficiently close, if granularity > fine.
2644 //
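// For example, with granularity=core (__kmp_affinity_gran_levels == 1 when
// a thread level is modeled), all OS procs that share package and core
// labels receive identical masks containing all of their OS ids; with
// granularity=fine or granularity=thread (gran_levels == 0), each mask
// contains a single bit.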
2645 static kmp_affin_mask_t *
2646 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2647   AddrUnsPair *address2os, unsigned numAddrs)
2648 {
2649     //
2650     // First form a table of affinity masks in order of OS thread id.
2651     //
2652     unsigned depth;
2653     unsigned maxOsId;
2654     unsigned i;
2655 
2656     KMP_ASSERT(numAddrs > 0);
2657     depth = address2os[0].first.depth;
2658 
2659     maxOsId = 0;
2660     for (i = 0; i < numAddrs; i++) {
2661         unsigned osId = address2os[i].second;
2662         if (osId > maxOsId) {
2663             maxOsId = osId;
2664         }
2665     }
2666     kmp_affin_mask_t *osId2Mask;
2667     KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId+1));
2668 
2669     //
2670     // Sort the address2os table according to physical order.  Doing so
2671     // will put all threads on the same core/package/node in consecutive
2672     // locations.
2673     //
2674     qsort(address2os, numAddrs, sizeof(*address2os),
2675       __kmp_affinity_cmp_Address_labels);
2676 
2677     KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2678     if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2679         KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY",  __kmp_affinity_gran_levels);
2680     }
2681     if (__kmp_affinity_gran_levels >= (int)depth) {
2682         if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2683           && (__kmp_affinity_type != affinity_none))) {
2684             KMP_WARNING(AffThreadsMayMigrate);
2685         }
2686     }
2687 
2688     //
2689     // Run through the table, forming the masks for all threads on each
2690     // core.  Threads on the same core will have identical "Address"
2691     // objects, not considering the last level, which must be the thread
2692     // id.  All threads on a core will appear consecutively.
2693     //
2694     unsigned unique = 0;
2695     unsigned j = 0;                             // index of 1st thread on core
2696     unsigned leader = 0;
2697     Address *leaderAddr = &(address2os[0].first);
2698     kmp_affin_mask_t *sum;
2699     KMP_CPU_ALLOC_ON_STACK(sum);
2700     KMP_CPU_ZERO(sum);
2701     KMP_CPU_SET(address2os[0].second, sum);
2702     for (i = 1; i < numAddrs; i++) {
2703         //
2704         // If this thread is sufficiently close to the leader (within the
2705         // granularity setting), then set the bit for this os thread in the
2706         // affinity mask for this group, and go on to the next thread.
2707         //
2708         if (leaderAddr->isClose(address2os[i].first,
2709           __kmp_affinity_gran_levels)) {
2710             KMP_CPU_SET(address2os[i].second, sum);
2711             continue;
2712         }
2713 
2714         //
2715         // For every thread in this group, copy the mask to the thread's
2716         // entry in the osId2Mask table.  Mark the first address as a
2717         // leader.
2718         //
2719         for (; j < i; j++) {
2720             unsigned osId = address2os[j].second;
2721             KMP_DEBUG_ASSERT(osId <= maxOsId);
2722             kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2723             KMP_CPU_COPY(mask, sum);
2724             address2os[j].first.leader = (j == leader);
2725         }
2726         unique++;
2727 
2728         //
2729         // Start a new mask.
2730         //
2731         leader = i;
2732         leaderAddr = &(address2os[i].first);
2733         KMP_CPU_ZERO(sum);
2734         KMP_CPU_SET(address2os[i].second, sum);
2735     }
2736 
2737     //
2738     // For every thread in last group, copy the mask to the thread's
2739     // entry in the osId2Mask table.
2740     //
2741     for (; j < i; j++) {
2742         unsigned osId = address2os[j].second;
2743         KMP_DEBUG_ASSERT(osId <= maxOsId);
2744         kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2745         KMP_CPU_COPY(mask, sum);
2746         address2os[j].first.leader = (j == leader);
2747     }
2748     unique++;
2749     KMP_CPU_FREE_FROM_STACK(sum);
2750 
2751     *maxIndex = maxOsId;
2752     *numUnique = unique;
2753     return osId2Mask;
2754 }
2755 
2756 
//
// Stuff for the affinity proclist parsers.  It's easier to declare these
// vars as file-static than to try to pass them through the calling sequence
// of the recursive-descent OMP_PLACES parser.
//
2762 static kmp_affin_mask_t *newMasks;
2763 static int numNewMasks;
2764 static int nextNewMask;
2765 
2766 #define ADD_MASK(_mask) \
2767     {                                                                   \
2768         if (nextNewMask >= numNewMasks) {                               \
2769             int i;                                                      \
2770             numNewMasks *= 2;                                           \
2771             kmp_affin_mask_t* temp;                                     \
2772             KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks);            \
            for (i = 0; i < numNewMasks / 2; i++) {                     \
2774                 kmp_affin_mask_t* src  = KMP_CPU_INDEX(newMasks, i);    \
2775                 kmp_affin_mask_t* dest = KMP_CPU_INDEX(temp, i);        \
2776                 KMP_CPU_COPY(dest, src);                                \
2777             }                                                           \
2778             KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks/2);       \
2779             newMasks = temp;                                            \
2780         }                                                               \
2781         KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));    \
2782         nextNewMask++;                                                  \
2783     }
2784 
2785 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2786     {                                                                   \
2787         if (((_osId) > _maxOsId) ||                                     \
2788           (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
2789             if (__kmp_affinity_verbose || (__kmp_affinity_warnings      \
2790               && (__kmp_affinity_type != affinity_none))) {             \
2791                 KMP_WARNING(AffIgnoreInvalidProcID, _osId);             \
2792             }                                                           \
2793         }                                                               \
2794         else {                                                          \
2795             ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));               \
2796         }                                                               \
2797     }
2798 
2799 
2800 //
2801 // Re-parse the proclist (for the explicit affinity type), and form the list
2802 // of affinity newMasks indexed by gtid.
2803 //
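// The explicit proclist grammar accepted here is, illustratively, a
// comma-separated sequence of single OS proc ids, ranges ("3-5", with an
// optional stride suffix parsed further below), and brace-enclosed sets
// whose members are OR'd into a single mask.  For instance, the proclist
// "1,3-5,{0,2}" yields the masks {1}, {3}, {4}, {5} and {0,2}.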
2804 static void
2805 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2806   unsigned int *out_numMasks, const char *proclist,
2807   kmp_affin_mask_t *osId2Mask, int maxOsId)
2808 {
2809     int i;
2810     const char *scan = proclist;
2811     const char *next = proclist;
2812 
    //
    // The temporary vector of masks, newMasks, starts small and is grown
    // by doubling as needed (see ADD_MASK above).
    //
2817     numNewMasks = 2;
2818     KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
2819     nextNewMask = 0;
2820     kmp_affin_mask_t *sumMask;
2821     KMP_CPU_ALLOC(sumMask);
2822     int setSize = 0;
2823 
2824     for (;;) {
2825         int start, end, stride;
2826 
2827         SKIP_WS(scan);
2828         next = scan;
2829         if (*next == '\0') {
2830             break;
2831         }
2832 
2833         if (*next == '{') {
2834             int num;
2835             setSize = 0;
2836             next++;     // skip '{'
2837             SKIP_WS(next);
2838             scan = next;
2839 
2840             //
2841             // Read the first integer in the set.
2842             //
            KMP_ASSERT2((*next >= '0') && (*next <= '9'),
              "bad explicit proc list");
2845             SKIP_DIGITS(next);
2846             num = __kmp_str_to_int(scan, *next);
2847             KMP_ASSERT2(num >= 0, "bad explicit proc list");
2848 
2849             //
2850             // Copy the mask for that osId to the sum (union) mask.
2851             //
2852             if ((num > maxOsId) ||
2853               (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2854                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2855                   && (__kmp_affinity_type != affinity_none))) {
2856                     KMP_WARNING(AffIgnoreInvalidProcID, num);
2857                 }
2858                 KMP_CPU_ZERO(sumMask);
2859             }
2860             else {
2861                 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2862                 setSize = 1;
2863             }
2864 
2865             for (;;) {
2866                 //
2867                 // Check for end of set.
2868                 //
2869                 SKIP_WS(next);
2870                 if (*next == '}') {
2871                     next++;     // skip '}'
2872                     break;
2873                 }
2874 
2875                 //
2876                 // Skip optional comma.
2877                 //
2878                 if (*next == ',') {
2879                     next++;
2880                 }
2881                 SKIP_WS(next);
2882 
2883                 //
2884                 // Read the next integer in the set.
2885                 //
2886                 scan = next;
2887                 KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2888                   "bad explicit proc list");
2889 
2890                 SKIP_DIGITS(next);
2891                 num = __kmp_str_to_int(scan, *next);
2892                 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2893 
2894                 //
2895                 // Add the mask for that osId to the sum mask.
2896                 //
2897                 if ((num > maxOsId) ||
2898                   (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2899                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2900                       && (__kmp_affinity_type != affinity_none))) {
2901                         KMP_WARNING(AffIgnoreInvalidProcID, num);
2902                     }
2903                 }
2904                 else {
2905                     KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2906                     setSize++;
2907                 }
2908             }
2909             if (setSize > 0) {
2910                 ADD_MASK(sumMask);
2911             }
2912 
2913             SKIP_WS(next);
2914             if (*next == ',') {
2915                 next++;
2916             }
2917             scan = next;
2918             continue;
2919         }
2920 
2921         //
2922         // Read the first integer.
2923         //
2924         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2925         SKIP_DIGITS(next);
2926         start = __kmp_str_to_int(scan, *next);
2927         KMP_ASSERT2(start >= 0, "bad explicit proc list");
2928         SKIP_WS(next);
2929 
2930         //
2931         // If this isn't a range, then add a mask to the list and go on.
2932         //
2933         if (*next != '-') {
2934             ADD_MASK_OSID(start, osId2Mask, maxOsId);
2935 
2936             //
2937             // Skip optional comma.
2938             //
2939             if (*next == ',') {
2940                 next++;
2941             }
2942             scan = next;
2943             continue;
2944         }
2945 
2946         //
2947         // This is a range.  Skip over the '-' and read in the 2nd int.
2948         //
2949         next++;         // skip '-'
2950         SKIP_WS(next);
2951         scan = next;
2952         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2953         SKIP_DIGITS(next);
2954         end = __kmp_str_to_int(scan, *next);
2955         KMP_ASSERT2(end >= 0, "bad explicit proc list");
2956 
2957         //
2958         // Check for a stride parameter
2959         //
2960         stride = 1;
2961         SKIP_WS(next);
2962         if (*next == ':') {
2963             //
            // A stride is specified.  Skip over the ':' and read the 3rd int.
2965             //
2966             int sign = +1;
2967             next++;         // skip ':'
2968             SKIP_WS(next);
2969             scan = next;
2970             if (*next == '-') {
2971                 sign = -1;
2972                 next++;
2973                 SKIP_WS(next);
2974                 scan = next;
2975             }
2976             KMP_ASSERT2((*next >=  '0') && (*next <= '9'),
2977               "bad explicit proc list");
2978             SKIP_DIGITS(next);
2979             stride = __kmp_str_to_int(scan, *next);
2980             KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2981             stride *= sign;
2982         }
2983 
2984         //
2985         // Do some range checks.
2986         //
2987         KMP_ASSERT2(stride != 0, "bad explicit proc list");
2988         if (stride > 0) {
2989             KMP_ASSERT2(start <= end, "bad explicit proc list");
2990         }
2991         else {
2992             KMP_ASSERT2(start >= end, "bad explicit proc list");
2993         }
2994         KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2995 
2996         //
2997         // Add the mask for each OS proc # to the list.
2998         //
2999         if (stride > 0) {
3000             do {
3001                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
3002                 start += stride;
3003             } while (start <= end);
3004         }
3005         else {
3006             do {
3007                 ADD_MASK_OSID(start, osId2Mask, maxOsId);
3008                 start += stride;
3009             } while (start >= end);
3010         }
3011 
3012         //
3013         // Skip optional comma.
3014         //
3015         SKIP_WS(next);
3016         if (*next == ',') {
3017             next++;
3018         }
3019         scan = next;
3020     }
3021 
3022     *out_numMasks = nextNewMask;
    if (nextNewMask == 0) {
        *out_masks = NULL;
        KMP_CPU_FREE(sumMask);
        KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
        return;
    }
3028     KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3029     for(i = 0; i < nextNewMask; i++) {
3030         kmp_affin_mask_t* src  = KMP_CPU_INDEX(newMasks, i);
3031         kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i);
3032         KMP_CPU_COPY(dest, src);
3033     }
3034     KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3035     KMP_CPU_FREE(sumMask);
3036 }
3037 
3038 
3039 # if OMP_40_ENABLED
3040 
3041 /*-----------------------------------------------------------------------------
3042 
3043 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
places.  Again, here is the grammar:
3045 
3046 place_list := place
3047 place_list := place , place_list
3048 place := num
3049 place := place : num
3050 place := place : num : signed
place := { subplace_list }
3052 place := ! place                  // (lowest priority)
3053 subplace_list := subplace
3054 subplace_list := subplace , subplace_list
3055 subplace := num
3056 subplace := num : num
3057 subplace := num : num : signed
3058 signed := num
3059 signed := + signed
3060 signed := - signed
3061 
3062 -----------------------------------------------------------------------------*/
3063 
3064 static void
3065 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
3066   int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3067 {
3068     const char *next;
3069 
3070     for (;;) {
3071         int start, count, stride, i;
3072 
3073         //
3074         // Read in the starting proc id
3075         //
3076         SKIP_WS(*scan);
3077         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3078           "bad explicit places list");
3079         next = *scan;
3080         SKIP_DIGITS(next);
3081         start = __kmp_str_to_int(*scan, *next);
3082         KMP_ASSERT(start >= 0);
3083         *scan = next;
3084 
3085         //
3086         // valid follow sets are ',' ':' and '}'
3087         //
3088         SKIP_WS(*scan);
3089         if (**scan == '}' || **scan == ',') {
3090             if ((start > maxOsId) ||
3091               (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3092                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3093                   && (__kmp_affinity_type != affinity_none))) {
3094                     KMP_WARNING(AffIgnoreInvalidProcID, start);
3095                 }
3096             }
3097             else {
3098                 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3099                 (*setSize)++;
3100             }
3101             if (**scan == '}') {
3102                 break;
3103             }
3104             (*scan)++;  // skip ','
3105             continue;
3106         }
3107         KMP_ASSERT2(**scan == ':', "bad explicit places list");
3108         (*scan)++;      // skip ':'
3109 
3110         //
3111         // Read count parameter
3112         //
3113         SKIP_WS(*scan);
3114         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3115           "bad explicit places list");
3116         next = *scan;
3117         SKIP_DIGITS(next);
3118         count = __kmp_str_to_int(*scan, *next);
3119         KMP_ASSERT(count >= 0);
3120         *scan = next;
3121 
3122         //
3123         // valid follow sets are ',' ':' and '}'
3124         //
3125         SKIP_WS(*scan);
3126         if (**scan == '}' || **scan == ',') {
3127             for (i = 0; i < count; i++) {
3128                 if ((start > maxOsId) ||
3129                   (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3130                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3131                       && (__kmp_affinity_type != affinity_none))) {
3132                         KMP_WARNING(AffIgnoreInvalidProcID, start);
3133                     }
3134                     break;  // don't proliferate warnings for large count
3135                 }
3136                 else {
3137                     KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3138                     start++;
3139                     (*setSize)++;
3140                 }
3141             }
3142             if (**scan == '}') {
3143                 break;
3144             }
3145             (*scan)++;  // skip ','
3146             continue;
3147         }
3148         KMP_ASSERT2(**scan == ':', "bad explicit places list");
3149         (*scan)++;      // skip ':'
3150 
3151         //
3152         // Read stride parameter
3153         //
3154         int sign = +1;
3155         for (;;) {
3156             SKIP_WS(*scan);
3157             if (**scan == '+') {
3158                 (*scan)++; // skip '+'
3159                 continue;
3160             }
3161             if (**scan == '-') {
3162                 sign *= -1;
3163                 (*scan)++; // skip '-'
3164                 continue;
3165             }
3166             break;
3167         }
3168         SKIP_WS(*scan);
3169         KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3170           "bad explicit places list");
3171         next = *scan;
3172         SKIP_DIGITS(next);
3173         stride = __kmp_str_to_int(*scan, *next);
3174         KMP_ASSERT(stride >= 0);
3175         *scan = next;
3176         stride *= sign;
3177 
3178         //
3179         // valid follow sets are ',' and '}'
3180         //
3181         SKIP_WS(*scan);
3182         if (**scan == '}' || **scan == ',') {
3183             for (i = 0; i < count; i++) {
3184                 if ((start > maxOsId) ||
3185                   (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3186                     if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3187                       && (__kmp_affinity_type != affinity_none))) {
3188                         KMP_WARNING(AffIgnoreInvalidProcID, start);
3189                     }
3190                     break;  // don't proliferate warnings for large count
3191                 }
3192                 else {
3193                     KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3194                     start += stride;
3195                     (*setSize)++;
3196                 }
3197             }
3198             if (**scan == '}') {
3199                 break;
3200             }
3201             (*scan)++;  // skip ','
3202             continue;
3203         }
3204 
3205         KMP_ASSERT2(0, "bad explicit places list");
3206     }
3207 }
3208 
3209 
3210 static void
3211 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3212   int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3213 {
3214     const char *next;
3215 
3216     //
3217     // valid follow sets are '{' '!' and num
3218     //
3219     SKIP_WS(*scan);
3220     if (**scan == '{') {
3221         (*scan)++;      // skip '{'
3222         __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
3223           setSize);
3224         KMP_ASSERT2(**scan == '}', "bad explicit places list");
3225         (*scan)++;      // skip '}'
3226     }
3227     else if (**scan == '!') {
3228         (*scan)++;      // skip '!'
3229         __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3230         KMP_CPU_COMPLEMENT(maxOsId, tempMask);
3231     }
3232     else if ((**scan >= '0') && (**scan <= '9')) {
3233         next = *scan;
3234         SKIP_DIGITS(next);
3235         int num = __kmp_str_to_int(*scan, *next);
3236         KMP_ASSERT(num >= 0);
3237         if ((num > maxOsId) ||
3238           (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3239             if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3240               && (__kmp_affinity_type != affinity_none))) {
3241                 KMP_WARNING(AffIgnoreInvalidProcID, num);
3242             }
3243         }
3244         else {
3245             KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3246             (*setSize)++;
3247         }
3248         *scan = next;  // skip num
3249     }
3250     else {
3251         KMP_ASSERT2(0, "bad explicit places list");
3252     }
3253 }
3254 
3255 
3257 void
3258 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3259   unsigned int *out_numMasks, const char *placelist,
3260   kmp_affin_mask_t *osId2Mask, int maxOsId)
3261 {
3262     int i,j,count,stride,sign;
3263     const char *scan = placelist;
3264     const char *next = placelist;
3265 
3266     numNewMasks = 2;
3267     KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
3268     nextNewMask = 0;
3269 
3270     // tempMask is modified based on the previous or initial
3271     //   place to form the current place
3272     // previousMask contains the previous place
3273     kmp_affin_mask_t *tempMask;
3274     kmp_affin_mask_t *previousMask;
3275     KMP_CPU_ALLOC(tempMask);
3276     KMP_CPU_ZERO(tempMask);
3277     KMP_CPU_ALLOC(previousMask);
3278     KMP_CPU_ZERO(previousMask);
3279     int setSize = 0;
3280 
3281     for (;;) {
3282         __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3283 
3284         //
3285         // valid follow sets are ',' ':' and EOL
3286         //
3287         SKIP_WS(scan);
3288         if (*scan == '\0' || *scan == ',') {
3289             if (setSize > 0) {
3290                 ADD_MASK(tempMask);
3291             }
3292             KMP_CPU_ZERO(tempMask);
3293             setSize = 0;
3294             if (*scan == '\0') {
3295                 break;
3296             }
3297             scan++;     // skip ','
3298             continue;
3299         }
3300 
3301         KMP_ASSERT2(*scan == ':', "bad explicit places list");
3302         scan++;         // skip ':'
3303 
3304         //
3305         // Read count parameter
3306         //
3307         SKIP_WS(scan);
3308         KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3309           "bad explicit places list");
3310         next = scan;
3311         SKIP_DIGITS(next);
3312         count = __kmp_str_to_int(scan, *next);
3313         KMP_ASSERT(count >= 0);
3314         scan = next;
3315 
3316         //
3317         // valid follow sets are ',' ':' and EOL
3318         //
3319         SKIP_WS(scan);
3320         if (*scan == '\0' || *scan == ',') {
3321             stride = +1;
3322         }
3323         else {
3324             KMP_ASSERT2(*scan == ':', "bad explicit places list");
3325             scan++;         // skip ':'
3326 
3327             //
3328             // Read stride parameter
3329             //
3330             sign = +1;
3331             for (;;) {
3332                 SKIP_WS(scan);
3333                 if (*scan == '+') {
3334                     scan++; // skip '+'
3335                     continue;
3336                 }
3337                 if (*scan == '-') {
3338                     sign *= -1;
3339                     scan++; // skip '-'
3340                     continue;
3341                 }
3342                 break;
3343             }
3344             SKIP_WS(scan);
3345             KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3346               "bad explicit places list");
3347             next = scan;
3348             SKIP_DIGITS(next);
3349             stride = __kmp_str_to_int(scan, *next);
3350             KMP_DEBUG_ASSERT(stride >= 0);
3351             scan = next;
3352             stride *= sign;
3353         }
3354 
3355         // Add places determined by initial_place : count : stride
3356         for (i = 0; i < count; i++) {
3357             if (setSize == 0) {
3358                 break;
3359             }
3360             // Add the current place, then build the next place (tempMask) from that
3361             KMP_CPU_COPY(previousMask, tempMask);
3362             ADD_MASK(previousMask);
3363             KMP_CPU_ZERO(tempMask);
3364             setSize = 0;
3365             KMP_CPU_SET_ITERATE(j, previousMask) {
3366                 if (! KMP_CPU_ISSET(j, previousMask)) {
3367                     continue;
3368                 }
3369                 if ((j+stride > maxOsId) || (j+stride < 0) ||
3370                   (! KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
3371                   (! KMP_CPU_ISSET(j+stride, KMP_CPU_INDEX(osId2Mask, j+stride)))) {
3372                     if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3373                       && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
3374                         KMP_WARNING(AffIgnoreInvalidProcID, j+stride);
3375                     }
3376                     continue;
3377                 }
3378                 KMP_CPU_SET(j+stride, tempMask);
3379                 setSize++;
3380             }
3381         }
3382         KMP_CPU_ZERO(tempMask);
3383         setSize = 0;
3384 
3385         //
3386         // valid follow sets are ',' and EOL
3387         //
3388         SKIP_WS(scan);
3389         if (*scan == '\0') {
3390             break;
3391         }
3392         if (*scan == ',') {
3393             scan++;     // skip ','
3394             continue;
3395         }
3396 
3397         KMP_ASSERT2(0, "bad explicit places list");
3398     }
3399 
3400     *out_numMasks = nextNewMask;
    if (nextNewMask == 0) {
        *out_masks = NULL;
        KMP_CPU_FREE(tempMask);
        KMP_CPU_FREE(previousMask);
        KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
        return;
    }
3406     KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3407     KMP_CPU_FREE(tempMask);
3408     KMP_CPU_FREE(previousMask);
3409     for(i = 0; i < nextNewMask; i++) {
3410         kmp_affin_mask_t* src  = KMP_CPU_INDEX(newMasks, i);
3411         kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i);
3412         KMP_CPU_COPY(dest, src);
3413     }
3414     KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3415 }
3416 
3417 # endif /* OMP_40_ENABLED */
3418 
3419 #undef ADD_MASK
3420 #undef ADD_MASK_OSID
3421 
3422 static void
3423 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3424 {
3425     int i, j, k, n_old = 0, n_new = 0, proc_num = 0;
3426     if (__kmp_place_num_sockets == 0 &&
3427         __kmp_place_num_cores == 0 &&
3428         __kmp_place_num_threads_per_core == 0 )
3429         goto _exit;   // no topology limiting actions requested, exit
3430     if (__kmp_place_num_sockets == 0)
3431         __kmp_place_num_sockets = nPackages;    // use all available sockets
3432     if (__kmp_place_num_cores == 0)
3433         __kmp_place_num_cores = nCoresPerPkg;   // use all available cores
3434     if (__kmp_place_num_threads_per_core == 0 ||
3435         __kmp_place_num_threads_per_core > __kmp_nThreadsPerCore)
3436         __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts
3437 
3438     if ( !__kmp_affinity_uniform_topology() ) {
3439         KMP_WARNING( AffHWSubsetNonUniform );
3440         goto _exit; // don't support non-uniform topology
3441     }
    if ( depth > 3 ) {
        KMP_WARNING( AffHWSubsetNonThreeLevel );
        goto _exit; // topologies with more than three levels are not supported
    }
3445     }
3446     if (__kmp_place_socket_offset + __kmp_place_num_sockets > nPackages) {
3447         KMP_WARNING(AffHWSubsetManySockets);
3448         goto _exit;
3449     }
3450     if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
3451         KMP_WARNING( AffHWSubsetManyCores );
3452         goto _exit;
3453     }
3454 
3455     AddrUnsPair *newAddr;
3456     if (pAddr) // pAddr is NULL in case of affinity_none
3457         newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3458             __kmp_place_num_sockets * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3459 
3460     for (i = 0; i < nPackages; ++i) {
3461         if (i < __kmp_place_socket_offset ||
3462             i >= __kmp_place_socket_offset + __kmp_place_num_sockets) {
3463             n_old += nCoresPerPkg * __kmp_nThreadsPerCore; // skip not-requested socket
3464             if (__kmp_pu_os_idx != NULL) {
3465                 for (j = 0; j < nCoresPerPkg; ++j) { // walk through skipped socket
3466                     for (k = 0; k < __kmp_nThreadsPerCore; ++k) {
3467                         KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3468                         ++proc_num;
3469                     }
3470                 }
3471             }
3472         } else {
3473             for (j = 0; j < nCoresPerPkg; ++j) { // walk through requested socket
3474                 if (j < __kmp_place_core_offset ||
3475                     j >= __kmp_place_core_offset + __kmp_place_num_cores) {
3476                     n_old += __kmp_nThreadsPerCore; // skip not-requested core
3477                     if (__kmp_pu_os_idx != NULL) {
3478                         for (k = 0; k < __kmp_nThreadsPerCore; ++k) { // walk through skipped core
3479                             KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3480                             ++proc_num;
3481                         }
3482                     }
3483                 } else {
3484                     for (k = 0; k < __kmp_nThreadsPerCore; ++k) { // walk through requested core
3485                         if (k < __kmp_place_num_threads_per_core) {
3486                             if (pAddr)
3487                                 newAddr[n_new] = (*pAddr)[n_old]; // collect requested thread's data
3488                             n_new++;
3489                         } else {
3490                             if (__kmp_pu_os_idx != NULL)
3491                                 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3492                         }
3493                         n_old++;
3494                         ++proc_num;
3495                     }
3496                 }
3497             }
3498         }
3499     }
3500     KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore);
3501     KMP_DEBUG_ASSERT(n_new == __kmp_place_num_sockets * __kmp_place_num_cores *
3502                      __kmp_place_num_threads_per_core);
3503 
3504     nPackages = __kmp_place_num_sockets;                      // correct nPackages
3505     nCoresPerPkg = __kmp_place_num_cores;                     // correct nCoresPerPkg
3506     __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3507     __kmp_avail_proc = n_new;                                 // correct avail_proc
3508     __kmp_ncores = nPackages * __kmp_place_num_cores;         // correct ncores
3509 
3510     if (pAddr) {
3511         __kmp_free( *pAddr );
3512         *pAddr = newAddr;      // replace old topology with new one
3513     }
3514 _exit:
3515     if (__kmp_pu_os_idx != NULL) {
3516         __kmp_free(__kmp_pu_os_idx);
3517         __kmp_pu_os_idx = NULL;
3518     }
3519 }
3520 
3521 
3522 static AddrUnsPair *address2os = NULL;
3523 static int           * procarr = NULL;
3524 static int     __kmp_aff_depth = 0;
3525 
3526 #define KMP_EXIT_AFF_NONE                             \
3527     KMP_ASSERT(__kmp_affinity_type == affinity_none); \
3528     KMP_ASSERT(address2os == NULL);                   \
3529     __kmp_apply_thread_places(NULL, 0);               \
3530     return;
3531 
3532 static void
3533 __kmp_aux_affinity_initialize(void)
3534 {
3535     if (__kmp_affinity_masks != NULL) {
3536         KMP_ASSERT(__kmp_affin_fullMask != NULL);
3537         return;
3538     }
3539 
3540     //
3541     // Create the "full" mask - this defines all of the processors that we
3542     // consider to be in the machine model.  If respect is set, then it is
3543     // the initialization thread's affinity mask.  Otherwise, it is all
3544     // processors that we know about on the machine.
3545     //
3546     if (__kmp_affin_fullMask == NULL) {
3547         KMP_CPU_ALLOC(__kmp_affin_fullMask);
3548     }
3549     if (KMP_AFFINITY_CAPABLE()) {
3550         if (__kmp_affinity_respect_mask) {
3551             __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
3552 
3553             //
3554             // Count the number of available processors.
3555             //
3556             unsigned i;
3557             __kmp_avail_proc = 0;
3558             KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
3559                 if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
3560                     continue;
3561                 }
3562                 __kmp_avail_proc++;
3563             }
3564             if (__kmp_avail_proc > __kmp_xproc) {
3565                 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3566                   && (__kmp_affinity_type != affinity_none))) {
3567                     KMP_WARNING(ErrorInitializeAffinity);
3568                 }
3569                 __kmp_affinity_type = affinity_none;
3570                 KMP_AFFINITY_DISABLE();
3571                 return;
3572             }
3573         }
3574         else {
3575             __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
3576             __kmp_avail_proc = __kmp_xproc;
3577         }
3578     }
3579 
3580     int depth = -1;
3581     kmp_i18n_id_t msg_id = kmp_i18n_null;
3582 
3583     //
3584     // For backward compatibility, setting KMP_CPUINFO_FILE =>
3585     // KMP_TOPOLOGY_METHOD=cpuinfo
3586     //
3587     if ((__kmp_cpuinfo_file != NULL) &&
3588       (__kmp_affinity_top_method == affinity_top_method_all)) {
3589         __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3590     }
3591 
3592     if (__kmp_affinity_top_method == affinity_top_method_all) {
3593         //
3594         // In the default code path, errors are not fatal - we just try using
3595         // another method.  We only emit a warning message if affinity is on,
        // or the verbose flag is set, and the nowarnings flag was not set.
3597         //
3598         const char *file_name = NULL;
3599         int line = 0;
3600 # if KMP_USE_HWLOC
3601         if (depth < 0) {
3602             if (__kmp_affinity_verbose) {
3603                 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
3604             }
3605             if(!__kmp_hwloc_error) {
3606                 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
3607                 if (depth == 0) {
3608                     KMP_EXIT_AFF_NONE;
3609                 } else if(depth < 0 && __kmp_affinity_verbose) {
3610                     KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
3611                 }
3612             } else if(__kmp_affinity_verbose) {
3613                 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
3614             }
3615         }
3616 # endif
3617 
3618 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3619 
3620         if (depth < 0) {
3621             if (__kmp_affinity_verbose) {
3622                 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3623             }
3624 
3625             file_name = NULL;
3626             depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3627             if (depth == 0) {
3628                 KMP_EXIT_AFF_NONE;
3629             }
3630 
3631             if (depth < 0) {
3632                 if (__kmp_affinity_verbose) {
3633                     if (msg_id != kmp_i18n_null) {
3634                         KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3635                           KMP_I18N_STR(DecodingLegacyAPIC));
3636                     }
3637                     else {
3638                         KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
3639                     }
3640                 }
3641 
3642                 file_name = NULL;
3643                 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3644                 if (depth == 0) {
3645                     KMP_EXIT_AFF_NONE;
3646                 }
3647             }
3648         }
3649 
3650 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3651 
3652 # if KMP_OS_LINUX
3653 
3654         if (depth < 0) {
3655             if (__kmp_affinity_verbose) {
3656                 if (msg_id != kmp_i18n_null) {
3657                     KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3658                 }
3659                 else {
3660                     KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3661                 }
3662             }
3663 
3664             FILE *f = fopen("/proc/cpuinfo", "r");
3665             if (f == NULL) {
3666                 msg_id = kmp_i18n_str_CantOpenCpuinfo;
3667             }
3668             else {
3669                 file_name = "/proc/cpuinfo";
3670                 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3671                 fclose(f);
3672                 if (depth == 0) {
3673                     KMP_EXIT_AFF_NONE;
3674                 }
3675             }
3676         }
3677 
3678 # endif /* KMP_OS_LINUX */
3679 
3680 # if KMP_GROUP_AFFINITY
3681 
3682         if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
3683             if (__kmp_affinity_verbose) {
3684                 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3685             }
3686 
3687             depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3688             KMP_ASSERT(depth != 0);
3689         }
3690 
3691 # endif /* KMP_GROUP_AFFINITY */
3692 
3693         if (depth < 0) {
3694             if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
3695                 if (file_name == NULL) {
3696                     KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
3697                 }
3698                 else if (line == 0) {
3699                     KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
3700                 }
3701                 else {
3702                     KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
3703                 }
3704             }
3705             // FIXME - print msg if msg_id = kmp_i18n_null ???
3706 
3707             file_name = "";
3708             depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3709             if (depth == 0) {
3710                 KMP_EXIT_AFF_NONE;
3711             }
3712             KMP_ASSERT(depth > 0);
3713             KMP_ASSERT(address2os != NULL);
3714         }
3715     }
3716 
3717     //
    // If the user has specified that a particular topology discovery method
3719     // is to be used, then we abort if that method fails.  The exception is
3720     // group affinity, which might have been implicitly set.
3721     //
3722 
3723 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3724 
3725     else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3726         if (__kmp_affinity_verbose) {
3727             KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3728               KMP_I18N_STR(Decodingx2APIC));
3729         }
3730 
3731         depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3732         if (depth == 0) {
3733             KMP_EXIT_AFF_NONE;
3734         }
3735         if (depth < 0) {
3736             KMP_ASSERT(msg_id != kmp_i18n_null);
3737             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3738         }
3739     }
3740     else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3741         if (__kmp_affinity_verbose) {
3742             KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3743               KMP_I18N_STR(DecodingLegacyAPIC));
3744         }
3745 
3746         depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3747         if (depth == 0) {
3748             KMP_EXIT_AFF_NONE;
3749         }
3750         if (depth < 0) {
3751             KMP_ASSERT(msg_id != kmp_i18n_null);
3752             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3753         }
3754     }
3755 
3756 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3757 
3758     else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3759         const char *filename;
3760         if (__kmp_cpuinfo_file != NULL) {
3761             filename = __kmp_cpuinfo_file;
3762         }
3763         else {
3764             filename = "/proc/cpuinfo";
3765         }
3766 
3767         if (__kmp_affinity_verbose) {
3768             KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
3769         }
3770 
3771         FILE *f = fopen(filename, "r");
3772         if (f == NULL) {
3773             int code = errno;
3774             if (__kmp_cpuinfo_file != NULL) {
3775                 __kmp_msg(
3776                     kmp_ms_fatal,
3777                     KMP_MSG(CantOpenFileForReading, filename),
3778                     KMP_ERR(code),
3779                     KMP_HNT(NameComesFrom_CPUINFO_FILE),
3780                     __kmp_msg_null
3781                 );
3782             }
3783             else {
3784                 __kmp_msg(
3785                     kmp_ms_fatal,
3786                     KMP_MSG(CantOpenFileForReading, filename),
3787                     KMP_ERR(code),
3788                     __kmp_msg_null
3789                 );
3790             }
3791         }
3792         int line = 0;
3793         depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3794         fclose(f);
3795         if (depth < 0) {
3796             KMP_ASSERT(msg_id != kmp_i18n_null);
3797             if (line > 0) {
3798                 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
3799             }
3800             else {
3801                 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3802             }
3803         }
3804         if (__kmp_affinity_type == affinity_none) {
3805             KMP_ASSERT(depth == 0);
3806             KMP_EXIT_AFF_NONE;
3807         }
3808     }
3809 
3810 # if KMP_GROUP_AFFINITY
3811 
3812     else if (__kmp_affinity_top_method == affinity_top_method_group) {
3813         if (__kmp_affinity_verbose) {
3814             KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3815         }
3816 
3817         depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3818         KMP_ASSERT(depth != 0);
3819         if (depth < 0) {
3820             KMP_ASSERT(msg_id != kmp_i18n_null);
3821             KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3822         }
3823     }
3824 
3825 # endif /* KMP_GROUP_AFFINITY */
3826 
3827     else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3828         if (__kmp_affinity_verbose) {
3829             KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
3830         }
3831 
3832         depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3833         if (depth == 0) {
3834             KMP_EXIT_AFF_NONE;
3835         }
3836         // should not fail
3837         KMP_ASSERT(depth > 0);
3838         KMP_ASSERT(address2os != NULL);
3839     }
3840 
3841 # if KMP_USE_HWLOC
3842     else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
3843         if (__kmp_affinity_verbose) {
3844             KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
3845         }
3846         depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
3847         if (depth == 0) {
3848             KMP_EXIT_AFF_NONE;
3849         }
3850     }
3851 # endif // KMP_USE_HWLOC
3852 
3853     if (address2os == NULL) {
3854         if (KMP_AFFINITY_CAPABLE()
3855           && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3856           && (__kmp_affinity_type != affinity_none)))) {
3857             KMP_WARNING(ErrorInitializeAffinity);
3858         }
3859         __kmp_affinity_type = affinity_none;
3860         KMP_AFFINITY_DISABLE();
3861         return;
3862     }
3863 
3864     __kmp_apply_thread_places(&address2os, depth);
3865 
3866     //
3867     // Create the table of masks, indexed by thread Id.
3868     //
3869     unsigned maxIndex;
3870     unsigned numUnique;
3871     kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
3872       address2os, __kmp_avail_proc);
3873     if (__kmp_affinity_gran_levels == 0) {
3874         KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
3875     }
3876 
3877     //
3878     // Set the childNums vector in all Address objects.  This must be done
3879     // before we can sort using __kmp_affinity_cmp_Address_child_num(),
3880     // which takes into account the setting of __kmp_affinity_compact.
3881     //
3882     __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
3883 
3884     switch (__kmp_affinity_type) {
3885 
3886         case affinity_explicit:
3887         KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3888 # if OMP_40_ENABLED
3889         if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3890 # endif
3891         {
3892             __kmp_affinity_process_proclist(&__kmp_affinity_masks,
3893               &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3894               maxIndex);
3895         }
3896 # if OMP_40_ENABLED
3897         else {
3898             __kmp_affinity_process_placelist(&__kmp_affinity_masks,
3899               &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3900               maxIndex);
3901         }
3902 # endif
3903         if (__kmp_affinity_num_masks == 0) {
3904             if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3905               && (__kmp_affinity_type != affinity_none))) {
3906                 KMP_WARNING(AffNoValidProcID);
3907             }
3908             __kmp_affinity_type = affinity_none;
3909             return;
3910         }
3911         break;
3912 
3913         //
3914         // The other affinity types rely on sorting the Addresses according
3915         // to some permutation of the machine topology tree.  Set
3916         // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
3917         // then jump to a common code fragment to do the sort and create
3918         // the array of affinity masks.
3919         //
3920 
3921         case affinity_logical:
3922         __kmp_affinity_compact = 0;
3923         if (__kmp_affinity_offset) {
3924             __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3925               % __kmp_avail_proc;
3926         }
3927         goto sortAddresses;
3928 
3929         case affinity_physical:
3930         if (__kmp_nThreadsPerCore > 1) {
3931             __kmp_affinity_compact = 1;
3932             if (__kmp_affinity_compact >= depth) {
3933                 __kmp_affinity_compact = 0;
3934             }
3935         } else {
3936             __kmp_affinity_compact = 0;
3937         }
3938         if (__kmp_affinity_offset) {
3939             __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3940               % __kmp_avail_proc;
3941         }
3942         goto sortAddresses;
3943 
3944         case affinity_scatter:
3945         if (__kmp_affinity_compact >= depth) {
3946             __kmp_affinity_compact = 0;
3947         }
3948         else {
3949             __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
3950         }
3951         goto sortAddresses;
3952 
3953         case affinity_compact:
3954         if (__kmp_affinity_compact >= depth) {
3955             __kmp_affinity_compact = depth - 1;
3956         }
3957         goto sortAddresses;
3958 
3959         case affinity_balanced:
3960         // Balanced works only for the case of a single package
3961         if( nPackages > 1 ) {
3962             if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
3963                 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
3964             }
3965             __kmp_affinity_type = affinity_none;
3966             return;
3967         } else if( __kmp_affinity_uniform_topology() ) {
3968             break;
3969         } else { // Non-uniform topology
3970 
3971             // Save the depth for further usage
3972             __kmp_aff_depth = depth;
3973 
3974             // Number of hyper threads per core in HT machine
3975             int nth_per_core = __kmp_nThreadsPerCore;
3976 
3977             int core_level;
3978             if( nth_per_core > 1 ) {
3979                 core_level = depth - 2;
3980             } else {
3981                 core_level = depth - 1;
3982             }
3983             int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
3984             int nproc = nth_per_core * ncores;
3985 
3986             procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
3987             for( int i = 0; i < nproc; i++ ) {
3988                 procarr[ i ] = -1;
3989             }
3990 
3991             for( int i = 0; i < __kmp_avail_proc; i++ ) {
3992                 int proc = address2os[ i ].second;
3993                 // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
3994                 // If there is only one thread per core then depth == 2: level 0 - package,
3995                 // level 1 - core.
3996                 int level = depth - 1;
3997 
                // Defaults for the case of one thread context per core
3999                 int thread = 0;
4000                 int core = address2os[ i ].first.labels[ level ];
4001                 // If the thread level exists, that is we have more than one thread context per core
4002                 if( nth_per_core > 1 ) {
4003                     thread = address2os[ i ].first.labels[ level ] % nth_per_core;
4004                     core = address2os[ i ].first.labels[ level - 1 ];
4005                 }
4006                 procarr[ core * nth_per_core + thread ] = proc;
4007             }
4008 
4009             break;
4010         }
4011 
4012         sortAddresses:
4013         //
4014         // Allocate the gtid->affinity mask table.
4015         //
4016         if (__kmp_affinity_dups) {
4017             __kmp_affinity_num_masks = __kmp_avail_proc;
4018         }
4019         else {
4020             __kmp_affinity_num_masks = numUnique;
4021         }
4022 
4023 # if OMP_40_ENABLED
4024         if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
4025           && ( __kmp_affinity_num_places > 0 )
4026           && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
4027             __kmp_affinity_num_masks = __kmp_affinity_num_places;
4028         }
4029 # endif
4030 
4031         KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
4032 
4033         //
4034         // Sort the address2os table according to the current setting of
4035         // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
4036         //
4037         qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
4038           __kmp_affinity_cmp_Address_child_num);
4039         {
4040             int i;
4041             unsigned j;
4042             for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
4043                 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
4044                     continue;
4045                 }
4046                 unsigned osId = address2os[i].second;
4047                 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
4048                 kmp_affin_mask_t *dest
4049                   = KMP_CPU_INDEX(__kmp_affinity_masks, j);
4050                 KMP_ASSERT(KMP_CPU_ISSET(osId, src));
4051                 KMP_CPU_COPY(dest, src);
4052                 if (++j >= __kmp_affinity_num_masks) {
4053                     break;
4054                 }
4055             }
4056             KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
4057         }
4058         break;
4059 
4060         default:
4061         KMP_ASSERT2(0, "Unexpected affinity setting");
4062     }
4063 
4064     __kmp_free(osId2Mask);
4065     machine_hierarchy.init(address2os, __kmp_avail_proc);
4066 }
4067 #undef KMP_EXIT_AFF_NONE
4068 
4069 
4070 void
4071 __kmp_affinity_initialize(void)
4072 {
4073     //
    // Much of the code above was written assuming that if a machine was not
4075     // affinity capable, then __kmp_affinity_type == affinity_none.  We now
4076     // explicitly represent this as __kmp_affinity_type == affinity_disabled.
4077     //
4078     // There are too many checks for __kmp_affinity_type == affinity_none
4079     // in this code.  Instead of trying to change them all, check if
4080     // __kmp_affinity_type == affinity_disabled, and if so, slam it with
4081     // affinity_none, call the real initialization routine, then restore
4082     // __kmp_affinity_type to affinity_disabled.
4083     //
4084     int disabled = (__kmp_affinity_type == affinity_disabled);
4085     if (! KMP_AFFINITY_CAPABLE()) {
4086         KMP_ASSERT(disabled);
4087     }
4088     if (disabled) {
4089         __kmp_affinity_type = affinity_none;
4090     }
4091     __kmp_aux_affinity_initialize();
4092     if (disabled) {
4093         __kmp_affinity_type = affinity_disabled;
4094     }
4095 }
4096 
4097 
4098 void
4099 __kmp_affinity_uninitialize(void)
4100 {
4101     if (__kmp_affinity_masks != NULL) {
4102         KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
4103         __kmp_affinity_masks = NULL;
4104     }
4105     if (__kmp_affin_fullMask != NULL) {
4106         KMP_CPU_FREE(__kmp_affin_fullMask);
4107         __kmp_affin_fullMask = NULL;
4108     }
4109     __kmp_affinity_num_masks = 0;
4110 # if OMP_40_ENABLED
4111     __kmp_affinity_num_places = 0;
4112 # endif
4113     if (__kmp_affinity_proclist != NULL) {
4114         __kmp_free(__kmp_affinity_proclist);
4115         __kmp_affinity_proclist = NULL;
4116     }
4117     if( address2os != NULL ) {
4118         __kmp_free( address2os );
4119         address2os = NULL;
4120     }
4121     if( procarr != NULL ) {
4122         __kmp_free( procarr );
4123         procarr = NULL;
4124     }
4125 # if KMP_USE_HWLOC
4126     if (__kmp_hwloc_topology != NULL) {
4127         hwloc_topology_destroy(__kmp_hwloc_topology);
4128         __kmp_hwloc_topology = NULL;
4129     }
4130 # endif
4131 }
4132 
4133 
4134 void
4135 __kmp_affinity_set_init_mask(int gtid, int isa_root)
4136 {
4137     if (! KMP_AFFINITY_CAPABLE()) {
4138         return;
4139     }
4140 
4141     kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4142     if (th->th.th_affin_mask == NULL) {
4143         KMP_CPU_ALLOC(th->th.th_affin_mask);
4144     }
4145     else {
4146         KMP_CPU_ZERO(th->th.th_affin_mask);
4147     }
4148 
4149     //
    // Copy the thread mask to the kmp_info_t structure.
    // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
    // that has all of the OS proc ids set; if __kmp_affinity_respect_mask
    // is set, the full mask is the same as the mask of the initialization
    // thread.
4155     //
4156     kmp_affin_mask_t *mask;
4157     int i;
4158 
4159 # if OMP_40_ENABLED
4160     if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
4161 # endif
4162     {
4163         if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)
4164           ) {
4165 # if KMP_GROUP_AFFINITY
4166             if (__kmp_num_proc_groups > 1) {
4167                 return;
4168             }
4169 # endif
4170             KMP_ASSERT(__kmp_affin_fullMask != NULL);
4171             i = KMP_PLACE_ALL;
4172             mask = __kmp_affin_fullMask;
4173         }
4174         else {
4175             KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4176             i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4177             mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4178         }
4179     }
4180 # if OMP_40_ENABLED
4181     else {
4182         if ((! isa_root)
4183           || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
4184 #  if KMP_GROUP_AFFINITY
4185             if (__kmp_num_proc_groups > 1) {
4186                 return;
4187             }
4188 #  endif
4189             KMP_ASSERT(__kmp_affin_fullMask != NULL);
4190             i = KMP_PLACE_ALL;
4191             mask = __kmp_affin_fullMask;
4192         }
4193         else {
4194             //
4195             // int i = some hash function or just a counter that doesn't
4196             // always start at 0.  Use gtid for now.
4197             //
4198             KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4199             i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4200             mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4201         }
4202     }
4203 # endif
4204 
4205 # if OMP_40_ENABLED
4206     th->th.th_current_place = i;
4207     if (isa_root) {
4208         th->th.th_new_place = i;
4209         th->th.th_first_place = 0;
4210         th->th.th_last_place = __kmp_affinity_num_masks - 1;
4211     }
4212 
4213     if (i == KMP_PLACE_ALL) {
4214         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4215           gtid));
4216     }
4217     else {
4218         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4219           gtid, i));
4220     }
4221 # else
4222     if (i == -1) {
4223         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to __kmp_affin_fullMask\n",
4224           gtid));
4225     }
4226     else {
4227         KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
4228           gtid, i));
4229     }
4230 # endif /* OMP_40_ENABLED */
4231 
4232     KMP_CPU_COPY(th->th.th_affin_mask, mask);
4233 
4234     if (__kmp_affinity_verbose) {
4235         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4236         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4237           th->th.th_affin_mask);
4238         KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
4239           buf);
4240     }
4241 
4242 # if KMP_OS_WINDOWS
4243     //
4244     // On Windows* OS, the process affinity mask might have changed.
4245     // If the user didn't request affinity and this call fails,
4246     // just continue silently.  See CQ171393.
4247     //
4248     if ( __kmp_affinity_type == affinity_none ) {
4249         __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4250     }
4251     else
4252 # endif
4253     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4254 }
4255 
4256 
4257 # if OMP_40_ENABLED
4258 
4259 void
4260 __kmp_affinity_set_place(int gtid)
4261 {
4263 
4264     if (! KMP_AFFINITY_CAPABLE()) {
4265         return;
4266     }
4267 
4268     kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4269 
4270     KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
4271       gtid, th->th.th_new_place, th->th.th_current_place));
4272 
4273     //
4274     // Check that the new place is within this thread's partition.
4275     //
4276     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4277     KMP_ASSERT(th->th.th_new_place >= 0);
4278     KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4279     if (th->th.th_first_place <= th->th.th_last_place) {
4280         KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
4281          && (th->th.th_new_place <= th->th.th_last_place));
4282     }
4283     else {
4284         KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
4285          || (th->th.th_new_place >= th->th.th_last_place));
4286     }
4287 
4288     //
    // Copy the thread mask to the kmp_info_t structure,
4290     // and set this thread's affinity.
4291     //
4292     kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
4293       th->th.th_new_place);
4294     KMP_CPU_COPY(th->th.th_affin_mask, mask);
4295     th->th.th_current_place = th->th.th_new_place;
4296 
4297     if (__kmp_affinity_verbose) {
4298         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4299         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4300           th->th.th_affin_mask);
4301         KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4302           gtid, buf);
4303     }
4304     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4305 }
4306 
4307 # endif /* OMP_40_ENABLED */
4308 
4309 
4310 int
4311 __kmp_aux_set_affinity(void **mask)
4312 {
4313     int gtid;
4314     kmp_info_t *th;
4315     int retval;
4316 
4317     if (! KMP_AFFINITY_CAPABLE()) {
4318         return -1;
4319     }
4320 
4321     gtid = __kmp_entry_gtid();
4322     KA_TRACE(1000, ;{
4323         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4324         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4325           (kmp_affin_mask_t *)(*mask));
4326         __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4327           gtid, buf);
4328     });
4329 
4330     if (__kmp_env_consistency_check) {
4331         if ((mask == NULL) || (*mask == NULL)) {
4332             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4333         }
4334         else {
4335             unsigned proc;
4336             int num_procs = 0;
4337 
4338             KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t*)(*mask))) {
4339                 if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4340                     KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4341                 }
4342                 if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4343                     continue;
4344                 }
4345                 num_procs++;
4346             }
4347             if (num_procs == 0) {
4348                 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4349             }
4350 
4351 # if KMP_GROUP_AFFINITY
4352             if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4353                 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4354             }
4355 # endif /* KMP_GROUP_AFFINITY */
4356 
4357         }
4358     }
4359 
4360     th = __kmp_threads[gtid];
4361     KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4362     retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4363     if (retval == 0) {
4364         KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4365     }
4366 
4367 # if OMP_40_ENABLED
4368     th->th.th_current_place = KMP_PLACE_UNDEFINED;
4369     th->th.th_new_place = KMP_PLACE_UNDEFINED;
4370     th->th.th_first_place = 0;
4371     th->th.th_last_place = __kmp_affinity_num_masks - 1;
4372 
4373     //
    // Turn off 4.0 affinity for the current thread at this parallel level.
4375     //
4376     th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
4377 # endif
4378 
4379     return retval;
4380 }


int
__kmp_aux_get_affinity(void **mask)
{
    int gtid;
    int retval;
    kmp_info_t *th;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);

    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
        }
    }

# if !KMP_OS_WINDOWS

    retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
    });
    return retval;

# else

    KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
    return 0;

# endif /* KMP_OS_WINDOWS */

}

int
__kmp_aux_set_affinity_mask_proc(int proc, void **mask)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
        }
    }

    if ((proc < 0)
# if !KMP_USE_HWLOC
         || ((unsigned)proc >= KMP_CPU_SETSIZE)
# endif
       ) {
        return -1;
    }
    if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
        return -2;
    }

    KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
    return 0;
}


int
__kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
        }
    }

    if ((proc < 0)
# if !KMP_USE_HWLOC
         || ((unsigned)proc >= KMP_CPU_SETSIZE)
# endif
       ) {
        return -1;
    }
    if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
        return -2;
    }

    KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
    return 0;
}


int
__kmp_aux_get_affinity_mask_proc(int proc, void **mask)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
        }
    }

    if ((proc < 0)
# if !KMP_USE_HWLOC
         || ((unsigned)proc >= KMP_CPU_SETSIZE)
# endif
       ) {
        return -1;
    }
    if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
        return 0;
    }

    return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}
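//
// Sketch of the per-proc mask helpers from the user's side (illustrative
// only; assumes the documented kmp_* wrappers that forward to the
// __kmp_aux_* routines above):
//
//   kmp_affinity_mask_t m;
//   kmp_create_affinity_mask(&m);
//   kmp_get_affinity(&m);                        // current mask
//   if (!kmp_get_affinity_mask_proc(3, &m)) {    // is proc 3 in the mask?
//       kmp_set_affinity_mask_proc(3, &m);       // add it
//       kmp_set_affinity(&m);
//   }
//   kmp_destroy_affinity_mask(&m);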


// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity( int tid, int nthreads )
{
    if( __kmp_affinity_uniform_topology() ) {
        int coreID;
        int threadID;
        // Number of hyper-threads per core in an HT machine
        int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
        // Number of cores
        int ncores = __kmp_ncores;
        // How many threads will be bound to each core
        int chunk = nthreads / ncores;
        // How many cores will have an additional thread bound to them - the "big cores"
        int big_cores = nthreads % ncores;
        // Number of threads on the big cores
        int big_nth = ( chunk + 1 ) * big_cores;
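        // Worked example with illustrative numbers: nthreads = 10, ncores = 4
        // gives chunk = 2, big_cores = 2, big_nth = 6; threads 0-5 land on the
        // two "big" cores (3 threads each) and threads 6-9 on the remaining
        // two cores (2 threads each).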
        if( tid < big_nth ) {
            coreID = tid / (chunk + 1 );
            threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
        } else { //tid >= big_nth
            coreID = ( tid - big_cores ) / chunk;
            threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
        }
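        // Note: for tid >= big_nth, ( tid - big_cores ) / chunk equals
        // big_cores + ( tid - big_nth ) / chunk, since big_nth is
        // ( chunk + 1 ) * big_cores; i.e., the big cores are skipped and the
        // remaining threads are packed chunk per core.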

        KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
          "Illegal set affinity operation when not capable");

        kmp_affin_mask_t *mask;
        KMP_CPU_ALLOC_ON_STACK(mask);
        KMP_CPU_ZERO(mask);

        // Granularity == thread
        if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
            int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
            KMP_CPU_SET( osID, mask);
        } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
            for( int i = 0; i < __kmp_nth_per_core; i++ ) {
                int osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
                KMP_CPU_SET( osID, mask);
            }
        }
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
        KMP_CPU_FREE_FROM_STACK(mask);
    } else { // Non-uniform topology

        kmp_affin_mask_t *mask;
        KMP_CPU_ALLOC_ON_STACK(mask);
        KMP_CPU_ZERO(mask);

        // Number of hyper-threads per core in an HT machine
        int nth_per_core = __kmp_nThreadsPerCore;
        int core_level;
        if( nth_per_core > 1 ) {
            core_level = __kmp_aff_depth - 2;
        } else {
            core_level = __kmp_aff_depth - 1;
        }
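        // With hyper-threading the innermost level of the address2os labels is
        // the thread context, so the core id sits one level up (depth - 2);
        // otherwise the innermost level (depth - 1) is the core itself.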

        // Maximum number of cores; trailing cores with 0 available processors
        // are not counted
        int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;

        // For performance, handle the special case nthreads == __kmp_avail_proc separately
        if( nthreads == __kmp_avail_proc ) {
            if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                int osID = address2os[ tid ].second;
                KMP_CPU_SET( osID, mask);
            } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                int coreID = address2os[ tid ].first.labels[ core_level ];
                // Count the osIDs found for the current core; there can be no more
                // than nth_per_core of them, and since address2os is sorted we can
                // break as soon as cnt == nth_per_core.
                int cnt = 0;
                for( int i = 0; i < __kmp_avail_proc; i++ ) {
                    int osID = address2os[ i ].second;
                    int core = address2os[ i ].first.labels[ core_level ];
                    if( core == coreID ) {
                        KMP_CPU_SET( osID, mask);
                        cnt++;
                        if( cnt == nth_per_core ) {
                            break;
                        }
                    }
                }
            }
        } else if( nthreads <= __kmp_ncores ) {
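            // Walk procarr[] core by core, counting only cores that still have
            // at least one available processor; tid indexes this compacted
            // sequence of non-empty cores rather than procarr[] itself.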
            int core = 0;
            for( int i = 0; i < ncores; i++ ) {
                // Check whether this core from procarr[] has any available processors
                int in_mask = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        in_mask = 1;
                        break;
                    }
                }
                if( in_mask ) {
                    if( tid == core ) {
                        for( int j = 0; j < nth_per_core; j++ ) {
                            int osID = procarr[ i * nth_per_core + j ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask );
                                // For granularity == thread it is enough to set the
                                // first available osID for this core
                                if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                                    break;
                                }
                            }
                        }
                        break;
                    } else {
                        core++;
                    }
                }
            }

        } else { // nthreads > __kmp_ncores

            // Array holding the number of available processors at each core
            int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores);
            // Array holding the number of cores with exactly "x" available processors
            int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
            // Array holding the number of cores with at least "x" available processors
            int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));

            for( int i = 0; i <= nth_per_core; i++ ) {
                ncores_with_x_procs[ i ] = 0;
                ncores_with_x_to_max_procs[ i ] = 0;
            }

            for( int i = 0; i < ncores; i++ ) {
                int cnt = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        cnt++;
                    }
                }
                nproc_at_core[ i ] = cnt;
                ncores_with_x_procs[ cnt ]++;
            }

            for( int i = 0; i <= nth_per_core; i++ ) {
                for( int j = i; j <= nth_per_core; j++ ) {
                    ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
                }
            }
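            // The suffix sum above makes ncores_with_x_to_max_procs[ i ] the
            // number of cores with at least i available processors. With
            // illustrative numbers: nth_per_core = 2 and
            // ncores_with_x_procs = { 1, 2, 3 } yield { 6, 5, 3 }.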

            // Max number of processors
            int nproc = nth_per_core * ncores;
            // Array to keep the number of threads per context
            int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                newarr[ i ] = 0;
            }

            int nth = nthreads;
            int flag = 0;
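            // Distribute the threads over the contexts breadth-first: the
            // first sweep (flag == 0) places at most one thread on every
            // available context; once each context holds a thread, later
            // sweeps (flag != 0) stack extra threads onto occupied contexts.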
            while( nth > 0 ) {
                for( int j = 1; j <= nth_per_core; j++ ) {
                    int cnt = ncores_with_x_to_max_procs[ j ];
                    for( int i = 0; i < ncores; i++ ) {
                        // Skip cores with 0 available processors
                        if( nproc_at_core[ i ] == 0 ) {
                            continue;
                        }
                        for( int k = 0; k < nth_per_core; k++ ) {
                            if( procarr[ i * nth_per_core + k ] != -1 ) {
                                if( newarr[ i * nth_per_core + k ] == 0 ) {
                                    newarr[ i * nth_per_core + k ] = 1;
                                    cnt--;
                                    nth--;
                                    break;
                                } else {
                                    if( flag != 0 ) {
                                        newarr[ i * nth_per_core + k ]++;
                                        cnt--;
                                        nth--;
                                        break;
                                    }
                                }
                            }
                        }
                        if( cnt == 0 || nth == 0 ) {
                            break;
                        }
                    }
                    if( nth == 0 ) {
                        break;
                    }
                }
                flag = 1;
            }
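            // newarr[] now holds the number of threads assigned to each
            // context; walk the running sum until it exceeds tid to find the
            // context (and hence the core) this thread lands on.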
            int sum = 0;
            for( int i = 0; i < nproc; i++ ) {
                sum += newarr[ i ];
                if( sum > tid ) {
                    // Granularity == thread
                    if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                        int osID = procarr[ i ];
                        KMP_CPU_SET( osID, mask);
                    } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                        int coreID = i / nth_per_core;
                        for( int ii = 0; ii < nth_per_core; ii++ ) {
                            int osID = procarr[ coreID * nth_per_core + ii ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask);
                            }
                        }
                    }
                    break;
                }
            }
            __kmp_free( newarr );
        }

        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
        KMP_CPU_FREE_FROM_STACK(mask);
    }
}

#if KMP_OS_LINUX
// We don't need this entry for Windows because the
// GetProcessAffinityMask() API is available there.
//
// The intended usage is indicated by these steps (a sketch follows the list):
// 1) The user gets the current affinity mask
// 2) Then sets the affinity by calling this function
// 3) Error check the return value
// 4) Use non-OpenMP parallelization
// 5) Reset the affinity to what was stored in step 1)
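//
// A minimal sketch of that sequence (illustrative only; saving and restoring
// the original mask uses whatever mechanism the application prefers, e.g.
// pthread_getaffinity_np() on Linux; run_non_openmp_parallel_work() is a
// hypothetical placeholder):
//
//   cpu_set_t saved;
//   pthread_getaffinity_np(pthread_self(), sizeof(saved), &saved); // step 1
//   if (kmp_set_thread_affinity_mask_initial() != 0) {             // steps 2-3
//       /* could not widen the mask; handle the error */
//   }
//   run_non_openmp_parallel_work();                                // step 4
//   pthread_setaffinity_np(pthread_self(), sizeof(saved), &saved); // step 5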
#ifdef __cplusplus
extern "C"
#endif
int
kmp_set_thread_affinity_mask_initial()
// The function returns:
//    0 on success,
//   -1 if we cannot bind the thread,
//   >0 (errno) if an error happened during binding.
{
    int gtid = __kmp_get_gtid();
    if (gtid < 0) {
        // Do not touch non-OpenMP threads
        KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
            "non-omp thread, returning\n"));
        return -1;
    }
    if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
        KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
            "affinity not initialized, returning\n"));
        return -1;
    }
    KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
        "set full mask for thread %d\n", gtid));
    KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
    return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
}
#endif

#endif // KMP_AFFINITY_SUPPORTED