1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_affinity.h"
16 #include "kmp_i18n.h"
17 #include "kmp_io.h"
18 #include "kmp_str.h"
19 #include "kmp_wrapper_getpid.h"
20 #if KMP_USE_HIER_SCHED
21 #include "kmp_dispatch_hier.h"
22 #endif
23 
24 // Store the real or imagined machine hierarchy here
25 static hierarchy_info machine_hierarchy;
26 
27 void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }
28 
29 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
30   kmp_uint32 depth;
31   // The test below is true if affinity is available but set to "none", so we
32   // need to init on first use of the hierarchical barrier.
33   if (TCR_1(machine_hierarchy.uninitialized))
34     machine_hierarchy.init(NULL, nproc);
35 
36   // Adjust the hierarchy in case the number of threads exceeds the original
37   if (nproc > machine_hierarchy.base_num_threads)
38     machine_hierarchy.resize(nproc);
39 
40   depth = machine_hierarchy.depth;
41   KMP_DEBUG_ASSERT(depth > 0);
42 
43   thr_bar->depth = depth;
44   thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0] - 1;
45   thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
46 }
47 
48 #if KMP_AFFINITY_SUPPORTED
49 
50 bool KMPAffinity::picked_api = false;
51 
52 void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
53 void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
54 void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
55 void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
56 void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
57 void KMPAffinity::operator delete(void *p) { __kmp_free(p); }
58 
59 void KMPAffinity::pick_api() {
60   KMPAffinity *affinity_dispatch;
61   if (picked_api)
62     return;
63 #if KMP_USE_HWLOC
64   // Only use Hwloc if affinity isn't explicitly disabled and
65   // user requests Hwloc topology method
66   if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
67       __kmp_affinity_type != affinity_disabled) {
68     affinity_dispatch = new KMPHwlocAffinity();
69   } else
70 #endif
71   {
72     affinity_dispatch = new KMPNativeAffinity();
73   }
74   __kmp_affinity_dispatch = affinity_dispatch;
75   picked_api = true;
76 }
77 
78 void KMPAffinity::destroy_api() {
79   if (__kmp_affinity_dispatch != NULL) {
80     delete __kmp_affinity_dispatch;
81     __kmp_affinity_dispatch = NULL;
82     picked_api = false;
83   }
84 }
85 
86 // Print the affinity mask to the character array in a pretty format.
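// The result looks like "{0,1,2,3}"; an empty mask prints as "{<empty>}", and a
// mask too large to fit in buf_len is truncated with a trailing ",...}".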
87 char *__kmp_affinity_print_mask(char *buf, int buf_len,
88                                 kmp_affin_mask_t *mask) {
89   KMP_ASSERT(buf_len >= 40);
90   char *scan = buf;
91   char *end = buf + buf_len - 1;
92 
93   // Find first element / check for empty set.
94   size_t i;
95   i = mask->begin();
96   if (i == mask->end()) {
97     KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
98     while (*scan != '\0')
99       scan++;
100     KMP_ASSERT(scan <= end);
101     return buf;
102   }
103 
104   KMP_SNPRINTF(scan, end - scan + 1, "{%ld", (long)i);
105   while (*scan != '\0')
106     scan++;
107   i++;
108   for (; i != mask->end(); i = mask->next(i)) {
109     if (!KMP_CPU_ISSET(i, mask)) {
110       continue;
111     }
112 
113     // Check for buffer overflow.  A string of the form ",<n>" will have at most
114     // 10 characters, and we also want to leave room to print ",...}" if the set
115     // is too large to print, for a total of 15 characters. We already left room
116     // for the terminating '\0' when setting end.
117     if (end - scan < 15) {
118       break;
119     }
120     KMP_SNPRINTF(scan, end - scan + 1, ",%-ld", (long)i);
121     while (*scan != '\0')
122       scan++;
123   }
124   if (i != mask->end()) {
125     KMP_SNPRINTF(scan, end - scan + 1, ",...");
126     while (*scan != '\0')
127       scan++;
128   }
129   KMP_SNPRINTF(scan, end - scan + 1, "}");
130   while (*scan != '\0')
131     scan++;
132   KMP_ASSERT(scan <= end);
133   return buf;
134 }
135 
136 void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
137   KMP_CPU_ZERO(mask);
138 
139 #if KMP_GROUP_AFFINITY
140 
141   if (__kmp_num_proc_groups > 1) {
142     int group;
143     KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
144     for (group = 0; group < __kmp_num_proc_groups; group++) {
145       int i;
146       int num = __kmp_GetActiveProcessorCount(group);
147       for (i = 0; i < num; i++) {
148         KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
149       }
150     }
151   } else
152 
153 #endif /* KMP_GROUP_AFFINITY */
154 
155   {
156     int proc;
157     for (proc = 0; proc < __kmp_xproc; proc++) {
158       KMP_CPU_SET(proc, mask);
159     }
160   }
161 }
162 
163 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
164 // called to renumber the labels from [0..n] and place them into the child_num
165 // vector of the address object.  This is done in case the labels used for
166 // the children at one node of the hierarchy differ from those used for
167 // another node at the same level.  Example:  suppose the machine has 2 nodes
168 // with 2 packages each.  The first node contains packages 601 and 602, and
169 // the second node contains packages 603 and 604.  If we try to sort the table
170 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
171 // because we are paying attention to the labels themselves, not the ordinal
172 // child numbers.  By using the child numbers in the sort, the result is
173 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
174 static void __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
175                                              int numAddrs) {
176   KMP_DEBUG_ASSERT(numAddrs > 0);
177   int depth = address2os->first.depth;
178   unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
179   unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
180   int labCt;
181   for (labCt = 0; labCt < depth; labCt++) {
182     address2os[0].first.childNums[labCt] = counts[labCt] = 0;
183     lastLabel[labCt] = address2os[0].first.labels[labCt];
184   }
185   int i;
186   for (i = 1; i < numAddrs; i++) {
187     for (labCt = 0; labCt < depth; labCt++) {
188       if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
189         int labCt2;
190         for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
191           counts[labCt2] = 0;
192           lastLabel[labCt2] = address2os[i].first.labels[labCt2];
193         }
194         counts[labCt]++;
195         lastLabel[labCt] = address2os[i].first.labels[labCt];
196         break;
197       }
198     }
199     for (labCt = 0; labCt < depth; labCt++) {
200       address2os[i].first.childNums[labCt] = counts[labCt];
201     }
202     for (; labCt < (int)Address::maxDepth; labCt++) {
203       address2os[i].first.childNums[labCt] = 0;
204     }
205   }
206   __kmp_free(lastLabel);
207   __kmp_free(counts);
208 }
209 
210 // All of the __kmp_affinity_create_*_map() routines should set
211 // __kmp_affinity_masks to a vector of affinity mask objects of length
212 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and return
213 // the number of levels in the machine topology tree (zero if
214 // __kmp_affinity_type == affinity_none).
215 //
216 // All of the __kmp_affinity_create_*_map() routines should set
217 // *__kmp_affin_fullMask to the affinity mask for the initialization thread.
218 // They need to save and restore the mask, and it could be needed later, so
219 // saving it is just an optimization to avoid calling kmp_get_system_affinity()
220 // again.
221 kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
222 
223 static int nCoresPerPkg, nPackages;
224 static int __kmp_nThreadsPerCore;
225 #ifndef KMP_DFLT_NTH_CORES
226 static int __kmp_ncores;
227 #endif
228 static int *__kmp_pu_os_idx = NULL;
229 
230 // __kmp_affinity_uniform_topology() doesn't work when called from
231 // places which support arbitrarily many levels in the machine topology
232 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map() and
233 // __kmp_affinity_create_x2apicid_map().
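// For example, with nPackages = 2, nCoresPerPkg = 4 and __kmp_nThreadsPerCore
// = 2, the topology is reported as uniform only when exactly 16 processors are
// available to the process.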
234 inline static bool __kmp_affinity_uniform_topology() {
235   return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
236 }
237 
238 // Print out the detailed machine topology map, i.e. the physical locations
239 // of each OS proc.
240 static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len,
241                                           int depth, int pkgLevel,
242                                           int coreLevel, int threadLevel) {
243   int proc;
244 
245   KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
246   for (proc = 0; proc < len; proc++) {
247     int level;
248     kmp_str_buf_t buf;
249     __kmp_str_buf_init(&buf);
250     for (level = 0; level < depth; level++) {
251       if (level == threadLevel) {
252         __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
253       } else if (level == coreLevel) {
254         __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
255       } else if (level == pkgLevel) {
256         __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
257       } else if (level > pkgLevel) {
258         __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
259                             level - pkgLevel - 1);
260       } else {
261         __kmp_str_buf_print(&buf, "L%d ", level);
262       }
263       __kmp_str_buf_print(&buf, "%d ", address2os[proc].first.labels[level]);
264     }
265     KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
266                buf.str);
267     __kmp_str_buf_free(&buf);
268   }
269 }
270 
271 #if KMP_USE_HWLOC
272 
273 static void __kmp_affinity_print_hwloc_tp(AddrUnsPair *addrP, int len,
274                                           int depth, int *levels) {
275   int proc;
276   kmp_str_buf_t buf;
277   __kmp_str_buf_init(&buf);
278   KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
279   for (proc = 0; proc < len; proc++) {
280     __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Package),
281                         addrP[proc].first.labels[0]);
282     if (depth > 1) {
283       int level = 1; // iterate over levels
284       int label = 1; // iterate over labels
285       if (__kmp_numa_detected)
286         // node level follows package
287         if (levels[level++] > 0)
288           __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Node),
289                               addrP[proc].first.labels[label++]);
290       if (__kmp_tile_depth > 0)
291         // tile level follows node if any, or package
292         if (levels[level++] > 0)
293           __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Tile),
294                               addrP[proc].first.labels[label++]);
295       if (levels[level++] > 0)
296         // core level follows
297         __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Core),
298                             addrP[proc].first.labels[label++]);
299       if (levels[level++] > 0)
300         // thread level comes last
301         __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Thread),
302                             addrP[proc].first.labels[label++]);
303       KMP_DEBUG_ASSERT(label == depth);
304     }
305     KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", addrP[proc].second, buf.str);
306     __kmp_str_buf_clear(&buf);
307   }
308   __kmp_str_buf_free(&buf);
309 }
310 
311 static int nNodePerPkg, nTilePerPkg, nTilePerNode, nCorePerNode, nCorePerTile;
312 
313 // This function removes the topology levels that are radix 1 and therefore
314 // offer no further information about the topology.  The most common example is
315 // a machine with one thread context per core: the extra thread-context level
316 // offers no unique labels, so it is removed.
317 // return value: the new depth of address2os
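// For example, if every core exposes exactly one thread context, all addresses
// share the same thread label, so the thread level is dropped and a 3-level
// {package, core, thread} map becomes a 2-level {package, core} map.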
318 static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *addrP, int nTh,
319                                                   int depth, int *levels) {
320   int level;
321   int i;
322   int radix1_detected;
323   int new_depth = depth;
324   for (level = depth - 1; level > 0; --level) {
325     // Detect if this level is radix 1
326     radix1_detected = 1;
327     for (i = 1; i < nTh; ++i) {
328       if (addrP[0].first.labels[level] != addrP[i].first.labels[level]) {
329         // There are differing label values for this level so it stays
330         radix1_detected = 0;
331         break;
332       }
333     }
334     if (!radix1_detected)
335       continue;
336     // Radix 1 was detected
337     --new_depth;
338     levels[level] = -1; // mark level as not present in address2os array
339     if (level == new_depth) {
340       // "turn off" deepest level, just decrement the depth that removes
341       // the level from address2os array
342       for (i = 0; i < nTh; ++i) {
343         addrP[i].first.depth--;
344       }
345     } else {
346       // For other levels, we move labels over and also reduce the depth
347       int j;
348       for (j = level; j < new_depth; ++j) {
349         for (i = 0; i < nTh; ++i) {
350           addrP[i].first.labels[j] = addrP[i].first.labels[j + 1];
351           addrP[i].first.depth--;
352         }
353         levels[j + 1] -= 1;
354       }
355     }
356   }
357   return new_depth;
358 }
359 
360 // Returns the number of objects of type 'type' below 'obj' within the topology
361 // tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
362 // HWLOC_OBJ_PU, then this will return the number of PUs under that package
363 // object.
364 static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
365                                            hwloc_obj_type_t type) {
366   int retval = 0;
367   hwloc_obj_t first;
368   for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
369                                            obj->logical_index, type, 0);
370        first != NULL &&
371        hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) ==
372            obj;
373        first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
374                                           first)) {
375     ++retval;
376   }
377   return retval;
378 }
379 
380 static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t,
381                                                hwloc_obj_t o, unsigned depth,
382                                                hwloc_obj_t *f) {
383   if (o->depth == depth) {
384     if (*f == NULL)
385       *f = o; // output first descendant found
386     return 1;
387   }
388   int sum = 0;
389   for (unsigned i = 0; i < o->arity; i++)
390     sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f);
391   return sum; // will be 0 if none were found (a PU's arity is 0)
392 }
393 
394 static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o,
395                                               hwloc_obj_type_t type,
396                                               hwloc_obj_t *f) {
397   if (!hwloc_compare_types(o->type, type)) {
398     if (*f == NULL)
399       *f = o; // output first descendant found
400     return 1;
401   }
402   int sum = 0;
403   for (unsigned i = 0; i < o->arity; i++)
404     sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f);
405   return sum; // will be 0 if none were found (a PU's arity is 0)
406 }
407 
408 static int __kmp_hwloc_process_obj_core_pu(AddrUnsPair *addrPair,
409                                            int &nActiveThreads,
410                                            int &num_active_cores,
411                                            hwloc_obj_t obj, int depth,
412                                            int *labels) {
413   hwloc_obj_t core = NULL;
414   hwloc_topology_t &tp = __kmp_hwloc_topology;
415   int NC = __kmp_hwloc_count_children_by_type(tp, obj, HWLOC_OBJ_CORE, &core);
416   for (int core_id = 0; core_id < NC; ++core_id, core = core->next_cousin) {
417     hwloc_obj_t pu = NULL;
418     KMP_DEBUG_ASSERT(core != NULL);
419     int num_active_threads = 0;
420     int NT = __kmp_hwloc_count_children_by_type(tp, core, HWLOC_OBJ_PU, &pu);
421     // int NT = core->arity; pu = core->first_child; // faster?
422     for (int pu_id = 0; pu_id < NT; ++pu_id, pu = pu->next_cousin) {
423       KMP_DEBUG_ASSERT(pu != NULL);
424       if (!KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
425         continue; // skip inactive (inaccessible) unit
426       Address addr(depth + 2);
427       KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
428                     obj->os_index, obj->logical_index, core->os_index,
429                     core->logical_index, pu->os_index, pu->logical_index));
430       for (int i = 0; i < depth; ++i)
431         addr.labels[i] = labels[i]; // package, etc.
432       addr.labels[depth] = core_id; // core
433       addr.labels[depth + 1] = pu_id; // pu
434       addrPair[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
435       __kmp_pu_os_idx[nActiveThreads] = pu->os_index;
436       nActiveThreads++;
437       ++num_active_threads; // count active threads per core
438     }
439     if (num_active_threads) { // were there any active threads on the core?
440       ++__kmp_ncores; // count total active cores
441       ++num_active_cores; // count active cores per socket
442       if (num_active_threads > __kmp_nThreadsPerCore)
443         __kmp_nThreadsPerCore = num_active_threads; // calc maximum
444     }
445   }
446   return 0;
447 }
448 
449 // Check whether a NUMA node is detected below the package level, and whether
450 // a tile object is detected; if so, record the tile depth.
451 static int __kmp_hwloc_check_numa() {
452   hwloc_topology_t &tp = __kmp_hwloc_topology;
453   hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
454   int depth;
455 
456   // Get some PU
457   hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, 0);
458   if (hT == NULL) // something has gone wrong
459     return 1;
460 
461   // check NUMA node below PACKAGE
462   hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
463   hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
464   KMP_DEBUG_ASSERT(hS != NULL);
465   if (hN != NULL && hN->depth > hS->depth) {
466     __kmp_numa_detected = TRUE; // socket includes node(s)
467     if (__kmp_affinity_gran == affinity_gran_node) {
468       __kmp_affinity_gran = affinity_gran_numa;
469     }
470   }
471 
472   // check tile, get object by depth because of multiple caches possible
473   depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
474   hL = hwloc_get_ancestor_obj_by_depth(tp, depth, hT);
475   hC = NULL; // not used, but reset it here just in case
476   if (hL != NULL &&
477       __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1)
478     __kmp_tile_depth = depth; // tile consists of multiple cores
479   return 0;
480 }
481 
482 static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
483                                            kmp_i18n_id_t *const msg_id) {
484   hwloc_topology_t &tp = __kmp_hwloc_topology; // shortcut of a long name
485   *address2os = NULL;
486   *msg_id = kmp_i18n_null;
487 
488   // Save the affinity mask for the current thread.
489   kmp_affin_mask_t *oldMask;
490   KMP_CPU_ALLOC(oldMask);
491   __kmp_get_system_affinity(oldMask, TRUE);
492   __kmp_hwloc_check_numa();
493 
494   if (!KMP_AFFINITY_CAPABLE()) {
495     // Hack to try and infer the machine topology using only the data
496     // available from cpuid on the current thread, and __kmp_xproc.
497     KMP_ASSERT(__kmp_affinity_type == affinity_none);
498 
499     nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(
500         hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0), HWLOC_OBJ_CORE);
501     __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(
502         hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0), HWLOC_OBJ_PU);
503     __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
504     nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
505     if (__kmp_affinity_verbose) {
506       KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
507       KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
508       if (__kmp_affinity_uniform_topology()) {
509         KMP_INFORM(Uniform, "KMP_AFFINITY");
510       } else {
511         KMP_INFORM(NonUniform, "KMP_AFFINITY");
512       }
513       KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
514                  __kmp_nThreadsPerCore, __kmp_ncores);
515     }
516     KMP_CPU_FREE(oldMask);
517     return 0;
518   }
519 
520   int depth = 3;
521   int levels[5] = {0, 1, 2, 3, 4}; // package, [node,] [tile,] core, thread
522   int labels[3] = {0}; // package [,node] [,tile] - head of labels array
523   if (__kmp_numa_detected)
524     ++depth;
525   if (__kmp_tile_depth)
526     ++depth;
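  // depth is now between 3 and 5: package [,node] [,tile], core, thread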
527 
528   // Allocate the data structure to be returned.
529   AddrUnsPair *retval =
530       (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
531   KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
532   __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
533 
534   // When affinity is off, this routine will still be called to set
535   // __kmp_ncores, as well as __kmp_nThreadsPerCore,
536   // nCoresPerPkg, & nPackages.  Make sure all these vars are set
537   // correctly, and return if affinity is not enabled.
538 
539   hwloc_obj_t socket, node, tile;
540   int nActiveThreads = 0;
541   int socket_id = 0;
542   // re-calculate globals to count only accessible resources
543   __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
544   nNodePerPkg = nTilePerPkg = nTilePerNode = nCorePerNode = nCorePerTile = 0;
545   for (socket = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0); socket != NULL;
546        socket = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, socket),
547       socket_id++) {
548     labels[0] = socket_id;
549     if (__kmp_numa_detected) {
550       int NN;
551       int n_active_nodes = 0;
552       node = NULL;
553       NN = __kmp_hwloc_count_children_by_type(tp, socket, HWLOC_OBJ_NUMANODE,
554                                               &node);
555       for (int node_id = 0; node_id < NN; ++node_id, node = node->next_cousin) {
556         labels[1] = node_id;
557         if (__kmp_tile_depth) {
558           // NUMA + tiles
559           int NT;
560           int n_active_tiles = 0;
561           tile = NULL;
562           NT = __kmp_hwloc_count_children_by_depth(tp, node, __kmp_tile_depth,
563                                                    &tile);
564           for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
565             labels[2] = tl_id;
566             int n_active_cores = 0;
567             __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
568                                             n_active_cores, tile, 3, labels);
569             if (n_active_cores) { // were there any active cores on the tile?
570               ++n_active_tiles; // count active tiles per node
571               if (n_active_cores > nCorePerTile)
572                 nCorePerTile = n_active_cores; // calc maximum
573             }
574           }
575           if (n_active_tiles) { // were there any active tiles on the node?
576             ++n_active_nodes; // count active nodes per package
577             if (n_active_tiles > nTilePerNode)
578               nTilePerNode = n_active_tiles; // calc maximum
579           }
580         } else {
581           // NUMA, no tiles
582           int n_active_cores = 0;
583           __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
584                                           n_active_cores, node, 2, labels);
585           if (n_active_cores) { // were there any active cores on the node?
586             ++n_active_nodes; // count active nodes per package
587             if (n_active_cores > nCorePerNode)
588               nCorePerNode = n_active_cores; // calc maximum
589           }
590         }
591       }
592       if (n_active_nodes) { // were there any active nodes on the socket?
593         ++nPackages; // count total active packages
594         if (n_active_nodes > nNodePerPkg)
595           nNodePerPkg = n_active_nodes; // calc maximum
596       }
597     } else {
598       if (__kmp_tile_depth) {
599         // no NUMA, tiles
600         int NT;
601         int n_active_tiles = 0;
602         tile = NULL;
603         NT = __kmp_hwloc_count_children_by_depth(tp, socket, __kmp_tile_depth,
604                                                  &tile);
605         for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
606           labels[1] = tl_id;
607           int n_active_cores = 0;
608           __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
609                                           n_active_cores, tile, 2, labels);
610           if (n_active_cores) { // were there any active cores on the tile?
611             ++n_active_tiles; // count active tiles per package
612             if (n_active_cores > nCorePerTile)
613               nCorePerTile = n_active_cores; // calc maximum
614           }
615         }
616         if (n_active_tiles) { // were there any active tiles on the socket?
617           ++nPackages; // count total active packages
618           if (n_active_tiles > nTilePerPkg)
619             nTilePerPkg = n_active_tiles; // calc maximum
620         }
621       } else {
622         // no NUMA, no tiles
623         int n_active_cores = 0;
624         __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads, n_active_cores,
625                                         socket, 1, labels);
626         if (n_active_cores) { // were there any active cores on the socket?
627           ++nPackages; // count total active packages
628           if (n_active_cores > nCoresPerPkg)
629             nCoresPerPkg = n_active_cores; // calc maximum
630         }
631       }
632     }
633   }
634 
635   // If there's only one thread context to bind to, return now.
636   KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
637   KMP_ASSERT(nActiveThreads > 0);
638   if (nActiveThreads == 1) {
639     __kmp_ncores = nPackages = 1;
640     __kmp_nThreadsPerCore = nCoresPerPkg = 1;
641     if (__kmp_affinity_verbose) {
642       char buf[KMP_AFFIN_MASK_PRINT_LEN];
643       __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
644 
645       KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
646       if (__kmp_affinity_respect_mask) {
647         KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
648       } else {
649         KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
650       }
651       KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
652       KMP_INFORM(Uniform, "KMP_AFFINITY");
653       KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
654                  __kmp_nThreadsPerCore, __kmp_ncores);
655     }
656 
657     if (__kmp_affinity_type == affinity_none) {
658       __kmp_free(retval);
659       KMP_CPU_FREE(oldMask);
660       return 0;
661     }
662 
663     // Form an Address object which only includes the package level.
664     Address addr(1);
665     addr.labels[0] = retval[0].first.labels[0];
666     retval[0].first = addr;
667 
668     if (__kmp_affinity_gran_levels < 0) {
669       __kmp_affinity_gran_levels = 0;
670     }
671 
672     if (__kmp_affinity_verbose) {
673       __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
674     }
675 
676     *address2os = retval;
677     KMP_CPU_FREE(oldMask);
678     return 1;
679   }
680 
681   // Sort the table by physical Id.
682   qsort(retval, nActiveThreads, sizeof(*retval),
683         __kmp_affinity_cmp_Address_labels);
684 
685   // Check to see if the machine topology is uniform
686   int nPUs = nPackages * __kmp_nThreadsPerCore;
687   if (__kmp_numa_detected) {
688     if (__kmp_tile_depth) { // NUMA + tiles
689       nPUs *= (nNodePerPkg * nTilePerNode * nCorePerTile);
690     } else { // NUMA, no tiles
691       nPUs *= (nNodePerPkg * nCorePerNode);
692     }
693   } else {
694     if (__kmp_tile_depth) { // no NUMA, tiles
695       nPUs *= (nTilePerPkg * nCorePerTile);
696     } else { // no NUMA, no tiles
697       nPUs *= nCoresPerPkg;
698     }
699   }
700   unsigned uniform = (nPUs == nActiveThreads);
701 
702   // Print the machine topology summary.
703   if (__kmp_affinity_verbose) {
704     char mask[KMP_AFFIN_MASK_PRINT_LEN];
705     __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
706     if (__kmp_affinity_respect_mask) {
707       KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
708     } else {
709       KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
710     }
711     KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
712     if (uniform) {
713       KMP_INFORM(Uniform, "KMP_AFFINITY");
714     } else {
715       KMP_INFORM(NonUniform, "KMP_AFFINITY");
716     }
717     if (__kmp_numa_detected) {
718       if (__kmp_tile_depth) { // NUMA + tiles
719         KMP_INFORM(TopologyExtraNoTi, "KMP_AFFINITY", nPackages, nNodePerPkg,
720                    nTilePerNode, nCorePerTile, __kmp_nThreadsPerCore,
721                    __kmp_ncores);
722       } else { // NUMA, no tiles
723         KMP_INFORM(TopologyExtraNode, "KMP_AFFINITY", nPackages, nNodePerPkg,
724                    nCorePerNode, __kmp_nThreadsPerCore, __kmp_ncores);
725         nPUs *= (nNodePerPkg * nCorePerNode);
726       }
727     } else {
728       if (__kmp_tile_depth) { // no NUMA, tiles
729         KMP_INFORM(TopologyExtraTile, "KMP_AFFINITY", nPackages, nTilePerPkg,
730                    nCorePerTile, __kmp_nThreadsPerCore, __kmp_ncores);
731       } else { // no NUMA, no tiles
732         kmp_str_buf_t buf;
733         __kmp_str_buf_init(&buf);
734         __kmp_str_buf_print(&buf, "%d", nPackages);
735         KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
736                    __kmp_nThreadsPerCore, __kmp_ncores);
737         __kmp_str_buf_free(&buf);
738       }
739     }
740   }
741 
742   if (__kmp_affinity_type == affinity_none) {
743     __kmp_free(retval);
744     KMP_CPU_FREE(oldMask);
745     return 0;
746   }
747 
748   int depth_full = depth; // number of levels before compressing
749   // Find any levels with radix 1, and remove them from the map
750   // (except for the package level).
751   depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth,
752                                                  levels);
753   KMP_DEBUG_ASSERT(__kmp_affinity_gran != affinity_gran_default);
754   if (__kmp_affinity_gran_levels < 0) {
755     // Set the granularity level based on what levels are modeled
756     // in the machine topology map.
757     __kmp_affinity_gran_levels = 0; // lowest level (e.g. fine)
758     if (__kmp_affinity_gran > affinity_gran_thread) {
759       for (int i = 1; i <= depth_full; ++i) {
760         if (__kmp_affinity_gran <= i) // only count deeper levels
761           break;
762         if (levels[depth_full - i] > 0)
763           __kmp_affinity_gran_levels++;
764       }
765     }
766     if (__kmp_affinity_gran > affinity_gran_package)
767       __kmp_affinity_gran_levels++; // e.g. granularity = group
768   }
769 
770   if (__kmp_affinity_verbose)
771     __kmp_affinity_print_hwloc_tp(retval, nActiveThreads, depth, levels);
772 
773   KMP_CPU_FREE(oldMask);
774   *address2os = retval;
775   return depth;
776 }
777 #endif // KMP_USE_HWLOC
778 
779 // If we don't know how to retrieve the machine's processor topology, or
780 // encounter an error in doing so, this routine is called to form a "flat"
781 // mapping of os thread id's <-> processor id's.
782 static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
783                                           kmp_i18n_id_t *const msg_id) {
784   *address2os = NULL;
785   *msg_id = kmp_i18n_null;
786 
787   // Even if __kmp_affinity_type == affinity_none, this routine might still be
788   // called to set __kmp_ncores, as well as
789   // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
790   if (!KMP_AFFINITY_CAPABLE()) {
791     KMP_ASSERT(__kmp_affinity_type == affinity_none);
792     __kmp_ncores = nPackages = __kmp_xproc;
793     __kmp_nThreadsPerCore = nCoresPerPkg = 1;
794     if (__kmp_affinity_verbose) {
795       KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
796       KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
797       KMP_INFORM(Uniform, "KMP_AFFINITY");
798       KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
799                  __kmp_nThreadsPerCore, __kmp_ncores);
800     }
801     return 0;
802   }
803 
804   // When affinity is off, this routine will still be called to set
805   // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
806   // Make sure all these vars are set correctly, and return now if affinity is
807   // not enabled.
808   __kmp_ncores = nPackages = __kmp_avail_proc;
809   __kmp_nThreadsPerCore = nCoresPerPkg = 1;
810   if (__kmp_affinity_verbose) {
811     char buf[KMP_AFFIN_MASK_PRINT_LEN];
812     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
813                               __kmp_affin_fullMask);
814 
815     KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
816     if (__kmp_affinity_respect_mask) {
817       KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
818     } else {
819       KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
820     }
821     KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
822     KMP_INFORM(Uniform, "KMP_AFFINITY");
823     KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
824                __kmp_nThreadsPerCore, __kmp_ncores);
825   }
826   KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
827   __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
828   if (__kmp_affinity_type == affinity_none) {
829     int avail_ct = 0;
830     int i;
831     KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
832       if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask))
833         continue;
834       __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
835     }
836     return 0;
837   }
838 
839   // Construct the data structure to be returned.
840   *address2os =
841       (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
842   int avail_ct = 0;
843   unsigned int i;
844   KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
845     // Skip this proc if it is not included in the machine model.
846     if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
847       continue;
848     }
849     __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
850     Address addr(1);
851     addr.labels[0] = i;
852     (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
853   }
854   if (__kmp_affinity_verbose) {
855     KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
856   }
857 
858   if (__kmp_affinity_gran_levels < 0) {
859     // Only the package level is modeled in the machine topology map,
860     // so the #levels of granularity is either 0 or 1.
861     if (__kmp_affinity_gran > affinity_gran_package) {
862       __kmp_affinity_gran_levels = 1;
863     } else {
864       __kmp_affinity_gran_levels = 0;
865     }
866   }
867   return 1;
868 }
869 
870 #if KMP_GROUP_AFFINITY
871 
872 // If multiple Windows* OS processor groups exist, we can create a 2-level
873 // topology map with the groups at level 0 and the individual procs at level 1.
874 // This facilitates letting the threads float among all procs in a group,
875 // if granularity=group (the default when there are multiple groups).
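// Each OS proc id is split into a group number and an offset within that group;
// e.g. with a 64-bit DWORD_PTR, OS proc 70 maps to group 1 (70 / 64), offset 6
// (70 % 64).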
876 static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
877                                                 kmp_i18n_id_t *const msg_id) {
878   *address2os = NULL;
879   *msg_id = kmp_i18n_null;
880 
881   // If we aren't affinity capable, then return now.
882   // The flat mapping will be used.
883   if (!KMP_AFFINITY_CAPABLE()) {
884     // FIXME set *msg_id
885     return -1;
886   }
887 
888   // Construct the data structure to be returned.
889   *address2os =
890       (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
891   KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
892   __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
893   int avail_ct = 0;
894   int i;
895   KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
896     // Skip this proc if it is not included in the machine model.
897     if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
898       continue;
899     }
900     __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
901     Address addr(2);
902     addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
903     addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
904     (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
905 
906     if (__kmp_affinity_verbose) {
907       KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
908                  addr.labels[1]);
909     }
910   }
911 
912   if (__kmp_affinity_gran_levels < 0) {
913     if (__kmp_affinity_gran == affinity_gran_group) {
914       __kmp_affinity_gran_levels = 1;
915     } else if ((__kmp_affinity_gran == affinity_gran_fine) ||
916                (__kmp_affinity_gran == affinity_gran_thread)) {
917       __kmp_affinity_gran_levels = 0;
918     } else {
919       const char *gran_str = NULL;
920       if (__kmp_affinity_gran == affinity_gran_core) {
921         gran_str = "core";
922       } else if (__kmp_affinity_gran == affinity_gran_package) {
923         gran_str = "package";
924       } else if (__kmp_affinity_gran == affinity_gran_node) {
925         gran_str = "node";
926       } else {
927         KMP_ASSERT(0);
928       }
929 
930       // Warning: can't use affinity granularity "gran" with the group topology
931       // method; falling back to "thread" granularity.
932       __kmp_affinity_gran_levels = 0;
933     }
934   }
935   return 2;
936 }
937 
938 #endif /* KMP_GROUP_AFFINITY */
939 
940 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
941 
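// Return the number of bits needed to encode 'count' distinct values, i.e. the
// smallest r such that (1 << r) >= count.  For example, count = 6 gives r = 3
// and count = 4 gives r = 2.  Used below to decode the Apic Id bit fields.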
942 static int __kmp_cpuid_mask_width(int count) {
943   int r = 0;
944 
945   while ((1 << r) < count)
946     ++r;
947   return r;
948 }
949 
950 class apicThreadInfo {
951 public:
952   unsigned osId; // param to __kmp_affinity_bind_thread
953   unsigned apicId; // from cpuid after binding
954   unsigned maxCoresPerPkg; //      ""
955   unsigned maxThreadsPerPkg; //      ""
956   unsigned pkgId; // inferred from above values
957   unsigned coreId; //      ""
958   unsigned threadId; //      ""
959 };
960 
961 static int __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a,
962                                                    const void *b) {
963   const apicThreadInfo *aa = (const apicThreadInfo *)a;
964   const apicThreadInfo *bb = (const apicThreadInfo *)b;
965   if (aa->osId < bb->osId)
966     return -1;
967   if (aa->osId > bb->osId)
968     return 1;
969   return 0;
970 }
971 
972 static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
973                                                      const void *b) {
974   const apicThreadInfo *aa = (const apicThreadInfo *)a;
975   const apicThreadInfo *bb = (const apicThreadInfo *)b;
976   if (aa->pkgId < bb->pkgId)
977     return -1;
978   if (aa->pkgId > bb->pkgId)
979     return 1;
980   if (aa->coreId < bb->coreId)
981     return -1;
982   if (aa->coreId > bb->coreId)
983     return 1;
984   if (aa->threadId < bb->threadId)
985     return -1;
986   if (aa->threadId > bb->threadId)
987     return 1;
988   return 0;
989 }
990 
991 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
992 // an algorithm which cycles through the available os threads, setting
993 // the current thread's affinity mask to that thread, and then retrieves
994 // the Apic Id for each thread context using the cpuid instruction.
995 static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
996                                             kmp_i18n_id_t *const msg_id) {
997   kmp_cpuid buf;
998   int rc;
999   *address2os = NULL;
1000   *msg_id = kmp_i18n_null;
1001 
1002   // Check if cpuid leaf 4 is supported.
1003   __kmp_x86_cpuid(0, 0, &buf);
1004   if (buf.eax < 4) {
1005     *msg_id = kmp_i18n_str_NoLeaf4Support;
1006     return -1;
1007   }
1008 
1009   // The algorithm used starts by setting the affinity to each available thread
1010   // and retrieving info from the cpuid instruction, so if we are not capable of
1011   // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then we
1012   // need to do something else - use the defaults that we calculated from
1013   // issuing cpuid without binding to each proc.
1014   if (!KMP_AFFINITY_CAPABLE()) {
1015     // Hack to try and infer the machine topology using only the data
1016     // available from cpuid on the current thread, and __kmp_xproc.
1017     KMP_ASSERT(__kmp_affinity_type == affinity_none);
1018 
1019     // Get an upper bound on the number of threads per package using cpuid(1).
1020     // On some OS/chip combinations where HT is supported by the chip but is
1021     // disabled, this value will be 2 on a single core chip. Usually, it will be
1022     // 2 if HT is enabled and 1 if HT is disabled.
1023     __kmp_x86_cpuid(1, 0, &buf);
1024     int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
1025     if (maxThreadsPerPkg == 0) {
1026       maxThreadsPerPkg = 1;
1027     }
1028 
1029     // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
1030     // value.
1031     //
1032     // The author of cpu_count.cpp treated this as only an upper bound on the
1033     // number of cores, but I haven't seen any cases where it was greater than
1034     // the actual number of cores, so we will treat it as exact in this block of
1035     // code.
1036     //
1037     // First, we need to check if cpuid(4) is supported on this chip. To see if
1038     // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
1039     // greater.
1040     __kmp_x86_cpuid(0, 0, &buf);
1041     if (buf.eax >= 4) {
1042       __kmp_x86_cpuid(4, 0, &buf);
1043       nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1044     } else {
1045       nCoresPerPkg = 1;
1046     }
1047 
1048     // There is no way to reliably tell if HT is enabled without issuing the
1049     // cpuid instruction from every thread and correlating the cpuid info, so
1050     // if the machine is not affinity capable, we assume that HT is off. We have
1051     // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
1052     // does not support HT.
1053     //
1054     // - Older OSes are usually found on machines with older chips, which do not
1055     //   support HT.
1056     // - The performance penalty for mistakenly identifying a machine as HT when
1057     //   it isn't (which results in blocktime being incorrectly set to 0) is
1058     //   greater than the penalty for mistakenly identifying a machine as
1059     //   being 1 thread/core when it is really HT enabled (which results in
1060     //   blocktime being incorrectly set to a positive value).
1061     __kmp_ncores = __kmp_xproc;
1062     nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1063     __kmp_nThreadsPerCore = 1;
1064     if (__kmp_affinity_verbose) {
1065       KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
1066       KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1067       if (__kmp_affinity_uniform_topology()) {
1068         KMP_INFORM(Uniform, "KMP_AFFINITY");
1069       } else {
1070         KMP_INFORM(NonUniform, "KMP_AFFINITY");
1071       }
1072       KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1073                  __kmp_nThreadsPerCore, __kmp_ncores);
1074     }
1075     return 0;
1076   }
1077 
1078   // From here on, we can assume that it is safe to call
1079   // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
1080   // __kmp_affinity_type = affinity_none.
1081 
1082   // Save the affinity mask for the current thread.
1083   kmp_affin_mask_t *oldMask;
1084   KMP_CPU_ALLOC(oldMask);
1085   KMP_ASSERT(oldMask != NULL);
1086   __kmp_get_system_affinity(oldMask, TRUE);
1087 
1088   // Run through each of the available contexts, binding the current thread
1089   // to it, and obtaining the pertinent information using the cpuid instr.
1090   //
1091   // The relevant information is:
1092   // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
1093   //     has a unique Apic Id, which is of the form pkg# : core# : thread#.
1094   // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
1095   //     of this field determines the width of the core# + thread# fields in the
1096   //     Apic Id. It is also an upper bound on the number of threads per
1097   //     package, but it has been verified that situations happen where it is not
1098   //     exact. In particular, on certain OS/chip combinations where Intel(R)
1099   //     Hyper-Threading Technology is supported by the chip but has been
1100   //     disabled, the value of this field will be 2 (for a single core chip).
1101   //     On other OS/chip combinations supporting Intel(R) Hyper-Threading
1102   //     Technology, the value of this field will be 1 when Intel(R)
1103   //     Hyper-Threading Technology is disabled and 2 when it is enabled.
1104   // - Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4). The value
1105   //     of this field (+1) determines the width of the core# field in the Apic
1106   //     Id. The comments in "cpucount.cpp" say that this value is an upper
1107   //     bound, but the IA-32 architecture manual says that it is exactly the
1108   //     number of cores per package, and I haven't seen any case where it
1109   //     wasn't.
1110   //
1111   // From this information, deduce the package Id, core Id, and thread Id,
1112   // and set the corresponding fields in the apicThreadInfo struct.
1113   unsigned i;
1114   apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
1115       __kmp_avail_proc * sizeof(apicThreadInfo));
1116   unsigned nApics = 0;
1117   KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
1118     // Skip this proc if it is not included in the machine model.
1119     if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
1120       continue;
1121     }
1122     KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
1123 
1124     __kmp_affinity_dispatch->bind_thread(i);
1125     threadInfo[nApics].osId = i;
1126 
1127     // The apic id and max threads per pkg come from cpuid(1).
1128     __kmp_x86_cpuid(1, 0, &buf);
1129     if (((buf.edx >> 9) & 1) == 0) {
1130       __kmp_set_system_affinity(oldMask, TRUE);
1131       __kmp_free(threadInfo);
1132       KMP_CPU_FREE(oldMask);
1133       *msg_id = kmp_i18n_str_ApicNotPresent;
1134       return -1;
1135     }
1136     threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
1137     threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
1138     if (threadInfo[nApics].maxThreadsPerPkg == 0) {
1139       threadInfo[nApics].maxThreadsPerPkg = 1;
1140     }
1141 
1142     // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
1143     // value.
1144     //
1145     // First, we need to check if cpuid(4) is supported on this chip. To see if
1146     // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
1147     // or greater.
1148     __kmp_x86_cpuid(0, 0, &buf);
1149     if (buf.eax >= 4) {
1150       __kmp_x86_cpuid(4, 0, &buf);
1151       threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1152     } else {
1153       threadInfo[nApics].maxCoresPerPkg = 1;
1154     }
1155 
1156     // Infer the pkgId / coreId / threadId using only the info obtained locally.
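    // For example, with maxThreadsPerPkg = 16 and maxCoresPerPkg = 8, widthCT
    // is 4, widthC is 3 and widthT is 1, so apicId 0b10110 (22) decodes to
    // pkgId = 1, coreId = 3 and threadId = 0.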
1157     int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
1158     threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
1159 
1160     int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
1161     int widthT = widthCT - widthC;
1162     if (widthT < 0) {
1163       // I've never seen this one happen, but I suppose it could, if the cpuid
1164       // instruction on a chip was really screwed up. Make sure to restore the
1165       // affinity mask before the tail call.
1166       __kmp_set_system_affinity(oldMask, TRUE);
1167       __kmp_free(threadInfo);
1168       KMP_CPU_FREE(oldMask);
1169       *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1170       return -1;
1171     }
1172 
1173     int maskC = (1 << widthC) - 1;
1174     threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;
1175 
1176     int maskT = (1 << widthT) - 1;
1177     threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;
1178 
1179     nApics++;
1180   }
1181 
1182   // We've collected all the info we need.
1183   // Restore the old affinity mask for this thread.
1184   __kmp_set_system_affinity(oldMask, TRUE);
1185 
1186   // If there's only one thread context to bind to, form an Address object
1187   // with depth 1 and return immediately (or, if affinity is off, set
1188   // address2os to NULL and return).
1189   //
1190   // If it is configured to omit the package level when there is only a single
1191   // package, the logic at the end of this routine won't work if there is only
1192   // a single thread - it would try to form an Address object with depth 0.
1193   KMP_ASSERT(nApics > 0);
1194   if (nApics == 1) {
1195     __kmp_ncores = nPackages = 1;
1196     __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1197     if (__kmp_affinity_verbose) {
1198       char buf[KMP_AFFIN_MASK_PRINT_LEN];
1199       __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1200 
1201       KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1202       if (__kmp_affinity_respect_mask) {
1203         KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1204       } else {
1205         KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1206       }
1207       KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1208       KMP_INFORM(Uniform, "KMP_AFFINITY");
1209       KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1210                  __kmp_nThreadsPerCore, __kmp_ncores);
1211     }
1212 
1213     if (__kmp_affinity_type == affinity_none) {
1214       __kmp_free(threadInfo);
1215       KMP_CPU_FREE(oldMask);
1216       return 0;
1217     }
1218 
1219     *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
1220     Address addr(1);
1221     addr.labels[0] = threadInfo[0].pkgId;
1222     (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
1223 
1224     if (__kmp_affinity_gran_levels < 0) {
1225       __kmp_affinity_gran_levels = 0;
1226     }
1227 
1228     if (__kmp_affinity_verbose) {
1229       __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1230     }
1231 
1232     __kmp_free(threadInfo);
1233     KMP_CPU_FREE(oldMask);
1234     return 1;
1235   }
1236 
1237   // Sort the threadInfo table by physical Id.
1238   qsort(threadInfo, nApics, sizeof(*threadInfo),
1239         __kmp_affinity_cmp_apicThreadInfo_phys_id);
1240 
1241   // The table is now sorted by pkgId / coreId / threadId, but we really don't
1242   // know the radix of any of the fields. pkgId's may be sparsely assigned among
1243   // the chips on a system. Although coreId's are usually assigned
1244   // [0 .. coresPerPkg-1] and threadId's are usually assigned
1245   // [0..threadsPerCore-1], we don't want to make any such assumptions.
1246   //
1247   // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
1248   // total # packages) are at this point - we want to determine that now. We
1249   // only have an upper bound on the first two figures.
1250   //
1251   // We also perform a consistency check at this point: the values returned by
1252   // the cpuid instruction for any thread bound to a given package had better
1253   // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1254   nPackages = 1;
1255   nCoresPerPkg = 1;
1256   __kmp_nThreadsPerCore = 1;
1257   unsigned nCores = 1;
1258 
1259   unsigned pkgCt = 1; // to determine radii
1260   unsigned lastPkgId = threadInfo[0].pkgId;
1261   unsigned coreCt = 1;
1262   unsigned lastCoreId = threadInfo[0].coreId;
1263   unsigned threadCt = 1;
1264   unsigned lastThreadId = threadInfo[0].threadId;
1265 
1266   // intra-pkg consistency checks
1267   unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1268   unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1269 
1270   for (i = 1; i < nApics; i++) {
1271     if (threadInfo[i].pkgId != lastPkgId) {
1272       nCores++;
1273       pkgCt++;
1274       lastPkgId = threadInfo[i].pkgId;
1275       if ((int)coreCt > nCoresPerPkg)
1276         nCoresPerPkg = coreCt;
1277       coreCt = 1;
1278       lastCoreId = threadInfo[i].coreId;
1279       if ((int)threadCt > __kmp_nThreadsPerCore)
1280         __kmp_nThreadsPerCore = threadCt;
1281       threadCt = 1;
1282       lastThreadId = threadInfo[i].threadId;
1283 
1284       // This is a different package, so go on to the next iteration without
1285       // doing any consistency checks. Reset the consistency check vars, though.
1286       prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1287       prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1288       continue;
1289     }
1290 
1291     if (threadInfo[i].coreId != lastCoreId) {
1292       nCores++;
1293       coreCt++;
1294       lastCoreId = threadInfo[i].coreId;
1295       if ((int)threadCt > __kmp_nThreadsPerCore)
1296         __kmp_nThreadsPerCore = threadCt;
1297       threadCt = 1;
1298       lastThreadId = threadInfo[i].threadId;
1299     } else if (threadInfo[i].threadId != lastThreadId) {
1300       threadCt++;
1301       lastThreadId = threadInfo[i].threadId;
1302     } else {
1303       __kmp_free(threadInfo);
1304       KMP_CPU_FREE(oldMask);
1305       *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1306       return -1;
1307     }
1308 
1309     // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
1310   // fields agree among all the threads bound to a given package.
1311     if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
1312         (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1313       __kmp_free(threadInfo);
1314       KMP_CPU_FREE(oldMask);
1315       *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1316       return -1;
1317     }
1318   }
1319   nPackages = pkgCt;
1320   if ((int)coreCt > nCoresPerPkg)
1321     nCoresPerPkg = coreCt;
1322   if ((int)threadCt > __kmp_nThreadsPerCore)
1323     __kmp_nThreadsPerCore = threadCt;
1324 
1325   // When affinity is off, this routine will still be called to set
1326   // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1327   // Make sure all these vars are set correctly, and return now if affinity is
1328   // not enabled.
1329   __kmp_ncores = nCores;
1330   if (__kmp_affinity_verbose) {
1331     char buf[KMP_AFFIN_MASK_PRINT_LEN];
1332     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1333 
1334     KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1335     if (__kmp_affinity_respect_mask) {
1336       KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1337     } else {
1338       KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1339     }
1340     KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1341     if (__kmp_affinity_uniform_topology()) {
1342       KMP_INFORM(Uniform, "KMP_AFFINITY");
1343     } else {
1344       KMP_INFORM(NonUniform, "KMP_AFFINITY");
1345     }
1346     KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1347                __kmp_nThreadsPerCore, __kmp_ncores);
1348   }
1349   KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
1350   KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
1351   __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
1352   for (i = 0; i < nApics; ++i) {
1353     __kmp_pu_os_idx[i] = threadInfo[i].osId;
1354   }
1355   if (__kmp_affinity_type == affinity_none) {
1356     __kmp_free(threadInfo);
1357     KMP_CPU_FREE(oldMask);
1358     return 0;
1359   }
1360 
1361   // Now that we've determined the number of packages, the number of cores per
1362   // package, and the number of threads per core, we can construct the data
1363   // structure that is to be returned.
1364   int pkgLevel = 0;
1365   int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1366   int threadLevel =
1367       (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1368   unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
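  // Illustrative sketch (hypothetical values, not from the original source):
  // with nCoresPerPkg == 4 and __kmp_nThreadsPerCore == 2, this gives
  // pkgLevel = 0, coreLevel = 1, threadLevel = 2 and depth = 3. If every
  // package had a single core (nCoresPerPkg == 1) but 2 HW threads, the core
  // level would be dropped: coreLevel = -1, threadLevel = 1, depth = 2.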
1369 
1370   KMP_ASSERT(depth > 0);
1371   *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1372 
1373   for (i = 0; i < nApics; ++i) {
1374     Address addr(depth);
1375     unsigned os = threadInfo[i].osId;
1376     int d = 0;
1377 
1378     if (pkgLevel >= 0) {
1379       addr.labels[d++] = threadInfo[i].pkgId;
1380     }
1381     if (coreLevel >= 0) {
1382       addr.labels[d++] = threadInfo[i].coreId;
1383     }
1384     if (threadLevel >= 0) {
1385       addr.labels[d++] = threadInfo[i].threadId;
1386     }
1387     (*address2os)[i] = AddrUnsPair(addr, os);
1388   }
1389 
1390   if (__kmp_affinity_gran_levels < 0) {
1391     // Set the granularity level based on what levels are modeled in the machine
1392     // topology map.
1393     __kmp_affinity_gran_levels = 0;
1394     if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1395       __kmp_affinity_gran_levels++;
1396     }
1397     if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1398       __kmp_affinity_gran_levels++;
1399     }
1400     if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1401       __kmp_affinity_gran_levels++;
1402     }
1403   }
1404 
1405   if (__kmp_affinity_verbose) {
1406     __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1407                                   coreLevel, threadLevel);
1408   }
1409 
1410   __kmp_free(threadInfo);
1411   KMP_CPU_FREE(oldMask);
1412   return depth;
1413 }
1414 
1415 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1416 // architectures support a newer interface for specifying the x2APIC Ids,
1417 // based on cpuid leaf 11.
1418 static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1419                                               kmp_i18n_id_t *const msg_id) {
1420   kmp_cpuid buf;
1421   *address2os = NULL;
1422   *msg_id = kmp_i18n_null;
1423 
1424   // Check to see if cpuid leaf 11 is supported.
1425   __kmp_x86_cpuid(0, 0, &buf);
1426   if (buf.eax < 11) {
1427     *msg_id = kmp_i18n_str_NoLeaf11Support;
1428     return -1;
1429   }
1430   __kmp_x86_cpuid(11, 0, &buf);
1431   if (buf.ebx == 0) {
1432     *msg_id = kmp_i18n_str_NoLeaf11Support;
1433     return -1;
1434   }
1435 
1436   // Find the number of levels in the machine topology. While we're at it, get
1437   // the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will try to
1438   // get more accurate values later by explicitly counting them, but get
1439   // reasonable defaults now, in case we return early.
1440   int level;
1441   int threadLevel = -1;
1442   int coreLevel = -1;
1443   int pkgLevel = -1;
1444   __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1445 
1446   for (level = 0;; level++) {
1447     if (level > 31) {
1448       // FIXME: Hack for DPD200163180
1449       //
1450       // If level is big then something went wrong -> exiting
1451       //
1452       // There could actually be 32 valid levels in the machine topology, but so
1453       // far, the only machine we have seen which does not exit this loop before
1454       // iteration 32 has fubar x2APIC settings.
1455       //
1456       // For now, just reject this case based upon loop trip count.
1457       *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1458       return -1;
1459     }
1460     __kmp_x86_cpuid(11, level, &buf);
1461     if (buf.ebx == 0) {
1462       if (pkgLevel < 0) {
1463         // Will infer nPackages from __kmp_xproc
1464         pkgLevel = level;
1465         level++;
1466       }
1467       break;
1468     }
1469     int kind = (buf.ecx >> 8) & 0xff;
1470     if (kind == 1) {
1471       // SMT level
1472       threadLevel = level;
1473       coreLevel = -1;
1474       pkgLevel = -1;
1475       __kmp_nThreadsPerCore = buf.ebx & 0xffff;
1476       if (__kmp_nThreadsPerCore == 0) {
1477         *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1478         return -1;
1479       }
1480     } else if (kind == 2) {
1481       // core level
1482       coreLevel = level;
1483       pkgLevel = -1;
1484       nCoresPerPkg = buf.ebx & 0xffff;
1485       if (nCoresPerPkg == 0) {
1486         *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1487         return -1;
1488       }
1489     } else {
1490       if (level <= 0) {
1491         *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1492         return -1;
1493       }
1494       if (pkgLevel >= 0) {
1495         continue;
1496       }
1497       pkgLevel = level;
1498       nPackages = buf.ebx & 0xffff;
1499       if (nPackages == 0) {
1500         *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1501         return -1;
1502       }
1503     }
1504   }
1505   int depth = level;
1506 
1507   // In the above loop, "level" was counted from the finest level (usually
1508   // thread) to the coarsest.  The caller expects that we will place the labels
1509   // in (*address2os)[].first.labels[] in the inverse order, so we need to
1510   // invert the vars saying which level means what.
1511   if (threadLevel >= 0) {
1512     threadLevel = depth - threadLevel - 1;
1513   }
1514   if (coreLevel >= 0) {
1515     coreLevel = depth - coreLevel - 1;
1516   }
1517   KMP_DEBUG_ASSERT(pkgLevel >= 0);
1518   pkgLevel = depth - pkgLevel - 1;
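  // Worked example (hypothetical, not from the original source): if the cpuid
  // enumeration produced threadLevel = 0, coreLevel = 1, pkgLevel = 2 with
  // depth = 3, then after inversion threadLevel = 2, coreLevel = 1 and
  // pkgLevel = 0, i.e. labels[0] holds the package id and labels[depth - 1]
  // holds the SMT thread id.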
1519 
1520   // The algorithm used starts by setting the affinity to each available thread
1521   // and retrieving info from the cpuid instruction, so if we are not capable of
  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then we
1523   // need to do something else - use the defaults that we calculated from
1524   // issuing cpuid without binding to each proc.
1525   if (!KMP_AFFINITY_CAPABLE()) {
1526     // Hack to try and infer the machine topology using only the data
1527     // available from cpuid on the current thread, and __kmp_xproc.
1528     KMP_ASSERT(__kmp_affinity_type == affinity_none);
1529 
1530     __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1531     nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1532     if (__kmp_affinity_verbose) {
1533       KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1534       KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1535       if (__kmp_affinity_uniform_topology()) {
1536         KMP_INFORM(Uniform, "KMP_AFFINITY");
1537       } else {
1538         KMP_INFORM(NonUniform, "KMP_AFFINITY");
1539       }
1540       KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1541                  __kmp_nThreadsPerCore, __kmp_ncores);
1542     }
1543     return 0;
1544   }
1545 
1546   // From here on, we can assume that it is safe to call
1547   // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
1548   // __kmp_affinity_type = affinity_none.
1549 
1550   // Save the affinity mask for the current thread.
1551   kmp_affin_mask_t *oldMask;
1552   KMP_CPU_ALLOC(oldMask);
1553   __kmp_get_system_affinity(oldMask, TRUE);
1554 
1555   // Allocate the data structure to be returned.
1556   AddrUnsPair *retval =
1557       (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1558 
1559   // Run through each of the available contexts, binding the current thread
1560   // to it, and obtaining the pertinent information using the cpuid instr.
1561   unsigned int proc;
1562   int nApics = 0;
1563   KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
1564     // Skip this proc if it is not included in the machine model.
1565     if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
1566       continue;
1567     }
1568     KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1569 
1570     __kmp_affinity_dispatch->bind_thread(proc);
1571 
1572     // Extract labels for each level in the machine topology map from Apic ID.
1573     Address addr(depth);
1574     int prev_shift = 0;
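    // Illustrative decode (hypothetical machine, not from the original
    // source): with 2 SMT threads per core and at most 8 cores per package,
    // cpuid(11, 0) reports shift = 1 and cpuid(11, 1) reports shift = 4.
    // For apicId 0x2b the loop below then yields:
    //   level 0: labels[depth - 1] = (0x2b & 0x1) >> 0 = 1   (SMT thread)
    //   level 1: labels[depth - 2] = (0x2b & 0xf) >> 1 = 5   (core)
    //   last:    labels[0]         =  0x2b >> 4        = 2   (package)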
1575 
1576     for (level = 0; level < depth; level++) {
1577       __kmp_x86_cpuid(11, level, &buf);
1578       unsigned apicId = buf.edx;
1579       if (buf.ebx == 0) {
1580         if (level != depth - 1) {
1581           KMP_CPU_FREE(oldMask);
1582           *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1583           return -1;
1584         }
1585         addr.labels[depth - level - 1] = apicId >> prev_shift;
1586         level++;
1587         break;
1588       }
1589       int shift = buf.eax & 0x1f;
1590       int mask = (1 << shift) - 1;
1591       addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1592       prev_shift = shift;
1593     }
1594     if (level != depth) {
1595       KMP_CPU_FREE(oldMask);
1596       *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1597       return -1;
1598     }
1599 
1600     retval[nApics] = AddrUnsPair(addr, proc);
1601     nApics++;
1602   }
1603 
1604   // We've collected all the info we need.
1605   // Restore the old affinity mask for this thread.
1606   __kmp_set_system_affinity(oldMask, TRUE);
1607 
1608   // If there's only one thread context to bind to, return now.
1609   KMP_ASSERT(nApics > 0);
1610   if (nApics == 1) {
1611     __kmp_ncores = nPackages = 1;
1612     __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1613     if (__kmp_affinity_verbose) {
1614       char buf[KMP_AFFIN_MASK_PRINT_LEN];
1615       __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1616 
1617       KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1618       if (__kmp_affinity_respect_mask) {
1619         KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1620       } else {
1621         KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1622       }
1623       KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1624       KMP_INFORM(Uniform, "KMP_AFFINITY");
1625       KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1626                  __kmp_nThreadsPerCore, __kmp_ncores);
1627     }
1628 
1629     if (__kmp_affinity_type == affinity_none) {
1630       __kmp_free(retval);
1631       KMP_CPU_FREE(oldMask);
1632       return 0;
1633     }
1634 
1635     // Form an Address object which only includes the package level.
1636     Address addr(1);
1637     addr.labels[0] = retval[0].first.labels[pkgLevel];
1638     retval[0].first = addr;
1639 
1640     if (__kmp_affinity_gran_levels < 0) {
1641       __kmp_affinity_gran_levels = 0;
1642     }
1643 
1644     if (__kmp_affinity_verbose) {
1645       __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1646     }
1647 
1648     *address2os = retval;
1649     KMP_CPU_FREE(oldMask);
1650     return 1;
1651   }
1652 
1653   // Sort the table by physical Id.
1654   qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1655 
1656   // Find the radix at each of the levels.
1657   unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1658   unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1659   unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1660   unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1661   for (level = 0; level < depth; level++) {
1662     totals[level] = 1;
1663     maxCt[level] = 1;
1664     counts[level] = 1;
1665     last[level] = retval[0].first.labels[level];
1666   }
1667 
  // From here on, the iteration variable "level" indexes labels[] directly,
  // running from the coarsest level (the package, at index 0) to the finest,
  // i.e. we iterate forward through (*address2os)[].first.labels[] - in the
  // previous loops, the labels were filled in backwards.
1672   for (proc = 1; (int)proc < nApics; proc++) {
1673     int level;
1674     for (level = 0; level < depth; level++) {
1675       if (retval[proc].first.labels[level] != last[level]) {
1676         int j;
1677         for (j = level + 1; j < depth; j++) {
1678           totals[j]++;
1679           counts[j] = 1;
          // The commented-out line below would cause incorrect topology
          // information to be printed whenever the maximum value for some
          // level (maxCt[level]) is encountered before a smaller value later
          // in the array. For example, if pkg0 has 4 cores and pkg1 has 2
          // cores, resetting maxCt here would leave maxCt[1] == 2, whereas it
          // should be 4.
          // TODO!!! Check whether it can remain commented out safely
1687           // maxCt[j] = 1;
1688           last[j] = retval[proc].first.labels[j];
1689         }
1690         totals[level]++;
1691         counts[level]++;
1692         if (counts[level] > maxCt[level]) {
1693           maxCt[level] = counts[level];
1694         }
1695         last[level] = retval[proc].first.labels[level];
1696         break;
1697       } else if (level == depth - 1) {
1698         __kmp_free(last);
1699         __kmp_free(maxCt);
1700         __kmp_free(counts);
1701         __kmp_free(totals);
1702         __kmp_free(retval);
1703         KMP_CPU_FREE(oldMask);
1704         *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1705         return -1;
1706       }
1707     }
1708   }
1709 
1710   // When affinity is off, this routine will still be called to set
1711   // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1712   // Make sure all these vars are set correctly, and return if affinity is not
1713   // enabled.
1714   if (threadLevel >= 0) {
1715     __kmp_nThreadsPerCore = maxCt[threadLevel];
1716   } else {
1717     __kmp_nThreadsPerCore = 1;
1718   }
1719   nPackages = totals[pkgLevel];
1720 
1721   if (coreLevel >= 0) {
1722     __kmp_ncores = totals[coreLevel];
1723     nCoresPerPkg = maxCt[coreLevel];
1724   } else {
1725     __kmp_ncores = nPackages;
1726     nCoresPerPkg = 1;
1727   }
1728 
1729   // Check to see if the machine topology is uniform
1730   unsigned prod = maxCt[0];
1731   for (level = 1; level < depth; level++) {
1732     prod *= maxCt[level];
1733   }
1734   bool uniform = (prod == totals[level - 1]);
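  // Example (hypothetical, not from the original source): 2 packages x 4 cores
  // x 2 threads, fully populated, gives prod = 2 * 4 * 2 = 16, which equals
  // totals[depth - 1] (the total HW thread count), so the topology is uniform.
  // If one package had only 2 cores, totals[depth - 1] would be 12 while prod
  // stays 16, and the topology is reported as non-uniform.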
1735 
1736   // Print the machine topology summary.
1737   if (__kmp_affinity_verbose) {
1738     char mask[KMP_AFFIN_MASK_PRINT_LEN];
1739     __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1740 
1741     KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1742     if (__kmp_affinity_respect_mask) {
1743       KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1744     } else {
1745       KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1746     }
1747     KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1748     if (uniform) {
1749       KMP_INFORM(Uniform, "KMP_AFFINITY");
1750     } else {
1751       KMP_INFORM(NonUniform, "KMP_AFFINITY");
1752     }
1753 
1754     kmp_str_buf_t buf;
1755     __kmp_str_buf_init(&buf);
1756 
1757     __kmp_str_buf_print(&buf, "%d", totals[0]);
1758     for (level = 1; level <= pkgLevel; level++) {
1759       __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1760     }
1761     KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1762                __kmp_nThreadsPerCore, __kmp_ncores);
1763 
1764     __kmp_str_buf_free(&buf);
1765   }
1766   KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
1767   KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
1768   __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
1769   for (proc = 0; (int)proc < nApics; ++proc) {
1770     __kmp_pu_os_idx[proc] = retval[proc].second;
1771   }
1772   if (__kmp_affinity_type == affinity_none) {
1773     __kmp_free(last);
1774     __kmp_free(maxCt);
1775     __kmp_free(counts);
1776     __kmp_free(totals);
1777     __kmp_free(retval);
1778     KMP_CPU_FREE(oldMask);
1779     return 0;
1780   }
1781 
  // Find any levels with radix 1, and remove them from the map
1783   // (except for the package level).
1784   int new_depth = 0;
1785   for (level = 0; level < depth; level++) {
1786     if ((maxCt[level] == 1) && (level != pkgLevel)) {
1787       continue;
1788     }
1789     new_depth++;
1790   }
1791 
1792   // If we are removing any levels, allocate a new vector to return,
1793   // and copy the relevant information to it.
1794   if (new_depth != depth) {
1795     AddrUnsPair *new_retval =
1796         (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1797     for (proc = 0; (int)proc < nApics; proc++) {
1798       Address addr(new_depth);
1799       new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1800     }
1801     int new_level = 0;
1802     int newPkgLevel = -1;
1803     int newCoreLevel = -1;
1804     int newThreadLevel = -1;
1805     int i;
1806     for (level = 0; level < depth; level++) {
1807       if ((maxCt[level] == 1) && (level != pkgLevel)) {
1808         // Remove this level. Never remove the package level
1809         continue;
1810       }
1811       if (level == pkgLevel) {
1812         newPkgLevel = new_level;
1813       }
1814       if (level == coreLevel) {
1815         newCoreLevel = new_level;
1816       }
1817       if (level == threadLevel) {
1818         newThreadLevel = new_level;
1819       }
1820       for (proc = 0; (int)proc < nApics; proc++) {
1821         new_retval[proc].first.labels[new_level] =
1822             retval[proc].first.labels[level];
1823       }
1824       new_level++;
1825     }
1826 
1827     __kmp_free(retval);
1828     retval = new_retval;
1829     depth = new_depth;
1830     pkgLevel = newPkgLevel;
1831     coreLevel = newCoreLevel;
1832     threadLevel = newThreadLevel;
1833   }
1834 
1835   if (__kmp_affinity_gran_levels < 0) {
1836     // Set the granularity level based on what levels are modeled
1837     // in the machine topology map.
1838     __kmp_affinity_gran_levels = 0;
1839     if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1840       __kmp_affinity_gran_levels++;
1841     }
1842     if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1843       __kmp_affinity_gran_levels++;
1844     }
1845     if (__kmp_affinity_gran > affinity_gran_package) {
1846       __kmp_affinity_gran_levels++;
1847     }
1848   }
1849 
1850   if (__kmp_affinity_verbose) {
1851     __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, coreLevel,
1852                                   threadLevel);
1853   }
1854 
1855   __kmp_free(last);
1856   __kmp_free(maxCt);
1857   __kmp_free(counts);
1858   __kmp_free(totals);
1859   KMP_CPU_FREE(oldMask);
1860   *address2os = retval;
1861   return depth;
1862 }
1863 
1864 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1865 
1866 #define osIdIndex 0
1867 #define threadIdIndex 1
1868 #define coreIdIndex 2
1869 #define pkgIdIndex 3
1870 #define nodeIdIndex 4
1871 
1872 typedef unsigned *ProcCpuInfo;
1873 static unsigned maxIndex = pkgIdIndex;
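
// Each parsed /proc/cpuinfo record is stored as an array of unsigned values
// indexed by the constants above (plus one slot per node_<n> level beyond
// pkgIdIndex). Illustrative record (hypothetical, not from the original
// source):
//   processor   : 3
//   physical id : 0
//   core id     : 1
// maps to { [osIdIndex] = 3, [threadIdIndex] = UINT_MAX, [coreIdIndex] = 1,
// [pkgIdIndex] = 0 }; missing fields stay UINT_MAX until (possibly)
// auto-assigned later.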
1874 
1875 static int __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b) {
1876   const unsigned *aa = (const unsigned *)a;
1877   const unsigned *bb = (const unsigned *)b;
1878   if (aa[osIdIndex] < bb[osIdIndex])
1879     return -1;
1880   if (aa[osIdIndex] > bb[osIdIndex])
1881     return 1;
1882   return 0;
1883 }
1884 
1885 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
1886                                                   const void *b) {
1887   unsigned i;
1888   const unsigned *aa = *(unsigned *const *)a;
1889   const unsigned *bb = *(unsigned *const *)b;
1890   for (i = maxIndex;; i--) {
1891     if (aa[i] < bb[i])
1892       return -1;
1893     if (aa[i] > bb[i])
1894       return 1;
1895     if (i == osIdIndex)
1896       break;
1897   }
1898   return 0;
1899 }
1900 
1901 #if KMP_USE_HIER_SCHED
1902 // Set the array sizes for the hierarchy layers
1903 static void __kmp_dispatch_set_hierarchy_values() {
  // Set the maximum number of L1's to the number of cores.
  // Set the maximum number of L2's to either the number of cores / 2 for
  // Intel(R) Xeon Phi(TM) coprocessors formerly codenamed Knights Landing,
  // or the number of cores for Intel(R) Xeon(R) processors.
  // Set the maximum number of NUMA nodes and L3's to the number of packages.
1909   __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
1910       nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
1911   __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
1912 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
1913   if (__kmp_mic_type >= mic3)
1914     __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2;
1915   else
1916 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
1917     __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores;
1918   __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages;
1919   __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages;
1920   __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1;
1921   // Set the number of threads per unit
1922   // Number of hardware threads per L1/L2/L3/NUMA/LOOP
1923   __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
1924   __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
1925       __kmp_nThreadsPerCore;
1926 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
1927   if (__kmp_mic_type >= mic3)
1928     __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
1929         2 * __kmp_nThreadsPerCore;
1930   else
1931 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
1932     __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
1933         __kmp_nThreadsPerCore;
1934   __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] =
1935       nCoresPerPkg * __kmp_nThreadsPerCore;
1936   __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] =
1937       nCoresPerPkg * __kmp_nThreadsPerCore;
1938   __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] =
1939       nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
1940 }
1941 
1942 // Return the index into the hierarchy for this tid and layer type (L1, L2, etc)
1943 // i.e., this thread's L1 or this thread's L2, etc.
1944 int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) {
1945   int index = type + 1;
1946   int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
1947   KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST);
1948   if (type == kmp_hier_layer_e::LAYER_THREAD)
1949     return tid;
1950   else if (type == kmp_hier_layer_e::LAYER_LOOP)
1951     return 0;
1952   KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0);
1953   if (tid >= num_hw_threads)
1954     tid = tid % num_hw_threads;
1955   return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index];
1956 }
1957 
1958 // Return the number of t1's per t2
1959 int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) {
1960   int i1 = t1 + 1;
1961   int i2 = t2 + 1;
1962   KMP_DEBUG_ASSERT(i1 <= i2);
1963   KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST);
1964   KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST);
1965   KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0);
1966   // (nthreads/t2) / (nthreads/t1) = t1 / t2
1967   return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1];
1968 }
1969 #endif // KMP_USE_HIER_SCHED
1970 
1971 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1972 // affinity map.
1973 static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os,
1974                                              int *line,
1975                                              kmp_i18n_id_t *const msg_id,
1976                                              FILE *f) {
1977   *address2os = NULL;
1978   *msg_id = kmp_i18n_null;
1979 
  // Scan the file once, counting the number of "processor" (osId) fields and
  // finding the highest value of <n> for any node_<n> field.
1982   char buf[256];
1983   unsigned num_records = 0;
1984   while (!feof(f)) {
1985     buf[sizeof(buf) - 1] = 1;
1986     if (!fgets(buf, sizeof(buf), f)) {
1987       // Read errors presumably because of EOF
1988       break;
1989     }
1990 
1991     char s1[] = "processor";
1992     if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1993       num_records++;
1994       continue;
1995     }
1996 
1997     // FIXME - this will match "node_<n> <garbage>"
1998     unsigned level;
1999     if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
2000       if (nodeIdIndex + level >= maxIndex) {
2001         maxIndex = nodeIdIndex + level;
2002       }
2003       continue;
2004     }
2005   }
2006 
2007   // Check for empty file / no valid processor records, or too many. The number
2008   // of records can't exceed the number of valid bits in the affinity mask.
2009   if (num_records == 0) {
2010     *line = 0;
2011     *msg_id = kmp_i18n_str_NoProcRecords;
2012     return -1;
2013   }
2014   if (num_records > (unsigned)__kmp_xproc) {
2015     *line = 0;
2016     *msg_id = kmp_i18n_str_TooManyProcRecords;
2017     return -1;
2018   }
2019 
  // Set the file pointer back to the beginning, so that we can scan the file
  // again, this time performing a full parse of the data. Allocate a vector of
  // ProcCpuInfo objects, where we will place the data. Adding an extra element
2023   // at the end allows us to remove a lot of extra checks for termination
2024   // conditions.
2025   if (fseek(f, 0, SEEK_SET) != 0) {
2026     *line = 0;
2027     *msg_id = kmp_i18n_str_CantRewindCpuinfo;
2028     return -1;
2029   }
2030 
2031   // Allocate the array of records to store the proc info in.  The dummy
2032   // element at the end makes the logic in filling them out easier to code.
2033   unsigned **threadInfo =
2034       (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *));
2035   unsigned i;
2036   for (i = 0; i <= num_records; i++) {
2037     threadInfo[i] =
2038         (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2039   }
2040 
2041 #define CLEANUP_THREAD_INFO                                                    \
2042   for (i = 0; i <= num_records; i++) {                                         \
2043     __kmp_free(threadInfo[i]);                                                 \
2044   }                                                                            \
2045   __kmp_free(threadInfo);
2046 
2047   // A value of UINT_MAX means that we didn't find the field
2048   unsigned __index;
2049 
2050 #define INIT_PROC_INFO(p)                                                      \
2051   for (__index = 0; __index <= maxIndex; __index++) {                          \
2052     (p)[__index] = UINT_MAX;                                                   \
2053   }
2054 
2055   for (i = 0; i <= num_records; i++) {
2056     INIT_PROC_INFO(threadInfo[i]);
2057   }
2058 
2059   unsigned num_avail = 0;
2060   *line = 0;
2061   while (!feof(f)) {
2062     // Create an inner scoping level, so that all the goto targets at the end of
2063     // the loop appear in an outer scoping level. This avoids warnings about
2064     // jumping past an initialization to a target in the same block.
2065     {
2066       buf[sizeof(buf) - 1] = 1;
2067       bool long_line = false;
2068       if (!fgets(buf, sizeof(buf), f)) {
2069         // Read errors presumably because of EOF
2070         // If there is valid data in threadInfo[num_avail], then fake
        // a blank line to ensure that the last address gets parsed.
2072         bool valid = false;
2073         for (i = 0; i <= maxIndex; i++) {
2074           if (threadInfo[num_avail][i] != UINT_MAX) {
2075             valid = true;
2076           }
2077         }
2078         if (!valid) {
2079           break;
2080         }
2081         buf[0] = 0;
2082       } else if (!buf[sizeof(buf) - 1]) {
        // The line is longer than the buffer. Set a flag; an error is only
        // reported later (via CHECK_LINE) if the line turns out to be one we
        // actually need to parse.
2085         long_line = true;
2086 
2087 #define CHECK_LINE                                                             \
2088   if (long_line) {                                                             \
2089     CLEANUP_THREAD_INFO;                                                       \
2090     *msg_id = kmp_i18n_str_LongLineCpuinfo;                                    \
2091     return -1;                                                                 \
2092   }
2093       }
2094       (*line)++;
2095 
2096       char s1[] = "processor";
2097       if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2098         CHECK_LINE;
2099         char *p = strchr(buf + sizeof(s1) - 1, ':');
2100         unsigned val;
2101         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2102           goto no_val;
2103         if (threadInfo[num_avail][osIdIndex] != UINT_MAX)
2104 #if KMP_ARCH_AARCH64
          // Handle the old AArch64 /proc/cpuinfo layout differently:
          // it lists all of the 'processor' entries in a single
          // 'Processor' section, so the usual duplicate-field check
          // would reject every entry after the first.
2109           num_avail++;
2110 #else
2111           goto dup_field;
2112 #endif
2113         threadInfo[num_avail][osIdIndex] = val;
2114 #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
2115         char path[256];
2116         KMP_SNPRINTF(
2117             path, sizeof(path),
2118             "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
2119             threadInfo[num_avail][osIdIndex]);
2120         __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
2121 
2122         KMP_SNPRINTF(path, sizeof(path),
2123                      "/sys/devices/system/cpu/cpu%u/topology/core_id",
2124                      threadInfo[num_avail][osIdIndex]);
2125         __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
2126         continue;
2127 #else
2128       }
2129       char s2[] = "physical id";
2130       if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
2131         CHECK_LINE;
2132         char *p = strchr(buf + sizeof(s2) - 1, ':');
2133         unsigned val;
2134         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2135           goto no_val;
2136         if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX)
2137           goto dup_field;
2138         threadInfo[num_avail][pkgIdIndex] = val;
2139         continue;
2140       }
2141       char s3[] = "core id";
2142       if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2143         CHECK_LINE;
2144         char *p = strchr(buf + sizeof(s3) - 1, ':');
2145         unsigned val;
2146         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2147           goto no_val;
2148         if (threadInfo[num_avail][coreIdIndex] != UINT_MAX)
2149           goto dup_field;
2150         threadInfo[num_avail][coreIdIndex] = val;
2151         continue;
#endif // KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
2153       }
2154       char s4[] = "thread id";
2155       if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2156         CHECK_LINE;
2157         char *p = strchr(buf + sizeof(s4) - 1, ':');
2158         unsigned val;
2159         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2160           goto no_val;
2161         if (threadInfo[num_avail][threadIdIndex] != UINT_MAX)
2162           goto dup_field;
2163         threadInfo[num_avail][threadIdIndex] = val;
2164         continue;
2165       }
2166       unsigned level;
2167       if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
2168         CHECK_LINE;
2169         char *p = strchr(buf + sizeof(s4) - 1, ':');
2170         unsigned val;
2171         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2172           goto no_val;
2173         KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2174         if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX)
2175           goto dup_field;
2176         threadInfo[num_avail][nodeIdIndex + level] = val;
2177         continue;
2178       }
2179 
2180       // We didn't recognize the leading token on the line. There are lots of
2181       // leading tokens that we don't recognize - if the line isn't empty, go on
2182       // to the next line.
2183       if ((*buf != 0) && (*buf != '\n')) {
2184         // If the line is longer than the buffer, read characters
2185         // until we find a newline.
2186         if (long_line) {
2187           int ch;
2188           while (((ch = fgetc(f)) != EOF) && (ch != '\n'))
2189             ;
2190         }
2191         continue;
2192       }
2193 
2194       // A newline has signalled the end of the processor record.
2195       // Check that there aren't too many procs specified.
2196       if ((int)num_avail == __kmp_xproc) {
2197         CLEANUP_THREAD_INFO;
2198         *msg_id = kmp_i18n_str_TooManyEntries;
2199         return -1;
2200       }
2201 
      // Check for missing fields. The osId field must be present, and we
      // currently require that the physical id field is specified as well.
2204       if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2205         CLEANUP_THREAD_INFO;
2206         *msg_id = kmp_i18n_str_MissingProcField;
2207         return -1;
2208       }
2209       if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2210         CLEANUP_THREAD_INFO;
2211         *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2212         return -1;
2213       }
2214 
2215       // Skip this proc if it is not included in the machine model.
2216       if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex],
2217                          __kmp_affin_fullMask)) {
2218         INIT_PROC_INFO(threadInfo[num_avail]);
2219         continue;
2220       }
2221 
2222       // We have a successful parse of this proc's info.
2223       // Increment the counter, and prepare for the next proc.
2224       num_avail++;
2225       KMP_ASSERT(num_avail <= num_records);
2226       INIT_PROC_INFO(threadInfo[num_avail]);
2227     }
2228     continue;
2229 
2230   no_val:
2231     CLEANUP_THREAD_INFO;
2232     *msg_id = kmp_i18n_str_MissingValCpuinfo;
2233     return -1;
2234 
2235   dup_field:
2236     CLEANUP_THREAD_INFO;
2237     *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2238     return -1;
2239   }
2240   *line = 0;
2241 
2242 #if KMP_MIC && REDUCE_TEAM_SIZE
2243   unsigned teamSize = 0;
2244 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2245 
2246   // check for num_records == __kmp_xproc ???
2247 
2248   // If there's only one thread context to bind to, form an Address object with
2249   // depth 1 and return immediately (or, if affinity is off, set address2os to
2250   // NULL and return).
2251   //
2252   // If it is configured to omit the package level when there is only a single
2253   // package, the logic at the end of this routine won't work if there is only a
2254   // single thread - it would try to form an Address object with depth 0.
2255   KMP_ASSERT(num_avail > 0);
2256   KMP_ASSERT(num_avail <= num_records);
2257   if (num_avail == 1) {
2258     __kmp_ncores = 1;
2259     __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2260     if (__kmp_affinity_verbose) {
2261       if (!KMP_AFFINITY_CAPABLE()) {
2262         KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2263         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2264         KMP_INFORM(Uniform, "KMP_AFFINITY");
2265       } else {
2266         char buf[KMP_AFFIN_MASK_PRINT_LEN];
2267         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2268                                   __kmp_affin_fullMask);
2269         KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2270         if (__kmp_affinity_respect_mask) {
2271           KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2272         } else {
2273           KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2274         }
2275         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2276         KMP_INFORM(Uniform, "KMP_AFFINITY");
2277       }
2278       int index;
2279       kmp_str_buf_t buf;
2280       __kmp_str_buf_init(&buf);
2281       __kmp_str_buf_print(&buf, "1");
2282       for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2283         __kmp_str_buf_print(&buf, " x 1");
2284       }
2285       KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2286       __kmp_str_buf_free(&buf);
2287     }
2288 
2289     if (__kmp_affinity_type == affinity_none) {
2290       CLEANUP_THREAD_INFO;
2291       return 0;
2292     }
2293 
2294     *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
2295     Address addr(1);
2296     addr.labels[0] = threadInfo[0][pkgIdIndex];
2297     (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2298 
2299     if (__kmp_affinity_gran_levels < 0) {
2300       __kmp_affinity_gran_levels = 0;
2301     }
2302 
2303     if (__kmp_affinity_verbose) {
2304       __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2305     }
2306 
2307     CLEANUP_THREAD_INFO;
2308     return 1;
2309   }
2310 
2311   // Sort the threadInfo table by physical Id.
2312   qsort(threadInfo, num_avail, sizeof(*threadInfo),
2313         __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2314 
2315   // The table is now sorted by pkgId / coreId / threadId, but we really don't
2316   // know the radix of any of the fields. pkgId's may be sparsely assigned among
2317   // the chips on a system. Although coreId's are usually assigned
2318   // [0 .. coresPerPkg-1] and threadId's are usually assigned
2319   // [0..threadsPerCore-1], we don't want to make any such assumptions.
2320   //
2321   // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
2322   // total # packages) are at this point - we want to determine that now. We
2323   // only have an upper bound on the first two figures.
2324   unsigned *counts =
2325       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2326   unsigned *maxCt =
2327       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2328   unsigned *totals =
2329       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2330   unsigned *lastId =
2331       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2332 
2333   bool assign_thread_ids = false;
2334   unsigned threadIdCt;
2335   unsigned index;
2336 
2337 restart_radix_check:
2338   threadIdCt = 0;
2339 
2340   // Initialize the counter arrays with data from threadInfo[0].
2341   if (assign_thread_ids) {
2342     if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2343       threadInfo[0][threadIdIndex] = threadIdCt++;
2344     } else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2345       threadIdCt = threadInfo[0][threadIdIndex] + 1;
2346     }
2347   }
2348   for (index = 0; index <= maxIndex; index++) {
2349     counts[index] = 1;
2350     maxCt[index] = 1;
2351     totals[index] = 1;
    lastId[index] = threadInfo[0][index];
2354   }
2355 
2356   // Run through the rest of the OS procs.
2357   for (i = 1; i < num_avail; i++) {
2358     // Find the most significant index whose id differs from the id for the
2359     // previous OS proc.
2360     for (index = maxIndex; index >= threadIdIndex; index--) {
2361       if (assign_thread_ids && (index == threadIdIndex)) {
2362         // Auto-assign the thread id field if it wasn't specified.
2363         if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2364           threadInfo[i][threadIdIndex] = threadIdCt++;
2365         }
2366         // Apparently the thread id field was specified for some entries and not
2367         // others. Start the thread id counter off at the next higher thread id.
2368         else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2369           threadIdCt = threadInfo[i][threadIdIndex] + 1;
2370         }
2371       }
2372       if (threadInfo[i][index] != lastId[index]) {
2373         // Run through all indices which are less significant, and reset the
2374         // counts to 1. At all levels up to and including index, we need to
2375         // increment the totals and record the last id.
2376         unsigned index2;
2377         for (index2 = threadIdIndex; index2 < index; index2++) {
2378           totals[index2]++;
2379           if (counts[index2] > maxCt[index2]) {
2380             maxCt[index2] = counts[index2];
2381           }
2382           counts[index2] = 1;
2383           lastId[index2] = threadInfo[i][index2];
2384         }
2385         counts[index]++;
2386         totals[index]++;
2387         lastId[index] = threadInfo[i][index];
2388 
2389         if (assign_thread_ids && (index > threadIdIndex)) {
2390 
2391 #if KMP_MIC && REDUCE_TEAM_SIZE
2392           // The default team size is the total #threads in the machine
2393           // minus 1 thread for every core that has 3 or more threads.
2394           teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
2395 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2396 
2397           // Restart the thread counter, as we are on a new core.
2398           threadIdCt = 0;
2399 
2400           // Auto-assign the thread id field if it wasn't specified.
2401           if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2402             threadInfo[i][threadIdIndex] = threadIdCt++;
2403           }
2404 
          // Apparently the thread id field was specified for some entries and
2406           // not others. Start the thread id counter off at the next higher
2407           // thread id.
2408           else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2409             threadIdCt = threadInfo[i][threadIdIndex] + 1;
2410           }
2411         }
2412         break;
2413       }
2414     }
2415     if (index < threadIdIndex) {
2416       // If thread ids were specified, it is an error if they are not unique.
      // Also, check that we haven't already restarted the loop (to be safe -
2418       // shouldn't need to).
2419       if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
2420         __kmp_free(lastId);
2421         __kmp_free(totals);
2422         __kmp_free(maxCt);
2423         __kmp_free(counts);
2424         CLEANUP_THREAD_INFO;
2425         *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2426         return -1;
2427       }
2428 
      // If the thread ids were not specified and we see entries that
2430       // are duplicates, start the loop over and assign the thread ids manually.
2431       assign_thread_ids = true;
2432       goto restart_radix_check;
2433     }
2434   }
2435 
2436 #if KMP_MIC && REDUCE_TEAM_SIZE
2437   // The default team size is the total #threads in the machine
2438   // minus 1 thread for every core that has 3 or more threads.
2439   teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
2440 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2441 
2442   for (index = threadIdIndex; index <= maxIndex; index++) {
2443     if (counts[index] > maxCt[index]) {
2444       maxCt[index] = counts[index];
2445     }
2446   }
2447 
2448   __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2449   nCoresPerPkg = maxCt[coreIdIndex];
2450   nPackages = totals[pkgIdIndex];
2451 
2452   // Check to see if the machine topology is uniform
2453   unsigned prod = totals[maxIndex];
2454   for (index = threadIdIndex; index < maxIndex; index++) {
2455     prod *= maxCt[index];
2456   }
2457   bool uniform = (prod == totals[threadIdIndex]);
2458 
2459   // When affinity is off, this routine will still be called to set
2460   // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
2461   // Make sure all these vars are set correctly, and return now if affinity is
2462   // not enabled.
2463   __kmp_ncores = totals[coreIdIndex];
2464 
2465   if (__kmp_affinity_verbose) {
2466     if (!KMP_AFFINITY_CAPABLE()) {
2467       KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2468       KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2469       if (uniform) {
2470         KMP_INFORM(Uniform, "KMP_AFFINITY");
2471       } else {
2472         KMP_INFORM(NonUniform, "KMP_AFFINITY");
2473       }
2474     } else {
2475       char buf[KMP_AFFIN_MASK_PRINT_LEN];
2476       __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2477                                 __kmp_affin_fullMask);
2478       KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2479       if (__kmp_affinity_respect_mask) {
2480         KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2481       } else {
2482         KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2483       }
2484       KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2485       if (uniform) {
2486         KMP_INFORM(Uniform, "KMP_AFFINITY");
2487       } else {
2488         KMP_INFORM(NonUniform, "KMP_AFFINITY");
2489       }
2490     }
2491     kmp_str_buf_t buf;
2492     __kmp_str_buf_init(&buf);
2493 
2494     __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2495     for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2496       __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2497     }
2498     KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
2499                maxCt[threadIdIndex], __kmp_ncores);
2500 
2501     __kmp_str_buf_free(&buf);
2502   }
2503 
2504 #if KMP_MIC && REDUCE_TEAM_SIZE
2505   // Set the default team size.
2506   if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2507     __kmp_dflt_team_nth = teamSize;
2508     KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting "
2509                   "__kmp_dflt_team_nth = %d\n",
2510                   __kmp_dflt_team_nth));
2511   }
2512 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2513 
2514   KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
2515   KMP_DEBUG_ASSERT(num_avail == __kmp_avail_proc);
2516   __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
2517   for (i = 0; i < num_avail; ++i) { // fill the os indices
2518     __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex];
2519   }
2520 
2521   if (__kmp_affinity_type == affinity_none) {
2522     __kmp_free(lastId);
2523     __kmp_free(totals);
2524     __kmp_free(maxCt);
2525     __kmp_free(counts);
2526     CLEANUP_THREAD_INFO;
2527     return 0;
2528   }
2529 
2530   // Count the number of levels which have more nodes at that level than at the
2531   // parent's level (with there being an implicit root node of the top level).
2532   // This is equivalent to saying that there is at least one node at this level
2533   // which has a sibling. These levels are in the map, and the package level is
2534   // always in the map.
2535   bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2536   int level = 0;
2537   for (index = threadIdIndex; index < maxIndex; index++) {
2538     KMP_ASSERT(totals[index] >= totals[index + 1]);
2539     inMap[index] = (totals[index] > totals[index + 1]);
2540   }
2541   inMap[maxIndex] = (totals[maxIndex] > 1);
2542   inMap[pkgIdIndex] = true;
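  // Example (hypothetical, not from the original source): with 1 package, 4
  // cores and 2 threads per core (and no node_<n> fields), totals[] is 8 / 4 /
  // 1 for the thread / core / package indices, so inMap is true for the thread
  // and core levels, the package level is forced to true, and the resulting
  // depth is 3.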
2543 
2544   int depth = 0;
2545   for (index = threadIdIndex; index <= maxIndex; index++) {
2546     if (inMap[index]) {
2547       depth++;
2548     }
2549   }
2550   KMP_ASSERT(depth > 0);
2551 
2552   // Construct the data structure that is to be returned.
2553   *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2554   int pkgLevel = -1;
2555   int coreLevel = -1;
2556   int threadLevel = -1;
2557 
2558   for (i = 0; i < num_avail; ++i) {
2559     Address addr(depth);
2560     unsigned os = threadInfo[i][osIdIndex];
2561     int src_index;
2562     int dst_index = 0;
2563 
2564     for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2565       if (!inMap[src_index]) {
2566         continue;
2567       }
2568       addr.labels[dst_index] = threadInfo[i][src_index];
2569       if (src_index == pkgIdIndex) {
2570         pkgLevel = dst_index;
2571       } else if (src_index == coreIdIndex) {
2572         coreLevel = dst_index;
2573       } else if (src_index == threadIdIndex) {
2574         threadLevel = dst_index;
2575       }
2576       dst_index++;
2577     }
2578     (*address2os)[i] = AddrUnsPair(addr, os);
2579   }
2580 
2581   if (__kmp_affinity_gran_levels < 0) {
2582     // Set the granularity level based on what levels are modeled
2583     // in the machine topology map.
2584     unsigned src_index;
2585     __kmp_affinity_gran_levels = 0;
2586     for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2587       if (!inMap[src_index]) {
2588         continue;
2589       }
2590       switch (src_index) {
2591       case threadIdIndex:
2592         if (__kmp_affinity_gran > affinity_gran_thread) {
2593           __kmp_affinity_gran_levels++;
2594         }
2595 
2596         break;
2597       case coreIdIndex:
2598         if (__kmp_affinity_gran > affinity_gran_core) {
2599           __kmp_affinity_gran_levels++;
2600         }
2601         break;
2602 
2603       case pkgIdIndex:
2604         if (__kmp_affinity_gran > affinity_gran_package) {
2605           __kmp_affinity_gran_levels++;
2606         }
2607         break;
2608       }
2609     }
2610   }
2611 
2612   if (__kmp_affinity_verbose) {
2613     __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2614                                   coreLevel, threadLevel);
2615   }
2616 
2617   __kmp_free(inMap);
2618   __kmp_free(lastId);
2619   __kmp_free(totals);
2620   __kmp_free(maxCt);
2621   __kmp_free(counts);
2622   CLEANUP_THREAD_INFO;
2623   return depth;
2624 }
2625 
2626 // Create and return a table of affinity masks, indexed by OS thread ID.
2627 // This routine handles OR'ing together all the affinity masks of threads
2628 // that are sufficiently close, if granularity > fine.
2629 static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex,
2630                                             unsigned *numUnique,
2631                                             AddrUnsPair *address2os,
2632                                             unsigned numAddrs) {
2633   // First form a table of affinity masks in order of OS thread id.
2634   unsigned depth;
2635   unsigned maxOsId;
2636   unsigned i;
2637 
2638   KMP_ASSERT(numAddrs > 0);
2639   depth = address2os[0].first.depth;
2640 
2641   maxOsId = 0;
2642   for (i = numAddrs - 1;; --i) {
2643     unsigned osId = address2os[i].second;
2644     if (osId > maxOsId) {
2645       maxOsId = osId;
2646     }
2647     if (i == 0)
2648       break;
2649   }
2650   kmp_affin_mask_t *osId2Mask;
2651   KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1));
2652 
2653   // Sort the address2os table according to physical order. Doing so will put
2654   // all threads on the same core/package/node in consecutive locations.
2655   qsort(address2os, numAddrs, sizeof(*address2os),
2656         __kmp_affinity_cmp_Address_labels);
2657 
2658   KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2659   if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2660     KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
2661   }
2662   if (__kmp_affinity_gran_levels >= (int)depth) {
2663     if (__kmp_affinity_verbose ||
2664         (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
2665       KMP_WARNING(AffThreadsMayMigrate);
2666     }
2667   }
2668 
2669   // Run through the table, forming the masks for all threads on each core.
2670   // Threads on the same core will have identical "Address" objects, not
2671   // considering the last level, which must be the thread id. All threads on a
2672   // core will appear consecutively.
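  // Example (hypothetical, not from the original source): with
  // __kmp_affinity_gran_levels == 1 (core granularity) and 2 HW threads per
  // core, the two sibling threads of a core compare as "close" (their labels
  // match once the last level is ignored), so both of their osId entries end
  // up with the same two-bit mask.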
2673   unsigned unique = 0;
2674   unsigned j = 0; // index of 1st thread on core
2675   unsigned leader = 0;
2676   Address *leaderAddr = &(address2os[0].first);
2677   kmp_affin_mask_t *sum;
2678   KMP_CPU_ALLOC_ON_STACK(sum);
2679   KMP_CPU_ZERO(sum);
2680   KMP_CPU_SET(address2os[0].second, sum);
2681   for (i = 1; i < numAddrs; i++) {
2682     // If this thread is sufficiently close to the leader (within the
2683     // granularity setting), then set the bit for this os thread in the
2684     // affinity mask for this group, and go on to the next thread.
2685     if (leaderAddr->isClose(address2os[i].first, __kmp_affinity_gran_levels)) {
2686       KMP_CPU_SET(address2os[i].second, sum);
2687       continue;
2688     }
2689 
2690     // For every thread in this group, copy the mask to the thread's entry in
2691     // the osId2Mask table.  Mark the first address as a leader.
2692     for (; j < i; j++) {
2693       unsigned osId = address2os[j].second;
2694       KMP_DEBUG_ASSERT(osId <= maxOsId);
2695       kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2696       KMP_CPU_COPY(mask, sum);
2697       address2os[j].first.leader = (j == leader);
2698     }
2699     unique++;
2700 
2701     // Start a new mask.
2702     leader = i;
2703     leaderAddr = &(address2os[i].first);
2704     KMP_CPU_ZERO(sum);
2705     KMP_CPU_SET(address2os[i].second, sum);
2706   }
2707 
2708   // For every thread in last group, copy the mask to the thread's
2709   // entry in the osId2Mask table.
2710   for (; j < i; j++) {
2711     unsigned osId = address2os[j].second;
2712     KMP_DEBUG_ASSERT(osId <= maxOsId);
2713     kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2714     KMP_CPU_COPY(mask, sum);
2715     address2os[j].first.leader = (j == leader);
2716   }
2717   unique++;
2718   KMP_CPU_FREE_FROM_STACK(sum);
2719 
2720   *maxIndex = maxOsId;
2721   *numUnique = unique;
2722   return osId2Mask;
2723 }
2724 
2725 // Stuff for the affinity proclist parsers.  It's easier to declare these vars
2726 // as file-static than to try and pass them through the calling sequence of
2727 // the recursive-descent OMP_PLACES parser.
2728 static kmp_affin_mask_t *newMasks;
2729 static int numNewMasks;
2730 static int nextNewMask;
2731 
2732 #define ADD_MASK(_mask)                                                        \
2733   {                                                                            \
2734     if (nextNewMask >= numNewMasks) {                                          \
2735       int i;                                                                   \
2736       numNewMasks *= 2;                                                        \
2737       kmp_affin_mask_t *temp;                                                  \
2738       KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks);                         \
2739       for (i = 0; i < numNewMasks / 2; i++) {                                  \
2740         kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);                    \
2741         kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i);                       \
2742         KMP_CPU_COPY(dest, src);                                               \
2743       }                                                                        \
2744       KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2);                  \
2745       newMasks = temp;                                                         \
2746     }                                                                          \
2747     KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));               \
2748     nextNewMask++;                                                             \
2749   }
2750 
2751 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId)                             \
2752   {                                                                            \
2753     if (((_osId) > _maxOsId) ||                                                \
2754         (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) {     \
2755       if (__kmp_affinity_verbose ||                                            \
2756           (__kmp_affinity_warnings &&                                          \
2757            (__kmp_affinity_type != affinity_none))) {                          \
2758         KMP_WARNING(AffIgnoreInvalidProcID, _osId);                            \
2759       }                                                                        \
2760     } else {                                                                   \
2761       ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));                            \
2762     }                                                                          \
2763   }
2764 
2765 // Re-parse the proclist (for the explicit affinity type), and form the list
2766 // of affinity newMasks indexed by gtid.
2767 static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2768                                             unsigned int *out_numMasks,
2769                                             const char *proclist,
2770                                             kmp_affin_mask_t *osId2Mask,
2771                                             int maxOsId) {
2772   int i;
2773   const char *scan = proclist;
2774   const char *next = proclist;
2775 
  // Allocate a small scratch array of masks; ADD_MASK doubles it whenever it
  // fills up, so the list of masks can grow without bound.
2778   numNewMasks = 2;
2779   KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
2780   nextNewMask = 0;
2781   kmp_affin_mask_t *sumMask;
2782   KMP_CPU_ALLOC(sumMask);
2783   int setSize = 0;
2784 
2785   for (;;) {
2786     int start, end, stride;
2787 
2788     SKIP_WS(scan);
2789     next = scan;
2790     if (*next == '\0') {
2791       break;
2792     }
2793 
2794     if (*next == '{') {
2795       int num;
2796       setSize = 0;
2797       next++; // skip '{'
2798       SKIP_WS(next);
2799       scan = next;
2800 
2801       // Read the first integer in the set.
      KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2803       SKIP_DIGITS(next);
2804       num = __kmp_str_to_int(scan, *next);
2805       KMP_ASSERT2(num >= 0, "bad explicit proc list");
2806 
2807       // Copy the mask for that osId to the sum (union) mask.
2808       if ((num > maxOsId) ||
2809           (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2810         if (__kmp_affinity_verbose ||
2811             (__kmp_affinity_warnings &&
2812              (__kmp_affinity_type != affinity_none))) {
2813           KMP_WARNING(AffIgnoreInvalidProcID, num);
2814         }
2815         KMP_CPU_ZERO(sumMask);
2816       } else {
2817         KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2818         setSize = 1;
2819       }
2820 
2821       for (;;) {
2822         // Check for end of set.
2823         SKIP_WS(next);
2824         if (*next == '}') {
2825           next++; // skip '}'
2826           break;
2827         }
2828 
2829         // Skip optional comma.
2830         if (*next == ',') {
2831           next++;
2832         }
2833         SKIP_WS(next);
2834 
2835         // Read the next integer in the set.
2836         scan = next;
2837         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2838 
2839         SKIP_DIGITS(next);
2840         num = __kmp_str_to_int(scan, *next);
2841         KMP_ASSERT2(num >= 0, "bad explicit proc list");
2842 
2843         // Add the mask for that osId to the sum mask.
2844         if ((num > maxOsId) ||
2845             (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2846           if (__kmp_affinity_verbose ||
2847               (__kmp_affinity_warnings &&
2848                (__kmp_affinity_type != affinity_none))) {
2849             KMP_WARNING(AffIgnoreInvalidProcID, num);
2850           }
2851         } else {
2852           KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2853           setSize++;
2854         }
2855       }
2856       if (setSize > 0) {
2857         ADD_MASK(sumMask);
2858       }
2859 
2860       SKIP_WS(next);
2861       if (*next == ',') {
2862         next++;
2863       }
2864       scan = next;
2865       continue;
2866     }
2867 
2868     // Read the first integer.
2869     KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2870     SKIP_DIGITS(next);
2871     start = __kmp_str_to_int(scan, *next);
2872     KMP_ASSERT2(start >= 0, "bad explicit proc list");
2873     SKIP_WS(next);
2874 
2875     // If this isn't a range, then add a mask to the list and go on.
2876     if (*next != '-') {
2877       ADD_MASK_OSID(start, osId2Mask, maxOsId);
2878 
2879       // Skip optional comma.
2880       if (*next == ',') {
2881         next++;
2882       }
2883       scan = next;
2884       continue;
2885     }
2886 
2887     // This is a range.  Skip over the '-' and read in the 2nd int.
2888     next++; // skip '-'
2889     SKIP_WS(next);
2890     scan = next;
2891     KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2892     SKIP_DIGITS(next);
2893     end = __kmp_str_to_int(scan, *next);
2894     KMP_ASSERT2(end >= 0, "bad explicit proc list");
2895 
2896     // Check for a stride parameter
2897     stride = 1;
2898     SKIP_WS(next);
2899     if (*next == ':') {
      // A stride is specified.  Skip over the ':' and read the 3rd int.
2901       int sign = +1;
2902       next++; // skip ':'
2903       SKIP_WS(next);
2904       scan = next;
2905       if (*next == '-') {
2906         sign = -1;
2907         next++;
2908         SKIP_WS(next);
2909         scan = next;
2910       }
2911       KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2912       SKIP_DIGITS(next);
2913       stride = __kmp_str_to_int(scan, *next);
2914       KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2915       stride *= sign;
2916     }
2917 
2918     // Do some range checks.
2919     KMP_ASSERT2(stride != 0, "bad explicit proc list");
2920     if (stride > 0) {
2921       KMP_ASSERT2(start <= end, "bad explicit proc list");
2922     } else {
2923       KMP_ASSERT2(start >= end, "bad explicit proc list");
2924     }
2925     KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2926 
2927     // Add the mask for each OS proc # to the list.
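    // For example (illustrative values only): start=2, end=8, stride=3 adds
    // masks for OS procs 2, 5 and 8; start=8, end=2, stride=-3 adds 8, 5, 2.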
2928     if (stride > 0) {
2929       do {
2930         ADD_MASK_OSID(start, osId2Mask, maxOsId);
2931         start += stride;
2932       } while (start <= end);
2933     } else {
2934       do {
2935         ADD_MASK_OSID(start, osId2Mask, maxOsId);
2936         start += stride;
2937       } while (start >= end);
2938     }
2939 
2940     // Skip optional comma.
2941     SKIP_WS(next);
2942     if (*next == ',') {
2943       next++;
2944     }
2945     scan = next;
2946   }
2947 
2948   *out_numMasks = nextNewMask;
  if (nextNewMask == 0) {
    *out_masks = NULL;
    // free the scratch mask on this path too, so it is not leaked
    KMP_CPU_FREE(sumMask);
    KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
    return;
  }
2954   KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
2955   for (i = 0; i < nextNewMask; i++) {
2956     kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
2957     kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
2958     KMP_CPU_COPY(dest, src);
2959   }
2960   KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
2961   KMP_CPU_FREE(sumMask);
2962 }
2963 
2964 #if OMP_40_ENABLED
2965 
2966 /*-----------------------------------------------------------------------------
2967 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
places.  Again, here is the grammar:
2969 
2970 place_list := place
2971 place_list := place , place_list
2972 place := num
2973 place := place : num
2974 place := place : num : signed
place := { subplace_list }
2976 place := ! place                  // (lowest priority)
2977 subplace_list := subplace
2978 subplace_list := subplace , subplace_list
2979 subplace := num
2980 subplace := num : num
2981 subplace := num : num : signed
2982 signed := num
2983 signed := + signed
2984 signed := - signed
2985 -----------------------------------------------------------------------------*/
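// A hedged example of the grammar above (the place strings are illustrative
// and not tied to any particular machine): "{0,1},{2,3},{4,5}" names three
// explicit places of two OS procs each; the subplace "{0:4}" denotes procs
// {0,1,2,3}; "{0:4:2}" denotes procs {0,2,4,6}; and "{0:4}:2:4" denotes the
// place {0,1,2,3} plus one more place shifted by a stride of 4, i.e. {4,5,6,7}.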
2986 
2987 static void __kmp_process_subplace_list(const char **scan,
2988                                         kmp_affin_mask_t *osId2Mask,
2989                                         int maxOsId, kmp_affin_mask_t *tempMask,
2990                                         int *setSize) {
2991   const char *next;
2992 
2993   for (;;) {
2994     int start, count, stride, i;
2995 
2996     // Read in the starting proc id
2997     SKIP_WS(*scan);
2998     KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
2999     next = *scan;
3000     SKIP_DIGITS(next);
3001     start = __kmp_str_to_int(*scan, *next);
3002     KMP_ASSERT(start >= 0);
3003     *scan = next;
3004 
3005     // valid follow sets are ',' ':' and '}'
3006     SKIP_WS(*scan);
3007     if (**scan == '}' || **scan == ',') {
3008       if ((start > maxOsId) ||
3009           (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3010         if (__kmp_affinity_verbose ||
3011             (__kmp_affinity_warnings &&
3012              (__kmp_affinity_type != affinity_none))) {
3013           KMP_WARNING(AffIgnoreInvalidProcID, start);
3014         }
3015       } else {
3016         KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3017         (*setSize)++;
3018       }
3019       if (**scan == '}') {
3020         break;
3021       }
3022       (*scan)++; // skip ','
3023       continue;
3024     }
3025     KMP_ASSERT2(**scan == ':', "bad explicit places list");
3026     (*scan)++; // skip ':'
3027 
3028     // Read count parameter
3029     SKIP_WS(*scan);
3030     KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3031     next = *scan;
3032     SKIP_DIGITS(next);
3033     count = __kmp_str_to_int(*scan, *next);
3034     KMP_ASSERT(count >= 0);
3035     *scan = next;
3036 
3037     // valid follow sets are ',' ':' and '}'
3038     SKIP_WS(*scan);
3039     if (**scan == '}' || **scan == ',') {
3040       for (i = 0; i < count; i++) {
3041         if ((start > maxOsId) ||
3042             (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3043           if (__kmp_affinity_verbose ||
3044               (__kmp_affinity_warnings &&
3045                (__kmp_affinity_type != affinity_none))) {
3046             KMP_WARNING(AffIgnoreInvalidProcID, start);
3047           }
3048           break; // don't proliferate warnings for large count
3049         } else {
3050           KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3051           start++;
3052           (*setSize)++;
3053         }
3054       }
3055       if (**scan == '}') {
3056         break;
3057       }
3058       (*scan)++; // skip ','
3059       continue;
3060     }
3061     KMP_ASSERT2(**scan == ':', "bad explicit places list");
3062     (*scan)++; // skip ':'
3063 
3064     // Read stride parameter
3065     int sign = +1;
3066     for (;;) {
3067       SKIP_WS(*scan);
3068       if (**scan == '+') {
3069         (*scan)++; // skip '+'
3070         continue;
3071       }
3072       if (**scan == '-') {
3073         sign *= -1;
3074         (*scan)++; // skip '-'
3075         continue;
3076       }
3077       break;
3078     }
3079     SKIP_WS(*scan);
3080     KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3081     next = *scan;
3082     SKIP_DIGITS(next);
3083     stride = __kmp_str_to_int(*scan, *next);
3084     KMP_ASSERT(stride >= 0);
3085     *scan = next;
3086     stride *= sign;
3087 
3088     // valid follow sets are ',' and '}'
3089     SKIP_WS(*scan);
3090     if (**scan == '}' || **scan == ',') {
3091       for (i = 0; i < count; i++) {
3092         if ((start > maxOsId) ||
3093             (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3094           if (__kmp_affinity_verbose ||
3095               (__kmp_affinity_warnings &&
3096                (__kmp_affinity_type != affinity_none))) {
3097             KMP_WARNING(AffIgnoreInvalidProcID, start);
3098           }
3099           break; // don't proliferate warnings for large count
3100         } else {
3101           KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3102           start += stride;
3103           (*setSize)++;
3104         }
3105       }
3106       if (**scan == '}') {
3107         break;
3108       }
3109       (*scan)++; // skip ','
3110       continue;
3111     }
3112 
3113     KMP_ASSERT2(0, "bad explicit places list");
3114   }
3115 }
3116 
3117 static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3118                                 int maxOsId, kmp_affin_mask_t *tempMask,
3119                                 int *setSize) {
3120   const char *next;
3121 
3122   // valid follow sets are '{' '!' and num
3123   SKIP_WS(*scan);
3124   if (**scan == '{') {
3125     (*scan)++; // skip '{'
3126     __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize);
3127     KMP_ASSERT2(**scan == '}', "bad explicit places list");
3128     (*scan)++; // skip '}'
3129   } else if (**scan == '!') {
3130     (*scan)++; // skip '!'
3131     __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3132     KMP_CPU_COMPLEMENT(maxOsId, tempMask);
3133   } else if ((**scan >= '0') && (**scan <= '9')) {
3134     next = *scan;
3135     SKIP_DIGITS(next);
3136     int num = __kmp_str_to_int(*scan, *next);
3137     KMP_ASSERT(num >= 0);
3138     if ((num > maxOsId) ||
3139         (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3140       if (__kmp_affinity_verbose ||
3141           (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
3142         KMP_WARNING(AffIgnoreInvalidProcID, num);
3143       }
3144     } else {
3145       KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3146       (*setSize)++;
3147     }
3148     *scan = next; // skip num
3149   } else {
3150     KMP_ASSERT2(0, "bad explicit places list");
3151   }
3152 }
3153 
3155 void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3156                                       unsigned int *out_numMasks,
3157                                       const char *placelist,
3158                                       kmp_affin_mask_t *osId2Mask,
3159                                       int maxOsId) {
3160   int i, j, count, stride, sign;
3161   const char *scan = placelist;
3162   const char *next = placelist;
3163 
3164   numNewMasks = 2;
3165   KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
3166   nextNewMask = 0;
3167 
3168   // tempMask is modified based on the previous or initial
3169   //   place to form the current place
3170   // previousMask contains the previous place
3171   kmp_affin_mask_t *tempMask;
3172   kmp_affin_mask_t *previousMask;
3173   KMP_CPU_ALLOC(tempMask);
3174   KMP_CPU_ZERO(tempMask);
3175   KMP_CPU_ALLOC(previousMask);
3176   KMP_CPU_ZERO(previousMask);
3177   int setSize = 0;
3178 
3179   for (;;) {
3180     __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3181 
3182     // valid follow sets are ',' ':' and EOL
3183     SKIP_WS(scan);
3184     if (*scan == '\0' || *scan == ',') {
3185       if (setSize > 0) {
3186         ADD_MASK(tempMask);
3187       }
3188       KMP_CPU_ZERO(tempMask);
3189       setSize = 0;
3190       if (*scan == '\0') {
3191         break;
3192       }
3193       scan++; // skip ','
3194       continue;
3195     }
3196 
3197     KMP_ASSERT2(*scan == ':', "bad explicit places list");
3198     scan++; // skip ':'
3199 
3200     // Read count parameter
3201     SKIP_WS(scan);
3202     KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
3203     next = scan;
3204     SKIP_DIGITS(next);
3205     count = __kmp_str_to_int(scan, *next);
3206     KMP_ASSERT(count >= 0);
3207     scan = next;
3208 
3209     // valid follow sets are ',' ':' and EOL
3210     SKIP_WS(scan);
3211     if (*scan == '\0' || *scan == ',') {
3212       stride = +1;
3213     } else {
3214       KMP_ASSERT2(*scan == ':', "bad explicit places list");
3215       scan++; // skip ':'
3216 
3217       // Read stride parameter
3218       sign = +1;
3219       for (;;) {
3220         SKIP_WS(scan);
3221         if (*scan == '+') {
3222           scan++; // skip '+'
3223           continue;
3224         }
3225         if (*scan == '-') {
3226           sign *= -1;
3227           scan++; // skip '-'
3228           continue;
3229         }
3230         break;
3231       }
3232       SKIP_WS(scan);
3233       KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
3234       next = scan;
3235       SKIP_DIGITS(next);
3236       stride = __kmp_str_to_int(scan, *next);
3237       KMP_DEBUG_ASSERT(stride >= 0);
3238       scan = next;
3239       stride *= sign;
3240     }
3241 
3242     // Add places determined by initial_place : count : stride
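    // For instance (illustrative): an initial place {0,1} with count=3 and
    // stride=2 yields the places {0,1}, {2,3}, {4,5}, provided each shifted
    // OS proc is still present in osId2Mask and the full mask.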
3243     for (i = 0; i < count; i++) {
3244       if (setSize == 0) {
3245         break;
3246       }
3247       // Add the current place, then build the next place (tempMask) from that
3248       KMP_CPU_COPY(previousMask, tempMask);
3249       ADD_MASK(previousMask);
3250       KMP_CPU_ZERO(tempMask);
3251       setSize = 0;
3252       KMP_CPU_SET_ITERATE(j, previousMask) {
3253         if (!KMP_CPU_ISSET(j, previousMask)) {
3254           continue;
3255         }
3256         if ((j + stride > maxOsId) || (j + stride < 0) ||
3257             (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
3258             (!KMP_CPU_ISSET(j + stride,
3259                             KMP_CPU_INDEX(osId2Mask, j + stride)))) {
3260           if ((__kmp_affinity_verbose ||
3261                (__kmp_affinity_warnings &&
3262                 (__kmp_affinity_type != affinity_none))) &&
3263               i < count - 1) {
3264             KMP_WARNING(AffIgnoreInvalidProcID, j + stride);
3265           }
3266           continue;
3267         }
3268         KMP_CPU_SET(j + stride, tempMask);
3269         setSize++;
3270       }
3271     }
3272     KMP_CPU_ZERO(tempMask);
3273     setSize = 0;
3274 
3275     // valid follow sets are ',' and EOL
3276     SKIP_WS(scan);
3277     if (*scan == '\0') {
3278       break;
3279     }
3280     if (*scan == ',') {
3281       scan++; // skip ','
3282       continue;
3283     }
3284 
3285     KMP_ASSERT2(0, "bad explicit places list");
3286   }
3287 
3288   *out_numMasks = nextNewMask;
  if (nextNewMask == 0) {
    *out_masks = NULL;
    // free the temporary masks on this path too, so they are not leaked
    KMP_CPU_FREE(tempMask);
    KMP_CPU_FREE(previousMask);
    KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
    return;
  }
3294   KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3295   KMP_CPU_FREE(tempMask);
3296   KMP_CPU_FREE(previousMask);
3297   for (i = 0; i < nextNewMask; i++) {
3298     kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
3299     kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
3300     KMP_CPU_COPY(dest, src);
3301   }
3302   KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3303 }
3304 
3305 #endif /* OMP_40_ENABLED */
3306 
3307 #undef ADD_MASK
3308 #undef ADD_MASK_OSID
3309 
3310 #if KMP_USE_HWLOC
3311 static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) {
  // skip PUs that are descendants of the object o
3313   int skipped = 0;
3314   hwloc_obj_t hT = NULL;
3315   int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT);
3316   for (int i = 0; i < N; ++i) {
3317     KMP_DEBUG_ASSERT(hT);
3318     unsigned idx = hT->os_index;
3319     if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
3320       KMP_CPU_CLR(idx, __kmp_affin_fullMask);
3321       KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
3322       ++skipped;
3323     }
3324     hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT);
3325   }
3326   return skipped; // count number of skipped units
3327 }
3328 
3329 static int __kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) {
3330   // check if obj has PUs present in fullMask
3331   hwloc_obj_t hT = NULL;
3332   int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT);
3333   for (int i = 0; i < N; ++i) {
3334     KMP_DEBUG_ASSERT(hT);
3335     unsigned idx = hT->os_index;
3336     if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask))
3337       return 1; // found PU
3338     hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT);
3339   }
3340   return 0; // no PUs found
3341 }
3342 #endif // KMP_USE_HWLOC
3343 
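// Prune the machine topology according to the KMP_HW_SUBSET request recorded
// in __kmp_hws_socket/node/tile/core/proc.  As a hedged example (the exact
// KMP_HW_SUBSET spelling is handled by the settings parser, not here), a
// request for 1 socket, 2 cores per socket and 1 thread per core clears all
// other PUs from __kmp_affin_fullMask and shrinks *pAddr accordingly.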
3344 static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) {
3345   AddrUnsPair *newAddr;
3346   if (__kmp_hws_requested == 0)
3347     goto _exit; // no topology limiting actions requested, exit
3348 #if KMP_USE_HWLOC
3349   if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
    // The number of subobjects is calculated dynamically, so this works for
    // any non-uniform topology.
    // L2 cache objects are located by depth; other objects by type.
3353     hwloc_topology_t tp = __kmp_hwloc_topology;
3354     int nS = 0, nN = 0, nL = 0, nC = 0,
3355         nT = 0; // logical index including skipped
3356     int nCr = 0, nTr = 0; // number of requested units
3357     int nPkg = 0, nCo = 0, n_new = 0, n_old = 0, nCpP = 0, nTpC = 0; // counters
3358     hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
3359     int L2depth, idx;
3360 
3361     // check support of extensions ----------------------------------
3362     int numa_support = 0, tile_support = 0;
3363     if (__kmp_pu_os_idx)
3364       hT = hwloc_get_pu_obj_by_os_index(tp,
3365                                         __kmp_pu_os_idx[__kmp_avail_proc - 1]);
3366     else
3367       hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1);
3368     if (hT == NULL) { // something's gone wrong
3369       KMP_WARNING(AffHWSubsetUnsupported);
3370       goto _exit;
3371     }
3372     // check NUMA node
3373     hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
3374     hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
3375     if (hN != NULL && hN->depth > hS->depth) {
      numa_support = 1; // set if the socket contains NUMA node(s)
3377     } else if (__kmp_hws_node.num > 0) {
3378       // don't support sockets inside NUMA node (no such HW found for testing)
3379       KMP_WARNING(AffHWSubsetUnsupported);
3380       goto _exit;
3381     }
    // check L2 cache, get object by depth because of multiple caches
3383     L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
3384     hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT);
3385     if (hL != NULL &&
3386         __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1) {
      tile_support = 1; // no point counting L2 if it contains a single core
3388     } else if (__kmp_hws_tile.num > 0) {
3389       if (__kmp_hws_core.num == 0) {
3390         __kmp_hws_core = __kmp_hws_tile; // replace L2 with core
3391         __kmp_hws_tile.num = 0;
3392       } else {
        // L2 and core are both requested, but represent the same object
3394         KMP_WARNING(AffHWSubsetInvalid);
3395         goto _exit;
3396       }
3397     }
3398     // end of check of extensions -----------------------------------
3399 
3400     // fill in unset items, validate settings -----------------------
3401     if (__kmp_hws_socket.num == 0)
3402       __kmp_hws_socket.num = nPackages; // use all available sockets
3403     if (__kmp_hws_socket.offset >= nPackages) {
3404       KMP_WARNING(AffHWSubsetManySockets);
3405       goto _exit;
3406     }
3407     if (numa_support) {
3408       hN = NULL;
3409       int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE,
3410                                                   &hN); // num nodes in socket
3411       if (__kmp_hws_node.num == 0)
3412         __kmp_hws_node.num = NN; // use all available nodes
3413       if (__kmp_hws_node.offset >= NN) {
3414         KMP_WARNING(AffHWSubsetManyNodes);
3415         goto _exit;
3416       }
3417       if (tile_support) {
3418         // get num tiles in node
3419         int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
        if (__kmp_hws_tile.num == 0) {
          // use all available tiles; some node may have more tiles, hence +1
          __kmp_hws_tile.num = NL + 1;
        }
3423         if (__kmp_hws_tile.offset >= NL) {
3424           KMP_WARNING(AffHWSubsetManyTiles);
3425           goto _exit;
3426         }
3427         int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
3428                                                     &hC); // num cores in tile
3429         if (__kmp_hws_core.num == 0)
3430           __kmp_hws_core.num = NC; // use all available cores
3431         if (__kmp_hws_core.offset >= NC) {
3432           KMP_WARNING(AffHWSubsetManyCores);
3433           goto _exit;
3434         }
3435       } else { // tile_support
3436         int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE,
3437                                                     &hC); // num cores in node
3438         if (__kmp_hws_core.num == 0)
3439           __kmp_hws_core.num = NC; // use all available cores
3440         if (__kmp_hws_core.offset >= NC) {
3441           KMP_WARNING(AffHWSubsetManyCores);
3442           goto _exit;
3443         }
3444       } // tile_support
3445     } else { // numa_support
3446       if (tile_support) {
3447         // get num tiles in socket
3448         int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL);
3449         if (__kmp_hws_tile.num == 0)
3450           __kmp_hws_tile.num = NL; // use all available tiles
3451         if (__kmp_hws_tile.offset >= NL) {
3452           KMP_WARNING(AffHWSubsetManyTiles);
3453           goto _exit;
3454         }
3455         int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
3456                                                     &hC); // num cores in tile
3457         if (__kmp_hws_core.num == 0)
3458           __kmp_hws_core.num = NC; // use all available cores
3459         if (__kmp_hws_core.offset >= NC) {
3460           KMP_WARNING(AffHWSubsetManyCores);
3461           goto _exit;
3462         }
3463       } else { // tile_support
3464         int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE,
3465                                                     &hC); // num cores in socket
3466         if (__kmp_hws_core.num == 0)
3467           __kmp_hws_core.num = NC; // use all available cores
3468         if (__kmp_hws_core.offset >= NC) {
3469           KMP_WARNING(AffHWSubsetManyCores);
3470           goto _exit;
3471         }
3472       } // tile_support
3473     }
3474     if (__kmp_hws_proc.num == 0)
3475       __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs
3476     if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) {
3477       KMP_WARNING(AffHWSubsetManyProcs);
3478       goto _exit;
3479     }
3480     // end of validation --------------------------------------------
3481 
3482     if (pAddr) // pAddr is NULL in case of affinity_none
3483       newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) *
3484                                               __kmp_avail_proc); // max size
3485     // main loop to form HW subset ----------------------------------
3486     hS = NULL;
3487     int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE);
3488     for (int s = 0; s < NP; ++s) {
3489       // Check Socket -----------------------------------------------
3490       hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS);
3491       if (!__kmp_hwloc_obj_has_PUs(tp, hS))
3492         continue; // skip socket if all PUs are out of fullMask
      ++nS; // only count objects that have PUs in the affinity mask
3494       if (nS <= __kmp_hws_socket.offset ||
3495           nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) {
3496         n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket
3497         continue; // move to next socket
3498       }
3499       nCr = 0; // count number of cores per socket
3500       // socket requested, go down the topology tree
3501       // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile)
3502       if (numa_support) {
3503         nN = 0;
3504         hN = NULL;
3505         // num nodes in current socket
3506         int NN =
3507             __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, &hN);
3508         for (int n = 0; n < NN; ++n) {
3509           // Check NUMA Node ----------------------------------------
3510           if (!__kmp_hwloc_obj_has_PUs(tp, hN)) {
3511             hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
3512             continue; // skip node if all PUs are out of fullMask
3513           }
3514           ++nN;
3515           if (nN <= __kmp_hws_node.offset ||
3516               nN > __kmp_hws_node.num + __kmp_hws_node.offset) {
3517             // skip node as not requested
3518             n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // skip node
3519             hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
3520             continue; // move to next node
3521           }
3522           // node requested, go down the topology tree
3523           if (tile_support) {
3524             nL = 0;
3525             hL = NULL;
3526             int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
3527             for (int l = 0; l < NL; ++l) {
3528               // Check L2 (tile) ------------------------------------
3529               if (!__kmp_hwloc_obj_has_PUs(tp, hL)) {
3530                 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3531                 continue; // skip tile if all PUs are out of fullMask
3532               }
3533               ++nL;
3534               if (nL <= __kmp_hws_tile.offset ||
3535                   nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) {
3536                 // skip tile as not requested
3537                 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile
3538                 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3539                 continue; // move to next tile
3540               }
3541               // tile requested, go down the topology tree
3542               nC = 0;
3543               hC = NULL;
3544               // num cores in current tile
3545               int NC = __kmp_hwloc_count_children_by_type(tp, hL,
3546                                                           HWLOC_OBJ_CORE, &hC);
3547               for (int c = 0; c < NC; ++c) {
3548                 // Check Core ---------------------------------------
3549                 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
3550                   hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3551                   continue; // skip core if all PUs are out of fullMask
3552                 }
3553                 ++nC;
3554                 if (nC <= __kmp_hws_core.offset ||
3555                     nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
                  // skip core as not requested
                  n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
                  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
                  continue; // move to next core
3560                 }
3561                 // core requested, go down to PUs
3562                 nT = 0;
3563                 nTr = 0;
3564                 hT = NULL;
3565                 // num procs in current core
3566                 int NT = __kmp_hwloc_count_children_by_type(tp, hC,
3567                                                             HWLOC_OBJ_PU, &hT);
3568                 for (int t = 0; t < NT; ++t) {
3569                   // Check PU ---------------------------------------
3570                   idx = hT->os_index;
3571                   if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
3572                     hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3573                     continue; // skip PU if not in fullMask
3574                   }
3575                   ++nT;
3576                   if (nT <= __kmp_hws_proc.offset ||
3577                       nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
3578                     // skip PU
3579                     KMP_CPU_CLR(idx, __kmp_affin_fullMask);
3580                     ++n_old;
3581                     KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
3582                     hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
                    continue; // move to next PU
3584                   }
3585                   ++nTr;
3586                   if (pAddr) // collect requested thread's data
3587                     newAddr[n_new] = (*pAddr)[n_old];
3588                   ++n_new;
3589                   ++n_old;
3590                   hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3591                 } // threads loop
3592                 if (nTr > 0) {
3593                   ++nCr; // num cores per socket
3594                   ++nCo; // total num cores
3595                   if (nTr > nTpC)
3596                     nTpC = nTr; // calc max threads per core
3597                 }
3598                 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3599               } // cores loop
3600               hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3601             } // tiles loop
3602           } else { // tile_support
3603             // no tiles, check cores
3604             nC = 0;
3605             hC = NULL;
3606             // num cores in current node
3607             int NC =
3608                 __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, &hC);
3609             for (int c = 0; c < NC; ++c) {
3610               // Check Core ---------------------------------------
3611               if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
3612                 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3613                 continue; // skip core if all PUs are out of fullMask
3614               }
3615               ++nC;
3616               if (nC <= __kmp_hws_core.offset ||
3617                   nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
                // skip core as not requested
                n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
                hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
                continue; // move to next core
3622               }
3623               // core requested, go down to PUs
3624               nT = 0;
3625               nTr = 0;
3626               hT = NULL;
3627               int NT =
3628                   __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
3629               for (int t = 0; t < NT; ++t) {
3630                 // Check PU ---------------------------------------
3631                 idx = hT->os_index;
3632                 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
3633                   hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3634                   continue; // skip PU if not in fullMask
3635                 }
3636                 ++nT;
3637                 if (nT <= __kmp_hws_proc.offset ||
3638                     nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
3639                   // skip PU
3640                   KMP_CPU_CLR(idx, __kmp_affin_fullMask);
3641                   ++n_old;
3642                   KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
3643                   hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
                  continue; // move to next PU
3645                 }
3646                 ++nTr;
3647                 if (pAddr) // collect requested thread's data
3648                   newAddr[n_new] = (*pAddr)[n_old];
3649                 ++n_new;
3650                 ++n_old;
3651                 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3652               } // threads loop
3653               if (nTr > 0) {
3654                 ++nCr; // num cores per socket
3655                 ++nCo; // total num cores
3656                 if (nTr > nTpC)
3657                   nTpC = nTr; // calc max threads per core
3658               }
3659               hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3660             } // cores loop
3661           } // tiles support
3662           hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
3663         } // nodes loop
3664       } else { // numa_support
3665         // no NUMA support
3666         if (tile_support) {
3667           nL = 0;
3668           hL = NULL;
3669           // num tiles in current socket
3670           int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL);
3671           for (int l = 0; l < NL; ++l) {
3672             // Check L2 (tile) ------------------------------------
3673             if (!__kmp_hwloc_obj_has_PUs(tp, hL)) {
3674               hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3675               continue; // skip tile if all PUs are out of fullMask
3676             }
3677             ++nL;
3678             if (nL <= __kmp_hws_tile.offset ||
3679                 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) {
3680               // skip tile as not requested
3681               n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile
3682               hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3683               continue; // move to next tile
3684             }
3685             // tile requested, go down the topology tree
3686             nC = 0;
3687             hC = NULL;
3688             // num cores per tile
3689             int NC =
3690                 __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC);
3691             for (int c = 0; c < NC; ++c) {
3692               // Check Core ---------------------------------------
3693               if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
3694                 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3695                 continue; // skip core if all PUs are out of fullMask
3696               }
3697               ++nC;
3698               if (nC <= __kmp_hws_core.offset ||
3699                   nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
                // skip core as not requested
                n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
                hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
                continue; // move to next core
3704               }
3705               // core requested, go down to PUs
3706               nT = 0;
3707               nTr = 0;
3708               hT = NULL;
3709               // num procs per core
3710               int NT =
3711                   __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
3712               for (int t = 0; t < NT; ++t) {
3713                 // Check PU ---------------------------------------
3714                 idx = hT->os_index;
3715                 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
3716                   hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3717                   continue; // skip PU if not in fullMask
3718                 }
3719                 ++nT;
3720                 if (nT <= __kmp_hws_proc.offset ||
3721                     nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
3722                   // skip PU
3723                   KMP_CPU_CLR(idx, __kmp_affin_fullMask);
3724                   ++n_old;
3725                   KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
3726                   hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
                  continue; // move to next PU
3728                 }
3729                 ++nTr;
3730                 if (pAddr) // collect requested thread's data
3731                   newAddr[n_new] = (*pAddr)[n_old];
3732                 ++n_new;
3733                 ++n_old;
3734                 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3735               } // threads loop
3736               if (nTr > 0) {
3737                 ++nCr; // num cores per socket
3738                 ++nCo; // total num cores
3739                 if (nTr > nTpC)
3740                   nTpC = nTr; // calc max threads per core
3741               }
3742               hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3743             } // cores loop
3744             hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3745           } // tiles loop
3746         } else { // tile_support
3747           // no tiles, check cores
3748           nC = 0;
3749           hC = NULL;
3750           // num cores in socket
3751           int NC =
3752               __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, &hC);
3753           for (int c = 0; c < NC; ++c) {
3754             // Check Core -------------------------------------------
3755             if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
3756               hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3757               continue; // skip core if all PUs are out of fullMask
3758             }
3759             ++nC;
3760             if (nC <= __kmp_hws_core.offset ||
3761                 nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
              // skip core as not requested
              n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
              hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
              continue; // move to next core
3766             }
3767             // core requested, go down to PUs
3768             nT = 0;
3769             nTr = 0;
3770             hT = NULL;
3771             // num procs per core
3772             int NT =
3773                 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
3774             for (int t = 0; t < NT; ++t) {
3775               // Check PU ---------------------------------------
3776               idx = hT->os_index;
3777               if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
3778                 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3779                 continue; // skip PU if not in fullMask
3780               }
3781               ++nT;
3782               if (nT <= __kmp_hws_proc.offset ||
3783                   nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
3784                 // skip PU
3785                 KMP_CPU_CLR(idx, __kmp_affin_fullMask);
3786                 ++n_old;
3787                 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
3788                 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
                continue; // move to next PU
3790               }
3791               ++nTr;
3792               if (pAddr) // collect requested thread's data
3793                 newAddr[n_new] = (*pAddr)[n_old];
3794               ++n_new;
3795               ++n_old;
3796               hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3797             } // threads loop
3798             if (nTr > 0) {
3799               ++nCr; // num cores per socket
3800               ++nCo; // total num cores
3801               if (nTr > nTpC)
3802                 nTpC = nTr; // calc max threads per core
3803             }
3804             hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3805           } // cores loop
3806         } // tiles support
3807       } // numa_support
3808       if (nCr > 0) { // found cores?
3809         ++nPkg; // num sockets
3810         if (nCr > nCpP)
3811           nCpP = nCr; // calc max cores per socket
3812       }
3813     } // sockets loop
3814 
    // check that the subset is valid
3816     KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc);
3817     KMP_DEBUG_ASSERT(nPkg > 0);
3818     KMP_DEBUG_ASSERT(nCpP > 0);
3819     KMP_DEBUG_ASSERT(nTpC > 0);
3820     KMP_DEBUG_ASSERT(nCo > 0);
3821     KMP_DEBUG_ASSERT(nPkg <= nPackages);
3822     KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg);
3823     KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore);
3824     KMP_DEBUG_ASSERT(nCo <= __kmp_ncores);
3825 
3826     nPackages = nPkg; // correct num sockets
3827     nCoresPerPkg = nCpP; // correct num cores per socket
3828     __kmp_nThreadsPerCore = nTpC; // correct num threads per core
3829     __kmp_avail_proc = n_new; // correct num procs
3830     __kmp_ncores = nCo; // correct num cores
3831     // hwloc topology method end
3832   } else
3833 #endif // KMP_USE_HWLOC
3834   {
3835     int n_old = 0, n_new = 0, proc_num = 0;
3836     if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) {
3837       KMP_WARNING(AffHWSubsetNoHWLOC);
3838       goto _exit;
3839     }
3840     if (__kmp_hws_socket.num == 0)
3841       __kmp_hws_socket.num = nPackages; // use all available sockets
3842     if (__kmp_hws_core.num == 0)
3843       __kmp_hws_core.num = nCoresPerPkg; // use all available cores
3844     if (__kmp_hws_proc.num == 0 || __kmp_hws_proc.num > __kmp_nThreadsPerCore)
3845       __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts
3846     if (!__kmp_affinity_uniform_topology()) {
3847       KMP_WARNING(AffHWSubsetNonUniform);
3848       goto _exit; // don't support non-uniform topology
3849     }
3850     if (depth > 3) {
3851       KMP_WARNING(AffHWSubsetNonThreeLevel);
      goto _exit; // don't support non-3-level topologies
3853     }
3854     if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) {
3855       KMP_WARNING(AffHWSubsetManySockets);
3856       goto _exit;
3857     }
3858     if (__kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg) {
3859       KMP_WARNING(AffHWSubsetManyCores);
3860       goto _exit;
3861     }
3862     // Form the requested subset
3863     if (pAddr) // pAddr is NULL in case of affinity_none
3864       newAddr = (AddrUnsPair *)__kmp_allocate(
3865           sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_core.num *
3866           __kmp_hws_proc.num);
3867     for (int i = 0; i < nPackages; ++i) {
3868       if (i < __kmp_hws_socket.offset ||
3869           i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) {
3870         // skip not-requested socket
3871         n_old += nCoresPerPkg * __kmp_nThreadsPerCore;
3872         if (__kmp_pu_os_idx != NULL) {
3873           // walk through skipped socket
3874           for (int j = 0; j < nCoresPerPkg; ++j) {
3875             for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
3876               KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3877               ++proc_num;
3878             }
3879           }
3880         }
3881       } else {
3882         // walk through requested socket
3883         for (int j = 0; j < nCoresPerPkg; ++j) {
3884           if (j < __kmp_hws_core.offset ||
3885               j >= __kmp_hws_core.offset +
3886                        __kmp_hws_core.num) { // skip not-requested core
3887             n_old += __kmp_nThreadsPerCore;
3888             if (__kmp_pu_os_idx != NULL) {
3889               for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
3890                 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3891                 ++proc_num;
3892               }
3893             }
3894           } else {
3895             // walk through requested core
3896             for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
3897               if (k < __kmp_hws_proc.num) {
3898                 if (pAddr) // collect requested thread's data
3899                   newAddr[n_new] = (*pAddr)[n_old];
3900                 n_new++;
3901               } else {
3902                 if (__kmp_pu_os_idx != NULL)
3903                   KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3904               }
3905               n_old++;
3906               ++proc_num;
3907             }
3908           }
3909         }
3910       }
3911     }
3912     KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore);
3913     KMP_DEBUG_ASSERT(n_new ==
3914                      __kmp_hws_socket.num * __kmp_hws_core.num *
3915                          __kmp_hws_proc.num);
3916     nPackages = __kmp_hws_socket.num; // correct nPackages
3917     nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg
3918     __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore
3919     __kmp_avail_proc = n_new; // correct avail_proc
3920     __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores
3921   } // non-hwloc topology method
3922   if (pAddr) {
3923     __kmp_free(*pAddr);
3924     *pAddr = newAddr; // replace old topology with new one
3925   }
3926   if (__kmp_affinity_verbose) {
3927     char m[KMP_AFFIN_MASK_PRINT_LEN];
3928     __kmp_affinity_print_mask(m, KMP_AFFIN_MASK_PRINT_LEN,
3929                               __kmp_affin_fullMask);
3930     if (__kmp_affinity_respect_mask) {
3931       KMP_INFORM(InitOSProcSetRespect, "KMP_HW_SUBSET", m);
3932     } else {
3933       KMP_INFORM(InitOSProcSetNotRespect, "KMP_HW_SUBSET", m);
3934     }
3935     KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc);
3936     kmp_str_buf_t buf;
3937     __kmp_str_buf_init(&buf);
3938     __kmp_str_buf_print(&buf, "%d", nPackages);
3939     KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg,
3940                __kmp_nThreadsPerCore, __kmp_ncores);
3941     __kmp_str_buf_free(&buf);
3942   }
3943 _exit:
3944   if (__kmp_pu_os_idx != NULL) {
3945     __kmp_free(__kmp_pu_os_idx);
3946     __kmp_pu_os_idx = NULL;
3947   }
3948 }
3949 
3950 // This function figures out the deepest level at which there is at least one
3951 // cluster/core with more than one processing unit bound to it.
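// For example (illustrative), with depth-3 addresses labeled
// (package, core, thread) and at least one core exposing two hardware threads,
// the deepest level with a nonzero label is the thread level, so the function
// returns bottom_level - 1, i.e. the core level.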
3952 static int __kmp_affinity_find_core_level(const AddrUnsPair *address2os,
3953                                           int nprocs, int bottom_level) {
3954   int core_level = 0;
3955 
3956   for (int i = 0; i < nprocs; i++) {
3957     for (int j = bottom_level; j > 0; j--) {
3958       if (address2os[i].first.labels[j] > 0) {
3959         if (core_level < (j - 1)) {
3960           core_level = j - 1;
3961         }
3962       }
3963     }
3964   }
3965   return core_level;
3966 }
3967 
// This function counts the number of clusters/cores at the given level.
3969 static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os,
3970                                          int nprocs, int bottom_level,
3971                                          int core_level) {
3972   int ncores = 0;
3973   int i, j;
3974 
3975   j = bottom_level;
3976   for (i = 0; i < nprocs; i++) {
3977     for (j = bottom_level; j > core_level; j--) {
3978       if ((i + 1) < nprocs) {
3979         if (address2os[i + 1].first.labels[j] > 0) {
3980           break;
3981         }
3982       }
3983     }
3984     if (j == core_level) {
3985       ncores++;
3986     }
3987   }
3988   if (j > core_level) {
    // In case of (nprocs < __kmp_avail_proc) we may end up too deep and miss
    // one core. This may occur when called from __kmp_affinity_find_core().
3991     ncores++;
3992   }
3993   return ncores;
3994 }
3995 
// This function finds the cluster/core to which the given processing unit is
// bound.
3997 static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc,
3998                                     int bottom_level, int core_level) {
3999   return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level,
4000                                        core_level) -
4001          1;
4002 }
4003 
// This function finds the maximal number of processing units bound to a
// cluster/core at the given level.
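// For example (illustrative), if core_level indexes the core level and some
// core carries hardware threads labeled 0 and 1, the result is 2.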
4006 static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os,
4007                                             int nprocs, int bottom_level,
4008                                             int core_level) {
4009   int maxprocpercore = 0;
4010 
4011   if (core_level < bottom_level) {
4012     for (int i = 0; i < nprocs; i++) {
4013       int percore = address2os[i].first.labels[core_level + 1] + 1;
4014 
4015       if (percore > maxprocpercore) {
4016         maxprocpercore = percore;
4017       }
4018     }
4019   } else {
4020     maxprocpercore = 1;
4021   }
4022   return maxprocpercore;
4023 }
4024 
4025 static AddrUnsPair *address2os = NULL;
4026 static int *procarr = NULL;
4027 static int __kmp_aff_depth = 0;
4028 
4029 #if KMP_USE_HIER_SCHED
4030 #define KMP_EXIT_AFF_NONE                                                      \
4031   KMP_ASSERT(__kmp_affinity_type == affinity_none);                            \
4032   KMP_ASSERT(address2os == NULL);                                              \
4033   __kmp_apply_thread_places(NULL, 0);                                          \
4034   __kmp_create_affinity_none_places();                                         \
4035   __kmp_dispatch_set_hierarchy_values();                                       \
4036   return;
4037 #else
4038 #define KMP_EXIT_AFF_NONE                                                      \
4039   KMP_ASSERT(__kmp_affinity_type == affinity_none);                            \
4040   KMP_ASSERT(address2os == NULL);                                              \
4041   __kmp_apply_thread_places(NULL, 0);                                          \
4042   __kmp_create_affinity_none_places();                                         \
4043   return;
4044 #endif
4045 
// Create a one-element mask array (set of places) which contains only the
// initial process's affinity mask
4048 static void __kmp_create_affinity_none_places() {
4049   KMP_ASSERT(__kmp_affin_fullMask != NULL);
4050   KMP_ASSERT(__kmp_affinity_type == affinity_none);
4051   __kmp_affinity_num_masks = 1;
4052   KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
4053   kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, 0);
4054   KMP_CPU_COPY(dest, __kmp_affin_fullMask);
4055 }
4056 
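// Comparator used when permuting the topology according to
// __kmp_affinity_compact (e.g. for KMP_AFFINITY=compact): the innermost
// __kmp_affinity_compact levels are compared first, followed by the remaining
// outer levels.  For example (illustrative), with depth == 3 levels
// (package, core, thread) and __kmp_affinity_compact == 1, addresses are
// ordered primarily by thread number, then by package, then by core.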
4057 static int __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) {
4058   const Address *aa = &(((const AddrUnsPair *)a)->first);
4059   const Address *bb = &(((const AddrUnsPair *)b)->first);
4060   unsigned depth = aa->depth;
4061   unsigned i;
4062   KMP_DEBUG_ASSERT(depth == bb->depth);
4063   KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
4064   KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
4065   for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
4066     int j = depth - i - 1;
4067     if (aa->childNums[j] < bb->childNums[j])
4068       return -1;
4069     if (aa->childNums[j] > bb->childNums[j])
4070       return 1;
4071   }
4072   for (; i < depth; i++) {
4073     int j = i - __kmp_affinity_compact;
4074     if (aa->childNums[j] < bb->childNums[j])
4075       return -1;
4076     if (aa->childNums[j] > bb->childNums[j])
4077       return 1;
4078   }
4079   return 0;
4080 }
4081 
4082 static void __kmp_aux_affinity_initialize(void) {
4083   if (__kmp_affinity_masks != NULL) {
4084     KMP_ASSERT(__kmp_affin_fullMask != NULL);
4085     return;
4086   }
4087 
4088   // Create the "full" mask - this defines all of the processors that we
4089   // consider to be in the machine model. If respect is set, then it is the
4090   // initialization thread's affinity mask. Otherwise, it is all processors that
4091   // we know about on the machine.
4092   if (__kmp_affin_fullMask == NULL) {
4093     KMP_CPU_ALLOC(__kmp_affin_fullMask);
4094   }
4095   if (KMP_AFFINITY_CAPABLE()) {
4096     if (__kmp_affinity_respect_mask) {
4097       __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
4098 
4099       // Count the number of available processors.
4100       unsigned i;
4101       __kmp_avail_proc = 0;
4102       KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
4103         if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
4104           continue;
4105         }
4106         __kmp_avail_proc++;
4107       }
4108       if (__kmp_avail_proc > __kmp_xproc) {
4109         if (__kmp_affinity_verbose ||
4110             (__kmp_affinity_warnings &&
4111              (__kmp_affinity_type != affinity_none))) {
4112           KMP_WARNING(ErrorInitializeAffinity);
4113         }
4114         __kmp_affinity_type = affinity_none;
4115         KMP_AFFINITY_DISABLE();
4116         return;
4117       }
4118     } else {
4119       __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
4120       __kmp_avail_proc = __kmp_xproc;
4121     }
4122   }
4123 
  // check whether the user's tile-granularity request can be honored
  if (__kmp_affinity_gran == affinity_gran_tile &&
      __kmp_affinity_dispatch->get_api_type() == KMPAffinity::NATIVE_OS) {
4127     KMP_WARNING(AffTilesNoHWLOC, "KMP_AFFINITY");
4128     __kmp_affinity_gran = affinity_gran_package;
4129   }
4130 
4131   int depth = -1;
4132   kmp_i18n_id_t msg_id = kmp_i18n_null;
4133 
4134   // For backward compatibility, setting KMP_CPUINFO_FILE =>
4135   // KMP_TOPOLOGY_METHOD=cpuinfo
4136   if ((__kmp_cpuinfo_file != NULL) &&
4137       (__kmp_affinity_top_method == affinity_top_method_all)) {
4138     __kmp_affinity_top_method = affinity_top_method_cpuinfo;
4139   }
4140 
4141   if (__kmp_affinity_top_method == affinity_top_method_all) {
    // In the default code path, errors are not fatal - we just try using
    // another method. We only emit a warning message if affinity is on or the
    // verbose flag is set, and the nowarnings flag was not set.
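    // As the code below shows, the probe order is: hwloc (when selected as the
    // affinity API), x2APIC id decoding, legacy APIC ids, /proc/cpuinfo,
    // Windows processor groups, and finally a flat OS-proc map.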
4145     const char *file_name = NULL;
4146     int line = 0;
4147 #if KMP_USE_HWLOC
4148     if (depth < 0 &&
4149         __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
4150       if (__kmp_affinity_verbose) {
4151         KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
4152       }
4153       if (!__kmp_hwloc_error) {
4154         depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
4155         if (depth == 0) {
4156           KMP_EXIT_AFF_NONE;
4157         } else if (depth < 0 && __kmp_affinity_verbose) {
4158           KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
4159         }
4160       } else if (__kmp_affinity_verbose) {
4161         KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
4162       }
4163     }
4164 #endif
4165 
4166 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4167 
4168     if (depth < 0) {
4169       if (__kmp_affinity_verbose) {
4170         KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
4171       }
4172 
4173       file_name = NULL;
4174       depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
4175       if (depth == 0) {
4176         KMP_EXIT_AFF_NONE;
4177       }
4178 
4179       if (depth < 0) {
4180         if (__kmp_affinity_verbose) {
4181           if (msg_id != kmp_i18n_null) {
4182             KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY",
4183                        __kmp_i18n_catgets(msg_id),
4184                        KMP_I18N_STR(DecodingLegacyAPIC));
4185           } else {
4186             KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
4187                        KMP_I18N_STR(DecodingLegacyAPIC));
4188           }
4189         }
4190 
4191         file_name = NULL;
4192         depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
4193         if (depth == 0) {
4194           KMP_EXIT_AFF_NONE;
4195         }
4196       }
4197     }
4198 
4199 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4200 
4201 #if KMP_OS_LINUX
4202 
4203     if (depth < 0) {
4204       if (__kmp_affinity_verbose) {
4205         if (msg_id != kmp_i18n_null) {
4206           KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY",
4207                      __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
4208         } else {
4209           KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
4210         }
4211       }
4212 
4213       FILE *f = fopen("/proc/cpuinfo", "r");
4214       if (f == NULL) {
4215         msg_id = kmp_i18n_str_CantOpenCpuinfo;
4216       } else {
4217         file_name = "/proc/cpuinfo";
4218         depth =
4219             __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
4220         fclose(f);
4221         if (depth == 0) {
4222           KMP_EXIT_AFF_NONE;
4223         }
4224       }
4225     }
4226 
4227 #endif /* KMP_OS_LINUX */
4228 
4229 #if KMP_GROUP_AFFINITY
4230 
4231     if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
4232       if (__kmp_affinity_verbose) {
4233         KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
4234       }
4235 
4236       depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
4237       KMP_ASSERT(depth != 0);
4238     }
4239 
4240 #endif /* KMP_GROUP_AFFINITY */
4241 
4242     if (depth < 0) {
4243       if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
4244         if (file_name == NULL) {
4245           KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
4246         } else if (line == 0) {
4247           KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
4248         } else {
4249           KMP_INFORM(UsingFlatOSFileLine, file_name, line,
4250                      __kmp_i18n_catgets(msg_id));
4251         }
4252       }
4253       // FIXME - print msg if msg_id = kmp_i18n_null ???
4254 
4255       file_name = "";
4256       depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
4257       if (depth == 0) {
4258         KMP_EXIT_AFF_NONE;
4259       }
4260       KMP_ASSERT(depth > 0);
4261       KMP_ASSERT(address2os != NULL);
4262     }
4263   }
4264 
4265 #if KMP_USE_HWLOC
4266   else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
4267     KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
4268     if (__kmp_affinity_verbose) {
4269       KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
4270     }
4271     depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
4272     if (depth == 0) {
4273       KMP_EXIT_AFF_NONE;
4274     }
4275   }
4276 #endif // KMP_USE_HWLOC
4277 
4278 // If the user has specified that a particular topology discovery method is to
4279 // be used, then we abort if that method fails. The exception is group affinity,
4280 // which might have been implicitly set.
4281 
4282 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4283 
4284   else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
4285     if (__kmp_affinity_verbose) {
4286       KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
4287     }
4288 
4289     depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
4290     if (depth == 0) {
4291       KMP_EXIT_AFF_NONE;
4292     }
4293     if (depth < 0) {
4294       KMP_ASSERT(msg_id != kmp_i18n_null);
4295       KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4296     }
4297   } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
4298     if (__kmp_affinity_verbose) {
4299       KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
4300     }
4301 
4302     depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
4303     if (depth == 0) {
4304       KMP_EXIT_AFF_NONE;
4305     }
4306     if (depth < 0) {
4307       KMP_ASSERT(msg_id != kmp_i18n_null);
4308       KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4309     }
4310   }
4311 
4312 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4313 
4314   else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
4315     const char *filename;
4316     if (__kmp_cpuinfo_file != NULL) {
4317       filename = __kmp_cpuinfo_file;
4318     } else {
4319       filename = "/proc/cpuinfo";
4320     }
4321 
4322     if (__kmp_affinity_verbose) {
4323       KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
4324     }
4325 
4326     FILE *f = fopen(filename, "r");
4327     if (f == NULL) {
4328       int code = errno;
4329       if (__kmp_cpuinfo_file != NULL) {
4330         __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code),
4331                     KMP_HNT(NameComesFrom_CPUINFO_FILE), __kmp_msg_null);
4332       } else {
4333         __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code),
4334                     __kmp_msg_null);
4335       }
4336     }
4337     int line = 0;
4338     depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
4339     fclose(f);
4340     if (depth < 0) {
4341       KMP_ASSERT(msg_id != kmp_i18n_null);
4342       if (line > 0) {
4343         KMP_FATAL(FileLineMsgExiting, filename, line,
4344                   __kmp_i18n_catgets(msg_id));
4345       } else {
4346         KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
4347       }
4348     }
4349     if (__kmp_affinity_type == affinity_none) {
4350       KMP_ASSERT(depth == 0);
4351       KMP_EXIT_AFF_NONE;
4352     }
4353   }
4354 
4355 #if KMP_GROUP_AFFINITY
4356 
4357   else if (__kmp_affinity_top_method == affinity_top_method_group) {
4358     if (__kmp_affinity_verbose) {
4359       KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
4360     }
4361 
4362     depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
4363     KMP_ASSERT(depth != 0);
4364     if (depth < 0) {
4365       KMP_ASSERT(msg_id != kmp_i18n_null);
4366       KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4367     }
4368   }
4369 
4370 #endif /* KMP_GROUP_AFFINITY */
4371 
4372   else if (__kmp_affinity_top_method == affinity_top_method_flat) {
4373     if (__kmp_affinity_verbose) {
4374       KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
4375     }
4376 
4377     depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
4378     if (depth == 0) {
4379       KMP_EXIT_AFF_NONE;
4380     }
4381     // should not fail
4382     KMP_ASSERT(depth > 0);
4383     KMP_ASSERT(address2os != NULL);
4384   }
4385 
4386 #if KMP_USE_HIER_SCHED
4387   __kmp_dispatch_set_hierarchy_values();
4388 #endif
4389 
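  // If none of the topology discovery methods produced a map, give up on
  // affinity: warn (when requested), switch to affinity_none, and disable
  // the affinity API.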
4390   if (address2os == NULL) {
4391     if (KMP_AFFINITY_CAPABLE() &&
4392         (__kmp_affinity_verbose ||
4393          (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) {
4394       KMP_WARNING(ErrorInitializeAffinity);
4395     }
4396     __kmp_affinity_type = affinity_none;
4397     __kmp_create_affinity_none_places();
4398     KMP_AFFINITY_DISABLE();
4399     return;
4400   }
4401 
4402   if (__kmp_affinity_gran == affinity_gran_tile
4403 #if KMP_USE_HWLOC
4404       && __kmp_tile_depth == 0
4405 #endif
4406       ) {
4407     // Tiles were requested but not detected; warn the user.
4408     KMP_WARNING(AffTilesNoTiles, "KMP_AFFINITY");
4409   }
4410 
4411   __kmp_apply_thread_places(&address2os, depth);
4412 
4413   // Create the table of masks, indexed by thread Id.
4414   unsigned maxIndex;
4415   unsigned numUnique;
4416   kmp_affin_mask_t *osId2Mask =
4417       __kmp_create_masks(&maxIndex, &numUnique, address2os, __kmp_avail_proc);
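  // With zero granularity levels every OS proc forms its own group, so the
  // number of unique masks must match the number of available procs.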
4418   if (__kmp_affinity_gran_levels == 0) {
4419     KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
4420   }
4421 
4422   // Set the childNums vector in all Address objects. This must be done before
4423   // we can sort using __kmp_affinity_cmp_Address_child_num(), which takes into
4424   // account the setting of __kmp_affinity_compact.
4425   __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
4426 
4427   switch (__kmp_affinity_type) {
4428 
4429   case affinity_explicit:
4430     KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
4431 #if OMP_40_ENABLED
4432     if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
4433 #endif
4434     {
4435       __kmp_affinity_process_proclist(
4436           &__kmp_affinity_masks, &__kmp_affinity_num_masks,
4437           __kmp_affinity_proclist, osId2Mask, maxIndex);
4438     }
4439 #if OMP_40_ENABLED
4440     else {
4441       __kmp_affinity_process_placelist(
4442           &__kmp_affinity_masks, &__kmp_affinity_num_masks,
4443           __kmp_affinity_proclist, osId2Mask, maxIndex);
4444     }
4445 #endif
4446     if (__kmp_affinity_num_masks == 0) {
4447       if (__kmp_affinity_verbose ||
4448           (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
4449         KMP_WARNING(AffNoValidProcID);
4450       }
4451       __kmp_affinity_type = affinity_none;
4452       return;
4453     }
4454     break;
4455 
4456   // The other affinity types rely on sorting the Addresses according to some
4457   // permutation of the machine topology tree. Set __kmp_affinity_compact and
4458   // __kmp_affinity_offset appropriately, then jump to a common code fragment
4459   // to do the sort and create the array of affinity masks.
4460 
4461   case affinity_logical:
4462     __kmp_affinity_compact = 0;
4463     if (__kmp_affinity_offset) {
4464       __kmp_affinity_offset =
4465           __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
4466     }
4467     goto sortAddresses;
4468 
4469   case affinity_physical:
4470     if (__kmp_nThreadsPerCore > 1) {
4471       __kmp_affinity_compact = 1;
4472       if (__kmp_affinity_compact >= depth) {
4473         __kmp_affinity_compact = 0;
4474       }
4475     } else {
4476       __kmp_affinity_compact = 0;
4477     }
4478     if (__kmp_affinity_offset) {
4479       __kmp_affinity_offset =
4480           __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
4481     }
4482     goto sortAddresses;
4483 
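  // Scatter is the inverse of compact: flipping __kmp_affinity_compact makes
  // the sort vary the outermost topology level fastest, so consecutive masks
  // are spread across packages/cores instead of being packed together.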
4484   case affinity_scatter:
4485     if (__kmp_affinity_compact >= depth) {
4486       __kmp_affinity_compact = 0;
4487     } else {
4488       __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
4489     }
4490     goto sortAddresses;
4491 
4492   case affinity_compact:
4493     if (__kmp_affinity_compact >= depth) {
4494       __kmp_affinity_compact = depth - 1;
4495     }
4496     goto sortAddresses;
4497 
4498   case affinity_balanced:
4499     if (depth <= 1) {
4500       if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
4501         KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
4502       }
4503       __kmp_affinity_type = affinity_none;
4504       return;
4505     } else if (__kmp_affinity_uniform_topology()) {
4506       break;
4507     } else { // Non-uniform topology
4508 
4509       // Save the depth for further usage
4510       __kmp_aff_depth = depth;
4511 
4512       int core_level = __kmp_affinity_find_core_level(
4513           address2os, __kmp_avail_proc, depth - 1);
4514       int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc,
4515                                                  depth - 1, core_level);
4516       int maxprocpercore = __kmp_affinity_max_proc_per_core(
4517           address2os, __kmp_avail_proc, depth - 1, core_level);
4518 
4519       int nproc = ncores * maxprocpercore;
4520       if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
4521         if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
4522           KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
4523         }
4524         __kmp_affinity_type = affinity_none;
4525         return;
4526       }
4527 
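      // Build procarr[], an ncores x maxprocpercore table recording, for each
      // core, the OS proc ids of its hardware contexts (-1 marks unused
      // slots). It is consumed later by __kmp_balanced_affinity().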
4528       procarr = (int *)__kmp_allocate(sizeof(int) * nproc);
4529       for (int i = 0; i < nproc; i++) {
4530         procarr[i] = -1;
4531       }
4532 
4533       int lastcore = -1;
4534       int inlastcore = 0;
4535       for (int i = 0; i < __kmp_avail_proc; i++) {
4536         int proc = address2os[i].second;
4537         int core =
4538             __kmp_affinity_find_core(address2os, i, depth - 1, core_level);
4539 
4540         if (core == lastcore) {
4541           inlastcore++;
4542         } else {
4543           inlastcore = 0;
4544         }
4545         lastcore = core;
4546 
4547         procarr[core * maxprocpercore + inlastcore] = proc;
4548       }
4549 
4550       break;
4551     }
4552 
4553   sortAddresses:
4554     // Allocate the gtid->affinity mask table.
4555     if (__kmp_affinity_dups) {
4556       __kmp_affinity_num_masks = __kmp_avail_proc;
4557     } else {
4558       __kmp_affinity_num_masks = numUnique;
4559     }
4560 
4561 #if OMP_40_ENABLED
4562     if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
4563         (__kmp_affinity_num_places > 0) &&
4564         ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) {
4565       __kmp_affinity_num_masks = __kmp_affinity_num_places;
4566     }
4567 #endif
4568 
4569     KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
4570 
4571     // Sort the address2os table according to the current setting of
4572     // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
4573     qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
4574           __kmp_affinity_cmp_Address_child_num);
4575     {
4576       int i;
4577       unsigned j;
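      // When duplicates are disabled, only the leader of each granularity
      // group contributes a mask.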
4578       for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
4579         if ((!__kmp_affinity_dups) && (!address2os[i].first.leader)) {
4580           continue;
4581         }
4582         unsigned osId = address2os[i].second;
4583         kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
4584         kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j);
4585         KMP_ASSERT(KMP_CPU_ISSET(osId, src));
4586         KMP_CPU_COPY(dest, src);
4587         if (++j >= __kmp_affinity_num_masks) {
4588           break;
4589         }
4590       }
4591       KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
4592     }
4593     break;
4594 
4595   default:
4596     KMP_ASSERT2(0, "Unexpected affinity setting");
4597   }
4598 
4599   KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1);
4600   machine_hierarchy.init(address2os, __kmp_avail_proc);
4601 }
4602 #undef KMP_EXIT_AFF_NONE
4603 
4604 void __kmp_affinity_initialize(void) {
4605   // Much of the code above was written assuming that if a machine was not
4606   // affinity capable, then __kmp_affinity_type == affinity_none.  We now
4607   // explicitly represent this as __kmp_affinity_type == affinity_disabled.
4608   // There are too many checks for __kmp_affinity_type == affinity_none
4609   // in this code.  Instead of trying to change them all, check if
4610   // __kmp_affinity_type == affinity_disabled, and if so, slam it with
4611   // affinity_none, call the real initialization routine, then restore
4612   // __kmp_affinity_type to affinity_disabled.
4613   int disabled = (__kmp_affinity_type == affinity_disabled);
4614   if (!KMP_AFFINITY_CAPABLE()) {
4615     KMP_ASSERT(disabled);
4616   }
4617   if (disabled) {
4618     __kmp_affinity_type = affinity_none;
4619   }
4620   __kmp_aux_affinity_initialize();
4621   if (disabled) {
4622     __kmp_affinity_type = affinity_disabled;
4623   }
4624 }
4625 
4626 void __kmp_affinity_uninitialize(void) {
4627   if (__kmp_affinity_masks != NULL) {
4628     KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
4629     __kmp_affinity_masks = NULL;
4630   }
4631   if (__kmp_affin_fullMask != NULL) {
4632     KMP_CPU_FREE(__kmp_affin_fullMask);
4633     __kmp_affin_fullMask = NULL;
4634   }
4635   __kmp_affinity_num_masks = 0;
4636   __kmp_affinity_type = affinity_default;
4637 #if OMP_40_ENABLED
4638   __kmp_affinity_num_places = 0;
4639 #endif
4640   if (__kmp_affinity_proclist != NULL) {
4641     __kmp_free(__kmp_affinity_proclist);
4642     __kmp_affinity_proclist = NULL;
4643   }
4644   if (address2os != NULL) {
4645     __kmp_free(address2os);
4646     address2os = NULL;
4647   }
4648   if (procarr != NULL) {
4649     __kmp_free(procarr);
4650     procarr = NULL;
4651   }
4652 #if KMP_USE_HWLOC
4653   if (__kmp_hwloc_topology != NULL) {
4654     hwloc_topology_destroy(__kmp_hwloc_topology);
4655     __kmp_hwloc_topology = NULL;
4656   }
4657 #endif
4658   KMPAffinity::destroy_api();
4659 }
4660 
4661 void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
4662   if (!KMP_AFFINITY_CAPABLE()) {
4663     return;
4664   }
4665 
4666   kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4667   if (th->th.th_affin_mask == NULL) {
4668     KMP_CPU_ALLOC(th->th.th_affin_mask);
4669   } else {
4670     KMP_CPU_ZERO(th->th.th_affin_mask);
4671   }
4672 
4673   // Copy the thread mask to the kmp_info_t structure. If
4674   // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that
4675   // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set,
4676   // then the full mask is the same as the mask of the initialization thread.
4677   kmp_affin_mask_t *mask;
4678   int i;
4679 
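  // Pick the mask this thread is initially bound to: either the full mask
  // (for affinity_none/affinity_balanced, or under OMP 4.0 when the thread is
  // not a root or proc_bind is false), or one entry of __kmp_affinity_masks
  // selected by gtid and __kmp_affinity_offset.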
4680 #if OMP_40_ENABLED
4681   if (KMP_AFFINITY_NON_PROC_BIND)
4682 #endif
4683   {
4684     if ((__kmp_affinity_type == affinity_none) ||
4685         (__kmp_affinity_type == affinity_balanced)) {
4686 #if KMP_GROUP_AFFINITY
4687       if (__kmp_num_proc_groups > 1) {
4688         return;
4689       }
4690 #endif
4691       KMP_ASSERT(__kmp_affin_fullMask != NULL);
4692       i = 0;
4693       mask = __kmp_affin_fullMask;
4694     } else {
4695       KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
4696       i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4697       mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4698     }
4699   }
4700 #if OMP_40_ENABLED
4701   else {
4702     if ((!isa_root) ||
4703         (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
4704 #if KMP_GROUP_AFFINITY
4705       if (__kmp_num_proc_groups > 1) {
4706         return;
4707       }
4708 #endif
4709       KMP_ASSERT(__kmp_affin_fullMask != NULL);
4710       i = KMP_PLACE_ALL;
4711       mask = __kmp_affin_fullMask;
4712     } else {
4713       // int i = some hash function or just a counter that doesn't
4714       // always start at 0.  Use gtid for now.
4715       KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
4716       i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4717       mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4718     }
4719   }
4720 #endif
4721 
4722 #if OMP_40_ENABLED
4723   th->th.th_current_place = i;
4724   if (isa_root) {
4725     th->th.th_new_place = i;
4726     th->th.th_first_place = 0;
4727     th->th.th_last_place = __kmp_affinity_num_masks - 1;
4728   }
4729 
4730   if (i == KMP_PLACE_ALL) {
4731     KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4732                    gtid));
4733   } else {
4734     KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4735                    gtid, i));
4736   }
4737 #else
4738   if (i == -1) {
4739     KA_TRACE(
4740         100,
4741         ("__kmp_affinity_set_init_mask: binding T#%d to __kmp_affin_fullMask\n",
4742          gtid));
4743   } else {
4744     KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
4745                    gtid, i));
4746   }
4747 #endif /* OMP_40_ENABLED */
4748 
4749   KMP_CPU_COPY(th->th.th_affin_mask, mask);
4750 
4751   if (__kmp_affinity_verbose
4752       /* to avoid duplicate printing (will be correctly printed on barrier) */
4753       && (__kmp_affinity_type == affinity_none || i != KMP_PLACE_ALL)) {
4754     char buf[KMP_AFFIN_MASK_PRINT_LEN];
4755     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4756                               th->th.th_affin_mask);
4757     KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4758                __kmp_gettid(), gtid, buf);
4759   }
4760 
4761 #if KMP_OS_WINDOWS
4762   // On Windows* OS, the process affinity mask might have changed. If the user
4763   // didn't request affinity and this call fails, just continue silently.
4764   // See CQ171393.
4765   if (__kmp_affinity_type == affinity_none) {
4766     __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4767   } else
4768 #endif
4769     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4770 }
4771 
4772 #if OMP_40_ENABLED
4773 
4774 void __kmp_affinity_set_place(int gtid) {
4777   if (!KMP_AFFINITY_CAPABLE()) {
4778     return;
4779   }
4780 
4781   kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4782 
4783   KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current "
4784                  "place = %d)\n",
4785                  gtid, th->th.th_new_place, th->th.th_current_place));
4786 
4787   // Check that the new place is within this thread's partition.
4788   KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4789   KMP_ASSERT(th->th.th_new_place >= 0);
4790   KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4791   if (th->th.th_first_place <= th->th.th_last_place) {
4792     KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
4793                (th->th.th_new_place <= th->th.th_last_place));
4794   } else {
4795     KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
4796                (th->th.th_new_place >= th->th.th_last_place));
4797   }
4798 
4799   // Copy the thread mask to the kmp_info_t structure,
4800   // and set this thread's affinity.
4801   kmp_affin_mask_t *mask =
4802       KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
4803   KMP_CPU_COPY(th->th.th_affin_mask, mask);
4804   th->th.th_current_place = th->th.th_new_place;
4805 
4806   if (__kmp_affinity_verbose) {
4807     char buf[KMP_AFFIN_MASK_PRINT_LEN];
4808     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4809                               th->th.th_affin_mask);
4810     KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4811                __kmp_gettid(), gtid, buf);
4812   }
4813   __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4814 }
4815 
4816 #endif /* OMP_40_ENABLED */
4817 
4818 int __kmp_aux_set_affinity(void **mask) {
4819   int gtid;
4820   kmp_info_t *th;
4821   int retval;
4822 
4823   if (!KMP_AFFINITY_CAPABLE()) {
4824     return -1;
4825   }
4826 
4827   gtid = __kmp_entry_gtid();
4828   KA_TRACE(1000, ; {
4829     char buf[KMP_AFFIN_MASK_PRINT_LEN];
4830     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4831                               (kmp_affin_mask_t *)(*mask));
4832     __kmp_debug_printf(
4833         "kmp_set_affinity: setting affinity mask for thread %d = %s\n", gtid,
4834         buf);
4835   });
4836 
4837   if (__kmp_env_consistency_check) {
4838     if ((mask == NULL) || (*mask == NULL)) {
4839       KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4840     } else {
4841       unsigned proc;
4842       int num_procs = 0;
4843 
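      // Every proc set in the requested mask must also be present in the full
      // mask, and the mask must contain at least one proc.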
4844       KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
4845         if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4846           KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4847         }
4848         if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4849           continue;
4850         }
4851         num_procs++;
4852       }
4853       if (num_procs == 0) {
4854         KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4855       }
4856 
4857 #if KMP_GROUP_AFFINITY
4858       if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4859         KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4860       }
4861 #endif /* KMP_GROUP_AFFINITY */
4862     }
4863   }
4864 
4865   th = __kmp_threads[gtid];
4866   KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4867   retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4868   if (retval == 0) {
4869     KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4870   }
4871 
4872 #if OMP_40_ENABLED
4873   th->th.th_current_place = KMP_PLACE_UNDEFINED;
4874   th->th.th_new_place = KMP_PLACE_UNDEFINED;
4875   th->th.th_first_place = 0;
4876   th->th.th_last_place = __kmp_affinity_num_masks - 1;
4877 
4878   // Turn off 4.0 affinity for the current thread at this parallel level.
4879   th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
4880 #endif
4881 
4882   return retval;
4883 }
4884 
4885 int __kmp_aux_get_affinity(void **mask) {
4886   int gtid;
4887   int retval;
4888   kmp_info_t *th;
4889 
4890   if (!KMP_AFFINITY_CAPABLE()) {
4891     return -1;
4892   }
4893 
4894   gtid = __kmp_entry_gtid();
4895   th = __kmp_threads[gtid];
4896   KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4897 
4898   KA_TRACE(1000, ; {
4899     char buf[KMP_AFFIN_MASK_PRINT_LEN];
4900     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4901                               th->th.th_affin_mask);
4902     __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n",
4903                  gtid, buf);
4904   });
4905 
4906   if (__kmp_env_consistency_check) {
4907     if ((mask == NULL) || (*mask == NULL)) {
4908       KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4909     }
4910   }
4911 
4912 #if !KMP_OS_WINDOWS
4913 
4914   retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4915   KA_TRACE(1000, ; {
4916     char buf[KMP_AFFIN_MASK_PRINT_LEN];
4917     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4918                               (kmp_affin_mask_t *)(*mask));
4919     __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n",
4920                  gtid, buf);
4921   });
4922   return retval;
4923 
4924 #else
4925 
4926   KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4927   return 0;
4928 
4929 #endif /* KMP_OS_WINDOWS */
4930 }
4931 
4932 int __kmp_aux_get_affinity_max_proc() {
4933   if (!KMP_AFFINITY_CAPABLE()) {
4934     return 0;
4935   }
4936 #if KMP_GROUP_AFFINITY
4937   if (__kmp_num_proc_groups > 1) {
4938     return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT);
4939   }
4940 #endif
4941   return __kmp_xproc;
4942 }
4943 
4944 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
4947   if (!KMP_AFFINITY_CAPABLE()) {
4948     return -1;
4949   }
4950 
4951   KA_TRACE(1000, ; {
4952     int gtid = __kmp_entry_gtid();
4953     char buf[KMP_AFFIN_MASK_PRINT_LEN];
4954     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4955                               (kmp_affin_mask_t *)(*mask));
4956     __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
4957                        "affinity mask for thread %d = %s\n",
4958                        proc, gtid, buf);
4959   });
4960 
4961   if (__kmp_env_consistency_check) {
4962     if ((mask == NULL) || (*mask == NULL)) {
4963       KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4964     }
4965   }
4966 
4967   if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
4968     return -1;
4969   }
4970   if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4971     return -2;
4972   }
4973 
4974   KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4975   return 0;
4976 }
4977 
4978 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
4981   if (!KMP_AFFINITY_CAPABLE()) {
4982     return -1;
4983   }
4984 
4985   KA_TRACE(1000, ; {
4986     int gtid = __kmp_entry_gtid();
4987     char buf[KMP_AFFIN_MASK_PRINT_LEN];
4988     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4989                               (kmp_affin_mask_t *)(*mask));
4990     __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
4991                        "affinity mask for thread %d = %s\n",
4992                        proc, gtid, buf);
4993   });
4994 
4995   if (__kmp_env_consistency_check) {
4996     if ((mask == NULL) || (*mask == NULL)) {
4997       KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4998     }
4999   }
5000 
5001   if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5002     return -1;
5003   }
5004   if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5005     return -2;
5006   }
5007 
5008   KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
5009   return 0;
5010 }
5011 
5012 int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
5015   if (!KMP_AFFINITY_CAPABLE()) {
5016     return -1;
5017   }
5018 
5019   KA_TRACE(1000, ; {
5020     int gtid = __kmp_entry_gtid();
5021     char buf[KMP_AFFIN_MASK_PRINT_LEN];
5022     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5023                               (kmp_affin_mask_t *)(*mask));
5024     __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
5025                        "affinity mask for thread %d = %s\n",
5026                        proc, gtid, buf);
5027   });
5028 
5029   if (__kmp_env_consistency_check) {
5030     if ((mask == NULL) || (*mask == NULL)) {
5031       KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
5032     }
5033   }
5034 
5035   if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5036     return -1;
5037   }
5038   if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5039     return 0;
5040   }
5041 
5042   return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
5043 }
5044 
5045 // Dynamic affinity settings - Affinity balanced
5046 void __kmp_balanced_affinity(int tid, int nthreads) {
5047   bool fine_gran = true;
5048 
5049   switch (__kmp_affinity_gran) {
5050   case affinity_gran_fine:
5051   case affinity_gran_thread:
5052     break;
5053   case affinity_gran_core:
5054     if (__kmp_nThreadsPerCore > 1) {
5055       fine_gran = false;
5056     }
5057     break;
5058   case affinity_gran_package:
5059     if (nCoresPerPkg > 1) {
5060       fine_gran = false;
5061     }
5062     break;
5063   default:
5064     fine_gran = false;
5065   }
5066 
5067   if (__kmp_affinity_uniform_topology()) {
5068     int coreID;
5069     int threadID;
5070     // Number of hyper-threads per core on an HT machine
5071     int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
5072     // Number of cores
5073     int ncores = __kmp_ncores;
5074     if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) {
5075       __kmp_nth_per_core = __kmp_avail_proc / nPackages;
5076       ncores = nPackages;
5077     }
5078     // How many threads will be bound to each core
5079     int chunk = nthreads / ncores;
5080     // How many cores will have an additional thread bound to them - the "big cores"
5081     int big_cores = nthreads % ncores;
5082     // Number of threads on the big cores
5083     int big_nth = (chunk + 1) * big_cores;
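    // Example: nthreads = 10 on 4 cores gives chunk = 2, big_cores = 2,
    // big_nth = 6; tids 0-5 land on cores 0-1 (3 threads each) and tids 6-9
    // on cores 2-3 (2 threads each).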
5084     if (tid < big_nth) {
5085       coreID = tid / (chunk + 1);
5086       threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
5087     } else { // tid >= big_nth
5088       coreID = (tid - big_cores) / chunk;
5089       threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
5090     }
5091 
5092     KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
5093                       "Illegal set affinity operation when not capable");
5094 
5095     kmp_affin_mask_t *mask;
5096     KMP_CPU_ALLOC_ON_STACK(mask);
5097     KMP_CPU_ZERO(mask);
5098 
5099     if (fine_gran) {
5100       int osID = address2os[coreID * __kmp_nth_per_core + threadID].second;
5101       KMP_CPU_SET(osID, mask);
5102     } else {
5103       for (int i = 0; i < __kmp_nth_per_core; i++) {
5104         int osID;
5105         osID = address2os[coreID * __kmp_nth_per_core + i].second;
5106         KMP_CPU_SET(osID, mask);
5107       }
5108     }
5109     if (__kmp_affinity_verbose) {
5110       char buf[KMP_AFFIN_MASK_PRINT_LEN];
5111       __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
5112       KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
5113                  __kmp_gettid(), tid, buf);
5114     }
5115     __kmp_set_system_affinity(mask, TRUE);
5116     KMP_CPU_FREE_FROM_STACK(mask);
5117   } else { // Non-uniform topology
5118 
5119     kmp_affin_mask_t *mask;
5120     KMP_CPU_ALLOC_ON_STACK(mask);
5121     KMP_CPU_ZERO(mask);
5122 
5123     int core_level = __kmp_affinity_find_core_level(
5124         address2os, __kmp_avail_proc, __kmp_aff_depth - 1);
5125     int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc,
5126                                                __kmp_aff_depth - 1, core_level);
5127     int nth_per_core = __kmp_affinity_max_proc_per_core(
5128         address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
5129 
5130     // For a performance gain, handle the special case nthreads ==
5131     // __kmp_avail_proc separately.
5132     if (nthreads == __kmp_avail_proc) {
5133       if (fine_gran) {
5134         int osID = address2os[tid].second;
5135         KMP_CPU_SET(osID, mask);
5136       } else {
5137         int core = __kmp_affinity_find_core(address2os, tid,
5138                                             __kmp_aff_depth - 1, core_level);
5139         for (int i = 0; i < __kmp_avail_proc; i++) {
5140           int osID = address2os[i].second;
5141           if (__kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1,
5142                                        core_level) == core) {
5143             KMP_CPU_SET(osID, mask);
5144           }
5145         }
5146       }
5147     } else if (nthreads <= ncores) {
5148 
5149       int core = 0;
5150       for (int i = 0; i < ncores; i++) {
5151         // Check whether this core has any available procs in procarr[]
5152         int in_mask = 0;
5153         for (int j = 0; j < nth_per_core; j++) {
5154           if (procarr[i * nth_per_core + j] != -1) {
5155             in_mask = 1;
5156             break;
5157           }
5158         }
5159         if (in_mask) {
5160           if (tid == core) {
5161             for (int j = 0; j < nth_per_core; j++) {
5162               int osID = procarr[i * nth_per_core + j];
5163               if (osID != -1) {
5164                 KMP_CPU_SET(osID, mask);
5165                 // For fine granularity it is enough to set the first available
5166                 // osID for this core
5167                 if (fine_gran) {
5168                   break;
5169                 }
5170               }
5171             }
5172             break;
5173           } else {
5174             core++;
5175           }
5176         }
5177       }
5178     } else { // nthreads > ncores
5179       // Array to save the number of processors at each core
5180       int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
5181       // Array to save the number of cores with "x" available processors
5182       int *ncores_with_x_procs =
5183           (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
5184       // Array to save the number of cores with at least "x" available processors
5185       int *ncores_with_x_to_max_procs =
5186           (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
5187 
5188       for (int i = 0; i <= nth_per_core; i++) {
5189         ncores_with_x_procs[i] = 0;
5190         ncores_with_x_to_max_procs[i] = 0;
5191       }
5192 
5193       for (int i = 0; i < ncores; i++) {
5194         int cnt = 0;
5195         for (int j = 0; j < nth_per_core; j++) {
5196           if (procarr[i * nth_per_core + j] != -1) {
5197             cnt++;
5198           }
5199         }
5200         nproc_at_core[i] = cnt;
5201         ncores_with_x_procs[cnt]++;
5202       }
5203 
5204       for (int i = 0; i <= nth_per_core; i++) {
5205         for (int j = i; j <= nth_per_core; j++) {
5206           ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
5207         }
5208       }
5209 
5210       // Max number of processors
5211       int nproc = nth_per_core * ncores;
5212       // An array to keep the number of threads assigned to each context
5213       int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
5214       for (int i = 0; i < nproc; i++) {
5215         newarr[i] = 0;
5216       }
5217 
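      // Distribute the threads over the available hardware contexts: in the
      // first sweep (flag == 0) each context gets at most one thread, round j
      // giving one more thread to every core that has at least j contexts;
      // once flag is set, extra threads are stacked onto contexts that
      // already have one.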
5218       int nth = nthreads;
5219       int flag = 0;
5220       while (nth > 0) {
5221         for (int j = 1; j <= nth_per_core; j++) {
5222           int cnt = ncores_with_x_to_max_procs[j];
5223           for (int i = 0; i < ncores; i++) {
5224             // Skip cores that have no available processors
5225             if (nproc_at_core[i] == 0) {
5226               continue;
5227             }
5228             for (int k = 0; k < nth_per_core; k++) {
5229               if (procarr[i * nth_per_core + k] != -1) {
5230                 if (newarr[i * nth_per_core + k] == 0) {
5231                   newarr[i * nth_per_core + k] = 1;
5232                   cnt--;
5233                   nth--;
5234                   break;
5235                 } else {
5236                   if (flag != 0) {
5237                     newarr[i * nth_per_core + k]++;
5238                     cnt--;
5239                     nth--;
5240                     break;
5241                   }
5242                 }
5243               }
5244             }
5245             if (cnt == 0 || nth == 0) {
5246               break;
5247             }
5248           }
5249           if (nth == 0) {
5250             break;
5251           }
5252         }
5253         flag = 1;
5254       }
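      // Find this thread's slot: walk newarr[] accumulating the assigned
      // thread counts; the context where the running sum first exceeds tid is
      // the one this thread binds to (the whole core for coarse granularity).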
5255       int sum = 0;
5256       for (int i = 0; i < nproc; i++) {
5257         sum += newarr[i];
5258         if (sum > tid) {
5259           if (fine_gran) {
5260             int osID = procarr[i];
5261             KMP_CPU_SET(osID, mask);
5262           } else {
5263             int coreID = i / nth_per_core;
5264             for (int ii = 0; ii < nth_per_core; ii++) {
5265               int osID = procarr[coreID * nth_per_core + ii];
5266               if (osID != -1) {
5267                 KMP_CPU_SET(osID, mask);
5268               }
5269             }
5270           }
5271           break;
5272         }
5273       }
5274       __kmp_free(newarr);
5275     }
5276 
5277     if (__kmp_affinity_verbose) {
5278       char buf[KMP_AFFIN_MASK_PRINT_LEN];
5279       __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
5280       KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
5281                  __kmp_gettid(), tid, buf);
5282     }
5283     __kmp_set_system_affinity(mask, TRUE);
5284     KMP_CPU_FREE_FROM_STACK(mask);
5285   }
5286 }
5287 
5288 #if KMP_OS_LINUX
5289 // We don't need this entry for Windows because
5290 // there is the GetProcessAffinityMask() API.
5291 //
5292 // The intended usage is indicated by these steps:
5293 // 1) The user gets the current affinity mask
5294 // 2) Then sets the affinity by calling this function
5295 // 3) Error check the return value
5296 // 4) Use non-OpenMP parallelization
5297 // 5) Reset the affinity to what was stored in step 1)
5298 #ifdef __cplusplus
5299 extern "C"
5300 #endif
5301     int
5302     kmp_set_thread_affinity_mask_initial()
5303 // the function returns 0 on success,
5304 //   -1 if we cannot bind thread
5305 //   >0 (errno) if an error happened during binding
5306 {
5307   int gtid = __kmp_get_gtid();
5308   if (gtid < 0) {
5309     // Do not touch non-omp threads
5310     KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5311                   "non-omp thread, returning\n"));
5312     return -1;
5313   }
5314   if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
5315     KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5316                   "affinity not initialized, returning\n"));
5317     return -1;
5318   }
5319   KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5320                 "set full mask for thread %d\n",
5321                 gtid));
5322   KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
5323   return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
5324 }
5325 #endif
5326 
5327 #endif // KMP_AFFINITY_SUPPORTED
5328