1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_i18n.h"
16 #include "kmp_io.h"
17 #include "kmp_str.h"
18 #include "kmp_wrapper_getpid.h"
19 #if KMP_USE_HIER_SCHED
20 #include "kmp_dispatch_hier.h"
21 #endif
22 #if KMP_USE_HWLOC
23 // Copied from hwloc
24 #define HWLOC_GROUP_KIND_INTEL_MODULE 102
25 #define HWLOC_GROUP_KIND_INTEL_TILE 103
26 #define HWLOC_GROUP_KIND_INTEL_DIE 104
27 #define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220
28 #endif
29 
30 // The machine topology
31 kmp_topology_t *__kmp_topology = nullptr;
32 // KMP_HW_SUBSET environment variable
33 kmp_hw_subset_t *__kmp_hw_subset = nullptr;
34 
35 // Store the real or imagined machine hierarchy here
36 static hierarchy_info machine_hierarchy;
37 
// Tear down the cached machine hierarchy (used by hierarchical barriers).
void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }
39 
// Populate a thread's barrier state (thr_bar) from the machine hierarchy,
// lazily initializing (and growing) the hierarchy for nproc threads.
void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
  kmp_uint32 depth;
  // The test below is true if affinity is available, but set to "none". Need to
  // init on first use of hierarchical barrier.
  if (TCR_1(machine_hierarchy.uninitialized))
    machine_hierarchy.init(nproc);

  // Adjust the hierarchy in case num threads exceeds original
  if (nproc > machine_hierarchy.base_num_threads)
    machine_hierarchy.resize(nproc);

  depth = machine_hierarchy.depth;
  KMP_DEBUG_ASSERT(depth > 0);

  thr_bar->depth = depth;
  // Leaf fan-out is the level-0 branching factor minus the thread itself.
  __kmp_type_convert(machine_hierarchy.numPerLevel[0] - 1,
                     &(thr_bar->base_leaf_kids));
  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}
59 
// Derived topology summary values set by kmp_topology_t::_set_globals():
// cores per package, package count, and hardware threads per core.
static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
// Total detected core count (only defined here when not provided elsewhere).
static int __kmp_ncores;
#endif
65 
// Return the localized (message-catalog) display name for a topology layer
// type, in singular or plural form. Falls back to "Unknown" for types not
// handled by the switch.
const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {
  switch (type) {
  case KMP_HW_SOCKET:
    return ((plural) ? KMP_I18N_STR(Sockets) : KMP_I18N_STR(Socket));
  case KMP_HW_DIE:
    return ((plural) ? KMP_I18N_STR(Dice) : KMP_I18N_STR(Die));
  case KMP_HW_MODULE:
    return ((plural) ? KMP_I18N_STR(Modules) : KMP_I18N_STR(Module));
  case KMP_HW_TILE:
    return ((plural) ? KMP_I18N_STR(Tiles) : KMP_I18N_STR(Tile));
  case KMP_HW_NUMA:
    return ((plural) ? KMP_I18N_STR(NumaDomains) : KMP_I18N_STR(NumaDomain));
  case KMP_HW_L3:
    return ((plural) ? KMP_I18N_STR(L3Caches) : KMP_I18N_STR(L3Cache));
  case KMP_HW_L2:
    return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache));
  case KMP_HW_L1:
    return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache));
  case KMP_HW_LLC:
    return ((plural) ? KMP_I18N_STR(LLCaches) : KMP_I18N_STR(LLCache));
  case KMP_HW_CORE:
    return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core));
  case KMP_HW_THREAD:
    return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread));
  case KMP_HW_PROC_GROUP:
    return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup));
  }
  return KMP_I18N_STR(Unknown);
}
95 
96 const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) {
97   switch (type) {
98   case KMP_HW_SOCKET:
99     return ((plural) ? "sockets" : "socket");
100   case KMP_HW_DIE:
101     return ((plural) ? "dice" : "die");
102   case KMP_HW_MODULE:
103     return ((plural) ? "modules" : "module");
104   case KMP_HW_TILE:
105     return ((plural) ? "tiles" : "tile");
106   case KMP_HW_NUMA:
107     return ((plural) ? "numa_domains" : "numa_domain");
108   case KMP_HW_L3:
109     return ((plural) ? "l3_caches" : "l3_cache");
110   case KMP_HW_L2:
111     return ((plural) ? "l2_caches" : "l2_cache");
112   case KMP_HW_L1:
113     return ((plural) ? "l1_caches" : "l1_cache");
114   case KMP_HW_LLC:
115     return ((plural) ? "ll_caches" : "ll_cache");
116   case KMP_HW_CORE:
117     return ((plural) ? "cores" : "core");
118   case KMP_HW_THREAD:
119     return ((plural) ? "threads" : "thread");
120   case KMP_HW_PROC_GROUP:
121     return ((plural) ? "proc_groups" : "proc_group");
122   }
123   return ((plural) ? "unknowns" : "unknown");
124 }
125 
126 const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) {
127   switch (type) {
128   case KMP_HW_CORE_TYPE_UNKNOWN:
129     return "unknown";
130 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
131   case KMP_HW_CORE_TYPE_ATOM:
132     return "Intel Atom(R) processor";
133   case KMP_HW_CORE_TYPE_CORE:
134     return "Intel(R) Core(TM) processor";
135 #endif
136   }
137   return "unknown";
138 }
139 
140 ////////////////////////////////////////////////////////////////////////////////
141 // kmp_hw_thread_t methods
142 int kmp_hw_thread_t::compare_ids(const void *a, const void *b) {
143   const kmp_hw_thread_t *ahwthread = (const kmp_hw_thread_t *)a;
144   const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b;
145   int depth = __kmp_topology->get_depth();
146   for (int level = 0; level < depth; ++level) {
147     if (ahwthread->ids[level] < bhwthread->ids[level])
148       return -1;
149     else if (ahwthread->ids[level] > bhwthread->ids[level])
150       return 1;
151   }
152   if (ahwthread->os_id < bhwthread->os_id)
153     return -1;
154   else if (ahwthread->os_id > bhwthread->os_id)
155     return 1;
156   return 0;
157 }
158 
159 #if KMP_AFFINITY_SUPPORTED
// qsort comparator for "compact" affinity ordering: compare the innermost
// __kmp_affinity_compact levels of sub_ids first (deepest level first), then
// the remaining outer levels (outermost first).
int kmp_hw_thread_t::compare_compact(const void *a, const void *b) {
  int i;
  const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a;
  const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b;
  int depth = __kmp_topology->get_depth();
  KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
  KMP_DEBUG_ASSERT(__kmp_affinity_compact <= depth);
  // Bottom __kmp_affinity_compact levels, innermost first.
  for (i = 0; i < __kmp_affinity_compact; i++) {
    int j = depth - i - 1;
    if (aa->sub_ids[j] < bb->sub_ids[j])
      return -1;
    if (aa->sub_ids[j] > bb->sub_ids[j])
      return 1;
  }
  // Remaining upper levels, outermost first.
  for (; i < depth; i++) {
    int j = i - __kmp_affinity_compact;
    if (aa->sub_ids[j] < bb->sub_ids[j])
      return -1;
    if (aa->sub_ids[j] > bb->sub_ids[j])
      return 1;
  }
  return 0;
}
183 #endif
184 
185 void kmp_hw_thread_t::print() const {
186   int depth = __kmp_topology->get_depth();
187   printf("%4d ", os_id);
188   for (int i = 0; i < depth; ++i) {
189     printf("%4d ", ids[i]);
190   }
191   if (core_type != KMP_HW_CORE_TYPE_UNKNOWN) {
192     printf(" (%s)", __kmp_hw_get_core_type_string(core_type));
193   }
194   printf("\n");
195 }
196 
197 ////////////////////////////////////////////////////////////////////////////////
198 // kmp_topology_t methods
199 
200 // Remove layers that don't add information to the topology.
201 // This is done by having the layer take on the id = UNKNOWN_ID (-1)
// Remove layers that don't add information to the topology.
// This is done by having the layer take on the id = UNKNOWN_ID (-1)
void kmp_topology_t::_remove_radix1_layers() {
  int preference[KMP_HW_LAST];
  int top_index1, top_index2;
  // Set up preference associative array
  // Higher value = more preferred to KEEP when two adjacent layers turn out
  // to be radix-1 (exactly one child per parent) with respect to each other.
  preference[KMP_HW_PROC_GROUP] = 110;
  preference[KMP_HW_SOCKET] = 100;
  preference[KMP_HW_CORE] = 95;
  preference[KMP_HW_THREAD] = 90;
  preference[KMP_HW_NUMA] = 85;
  preference[KMP_HW_DIE] = 80;
  preference[KMP_HW_TILE] = 75;
  preference[KMP_HW_MODULE] = 73;
  preference[KMP_HW_L3] = 70;
  preference[KMP_HW_L2] = 65;
  preference[KMP_HW_L1] = 60;
  preference[KMP_HW_LLC] = 5;
  // Slide a window of two adjacent layers down the topology.
  top_index1 = 0;
  top_index2 = 1;
  while (top_index1 < depth - 1 && top_index2 < depth) {
    kmp_hw_t type1 = types[top_index1];
    kmp_hw_t type2 = types[top_index2];
    KMP_ASSERT_VALID_HW_TYPE(type1);
    KMP_ASSERT_VALID_HW_TYPE(type2);
    // Do not allow the three main topology levels (sockets, cores, threads) to
    // be compacted down
    if ((type1 == KMP_HW_THREAD || type1 == KMP_HW_CORE ||
         type1 == KMP_HW_SOCKET) &&
        (type2 == KMP_HW_THREAD || type2 == KMP_HW_CORE ||
         type2 == KMP_HW_SOCKET)) {
      top_index1 = top_index2++;
      continue;
    }
    // Scan all hw threads: the pair is radix-1 if the deeper id never changes
    // while the shallower id stays the same.
    bool radix1 = true;
    bool all_same = true;
    int id1 = hw_threads[0].ids[top_index1];
    int id2 = hw_threads[0].ids[top_index2];
    int pref1 = preference[type1];
    int pref2 = preference[type2];
    for (int hwidx = 1; hwidx < num_hw_threads; ++hwidx) {
      if (hw_threads[hwidx].ids[top_index1] == id1 &&
          hw_threads[hwidx].ids[top_index2] != id2) {
        radix1 = false;
        break;
      }
      if (hw_threads[hwidx].ids[top_index2] != id2)
        all_same = false;
      id1 = hw_threads[hwidx].ids[top_index1];
      id2 = hw_threads[hwidx].ids[top_index2];
    }
    if (radix1) {
      // Select the layer to remove based on preference
      kmp_hw_t remove_type, keep_type;
      int remove_layer, remove_layer_ids;
      if (pref1 > pref2) {
        remove_type = type2;
        remove_layer = remove_layer_ids = top_index2;
        keep_type = type1;
      } else {
        remove_type = type1;
        remove_layer = remove_layer_ids = top_index1;
        keep_type = type2;
      }
      // If all the indexes for the second (deeper) layer are the same.
      // e.g., all are zero, then make sure to keep the first layer's ids
      if (all_same)
        remove_layer_ids = top_index2;
      // Remove radix one type by setting the equivalence, removing the id from
      // the hw threads and removing the layer from types and depth
      set_equivalent_type(remove_type, keep_type);
      for (int idx = 0; idx < num_hw_threads; ++idx) {
        kmp_hw_thread_t &hw_thread = hw_threads[idx];
        for (int d = remove_layer_ids; d < depth - 1; ++d)
          hw_thread.ids[d] = hw_thread.ids[d + 1];
      }
      for (int idx = remove_layer; idx < depth - 1; ++idx)
        types[idx] = types[idx + 1];
      depth--;
      // NOTE: after removal, re-test the same window position (indices now
      // refer to the shifted layers), so do not advance here.
    } else {
      top_index1 = top_index2++;
    }
  }
  KMP_ASSERT(depth > 0);
}
285 
// Decide which detected layer acts as the last-level cache (LLC):
// prefer L3, then L2, then (on KNL/mic3) L2/Tile/L1, then L1, finally
// falling back to socket or core. Asserts that something was chosen.
void kmp_topology_t::_set_last_level_cache() {
  if (get_equivalent_type(KMP_HW_L3) != KMP_HW_UNKNOWN)
    set_equivalent_type(KMP_HW_LLC, KMP_HW_L3);
  else if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
    set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
#if KMP_MIC_SUPPORTED
  else if (__kmp_mic_type == mic3) {
    if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
    else if (get_equivalent_type(KMP_HW_TILE) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_TILE);
    // L2/Tile wasn't detected so just say L1
    else
      set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
  }
#endif
  else if (get_equivalent_type(KMP_HW_L1) != KMP_HW_UNKNOWN)
    set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
  // Fallback is to set last level cache to socket or core
  if (get_equivalent_type(KMP_HW_LLC) == KMP_HW_UNKNOWN) {
    if (get_equivalent_type(KMP_HW_SOCKET) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_SOCKET);
    else if (get_equivalent_type(KMP_HW_CORE) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_CORE);
  }
  KMP_ASSERT(get_equivalent_type(KMP_HW_LLC) != KMP_HW_UNKNOWN);
}
313 
314 // Gather the count of each topology layer and the ratio
315 void kmp_topology_t::_gather_enumeration_information() {
316   int previous_id[KMP_HW_LAST];
317   int max[KMP_HW_LAST];
318   int previous_core_id = kmp_hw_thread_t::UNKNOWN_ID;
319 
320   for (int i = 0; i < depth; ++i) {
321     previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
322     max[i] = 0;
323     count[i] = 0;
324     ratio[i] = 0;
325   }
326   if (__kmp_is_hybrid_cpu()) {
327     for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) {
328       core_types_count[i] = 0;
329       core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN;
330     }
331   }
332   for (int i = 0; i < num_hw_threads; ++i) {
333     kmp_hw_thread_t &hw_thread = hw_threads[i];
334     for (int layer = 0; layer < depth; ++layer) {
335       int id = hw_thread.ids[layer];
336       if (id != previous_id[layer]) {
337         // Add an additional increment to each count
338         for (int l = layer; l < depth; ++l)
339           count[l]++;
340         // Keep track of topology layer ratio statistics
341         max[layer]++;
342         for (int l = layer + 1; l < depth; ++l) {
343           if (max[l] > ratio[l])
344             ratio[l] = max[l];
345           max[l] = 1;
346         }
347         break;
348       }
349     }
350     for (int layer = 0; layer < depth; ++layer) {
351       previous_id[layer] = hw_thread.ids[layer];
352     }
353     // Figure out the number of each core type for hybrid CPUs
354     if (__kmp_is_hybrid_cpu()) {
355       int core_level = get_level(KMP_HW_CORE);
356       if (core_level != -1) {
357         if (hw_thread.ids[core_level] != previous_core_id)
358           _increment_core_type(hw_thread.core_type);
359         previous_core_id = hw_thread.ids[core_level];
360       }
361     }
362   }
363   for (int layer = 0; layer < depth; ++layer) {
364     if (max[layer] > ratio[layer])
365       ratio[layer] = max[layer];
366   }
367 }
368 
369 // Find out if the topology is uniform
370 void kmp_topology_t::_discover_uniformity() {
371   int num = 1;
372   for (int level = 0; level < depth; ++level)
373     num *= ratio[level];
374   flags.uniform = (num == count[depth - 1]);
375 }
376 
377 // Set all the sub_ids for each hardware thread
378 void kmp_topology_t::_set_sub_ids() {
379   int previous_id[KMP_HW_LAST];
380   int sub_id[KMP_HW_LAST];
381 
382   for (int i = 0; i < depth; ++i) {
383     previous_id[i] = -1;
384     sub_id[i] = -1;
385   }
386   for (int i = 0; i < num_hw_threads; ++i) {
387     kmp_hw_thread_t &hw_thread = hw_threads[i];
388     // Setup the sub_id
389     for (int j = 0; j < depth; ++j) {
390       if (hw_thread.ids[j] != previous_id[j]) {
391         sub_id[j]++;
392         for (int k = j + 1; k < depth; ++k) {
393           sub_id[k] = 0;
394         }
395         break;
396       }
397     }
398     // Set previous_id
399     for (int j = 0; j < depth; ++j) {
400       previous_id[j] = hw_thread.ids[j];
401     }
402     // Set the sub_ids field
403     for (int j = 0; j < depth; ++j) {
404       hw_thread.sub_ids[j] = sub_id[j];
405     }
406   }
407 }
408 
void kmp_topology_t::_set_globals() {
  // Set nCoresPerPkg, nPackages, __kmp_nThreadsPerCore, __kmp_ncores
  int core_level, thread_level, package_level;
  package_level = get_level(KMP_HW_SOCKET);
#if KMP_GROUP_AFFINITY
  // With Windows processor groups, a proc group can stand in for a socket.
  if (package_level == -1)
    package_level = get_level(KMP_HW_PROC_GROUP);
#endif
  core_level = get_level(KMP_HW_CORE);
  thread_level = get_level(KMP_HW_THREAD);

  // Core and thread levels must exist after canonicalization.
  KMP_ASSERT(core_level != -1);
  KMP_ASSERT(thread_level != -1);

  __kmp_nThreadsPerCore = calculate_ratio(thread_level, core_level);
  if (package_level != -1) {
    nCoresPerPkg = calculate_ratio(core_level, package_level);
    nPackages = get_count(package_level);
  } else {
    // assume one socket
    nCoresPerPkg = get_count(core_level);
    nPackages = 1;
  }
#ifndef KMP_DFLT_NTH_CORES
  __kmp_ncores = get_count(core_level);
#endif
}
436 
// Allocate a topology object for nproc hardware threads and ndepth layers.
// The object, its hw_threads array, and its three int arrays (types, ratio,
// count) all live in ONE allocation; deallocate() frees it with one call.
kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth,
                                         const kmp_hw_t *types) {
  kmp_topology_t *retval;
  // Allocate all data in one large allocation
  size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc +
                sizeof(int) * ndepth * 3;
  char *bytes = (char *)__kmp_allocate(size);
  retval = (kmp_topology_t *)bytes;
  if (nproc > 0) {
    retval->hw_threads = (kmp_hw_thread_t *)(bytes + sizeof(kmp_topology_t));
  } else {
    retval->hw_threads = nullptr;
  }
  retval->num_hw_threads = nproc;
  retval->depth = ndepth;
  // The three int arrays are laid out back-to-back after the hw threads.
  int *arr =
      (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc);
  retval->types = (kmp_hw_t *)arr;
  retval->ratio = arr + ndepth;
  retval->count = arr + 2 * ndepth;
  // Start with no known equivalences, then make each given type equivalent
  // to itself.
  KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
  for (int i = 0; i < ndepth; ++i) {
    retval->types[i] = types[i];
    retval->equivalent[types[i]] = types[i];
  }
  return retval;
}
464 
465 void kmp_topology_t::deallocate(kmp_topology_t *topology) {
466   if (topology)
467     __kmp_free(topology);
468 }
469 
470 bool kmp_topology_t::check_ids() const {
471   // Assume ids have been sorted
472   if (num_hw_threads == 0)
473     return true;
474   for (int i = 1; i < num_hw_threads; ++i) {
475     kmp_hw_thread_t &current_thread = hw_threads[i];
476     kmp_hw_thread_t &previous_thread = hw_threads[i - 1];
477     bool unique = false;
478     for (int j = 0; j < depth; ++j) {
479       if (previous_thread.ids[j] != current_thread.ids[j]) {
480         unique = true;
481         break;
482       }
483     }
484     if (unique)
485       continue;
486     return false;
487   }
488   return true;
489 }
490 
// Debug dump of the full topology (depth, types, ratios, counts, hybrid core
// types, equivalence map, and every hardware thread) to stdout.
void kmp_topology_t::dump() const {
  printf("***********************\n");
  printf("*** __kmp_topology: ***\n");
  printf("***********************\n");
  printf("* depth: %d\n", depth);

  printf("* types: ");
  for (int i = 0; i < depth; ++i)
    printf("%15s ", __kmp_hw_get_keyword(types[i]));
  printf("\n");

  printf("* ratio: ");
  for (int i = 0; i < depth; ++i) {
    printf("%15d ", ratio[i]);
  }
  printf("\n");

  printf("* count: ");
  for (int i = 0; i < depth; ++i) {
    printf("%15d ", count[i]);
  }
  printf("\n");

  // Hybrid core-type tallies; the array is filled front-to-back, so the
  // first UNKNOWN entry marks the end of valid data.
  printf("* core_types:\n");
  for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) {
    if (core_types[i] != KMP_HW_CORE_TYPE_UNKNOWN) {
      printf("    %d %s core%c\n", core_types_count[i],
             __kmp_hw_get_core_type_string(core_types[i]),
             ((core_types_count[i] > 1) ? 's' : ' '));
    } else {
      if (i == 0)
        printf("No hybrid information available\n");
      break;
    }
  }

  printf("* equivalent map:\n");
  KMP_FOREACH_HW_TYPE(i) {
    const char *key = __kmp_hw_get_keyword(i);
    const char *value = __kmp_hw_get_keyword(equivalent[i]);
    printf("%-15s -> %-15s\n", key, value);
  }

  printf("* uniform: %s\n", (is_uniform() ? "Yes" : "No"));

  printf("* num_hw_threads: %d\n", num_hw_threads);
  printf("* hw_threads:\n");
  for (int i = 0; i < num_hw_threads; ++i) {
    hw_threads[i].print();
  }
  printf("***********************\n");
}
543 
// User-facing topology report (KMP_INFORM messages) attributed to env_var:
// thread count, uniformity, equivalences, a "quick topology" summary line
// (e.g., "2 sockets x 4 cores/socket x 2 threads/core"), hybrid core counts,
// and the full OS-proc-to-physical-thread map.
void kmp_topology_t::print(const char *env_var) const {
  kmp_str_buf_t buf;
  int print_types_depth;
  __kmp_str_buf_init(&buf);
  // +2: room to force in both core and thread levels if they were folded.
  kmp_hw_t print_types[KMP_HW_LAST + 2];

  // Num Available Threads
  KMP_INFORM(AvailableOSProc, env_var, num_hw_threads);

  // Uniform or not
  if (is_uniform()) {
    KMP_INFORM(Uniform, env_var);
  } else {
    KMP_INFORM(NonUniform, env_var);
  }

  // Equivalent types
  KMP_FOREACH_HW_TYPE(type) {
    kmp_hw_t eq_type = equivalent[type];
    if (eq_type != KMP_HW_UNKNOWN && eq_type != type) {
      KMP_INFORM(AffEqualTopologyTypes, env_var,
                 __kmp_hw_get_catalog_string(type),
                 __kmp_hw_get_catalog_string(eq_type));
    }
  }

  // Quick topology
  KMP_ASSERT(depth > 0 && depth <= (int)KMP_HW_LAST);
  // Create a print types array that always guarantees printing
  // the core and thread level
  print_types_depth = 0;
  for (int level = 0; level < depth; ++level)
    print_types[print_types_depth++] = types[level];
  if (equivalent[KMP_HW_CORE] != KMP_HW_CORE) {
    // Force in the core level for quick topology
    if (print_types[print_types_depth - 1] == KMP_HW_THREAD) {
      // Force core before thread e.g., 1 socket X 2 threads/socket
      // becomes 1 socket X 1 core/socket X 2 threads/socket
      print_types[print_types_depth - 1] = KMP_HW_CORE;
      print_types[print_types_depth++] = KMP_HW_THREAD;
    } else {
      print_types[print_types_depth++] = KMP_HW_CORE;
    }
  }
  // Always put threads at very end of quick topology
  if (equivalent[KMP_HW_THREAD] != KMP_HW_THREAD)
    print_types[print_types_depth++] = KMP_HW_THREAD;

  __kmp_str_buf_clear(&buf);
  kmp_hw_t numerator_type;
  kmp_hw_t denominator_type = KMP_HW_UNKNOWN;
  int core_level = get_level(KMP_HW_CORE);
  int ncores = get_count(core_level);

  // Build "N typeA x M typeB/typeA x ..." into buf. `level` only advances
  // for types actually present in the detected topology; forced-in types
  // report a ratio of 1.
  for (int plevel = 0, level = 0; plevel < print_types_depth; ++plevel) {
    int c;
    bool plural;
    numerator_type = print_types[plevel];
    KMP_ASSERT_VALID_HW_TYPE(numerator_type);
    if (equivalent[numerator_type] != numerator_type)
      c = 1;
    else
      c = get_ratio(level++);
    plural = (c > 1);
    if (plevel == 0) {
      __kmp_str_buf_print(&buf, "%d %s", c,
                          __kmp_hw_get_catalog_string(numerator_type, plural));
    } else {
      __kmp_str_buf_print(&buf, " x %d %s/%s", c,
                          __kmp_hw_get_catalog_string(numerator_type, plural),
                          __kmp_hw_get_catalog_string(denominator_type));
    }
    denominator_type = numerator_type;
  }
  KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores);

  // Hybrid core-type breakdown (array is packed; first UNKNOWN ends it).
  if (__kmp_is_hybrid_cpu()) {
    for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) {
      if (core_types[i] == KMP_HW_CORE_TYPE_UNKNOWN)
        break;
      KMP_INFORM(TopologyHybrid, env_var, core_types_count[i],
                 __kmp_hw_get_core_type_string(core_types[i]));
    }
  }

  if (num_hw_threads <= 0) {
    __kmp_str_buf_free(&buf);
    return;
  }

  // Full OS proc to hardware thread map
  KMP_INFORM(OSProcToPhysicalThreadMap, env_var);
  for (int i = 0; i < num_hw_threads; i++) {
    __kmp_str_buf_clear(&buf);
    for (int level = 0; level < depth; ++level) {
      kmp_hw_t type = types[level];
      __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type));
      __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]);
    }
    if (__kmp_is_hybrid_cpu())
      __kmp_str_buf_print(
          &buf, "(%s)", __kmp_hw_get_core_type_string(hw_threads[i].core_type));
    KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str);
  }

  __kmp_str_buf_free(&buf);
}
651 
// Canonicalize a detected topology: collapse radix-1 layers, compute counts
// and ratios, determine uniformity, assign sub_ids, publish the derived
// globals, and pick the last-level cache. Then validates invariants and
// (when affinity is supported) resolves the affinity granularity setting.
void kmp_topology_t::canonicalize() {
  _remove_radix1_layers();
  _gather_enumeration_information();
  _discover_uniformity();
  _set_sub_ids();
  _set_globals();
  _set_last_level_cache();

#if KMP_MIC_SUPPORTED
  // Manually Add L2 = Tile equivalence
  if (__kmp_mic_type == mic3) {
    if (get_level(KMP_HW_L2) != -1)
      set_equivalent_type(KMP_HW_TILE, KMP_HW_L2);
    else if (get_level(KMP_HW_TILE) != -1)
      set_equivalent_type(KMP_HW_L2, KMP_HW_TILE);
  }
#endif

  // Perform post canonicalization checking
  KMP_ASSERT(depth > 0);
  for (int level = 0; level < depth; ++level) {
    // All counts, ratios, and types must be valid
    KMP_ASSERT(count[level] > 0 && ratio[level] > 0);
    KMP_ASSERT_VALID_HW_TYPE(types[level]);
    // Detected types must point to themselves
    KMP_ASSERT(equivalent[types[level]] == types[level]);
  }

#if KMP_AFFINITY_SUPPORTED
  // Set the number of affinity granularity levels
  if (__kmp_affinity_gran_levels < 0) {
    kmp_hw_t gran_type = get_equivalent_type(__kmp_affinity_gran);
    // Check if user's granularity request is valid
    if (gran_type == KMP_HW_UNKNOWN) {
      // First try core, then thread, then package
      kmp_hw_t gran_types[3] = {KMP_HW_CORE, KMP_HW_THREAD, KMP_HW_SOCKET};
      for (auto g : gran_types) {
        if (__kmp_topology->get_equivalent_type(g) != KMP_HW_UNKNOWN) {
          gran_type = g;
          break;
        }
      }
      KMP_ASSERT(gran_type != KMP_HW_UNKNOWN);
      // Warn user what granularity setting will be used instead
      KMP_WARNING(AffGranularityBad, "KMP_AFFINITY",
                  __kmp_hw_get_catalog_string(__kmp_affinity_gran),
                  __kmp_hw_get_catalog_string(gran_type));
      __kmp_affinity_gran = gran_type;
    }
    // gran_levels = number of layers strictly below the granularity layer.
    __kmp_affinity_gran_levels = 0;
    for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i)
      __kmp_affinity_gran_levels++;
  }
#endif // KMP_AFFINITY_SUPPORTED
}
707 
708 // Canonicalize an explicit packages X cores/pkg X threads/core topology
// Canonicalize an explicit packages X cores/pkg X threads/core topology
// (used when the topology is asserted rather than detected). Builds a fixed
// 3-level socket/core/thread topology from the given counts.
void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg,
                                  int nthreads_per_core, int ncores) {
  int ndepth = 3;
  depth = ndepth;
  KMP_FOREACH_HW_TYPE(i) { equivalent[i] = KMP_HW_UNKNOWN; }
  for (int level = 0; level < depth; ++level) {
    count[level] = 0;
    ratio[level] = 0;
  }
  // Cumulative entity counts per level; thread count comes from __kmp_xproc.
  count[0] = npackages;
  count[1] = ncores;
  count[2] = __kmp_xproc;
  // Per-parent branching factors.
  ratio[0] = npackages;
  ratio[1] = ncores_per_pkg;
  ratio[2] = nthreads_per_core;
  equivalent[KMP_HW_SOCKET] = KMP_HW_SOCKET;
  equivalent[KMP_HW_CORE] = KMP_HW_CORE;
  equivalent[KMP_HW_THREAD] = KMP_HW_THREAD;
  //__kmp_avail_proc = __kmp_xproc;
  types[0] = KMP_HW_SOCKET;
  types[1] = KMP_HW_CORE;
  types[2] = KMP_HW_THREAD;
  _discover_uniformity();
}
733 
734 // Apply the KMP_HW_SUBSET envirable to the topology
735 // Returns true if KMP_HW_SUBSET filtered any processors
736 // otherwise, returns false
// Apply the KMP_HW_SUBSET envirable to the topology
// Returns true if KMP_HW_SUBSET filtered any processors
// otherwise, returns false
bool kmp_topology_t::filter_hw_subset() {
  // If KMP_HW_SUBSET wasn't requested, then do nothing.
  if (!__kmp_hw_subset)
    return false;

  // First, sort the KMP_HW_SUBSET items by the machine topology
  __kmp_hw_subset->sort();

  // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
  int hw_subset_depth = __kmp_hw_subset->get_depth();
  kmp_hw_t specified[KMP_HW_LAST];
  KMP_ASSERT(hw_subset_depth > 0);
  KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; }
  for (int i = 0; i < hw_subset_depth; ++i) {
    int max_count;
    int num = __kmp_hw_subset->at(i).num;
    int offset = __kmp_hw_subset->at(i).offset;
    kmp_hw_t type = __kmp_hw_subset->at(i).type;
    kmp_hw_t equivalent_type = equivalent[type];
    int level = get_level(type);

    // Check to see if current layer is in detected machine topology
    if (equivalent_type != KMP_HW_UNKNOWN) {
      // Normalize the requested type to the detected equivalent layer.
      __kmp_hw_subset->at(i).type = equivalent_type;
    } else {
      KMP_WARNING(AffHWSubsetNotExistGeneric,
                  __kmp_hw_get_catalog_string(type));
      return false;
    }

    // Check to see if current layer has already been specified
    // either directly or through an equivalent type
    if (specified[equivalent_type] != KMP_HW_UNKNOWN) {
      KMP_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type),
                  __kmp_hw_get_catalog_string(specified[equivalent_type]));
      return false;
    }
    specified[equivalent_type] = type;

    // Check to see if each layer's num & offset parameters are valid
    max_count = get_ratio(level);
    if (max_count < 0 || num + offset > max_count) {
      bool plural = (num > 1);
      KMP_WARNING(AffHWSubsetManyGeneric,
                  __kmp_hw_get_catalog_string(type, plural));
      return false;
    }
  }

  // Apply the filtered hardware subset
  // Keep surviving hw threads compacted at the front of the array.
  int new_index = 0;
  for (int i = 0; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &hw_thread = hw_threads[i];
    // Check to see if this hardware thread should be filtered
    // A thread survives only if, at every layer mentioned in the subset,
    // its sub_id lies within [offset, offset + num).
    bool should_be_filtered = false;
    for (int level = 0, hw_subset_index = 0;
         level < depth && hw_subset_index < hw_subset_depth; ++level) {
      kmp_hw_t topology_type = types[level];
      auto hw_subset_item = __kmp_hw_subset->at(hw_subset_index);
      kmp_hw_t hw_subset_type = hw_subset_item.type;
      if (topology_type != hw_subset_type)
        continue;
      int num = hw_subset_item.num;
      int offset = hw_subset_item.offset;
      hw_subset_index++;
      if (hw_thread.sub_ids[level] < offset ||
          hw_thread.sub_ids[level] >= offset + num) {
        should_be_filtered = true;
        break;
      }
    }
    if (!should_be_filtered) {
      if (i != new_index)
        hw_threads[new_index] = hw_thread;
      new_index++;
    } else {
#if KMP_AFFINITY_SUPPORTED
      // Filtered-out procs are also removed from the full affinity mask.
      KMP_CPU_CLR(hw_thread.os_id, __kmp_affin_fullMask);
#endif
      __kmp_avail_proc--;
    }
  }
  KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
  num_hw_threads = new_index;

  // Post hardware subset canonicalization
  // Re-derive counts/ratios, uniformity, globals, and LLC for the reduced set.
  _gather_enumeration_information();
  _discover_uniformity();
  _set_globals();
  _set_last_level_cache();
  return true;
}
829 
830 bool kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const {
831   if (hw_level >= depth)
832     return true;
833   bool retval = true;
834   const kmp_hw_thread_t &t1 = hw_threads[hwt1];
835   const kmp_hw_thread_t &t2 = hw_threads[hwt2];
836   for (int i = 0; i < (depth - hw_level); ++i) {
837     if (t1.ids[i] != t2.ids[i])
838       return false;
839   }
840   return retval;
841 }
842 
843 ////////////////////////////////////////////////////////////////////////////////
844 
845 #if KMP_AFFINITY_SUPPORTED
846 class kmp_affinity_raii_t {
847   kmp_affin_mask_t *mask;
848   bool restored;
849 
850 public:
851   kmp_affinity_raii_t() : restored(false) {
852     KMP_CPU_ALLOC(mask);
853     KMP_ASSERT(mask != NULL);
854     __kmp_get_system_affinity(mask, TRUE);
855   }
856   void restore() {
857     __kmp_set_system_affinity(mask, TRUE);
858     KMP_CPU_FREE(mask);
859     restored = true;
860   }
861   ~kmp_affinity_raii_t() {
862     if (!restored) {
863       __kmp_set_system_affinity(mask, TRUE);
864       KMP_CPU_FREE(mask);
865     }
866   }
867 };
868 
// True once pick_api() has selected an affinity implementation.
bool KMPAffinity::picked_api = false;

// Route all allocation/deallocation of affinity objects through the
// runtime's own allocator.
void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
void KMPAffinity::operator delete(void *p) { __kmp_free(p); }
877 
// Choose and install the affinity implementation (hwloc-based when requested
// and enabled, otherwise the native OS API). Idempotent after the first call.
void KMPAffinity::pick_api() {
  KMPAffinity *affinity_dispatch;
  if (picked_api)
    return;
#if KMP_USE_HWLOC
  // Only use Hwloc if affinity isn't explicitly disabled and
  // user requests Hwloc topology method
  if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
      __kmp_affinity_type != affinity_disabled) {
    affinity_dispatch = new KMPHwlocAffinity();
  } else
#endif
  {
    affinity_dispatch = new KMPNativeAffinity();
  }
  __kmp_affinity_dispatch = affinity_dispatch;
  picked_api = true;
}
896 
897 void KMPAffinity::destroy_api() {
898   if (__kmp_affinity_dispatch != NULL) {
899     delete __kmp_affinity_dispatch;
900     __kmp_affinity_dispatch = NULL;
901     picked_api = false;
902   }
903 }
904 
// Advance `scan` past the string just written, leaving it on the NUL byte.
#define KMP_ADVANCE_SCAN(scan)                                                 \
  while (*scan != '\0') {                                                      \
    scan++;                                                                    \
  }

// Print the affinity mask to the character array in a pretty format.
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
char *__kmp_affinity_print_mask(char *buf, int buf_len,
                                kmp_affin_mask_t *mask) {
  int start = 0, finish = 0, previous = 0;
  bool first_range;
  KMP_ASSERT(buf);
  KMP_ASSERT(buf_len >= 40); // caller must leave room for at least one range
  KMP_ASSERT(mask);
  char *scan = buf;
  char *end = buf + buf_len - 1; // last writable byte (terminating NUL slot)

  // Check for empty set.
  if (mask->begin() == mask->end()) {
    KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
    KMP_ADVANCE_SCAN(scan);
    KMP_ASSERT(scan <= end);
    return buf;
  }

  first_range = true;
  start = mask->begin();
  while (1) {
    // Find next range
    // [start, previous] is inclusive range of contiguous bits in mask
    for (finish = mask->next(start), previous = start;
         finish == previous + 1 && finish != mask->end();
         finish = mask->next(finish)) {
      previous = finish;
    }

    // The first range does not need a comma printed before it, but the rest
    // of the ranges do need a comma beforehand
    if (!first_range) {
      KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
      KMP_ADVANCE_SCAN(scan);
    } else {
      first_range = false;
    }
    // Range with three or more contiguous bits in the affinity mask
    if (previous - start > 1) {
      KMP_SNPRINTF(scan, end - scan + 1, "%u-%u", start, previous);
    } else {
      // Range with one or two contiguous bits in the affinity mask
      KMP_SNPRINTF(scan, end - scan + 1, "%u", start);
      KMP_ADVANCE_SCAN(scan);
      if (previous - start > 0) {
        KMP_SNPRINTF(scan, end - scan + 1, ",%u", previous);
      }
    }
    KMP_ADVANCE_SCAN(scan);
    // Start over with new start point
    start = finish;
    if (start == mask->end())
      break;
    // Check for overflow: stop early rather than emit a truncated range.
    if (end - scan < 2)
      break;
  }

  // Check for overflow
  KMP_ASSERT(scan <= end);
  return buf;
}
#undef KMP_ADVANCE_SCAN
977 
978 // Print the affinity mask to the string buffer object in a pretty format
979 // The format is a comma separated list of non-negative integers or integer
980 // ranges: e.g., 1,2,3-5,7,9-15
981 // The format can also be the string "{<empty>}" if no bits are set in mask
982 kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
983                                            kmp_affin_mask_t *mask) {
984   int start = 0, finish = 0, previous = 0;
985   bool first_range;
986   KMP_ASSERT(buf);
987   KMP_ASSERT(mask);
988 
989   __kmp_str_buf_clear(buf);
990 
991   // Check for empty set.
992   if (mask->begin() == mask->end()) {
993     __kmp_str_buf_print(buf, "%s", "{<empty>}");
994     return buf;
995   }
996 
997   first_range = true;
998   start = mask->begin();
999   while (1) {
1000     // Find next range
1001     // [start, previous] is inclusive range of contiguous bits in mask
1002     for (finish = mask->next(start), previous = start;
1003          finish == previous + 1 && finish != mask->end();
1004          finish = mask->next(finish)) {
1005       previous = finish;
1006     }
1007 
1008     // The first range does not need a comma printed before it, but the rest
1009     // of the ranges do need a comma beforehand
1010     if (!first_range) {
1011       __kmp_str_buf_print(buf, "%s", ",");
1012     } else {
1013       first_range = false;
1014     }
1015     // Range with three or more contiguous bits in the affinity mask
1016     if (previous - start > 1) {
1017       __kmp_str_buf_print(buf, "%u-%u", start, previous);
1018     } else {
1019       // Range with one or two contiguous bits in the affinity mask
1020       __kmp_str_buf_print(buf, "%u", start);
1021       if (previous - start > 0) {
1022         __kmp_str_buf_print(buf, ",%u", previous);
1023       }
1024     }
1025     // Start over with new start point
1026     start = finish;
1027     if (start == mask->end())
1028       break;
1029   }
1030   return buf;
1031 }
1032 
// Fill `mask` with every processor on the machine (not just the procs in the
// process's affinity mask).
void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
  KMP_CPU_ZERO(mask);

#if KMP_GROUP_AFFINITY

  if (__kmp_num_proc_groups > 1) {
    int group;
    KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
    for (group = 0; group < __kmp_num_proc_groups; group++) {
      int i;
      int num = __kmp_GetActiveProcessorCount(group);
      for (i = 0; i < num; i++) {
        // Procs are numbered group-major: each Windows processor group owns
        // a window of bit positions sizeof(DWORD_PTR)*CHAR_BIT wide.
        KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
      }
    }
  } else

#endif /* KMP_GROUP_AFFINITY */

  {
    // Single group (or no group affinity): set one bit per OS proc.
    int proc;
    for (proc = 0; proc < __kmp_xproc; proc++) {
      KMP_CPU_SET(proc, mask);
    }
  }
}
1059 
1060 // All of the __kmp_affinity_create_*_map() routines should allocate the
1061 // internal topology object and set the layer ids for it.  Each routine
1062 // returns a boolean on whether it was successful at doing so.
// Mask of OS procs available to the process; topology construction below only
// includes procs whose bit is set here (see the KMP_CPU_ISSET filters).
kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
1064 
1065 #if KMP_USE_HWLOC
// Returns true if the hwloc object represents a cache level.
static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) {
#if HWLOC_API_VERSION >= 0x00020000
  // hwloc 2.x has distinct per-level cache object types; use the helper.
  return hwloc_obj_type_is_cache(obj->type);
#else
  // hwloc 1.x uses a single generic cache object type.
  return obj->type == HWLOC_OBJ_CACHE;
#endif
}
1073 
1074 // Returns KMP_HW_* type derived from HWLOC_* type
1075 static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {
1076 
1077   if (__kmp_hwloc_is_cache_type(obj)) {
1078     if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION)
1079       return KMP_HW_UNKNOWN;
1080     switch (obj->attr->cache.depth) {
1081     case 1:
1082       return KMP_HW_L1;
1083     case 2:
1084 #if KMP_MIC_SUPPORTED
1085       if (__kmp_mic_type == mic3) {
1086         return KMP_HW_TILE;
1087       }
1088 #endif
1089       return KMP_HW_L2;
1090     case 3:
1091       return KMP_HW_L3;
1092     }
1093     return KMP_HW_UNKNOWN;
1094   }
1095 
1096   switch (obj->type) {
1097   case HWLOC_OBJ_PACKAGE:
1098     return KMP_HW_SOCKET;
1099   case HWLOC_OBJ_NUMANODE:
1100     return KMP_HW_NUMA;
1101   case HWLOC_OBJ_CORE:
1102     return KMP_HW_CORE;
1103   case HWLOC_OBJ_PU:
1104     return KMP_HW_THREAD;
1105   case HWLOC_OBJ_GROUP:
1106     if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE)
1107       return KMP_HW_DIE;
1108     else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE)
1109       return KMP_HW_TILE;
1110     else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_MODULE)
1111       return KMP_HW_MODULE;
1112     else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP)
1113       return KMP_HW_PROC_GROUP;
1114     return KMP_HW_UNKNOWN;
1115 #if HWLOC_API_VERSION >= 0x00020100
1116   case HWLOC_OBJ_DIE:
1117     return KMP_HW_DIE;
1118 #endif
1119   }
1120   return KMP_HW_UNKNOWN;
1121 }
1122 
1123 // Returns the number of objects of type 'type' below 'obj' within the topology
1124 // tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
1125 // HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET
1126 // object.
1127 static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
1128                                            hwloc_obj_type_t type) {
1129   int retval = 0;
1130   hwloc_obj_t first;
1131   for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
1132                                            obj->logical_index, type, 0);
1133        first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology,
1134                                                        obj->type, first) == obj;
1135        first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
1136                                           first)) {
1137     ++retval;
1138   }
1139   return retval;
1140 }
1141 
1142 // This gets the sub_id for a lower object under a higher object in the
1143 // topology tree
1144 static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher,
1145                                   hwloc_obj_t lower) {
1146   hwloc_obj_t obj;
1147   hwloc_obj_type_t ltype = lower->type;
1148   int lindex = lower->logical_index - 1;
1149   int sub_id = 0;
1150   // Get the previous lower object
1151   obj = hwloc_get_obj_by_type(t, ltype, lindex);
1152   while (obj && lindex >= 0 &&
1153          hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) {
1154     if (obj->userdata) {
1155       sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata));
1156       break;
1157     }
1158     sub_id++;
1159     lindex--;
1160     obj = hwloc_get_obj_by_type(t, ltype, lindex);
1161   }
1162   // store sub_id + 1 so that 0 is differed from NULL
1163   lower->userdata = RCAST(void *, sub_id + 1);
1164   return sub_id;
1165 }
1166 
// Build __kmp_topology from the hwloc topology tree; returns true on success.
// When the process is not affinity capable, only the global machine counts
// (nPackages, nCoresPerPkg, __kmp_nThreadsPerCore, __kmp_ncores) are inferred.
static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
  kmp_hw_t type;
  int hw_thread_index, sub_id;
  int depth;
  hwloc_obj_t pu, obj, root, prev;
  kmp_hw_t types[KMP_HW_LAST];
  hwloc_obj_type_t hwloc_types[KMP_HW_LAST];

  hwloc_topology_t tp = __kmp_hwloc_topology;
  *msg_id = kmp_i18n_null;
  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
  }

  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from hwloc on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    // hwloc only guarantees existence of PU object, so check PACKAGE and CORE
    hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
    if (o != NULL)
      nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE);
    else
      nCoresPerPkg = 1; // no PACKAGE found
    o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
    if (o != NULL)
      __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
    else
      __kmp_nThreadsPerCore = 1; // no CORE found
    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    if (nCoresPerPkg == 0)
      nCoresPerPkg = 1; // to prevent possible division by 0
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    return true;
  }

  root = hwloc_get_root_obj(tp);

  // First pass: figure out the depth and level types by walking from the
  // first available PU up to the root. In hwloc 2.x, NUMA nodes hang off a
  // separate memory-children list, so they are inserted explicitly.
  depth = 0;
  pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
  KMP_ASSERT(pu);
  obj = pu;
  types[depth] = KMP_HW_THREAD;
  hwloc_types[depth] = obj->type;
  depth++;
  while (obj != root && obj != NULL) {
    obj = obj->parent;
#if HWLOC_API_VERSION >= 0x00020000
    if (obj->memory_arity) {
      hwloc_obj_t memory;
      for (memory = obj->memory_first_child; memory;
           memory = hwloc_get_next_child(tp, obj, memory)) {
        if (memory->type == HWLOC_OBJ_NUMANODE)
          break;
      }
      if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
        types[depth] = KMP_HW_NUMA;
        hwloc_types[depth] = memory->type;
        depth++;
      }
    }
#endif
    type = __kmp_hwloc_type_2_topology_type(obj);
    if (type != KMP_HW_UNKNOWN) {
      types[depth] = type;
      hwloc_types[depth] = obj->type;
      depth++;
    }
  }
  KMP_ASSERT(depth > 0);

  // Get the order for the types correct: the walk above is leaf-to-root but
  // the topology stores levels root-to-leaf, so reverse both arrays in place.
  for (int i = 0, j = depth - 1; i < j; ++i, --j) {
    hwloc_obj_type_t hwloc_temp = hwloc_types[i];
    kmp_hw_t temp = types[i];
    types[i] = types[j];
    types[j] = temp;
    hwloc_types[i] = hwloc_types[j];
    hwloc_types[j] = hwloc_temp;
  }

  // Allocate the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);

  hw_thread_index = 0;
  pu = NULL;
  // Second pass: for every PU in the machine, walk up to the root filling in
  // this hw thread's id at each level (ids[depth-1] is the PU's own index).
  // Assignment in the loop condition is intentional (iterate all PUs).
  while (pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu)) {
    int index = depth - 1;
    bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask);
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
    if (included) {
      hw_thread.clear();
      hw_thread.ids[index] = pu->logical_index;
      hw_thread.os_id = pu->os_index;
      index--;
    }
    obj = pu;
    prev = obj;
    while (obj != root && obj != NULL) {
      obj = obj->parent;
#if HWLOC_API_VERSION >= 0x00020000
      // NUMA Nodes are handled differently since they are not within the
      // parent/child structure anymore.  They are separate children
      // of obj (memory_first_child points to first memory child)
      if (obj->memory_arity) {
        hwloc_obj_t memory;
        for (memory = obj->memory_first_child; memory;
             memory = hwloc_get_next_child(tp, obj, memory)) {
          if (memory->type == HWLOC_OBJ_NUMANODE)
            break;
        }
        if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
          sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev);
          if (included) {
            hw_thread.ids[index] = memory->logical_index;
            hw_thread.ids[index + 1] = sub_id;
            index--;
          }
          prev = memory;
        }
        prev = obj;
      }
#endif
      type = __kmp_hwloc_type_2_topology_type(obj);
      if (type != KMP_HW_UNKNOWN) {
        sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev);
        if (included) {
          hw_thread.ids[index] = obj->logical_index;
          hw_thread.ids[index + 1] = sub_id;
          index--;
        }
        prev = obj;
      }
    }
    // Only PUs present in the full mask occupy a slot in the topology table.
    if (included)
      hw_thread_index++;
  }
  __kmp_topology->sort_ids();
  return true;
}
1308 #endif // KMP_USE_HWLOC
1309 
1310 // If we don't know how to retrieve the machine's processor topology, or
1311 // encounter an error in doing so, this routine is called to form a "flat"
1312 // mapping of os thread id's <-> processor id's.
1313 static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {
1314   *msg_id = kmp_i18n_null;
1315   int depth = 3;
1316   kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
1317 
1318   if (__kmp_affinity_verbose) {
1319     KMP_INFORM(UsingFlatOS, "KMP_AFFINITY");
1320   }
1321 
1322   // Even if __kmp_affinity_type == affinity_none, this routine might still
1323   // called to set __kmp_ncores, as well as
1324   // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1325   if (!KMP_AFFINITY_CAPABLE()) {
1326     KMP_ASSERT(__kmp_affinity_type == affinity_none);
1327     __kmp_ncores = nPackages = __kmp_xproc;
1328     __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1329     return true;
1330   }
1331 
1332   // When affinity is off, this routine will still be called to set
1333   // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1334   // Make sure all these vars are set correctly, and return now if affinity is
1335   // not enabled.
1336   __kmp_ncores = nPackages = __kmp_avail_proc;
1337   __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1338 
1339   // Construct the data structure to be returned.
1340   __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
1341   int avail_ct = 0;
1342   int i;
1343   KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
1344     // Skip this proc if it is not included in the machine model.
1345     if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
1346       continue;
1347     }
1348     kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
1349     hw_thread.clear();
1350     hw_thread.os_id = i;
1351     hw_thread.ids[0] = i;
1352     hw_thread.ids[1] = 0;
1353     hw_thread.ids[2] = 0;
1354     avail_ct++;
1355   }
1356   if (__kmp_affinity_verbose) {
1357     KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
1358   }
1359   return true;
1360 }
1361 
1362 #if KMP_GROUP_AFFINITY
1363 // If multiple Windows* OS processor groups exist, we can create a 2-level
1364 // topology map with the groups at level 0 and the individual procs at level 1.
1365 // This facilitates letting the threads float among all procs in a group,
1366 // if granularity=group (the default when there are multiple groups).
1367 static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
1368   *msg_id = kmp_i18n_null;
1369   int depth = 3;
1370   kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD};
1371   const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR);
1372 
1373   if (__kmp_affinity_verbose) {
1374     KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
1375   }
1376 
1377   // If we aren't affinity capable, then use flat topology
1378   if (!KMP_AFFINITY_CAPABLE()) {
1379     KMP_ASSERT(__kmp_affinity_type == affinity_none);
1380     nPackages = __kmp_num_proc_groups;
1381     __kmp_nThreadsPerCore = 1;
1382     __kmp_ncores = __kmp_xproc;
1383     nCoresPerPkg = nPackages / __kmp_ncores;
1384     return true;
1385   }
1386 
1387   // Construct the data structure to be returned.
1388   __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
1389   int avail_ct = 0;
1390   int i;
1391   KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
1392     // Skip this proc if it is not included in the machine model.
1393     if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
1394       continue;
1395     }
1396     kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++);
1397     hw_thread.clear();
1398     hw_thread.os_id = i;
1399     hw_thread.ids[0] = i / BITS_PER_GROUP;
1400     hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP;
1401   }
1402   return true;
1403 }
1404 #endif /* KMP_GROUP_AFFINITY */
1405 
1406 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1407 
1408 template <kmp_uint32 LSB, kmp_uint32 MSB>
1409 static inline unsigned __kmp_extract_bits(kmp_uint32 v) {
1410   const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB;
1411   const kmp_uint32 SHIFT_RIGHT = LSB;
1412   kmp_uint32 retval = v;
1413   retval <<= SHIFT_LEFT;
1414   retval >>= (SHIFT_LEFT + SHIFT_RIGHT);
1415   return retval;
1416 }
1417 
// Width in bits of the mask needed to cover `count` distinct ids, i.e. the
// smallest r with 2^r >= count (0 for count <= 1).
static int __kmp_cpuid_mask_width(int count) {
  int width;
  for (width = 0; (1 << width) < count; ++width) {
  }
  return width;
}
1425 
// Per-OS-proc record gathered by binding to the proc and issuing cpuid;
// the raw cpuid fields are later decoded into package/core/thread ids.
class apicThreadInfo {
public:
  unsigned osId; // param to __kmp_affinity_bind_thread
  unsigned apicId; // from cpuid after binding
  unsigned maxCoresPerPkg; //      ""
  unsigned maxThreadsPerPkg; //      ""
  unsigned pkgId; // inferred from above values
  unsigned coreId; //      ""
  unsigned threadId; //      ""
};
1436 
1437 static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
1438                                                      const void *b) {
1439   const apicThreadInfo *aa = (const apicThreadInfo *)a;
1440   const apicThreadInfo *bb = (const apicThreadInfo *)b;
1441   if (aa->pkgId < bb->pkgId)
1442     return -1;
1443   if (aa->pkgId > bb->pkgId)
1444     return 1;
1445   if (aa->coreId < bb->coreId)
1446     return -1;
1447   if (aa->coreId > bb->coreId)
1448     return 1;
1449   if (aa->threadId < bb->threadId)
1450     return -1;
1451   if (aa->threadId > bb->threadId)
1452     return 1;
1453   return 0;
1454 }
1455 
1456 class kmp_cache_info_t {
1457 public:
1458   struct info_t {
1459     unsigned level, mask;
1460   };
1461   kmp_cache_info_t() : depth(0) { get_leaf4_levels(); }
1462   size_t get_depth() const { return depth; }
1463   info_t &operator[](size_t index) { return table[index]; }
1464   const info_t &operator[](size_t index) const { return table[index]; }
1465 
1466   static kmp_hw_t get_topology_type(unsigned level) {
1467     KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL);
1468     switch (level) {
1469     case 1:
1470       return KMP_HW_L1;
1471     case 2:
1472       return KMP_HW_L2;
1473     case 3:
1474       return KMP_HW_L3;
1475     }
1476     return KMP_HW_UNKNOWN;
1477   }
1478 
1479 private:
1480   static const int MAX_CACHE_LEVEL = 3;
1481 
1482   size_t depth;
1483   info_t table[MAX_CACHE_LEVEL];
1484 
1485   void get_leaf4_levels() {
1486     unsigned level = 0;
1487     while (depth < MAX_CACHE_LEVEL) {
1488       unsigned cache_type, max_threads_sharing;
1489       unsigned cache_level, cache_mask_width;
1490       kmp_cpuid buf2;
1491       __kmp_x86_cpuid(4, level, &buf2);
1492       cache_type = __kmp_extract_bits<0, 4>(buf2.eax);
1493       if (!cache_type)
1494         break;
1495       // Skip instruction caches
1496       if (cache_type == 2) {
1497         level++;
1498         continue;
1499       }
1500       max_threads_sharing = __kmp_extract_bits<14, 25>(buf2.eax) + 1;
1501       cache_mask_width = __kmp_cpuid_mask_width(max_threads_sharing);
1502       cache_level = __kmp_extract_bits<5, 7>(buf2.eax);
1503       table[depth].level = cache_level;
1504       table[depth].mask = ((-1) << cache_mask_width);
1505       depth++;
1506       level++;
1507     }
1508   }
1509 };
1510 
1511 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
1512 // an algorithm which cycles through the available os threads, setting
1513 // the current thread's affinity mask to that thread, and then retrieves
1514 // the Apic Id for each thread context using the cpuid instruction.
1515 static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
1516   kmp_cpuid buf;
1517   *msg_id = kmp_i18n_null;
1518 
1519   if (__kmp_affinity_verbose) {
1520     KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
1521   }
1522 
1523   // Check if cpuid leaf 4 is supported.
1524   __kmp_x86_cpuid(0, 0, &buf);
1525   if (buf.eax < 4) {
1526     *msg_id = kmp_i18n_str_NoLeaf4Support;
1527     return false;
1528   }
1529 
1530   // The algorithm used starts by setting the affinity to each available thread
1531   // and retrieving info from the cpuid instruction, so if we are not capable of
  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
  // we
1533   // need to do something else - use the defaults that we calculated from
1534   // issuing cpuid without binding to each proc.
1535   if (!KMP_AFFINITY_CAPABLE()) {
1536     // Hack to try and infer the machine topology using only the data
1537     // available from cpuid on the current thread, and __kmp_xproc.
1538     KMP_ASSERT(__kmp_affinity_type == affinity_none);
1539 
1540     // Get an upper bound on the number of threads per package using cpuid(1).
    // On some OS/chip combinations where HT is supported by the chip but is
1542     // disabled, this value will be 2 on a single core chip. Usually, it will be
1543     // 2 if HT is enabled and 1 if HT is disabled.
1544     __kmp_x86_cpuid(1, 0, &buf);
1545     int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
1546     if (maxThreadsPerPkg == 0) {
1547       maxThreadsPerPkg = 1;
1548     }
1549 
1550     // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
1551     // value.
1552     //
1553     // The author of cpu_count.cpp treated this only an upper bound on the
1554     // number of cores, but I haven't seen any cases where it was greater than
1555     // the actual number of cores, so we will treat it as exact in this block of
1556     // code.
1557     //
1558     // First, we need to check if cpuid(4) is supported on this chip. To see if
1559     // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
1560     // greater.
1561     __kmp_x86_cpuid(0, 0, &buf);
1562     if (buf.eax >= 4) {
1563       __kmp_x86_cpuid(4, 0, &buf);
1564       nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1565     } else {
1566       nCoresPerPkg = 1;
1567     }
1568 
1569     // There is no way to reliably tell if HT is enabled without issuing the
  // cpuid instruction from every thread, and correlating the cpuid info, so
1571     // if the machine is not affinity capable, we assume that HT is off. We have
1572     // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
1573     // does not support HT.
1574     //
1575     // - Older OSes are usually found on machines with older chips, which do not
1576     //   support HT.
1577     // - The performance penalty for mistakenly identifying a machine as HT when
1578     //   it isn't (which results in blocktime being incorrectly set to 0) is
1579     //   greater than the penalty when for mistakenly identifying a machine as
1580     //   being 1 thread/core when it is really HT enabled (which results in
1581     //   blocktime being incorrectly set to a positive value).
1582     __kmp_ncores = __kmp_xproc;
1583     nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1584     __kmp_nThreadsPerCore = 1;
1585     return true;
1586   }
1587 
1588   // From here on, we can assume that it is safe to call
1589   // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
1590   // __kmp_affinity_type = affinity_none.
1591 
1592   // Save the affinity mask for the current thread.
1593   kmp_affinity_raii_t previous_affinity;
1594 
1595   // Run through each of the available contexts, binding the current thread
1596   // to it, and obtaining the pertinent information using the cpuid instr.
1597   //
1598   // The relevant information is:
1599   // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
1600   //     has a uniqie Apic Id, which is of the form pkg# : core# : thread#.
1601   // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
1602   //     of this field determines the width of the core# + thread# fields in the
1603   //     Apic Id. It is also an upper bound on the number of threads per
  //     package, but it has been verified that situations happen where it is
  //     not
1605   //     exact. In particular, on certain OS/chip combinations where Intel(R)
1606   //     Hyper-Threading Technology is supported by the chip but has been
1607   //     disabled, the value of this field will be 2 (for a single core chip).
1608   //     On other OS/chip combinations supporting Intel(R) Hyper-Threading
1609   //     Technology, the value of this field will be 1 when Intel(R)
1610   //     Hyper-Threading Technology is disabled and 2 when it is enabled.
1611   // - Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4). The value
1612   //     of this field (+1) determines the width of the core# field in the Apic
1613   //     Id. The comments in "cpucount.cpp" say that this value is an upper
1614   //     bound, but the IA-32 architecture manual says that it is exactly the
1615   //     number of cores per package, and I haven't seen any case where it
1616   //     wasn't.
1617   //
1618   // From this information, deduce the package Id, core Id, and thread Id,
1619   // and set the corresponding fields in the apicThreadInfo struct.
1620   unsigned i;
1621   apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
1622       __kmp_avail_proc * sizeof(apicThreadInfo));
1623   unsigned nApics = 0;
1624   KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
1625     // Skip this proc if it is not included in the machine model.
1626     if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
1627       continue;
1628     }
1629     KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
1630 
1631     __kmp_affinity_dispatch->bind_thread(i);
1632     threadInfo[nApics].osId = i;
1633 
1634     // The apic id and max threads per pkg come from cpuid(1).
1635     __kmp_x86_cpuid(1, 0, &buf);
1636     if (((buf.edx >> 9) & 1) == 0) {
1637       __kmp_free(threadInfo);
1638       *msg_id = kmp_i18n_str_ApicNotPresent;
1639       return false;
1640     }
1641     threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
1642     threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
1643     if (threadInfo[nApics].maxThreadsPerPkg == 0) {
1644       threadInfo[nApics].maxThreadsPerPkg = 1;
1645     }
1646 
1647     // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
1648     // value.
1649     //
1650     // First, we need to check if cpuid(4) is supported on this chip. To see if
1651     // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
1652     // or greater.
1653     __kmp_x86_cpuid(0, 0, &buf);
1654     if (buf.eax >= 4) {
1655       __kmp_x86_cpuid(4, 0, &buf);
1656       threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1657     } else {
1658       threadInfo[nApics].maxCoresPerPkg = 1;
1659     }
1660 
1661     // Infer the pkgId / coreId / threadId using only the info obtained locally.
1662     int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
1663     threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
1664 
1665     int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
1666     int widthT = widthCT - widthC;
1667     if (widthT < 0) {
1668       // I've never seen this one happen, but I suppose it could, if the cpuid
1669       // instruction on a chip was really screwed up. Make sure to restore the
1670       // affinity mask before the tail call.
1671       __kmp_free(threadInfo);
1672       *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1673       return false;
1674     }
1675 
1676     int maskC = (1 << widthC) - 1;
1677     threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;
1678 
1679     int maskT = (1 << widthT) - 1;
1680     threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;
1681 
1682     nApics++;
1683   }
1684 
1685   // We've collected all the info we need.
1686   // Restore the old affinity mask for this thread.
1687   previous_affinity.restore();
1688 
1689   // Sort the threadInfo table by physical Id.
1690   qsort(threadInfo, nApics, sizeof(*threadInfo),
1691         __kmp_affinity_cmp_apicThreadInfo_phys_id);
1692 
1693   // The table is now sorted by pkgId / coreId / threadId, but we really don't
1694   // know the radix of any of the fields. pkgId's may be sparsely assigned among
1695   // the chips on a system. Although coreId's are usually assigned
1696   // [0 .. coresPerPkg-1] and threadId's are usually assigned
1697   // [0..threadsPerCore-1], we don't want to make any such assumptions.
1698   //
1699   // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
1700   // total # packages) are at this point - we want to determine that now. We
1701   // only have an upper bound on the first two figures.
1702   //
1703   // We also perform a consistency check at this point: the values returned by
1704   // the cpuid instruction for any thread bound to a given package had better
1705   // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1706   nPackages = 1;
1707   nCoresPerPkg = 1;
1708   __kmp_nThreadsPerCore = 1;
1709   unsigned nCores = 1;
1710 
1711   unsigned pkgCt = 1; // to determine radii
1712   unsigned lastPkgId = threadInfo[0].pkgId;
1713   unsigned coreCt = 1;
1714   unsigned lastCoreId = threadInfo[0].coreId;
1715   unsigned threadCt = 1;
1716   unsigned lastThreadId = threadInfo[0].threadId;
1717 
1718   // intra-pkg consist checks
1719   unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1720   unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1721 
1722   for (i = 1; i < nApics; i++) {
1723     if (threadInfo[i].pkgId != lastPkgId) {
1724       nCores++;
1725       pkgCt++;
1726       lastPkgId = threadInfo[i].pkgId;
1727       if ((int)coreCt > nCoresPerPkg)
1728         nCoresPerPkg = coreCt;
1729       coreCt = 1;
1730       lastCoreId = threadInfo[i].coreId;
1731       if ((int)threadCt > __kmp_nThreadsPerCore)
1732         __kmp_nThreadsPerCore = threadCt;
1733       threadCt = 1;
1734       lastThreadId = threadInfo[i].threadId;
1735 
1736       // This is a different package, so go on to the next iteration without
1737       // doing any consistency checks. Reset the consistency check vars, though.
1738       prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1739       prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1740       continue;
1741     }
1742 
1743     if (threadInfo[i].coreId != lastCoreId) {
1744       nCores++;
1745       coreCt++;
1746       lastCoreId = threadInfo[i].coreId;
1747       if ((int)threadCt > __kmp_nThreadsPerCore)
1748         __kmp_nThreadsPerCore = threadCt;
1749       threadCt = 1;
1750       lastThreadId = threadInfo[i].threadId;
1751     } else if (threadInfo[i].threadId != lastThreadId) {
1752       threadCt++;
1753       lastThreadId = threadInfo[i].threadId;
1754     } else {
1755       __kmp_free(threadInfo);
1756       *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1757       return false;
1758     }
1759 
1760     // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
    // fields agree between all the threads bound to a given package.
1762     if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
1763         (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1764       __kmp_free(threadInfo);
1765       *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1766       return false;
1767     }
1768   }
1769   // When affinity is off, this routine will still be called to set
1770   // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1771   // Make sure all these vars are set correctly
1772   nPackages = pkgCt;
1773   if ((int)coreCt > nCoresPerPkg)
1774     nCoresPerPkg = coreCt;
1775   if ((int)threadCt > __kmp_nThreadsPerCore)
1776     __kmp_nThreadsPerCore = threadCt;
1777   __kmp_ncores = nCores;
1778   KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);
1779 
1780   // Now that we've determined the number of packages, the number of cores per
1781   // package, and the number of threads per core, we can construct the data
1782   // structure that is to be returned.
1783   int idx = 0;
1784   int pkgLevel = 0;
1785   int coreLevel = 1;
1786   int threadLevel = 2;
1787   //(__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1788   int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
1789   kmp_hw_t types[3];
1790   if (pkgLevel >= 0)
1791     types[idx++] = KMP_HW_SOCKET;
1792   if (coreLevel >= 0)
1793     types[idx++] = KMP_HW_CORE;
1794   if (threadLevel >= 0)
1795     types[idx++] = KMP_HW_THREAD;
1796 
1797   KMP_ASSERT(depth > 0);
1798   __kmp_topology = kmp_topology_t::allocate(nApics, depth, types);
1799 
1800   for (i = 0; i < nApics; ++i) {
1801     idx = 0;
1802     unsigned os = threadInfo[i].osId;
1803     kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
1804     hw_thread.clear();
1805 
1806     if (pkgLevel >= 0) {
1807       hw_thread.ids[idx++] = threadInfo[i].pkgId;
1808     }
1809     if (coreLevel >= 0) {
1810       hw_thread.ids[idx++] = threadInfo[i].coreId;
1811     }
1812     if (threadLevel >= 0) {
1813       hw_thread.ids[idx++] = threadInfo[i].threadId;
1814     }
1815     hw_thread.os_id = os;
1816   }
1817 
1818   __kmp_free(threadInfo);
1819   __kmp_topology->sort_ids();
1820   if (!__kmp_topology->check_ids()) {
1821     kmp_topology_t::deallocate(__kmp_topology);
1822     __kmp_topology = nullptr;
1823     *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1824     return false;
1825   }
1826   return true;
1827 }
1828 
1829 // Hybrid cpu detection using CPUID.1A
1830 // Thread should be pinned to processor already
1831 static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type,
1832                                   unsigned *native_model_id) {
1833   kmp_cpuid buf;
1834   __kmp_x86_cpuid(0x1a, 0, &buf);
1835   *type = (kmp_hw_core_type_t)__kmp_extract_bits<24, 31>(buf.eax);
1836   *native_model_id = __kmp_extract_bits<0, 23>(buf.eax);
1837 }
1838 
1839 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1840 // architectures support a newer interface for specifying the x2APIC Ids,
1841 // based on CPUID.B or CPUID.1F
1842 /*
1843  * CPUID.B or 1F, Input ECX (sub leaf # aka level number)
1844     Bits            Bits            Bits           Bits
1845     31-16           15-8            7-4            4-0
1846 ---+-----------+--------------+-------------+-----------------+
1847 EAX| reserved  |   reserved   |   reserved  |  Bits to Shift  |
1848 ---+-----------|--------------+-------------+-----------------|
1849 EBX| reserved  | Num logical processors at level (16 bits)    |
1850 ---+-----------|--------------+-------------------------------|
1851 ECX| reserved  |   Level Type |      Level Number (8 bits)    |
1852 ---+-----------+--------------+-------------------------------|
1853 EDX|                    X2APIC ID (32 bits)                   |
1854 ---+----------------------------------------------------------+
1855 */
1856 
// Level types reported by CPUID.B / CPUID.1F in ECX[15:8]. The numeric
// values are also used as bit positions in the known_levels bitmask passed
// to __kmp_x2apicid_get_levels().
enum {
  INTEL_LEVEL_TYPE_INVALID = 0, // Package level
  INTEL_LEVEL_TYPE_SMT = 1,
  INTEL_LEVEL_TYPE_CORE = 2,
  INTEL_LEVEL_TYPE_TILE = 3,
  INTEL_LEVEL_TYPE_MODULE = 4,
  INTEL_LEVEL_TYPE_DIE = 5,
  INTEL_LEVEL_TYPE_LAST = 6, // number of level types; not a real level
};
1866 
// Information decoded from one CPUID.B/1F sub-leaf:
//   level_type  - INTEL_LEVEL_TYPE_* value for this level
//   mask        - bits of the APIC id that identify an object at this level
//   mask_width  - shift count from EAX[4:0] for this level
//   nitems      - number of logical processors at this level (EBX[15:0])
//   cache_mask  - mask selecting the APIC id bits above this level
struct cpuid_level_info_t {
  unsigned level_type, mask, mask_width, nitems, cache_mask;
};
1870 
1871 static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {
1872   switch (intel_type) {
1873   case INTEL_LEVEL_TYPE_INVALID:
1874     return KMP_HW_SOCKET;
1875   case INTEL_LEVEL_TYPE_SMT:
1876     return KMP_HW_THREAD;
1877   case INTEL_LEVEL_TYPE_CORE:
1878     return KMP_HW_CORE;
1879   case INTEL_LEVEL_TYPE_TILE:
1880     return KMP_HW_TILE;
1881   case INTEL_LEVEL_TYPE_MODULE:
1882     return KMP_HW_MODULE;
1883   case INTEL_LEVEL_TYPE_DIE:
1884     return KMP_HW_DIE;
1885   }
1886   return KMP_HW_UNKNOWN;
1887 }
1888 
1889 // This function takes the topology leaf, a levels array to store the levels
1890 // detected and a bitmap of the known levels.
1891 // Returns the number of levels in the topology
1892 static unsigned
1893 __kmp_x2apicid_get_levels(int leaf,
1894                           cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST],
1895                           kmp_uint64 known_levels) {
1896   unsigned level, levels_index;
1897   unsigned level_type, mask_width, nitems;
1898   kmp_cpuid buf;
1899 
1900   // New algorithm has known topology layers act as highest unknown topology
1901   // layers when unknown topology layers exist.
1902   // e.g., Suppose layers were SMT <X> CORE <Y> <Z> PACKAGE, where <X> <Y> <Z>
1903   // are unknown topology layers, Then SMT will take the characteristics of
1904   // (SMT x <X>) and CORE will take the characteristics of (CORE x <Y> x <Z>).
1905   // This eliminates unknown portions of the topology while still keeping the
1906   // correct structure.
1907   level = levels_index = 0;
1908   do {
1909     __kmp_x86_cpuid(leaf, level, &buf);
1910     level_type = __kmp_extract_bits<8, 15>(buf.ecx);
1911     mask_width = __kmp_extract_bits<0, 4>(buf.eax);
1912     nitems = __kmp_extract_bits<0, 15>(buf.ebx);
1913     if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0)
1914       return 0;
1915 
1916     if (known_levels & (1ull << level_type)) {
1917       // Add a new level to the topology
1918       KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST);
1919       levels[levels_index].level_type = level_type;
1920       levels[levels_index].mask_width = mask_width;
1921       levels[levels_index].nitems = nitems;
1922       levels_index++;
1923     } else {
1924       // If it is an unknown level, then logically move the previous layer up
1925       if (levels_index > 0) {
1926         levels[levels_index - 1].mask_width = mask_width;
1927         levels[levels_index - 1].nitems = nitems;
1928       }
1929     }
1930     level++;
1931   } while (level_type != INTEL_LEVEL_TYPE_INVALID);
1932 
1933   // Set the masks to & with apicid
1934   for (unsigned i = 0; i < levels_index; ++i) {
1935     if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) {
1936       levels[i].mask = ~((-1) << levels[i].mask_width);
1937       levels[i].cache_mask = (-1) << levels[i].mask_width;
1938       for (unsigned j = 0; j < i; ++j)
1939         levels[i].mask ^= levels[j].mask;
1940     } else {
1941       KMP_DEBUG_ASSERT(levels_index > 0);
1942       levels[i].mask = (-1) << levels[i - 1].mask_width;
1943       levels[i].cache_mask = 0;
1944     }
1945   }
1946   return levels_index;
1947 }
1948 
1949 static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
1950 
1951   cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST];
1952   kmp_hw_t types[INTEL_LEVEL_TYPE_LAST];
1953   unsigned levels_index;
1954   kmp_cpuid buf;
1955   kmp_uint64 known_levels;
1956   int topology_leaf, highest_leaf, apic_id;
1957   int num_leaves;
1958   static int leaves[] = {0, 0};
1959 
1960   kmp_i18n_id_t leaf_message_id;
1961 
1962   KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST);
1963 
1964   *msg_id = kmp_i18n_null;
1965   if (__kmp_affinity_verbose) {
1966     KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
1967   }
1968 
1969   // Figure out the known topology levels
1970   known_levels = 0ull;
1971   for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) {
1972     if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) {
1973       known_levels |= (1ull << i);
1974     }
1975   }
1976 
1977   // Get the highest cpuid leaf supported
1978   __kmp_x86_cpuid(0, 0, &buf);
1979   highest_leaf = buf.eax;
1980 
1981   // If a specific topology method was requested, only allow that specific leaf
1982   // otherwise, try both leaves 31 and 11 in that order
1983   num_leaves = 0;
1984   if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
1985     num_leaves = 1;
1986     leaves[0] = 11;
1987     leaf_message_id = kmp_i18n_str_NoLeaf11Support;
1988   } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
1989     num_leaves = 1;
1990     leaves[0] = 31;
1991     leaf_message_id = kmp_i18n_str_NoLeaf31Support;
1992   } else {
1993     num_leaves = 2;
1994     leaves[0] = 31;
1995     leaves[1] = 11;
1996     leaf_message_id = kmp_i18n_str_NoLeaf11Support;
1997   }
1998 
1999   // Check to see if cpuid leaf 31 or 11 is supported.
2000   __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2001   topology_leaf = -1;
2002   for (int i = 0; i < num_leaves; ++i) {
2003     int leaf = leaves[i];
2004     if (highest_leaf < leaf)
2005       continue;
2006     __kmp_x86_cpuid(leaf, 0, &buf);
2007     if (buf.ebx == 0)
2008       continue;
2009     topology_leaf = leaf;
2010     levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels);
2011     if (levels_index == 0)
2012       continue;
2013     break;
2014   }
2015   if (topology_leaf == -1 || levels_index == 0) {
2016     *msg_id = leaf_message_id;
2017     return false;
2018   }
2019   KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST);
2020 
2021   // The algorithm used starts by setting the affinity to each available thread
2022   // and retrieving info from the cpuid instruction, so if we are not capable of
2023   // calling __kmp_get_system_affinity() and __kmp_get_system_affinity(), then
2024   // we need to do something else - use the defaults that we calculated from
2025   // issuing cpuid without binding to each proc.
2026   if (!KMP_AFFINITY_CAPABLE()) {
2027     // Hack to try and infer the machine topology using only the data
2028     // available from cpuid on the current thread, and __kmp_xproc.
2029     KMP_ASSERT(__kmp_affinity_type == affinity_none);
2030     for (unsigned i = 0; i < levels_index; ++i) {
2031       if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) {
2032         __kmp_nThreadsPerCore = levels[i].nitems;
2033       } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) {
2034         nCoresPerPkg = levels[i].nitems;
2035       }
2036     }
2037     __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
2038     nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
2039     return true;
2040   }
2041 
2042   // Allocate the data structure to be returned.
2043   int depth = levels_index;
2044   for (int i = depth - 1, j = 0; i >= 0; --i, ++j)
2045     types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type);
2046   __kmp_topology =
2047       kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types);
2048 
2049   // Insert equivalent cache types if they exist
2050   kmp_cache_info_t cache_info;
2051   for (size_t i = 0; i < cache_info.get_depth(); ++i) {
2052     const kmp_cache_info_t::info_t &info = cache_info[i];
2053     unsigned cache_mask = info.mask;
2054     unsigned cache_level = info.level;
2055     for (unsigned j = 0; j < levels_index; ++j) {
2056       unsigned hw_cache_mask = levels[j].cache_mask;
2057       kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level);
2058       if (hw_cache_mask == cache_mask && j < levels_index - 1) {
2059         kmp_hw_t type =
2060             __kmp_intel_type_2_topology_type(levels[j + 1].level_type);
2061         __kmp_topology->set_equivalent_type(cache_type, type);
2062       }
2063     }
2064   }
2065 
2066   // From here on, we can assume that it is safe to call
2067   // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
2068   // __kmp_affinity_type = affinity_none.
2069 
2070   // Save the affinity mask for the current thread.
2071   kmp_affinity_raii_t previous_affinity;
2072 
2073   // Run through each of the available contexts, binding the current thread
2074   // to it, and obtaining the pertinent information using the cpuid instr.
2075   unsigned int proc;
2076   int hw_thread_index = 0;
2077   KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
2078     cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST];
2079     unsigned my_levels_index;
2080 
2081     // Skip this proc if it is not included in the machine model.
2082     if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
2083       continue;
2084     }
2085     KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc);
2086 
2087     __kmp_affinity_dispatch->bind_thread(proc);
2088 
2089     // New algorithm
2090     __kmp_x86_cpuid(topology_leaf, 0, &buf);
2091     apic_id = buf.edx;
2092     kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
2093     my_levels_index =
2094         __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels);
2095     if (my_levels_index == 0 || my_levels_index != levels_index) {
2096       *msg_id = kmp_i18n_str_InvalidCpuidInfo;
2097       return false;
2098     }
2099     hw_thread.clear();
2100     hw_thread.os_id = proc;
2101     // Put in topology information
2102     for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) {
2103       hw_thread.ids[idx] = apic_id & my_levels[j].mask;
2104       if (j > 0) {
2105         hw_thread.ids[idx] >>= my_levels[j - 1].mask_width;
2106       }
2107     }
2108     // Hybrid information
2109     if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) {
2110       kmp_hw_core_type_t type;
2111       unsigned native_model_id;
2112       __kmp_get_hybrid_info(&type, &native_model_id);
2113       hw_thread.core_type = type;
2114     }
2115     hw_thread_index++;
2116   }
2117   KMP_ASSERT(hw_thread_index > 0);
2118   __kmp_topology->sort_ids();
2119   if (!__kmp_topology->check_ids()) {
2120     kmp_topology_t::deallocate(__kmp_topology);
2121     __kmp_topology = nullptr;
2122     *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
2123     return false;
2124   }
2125   return true;
2126 }
2127 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
2128 
2129 #define osIdIndex 0
2130 #define threadIdIndex 1
2131 #define coreIdIndex 2
2132 #define pkgIdIndex 3
2133 #define nodeIdIndex 4
2134 
2135 typedef unsigned *ProcCpuInfo;
2136 static unsigned maxIndex = pkgIdIndex;
2137 
2138 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
2139                                                   const void *b) {
2140   unsigned i;
2141   const unsigned *aa = *(unsigned *const *)a;
2142   const unsigned *bb = *(unsigned *const *)b;
2143   for (i = maxIndex;; i--) {
2144     if (aa[i] < bb[i])
2145       return -1;
2146     if (aa[i] > bb[i])
2147       return 1;
2148     if (i == osIdIndex)
2149       break;
2150   }
2151   return 0;
2152 }
2153 
2154 #if KMP_USE_HIER_SCHED
2155 // Set the array sizes for the hierarchy layers
// Fill in the __kmp_hier_max_units[] and __kmp_hier_threads_per[] tables
// from the detected topology counts (nPackages, nCoresPerPkg,
// __kmp_nThreadsPerCore, __kmp_ncores). Indices are layer+1 because entry 0
// is reserved (see kmp_hier_layer_e usage in the accessors below).
static void __kmp_dispatch_set_hierarchy_values() {
  // Set the maximum number of L1's to number of cores
  // Set the maximum number of L2's to either number of cores / 2 for
  // Intel(R) Xeon Phi(TM) coprocessor formally codenamed Knights Landing
  // Or the number of cores for Intel(R) Xeon(R) processors
  // Set the maximum number of NUMA nodes and L3's to number of packages
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
      nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) &&   \
    KMP_MIC_SUPPORTED
  if (__kmp_mic_type >= mic3)
    __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2;
  else
#endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
    __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores;
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages;
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages;
  // LOOP is a single logical unit spanning the whole machine
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1;
  // Set the number of threads per unit
  // Number of hardware threads per L1/L2/L3/NUMA/LOOP
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
      __kmp_nThreadsPerCore;
#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) &&   \
    KMP_MIC_SUPPORTED
  if (__kmp_mic_type >= mic3)
    // KNL: two cores share an L2
    __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
        2 * __kmp_nThreadsPerCore;
  else
#endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
    __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
        __kmp_nThreadsPerCore;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] =
      nCoresPerPkg * __kmp_nThreadsPerCore;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] =
      nCoresPerPkg * __kmp_nThreadsPerCore;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] =
      nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
}
2196 
2197 // Return the index into the hierarchy for this tid and layer type (L1, L2, etc)
2198 // i.e., this thread's L1 or this thread's L2, etc.
2199 int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) {
2200   int index = type + 1;
2201   int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
2202   KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST);
2203   if (type == kmp_hier_layer_e::LAYER_THREAD)
2204     return tid;
2205   else if (type == kmp_hier_layer_e::LAYER_LOOP)
2206     return 0;
2207   KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0);
2208   if (tid >= num_hw_threads)
2209     tid = tid % num_hw_threads;
2210   return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index];
2211 }
2212 
2213 // Return the number of t1's per t2
2214 int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) {
2215   int i1 = t1 + 1;
2216   int i2 = t2 + 1;
2217   KMP_DEBUG_ASSERT(i1 <= i2);
2218   KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST);
2219   KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST);
2220   KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0);
2221   // (nthreads/t2) / (nthreads/t1) = t1 / t2
2222   return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1];
2223 }
2224 #endif // KMP_USE_HIER_SCHED
2225 
2226 static inline const char *__kmp_cpuinfo_get_filename() {
2227   const char *filename;
2228   if (__kmp_cpuinfo_file != nullptr)
2229     filename = __kmp_cpuinfo_file;
2230   else
2231     filename = "/proc/cpuinfo";
2232   return filename;
2233 }
2234 
2235 static inline const char *__kmp_cpuinfo_get_envvar() {
2236   const char *envvar = nullptr;
2237   if (__kmp_cpuinfo_file != nullptr)
2238     envvar = "KMP_CPUINFO_FILE";
2239   return envvar;
2240 }
2241 
2242 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
2243 // affinity map.
2244 static bool __kmp_affinity_create_cpuinfo_map(int *line,
2245                                               kmp_i18n_id_t *const msg_id) {
2246   const char *filename = __kmp_cpuinfo_get_filename();
2247   const char *envvar = __kmp_cpuinfo_get_envvar();
2248   *msg_id = kmp_i18n_null;
2249 
2250   if (__kmp_affinity_verbose) {
2251     KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
2252   }
2253 
2254   kmp_safe_raii_file_t f(filename, "r", envvar);
2255 
2256   // Scan of the file, and count the number of "processor" (osId) fields,
2257   // and find the highest value of <n> for a node_<n> field.
2258   char buf[256];
2259   unsigned num_records = 0;
2260   while (!feof(f)) {
2261     buf[sizeof(buf) - 1] = 1;
2262     if (!fgets(buf, sizeof(buf), f)) {
2263       // Read errors presumably because of EOF
2264       break;
2265     }
2266 
2267     char s1[] = "processor";
2268     if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2269       num_records++;
2270       continue;
2271     }
2272 
2273     // FIXME - this will match "node_<n> <garbage>"
2274     unsigned level;
2275     if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
      // validate the input first:
2277       if (level > (unsigned)__kmp_xproc) { // level is too big
2278         level = __kmp_xproc;
2279       }
2280       if (nodeIdIndex + level >= maxIndex) {
2281         maxIndex = nodeIdIndex + level;
2282       }
2283       continue;
2284     }
2285   }
2286 
2287   // Check for empty file / no valid processor records, or too many. The number
2288   // of records can't exceed the number of valid bits in the affinity mask.
2289   if (num_records == 0) {
2290     *msg_id = kmp_i18n_str_NoProcRecords;
2291     return false;
2292   }
2293   if (num_records > (unsigned)__kmp_xproc) {
2294     *msg_id = kmp_i18n_str_TooManyProcRecords;
2295     return false;
2296   }
2297 
2298   // Set the file pointer back to the beginning, so that we can scan the file
2299   // again, this time performing a full parse of the data. Allocate a vector of
2300   // ProcCpuInfo object, where we will place the data. Adding an extra element
2301   // at the end allows us to remove a lot of extra checks for termination
2302   // conditions.
2303   if (fseek(f, 0, SEEK_SET) != 0) {
2304     *msg_id = kmp_i18n_str_CantRewindCpuinfo;
2305     return false;
2306   }
2307 
2308   // Allocate the array of records to store the proc info in.  The dummy
2309   // element at the end makes the logic in filling them out easier to code.
2310   unsigned **threadInfo =
2311       (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *));
2312   unsigned i;
2313   for (i = 0; i <= num_records; i++) {
2314     threadInfo[i] =
2315         (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2316   }
2317 
2318 #define CLEANUP_THREAD_INFO                                                    \
2319   for (i = 0; i <= num_records; i++) {                                         \
2320     __kmp_free(threadInfo[i]);                                                 \
2321   }                                                                            \
2322   __kmp_free(threadInfo);
2323 
2324   // A value of UINT_MAX means that we didn't find the field
2325   unsigned __index;
2326 
2327 #define INIT_PROC_INFO(p)                                                      \
2328   for (__index = 0; __index <= maxIndex; __index++) {                          \
2329     (p)[__index] = UINT_MAX;                                                   \
2330   }
2331 
2332   for (i = 0; i <= num_records; i++) {
2333     INIT_PROC_INFO(threadInfo[i]);
2334   }
2335 
2336   unsigned num_avail = 0;
2337   *line = 0;
2338   while (!feof(f)) {
2339     // Create an inner scoping level, so that all the goto targets at the end of
2340     // the loop appear in an outer scoping level. This avoids warnings about
2341     // jumping past an initialization to a target in the same block.
2342     {
2343       buf[sizeof(buf) - 1] = 1;
2344       bool long_line = false;
2345       if (!fgets(buf, sizeof(buf), f)) {
2346         // Read errors presumably because of EOF
2347         // If there is valid data in threadInfo[num_avail], then fake
        // a blank line to ensure that the last address gets parsed.
2349         bool valid = false;
2350         for (i = 0; i <= maxIndex; i++) {
2351           if (threadInfo[num_avail][i] != UINT_MAX) {
2352             valid = true;
2353           }
2354         }
2355         if (!valid) {
2356           break;
2357         }
2358         buf[0] = 0;
2359       } else if (!buf[sizeof(buf) - 1]) {
2360         // The line is longer than the buffer.  Set a flag and don't
2361         // emit an error if we were going to ignore the line, anyway.
2362         long_line = true;
2363 
2364 #define CHECK_LINE                                                             \
2365   if (long_line) {                                                             \
2366     CLEANUP_THREAD_INFO;                                                       \
2367     *msg_id = kmp_i18n_str_LongLineCpuinfo;                                    \
2368     return false;                                                              \
2369   }
2370       }
2371       (*line)++;
2372 
2373       char s1[] = "processor";
2374       if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2375         CHECK_LINE;
2376         char *p = strchr(buf + sizeof(s1) - 1, ':');
2377         unsigned val;
2378         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2379           goto no_val;
2380         if (threadInfo[num_avail][osIdIndex] != UINT_MAX)
2381 #if KMP_ARCH_AARCH64
2382           // Handle the old AArch64 /proc/cpuinfo layout differently,
2383           // it contains all of the 'processor' entries listed in a
2384           // single 'Processor' section, therefore the normal looking
2385           // for duplicates in that section will always fail.
2386           num_avail++;
2387 #else
2388           goto dup_field;
2389 #endif
2390         threadInfo[num_avail][osIdIndex] = val;
2391 #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
2392         char path[256];
2393         KMP_SNPRINTF(
2394             path, sizeof(path),
2395             "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
2396             threadInfo[num_avail][osIdIndex]);
2397         __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
2398 
2399         KMP_SNPRINTF(path, sizeof(path),
2400                      "/sys/devices/system/cpu/cpu%u/topology/core_id",
2401                      threadInfo[num_avail][osIdIndex]);
2402         __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
2403         continue;
2404 #else
2405       }
2406       char s2[] = "physical id";
2407       if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
2408         CHECK_LINE;
2409         char *p = strchr(buf + sizeof(s2) - 1, ':');
2410         unsigned val;
2411         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2412           goto no_val;
2413         if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX)
2414           goto dup_field;
2415         threadInfo[num_avail][pkgIdIndex] = val;
2416         continue;
2417       }
2418       char s3[] = "core id";
2419       if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2420         CHECK_LINE;
2421         char *p = strchr(buf + sizeof(s3) - 1, ':');
2422         unsigned val;
2423         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2424           goto no_val;
2425         if (threadInfo[num_avail][coreIdIndex] != UINT_MAX)
2426           goto dup_field;
2427         threadInfo[num_avail][coreIdIndex] = val;
2428         continue;
2429 #endif // KMP_OS_LINUX && USE_SYSFS_INFO
2430       }
2431       char s4[] = "thread id";
2432       if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2433         CHECK_LINE;
2434         char *p = strchr(buf + sizeof(s4) - 1, ':');
2435         unsigned val;
2436         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2437           goto no_val;
2438         if (threadInfo[num_avail][threadIdIndex] != UINT_MAX)
2439           goto dup_field;
2440         threadInfo[num_avail][threadIdIndex] = val;
2441         continue;
2442       }
2443       unsigned level;
2444       if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
2445         CHECK_LINE;
2446         char *p = strchr(buf + sizeof(s4) - 1, ':');
2447         unsigned val;
2448         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2449           goto no_val;
2450         // validate the input before using level:
2451         if (level > (unsigned)__kmp_xproc) { // level is too big
2452           level = __kmp_xproc;
2453         }
2454         if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX)
2455           goto dup_field;
2456         threadInfo[num_avail][nodeIdIndex + level] = val;
2457         continue;
2458       }
2459 
2460       // We didn't recognize the leading token on the line. There are lots of
2461       // leading tokens that we don't recognize - if the line isn't empty, go on
2462       // to the next line.
2463       if ((*buf != 0) && (*buf != '\n')) {
2464         // If the line is longer than the buffer, read characters
2465         // until we find a newline.
2466         if (long_line) {
2467           int ch;
2468           while (((ch = fgetc(f)) != EOF) && (ch != '\n'))
2469             ;
2470         }
2471         continue;
2472       }
2473 
2474       // A newline has signalled the end of the processor record.
2475       // Check that there aren't too many procs specified.
2476       if ((int)num_avail == __kmp_xproc) {
2477         CLEANUP_THREAD_INFO;
2478         *msg_id = kmp_i18n_str_TooManyEntries;
2479         return false;
2480       }
2481 
2482       // Check for missing fields.  The osId field must be there, and we
2483       // currently require that the physical id field is specified, also.
2484       if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2485         CLEANUP_THREAD_INFO;
2486         *msg_id = kmp_i18n_str_MissingProcField;
2487         return false;
2488       }
2489       if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2490         CLEANUP_THREAD_INFO;
2491         *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2492         return false;
2493       }
2494 
2495       // Skip this proc if it is not included in the machine model.
2496       if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex],
2497                          __kmp_affin_fullMask)) {
2498         INIT_PROC_INFO(threadInfo[num_avail]);
2499         continue;
2500       }
2501 
2502       // We have a successful parse of this proc's info.
2503       // Increment the counter, and prepare for the next proc.
2504       num_avail++;
2505       KMP_ASSERT(num_avail <= num_records);
2506       INIT_PROC_INFO(threadInfo[num_avail]);
2507     }
2508     continue;
2509 
2510   no_val:
2511     CLEANUP_THREAD_INFO;
2512     *msg_id = kmp_i18n_str_MissingValCpuinfo;
2513     return false;
2514 
2515   dup_field:
2516     CLEANUP_THREAD_INFO;
2517     *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2518     return false;
2519   }
2520   *line = 0;
2521 
2522 #if KMP_MIC && REDUCE_TEAM_SIZE
2523   unsigned teamSize = 0;
2524 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2525 
2526   // check for num_records == __kmp_xproc ???
2527 
2528   // If it is configured to omit the package level when there is only a single
2529   // package, the logic at the end of this routine won't work if there is only a
2530   // single thread
2531   KMP_ASSERT(num_avail > 0);
2532   KMP_ASSERT(num_avail <= num_records);
2533 
2534   // Sort the threadInfo table by physical Id.
2535   qsort(threadInfo, num_avail, sizeof(*threadInfo),
2536         __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2537 
2538   // The table is now sorted by pkgId / coreId / threadId, but we really don't
2539   // know the radix of any of the fields. pkgId's may be sparsely assigned among
2540   // the chips on a system. Although coreId's are usually assigned
2541   // [0 .. coresPerPkg-1] and threadId's are usually assigned
2542   // [0..threadsPerCore-1], we don't want to make any such assumptions.
2543   //
2544   // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
2545   // total # packages) are at this point - we want to determine that now. We
2546   // only have an upper bound on the first two figures.
2547   unsigned *counts =
2548       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2549   unsigned *maxCt =
2550       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2551   unsigned *totals =
2552       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2553   unsigned *lastId =
2554       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2555 
2556   bool assign_thread_ids = false;
2557   unsigned threadIdCt;
2558   unsigned index;
2559 
2560 restart_radix_check:
2561   threadIdCt = 0;
2562 
2563   // Initialize the counter arrays with data from threadInfo[0].
2564   if (assign_thread_ids) {
2565     if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2566       threadInfo[0][threadIdIndex] = threadIdCt++;
2567     } else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2568       threadIdCt = threadInfo[0][threadIdIndex] + 1;
2569     }
2570   }
2571   for (index = 0; index <= maxIndex; index++) {
2572     counts[index] = 1;
2573     maxCt[index] = 1;
2574     totals[index] = 1;
2575     lastId[index] = threadInfo[0][index];
2576     ;
2577   }
2578 
2579   // Run through the rest of the OS procs.
2580   for (i = 1; i < num_avail; i++) {
2581     // Find the most significant index whose id differs from the id for the
2582     // previous OS proc.
2583     for (index = maxIndex; index >= threadIdIndex; index--) {
2584       if (assign_thread_ids && (index == threadIdIndex)) {
2585         // Auto-assign the thread id field if it wasn't specified.
2586         if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2587           threadInfo[i][threadIdIndex] = threadIdCt++;
2588         }
2589         // Apparently the thread id field was specified for some entries and not
2590         // others. Start the thread id counter off at the next higher thread id.
2591         else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2592           threadIdCt = threadInfo[i][threadIdIndex] + 1;
2593         }
2594       }
2595       if (threadInfo[i][index] != lastId[index]) {
2596         // Run through all indices which are less significant, and reset the
2597         // counts to 1. At all levels up to and including index, we need to
2598         // increment the totals and record the last id.
2599         unsigned index2;
2600         for (index2 = threadIdIndex; index2 < index; index2++) {
2601           totals[index2]++;
2602           if (counts[index2] > maxCt[index2]) {
2603             maxCt[index2] = counts[index2];
2604           }
2605           counts[index2] = 1;
2606           lastId[index2] = threadInfo[i][index2];
2607         }
2608         counts[index]++;
2609         totals[index]++;
2610         lastId[index] = threadInfo[i][index];
2611 
2612         if (assign_thread_ids && (index > threadIdIndex)) {
2613 
2614 #if KMP_MIC && REDUCE_TEAM_SIZE
2615           // The default team size is the total #threads in the machine
2616           // minus 1 thread for every core that has 3 or more threads.
2617           teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
2618 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2619 
2620           // Restart the thread counter, as we are on a new core.
2621           threadIdCt = 0;
2622 
2623           // Auto-assign the thread id field if it wasn't specified.
2624           if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2625             threadInfo[i][threadIdIndex] = threadIdCt++;
2626           }
2627 
2628           // Apparently the thread id field was specified for some entries and
2629           // not others. Start the thread id counter off at the next higher
2630           // thread id.
2631           else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2632             threadIdCt = threadInfo[i][threadIdIndex] + 1;
2633           }
2634         }
2635         break;
2636       }
2637     }
2638     if (index < threadIdIndex) {
2639       // If thread ids were specified, it is an error if they are not unique.
      // Also, check that we haven't already restarted the loop (to be safe -
2641       // shouldn't need to).
2642       if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
2643         __kmp_free(lastId);
2644         __kmp_free(totals);
2645         __kmp_free(maxCt);
2646         __kmp_free(counts);
2647         CLEANUP_THREAD_INFO;
2648         *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2649         return false;
2650       }
2651 
      // If the thread ids were not specified and we see entries that
2653       // are duplicates, start the loop over and assign the thread ids manually.
2654       assign_thread_ids = true;
2655       goto restart_radix_check;
2656     }
2657   }
2658 
2659 #if KMP_MIC && REDUCE_TEAM_SIZE
2660   // The default team size is the total #threads in the machine
2661   // minus 1 thread for every core that has 3 or more threads.
2662   teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
2663 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2664 
2665   for (index = threadIdIndex; index <= maxIndex; index++) {
2666     if (counts[index] > maxCt[index]) {
2667       maxCt[index] = counts[index];
2668     }
2669   }
2670 
2671   __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2672   nCoresPerPkg = maxCt[coreIdIndex];
2673   nPackages = totals[pkgIdIndex];
2674 
2675   // When affinity is off, this routine will still be called to set
2676   // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
2677   // Make sure all these vars are set correctly, and return now if affinity is
2678   // not enabled.
2679   __kmp_ncores = totals[coreIdIndex];
2680   if (!KMP_AFFINITY_CAPABLE()) {
2681     KMP_ASSERT(__kmp_affinity_type == affinity_none);
2682     return true;
2683   }
2684 
2685 #if KMP_MIC && REDUCE_TEAM_SIZE
2686   // Set the default team size.
2687   if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2688     __kmp_dflt_team_nth = teamSize;
2689     KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting "
2690                   "__kmp_dflt_team_nth = %d\n",
2691                   __kmp_dflt_team_nth));
2692   }
2693 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2694 
2695   KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc);
2696 
2697   // Count the number of levels which have more nodes at that level than at the
2698   // parent's level (with there being an implicit root node of the top level).
2699   // This is equivalent to saying that there is at least one node at this level
2700   // which has a sibling. These levels are in the map, and the package level is
2701   // always in the map.
2702   bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2703   for (index = threadIdIndex; index < maxIndex; index++) {
2704     KMP_ASSERT(totals[index] >= totals[index + 1]);
2705     inMap[index] = (totals[index] > totals[index + 1]);
2706   }
2707   inMap[maxIndex] = (totals[maxIndex] > 1);
2708   inMap[pkgIdIndex] = true;
2709   inMap[coreIdIndex] = true;
2710   inMap[threadIdIndex] = true;
2711 
2712   int depth = 0;
2713   int idx = 0;
2714   kmp_hw_t types[KMP_HW_LAST];
2715   int pkgLevel = -1;
2716   int coreLevel = -1;
2717   int threadLevel = -1;
2718   for (index = threadIdIndex; index <= maxIndex; index++) {
2719     if (inMap[index]) {
2720       depth++;
2721     }
2722   }
2723   if (inMap[pkgIdIndex]) {
2724     pkgLevel = idx;
2725     types[idx++] = KMP_HW_SOCKET;
2726   }
2727   if (inMap[coreIdIndex]) {
2728     coreLevel = idx;
2729     types[idx++] = KMP_HW_CORE;
2730   }
2731   if (inMap[threadIdIndex]) {
2732     threadLevel = idx;
2733     types[idx++] = KMP_HW_THREAD;
2734   }
2735   KMP_ASSERT(depth > 0);
2736 
2737   // Construct the data structure that is to be returned.
2738   __kmp_topology = kmp_topology_t::allocate(num_avail, depth, types);
2739 
2740   for (i = 0; i < num_avail; ++i) {
2741     unsigned os = threadInfo[i][osIdIndex];
2742     int src_index;
2743     int dst_index = 0;
2744     kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
2745     hw_thread.clear();
2746     hw_thread.os_id = os;
2747 
2748     idx = 0;
2749     for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2750       if (!inMap[src_index]) {
2751         continue;
2752       }
2753       if (src_index == pkgIdIndex) {
2754         hw_thread.ids[pkgLevel] = threadInfo[i][src_index];
2755       } else if (src_index == coreIdIndex) {
2756         hw_thread.ids[coreLevel] = threadInfo[i][src_index];
2757       } else if (src_index == threadIdIndex) {
2758         hw_thread.ids[threadLevel] = threadInfo[i][src_index];
2759       }
2760       dst_index++;
2761     }
2762   }
2763 
2764   __kmp_free(inMap);
2765   __kmp_free(lastId);
2766   __kmp_free(totals);
2767   __kmp_free(maxCt);
2768   __kmp_free(counts);
2769   CLEANUP_THREAD_INFO;
2770   __kmp_topology->sort_ids();
2771   if (!__kmp_topology->check_ids()) {
2772     kmp_topology_t::deallocate(__kmp_topology);
2773     __kmp_topology = nullptr;
2774     *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2775     return false;
2776   }
2777   return true;
2778 }
2779 
// Create and return a table of affinity masks, indexed by OS thread ID.
// This routine handles OR'ing together all the affinity masks of threads
// that are sufficiently close, if granularity > fine.
// On return, *maxIndex is the largest OS id encountered (the returned table
// has maxIndex + 1 entries) and *numUnique is the number of distinct masks
// formed (one per granularity group).
static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex,
                                            unsigned *numUnique) {
  // First form a table of affinity masks in order of OS thread id.
  int maxOsId;
  int i;
  int numAddrs = __kmp_topology->get_num_hw_threads();
  int depth = __kmp_topology->get_depth();
  KMP_ASSERT(numAddrs);
  KMP_ASSERT(depth);

  // Scan all hw threads to find the largest OS id, which determines the
  // size of the osId2Mask table. (Backward loop with explicit i == 0 exit
  // because i is signed and the body must run for i == 0 as well.)
  maxOsId = 0;
  for (i = numAddrs - 1;; --i) {
    int osId = __kmp_topology->at(i).os_id;
    if (osId > maxOsId) {
      maxOsId = osId;
    }
    if (i == 0)
      break;
  }
  kmp_affin_mask_t *osId2Mask;
  KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1));
  KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
  if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
    KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
  }
  // If the granularity covers every topology level, each mask spans the
  // whole machine and threads may migrate between OS procs — warn.
  if (__kmp_affinity_gran_levels >= (int)depth) {
    if (__kmp_affinity_verbose ||
        (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
      KMP_WARNING(AffThreadsMayMigrate);
    }
  }

  // Run through the table, forming the masks for all threads on each core.
  // Threads on the same core will have identical kmp_hw_thread_t objects, not
  // considering the last level, which must be the thread id. All threads on a
  // core will appear consecutively.
  int unique = 0;
  int j = 0; // index of 1st thread on core
  int leader = 0;
  kmp_affin_mask_t *sum; // running union of os ids in the current group
  KMP_CPU_ALLOC_ON_STACK(sum);
  KMP_CPU_ZERO(sum);
  KMP_CPU_SET(__kmp_topology->at(0).os_id, sum);
  for (i = 1; i < numAddrs; i++) {
    // If this thread is sufficiently close to the leader (within the
    // granularity setting), then set the bit for this os thread in the
    // affinity mask for this group, and go on to the next thread.
    if (__kmp_topology->is_close(leader, i, __kmp_affinity_gran_levels)) {
      KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
      continue;
    }

    // For every thread in this group, copy the mask to the thread's entry in
    // the osId2Mask table.  Mark the first address as a leader.
    for (; j < i; j++) {
      int osId = __kmp_topology->at(j).os_id;
      KMP_DEBUG_ASSERT(osId <= maxOsId);
      kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
      KMP_CPU_COPY(mask, sum);
      __kmp_topology->at(j).leader = (j == leader);
    }
    unique++;

    // Start a new mask.
    leader = i;
    KMP_CPU_ZERO(sum);
    KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
  }

  // For every thread in last group, copy the mask to the thread's
  // entry in the osId2Mask table.
  for (; j < i; j++) {
    int osId = __kmp_topology->at(j).os_id;
    KMP_DEBUG_ASSERT(osId <= maxOsId);
    kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
    KMP_CPU_COPY(mask, sum);
    __kmp_topology->at(j).leader = (j == leader);
  }
  unique++;
  KMP_CPU_FREE_FROM_STACK(sum);

  *maxIndex = maxOsId;
  *numUnique = unique;
  return osId2Mask;
}
2868 
// Stuff for the affinity proclist parsers.  It's easier to declare these vars
// as file-static than to try and pass them through the calling sequence of
// the recursive-descent OMP_PLACES parser.
static kmp_affin_mask_t *newMasks; // growable table of parsed masks
static int numNewMasks; // current capacity (in masks) of newMasks
static int nextNewMask; // number of masks appended so far

// Append a copy of (_mask) to the file-static newMasks table, first doubling
// the table's capacity if it is already full (the old storage is copied over
// and freed).
#define ADD_MASK(_mask)                                                        \
  {                                                                            \
    if (nextNewMask >= numNewMasks) {                                          \
      int i;                                                                   \
      numNewMasks *= 2;                                                        \
      kmp_affin_mask_t *temp;                                                  \
      KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks);                         \
      for (i = 0; i < numNewMasks / 2; i++) {                                  \
        kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);                    \
        kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i);                       \
        KMP_CPU_COPY(dest, src);                                               \
      }                                                                        \
      KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2);                  \
      newMasks = temp;                                                         \
    }                                                                          \
    KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));               \
    nextNewMask++;                                                             \
  }

// Append the mask for OS proc (_osId) from the (_osId2Mask) table via
// ADD_MASK, or — if the id is out of range or not present in the machine
// model — emit a warning (when verbose/warnings are enabled) and skip it.
#define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId)                             \
  {                                                                            \
    if (((_osId) > _maxOsId) ||                                                \
        (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) {     \
      if (__kmp_affinity_verbose ||                                            \
          (__kmp_affinity_warnings &&                                          \
           (__kmp_affinity_type != affinity_none))) {                          \
        KMP_WARNING(AffIgnoreInvalidProcID, _osId);                            \
      }                                                                        \
    } else {                                                                   \
      ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));                            \
    }                                                                          \
  }
2908 
2909 // Re-parse the proclist (for the explicit affinity type), and form the list
2910 // of affinity newMasks indexed by gtid.
2911 static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2912                                             unsigned int *out_numMasks,
2913                                             const char *proclist,
2914                                             kmp_affin_mask_t *osId2Mask,
2915                                             int maxOsId) {
2916   int i;
2917   const char *scan = proclist;
2918   const char *next = proclist;
2919 
2920   // We use malloc() for the temporary mask vector, so that we can use
2921   // realloc() to extend it.
2922   numNewMasks = 2;
2923   KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
2924   nextNewMask = 0;
2925   kmp_affin_mask_t *sumMask;
2926   KMP_CPU_ALLOC(sumMask);
2927   int setSize = 0;
2928 
2929   for (;;) {
2930     int start, end, stride;
2931 
2932     SKIP_WS(scan);
2933     next = scan;
2934     if (*next == '\0') {
2935       break;
2936     }
2937 
2938     if (*next == '{') {
2939       int num;
2940       setSize = 0;
2941       next++; // skip '{'
2942       SKIP_WS(next);
2943       scan = next;
2944 
2945       // Read the first integer in the set.
2946       KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist");
2947       SKIP_DIGITS(next);
2948       num = __kmp_str_to_int(scan, *next);
2949       KMP_ASSERT2(num >= 0, "bad explicit proc list");
2950 
2951       // Copy the mask for that osId to the sum (union) mask.
2952       if ((num > maxOsId) ||
2953           (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2954         if (__kmp_affinity_verbose ||
2955             (__kmp_affinity_warnings &&
2956              (__kmp_affinity_type != affinity_none))) {
2957           KMP_WARNING(AffIgnoreInvalidProcID, num);
2958         }
2959         KMP_CPU_ZERO(sumMask);
2960       } else {
2961         KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2962         setSize = 1;
2963       }
2964 
2965       for (;;) {
2966         // Check for end of set.
2967         SKIP_WS(next);
2968         if (*next == '}') {
2969           next++; // skip '}'
2970           break;
2971         }
2972 
2973         // Skip optional comma.
2974         if (*next == ',') {
2975           next++;
2976         }
2977         SKIP_WS(next);
2978 
2979         // Read the next integer in the set.
2980         scan = next;
2981         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2982 
2983         SKIP_DIGITS(next);
2984         num = __kmp_str_to_int(scan, *next);
2985         KMP_ASSERT2(num >= 0, "bad explicit proc list");
2986 
2987         // Add the mask for that osId to the sum mask.
2988         if ((num > maxOsId) ||
2989             (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2990           if (__kmp_affinity_verbose ||
2991               (__kmp_affinity_warnings &&
2992                (__kmp_affinity_type != affinity_none))) {
2993             KMP_WARNING(AffIgnoreInvalidProcID, num);
2994           }
2995         } else {
2996           KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2997           setSize++;
2998         }
2999       }
3000       if (setSize > 0) {
3001         ADD_MASK(sumMask);
3002       }
3003 
3004       SKIP_WS(next);
3005       if (*next == ',') {
3006         next++;
3007       }
3008       scan = next;
3009       continue;
3010     }
3011 
3012     // Read the first integer.
3013     KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3014     SKIP_DIGITS(next);
3015     start = __kmp_str_to_int(scan, *next);
3016     KMP_ASSERT2(start >= 0, "bad explicit proc list");
3017     SKIP_WS(next);
3018 
3019     // If this isn't a range, then add a mask to the list and go on.
3020     if (*next != '-') {
3021       ADD_MASK_OSID(start, osId2Mask, maxOsId);
3022 
3023       // Skip optional comma.
3024       if (*next == ',') {
3025         next++;
3026       }
3027       scan = next;
3028       continue;
3029     }
3030 
3031     // This is a range.  Skip over the '-' and read in the 2nd int.
3032     next++; // skip '-'
3033     SKIP_WS(next);
3034     scan = next;
3035     KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3036     SKIP_DIGITS(next);
3037     end = __kmp_str_to_int(scan, *next);
3038     KMP_ASSERT2(end >= 0, "bad explicit proc list");
3039 
3040     // Check for a stride parameter
3041     stride = 1;
3042     SKIP_WS(next);
3043     if (*next == ':') {
3044       // A stride is specified.  Skip over the ':" and read the 3rd int.
3045       int sign = +1;
3046       next++; // skip ':'
3047       SKIP_WS(next);
3048       scan = next;
3049       if (*next == '-') {
3050         sign = -1;
3051         next++;
3052         SKIP_WS(next);
3053         scan = next;
3054       }
3055       KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3056       SKIP_DIGITS(next);
3057       stride = __kmp_str_to_int(scan, *next);
3058       KMP_ASSERT2(stride >= 0, "bad explicit proc list");
3059       stride *= sign;
3060     }
3061 
3062     // Do some range checks.
3063     KMP_ASSERT2(stride != 0, "bad explicit proc list");
3064     if (stride > 0) {
3065       KMP_ASSERT2(start <= end, "bad explicit proc list");
3066     } else {
3067       KMP_ASSERT2(start >= end, "bad explicit proc list");
3068     }
3069     KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
3070 
3071     // Add the mask for each OS proc # to the list.
3072     if (stride > 0) {
3073       do {
3074         ADD_MASK_OSID(start, osId2Mask, maxOsId);
3075         start += stride;
3076       } while (start <= end);
3077     } else {
3078       do {
3079         ADD_MASK_OSID(start, osId2Mask, maxOsId);
3080         start += stride;
3081       } while (start >= end);
3082     }
3083 
3084     // Skip optional comma.
3085     SKIP_WS(next);
3086     if (*next == ',') {
3087       next++;
3088     }
3089     scan = next;
3090   }
3091 
3092   *out_numMasks = nextNewMask;
3093   if (nextNewMask == 0) {
3094     *out_masks = NULL;
3095     KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3096     return;
3097   }
3098   KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3099   for (i = 0; i < nextNewMask; i++) {
3100     kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
3101     kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
3102     KMP_CPU_COPY(dest, src);
3103   }
3104   KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3105   KMP_CPU_FREE(sumMask);
3106 }
3107 
3108 /*-----------------------------------------------------------------------------
3109 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
places.  Again, here is the grammar:
3111 
3112 place_list := place
3113 place_list := place , place_list
3114 place := num
3115 place := place : num
3116 place := place : num : signed
3117 place := { subplacelist }
3118 place := ! place                  // (lowest priority)
3119 subplace_list := subplace
3120 subplace_list := subplace , subplace_list
3121 subplace := num
3122 subplace := num : num
3123 subplace := num : num : signed
3124 signed := num
3125 signed := + signed
3126 signed := - signed
3127 -----------------------------------------------------------------------------*/
// Parse one brace-enclosed subplace list (see the grammar above):
//   subplace := num | num : num | num : num : signed
// comma-separated, terminated by '}'. The masks of the named OS procs are
// OR'ed into tempMask and counted in *setSize. On return *scan points at the
// terminating '}' (the caller consumes it). Invalid/unavailable proc ids are
// warned about (when warnings are enabled) and skipped; malformed syntax
// asserts.
static void __kmp_process_subplace_list(const char **scan,
                                        kmp_affin_mask_t *osId2Mask,
                                        int maxOsId, kmp_affin_mask_t *tempMask,
                                        int *setSize) {
  const char *next;

  for (;;) {
    int start, count, stride, i;

    // Read in the starting proc id
    SKIP_WS(*scan);
    KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
    next = *scan;
    SKIP_DIGITS(next);
    start = __kmp_str_to_int(*scan, *next);
    KMP_ASSERT(start >= 0);
    *scan = next;

    // valid follow sets are ',' ':' and '}'
    SKIP_WS(*scan);
    if (**scan == '}' || **scan == ',') {
      // "num" form: a single proc id.
      if ((start > maxOsId) ||
          (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
        if (__kmp_affinity_verbose ||
            (__kmp_affinity_warnings &&
             (__kmp_affinity_type != affinity_none))) {
          KMP_WARNING(AffIgnoreInvalidProcID, start);
        }
      } else {
        KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
        (*setSize)++;
      }
      if (**scan == '}') {
        break;
      }
      (*scan)++; // skip ','
      continue;
    }
    KMP_ASSERT2(**scan == ':', "bad explicit places list");
    (*scan)++; // skip ':'

    // Read count parameter
    SKIP_WS(*scan);
    KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
    next = *scan;
    SKIP_DIGITS(next);
    count = __kmp_str_to_int(*scan, *next);
    KMP_ASSERT(count >= 0);
    *scan = next;

    // valid follow sets are ',' ':' and '}'
    SKIP_WS(*scan);
    if (**scan == '}' || **scan == ',') {
      // "num : num" form: count consecutive proc ids starting at start
      // (implicit stride of 1).
      for (i = 0; i < count; i++) {
        if ((start > maxOsId) ||
            (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
          if (__kmp_affinity_verbose ||
              (__kmp_affinity_warnings &&
               (__kmp_affinity_type != affinity_none))) {
            KMP_WARNING(AffIgnoreInvalidProcID, start);
          }
          break; // don't proliferate warnings for large count
        } else {
          KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
          start++;
          (*setSize)++;
        }
      }
      if (**scan == '}') {
        break;
      }
      (*scan)++; // skip ','
      continue;
    }
    KMP_ASSERT2(**scan == ':', "bad explicit places list");
    (*scan)++; // skip ':'

    // Read stride parameter; any run of leading '+'/'-' signs is folded
    // into a single sign.
    int sign = +1;
    for (;;) {
      SKIP_WS(*scan);
      if (**scan == '+') {
        (*scan)++; // skip '+'
        continue;
      }
      if (**scan == '-') {
        sign *= -1;
        (*scan)++; // skip '-'
        continue;
      }
      break;
    }
    SKIP_WS(*scan);
    KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
    next = *scan;
    SKIP_DIGITS(next);
    stride = __kmp_str_to_int(*scan, *next);
    KMP_ASSERT(stride >= 0);
    *scan = next;
    stride *= sign;

    // valid follow sets are ',' and '}'
    SKIP_WS(*scan);
    if (**scan == '}' || **scan == ',') {
      // "num : num : signed" form: count proc ids starting at start,
      // stepping by stride (may be negative).
      for (i = 0; i < count; i++) {
        if ((start > maxOsId) ||
            (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
          if (__kmp_affinity_verbose ||
              (__kmp_affinity_warnings &&
               (__kmp_affinity_type != affinity_none))) {
            KMP_WARNING(AffIgnoreInvalidProcID, start);
          }
          break; // don't proliferate warnings for large count
        } else {
          KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
          start += stride;
          (*setSize)++;
        }
      }
      if (**scan == '}') {
        break;
      }
      (*scan)++; // skip ','
      continue;
    }

    KMP_ASSERT2(0, "bad explicit places list");
  }
}
3257 
3258 static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3259                                 int maxOsId, kmp_affin_mask_t *tempMask,
3260                                 int *setSize) {
3261   const char *next;
3262 
3263   // valid follow sets are '{' '!' and num
3264   SKIP_WS(*scan);
3265   if (**scan == '{') {
3266     (*scan)++; // skip '{'
3267     __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize);
3268     KMP_ASSERT2(**scan == '}', "bad explicit places list");
3269     (*scan)++; // skip '}'
3270   } else if (**scan == '!') {
3271     (*scan)++; // skip '!'
3272     __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3273     KMP_CPU_COMPLEMENT(maxOsId, tempMask);
3274   } else if ((**scan >= '0') && (**scan <= '9')) {
3275     next = *scan;
3276     SKIP_DIGITS(next);
3277     int num = __kmp_str_to_int(*scan, *next);
3278     KMP_ASSERT(num >= 0);
3279     if ((num > maxOsId) ||
3280         (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3281       if (__kmp_affinity_verbose ||
3282           (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
3283         KMP_WARNING(AffIgnoreInvalidProcID, num);
3284       }
3285     } else {
3286       KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3287       (*setSize)++;
3288     }
3289     *scan = next; // skip num
3290   } else {
3291     KMP_ASSERT2(0, "bad explicit places list");
3292   }
3293 }
3294 
3295 // static void
3296 void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3297                                       unsigned int *out_numMasks,
3298                                       const char *placelist,
3299                                       kmp_affin_mask_t *osId2Mask,
3300                                       int maxOsId) {
3301   int i, j, count, stride, sign;
3302   const char *scan = placelist;
3303   const char *next = placelist;
3304 
3305   numNewMasks = 2;
3306   KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
3307   nextNewMask = 0;
3308 
3309   // tempMask is modified based on the previous or initial
3310   //   place to form the current place
3311   // previousMask contains the previous place
3312   kmp_affin_mask_t *tempMask;
3313   kmp_affin_mask_t *previousMask;
3314   KMP_CPU_ALLOC(tempMask);
3315   KMP_CPU_ZERO(tempMask);
3316   KMP_CPU_ALLOC(previousMask);
3317   KMP_CPU_ZERO(previousMask);
3318   int setSize = 0;
3319 
3320   for (;;) {
3321     __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3322 
3323     // valid follow sets are ',' ':' and EOL
3324     SKIP_WS(scan);
3325     if (*scan == '\0' || *scan == ',') {
3326       if (setSize > 0) {
3327         ADD_MASK(tempMask);
3328       }
3329       KMP_CPU_ZERO(tempMask);
3330       setSize = 0;
3331       if (*scan == '\0') {
3332         break;
3333       }
3334       scan++; // skip ','
3335       continue;
3336     }
3337 
3338     KMP_ASSERT2(*scan == ':', "bad explicit places list");
3339     scan++; // skip ':'
3340 
3341     // Read count parameter
3342     SKIP_WS(scan);
3343     KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
3344     next = scan;
3345     SKIP_DIGITS(next);
3346     count = __kmp_str_to_int(scan, *next);
3347     KMP_ASSERT(count >= 0);
3348     scan = next;
3349 
3350     // valid follow sets are ',' ':' and EOL
3351     SKIP_WS(scan);
3352     if (*scan == '\0' || *scan == ',') {
3353       stride = +1;
3354     } else {
3355       KMP_ASSERT2(*scan == ':', "bad explicit places list");
3356       scan++; // skip ':'
3357 
3358       // Read stride parameter
3359       sign = +1;
3360       for (;;) {
3361         SKIP_WS(scan);
3362         if (*scan == '+') {
3363           scan++; // skip '+'
3364           continue;
3365         }
3366         if (*scan == '-') {
3367           sign *= -1;
3368           scan++; // skip '-'
3369           continue;
3370         }
3371         break;
3372       }
3373       SKIP_WS(scan);
3374       KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
3375       next = scan;
3376       SKIP_DIGITS(next);
3377       stride = __kmp_str_to_int(scan, *next);
3378       KMP_DEBUG_ASSERT(stride >= 0);
3379       scan = next;
3380       stride *= sign;
3381     }
3382 
3383     // Add places determined by initial_place : count : stride
3384     for (i = 0; i < count; i++) {
3385       if (setSize == 0) {
3386         break;
3387       }
3388       // Add the current place, then build the next place (tempMask) from that
3389       KMP_CPU_COPY(previousMask, tempMask);
3390       ADD_MASK(previousMask);
3391       KMP_CPU_ZERO(tempMask);
3392       setSize = 0;
3393       KMP_CPU_SET_ITERATE(j, previousMask) {
3394         if (!KMP_CPU_ISSET(j, previousMask)) {
3395           continue;
3396         }
3397         if ((j + stride > maxOsId) || (j + stride < 0) ||
3398             (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
3399             (!KMP_CPU_ISSET(j + stride,
3400                             KMP_CPU_INDEX(osId2Mask, j + stride)))) {
3401           if ((__kmp_affinity_verbose ||
3402                (__kmp_affinity_warnings &&
3403                 (__kmp_affinity_type != affinity_none))) &&
3404               i < count - 1) {
3405             KMP_WARNING(AffIgnoreInvalidProcID, j + stride);
3406           }
3407           continue;
3408         }
3409         KMP_CPU_SET(j + stride, tempMask);
3410         setSize++;
3411       }
3412     }
3413     KMP_CPU_ZERO(tempMask);
3414     setSize = 0;
3415 
3416     // valid follow sets are ',' and EOL
3417     SKIP_WS(scan);
3418     if (*scan == '\0') {
3419       break;
3420     }
3421     if (*scan == ',') {
3422       scan++; // skip ','
3423       continue;
3424     }
3425 
3426     KMP_ASSERT2(0, "bad explicit places list");
3427   }
3428 
3429   *out_numMasks = nextNewMask;
3430   if (nextNewMask == 0) {
3431     *out_masks = NULL;
3432     KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3433     return;
3434   }
3435   KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3436   KMP_CPU_FREE(tempMask);
3437   KMP_CPU_FREE(previousMask);
3438   for (i = 0; i < nextNewMask; i++) {
3439     kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
3440     kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
3441     KMP_CPU_COPY(dest, src);
3442   }
3443   KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3444 }
3445 
3446 #undef ADD_MASK
3447 #undef ADD_MASK_OSID
3448 
3449 // This function figures out the deepest level at which there is at least one
3450 // cluster/core with more than one processing unit bound to it.
3451 static int __kmp_affinity_find_core_level(int nprocs, int bottom_level) {
3452   int core_level = 0;
3453 
3454   for (int i = 0; i < nprocs; i++) {
3455     const kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
3456     for (int j = bottom_level; j > 0; j--) {
3457       if (hw_thread.ids[j] > 0) {
3458         if (core_level < (j - 1)) {
3459           core_level = j - 1;
3460         }
3461       }
3462     }
3463   }
3464   return core_level;
3465 }
3466 
// This function counts number of clusters/cores at given level.
// nprocs and bottom_level are unused here (the topology object already
// knows the counts); they are kept so the signature parallels the other
// __kmp_affinity_* topology helpers used by balanced affinity.
static int __kmp_affinity_compute_ncores(int nprocs, int bottom_level,
                                         int core_level) {
  return __kmp_topology->get_count(core_level);
}
3472 // This function finds to which cluster/core given processing unit is bound.
3473 static int __kmp_affinity_find_core(int proc, int bottom_level,
3474                                     int core_level) {
3475   int core = 0;
3476   KMP_DEBUG_ASSERT(proc >= 0 && proc < __kmp_topology->get_num_hw_threads());
3477   for (int i = 0; i <= proc; ++i) {
3478     if (i + 1 <= proc) {
3479       for (int j = 0; j <= core_level; ++j) {
3480         if (__kmp_topology->at(i + 1).sub_ids[j] !=
3481             __kmp_topology->at(i).sub_ids[j]) {
3482           core++;
3483           break;
3484         }
3485       }
3486     }
3487   }
3488   return core;
3489 }
3490 
3491 // This function finds maximal number of processing units bound to a
3492 // cluster/core at given level.
3493 static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level,
3494                                             int core_level) {
3495   if (core_level >= bottom_level)
3496     return 1;
3497   int thread_level = __kmp_topology->get_level(KMP_HW_THREAD);
3498   return __kmp_topology->calculate_ratio(thread_level, core_level);
3499 }
3500 
// Flattened core->proc table built by the balanced-affinity path of
// __kmp_aux_affinity_initialize: procarr[core * maxprocpercore + k] is the
// k-th OS proc id on that core, or -1 for an unused slot.
static int *procarr = NULL;
// Topology depth captured for later use by the balanced-affinity code.
static int __kmp_aff_depth = 0;
3503 
3504 // Create a one element mask array (set of places) which only contains the
3505 // initial process's affinity mask
3506 static void __kmp_create_affinity_none_places() {
3507   KMP_ASSERT(__kmp_affin_fullMask != NULL);
3508   KMP_ASSERT(__kmp_affinity_type == affinity_none);
3509   __kmp_affinity_num_masks = 1;
3510   KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
3511   kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, 0);
3512   KMP_CPU_COPY(dest, __kmp_affin_fullMask);
3513 }
3514 
3515 static void __kmp_aux_affinity_initialize(void) {
3516   if (__kmp_affinity_masks != NULL) {
3517     KMP_ASSERT(__kmp_affin_fullMask != NULL);
3518     return;
3519   }
3520 
3521   // Create the "full" mask - this defines all of the processors that we
3522   // consider to be in the machine model. If respect is set, then it is the
3523   // initialization thread's affinity mask. Otherwise, it is all processors that
3524   // we know about on the machine.
3525   if (__kmp_affin_fullMask == NULL) {
3526     KMP_CPU_ALLOC(__kmp_affin_fullMask);
3527   }
3528   if (KMP_AFFINITY_CAPABLE()) {
3529     __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
3530     if (__kmp_affinity_respect_mask) {
3531       // Count the number of available processors.
3532       unsigned i;
3533       __kmp_avail_proc = 0;
3534       KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
3535         if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
3536           continue;
3537         }
3538         __kmp_avail_proc++;
3539       }
3540       if (__kmp_avail_proc > __kmp_xproc) {
3541         if (__kmp_affinity_verbose ||
3542             (__kmp_affinity_warnings &&
3543              (__kmp_affinity_type != affinity_none))) {
3544           KMP_WARNING(ErrorInitializeAffinity);
3545         }
3546         __kmp_affinity_type = affinity_none;
3547         KMP_AFFINITY_DISABLE();
3548         return;
3549       }
3550 
3551       if (__kmp_affinity_verbose) {
3552         char buf[KMP_AFFIN_MASK_PRINT_LEN];
3553         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
3554                                   __kmp_affin_fullMask);
3555         KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
3556       }
3557     } else {
3558       if (__kmp_affinity_verbose) {
3559         char buf[KMP_AFFIN_MASK_PRINT_LEN];
3560         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
3561                                   __kmp_affin_fullMask);
3562         KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
3563       }
3564       __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
3565       __kmp_avail_proc = __kmp_xproc;
3566 #if KMP_OS_WINDOWS
3567       // Set the process affinity mask since threads' affinity
3568       // masks must be subset of process mask in Windows* OS
3569       __kmp_affin_fullMask->set_process_affinity(true);
3570 #endif
3571     }
3572   }
3573 
3574   kmp_i18n_id_t msg_id = kmp_i18n_null;
3575 
3576   // For backward compatibility, setting KMP_CPUINFO_FILE =>
3577   // KMP_TOPOLOGY_METHOD=cpuinfo
3578   if ((__kmp_cpuinfo_file != NULL) &&
3579       (__kmp_affinity_top_method == affinity_top_method_all)) {
3580     __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3581   }
3582 
3583   bool success = false;
3584   if (__kmp_affinity_top_method == affinity_top_method_all) {
3585 // In the default code path, errors are not fatal - we just try using
3586 // another method. We only emit a warning message if affinity is on, or the
3587 // verbose flag is set, an the nowarnings flag was not set.
3588 #if KMP_USE_HWLOC
3589     if (!success &&
3590         __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
3591       if (!__kmp_hwloc_error) {
3592         success = __kmp_affinity_create_hwloc_map(&msg_id);
3593         if (!success && __kmp_affinity_verbose) {
3594           KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
3595         }
3596       } else if (__kmp_affinity_verbose) {
3597         KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
3598       }
3599     }
3600 #endif
3601 
3602 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
3603     if (!success) {
3604       success = __kmp_affinity_create_x2apicid_map(&msg_id);
3605       if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
3606         KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
3607       }
3608     }
3609     if (!success) {
3610       success = __kmp_affinity_create_apicid_map(&msg_id);
3611       if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
3612         KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
3613       }
3614     }
3615 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3616 
3617 #if KMP_OS_LINUX
3618     if (!success) {
3619       int line = 0;
3620       success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
3621       if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
3622         KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
3623       }
3624     }
3625 #endif /* KMP_OS_LINUX */
3626 
3627 #if KMP_GROUP_AFFINITY
3628     if (!success && (__kmp_num_proc_groups > 1)) {
3629       success = __kmp_affinity_create_proc_group_map(&msg_id);
3630       if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
3631         KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
3632       }
3633     }
3634 #endif /* KMP_GROUP_AFFINITY */
3635 
3636     if (!success) {
3637       success = __kmp_affinity_create_flat_map(&msg_id);
3638       if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
3639         KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
3640       }
3641       KMP_ASSERT(success);
3642     }
3643   }
3644 
3645 // If the user has specified that a paricular topology discovery method is to be
3646 // used, then we abort if that method fails. The exception is group affinity,
3647 // which might have been implicitly set.
3648 #if KMP_USE_HWLOC
3649   else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
3650     KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
3651     success = __kmp_affinity_create_hwloc_map(&msg_id);
3652     if (!success) {
3653       KMP_ASSERT(msg_id != kmp_i18n_null);
3654       KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3655     }
3656   }
3657 #endif // KMP_USE_HWLOC
3658 
3659 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
3660   else if (__kmp_affinity_top_method == affinity_top_method_x2apicid ||
3661            __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
3662     success = __kmp_affinity_create_x2apicid_map(&msg_id);
3663     if (!success) {
3664       KMP_ASSERT(msg_id != kmp_i18n_null);
3665       KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3666     }
3667   } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3668     success = __kmp_affinity_create_apicid_map(&msg_id);
3669     if (!success) {
3670       KMP_ASSERT(msg_id != kmp_i18n_null);
3671       KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3672     }
3673   }
3674 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3675 
3676   else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3677     int line = 0;
3678     success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
3679     if (!success) {
3680       KMP_ASSERT(msg_id != kmp_i18n_null);
3681       const char *filename = __kmp_cpuinfo_get_filename();
3682       if (line > 0) {
3683         KMP_FATAL(FileLineMsgExiting, filename, line,
3684                   __kmp_i18n_catgets(msg_id));
3685       } else {
3686         KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3687       }
3688     }
3689   }
3690 
3691 #if KMP_GROUP_AFFINITY
3692   else if (__kmp_affinity_top_method == affinity_top_method_group) {
3693     success = __kmp_affinity_create_proc_group_map(&msg_id);
3694     KMP_ASSERT(success);
3695     if (!success) {
3696       KMP_ASSERT(msg_id != kmp_i18n_null);
3697       KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3698     }
3699   }
3700 #endif /* KMP_GROUP_AFFINITY */
3701 
3702   else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3703     success = __kmp_affinity_create_flat_map(&msg_id);
3704     // should not fail
3705     KMP_ASSERT(success);
3706   }
3707 
3708   // Early exit if topology could not be created
3709   if (!__kmp_topology) {
3710     if (KMP_AFFINITY_CAPABLE() &&
3711         (__kmp_affinity_verbose ||
3712          (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) {
3713       KMP_WARNING(ErrorInitializeAffinity);
3714     }
3715     if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 &&
3716         __kmp_ncores > 0) {
3717       __kmp_topology = kmp_topology_t::allocate(0, 0, NULL);
3718       __kmp_topology->canonicalize(nPackages, nCoresPerPkg,
3719                                    __kmp_nThreadsPerCore, __kmp_ncores);
3720       if (__kmp_affinity_verbose) {
3721         __kmp_topology->print("KMP_AFFINITY");
3722       }
3723     }
3724     __kmp_affinity_type = affinity_none;
3725     __kmp_create_affinity_none_places();
3726 #if KMP_USE_HIER_SCHED
3727     __kmp_dispatch_set_hierarchy_values();
3728 #endif
3729     KMP_AFFINITY_DISABLE();
3730     return;
3731   }
3732 
3733   // Canonicalize, print (if requested), apply KMP_HW_SUBSET, and
3734   // initialize other data structures which depend on the topology
3735   __kmp_topology->canonicalize();
3736   if (__kmp_affinity_verbose)
3737     __kmp_topology->print("KMP_AFFINITY");
3738   bool filtered = __kmp_topology->filter_hw_subset();
3739   if (filtered && __kmp_affinity_verbose)
3740     __kmp_topology->print("KMP_HW_SUBSET");
3741   machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
3742   KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
3743   // If KMP_AFFINITY=none, then only create the single "none" place
3744   // which is the process's initial affinity mask or the number of
3745   // hardware threads depending on respect,norespect
3746   if (__kmp_affinity_type == affinity_none) {
3747     __kmp_create_affinity_none_places();
3748 #if KMP_USE_HIER_SCHED
3749     __kmp_dispatch_set_hierarchy_values();
3750 #endif
3751     return;
3752   }
3753   int depth = __kmp_topology->get_depth();
3754 
3755   // Create the table of masks, indexed by thread Id.
3756   unsigned maxIndex;
3757   unsigned numUnique;
3758   kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique);
3759   if (__kmp_affinity_gran_levels == 0) {
3760     KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
3761   }
3762 
3763   switch (__kmp_affinity_type) {
3764 
3765   case affinity_explicit:
3766     KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3767     if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) {
3768       __kmp_affinity_process_proclist(
3769           &__kmp_affinity_masks, &__kmp_affinity_num_masks,
3770           __kmp_affinity_proclist, osId2Mask, maxIndex);
3771     } else {
3772       __kmp_affinity_process_placelist(
3773           &__kmp_affinity_masks, &__kmp_affinity_num_masks,
3774           __kmp_affinity_proclist, osId2Mask, maxIndex);
3775     }
3776     if (__kmp_affinity_num_masks == 0) {
3777       if (__kmp_affinity_verbose ||
3778           (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
3779         KMP_WARNING(AffNoValidProcID);
3780       }
3781       __kmp_affinity_type = affinity_none;
3782       __kmp_create_affinity_none_places();
3783       return;
3784     }
3785     break;
3786 
3787   // The other affinity types rely on sorting the hardware threads according to
3788   // some permutation of the machine topology tree. Set __kmp_affinity_compact
3789   // and __kmp_affinity_offset appropriately, then jump to a common code
3790   // fragment to do the sort and create the array of affinity masks.
3791   case affinity_logical:
3792     __kmp_affinity_compact = 0;
3793     if (__kmp_affinity_offset) {
3794       __kmp_affinity_offset =
3795           __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
3796     }
3797     goto sortTopology;
3798 
3799   case affinity_physical:
3800     if (__kmp_nThreadsPerCore > 1) {
3801       __kmp_affinity_compact = 1;
3802       if (__kmp_affinity_compact >= depth) {
3803         __kmp_affinity_compact = 0;
3804       }
3805     } else {
3806       __kmp_affinity_compact = 0;
3807     }
3808     if (__kmp_affinity_offset) {
3809       __kmp_affinity_offset =
3810           __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
3811     }
3812     goto sortTopology;
3813 
3814   case affinity_scatter:
3815     if (__kmp_affinity_compact >= depth) {
3816       __kmp_affinity_compact = 0;
3817     } else {
3818       __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
3819     }
3820     goto sortTopology;
3821 
3822   case affinity_compact:
3823     if (__kmp_affinity_compact >= depth) {
3824       __kmp_affinity_compact = depth - 1;
3825     }
3826     goto sortTopology;
3827 
3828   case affinity_balanced:
3829     if (depth <= 1) {
3830       if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
3831         KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
3832       }
3833       __kmp_affinity_type = affinity_none;
3834       __kmp_create_affinity_none_places();
3835       return;
3836     } else if (!__kmp_topology->is_uniform()) {
3837       // Save the depth for further usage
3838       __kmp_aff_depth = depth;
3839 
3840       int core_level =
3841           __kmp_affinity_find_core_level(__kmp_avail_proc, depth - 1);
3842       int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, depth - 1,
3843                                                  core_level);
3844       int maxprocpercore = __kmp_affinity_max_proc_per_core(
3845           __kmp_avail_proc, depth - 1, core_level);
3846 
3847       int nproc = ncores * maxprocpercore;
3848       if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
3849         if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
3850           KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
3851         }
3852         __kmp_affinity_type = affinity_none;
3853         return;
3854       }
3855 
3856       procarr = (int *)__kmp_allocate(sizeof(int) * nproc);
3857       for (int i = 0; i < nproc; i++) {
3858         procarr[i] = -1;
3859       }
3860 
3861       int lastcore = -1;
3862       int inlastcore = 0;
3863       for (int i = 0; i < __kmp_avail_proc; i++) {
3864         int proc = __kmp_topology->at(i).os_id;
3865         int core = __kmp_affinity_find_core(i, depth - 1, core_level);
3866 
3867         if (core == lastcore) {
3868           inlastcore++;
3869         } else {
3870           inlastcore = 0;
3871         }
3872         lastcore = core;
3873 
3874         procarr[core * maxprocpercore + inlastcore] = proc;
3875       }
3876     }
3877     if (__kmp_affinity_compact >= depth) {
3878       __kmp_affinity_compact = depth - 1;
3879     }
3880 
3881   sortTopology:
3882     // Allocate the gtid->affinity mask table.
3883     if (__kmp_affinity_dups) {
3884       __kmp_affinity_num_masks = __kmp_avail_proc;
3885     } else {
3886       __kmp_affinity_num_masks = numUnique;
3887     }
3888 
3889     if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
3890         (__kmp_affinity_num_places > 0) &&
3891         ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) {
3892       __kmp_affinity_num_masks = __kmp_affinity_num_places;
3893     }
3894 
3895     KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
3896 
3897     // Sort the topology table according to the current setting of
3898     // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
3899     __kmp_topology->sort_compact();
3900     {
3901       int i;
3902       unsigned j;
3903       int num_hw_threads = __kmp_topology->get_num_hw_threads();
3904       for (i = 0, j = 0; i < num_hw_threads; i++) {
3905         if ((!__kmp_affinity_dups) && (!__kmp_topology->at(i).leader)) {
3906           continue;
3907         }
3908         int osId = __kmp_topology->at(i).os_id;
3909 
3910         kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
3911         kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j);
3912         KMP_ASSERT(KMP_CPU_ISSET(osId, src));
3913         KMP_CPU_COPY(dest, src);
3914         if (++j >= __kmp_affinity_num_masks) {
3915           break;
3916         }
3917       }
3918       KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
3919     }
3920     // Sort the topology back using ids
3921     __kmp_topology->sort_ids();
3922     break;
3923 
3924   default:
3925     KMP_ASSERT2(0, "Unexpected affinity setting");
3926   }
3927 
3928   KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1);
3929 }
3930 
3931 void __kmp_affinity_initialize(void) {
3932   // Much of the code above was written assuming that if a machine was not
3933   // affinity capable, then __kmp_affinity_type == affinity_none.  We now
3934   // explicitly represent this as __kmp_affinity_type == affinity_disabled.
3935   // There are too many checks for __kmp_affinity_type == affinity_none
3936   // in this code.  Instead of trying to change them all, check if
3937   // __kmp_affinity_type == affinity_disabled, and if so, slam it with
3938   // affinity_none, call the real initialization routine, then restore
3939   // __kmp_affinity_type to affinity_disabled.
3940   int disabled = (__kmp_affinity_type == affinity_disabled);
3941   if (!KMP_AFFINITY_CAPABLE()) {
3942     KMP_ASSERT(disabled);
3943   }
3944   if (disabled) {
3945     __kmp_affinity_type = affinity_none;
3946   }
3947   __kmp_aux_affinity_initialize();
3948   if (disabled) {
3949     __kmp_affinity_type = affinity_disabled;
3950   }
3951 }
3952 
// Tear down all affinity state: free the mask tables, full mask, proclist,
// balanced-affinity table, hwloc topology and the canonical topology, then
// reset the affinity settings to their defaults and shut down the affinity
// dispatch API.
void __kmp_affinity_uninitialize(void) {
  if (__kmp_affinity_masks != NULL) {
    KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
    __kmp_affinity_masks = NULL;
  }
  if (__kmp_affin_fullMask != NULL) {
    KMP_CPU_FREE(__kmp_affin_fullMask);
    __kmp_affin_fullMask = NULL;
  }
  __kmp_affinity_num_masks = 0;
  __kmp_affinity_type = affinity_default;
  __kmp_affinity_num_places = 0;
  if (__kmp_affinity_proclist != NULL) {
    __kmp_free(__kmp_affinity_proclist);
    __kmp_affinity_proclist = NULL;
  }
  // procarr is only allocated by the balanced-affinity path.
  if (procarr != NULL) {
    __kmp_free(procarr);
    procarr = NULL;
  }
#if KMP_USE_HWLOC
  if (__kmp_hwloc_topology != NULL) {
    hwloc_topology_destroy(__kmp_hwloc_topology);
    __kmp_hwloc_topology = NULL;
  }
#endif
  if (__kmp_hw_subset) {
    kmp_hw_subset_t::deallocate(__kmp_hw_subset);
    __kmp_hw_subset = nullptr;
  }
  if (__kmp_topology) {
    kmp_topology_t::deallocate(__kmp_topology);
    __kmp_topology = nullptr;
  }
  KMPAffinity::destroy_api();
}
3989 
// Choose and apply the initial affinity mask for thread gtid, and record the
// corresponding place in th_current_place. isa_root indicates the thread is
// a root (initial) thread.
void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return;
  }

  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
  if (th->th.th_affin_mask == NULL) {
    KMP_CPU_ALLOC(th->th.th_affin_mask);
  } else {
    KMP_CPU_ZERO(th->th.th_affin_mask);
  }

  // Copy the thread mask to the kmp_info_t structure. If
  // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that
  // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set,
  // then the full mask is the same as the mask of the initialization thread.
  kmp_affin_mask_t *mask;
  int i; // place index recorded in th_current_place (KMP_PLACE_ALL = all)

  if (KMP_AFFINITY_NON_PROC_BIND) {
    // KMP_AFFINITY-style (non-OMP_PROC_BIND) selection.
    if ((__kmp_affinity_type == affinity_none) ||
        (__kmp_affinity_type == affinity_balanced) ||
        KMP_HIDDEN_HELPER_THREAD(gtid)) {
#if KMP_GROUP_AFFINITY
      // With multiple Windows processor groups a single full mask cannot be
      // applied; leave the OS assignment alone.
      if (__kmp_num_proc_groups > 1) {
        return;
      }
#endif
      KMP_ASSERT(__kmp_affin_fullMask != NULL);
      i = 0;
      mask = __kmp_affin_fullMask;
    } else {
      // Round-robin over the place list, offset by __kmp_affinity_offset.
      int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
      KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
      i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks;
      mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
    }
  } else {
    // OMP_PROC_BIND-style selection.
    if ((!isa_root) || KMP_HIDDEN_HELPER_THREAD(gtid) ||
        (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
#if KMP_GROUP_AFFINITY
      if (__kmp_num_proc_groups > 1) {
        return;
      }
#endif
      KMP_ASSERT(__kmp_affin_fullMask != NULL);
      i = KMP_PLACE_ALL;
      mask = __kmp_affin_fullMask;
    } else {
      // int i = some hash function or just a counter that doesn't
      // always start at 0.  Use adjusted gtid for now.
      int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
      KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
      i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks;
      mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
    }
  }

  th->th.th_current_place = i;
  if (isa_root || KMP_HIDDEN_HELPER_THREAD(gtid)) {
    th->th.th_new_place = i;
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;
  } else if (KMP_AFFINITY_NON_PROC_BIND) {
    // When using a Non-OMP_PROC_BIND affinity method,
    // set all threads' place-partition-var to the entire place list
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;
  }

  if (i == KMP_PLACE_ALL) {
    KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
                   gtid));
  } else {
    KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
                   gtid, i));
  }

  KMP_CPU_COPY(th->th.th_affin_mask, mask);

  if (__kmp_affinity_verbose && !KMP_HIDDEN_HELPER_THREAD(gtid)
      /* to avoid duplicate printing (will be correctly printed on barrier) */
      && (__kmp_affinity_type == affinity_none ||
          (i != KMP_PLACE_ALL && __kmp_affinity_type != affinity_balanced))) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
               __kmp_gettid(), gtid, buf);
  }

#if KMP_DEBUG
  // Hidden helper thread affinity only printed for debug builds
  if (__kmp_affinity_verbose && KMP_HIDDEN_HELPER_THREAD(gtid)) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY (hidden helper thread)",
               (kmp_int32)getpid(), __kmp_gettid(), gtid, buf);
  }
#endif

#if KMP_OS_WINDOWS
  // On Windows* OS, the process affinity mask might have changed. If the user
  // didn't request affinity and this call fails, just continue silently.
  // See CQ171393.
  if (__kmp_affinity_type == affinity_none) {
    __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
  } else
#endif
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}
4102 
// Move thread gtid to its pending place (th_new_place): validate the place
// against the thread's place partition, copy the place's mask into the
// thread, and apply it to the OS.
void __kmp_affinity_set_place(int gtid) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return;
  }

  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);

  KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current "
                 "place = %d)\n",
                 gtid, th->th.th_new_place, th->th.th_current_place));

  // Check that the new place is within this thread's partition.
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  KMP_ASSERT(th->th.th_new_place >= 0);
  // NOTE(review): '<=' admits th_new_place == __kmp_affinity_num_masks, one
  // past the last valid index; the partition checks below tighten the bound
  // in practice, but confirm '<' was not intended here.
  KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
  if (th->th.th_first_place <= th->th.th_last_place) {
    // Contiguous partition [first, last].
    KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
               (th->th.th_new_place <= th->th.th_last_place));
  } else {
    // Wrapped partition: valid places are outside (last, first).
    KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
               (th->th.th_new_place >= th->th.th_last_place));
  }

  // Copy the thread mask to the kmp_info_t structure,
  // and set this thread's affinity.
  kmp_affin_mask_t *mask =
      KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
  KMP_CPU_COPY(th->th.th_affin_mask, mask);
  th->th.th_current_place = th->th.th_new_place;

  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
               __kmp_gettid(), gtid, buf);
  }
  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}
4142 
// Implementation behind kmp_set_affinity(): bind the calling thread to the
// user-supplied mask. Returns -1 if affinity is unavailable, otherwise the
// result of __kmp_set_system_affinity (0 on success).
int __kmp_aux_set_affinity(void **mask) {
  int gtid;
  kmp_info_t *th;
  int retval;

  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  gtid = __kmp_entry_gtid();
  KA_TRACE(
      1000, (""); {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf(
            "kmp_set_affinity: setting affinity mask for thread %d = %s\n",
            gtid, buf);
      });

  // With KMP_AFFINITY=...,verbose-style consistency checking on, a bad mask
  // is a fatal error; otherwise invalid masks just fail at the OS call.
  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
    } else {
      unsigned proc;
      int num_procs = 0;

      // The mask must name at least one proc, and every named proc must be
      // in the process's full affinity mask.
      KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
        if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
          KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
          continue;
        }
        num_procs++;
      }
      if (num_procs == 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }

#if KMP_GROUP_AFFINITY
      // NOTE(review): __kmp_get_proc_group() < 0 presumably means the mask
      // spans multiple Windows processor groups, which cannot be bound.
      if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }
#endif /* KMP_GROUP_AFFINITY */
    }
  }

  th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
  if (retval == 0) {
    // Keep the runtime's cached copy in sync with what the OS accepted.
    KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
  }

  // The thread is no longer bound to a place from the place list.
  th->th.th_current_place = KMP_PLACE_UNDEFINED;
  th->th.th_new_place = KMP_PLACE_UNDEFINED;
  th->th.th_first_place = 0;
  th->th.th_last_place = __kmp_affinity_num_masks - 1;

  // Turn off 4.0 affinity for the current thread at this parallel level.
  th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;

  return retval;
}
4208 
// Implementation behind kmp_get_affinity(): copy the calling thread's
// affinity into *mask. Returns -1 if affinity is unavailable; otherwise the
// OS query result on non-Windows, or 0 on Windows (cached copy returned).
int __kmp_aux_get_affinity(void **mask) {
  int gtid;
  int retval;
#if KMP_OS_WINDOWS || KMP_DEBUG
  // th is only needed for the debug assert and the Windows cached-copy path.
  kmp_info_t *th;
#endif
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  gtid = __kmp_entry_gtid();
#if KMP_OS_WINDOWS || KMP_DEBUG
  th = __kmp_threads[gtid];
#else
  (void)gtid; // unused variable
#endif
  // Compiles away in release non-Windows builds, where th is not declared.
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);

  KA_TRACE(
      1000, (""); {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  th->th.th_affin_mask);
        __kmp_printf(
            "kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid,
            buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
    }
  }

#if !KMP_OS_WINDOWS

  // Non-Windows: ask the OS for the thread's current mask.
  retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
  KA_TRACE(
      1000, (""); {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_printf(
            "kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid,
            buf);
      });
  return retval;

#else
  (void)retval;

  // Windows: return the runtime's cached per-thread mask instead of
  // querying the OS.
  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
  return 0;

#endif /* KMP_OS_WINDOWS */
}
4265 
4266 int __kmp_aux_get_affinity_max_proc() {
4267   if (!KMP_AFFINITY_CAPABLE()) {
4268     return 0;
4269   }
4270 #if KMP_GROUP_AFFINITY
4271   if (__kmp_num_proc_groups > 1) {
4272     return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT);
4273   }
4274 #endif
4275   return __kmp_xproc;
4276 }
4277 
// Implementation behind kmp_set_affinity_mask_proc(): set bit 'proc' in the
// user mask *mask. Returns 0 on success, -1 if affinity is unavailable or
// proc is out of range, -2 if proc is not in the process's full mask.
int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(
      1000, (""); {
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
                           "affinity mask for thread %d = %s\n",
                           proc, gtid, buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
    }
  }

  // Order matters for the API's return codes: range check first (-1),
  // then availability in the full mask (-2).
  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return -2;
  }

  KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
  return 0;
}
4310 
// Implementation behind kmp_unset_affinity_mask_proc(): clear bit 'proc' in
// the user mask *mask. Returns 0 on success, -1 if affinity is unavailable
// or proc is out of range, -2 if proc is not in the process's full mask.
int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(
      1000, (""); {
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
                           "affinity mask for thread %d = %s\n",
                           proc, gtid, buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
    }
  }

  // Same return-code convention as __kmp_aux_set_affinity_mask_proc.
  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return -2;
  }

  KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
  return 0;
}
4343 
// Implementation behind kmp_get_affinity_mask_proc(): query bit 'proc' in
// the user mask *mask. Returns nonzero if the bit is set, 0 if it is clear
// or proc is not in the full mask, -1 if affinity is unavailable or proc is
// out of range.
int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(
      1000, (""); {
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
                           "affinity mask for thread %d = %s\n",
                           proc, gtid, buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
    }
  }

  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  // Unlike the set/unset variants (which return -2), an unavailable proc
  // here simply reads as "not set".
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return 0;
  }

  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}
4375 
4376 // Dynamic affinity settings - Affinity balanced
// Dynamic affinity settings - Affinity balanced
// Bind thread th (member tid of a team of nthreads) so the team is spread
// evenly over the machine's cores, honoring the requested granularity
// (__kmp_affinity_gran). Uniform and non-uniform topologies take different
// paths; the non-uniform path consults the procarr[] core/proc table.
void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
  KMP_DEBUG_ASSERT(th);
  // fine_gran == true: pin to a single hardware thread; false: pin to all
  // hardware threads of the chosen core.
  bool fine_gran = true;
  int tid = th->th.th_info.ds.ds_tid;

  // Do not perform balanced affinity for the hidden helper threads
  if (KMP_HIDDEN_HELPER_THREAD(__kmp_gtid_from_thread(th)))
    return;

  // Granularity stays "fine" only when the requested level maps 1:1 onto a
  // hardware thread on this machine.
  switch (__kmp_affinity_gran) {
  case KMP_HW_THREAD:
    break;
  case KMP_HW_CORE:
    if (__kmp_nThreadsPerCore > 1) {
      fine_gran = false;
    }
    break;
  case KMP_HW_SOCKET:
    if (nCoresPerPkg > 1) {
      fine_gran = false;
    }
    break;
  default:
    fine_gran = false;
  }

  if (__kmp_topology->is_uniform()) {
    int coreID;
    int threadID;
    // Number of hyper threads per core in HT machine
    int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
    // Number of cores
    int ncores = __kmp_ncores;
    // No hyperthreading but multiple packages: balance across packages
    // instead of cores.
    if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) {
      __kmp_nth_per_core = __kmp_avail_proc / nPackages;
      ncores = nPackages;
    }
    // How many threads will be bound to each core
    int chunk = nthreads / ncores;
    // How many cores will have an additional thread bound to it - "big cores"
    int big_cores = nthreads % ncores;
    // Number of threads on the big cores
    int big_nth = (chunk + 1) * big_cores;
    if (tid < big_nth) {
      // This tid falls on one of the "big" cores holding chunk+1 threads.
      coreID = tid / (chunk + 1);
      threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
    } else { // tid >= big_nth
      coreID = (tid - big_cores) / chunk;
      threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
    }
    KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
                      "Illegal set affinity operation when not capable");

    kmp_affin_mask_t *mask = th->th.th_affin_mask;
    KMP_CPU_ZERO(mask);

    if (fine_gran) {
      // Pin to the single hardware thread computed above.
      int osID =
          __kmp_topology->at(coreID * __kmp_nth_per_core + threadID).os_id;
      KMP_CPU_SET(osID, mask);
    } else {
      // Pin to every hardware thread of the chosen core.
      for (int i = 0; i < __kmp_nth_per_core; i++) {
        int osID;
        osID = __kmp_topology->at(coreID * __kmp_nth_per_core + i).os_id;
        KMP_CPU_SET(osID, mask);
      }
    }
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
                 __kmp_gettid(), tid, buf);
    }
    __kmp_set_system_affinity(mask, TRUE);
  } else { // Non-uniform topology

    kmp_affin_mask_t *mask = th->th.th_affin_mask;
    KMP_CPU_ZERO(mask);

    int core_level =
        __kmp_affinity_find_core_level(__kmp_avail_proc, __kmp_aff_depth - 1);
    int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc,
                                               __kmp_aff_depth - 1, core_level);
    int nth_per_core = __kmp_affinity_max_proc_per_core(
        __kmp_avail_proc, __kmp_aff_depth - 1, core_level);

    // For performance gain consider the special case nthreads ==
    // __kmp_avail_proc
    if (nthreads == __kmp_avail_proc) {
      if (fine_gran) {
        // One thread per proc: tid maps directly to the tid-th proc.
        int osID = __kmp_topology->at(tid).os_id;
        KMP_CPU_SET(osID, mask);
      } else {
        // Pin to every proc that shares this thread's core.
        int core =
            __kmp_affinity_find_core(tid, __kmp_aff_depth - 1, core_level);
        for (int i = 0; i < __kmp_avail_proc; i++) {
          int osID = __kmp_topology->at(i).os_id;
          if (__kmp_affinity_find_core(i, __kmp_aff_depth - 1, core_level) ==
              core) {
            KMP_CPU_SET(osID, mask);
          }
        }
      }
    } else if (nthreads <= ncores) {
      // Fewer threads than cores: thread tid gets the tid-th core that has
      // at least one available proc in procarr[].
      int core = 0;
      for (int i = 0; i < ncores; i++) {
        // Check if this core from procarr[] is in the mask
        int in_mask = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            in_mask = 1;
            break;
          }
        }
        if (in_mask) {
          if (tid == core) {
            for (int j = 0; j < nth_per_core; j++) {
              int osID = procarr[i * nth_per_core + j];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
                // For fine granularity it is enough to set the first available
                // osID for this core
                if (fine_gran) {
                  break;
                }
              }
            }
            break;
          } else {
            core++;
          }
        }
      }
    } else { // nthreads > ncores
      // Array to save the number of processors at each core
      int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
      // Array to save the number of cores with "x" available processors;
      int *ncores_with_x_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
      // Array to save the number of cores with # procs from x to nth_per_core
      int *ncores_with_x_to_max_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));

      for (int i = 0; i <= nth_per_core; i++) {
        ncores_with_x_procs[i] = 0;
        ncores_with_x_to_max_procs[i] = 0;
      }

      // Histogram: count available procs on each core.
      for (int i = 0; i < ncores; i++) {
        int cnt = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            cnt++;
          }
        }
        nproc_at_core[i] = cnt;
        ncores_with_x_procs[cnt]++;
      }

      // Suffix sums of the histogram.
      for (int i = 0; i <= nth_per_core; i++) {
        for (int j = i; j <= nth_per_core; j++) {
          ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
        }
      }

      // Max number of processors
      int nproc = nth_per_core * ncores;
      // An array to keep number of threads per each context
      int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
      for (int i = 0; i < nproc; i++) {
        newarr[i] = 0;
      }

      // Distribute nthreads over the available contexts: first pass places
      // at most one thread per context; later passes (flag != 0) stack
      // additional threads onto already-used contexts.
      int nth = nthreads;
      int flag = 0;
      while (nth > 0) {
        for (int j = 1; j <= nth_per_core; j++) {
          int cnt = ncores_with_x_to_max_procs[j];
          for (int i = 0; i < ncores; i++) {
            // Skip the core with 0 processors
            if (nproc_at_core[i] == 0) {
              continue;
            }
            for (int k = 0; k < nth_per_core; k++) {
              if (procarr[i * nth_per_core + k] != -1) {
                if (newarr[i * nth_per_core + k] == 0) {
                  newarr[i * nth_per_core + k] = 1;
                  cnt--;
                  nth--;
                  break;
                } else {
                  if (flag != 0) {
                    newarr[i * nth_per_core + k]++;
                    cnt--;
                    nth--;
                    break;
                  }
                }
              }
            }
            if (cnt == 0 || nth == 0) {
              break;
            }
          }
          if (nth == 0) {
            break;
          }
        }
        flag = 1;
      }
      // Thread tid lands in the context where the running total of assigned
      // threads first exceeds tid.
      int sum = 0;
      for (int i = 0; i < nproc; i++) {
        sum += newarr[i];
        if (sum > tid) {
          if (fine_gran) {
            int osID = procarr[i];
            KMP_CPU_SET(osID, mask);
          } else {
            // Coarse granularity: set every available proc of this context's
            // core.
            int coreID = i / nth_per_core;
            for (int ii = 0; ii < nth_per_core; ii++) {
              int osID = procarr[coreID * nth_per_core + ii];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
              }
            }
          }
          break;
        }
      }
      __kmp_free(newarr);
    }

    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
                 __kmp_gettid(), tid, buf);
    }
    __kmp_set_system_affinity(mask, TRUE);
  }
}
4619 
#if KMP_OS_LINUX || KMP_OS_FREEBSD
// We don't need this entry for Windows because
// there is GetProcessAffinityMask() api
//
// The intended usage is indicated by these steps:
// 1) The user gets the current affinity mask
// 2) Then sets the affinity by calling this function
// 3) Error check the return value
// 4) Use non-OpenMP parallelization
// 5) Reset the affinity to what was stored in step 1)
#ifdef __cplusplus
extern "C"
#endif
    int
    kmp_set_thread_affinity_mask_initial()
// the function returns 0 on success,
//   -1 if we cannot bind thread
//   >0 (errno) if an error happened during binding
{
  int gtid = __kmp_get_gtid();
  if (gtid < 0) {
    // Do not touch non-omp threads
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "non-omp thread, returning\n"));
    return -1;
  }
  // Cannot restore the initial mask before affinity has been initialized.
  if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "affinity not initialized, returning\n"));
    return -1;
  }
  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                "set full mask for thread %d\n",
                gtid));
  KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
  // Rebind this thread to the full initial mask so the user's non-OpenMP
  // code can use all available processors.
  return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
}
#endif
4658 
4659 #endif // KMP_AFFINITY_SUPPORTED
4660