1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_i18n.h"
16 #include "kmp_io.h"
17 #include "kmp_str.h"
18 #include "kmp_wrapper_getpid.h"
19 #if KMP_USE_HIER_SCHED
20 #include "kmp_dispatch_hier.h"
21 #endif
22 #if KMP_USE_HWLOC
23 // Copied from hwloc
24 #define HWLOC_GROUP_KIND_INTEL_MODULE 102
25 #define HWLOC_GROUP_KIND_INTEL_TILE 103
26 #define HWLOC_GROUP_KIND_INTEL_DIE 104
27 #define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220
28 #endif
29 
30 // The machine topology
31 kmp_topology_t *__kmp_topology = nullptr;
32 // KMP_HW_SUBSET environment variable
33 kmp_hw_subset_t *__kmp_hw_subset = nullptr;
34 
35 // Store the real or imagined machine hierarchy here
36 static hierarchy_info machine_hierarchy;
37 
38 void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }
39 
40 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
41   kmp_uint32 depth;
  // The hierarchy may still be uninitialized if affinity is available but set
  // to "none"; initialize it on first use of the hierarchical barrier.
44   if (TCR_1(machine_hierarchy.uninitialized))
45     machine_hierarchy.init(nproc);
46 
  // Adjust the hierarchy in case the number of threads exceeds the original
48   if (nproc > machine_hierarchy.base_num_threads)
49     machine_hierarchy.resize(nproc);
50 
51   depth = machine_hierarchy.depth;
52   KMP_DEBUG_ASSERT(depth > 0);
53 
54   thr_bar->depth = depth;
55   __kmp_type_convert(machine_hierarchy.numPerLevel[0] - 1,
56                      &(thr_bar->base_leaf_kids));
57   thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
58 }
59 
60 static int nCoresPerPkg, nPackages;
61 static int __kmp_nThreadsPerCore;
62 #ifndef KMP_DFLT_NTH_CORES
63 static int __kmp_ncores;
64 #endif
65 
66 const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {
67   switch (type) {
68   case KMP_HW_SOCKET:
69     return ((plural) ? KMP_I18N_STR(Sockets) : KMP_I18N_STR(Socket));
70   case KMP_HW_DIE:
71     return ((plural) ? KMP_I18N_STR(Dice) : KMP_I18N_STR(Die));
72   case KMP_HW_MODULE:
73     return ((plural) ? KMP_I18N_STR(Modules) : KMP_I18N_STR(Module));
74   case KMP_HW_TILE:
75     return ((plural) ? KMP_I18N_STR(Tiles) : KMP_I18N_STR(Tile));
76   case KMP_HW_NUMA:
77     return ((plural) ? KMP_I18N_STR(NumaDomains) : KMP_I18N_STR(NumaDomain));
78   case KMP_HW_L3:
79     return ((plural) ? KMP_I18N_STR(L3Caches) : KMP_I18N_STR(L3Cache));
80   case KMP_HW_L2:
81     return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache));
82   case KMP_HW_L1:
83     return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache));
84   case KMP_HW_LLC:
85     return ((plural) ? KMP_I18N_STR(LLCaches) : KMP_I18N_STR(LLCache));
86   case KMP_HW_CORE:
87     return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core));
88   case KMP_HW_THREAD:
89     return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread));
90   case KMP_HW_PROC_GROUP:
91     return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup));
92   }
93   return KMP_I18N_STR(Unknown);
94 }
95 
96 const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) {
97   switch (type) {
98   case KMP_HW_SOCKET:
99     return ((plural) ? "sockets" : "socket");
100   case KMP_HW_DIE:
101     return ((plural) ? "dice" : "die");
102   case KMP_HW_MODULE:
103     return ((plural) ? "modules" : "module");
104   case KMP_HW_TILE:
105     return ((plural) ? "tiles" : "tile");
106   case KMP_HW_NUMA:
107     return ((plural) ? "numa_domains" : "numa_domain");
108   case KMP_HW_L3:
109     return ((plural) ? "l3_caches" : "l3_cache");
110   case KMP_HW_L2:
111     return ((plural) ? "l2_caches" : "l2_cache");
112   case KMP_HW_L1:
113     return ((plural) ? "l1_caches" : "l1_cache");
114   case KMP_HW_LLC:
115     return ((plural) ? "ll_caches" : "ll_cache");
116   case KMP_HW_CORE:
117     return ((plural) ? "cores" : "core");
118   case KMP_HW_THREAD:
119     return ((plural) ? "threads" : "thread");
120   case KMP_HW_PROC_GROUP:
121     return ((plural) ? "proc_groups" : "proc_group");
122   }
123   return ((plural) ? "unknowns" : "unknown");
124 }
125 
126 const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) {
127   switch (type) {
128   case KMP_HW_CORE_TYPE_UNKNOWN:
129     return "unknown";
130 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
131   case KMP_HW_CORE_TYPE_ATOM:
132     return "Intel Atom(R) processor";
133   case KMP_HW_CORE_TYPE_CORE:
134     return "Intel(R) Core(TM) processor";
135 #endif
136   }
137   return "unknown";
138 }
139 
140 ////////////////////////////////////////////////////////////////////////////////
141 // kmp_hw_thread_t methods
142 int kmp_hw_thread_t::compare_ids(const void *a, const void *b) {
143   const kmp_hw_thread_t *ahwthread = (const kmp_hw_thread_t *)a;
144   const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b;
145   int depth = __kmp_topology->get_depth();
146   for (int level = 0; level < depth; ++level) {
147     if (ahwthread->ids[level] < bhwthread->ids[level])
148       return -1;
149     else if (ahwthread->ids[level] > bhwthread->ids[level])
150       return 1;
151   }
152   if (ahwthread->os_id < bhwthread->os_id)
153     return -1;
154   else if (ahwthread->os_id > bhwthread->os_id)
155     return 1;
156   return 0;
157 }
158 
159 #if KMP_AFFINITY_SUPPORTED
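// Sort order used when compacting the affinity mapping: the innermost
// __kmp_affinity_compact levels of sub_ids are treated as the most
// significant comparison keys (deepest level first), followed by the
// remaining outer levels from the top of the topology down.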
160 int kmp_hw_thread_t::compare_compact(const void *a, const void *b) {
161   int i;
162   const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a;
163   const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b;
164   int depth = __kmp_topology->get_depth();
165   KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
166   KMP_DEBUG_ASSERT(__kmp_affinity_compact <= depth);
167   for (i = 0; i < __kmp_affinity_compact; i++) {
168     int j = depth - i - 1;
169     if (aa->sub_ids[j] < bb->sub_ids[j])
170       return -1;
171     if (aa->sub_ids[j] > bb->sub_ids[j])
172       return 1;
173   }
174   for (; i < depth; i++) {
175     int j = i - __kmp_affinity_compact;
176     if (aa->sub_ids[j] < bb->sub_ids[j])
177       return -1;
178     if (aa->sub_ids[j] > bb->sub_ids[j])
179       return 1;
180   }
181   return 0;
182 }
183 #endif
184 
185 void kmp_hw_thread_t::print() const {
186   int depth = __kmp_topology->get_depth();
187   printf("%4d ", os_id);
188   for (int i = 0; i < depth; ++i) {
189     printf("%4d ", ids[i]);
190   }
191   if (core_type != KMP_HW_CORE_TYPE_UNKNOWN) {
192     printf(" (%s)", __kmp_hw_get_core_type_string(core_type));
193   }
194   printf("\n");
195 }
196 
197 ////////////////////////////////////////////////////////////////////////////////
198 // kmp_topology_t methods
199 
// Remove layers that don't add information to the topology.
// A layer adds no information when it is radix-1 with respect to a neighbor
// (each parent has exactly one child); such a layer is removed by dropping
// its ids from every hardware thread and deleting it from types[] and depth.
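// Illustrative example (hypothetical machine): if the detected layers are
// socket, L3, core, thread and every socket contains exactly one L3, the L3
// layer is radix-1 with respect to the socket layer; it is removed and
// KMP_HW_L3 is recorded as equivalent to KMP_HW_SOCKET.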
202 void kmp_topology_t::_remove_radix1_layers() {
203   int preference[KMP_HW_LAST];
204   int top_index1, top_index2;
205   // Set up preference associative array
206   preference[KMP_HW_PROC_GROUP] = 110;
207   preference[KMP_HW_SOCKET] = 100;
208   preference[KMP_HW_CORE] = 95;
209   preference[KMP_HW_THREAD] = 90;
210   preference[KMP_HW_NUMA] = 85;
211   preference[KMP_HW_DIE] = 80;
212   preference[KMP_HW_TILE] = 75;
213   preference[KMP_HW_MODULE] = 73;
214   preference[KMP_HW_L3] = 70;
215   preference[KMP_HW_L2] = 65;
216   preference[KMP_HW_L1] = 60;
217   preference[KMP_HW_LLC] = 5;
218   top_index1 = 0;
219   top_index2 = 1;
220   while (top_index1 < depth - 1 && top_index2 < depth) {
221     kmp_hw_t type1 = types[top_index1];
222     kmp_hw_t type2 = types[top_index2];
223     KMP_ASSERT_VALID_HW_TYPE(type1);
224     KMP_ASSERT_VALID_HW_TYPE(type2);
225     // Do not allow the three main topology levels (sockets, cores, threads) to
226     // be compacted down
227     if ((type1 == KMP_HW_THREAD || type1 == KMP_HW_CORE ||
228          type1 == KMP_HW_SOCKET) &&
229         (type2 == KMP_HW_THREAD || type2 == KMP_HW_CORE ||
230          type2 == KMP_HW_SOCKET)) {
231       top_index1 = top_index2++;
232       continue;
233     }
234     bool radix1 = true;
235     bool all_same = true;
236     int id1 = hw_threads[0].ids[top_index1];
237     int id2 = hw_threads[0].ids[top_index2];
238     int pref1 = preference[type1];
239     int pref2 = preference[type2];
240     for (int hwidx = 1; hwidx < num_hw_threads; ++hwidx) {
241       if (hw_threads[hwidx].ids[top_index1] == id1 &&
242           hw_threads[hwidx].ids[top_index2] != id2) {
243         radix1 = false;
244         break;
245       }
246       if (hw_threads[hwidx].ids[top_index2] != id2)
247         all_same = false;
248       id1 = hw_threads[hwidx].ids[top_index1];
249       id2 = hw_threads[hwidx].ids[top_index2];
250     }
251     if (radix1) {
252       // Select the layer to remove based on preference
253       kmp_hw_t remove_type, keep_type;
254       int remove_layer, remove_layer_ids;
255       if (pref1 > pref2) {
256         remove_type = type2;
257         remove_layer = remove_layer_ids = top_index2;
258         keep_type = type1;
259       } else {
260         remove_type = type1;
261         remove_layer = remove_layer_ids = top_index1;
262         keep_type = type2;
263       }
      // If all the indexes for the second (deeper) layer are the same,
      // e.g., all are zero, then make sure to keep the first layer's ids
266       if (all_same)
267         remove_layer_ids = top_index2;
268       // Remove radix one type by setting the equivalence, removing the id from
269       // the hw threads and removing the layer from types and depth
270       set_equivalent_type(remove_type, keep_type);
271       for (int idx = 0; idx < num_hw_threads; ++idx) {
272         kmp_hw_thread_t &hw_thread = hw_threads[idx];
273         for (int d = remove_layer_ids; d < depth - 1; ++d)
274           hw_thread.ids[d] = hw_thread.ids[d + 1];
275       }
276       for (int idx = remove_layer; idx < depth - 1; ++idx)
277         types[idx] = types[idx + 1];
278       depth--;
279     } else {
280       top_index1 = top_index2++;
281     }
282   }
283   KMP_ASSERT(depth > 0);
284 }
285 
286 void kmp_topology_t::_set_last_level_cache() {
287   if (get_equivalent_type(KMP_HW_L3) != KMP_HW_UNKNOWN)
288     set_equivalent_type(KMP_HW_LLC, KMP_HW_L3);
289   else if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
290     set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
291 #if KMP_MIC_SUPPORTED
292   else if (__kmp_mic_type == mic3) {
293     if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
294       set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
295     else if (get_equivalent_type(KMP_HW_TILE) != KMP_HW_UNKNOWN)
296       set_equivalent_type(KMP_HW_LLC, KMP_HW_TILE);
297     // L2/Tile wasn't detected so just say L1
298     else
299       set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
300   }
301 #endif
302   else if (get_equivalent_type(KMP_HW_L1) != KMP_HW_UNKNOWN)
303     set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
304   // Fallback is to set last level cache to socket or core
305   if (get_equivalent_type(KMP_HW_LLC) == KMP_HW_UNKNOWN) {
306     if (get_equivalent_type(KMP_HW_SOCKET) != KMP_HW_UNKNOWN)
307       set_equivalent_type(KMP_HW_LLC, KMP_HW_SOCKET);
308     else if (get_equivalent_type(KMP_HW_CORE) != KMP_HW_UNKNOWN)
309       set_equivalent_type(KMP_HW_LLC, KMP_HW_CORE);
310   }
311   KMP_ASSERT(get_equivalent_type(KMP_HW_LLC) != KMP_HW_UNKNOWN);
312 }
313 
314 // Gather the count of each topology layer and the ratio
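// For example (illustrative, uniform machine): with 2 sockets, 4 cores per
// socket and 2 threads per core, this produces count = {2, 8, 16} (total
// objects existing at each level) and ratio = {2, 4, 2} (maximum number of
// children per parent at each level).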
315 void kmp_topology_t::_gather_enumeration_information() {
316   int previous_id[KMP_HW_LAST];
317   int max[KMP_HW_LAST];
318   int previous_core_id = kmp_hw_thread_t::UNKNOWN_ID;
319 
320   for (int i = 0; i < depth; ++i) {
321     previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
322     max[i] = 0;
323     count[i] = 0;
324     ratio[i] = 0;
325   }
326   if (__kmp_is_hybrid_cpu()) {
327     for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) {
328       core_types_count[i] = 0;
329       core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN;
330     }
331   }
332   for (int i = 0; i < num_hw_threads; ++i) {
333     kmp_hw_thread_t &hw_thread = hw_threads[i];
334     for (int layer = 0; layer < depth; ++layer) {
335       int id = hw_thread.ids[layer];
336       if (id != previous_id[layer]) {
337         // Add an additional increment to each count
338         for (int l = layer; l < depth; ++l)
339           count[l]++;
340         // Keep track of topology layer ratio statistics
341         max[layer]++;
342         for (int l = layer + 1; l < depth; ++l) {
343           if (max[l] > ratio[l])
344             ratio[l] = max[l];
345           max[l] = 1;
346         }
347         break;
348       }
349     }
350     for (int layer = 0; layer < depth; ++layer) {
351       previous_id[layer] = hw_thread.ids[layer];
352     }
353     // Figure out the number of each core type for hybrid CPUs
354     if (__kmp_is_hybrid_cpu()) {
355       int core_level = get_level(KMP_HW_CORE);
356       if (core_level != -1) {
357         if (hw_thread.ids[core_level] != previous_core_id)
358           _increment_core_type(hw_thread.core_type);
359         previous_core_id = hw_thread.ids[core_level];
360       }
361     }
362   }
363   for (int layer = 0; layer < depth; ++layer) {
364     if (max[layer] > ratio[layer])
365       ratio[layer] = max[layer];
366   }
367 }
368 
369 // Find out if the topology is uniform
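// e.g., 2 sockets x 4 cores/socket x 2 threads/core is uniform exactly when
// all 2 * 4 * 2 = 16 hardware threads are present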
370 void kmp_topology_t::_discover_uniformity() {
371   int num = 1;
372   for (int level = 0; level < depth; ++level)
373     num *= ratio[level];
374   flags.uniform = (num == count[depth - 1]);
375 }
376 
377 // Set all the sub_ids for each hardware thread
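// A sub_id is the index of an object within its parent one level up (the top
// level is numbered across the whole machine). For example (illustrative),
// the third core of the second socket gets sub_ids {1, 2} at the socket and
// core levels.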
378 void kmp_topology_t::_set_sub_ids() {
379   int previous_id[KMP_HW_LAST];
380   int sub_id[KMP_HW_LAST];
381 
382   for (int i = 0; i < depth; ++i) {
383     previous_id[i] = -1;
384     sub_id[i] = -1;
385   }
386   for (int i = 0; i < num_hw_threads; ++i) {
387     kmp_hw_thread_t &hw_thread = hw_threads[i];
388     // Setup the sub_id
389     for (int j = 0; j < depth; ++j) {
390       if (hw_thread.ids[j] != previous_id[j]) {
391         sub_id[j]++;
392         for (int k = j + 1; k < depth; ++k) {
393           sub_id[k] = 0;
394         }
395         break;
396       }
397     }
398     // Set previous_id
399     for (int j = 0; j < depth; ++j) {
400       previous_id[j] = hw_thread.ids[j];
401     }
402     // Set the sub_ids field
403     for (int j = 0; j < depth; ++j) {
404       hw_thread.sub_ids[j] = sub_id[j];
405     }
406   }
407 }
408 
409 void kmp_topology_t::_set_globals() {
410   // Set nCoresPerPkg, nPackages, __kmp_nThreadsPerCore, __kmp_ncores
411   int core_level, thread_level, package_level;
412   package_level = get_level(KMP_HW_SOCKET);
413 #if KMP_GROUP_AFFINITY
414   if (package_level == -1)
415     package_level = get_level(KMP_HW_PROC_GROUP);
416 #endif
417   core_level = get_level(KMP_HW_CORE);
418   thread_level = get_level(KMP_HW_THREAD);
419 
420   KMP_ASSERT(core_level != -1);
421   KMP_ASSERT(thread_level != -1);
422 
423   __kmp_nThreadsPerCore = calculate_ratio(thread_level, core_level);
424   if (package_level != -1) {
425     nCoresPerPkg = calculate_ratio(core_level, package_level);
426     nPackages = get_count(package_level);
427   } else {
428     // assume one socket
429     nCoresPerPkg = get_count(core_level);
430     nPackages = 1;
431   }
432 #ifndef KMP_DFLT_NTH_CORES
433   __kmp_ncores = get_count(core_level);
434 #endif
435 }
436 
437 kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth,
438                                          const kmp_hw_t *types) {
439   kmp_topology_t *retval;
440   // Allocate all data in one large allocation
441   size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc +
442                 sizeof(int) * ndepth * 3;
443   char *bytes = (char *)__kmp_allocate(size);
444   retval = (kmp_topology_t *)bytes;
445   if (nproc > 0) {
446     retval->hw_threads = (kmp_hw_thread_t *)(bytes + sizeof(kmp_topology_t));
447   } else {
448     retval->hw_threads = nullptr;
449   }
450   retval->num_hw_threads = nproc;
451   retval->depth = ndepth;
452   int *arr =
453       (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc);
454   retval->types = (kmp_hw_t *)arr;
455   retval->ratio = arr + ndepth;
456   retval->count = arr + 2 * ndepth;
457   KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
458   for (int i = 0; i < ndepth; ++i) {
459     retval->types[i] = types[i];
460     retval->equivalent[types[i]] = types[i];
461   }
462   return retval;
463 }
464 
465 void kmp_topology_t::deallocate(kmp_topology_t *topology) {
466   if (topology)
467     __kmp_free(topology);
468 }
469 
470 bool kmp_topology_t::check_ids() const {
471   // Assume ids have been sorted
472   if (num_hw_threads == 0)
473     return true;
474   for (int i = 1; i < num_hw_threads; ++i) {
475     kmp_hw_thread_t &current_thread = hw_threads[i];
476     kmp_hw_thread_t &previous_thread = hw_threads[i - 1];
477     bool unique = false;
478     for (int j = 0; j < depth; ++j) {
479       if (previous_thread.ids[j] != current_thread.ids[j]) {
480         unique = true;
481         break;
482       }
483     }
484     if (unique)
485       continue;
486     return false;
487   }
488   return true;
489 }
490 
491 void kmp_topology_t::dump() const {
492   printf("***********************\n");
493   printf("*** __kmp_topology: ***\n");
494   printf("***********************\n");
495   printf("* depth: %d\n", depth);
496 
497   printf("* types: ");
498   for (int i = 0; i < depth; ++i)
499     printf("%15s ", __kmp_hw_get_keyword(types[i]));
500   printf("\n");
501 
502   printf("* ratio: ");
503   for (int i = 0; i < depth; ++i) {
504     printf("%15d ", ratio[i]);
505   }
506   printf("\n");
507 
508   printf("* count: ");
509   for (int i = 0; i < depth; ++i) {
510     printf("%15d ", count[i]);
511   }
512   printf("\n");
513 
514   printf("* core_types:\n");
515   for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) {
516     if (core_types[i] != KMP_HW_CORE_TYPE_UNKNOWN) {
517       printf("    %d %s core%c\n", core_types_count[i],
518              __kmp_hw_get_core_type_string(core_types[i]),
519              ((core_types_count[i] > 1) ? 's' : ' '));
520     } else {
521       if (i == 0)
522         printf("No hybrid information available\n");
523       break;
524     }
525   }
526 
527   printf("* equivalent map:\n");
528   KMP_FOREACH_HW_TYPE(i) {
529     const char *key = __kmp_hw_get_keyword(i);
530     const char *value = __kmp_hw_get_keyword(equivalent[i]);
531     printf("%-15s -> %-15s\n", key, value);
532   }
533 
534   printf("* uniform: %s\n", (is_uniform() ? "Yes" : "No"));
535 
536   printf("* num_hw_threads: %d\n", num_hw_threads);
537   printf("* hw_threads:\n");
538   for (int i = 0; i < num_hw_threads; ++i) {
539     hw_threads[i].print();
540   }
541   printf("***********************\n");
542 }
543 
544 void kmp_topology_t::print(const char *env_var) const {
545   kmp_str_buf_t buf;
546   int print_types_depth;
547   __kmp_str_buf_init(&buf);
548   kmp_hw_t print_types[KMP_HW_LAST + 2];
549 
550   // Num Available Threads
551   KMP_INFORM(AvailableOSProc, env_var, num_hw_threads);
552 
553   // Uniform or not
554   if (is_uniform()) {
555     KMP_INFORM(Uniform, env_var);
556   } else {
557     KMP_INFORM(NonUniform, env_var);
558   }
559 
560   // Equivalent types
561   KMP_FOREACH_HW_TYPE(type) {
562     kmp_hw_t eq_type = equivalent[type];
563     if (eq_type != KMP_HW_UNKNOWN && eq_type != type) {
564       KMP_INFORM(AffEqualTopologyTypes, env_var,
565                  __kmp_hw_get_catalog_string(type),
566                  __kmp_hw_get_catalog_string(eq_type));
567     }
568   }
569 
570   // Quick topology
571   KMP_ASSERT(depth > 0 && depth <= (int)KMP_HW_LAST);
572   // Create a print types array that always guarantees printing
573   // the core and thread level
574   print_types_depth = 0;
575   for (int level = 0; level < depth; ++level)
576     print_types[print_types_depth++] = types[level];
577   if (equivalent[KMP_HW_CORE] != KMP_HW_CORE) {
578     // Force in the core level for quick topology
579     if (print_types[print_types_depth - 1] == KMP_HW_THREAD) {
580       // Force core before thread e.g., 1 socket X 2 threads/socket
581       // becomes 1 socket X 1 core/socket X 2 threads/socket
582       print_types[print_types_depth - 1] = KMP_HW_CORE;
583       print_types[print_types_depth++] = KMP_HW_THREAD;
584     } else {
585       print_types[print_types_depth++] = KMP_HW_CORE;
586     }
587   }
588   // Always put threads at very end of quick topology
589   if (equivalent[KMP_HW_THREAD] != KMP_HW_THREAD)
590     print_types[print_types_depth++] = KMP_HW_THREAD;
591 
592   __kmp_str_buf_clear(&buf);
593   kmp_hw_t numerator_type;
594   kmp_hw_t denominator_type = KMP_HW_UNKNOWN;
595   int core_level = get_level(KMP_HW_CORE);
596   int ncores = get_count(core_level);
597 
598   for (int plevel = 0, level = 0; plevel < print_types_depth; ++plevel) {
599     int c;
600     bool plural;
601     numerator_type = print_types[plevel];
602     KMP_ASSERT_VALID_HW_TYPE(numerator_type);
603     if (equivalent[numerator_type] != numerator_type)
604       c = 1;
605     else
606       c = get_ratio(level++);
607     plural = (c > 1);
608     if (plevel == 0) {
609       __kmp_str_buf_print(&buf, "%d %s", c,
610                           __kmp_hw_get_catalog_string(numerator_type, plural));
611     } else {
612       __kmp_str_buf_print(&buf, " x %d %s/%s", c,
613                           __kmp_hw_get_catalog_string(numerator_type, plural),
614                           __kmp_hw_get_catalog_string(denominator_type));
615     }
616     denominator_type = numerator_type;
617   }
618   KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores);
619 
620   if (__kmp_is_hybrid_cpu()) {
621     for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) {
622       if (core_types[i] == KMP_HW_CORE_TYPE_UNKNOWN)
623         break;
624       KMP_INFORM(TopologyHybrid, env_var, core_types_count[i],
625                  __kmp_hw_get_core_type_string(core_types[i]));
626     }
627   }
628 
629   if (num_hw_threads <= 0) {
630     __kmp_str_buf_free(&buf);
631     return;
632   }
633 
634   // Full OS proc to hardware thread map
635   KMP_INFORM(OSProcToPhysicalThreadMap, env_var);
636   for (int i = 0; i < num_hw_threads; i++) {
637     __kmp_str_buf_clear(&buf);
638     for (int level = 0; level < depth; ++level) {
639       kmp_hw_t type = types[level];
640       __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type));
641       __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]);
642     }
643     if (__kmp_is_hybrid_cpu())
644       __kmp_str_buf_print(
645           &buf, "(%s)", __kmp_hw_get_core_type_string(hw_threads[i].core_type));
646     KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str);
647   }
648 
649   __kmp_str_buf_free(&buf);
650 }
651 
652 void kmp_topology_t::canonicalize() {
653   _remove_radix1_layers();
654   _gather_enumeration_information();
655   _discover_uniformity();
656   _set_sub_ids();
657   _set_globals();
658   _set_last_level_cache();
659 
660 #if KMP_MIC_SUPPORTED
  // Manually add L2 = Tile equivalence
662   if (__kmp_mic_type == mic3) {
663     if (get_level(KMP_HW_L2) != -1)
664       set_equivalent_type(KMP_HW_TILE, KMP_HW_L2);
665     else if (get_level(KMP_HW_TILE) != -1)
666       set_equivalent_type(KMP_HW_L2, KMP_HW_TILE);
667   }
668 #endif
669 
670   // Perform post canonicalization checking
671   KMP_ASSERT(depth > 0);
672   for (int level = 0; level < depth; ++level) {
673     // All counts, ratios, and types must be valid
674     KMP_ASSERT(count[level] > 0 && ratio[level] > 0);
675     KMP_ASSERT_VALID_HW_TYPE(types[level]);
676     // Detected types must point to themselves
677     KMP_ASSERT(equivalent[types[level]] == types[level]);
678   }
679 
680 #if KMP_AFFINITY_SUPPORTED
681   // Set the number of affinity granularity levels
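  // (e.g., for a socket/core/thread topology with granularity=core, only the
  // thread level lies below the granularity, so gran_levels becomes 1)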
682   if (__kmp_affinity_gran_levels < 0) {
683     kmp_hw_t gran_type = get_equivalent_type(__kmp_affinity_gran);
684     // Check if user's granularity request is valid
685     if (gran_type == KMP_HW_UNKNOWN) {
686       // First try core, then thread, then package
687       kmp_hw_t gran_types[3] = {KMP_HW_CORE, KMP_HW_THREAD, KMP_HW_SOCKET};
688       for (auto g : gran_types) {
689         if (__kmp_topology->get_equivalent_type(g) != KMP_HW_UNKNOWN) {
690           gran_type = g;
691           break;
692         }
693       }
694       KMP_ASSERT(gran_type != KMP_HW_UNKNOWN);
695       // Warn user what granularity setting will be used instead
696       KMP_WARNING(AffGranularityBad, "KMP_AFFINITY",
697                   __kmp_hw_get_catalog_string(__kmp_affinity_gran),
698                   __kmp_hw_get_catalog_string(gran_type));
699       __kmp_affinity_gran = gran_type;
700     }
701     __kmp_affinity_gran_levels = 0;
702     for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i)
703       __kmp_affinity_gran_levels++;
704   }
705 #endif // KMP_AFFINITY_SUPPORTED
706 }
707 
708 // Canonicalize an explicit packages X cores/pkg X threads/core topology
709 void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg,
710                                   int nthreads_per_core, int ncores) {
711   int ndepth = 3;
712   depth = ndepth;
713   KMP_FOREACH_HW_TYPE(i) { equivalent[i] = KMP_HW_UNKNOWN; }
714   for (int level = 0; level < depth; ++level) {
715     count[level] = 0;
716     ratio[level] = 0;
717   }
718   count[0] = npackages;
719   count[1] = ncores;
720   count[2] = __kmp_xproc;
721   ratio[0] = npackages;
722   ratio[1] = ncores_per_pkg;
723   ratio[2] = nthreads_per_core;
724   equivalent[KMP_HW_SOCKET] = KMP_HW_SOCKET;
725   equivalent[KMP_HW_CORE] = KMP_HW_CORE;
726   equivalent[KMP_HW_THREAD] = KMP_HW_THREAD;
727   types[0] = KMP_HW_SOCKET;
728   types[1] = KMP_HW_CORE;
729   types[2] = KMP_HW_THREAD;
730   //__kmp_avail_proc = __kmp_xproc;
731   _discover_uniformity();
732 }
733 
// Apply the KMP_HW_SUBSET environment variable to the topology.
// Returns true if KMP_HW_SUBSET filtered any processors;
// otherwise, returns false.
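// Illustrative example (hypothetical setting): KMP_HW_SUBSET=2s,4c,2t keeps
// 2 sockets, 4 cores per socket and 2 threads per core; an offset such as
// 4c@2 selects 4 cores per socket starting at core 2. Hardware threads that
// fall outside the requested subset are removed from the topology below.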
737 bool kmp_topology_t::filter_hw_subset() {
738   // If KMP_HW_SUBSET wasn't requested, then do nothing.
739   if (!__kmp_hw_subset)
740     return false;
741 
742   // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
743   int hw_subset_depth = __kmp_hw_subset->get_depth();
744   kmp_hw_t specified[KMP_HW_LAST];
745   KMP_ASSERT(hw_subset_depth > 0);
746   KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; }
747   for (int i = 0; i < hw_subset_depth; ++i) {
748     int max_count;
749     int num = __kmp_hw_subset->at(i).num;
750     int offset = __kmp_hw_subset->at(i).offset;
751     kmp_hw_t type = __kmp_hw_subset->at(i).type;
752     kmp_hw_t equivalent_type = equivalent[type];
753     int level = get_level(type);
754 
755     // Check to see if current layer is in detected machine topology
756     if (equivalent_type != KMP_HW_UNKNOWN) {
757       __kmp_hw_subset->at(i).type = equivalent_type;
758     } else {
759       KMP_WARNING(AffHWSubsetNotExistGeneric,
760                   __kmp_hw_get_catalog_string(type));
761       return false;
762     }
763 
764     // Check to see if current layer has already been specified
765     // either directly or through an equivalent type
766     if (specified[equivalent_type] != KMP_HW_UNKNOWN) {
767       KMP_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type),
768                   __kmp_hw_get_catalog_string(specified[equivalent_type]));
769       return false;
770     }
771     specified[equivalent_type] = type;
772 
773     // Check to see if layers are in order
774     if (i + 1 < hw_subset_depth) {
775       kmp_hw_t next_type = get_equivalent_type(__kmp_hw_subset->at(i + 1).type);
776       if (next_type == KMP_HW_UNKNOWN) {
777         KMP_WARNING(
778             AffHWSubsetNotExistGeneric,
779             __kmp_hw_get_catalog_string(__kmp_hw_subset->at(i + 1).type));
780         return false;
781       }
782       int next_topology_level = get_level(next_type);
783       if (level > next_topology_level) {
784         KMP_WARNING(AffHWSubsetOutOfOrder, __kmp_hw_get_catalog_string(type),
785                     __kmp_hw_get_catalog_string(next_type));
786         return false;
787       }
788     }
789 
790     // Check to see if each layer's num & offset parameters are valid
791     max_count = get_ratio(level);
792     if (max_count < 0 || num + offset > max_count) {
793       bool plural = (num > 1);
794       KMP_WARNING(AffHWSubsetManyGeneric,
795                   __kmp_hw_get_catalog_string(type, plural));
796       return false;
797     }
798   }
799 
800   // Apply the filtered hardware subset
801   int new_index = 0;
802   for (int i = 0; i < num_hw_threads; ++i) {
803     kmp_hw_thread_t &hw_thread = hw_threads[i];
804     // Check to see if this hardware thread should be filtered
805     bool should_be_filtered = false;
806     for (int level = 0, hw_subset_index = 0;
807          level < depth && hw_subset_index < hw_subset_depth; ++level) {
808       kmp_hw_t topology_type = types[level];
809       auto hw_subset_item = __kmp_hw_subset->at(hw_subset_index);
810       kmp_hw_t hw_subset_type = hw_subset_item.type;
811       if (topology_type != hw_subset_type)
812         continue;
813       int num = hw_subset_item.num;
814       int offset = hw_subset_item.offset;
815       hw_subset_index++;
816       if (hw_thread.sub_ids[level] < offset ||
817           hw_thread.sub_ids[level] >= offset + num) {
818         should_be_filtered = true;
819         break;
820       }
821     }
822     if (!should_be_filtered) {
823       if (i != new_index)
824         hw_threads[new_index] = hw_thread;
825       new_index++;
826     } else {
827 #if KMP_AFFINITY_SUPPORTED
828       KMP_CPU_CLR(hw_thread.os_id, __kmp_affin_fullMask);
829 #endif
830       __kmp_avail_proc--;
831     }
832   }
833   KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
834   num_hw_threads = new_index;
835 
836   // Post hardware subset canonicalization
837   _gather_enumeration_information();
838   _discover_uniformity();
839   _set_globals();
840   _set_last_level_cache();
841   return true;
842 }
843 
844 bool kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const {
845   if (hw_level >= depth)
846     return true;
847   bool retval = true;
848   const kmp_hw_thread_t &t1 = hw_threads[hwt1];
849   const kmp_hw_thread_t &t2 = hw_threads[hwt2];
850   for (int i = 0; i < (depth - hw_level); ++i) {
851     if (t1.ids[i] != t2.ids[i])
852       return false;
853   }
854   return retval;
855 }
856 
857 ////////////////////////////////////////////////////////////////////////////////
858 
859 #if KMP_AFFINITY_SUPPORTED
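// Saves the current thread's affinity mask on construction and restores it
// either when restore() is called explicitly or when the object goes out of
// scope (see its use as 'previous_affinity' further below).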
860 class kmp_affinity_raii_t {
861   kmp_affin_mask_t *mask;
862   bool restored;
863 
864 public:
865   kmp_affinity_raii_t() : restored(false) {
866     KMP_CPU_ALLOC(mask);
867     KMP_ASSERT(mask != NULL);
868     __kmp_get_system_affinity(mask, TRUE);
869   }
870   void restore() {
871     __kmp_set_system_affinity(mask, TRUE);
872     KMP_CPU_FREE(mask);
873     restored = true;
874   }
875   ~kmp_affinity_raii_t() {
876     if (!restored) {
877       __kmp_set_system_affinity(mask, TRUE);
878       KMP_CPU_FREE(mask);
879     }
880   }
881 };
882 
883 bool KMPAffinity::picked_api = false;
884 
885 void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
886 void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
887 void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
888 void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
889 void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
890 void KMPAffinity::operator delete(void *p) { __kmp_free(p); }
891 
892 void KMPAffinity::pick_api() {
893   KMPAffinity *affinity_dispatch;
894   if (picked_api)
895     return;
896 #if KMP_USE_HWLOC
897   // Only use Hwloc if affinity isn't explicitly disabled and
898   // user requests Hwloc topology method
899   if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
900       __kmp_affinity_type != affinity_disabled) {
901     affinity_dispatch = new KMPHwlocAffinity();
902   } else
903 #endif
904   {
905     affinity_dispatch = new KMPNativeAffinity();
906   }
907   __kmp_affinity_dispatch = affinity_dispatch;
908   picked_api = true;
909 }
910 
911 void KMPAffinity::destroy_api() {
912   if (__kmp_affinity_dispatch != NULL) {
913     delete __kmp_affinity_dispatch;
914     __kmp_affinity_dispatch = NULL;
915     picked_api = false;
916   }
917 }
918 
919 #define KMP_ADVANCE_SCAN(scan)                                                 \
920   while (*scan != '\0') {                                                      \
921     scan++;                                                                    \
922   }
923 
924 // Print the affinity mask to the character array in a pretty format.
925 // The format is a comma separated list of non-negative integers or integer
926 // ranges: e.g., 1,2,3-5,7,9-15
927 // The format can also be the string "{<empty>}" if no bits are set in mask
928 char *__kmp_affinity_print_mask(char *buf, int buf_len,
929                                 kmp_affin_mask_t *mask) {
930   int start = 0, finish = 0, previous = 0;
931   bool first_range;
932   KMP_ASSERT(buf);
933   KMP_ASSERT(buf_len >= 40);
934   KMP_ASSERT(mask);
935   char *scan = buf;
936   char *end = buf + buf_len - 1;
937 
938   // Check for empty set.
939   if (mask->begin() == mask->end()) {
940     KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
941     KMP_ADVANCE_SCAN(scan);
942     KMP_ASSERT(scan <= end);
943     return buf;
944   }
945 
946   first_range = true;
947   start = mask->begin();
948   while (1) {
949     // Find next range
950     // [start, previous] is inclusive range of contiguous bits in mask
951     for (finish = mask->next(start), previous = start;
952          finish == previous + 1 && finish != mask->end();
953          finish = mask->next(finish)) {
954       previous = finish;
955     }
956 
957     // The first range does not need a comma printed before it, but the rest
958     // of the ranges do need a comma beforehand
959     if (!first_range) {
960       KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
961       KMP_ADVANCE_SCAN(scan);
962     } else {
963       first_range = false;
964     }
965     // Range with three or more contiguous bits in the affinity mask
966     if (previous - start > 1) {
967       KMP_SNPRINTF(scan, end - scan + 1, "%u-%u", start, previous);
968     } else {
969       // Range with one or two contiguous bits in the affinity mask
970       KMP_SNPRINTF(scan, end - scan + 1, "%u", start);
971       KMP_ADVANCE_SCAN(scan);
972       if (previous - start > 0) {
973         KMP_SNPRINTF(scan, end - scan + 1, ",%u", previous);
974       }
975     }
976     KMP_ADVANCE_SCAN(scan);
977     // Start over with new start point
978     start = finish;
979     if (start == mask->end())
980       break;
981     // Check for overflow
982     if (end - scan < 2)
983       break;
984   }
985 
986   // Check for overflow
987   KMP_ASSERT(scan <= end);
988   return buf;
989 }
990 #undef KMP_ADVANCE_SCAN
991 
992 // Print the affinity mask to the string buffer object in a pretty format
993 // The format is a comma separated list of non-negative integers or integer
994 // ranges: e.g., 1,2,3-5,7,9-15
995 // The format can also be the string "{<empty>}" if no bits are set in mask
996 kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
997                                            kmp_affin_mask_t *mask) {
998   int start = 0, finish = 0, previous = 0;
999   bool first_range;
1000   KMP_ASSERT(buf);
1001   KMP_ASSERT(mask);
1002 
1003   __kmp_str_buf_clear(buf);
1004 
1005   // Check for empty set.
1006   if (mask->begin() == mask->end()) {
1007     __kmp_str_buf_print(buf, "%s", "{<empty>}");
1008     return buf;
1009   }
1010 
1011   first_range = true;
1012   start = mask->begin();
1013   while (1) {
1014     // Find next range
1015     // [start, previous] is inclusive range of contiguous bits in mask
1016     for (finish = mask->next(start), previous = start;
1017          finish == previous + 1 && finish != mask->end();
1018          finish = mask->next(finish)) {
1019       previous = finish;
1020     }
1021 
1022     // The first range does not need a comma printed before it, but the rest
1023     // of the ranges do need a comma beforehand
1024     if (!first_range) {
1025       __kmp_str_buf_print(buf, "%s", ",");
1026     } else {
1027       first_range = false;
1028     }
1029     // Range with three or more contiguous bits in the affinity mask
1030     if (previous - start > 1) {
1031       __kmp_str_buf_print(buf, "%u-%u", start, previous);
1032     } else {
1033       // Range with one or two contiguous bits in the affinity mask
1034       __kmp_str_buf_print(buf, "%u", start);
1035       if (previous - start > 0) {
1036         __kmp_str_buf_print(buf, ",%u", previous);
1037       }
1038     }
1039     // Start over with new start point
1040     start = finish;
1041     if (start == mask->end())
1042       break;
1043   }
1044   return buf;
1045 }
1046 
1047 void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
1048   KMP_CPU_ZERO(mask);
1049 
1050 #if KMP_GROUP_AFFINITY
1051 
1052   if (__kmp_num_proc_groups > 1) {
1053     int group;
1054     KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
1055     for (group = 0; group < __kmp_num_proc_groups; group++) {
1056       int i;
1057       int num = __kmp_GetActiveProcessorCount(group);
1058       for (i = 0; i < num; i++) {
1059         KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
1060       }
1061     }
1062   } else
1063 
1064 #endif /* KMP_GROUP_AFFINITY */
1065 
1066   {
1067     int proc;
1068     for (proc = 0; proc < __kmp_xproc; proc++) {
1069       KMP_CPU_SET(proc, mask);
1070     }
1071   }
1072 }
1073 
1074 // All of the __kmp_affinity_create_*_map() routines should allocate the
1075 // internal topology object and set the layer ids for it.  Each routine
1076 // returns a boolean on whether it was successful at doing so.
1077 kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
1078 
1079 #if KMP_USE_HWLOC
1080 static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) {
1081 #if HWLOC_API_VERSION >= 0x00020000
1082   return hwloc_obj_type_is_cache(obj->type);
1083 #else
1084   return obj->type == HWLOC_OBJ_CACHE;
1085 #endif
1086 }
1087 
1088 // Returns KMP_HW_* type derived from HWLOC_* type
1089 static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {
1090 
1091   if (__kmp_hwloc_is_cache_type(obj)) {
1092     if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION)
1093       return KMP_HW_UNKNOWN;
1094     switch (obj->attr->cache.depth) {
1095     case 1:
1096       return KMP_HW_L1;
1097     case 2:
1098 #if KMP_MIC_SUPPORTED
1099       if (__kmp_mic_type == mic3) {
1100         return KMP_HW_TILE;
1101       }
1102 #endif
1103       return KMP_HW_L2;
1104     case 3:
1105       return KMP_HW_L3;
1106     }
1107     return KMP_HW_UNKNOWN;
1108   }
1109 
1110   switch (obj->type) {
1111   case HWLOC_OBJ_PACKAGE:
1112     return KMP_HW_SOCKET;
1113   case HWLOC_OBJ_NUMANODE:
1114     return KMP_HW_NUMA;
1115   case HWLOC_OBJ_CORE:
1116     return KMP_HW_CORE;
1117   case HWLOC_OBJ_PU:
1118     return KMP_HW_THREAD;
1119   case HWLOC_OBJ_GROUP:
1120     if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE)
1121       return KMP_HW_DIE;
1122     else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE)
1123       return KMP_HW_TILE;
1124     else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_MODULE)
1125       return KMP_HW_MODULE;
1126     else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP)
1127       return KMP_HW_PROC_GROUP;
1128     return KMP_HW_UNKNOWN;
1129 #if HWLOC_API_VERSION >= 0x00020100
1130   case HWLOC_OBJ_DIE:
1131     return KMP_HW_DIE;
1132 #endif
1133   }
1134   return KMP_HW_UNKNOWN;
1135 }
1136 
1137 // Returns the number of objects of type 'type' below 'obj' within the topology
1138 // tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
1139 // HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET
1140 // object.
1141 static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
1142                                            hwloc_obj_type_t type) {
1143   int retval = 0;
1144   hwloc_obj_t first;
1145   for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
1146                                            obj->logical_index, type, 0);
1147        first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology,
1148                                                        obj->type, first) == obj;
1149        first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
1150                                           first)) {
1151     ++retval;
1152   }
1153   return retval;
1154 }
1155 
1156 // This gets the sub_id for a lower object under a higher object in the
1157 // topology tree
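// The sub_id is the index of 'lower' among objects of the same type contained
// in 'higher'; already-computed sub_ids are cached in obj->userdata.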
1158 static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher,
1159                                   hwloc_obj_t lower) {
1160   hwloc_obj_t obj;
1161   hwloc_obj_type_t ltype = lower->type;
1162   int lindex = lower->logical_index - 1;
1163   int sub_id = 0;
1164   // Get the previous lower object
1165   obj = hwloc_get_obj_by_type(t, ltype, lindex);
1166   while (obj && lindex >= 0 &&
1167          hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) {
1168     if (obj->userdata) {
1169       sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata));
1170       break;
1171     }
1172     sub_id++;
1173     lindex--;
1174     obj = hwloc_get_obj_by_type(t, ltype, lindex);
1175   }
  // store sub_id + 1 so that 0 can be distinguished from NULL
1177   lower->userdata = RCAST(void *, sub_id + 1);
1178   return sub_id;
1179 }
1180 
1181 static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
1182   kmp_hw_t type;
1183   int hw_thread_index, sub_id;
1184   int depth;
1185   hwloc_obj_t pu, obj, root, prev;
1186   kmp_hw_t types[KMP_HW_LAST];
1187   hwloc_obj_type_t hwloc_types[KMP_HW_LAST];
1188 
1189   hwloc_topology_t tp = __kmp_hwloc_topology;
1190   *msg_id = kmp_i18n_null;
1191   if (__kmp_affinity_verbose) {
1192     KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
1193   }
1194 
1195   if (!KMP_AFFINITY_CAPABLE()) {
1196     // Hack to try and infer the machine topology using only the data
1197     // available from hwloc on the current thread, and __kmp_xproc.
1198     KMP_ASSERT(__kmp_affinity_type == affinity_none);
    // hwloc only guarantees existence of PU object, so check PACKAGE and CORE
1200     hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
1201     if (o != NULL)
1202       nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE);
1203     else
1204       nCoresPerPkg = 1; // no PACKAGE found
1205     o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
1206     if (o != NULL)
1207       __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
1208     else
1209       __kmp_nThreadsPerCore = 1; // no CORE found
1210     __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1211     if (nCoresPerPkg == 0)
1212       nCoresPerPkg = 1; // to prevent possible division by 0
1213     nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1214     return true;
1215   }
1216 
1217   root = hwloc_get_root_obj(tp);
1218 
1219   // Figure out the depth and types in the topology
1220   depth = 0;
1221   pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
1222   KMP_ASSERT(pu);
1223   obj = pu;
1224   types[depth] = KMP_HW_THREAD;
1225   hwloc_types[depth] = obj->type;
1226   depth++;
1227   while (obj != root && obj != NULL) {
1228     obj = obj->parent;
1229 #if HWLOC_API_VERSION >= 0x00020000
1230     if (obj->memory_arity) {
1231       hwloc_obj_t memory;
1232       for (memory = obj->memory_first_child; memory;
1233            memory = hwloc_get_next_child(tp, obj, memory)) {
1234         if (memory->type == HWLOC_OBJ_NUMANODE)
1235           break;
1236       }
1237       if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
1238         types[depth] = KMP_HW_NUMA;
1239         hwloc_types[depth] = memory->type;
1240         depth++;
1241       }
1242     }
1243 #endif
1244     type = __kmp_hwloc_type_2_topology_type(obj);
1245     if (type != KMP_HW_UNKNOWN) {
1246       types[depth] = type;
1247       hwloc_types[depth] = obj->type;
1248       depth++;
1249     }
1250   }
1251   KMP_ASSERT(depth > 0);
1252 
1253   // Get the order for the types correct
1254   for (int i = 0, j = depth - 1; i < j; ++i, --j) {
1255     hwloc_obj_type_t hwloc_temp = hwloc_types[i];
1256     kmp_hw_t temp = types[i];
1257     types[i] = types[j];
1258     types[j] = temp;
1259     hwloc_types[i] = hwloc_types[j];
1260     hwloc_types[j] = hwloc_temp;
1261   }
1262 
1263   // Allocate the data structure to be returned.
1264   __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
1265 
1266   hw_thread_index = 0;
1267   pu = NULL;
  while ((pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu))) {
1269     int index = depth - 1;
1270     bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask);
1271     kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
1272     if (included) {
1273       hw_thread.clear();
1274       hw_thread.ids[index] = pu->logical_index;
1275       hw_thread.os_id = pu->os_index;
1276       index--;
1277     }
1278     obj = pu;
1279     prev = obj;
1280     while (obj != root && obj != NULL) {
1281       obj = obj->parent;
1282 #if HWLOC_API_VERSION >= 0x00020000
1283       // NUMA Nodes are handled differently since they are not within the
1284       // parent/child structure anymore.  They are separate children
1285       // of obj (memory_first_child points to first memory child)
1286       if (obj->memory_arity) {
1287         hwloc_obj_t memory;
1288         for (memory = obj->memory_first_child; memory;
1289              memory = hwloc_get_next_child(tp, obj, memory)) {
1290           if (memory->type == HWLOC_OBJ_NUMANODE)
1291             break;
1292         }
1293         if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
1294           sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev);
1295           if (included) {
1296             hw_thread.ids[index] = memory->logical_index;
1297             hw_thread.ids[index + 1] = sub_id;
1298             index--;
1299           }
1300           prev = memory;
1301         }
1302         prev = obj;
1303       }
1304 #endif
1305       type = __kmp_hwloc_type_2_topology_type(obj);
1306       if (type != KMP_HW_UNKNOWN) {
1307         sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev);
1308         if (included) {
1309           hw_thread.ids[index] = obj->logical_index;
1310           hw_thread.ids[index + 1] = sub_id;
1311           index--;
1312         }
1313         prev = obj;
1314       }
1315     }
1316     if (included)
1317       hw_thread_index++;
1318   }
1319   __kmp_topology->sort_ids();
1320   return true;
1321 }
1322 #endif // KMP_USE_HWLOC
1323 
1324 // If we don't know how to retrieve the machine's processor topology, or
1325 // encounter an error in doing so, this routine is called to form a "flat"
1326 // mapping of os thread id's <-> processor id's.
1327 static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {
1328   *msg_id = kmp_i18n_null;
1329   int depth = 3;
1330   kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
1331 
1332   if (__kmp_affinity_verbose) {
1333     KMP_INFORM(UsingFlatOS, "KMP_AFFINITY");
1334   }
1335 
  // Even if __kmp_affinity_type == affinity_none, this routine might still be
  // called to set __kmp_ncores, as well as
1338   // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1339   if (!KMP_AFFINITY_CAPABLE()) {
1340     KMP_ASSERT(__kmp_affinity_type == affinity_none);
1341     __kmp_ncores = nPackages = __kmp_xproc;
1342     __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1343     return true;
1344   }
1345 
  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly.
1350   __kmp_ncores = nPackages = __kmp_avail_proc;
1351   __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1352 
1353   // Construct the data structure to be returned.
1354   __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
1355   int avail_ct = 0;
1356   int i;
1357   KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
1358     // Skip this proc if it is not included in the machine model.
1359     if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
1360       continue;
1361     }
1362     kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
1363     hw_thread.clear();
1364     hw_thread.os_id = i;
1365     hw_thread.ids[0] = i;
1366     hw_thread.ids[1] = 0;
1367     hw_thread.ids[2] = 0;
1368     avail_ct++;
1369   }
1370   if (__kmp_affinity_verbose) {
1371     KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
1372   }
1373   return true;
1374 }
1375 
1376 #if KMP_GROUP_AFFINITY
1377 // If multiple Windows* OS processor groups exist, we can create a 2-level
1378 // topology map with the groups at level 0 and the individual procs at level 1.
1379 // This facilitates letting the threads float among all procs in a group,
1380 // if granularity=group (the default when there are multiple groups).
1381 static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
1382   *msg_id = kmp_i18n_null;
1383   int depth = 3;
1384   kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD};
1385   const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR);
1386 
1387   if (__kmp_affinity_verbose) {
1388     KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
1389   }
1390 
1391   // If we aren't affinity capable, then use flat topology
1392   if (!KMP_AFFINITY_CAPABLE()) {
1393     KMP_ASSERT(__kmp_affinity_type == affinity_none);
1394     nPackages = __kmp_num_proc_groups;
1395     __kmp_nThreadsPerCore = 1;
1396     __kmp_ncores = __kmp_xproc;
    nCoresPerPkg = __kmp_ncores / nPackages;
1398     return true;
1399   }
1400 
1401   // Construct the data structure to be returned.
1402   __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
1403   int avail_ct = 0;
1404   int i;
1405   KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
1406     // Skip this proc if it is not included in the machine model.
1407     if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
1408       continue;
1409     }
1410     kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++);
1411     hw_thread.clear();
1412     hw_thread.os_id = i;
1413     hw_thread.ids[0] = i / BITS_PER_GROUP;
1414     hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP;
1415   }
1416   return true;
1417 }
1418 #endif /* KMP_GROUP_AFFINITY */
1419 
1420 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1421 
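// Extracts bits LSB..MSB (inclusive) of v, shifted down to bit 0. For
// example, __kmp_extract_bits<0, 4>(v) returns the low five bits of v and
// __kmp_extract_bits<5, 7>(v) returns bits 5 through 7.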
1422 template <kmp_uint32 LSB, kmp_uint32 MSB>
1423 static inline unsigned __kmp_extract_bits(kmp_uint32 v) {
1424   const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB;
1425   const kmp_uint32 SHIFT_RIGHT = LSB;
1426   kmp_uint32 retval = v;
1427   retval <<= SHIFT_LEFT;
1428   retval >>= (SHIFT_LEFT + SHIFT_RIGHT);
1429   return retval;
1430 }
1431 
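// Smallest field width (in bits) able to represent 'count' distinct values,
// i.e., the smallest r such that (1 << r) >= count; e.g., count = 6 gives 3.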
1432 static int __kmp_cpuid_mask_width(int count) {
1433   int r = 0;
1434 
1435   while ((1 << r) < count)
1436     ++r;
1437   return r;
1438 }
1439 
1440 class apicThreadInfo {
1441 public:
1442   unsigned osId; // param to __kmp_affinity_bind_thread
1443   unsigned apicId; // from cpuid after binding
1444   unsigned maxCoresPerPkg; //      ""
1445   unsigned maxThreadsPerPkg; //      ""
1446   unsigned pkgId; // inferred from above values
1447   unsigned coreId; //      ""
1448   unsigned threadId; //      ""
1449 };
1450 
1451 static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
1452                                                      const void *b) {
1453   const apicThreadInfo *aa = (const apicThreadInfo *)a;
1454   const apicThreadInfo *bb = (const apicThreadInfo *)b;
1455   if (aa->pkgId < bb->pkgId)
1456     return -1;
1457   if (aa->pkgId > bb->pkgId)
1458     return 1;
1459   if (aa->coreId < bb->coreId)
1460     return -1;
1461   if (aa->coreId > bb->coreId)
1462     return 1;
1463   if (aa->threadId < bb->threadId)
1464     return -1;
1465   if (aa->threadId > bb->threadId)
1466     return 1;
1467   return 0;
1468 }
1469 
1470 class kmp_cache_info_t {
1471 public:
1472   struct info_t {
1473     unsigned level, mask;
1474   };
1475   kmp_cache_info_t() : depth(0) { get_leaf4_levels(); }
1476   size_t get_depth() const { return depth; }
1477   info_t &operator[](size_t index) { return table[index]; }
1478   const info_t &operator[](size_t index) const { return table[index]; }
1479 
1480   static kmp_hw_t get_topology_type(unsigned level) {
1481     KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL);
1482     switch (level) {
1483     case 1:
1484       return KMP_HW_L1;
1485     case 2:
1486       return KMP_HW_L2;
1487     case 3:
1488       return KMP_HW_L3;
1489     }
1490     return KMP_HW_UNKNOWN;
1491   }
1492 
1493 private:
1494   static const int MAX_CACHE_LEVEL = 3;
1495 
1496   size_t depth;
1497   info_t table[MAX_CACHE_LEVEL];
1498 
1499   void get_leaf4_levels() {
1500     unsigned level = 0;
1501     while (depth < MAX_CACHE_LEVEL) {
1502       unsigned cache_type, max_threads_sharing;
1503       unsigned cache_level, cache_mask_width;
1504       kmp_cpuid buf2;
1505       __kmp_x86_cpuid(4, level, &buf2);
1506       cache_type = __kmp_extract_bits<0, 4>(buf2.eax);
1507       if (!cache_type)
1508         break;
1509       // Skip instruction caches
1510       if (cache_type == 2) {
1511         level++;
1512         continue;
1513       }
1514       max_threads_sharing = __kmp_extract_bits<14, 25>(buf2.eax) + 1;
1515       cache_mask_width = __kmp_cpuid_mask_width(max_threads_sharing);
1516       cache_level = __kmp_extract_bits<5, 7>(buf2.eax);
1517       table[depth].level = cache_level;
1518       table[depth].mask = ((-1) << cache_mask_width);
1519       depth++;
1520       level++;
1521     }
1522   }
1523 };
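// Note (based on the leaf 4 encoding): cache_mask_width is derived from the
// maximum number of logical processors sharing the cache, so 'mask' clears
// the low APIC-id bits that distinguish those threads; ANDing it with an
// APIC id yields an identifier common to all threads sharing that cache.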
1524 
1525 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to each of them in turn, and then
// retrieving the Apic Id for each thread context using the cpuid instruction.
1529 static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
1530   kmp_cpuid buf;
1531   *msg_id = kmp_i18n_null;
1532 
1533   if (__kmp_affinity_verbose) {
1534     KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
1535   }
1536 
1537   // Check if cpuid leaf 4 is supported.
1538   __kmp_x86_cpuid(0, 0, &buf);
1539   if (buf.eax < 4) {
1540     *msg_id = kmp_i18n_str_NoLeaf4Support;
1541     return false;
1542   }
1543 
1544   // The algorithm used starts by setting the affinity to each available thread
1545   // and retrieving info from the cpuid instruction, so if we are not capable of
  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then we
1547   // need to do something else - use the defaults that we calculated from
1548   // issuing cpuid without binding to each proc.
1549   if (!KMP_AFFINITY_CAPABLE()) {
1550     // Hack to try and infer the machine topology using only the data
1551     // available from cpuid on the current thread, and __kmp_xproc.
1552     KMP_ASSERT(__kmp_affinity_type == affinity_none);
1553 
1554     // Get an upper bound on the number of threads per package using cpuid(1).
    // On some OS/chip combinations where HT is supported by the chip but is
1556     // disabled, this value will be 2 on a single core chip. Usually, it will be
1557     // 2 if HT is enabled and 1 if HT is disabled.
1558     __kmp_x86_cpuid(1, 0, &buf);
1559     int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
1560     if (maxThreadsPerPkg == 0) {
1561       maxThreadsPerPkg = 1;
1562     }
1563 
1564     // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
1565     // value.
1566     //
1567     // The author of cpu_count.cpp treated this as only an upper bound on the
1568     // number of cores, but I haven't seen any cases where it was greater than
1569     // the actual number of cores, so we will treat it as exact in this block of
1570     // code.
1571     //
1572     // First, we need to check if cpuid(4) is supported on this chip. To see if
1573     // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
1574     // greater.
1575     __kmp_x86_cpuid(0, 0, &buf);
1576     if (buf.eax >= 4) {
1577       __kmp_x86_cpuid(4, 0, &buf);
1578       nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1579     } else {
1580       nCoresPerPkg = 1;
1581     }
1582 
1583     // There is no way to reliably tell if HT is enabled without issuing the
1584     // cpuid instruction from every thread and correlating the cpuid info, so
1585     // if the machine is not affinity capable, we assume that HT is off. We have
1586     // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
1587     // does not support HT.
1588     //
1589     // - Older OSes are usually found on machines with older chips, which do not
1590     //   support HT.
1591     // - The performance penalty for mistakenly identifying a machine as HT when
1592     //   it isn't (which results in blocktime being incorrectly set to 0) is
1593     //   greater than the penalty for mistakenly identifying a machine as
1594     //   being 1 thread/core when it is really HT enabled (which results in
1595     //   blocktime being incorrectly set to a positive value).
1596     __kmp_ncores = __kmp_xproc;
1597     nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1598     __kmp_nThreadsPerCore = 1;
1599     return true;
1600   }
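  // For example (hypothetical numbers): with __kmp_xproc = 16 and cpuid(4)
  // reporting 8 cores per package, the fallback above yields
  //   nCoresPerPkg = 8, nPackages = (16 + 8 - 1) / 8 = 2,
  //   __kmp_ncores = 16, and __kmp_nThreadsPerCore = 1 (HT assumed off).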
1601 
1602   // From here on, we can assume that it is safe to call
1603   // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
1604   // __kmp_affinity_type = affinity_none.
1605 
1606   // Save the affinity mask for the current thread.
1607   kmp_affinity_raii_t previous_affinity;
1608 
1609   // Run through each of the available contexts, binding the current thread
1610   // to it, and obtaining the pertinent information using the cpuid instr.
1611   //
1612   // The relevant information is:
1613   // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
1614   //     has a unique Apic Id, which is of the form pkg# : core# : thread#.
1615   // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
1616   //     of this field determines the width of the core# + thread# fields in the
1617   //     Apic Id. It is also an upper bound on the number of threads per
1618   //     package, but it has been verified that situations happen where it is not
1619   //     exact. In particular, on certain OS/chip combinations where Intel(R)
1620   //     Hyper-Threading Technology is supported by the chip but has been
1621   //     disabled, the value of this field will be 2 (for a single core chip).
1622   //     On other OS/chip combinations supporting Intel(R) Hyper-Threading
1623   //     Technology, the value of this field will be 1 when Intel(R)
1624   //     Hyper-Threading Technology is disabled and 2 when it is enabled.
1625   // - Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4). The value
1626   //     of this field (+1) determines the width of the core# field in the Apic
1627   //     Id. The comments in "cpucount.cpp" say that this value is an upper
1628   //     bound, but the IA-32 architecture manual says that it is exactly the
1629   //     number of cores per package, and I haven't seen any case where it
1630   //     wasn't.
1631   //
1632   // From this information, deduce the package Id, core Id, and thread Id,
1633   // and set the corresponding fields in the apicThreadInfo struct.
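  //
  // A worked example with hypothetical values: if maxThreadsPerPkg = 8 and
  // maxCoresPerPkg = 4, then widthCT = 3, widthC = 2 and widthT = 1, so an
  // Apic Id of 13 (binary 1101) decodes as
  //   pkgId    = 13 >> 3        = 1
  //   coreId   = (13 >> 1) & 3  = 2
  //   threadId = 13 & 1         = 1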
1634   unsigned i;
1635   apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
1636       __kmp_avail_proc * sizeof(apicThreadInfo));
1637   unsigned nApics = 0;
1638   KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
1639     // Skip this proc if it is not included in the machine model.
1640     if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
1641       continue;
1642     }
1643     KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
1644 
1645     __kmp_affinity_dispatch->bind_thread(i);
1646     threadInfo[nApics].osId = i;
1647 
1648     // The apic id and max threads per pkg come from cpuid(1).
1649     __kmp_x86_cpuid(1, 0, &buf);
1650     if (((buf.edx >> 9) & 1) == 0) {
1651       __kmp_free(threadInfo);
1652       *msg_id = kmp_i18n_str_ApicNotPresent;
1653       return false;
1654     }
1655     threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
1656     threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
1657     if (threadInfo[nApics].maxThreadsPerPkg == 0) {
1658       threadInfo[nApics].maxThreadsPerPkg = 1;
1659     }
1660 
1661     // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
1662     // value.
1663     //
1664     // First, we need to check if cpuid(4) is supported on this chip. To see if
1665     // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
1666     // or greater.
1667     __kmp_x86_cpuid(0, 0, &buf);
1668     if (buf.eax >= 4) {
1669       __kmp_x86_cpuid(4, 0, &buf);
1670       threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1671     } else {
1672       threadInfo[nApics].maxCoresPerPkg = 1;
1673     }
1674 
1675     // Infer the pkgId / coreId / threadId using only the info obtained locally.
1676     int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
1677     threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
1678 
1679     int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
1680     int widthT = widthCT - widthC;
1681     if (widthT < 0) {
1682       // I've never seen this one happen, but I suppose it could, if the cpuid
1683       // instruction on a chip was really screwed up. Make sure to restore the
1684       // affinity mask before the tail call.
1685       __kmp_free(threadInfo);
1686       *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1687       return false;
1688     }
1689 
1690     int maskC = (1 << widthC) - 1;
1691     threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;
1692 
1693     int maskT = (1 << widthT) - 1;
1694     threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;
1695 
1696     nApics++;
1697   }
1698 
1699   // We've collected all the info we need.
1700   // Restore the old affinity mask for this thread.
1701   previous_affinity.restore();
1702 
1703   // Sort the threadInfo table by physical Id.
1704   qsort(threadInfo, nApics, sizeof(*threadInfo),
1705         __kmp_affinity_cmp_apicThreadInfo_phys_id);
1706 
1707   // The table is now sorted by pkgId / coreId / threadId, but we really don't
1708   // know the radix of any of the fields. pkgId's may be sparsely assigned among
1709   // the chips on a system. Although coreId's are usually assigned
1710   // [0 .. coresPerPkg-1] and threadId's are usually assigned
1711   // [0..threadsPerCore-1], we don't want to make any such assumptions.
1712   //
1713   // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
1714   // total # packages) are at this point - we want to determine that now. We
1715   // only have an upper bound on the first two figures.
1716   //
1717   // We also perform a consistency check at this point: the values returned by
1718   // the cpuid instruction for any thread bound to a given package had better
1719   // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
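  //
  // For example (hypothetical ids): if the sorted (pkgId, coreId, threadId)
  // tuples are (0,0,0) (0,0,1) (0,1,0) (0,1,1) (1,0,0) (1,0,1) (1,1,0) and
  // (1,1,1), the counting below ends with nPackages = 2, nCoresPerPkg = 2,
  // __kmp_nThreadsPerCore = 2 and __kmp_ncores = 4.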
1720   nPackages = 1;
1721   nCoresPerPkg = 1;
1722   __kmp_nThreadsPerCore = 1;
1723   unsigned nCores = 1;
1724 
1725   unsigned pkgCt = 1; // to determine radii
1726   unsigned lastPkgId = threadInfo[0].pkgId;
1727   unsigned coreCt = 1;
1728   unsigned lastCoreId = threadInfo[0].coreId;
1729   unsigned threadCt = 1;
1730   unsigned lastThreadId = threadInfo[0].threadId;
1731 
1732   // intra-pkg consistency checks
1733   unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1734   unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1735 
1736   for (i = 1; i < nApics; i++) {
1737     if (threadInfo[i].pkgId != lastPkgId) {
1738       nCores++;
1739       pkgCt++;
1740       lastPkgId = threadInfo[i].pkgId;
1741       if ((int)coreCt > nCoresPerPkg)
1742         nCoresPerPkg = coreCt;
1743       coreCt = 1;
1744       lastCoreId = threadInfo[i].coreId;
1745       if ((int)threadCt > __kmp_nThreadsPerCore)
1746         __kmp_nThreadsPerCore = threadCt;
1747       threadCt = 1;
1748       lastThreadId = threadInfo[i].threadId;
1749 
1750       // This is a different package, so go on to the next iteration without
1751       // doing any consistency checks. Reset the consistency check vars, though.
1752       prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1753       prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1754       continue;
1755     }
1756 
1757     if (threadInfo[i].coreId != lastCoreId) {
1758       nCores++;
1759       coreCt++;
1760       lastCoreId = threadInfo[i].coreId;
1761       if ((int)threadCt > __kmp_nThreadsPerCore)
1762         __kmp_nThreadsPerCore = threadCt;
1763       threadCt = 1;
1764       lastThreadId = threadInfo[i].threadId;
1765     } else if (threadInfo[i].threadId != lastThreadId) {
1766       threadCt++;
1767       lastThreadId = threadInfo[i].threadId;
1768     } else {
1769       __kmp_free(threadInfo);
1770       *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1771       return false;
1772     }
1773 
1774     // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
1775     // fields agree between all the threads bound to a given package.
1776     if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
1777         (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1778       __kmp_free(threadInfo);
1779       *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1780       return false;
1781     }
1782   }
1783   // When affinity is off, this routine will still be called to set
1784   // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1785   // Make sure all these vars are set correctly
1786   nPackages = pkgCt;
1787   if ((int)coreCt > nCoresPerPkg)
1788     nCoresPerPkg = coreCt;
1789   if ((int)threadCt > __kmp_nThreadsPerCore)
1790     __kmp_nThreadsPerCore = threadCt;
1791   __kmp_ncores = nCores;
1792   KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);
1793 
1794   // Now that we've determined the number of packages, the number of cores per
1795   // package, and the number of threads per core, we can construct the data
1796   // structure that is to be returned.
1797   int idx = 0;
1798   int pkgLevel = 0;
1799   int coreLevel = 1;
1800   int threadLevel = 2;
1802   int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
1803   kmp_hw_t types[3];
1804   if (pkgLevel >= 0)
1805     types[idx++] = KMP_HW_SOCKET;
1806   if (coreLevel >= 0)
1807     types[idx++] = KMP_HW_CORE;
1808   if (threadLevel >= 0)
1809     types[idx++] = KMP_HW_THREAD;
1810 
1811   KMP_ASSERT(depth > 0);
1812   __kmp_topology = kmp_topology_t::allocate(nApics, depth, types);
1813 
1814   for (i = 0; i < nApics; ++i) {
1815     idx = 0;
1816     unsigned os = threadInfo[i].osId;
1817     kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
1818     hw_thread.clear();
1819 
1820     if (pkgLevel >= 0) {
1821       hw_thread.ids[idx++] = threadInfo[i].pkgId;
1822     }
1823     if (coreLevel >= 0) {
1824       hw_thread.ids[idx++] = threadInfo[i].coreId;
1825     }
1826     if (threadLevel >= 0) {
1827       hw_thread.ids[idx++] = threadInfo[i].threadId;
1828     }
1829     hw_thread.os_id = os;
1830   }
1831 
1832   __kmp_free(threadInfo);
1833   __kmp_topology->sort_ids();
1834   if (!__kmp_topology->check_ids()) {
1835     kmp_topology_t::deallocate(__kmp_topology);
1836     __kmp_topology = nullptr;
1837     *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1838     return false;
1839   }
1840   return true;
1841 }
1842 
1843 // Hybrid cpu detection using CPUID.1A
1844 // Thread should be pinned to processor already
1845 static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type,
1846                                   unsigned *native_model_id) {
1847   kmp_cpuid buf;
1848   __kmp_x86_cpuid(0x1a, 0, &buf);
1849   *type = (kmp_hw_core_type_t)__kmp_extract_bits<24, 31>(buf.eax);
1850   *native_model_id = __kmp_extract_bits<0, 23>(buf.eax);
1851 }
1852 
1853 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1854 // architectures support a newer interface for specifying the x2APIC Ids,
1855 // based on CPUID.B or CPUID.1F
1856 /*
1857  * CPUID.B or 1F, Input ECX (sub leaf # aka level number)
1858     Bits            Bits            Bits           Bits
1859     31-16           15-8            7-5            4-0
1860 ---+-----------+--------------+-------------+-----------------+
1861 EAX| reserved  |   reserved   |   reserved  |  Bits to Shift  |
1862 ---+-----------|--------------+-------------+-----------------|
1863 EBX| reserved  | Num logical processors at level (16 bits)    |
1864 ---+-----------|--------------+-------------------------------|
1865 ECX| reserved  |   Level Type |      Level Number (8 bits)    |
1866 ---+-----------+--------------+-------------------------------|
1867 EDX|                    X2APIC ID (32 bits)                   |
1868 ---+----------------------------------------------------------+
1869 */
1870 
1871 enum {
1872   INTEL_LEVEL_TYPE_INVALID = 0, // Package level
1873   INTEL_LEVEL_TYPE_SMT = 1,
1874   INTEL_LEVEL_TYPE_CORE = 2,
1875   INTEL_LEVEL_TYPE_TILE = 3,
1876   INTEL_LEVEL_TYPE_MODULE = 4,
1877   INTEL_LEVEL_TYPE_DIE = 5,
1878   INTEL_LEVEL_TYPE_LAST = 6,
1879 };
1880 
1881 struct cpuid_level_info_t {
1882   unsigned level_type, mask, mask_width, nitems, cache_mask;
1883 };
1884 
1885 static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {
1886   switch (intel_type) {
1887   case INTEL_LEVEL_TYPE_INVALID:
1888     return KMP_HW_SOCKET;
1889   case INTEL_LEVEL_TYPE_SMT:
1890     return KMP_HW_THREAD;
1891   case INTEL_LEVEL_TYPE_CORE:
1892     return KMP_HW_CORE;
1893   case INTEL_LEVEL_TYPE_TILE:
1894     return KMP_HW_TILE;
1895   case INTEL_LEVEL_TYPE_MODULE:
1896     return KMP_HW_MODULE;
1897   case INTEL_LEVEL_TYPE_DIE:
1898     return KMP_HW_DIE;
1899   }
1900   return KMP_HW_UNKNOWN;
1901 }
1902 
1903 // This function takes the topology leaf, a levels array to store the levels
1904 // detected, and a bitmap of the known levels.
1905 // Returns the number of levels detected in the topology.
1906 static unsigned
1907 __kmp_x2apicid_get_levels(int leaf,
1908                           cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST],
1909                           kmp_uint64 known_levels) {
1910   unsigned level, levels_index;
1911   unsigned level_type, mask_width, nitems;
1912   kmp_cpuid buf;
1913 
1914   // The new algorithm folds any unknown topology layers into the nearest known
1915   // layer below them.
1916   // e.g., suppose the layers were SMT <X> CORE <Y> <Z> PACKAGE, where <X>,
1917   // <Y> and <Z> are unknown topology layers. Then SMT takes on the
1918   // characteristics of (SMT x <X>) and CORE takes on the characteristics of
1919   // (CORE x <Y> x <Z>). This eliminates the unknown portions of the topology
1920   // while still keeping the correct structure.
1921   level = levels_index = 0;
1922   do {
1923     __kmp_x86_cpuid(leaf, level, &buf);
1924     level_type = __kmp_extract_bits<8, 15>(buf.ecx);
1925     mask_width = __kmp_extract_bits<0, 4>(buf.eax);
1926     nitems = __kmp_extract_bits<0, 15>(buf.ebx);
1927     if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0)
1928       return 0;
1929 
1930     if (known_levels & (1ull << level_type)) {
1931       // Add a new level to the topology
1932       KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST);
1933       levels[levels_index].level_type = level_type;
1934       levels[levels_index].mask_width = mask_width;
1935       levels[levels_index].nitems = nitems;
1936       levels_index++;
1937     } else {
1938       // If it is an unknown level, then logically move the previous layer up
1939       if (levels_index > 0) {
1940         levels[levels_index - 1].mask_width = mask_width;
1941         levels[levels_index - 1].nitems = nitems;
1942       }
1943     }
1944     level++;
1945   } while (level_type != INTEL_LEVEL_TYPE_INVALID);
1946 
1947   // Set the masks to & with apicid
1948   for (unsigned i = 0; i < levels_index; ++i) {
1949     if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) {
1950       levels[i].mask = ~((-1) << levels[i].mask_width);
1951       levels[i].cache_mask = (-1) << levels[i].mask_width;
1952       for (unsigned j = 0; j < i; ++j)
1953         levels[i].mask ^= levels[j].mask;
1954     } else {
1955       KMP_DEBUG_ASSERT(levels_index > 0);
1956       levels[i].mask = (-1) << levels[i - 1].mask_width;
1957       levels[i].cache_mask = 0;
1958     }
1959   }
1960   return levels_index;
1961 }
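// Worked example (hypothetical CPUID values): suppose the leaf reports an SMT
// level with mask_width = 1 and a core level with mask_width = 4, followed by
// the terminating "invalid" (package) level. The loop above then produces
//   SMT:     mask = 0x00000001, cache_mask = 0xfffffffe
//   CORE:    mask = 0x0000000e, cache_mask = 0xfffffff0
//   PACKAGE: mask = 0xfffffff0, cache_mask = 0
// so an x2APIC id of 22 (binary 10110) later decodes to thread 0, core 3,
// package 1 in __kmp_affinity_create_x2apicid_map().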
1962 
1963 static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
1964 
1965   cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST];
1966   kmp_hw_t types[INTEL_LEVEL_TYPE_LAST];
1967   unsigned levels_index;
1968   kmp_cpuid buf;
1969   kmp_uint64 known_levels;
1970   int topology_leaf, highest_leaf, apic_id;
1971   int num_leaves;
1972   static int leaves[] = {0, 0};
1973 
1974   kmp_i18n_id_t leaf_message_id;
1975 
1976   KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST);
1977 
1978   *msg_id = kmp_i18n_null;
1979   if (__kmp_affinity_verbose) {
1980     KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
1981   }
1982 
1983   // Figure out the known topology levels
1984   known_levels = 0ull;
1985   for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) {
1986     if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) {
1987       known_levels |= (1ull << i);
1988     }
1989   }
1990 
1991   // Get the highest cpuid leaf supported
1992   __kmp_x86_cpuid(0, 0, &buf);
1993   highest_leaf = buf.eax;
1994 
1995   // If a specific topology method was requested, only allow that specific leaf
1996   // otherwise, try both leaves 31 and 11 in that order
1997   num_leaves = 0;
1998   if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
1999     num_leaves = 1;
2000     leaves[0] = 11;
2001     leaf_message_id = kmp_i18n_str_NoLeaf11Support;
2002   } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
2003     num_leaves = 1;
2004     leaves[0] = 31;
2005     leaf_message_id = kmp_i18n_str_NoLeaf31Support;
2006   } else {
2007     num_leaves = 2;
2008     leaves[0] = 31;
2009     leaves[1] = 11;
2010     leaf_message_id = kmp_i18n_str_NoLeaf11Support;
2011   }
2012 
2013   // Check to see if cpuid leaf 31 or 11 is supported.
2014   __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2015   topology_leaf = -1;
2016   for (int i = 0; i < num_leaves; ++i) {
2017     int leaf = leaves[i];
2018     if (highest_leaf < leaf)
2019       continue;
2020     __kmp_x86_cpuid(leaf, 0, &buf);
2021     if (buf.ebx == 0)
2022       continue;
2023     topology_leaf = leaf;
2024     levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels);
2025     if (levels_index == 0)
2026       continue;
2027     break;
2028   }
2029   if (topology_leaf == -1 || levels_index == 0) {
2030     *msg_id = leaf_message_id;
2031     return false;
2032   }
2033   KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST);
2034 
2035   // The algorithm used starts by setting the affinity to each available thread
2036   // and retrieving info from the cpuid instruction, so if we are not capable of
2037   // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
2038   // we need to do something else - use the defaults that we calculated from
2039   // issuing cpuid without binding to each proc.
2040   if (!KMP_AFFINITY_CAPABLE()) {
2041     // Hack to try and infer the machine topology using only the data
2042     // available from cpuid on the current thread, and __kmp_xproc.
2043     KMP_ASSERT(__kmp_affinity_type == affinity_none);
2044     for (unsigned i = 0; i < levels_index; ++i) {
2045       if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) {
2046         __kmp_nThreadsPerCore = levels[i].nitems;
2047       } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) {
2048         nCoresPerPkg = levels[i].nitems;
2049       }
2050     }
2051     __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
2052     nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
2053     return true;
2054   }
2055 
2056   // Allocate the data structure to be returned.
2057   int depth = levels_index;
2058   for (int i = depth - 1, j = 0; i >= 0; --i, ++j)
2059     types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type);
2060   __kmp_topology =
2061       kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types);
2062 
2063   // Insert equivalent cache types if they exist
2064   kmp_cache_info_t cache_info;
2065   for (size_t i = 0; i < cache_info.get_depth(); ++i) {
2066     const kmp_cache_info_t::info_t &info = cache_info[i];
2067     unsigned cache_mask = info.mask;
2068     unsigned cache_level = info.level;
2069     for (unsigned j = 0; j < levels_index; ++j) {
2070       unsigned hw_cache_mask = levels[j].cache_mask;
2071       kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level);
2072       if (hw_cache_mask == cache_mask && j < levels_index - 1) {
2073         kmp_hw_t type =
2074             __kmp_intel_type_2_topology_type(levels[j + 1].level_type);
2075         __kmp_topology->set_equivalent_type(cache_type, type);
2076       }
2077     }
2078   }
2079 
2080   // From here on, we can assume that it is safe to call
2081   // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
2082   // __kmp_affinity_type = affinity_none.
2083 
2084   // Save the affinity mask for the current thread.
2085   kmp_affinity_raii_t previous_affinity;
2086 
2087   // Run through each of the available contexts, binding the current thread
2088   // to it, and obtaining the pertinent information using the cpuid instr.
2089   unsigned int proc;
2090   int hw_thread_index = 0;
2091   KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
2092     cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST];
2093     unsigned my_levels_index;
2094 
2095     // Skip this proc if it is not included in the machine model.
2096     if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
2097       continue;
2098     }
2099     KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc);
2100 
2101     __kmp_affinity_dispatch->bind_thread(proc);
2102 
2103     // New algorithm
2104     __kmp_x86_cpuid(topology_leaf, 0, &buf);
2105     apic_id = buf.edx;
2106     kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
2107     my_levels_index =
2108         __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels);
2109     if (my_levels_index == 0 || my_levels_index != levels_index) {
2110       *msg_id = kmp_i18n_str_InvalidCpuidInfo;
2111       return false;
2112     }
2113     hw_thread.clear();
2114     hw_thread.os_id = proc;
2115     // Put in topology information
2116     for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) {
2117       hw_thread.ids[idx] = apic_id & my_levels[j].mask;
2118       if (j > 0) {
2119         hw_thread.ids[idx] >>= my_levels[j - 1].mask_width;
2120       }
2121     }
2122     // Hybrid information
2123     if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) {
2124       kmp_hw_core_type_t type;
2125       unsigned native_model_id;
2126       __kmp_get_hybrid_info(&type, &native_model_id);
2127       hw_thread.core_type = type;
2128     }
2129     hw_thread_index++;
2130   }
2131   KMP_ASSERT(hw_thread_index > 0);
2132   __kmp_topology->sort_ids();
2133   if (!__kmp_topology->check_ids()) {
2134     kmp_topology_t::deallocate(__kmp_topology);
2135     __kmp_topology = nullptr;
2136     *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
2137     return false;
2138   }
2139   return true;
2140 }
2141 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
2142 
2143 #define osIdIndex 0
2144 #define threadIdIndex 1
2145 #define coreIdIndex 2
2146 #define pkgIdIndex 3
2147 #define nodeIdIndex 4
2148 
2149 typedef unsigned *ProcCpuInfo;
2150 static unsigned maxIndex = pkgIdIndex;
2151 
2152 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
2153                                                   const void *b) {
2154   unsigned i;
2155   const unsigned *aa = *(unsigned *const *)a;
2156   const unsigned *bb = *(unsigned *const *)b;
2157   for (i = maxIndex;; i--) {
2158     if (aa[i] < bb[i])
2159       return -1;
2160     if (aa[i] > bb[i])
2161       return 1;
2162     if (i == osIdIndex)
2163       break;
2164   }
2165   return 0;
2166 }
2167 
2168 #if KMP_USE_HIER_SCHED
2169 // Set the array sizes for the hierarchy layers
2170 static void __kmp_dispatch_set_hierarchy_values() {
2171   // Set the maximum number of L1's to number of cores
2172   // Set the maximum number of L2's to either number of cores / 2 for
2173   // Intel(R) Xeon Phi(TM) coprocessor formerly codenamed Knights Landing
2174   // or the number of cores for Intel(R) Xeon(R) processors
2175   // Set the maximum number of NUMA nodes and L3's to number of packages
2176   __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
2177       nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
2178   __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
2179 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) &&   \
2180     KMP_MIC_SUPPORTED
2181   if (__kmp_mic_type >= mic3)
2182     __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2;
2183   else
2184 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS)
2185     __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores;
2186   __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages;
2187   __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages;
2188   __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1;
2189   // Set the number of threads per unit
2190   // Number of hardware threads per L1/L2/L3/NUMA/LOOP
2191   __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
2192   __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
2193       __kmp_nThreadsPerCore;
2194 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) &&   \
2195     KMP_MIC_SUPPORTED
2196   if (__kmp_mic_type >= mic3)
2197     __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
2198         2 * __kmp_nThreadsPerCore;
2199   else
2200 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS)
2201     __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
2202         __kmp_nThreadsPerCore;
2203   __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] =
2204       nCoresPerPkg * __kmp_nThreadsPerCore;
2205   __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] =
2206       nCoresPerPkg * __kmp_nThreadsPerCore;
2207   __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] =
2208       nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
2209 }
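// For example, on a hypothetical machine with 2 packages x 8 cores x 2 HW
// threads (and no MIC), the code above sets
//   max units:   THREAD = 32, L1 = 16, L2 = 16, L3 = 2, NUMA = 2, LOOP = 1
//   threads per: THREAD = 1, L1 = 2, L2 = 2, L3 = 16, NUMA = 16, LOOP = 32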
2210 
2211 // Return the index into the hierarchy for this tid and layer type (L1, L2, etc)
2212 // i.e., this thread's L1 or this thread's L2, etc.
2213 int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) {
2214   int index = type + 1;
2215   int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
2216   KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST);
2217   if (type == kmp_hier_layer_e::LAYER_THREAD)
2218     return tid;
2219   else if (type == kmp_hier_layer_e::LAYER_LOOP)
2220     return 0;
2221   KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0);
2222   if (tid >= num_hw_threads)
2223     tid = tid % num_hw_threads;
2224   return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index];
2225 }
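// Continuing the example above: with 2 HW threads per core and 16 cores,
// __kmp_dispatch_get_index(5, kmp_hier_layer_e::LAYER_L1) returns
// (5 / 2) % 16 = 2, i.e. tid 5 maps to the third L1 (core) in the hierarchy.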
2226 
2227 // Return the number of t1's per t2
2228 int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) {
2229   int i1 = t1 + 1;
2230   int i2 = t2 + 1;
2231   KMP_DEBUG_ASSERT(i1 <= i2);
2232   KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST);
2233   KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST);
2234   KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0);
2235   // (# threads per t2 unit) / (# threads per t1 unit) = # of t1 units per t2
2236   return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1];
2237 }
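// Continuing the example above: __kmp_dispatch_get_t1_per_t2(LAYER_L1,
// LAYER_L3) returns 16 / 2 = 8, i.e. eight cores (L1's) under each L3.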
2238 #endif // KMP_USE_HIER_SCHED
2239 
2240 static inline const char *__kmp_cpuinfo_get_filename() {
2241   const char *filename;
2242   if (__kmp_cpuinfo_file != nullptr)
2243     filename = __kmp_cpuinfo_file;
2244   else
2245     filename = "/proc/cpuinfo";
2246   return filename;
2247 }
2248 
2249 static inline const char *__kmp_cpuinfo_get_envvar() {
2250   const char *envvar = nullptr;
2251   if (__kmp_cpuinfo_file != nullptr)
2252     envvar = "KMP_CPUINFO_FILE";
2253   return envvar;
2254 }
2255 
2256 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
2257 // affinity map.
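//
// A minimal sketch of the kind of records the parser below accepts (field
// names are taken from the parsing code; a real /proc/cpuinfo typically
// carries many additional fields per record, which are simply skipped):
//
//   processor   : 0
//   physical id : 0
//   core id     : 0
//
//   processor   : 1
//   physical id : 0
//   core id     : 1
//
// Records are separated by blank lines; "thread id" and "node_<n> id" fields
// may also appear.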
2258 static bool __kmp_affinity_create_cpuinfo_map(int *line,
2259                                               kmp_i18n_id_t *const msg_id) {
2260   const char *filename = __kmp_cpuinfo_get_filename();
2261   const char *envvar = __kmp_cpuinfo_get_envvar();
2262   *msg_id = kmp_i18n_null;
2263 
2264   if (__kmp_affinity_verbose) {
2265     KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
2266   }
2267 
2268   kmp_safe_raii_file_t f(filename, "r", envvar);
2269 
2270   // Scan the file once to count the number of "processor" (osId) fields
2271   // and to find the highest value of <n> for a node_<n> field.
2272   char buf[256];
2273   unsigned num_records = 0;
2274   while (!feof(f)) {
2275     buf[sizeof(buf) - 1] = 1;
2276     if (!fgets(buf, sizeof(buf), f)) {
2277       // Read errors presumably because of EOF
2278       break;
2279     }
2280 
2281     char s1[] = "processor";
2282     if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2283       num_records++;
2284       continue;
2285     }
2286 
2287     // FIXME - this will match "node_<n> <garbage>"
2288     unsigned level;
2289     if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
2290       // validate the input first:
2291       if (level > (unsigned)__kmp_xproc) { // level is too big
2292         level = __kmp_xproc;
2293       }
2294       if (nodeIdIndex + level >= maxIndex) {
2295         maxIndex = nodeIdIndex + level;
2296       }
2297       continue;
2298     }
2299   }
2300 
2301   // Check for empty file / no valid processor records, or too many. The number
2302   // of records can't exceed the number of valid bits in the affinity mask.
2303   if (num_records == 0) {
2304     *msg_id = kmp_i18n_str_NoProcRecords;
2305     return false;
2306   }
2307   if (num_records > (unsigned)__kmp_xproc) {
2308     *msg_id = kmp_i18n_str_TooManyProcRecords;
2309     return false;
2310   }
2311 
2312   // Set the file pointer back to the beginning, so that we can scan the file
2313   // again, this time performing a full parse of the data. Allocate a vector of
2314   // ProcCpuInfo objects, where we will place the data. Adding an extra element
2315   // at the end allows us to remove a lot of extra checks for termination
2316   // conditions.
2317   if (fseek(f, 0, SEEK_SET) != 0) {
2318     *msg_id = kmp_i18n_str_CantRewindCpuinfo;
2319     return false;
2320   }
2321 
2322   // Allocate the array of records to store the proc info in.  The dummy
2323   // element at the end makes the logic in filling them out easier to code.
2324   unsigned **threadInfo =
2325       (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *));
2326   unsigned i;
2327   for (i = 0; i <= num_records; i++) {
2328     threadInfo[i] =
2329         (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2330   }
2331 
2332 #define CLEANUP_THREAD_INFO                                                    \
2333   for (i = 0; i <= num_records; i++) {                                         \
2334     __kmp_free(threadInfo[i]);                                                 \
2335   }                                                                            \
2336   __kmp_free(threadInfo);
2337 
2338   // A value of UINT_MAX means that we didn't find the field
2339   unsigned __index;
2340 
2341 #define INIT_PROC_INFO(p)                                                      \
2342   for (__index = 0; __index <= maxIndex; __index++) {                          \
2343     (p)[__index] = UINT_MAX;                                                   \
2344   }
2345 
2346   for (i = 0; i <= num_records; i++) {
2347     INIT_PROC_INFO(threadInfo[i]);
2348   }
2349 
2350   unsigned num_avail = 0;
2351   *line = 0;
2352   while (!feof(f)) {
2353     // Create an inner scoping level, so that all the goto targets at the end of
2354     // the loop appear in an outer scoping level. This avoids warnings about
2355     // jumping past an initialization to a target in the same block.
2356     {
2357       buf[sizeof(buf) - 1] = 1;
2358       bool long_line = false;
2359       if (!fgets(buf, sizeof(buf), f)) {
2360         // Read errors presumably because of EOF
2361         // If there is valid data in threadInfo[num_avail], then fake
2362         // a blank line to ensure that the last address gets parsed.
2363         bool valid = false;
2364         for (i = 0; i <= maxIndex; i++) {
2365           if (threadInfo[num_avail][i] != UINT_MAX) {
2366             valid = true;
2367           }
2368         }
2369         if (!valid) {
2370           break;
2371         }
2372         buf[0] = 0;
2373       } else if (!buf[sizeof(buf) - 1]) {
2374         // The line is longer than the buffer.  Set a flag and don't
2375         // emit an error if we were going to ignore the line, anyway.
2376         long_line = true;
2377 
2378 #define CHECK_LINE                                                             \
2379   if (long_line) {                                                             \
2380     CLEANUP_THREAD_INFO;                                                       \
2381     *msg_id = kmp_i18n_str_LongLineCpuinfo;                                    \
2382     return false;                                                              \
2383   }
2384       }
2385       (*line)++;
2386 
2387       char s1[] = "processor";
2388       if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2389         CHECK_LINE;
2390         char *p = strchr(buf + sizeof(s1) - 1, ':');
2391         unsigned val;
2392         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2393           goto no_val;
2394         if (threadInfo[num_avail][osIdIndex] != UINT_MAX)
2395 #if KMP_ARCH_AARCH64
2396           // Handle the old AArch64 /proc/cpuinfo layout differently:
2397           // it lists all of the 'processor' entries in a single
2398           // 'Processor' section, so the normal check for duplicates
2399           // in that section would always fire.
2400           num_avail++;
2401 #else
2402           goto dup_field;
2403 #endif
2404         threadInfo[num_avail][osIdIndex] = val;
2405 #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
2406         char path[256];
2407         KMP_SNPRINTF(
2408             path, sizeof(path),
2409             "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
2410             threadInfo[num_avail][osIdIndex]);
2411         __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
2412 
2413         KMP_SNPRINTF(path, sizeof(path),
2414                      "/sys/devices/system/cpu/cpu%u/topology/core_id",
2415                      threadInfo[num_avail][osIdIndex]);
2416         __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
2417         continue;
2418 #else
2419       }
2420       char s2[] = "physical id";
2421       if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
2422         CHECK_LINE;
2423         char *p = strchr(buf + sizeof(s2) - 1, ':');
2424         unsigned val;
2425         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2426           goto no_val;
2427         if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX)
2428           goto dup_field;
2429         threadInfo[num_avail][pkgIdIndex] = val;
2430         continue;
2431       }
2432       char s3[] = "core id";
2433       if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2434         CHECK_LINE;
2435         char *p = strchr(buf + sizeof(s3) - 1, ':');
2436         unsigned val;
2437         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2438           goto no_val;
2439         if (threadInfo[num_avail][coreIdIndex] != UINT_MAX)
2440           goto dup_field;
2441         threadInfo[num_avail][coreIdIndex] = val;
2442         continue;
2443 #endif // KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
2444       }
2445       char s4[] = "thread id";
2446       if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2447         CHECK_LINE;
2448         char *p = strchr(buf + sizeof(s4) - 1, ':');
2449         unsigned val;
2450         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2451           goto no_val;
2452         if (threadInfo[num_avail][threadIdIndex] != UINT_MAX)
2453           goto dup_field;
2454         threadInfo[num_avail][threadIdIndex] = val;
2455         continue;
2456       }
2457       unsigned level;
2458       if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
2459         CHECK_LINE;
2460         char *p = strchr(buf + sizeof(s4) - 1, ':');
2461         unsigned val;
2462         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2463           goto no_val;
2464         // validate the input before using level:
2465         if (level > (unsigned)__kmp_xproc) { // level is too big
2466           level = __kmp_xproc;
2467         }
2468         if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX)
2469           goto dup_field;
2470         threadInfo[num_avail][nodeIdIndex + level] = val;
2471         continue;
2472       }
2473 
2474       // We didn't recognize the leading token on the line. There are lots of
2475       // leading tokens that we don't recognize - if the line isn't empty, go on
2476       // to the next line.
2477       if ((*buf != 0) && (*buf != '\n')) {
2478         // If the line is longer than the buffer, read characters
2479         // until we find a newline.
2480         if (long_line) {
2481           int ch;
2482           while (((ch = fgetc(f)) != EOF) && (ch != '\n'))
2483             ;
2484         }
2485         continue;
2486       }
2487 
2488       // A newline has signalled the end of the processor record.
2489       // Check that there aren't too many procs specified.
2490       if ((int)num_avail == __kmp_xproc) {
2491         CLEANUP_THREAD_INFO;
2492         *msg_id = kmp_i18n_str_TooManyEntries;
2493         return false;
2494       }
2495 
2496       // Check for missing fields.  The osId field must be there, and we
2497       // currently require that the physical id field is specified, also.
2498       if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2499         CLEANUP_THREAD_INFO;
2500         *msg_id = kmp_i18n_str_MissingProcField;
2501         return false;
2502       }
2503       if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2504         CLEANUP_THREAD_INFO;
2505         *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2506         return false;
2507       }
2508 
2509       // Skip this proc if it is not included in the machine model.
2510       if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex],
2511                          __kmp_affin_fullMask)) {
2512         INIT_PROC_INFO(threadInfo[num_avail]);
2513         continue;
2514       }
2515 
2516       // We have a successful parse of this proc's info.
2517       // Increment the counter, and prepare for the next proc.
2518       num_avail++;
2519       KMP_ASSERT(num_avail <= num_records);
2520       INIT_PROC_INFO(threadInfo[num_avail]);
2521     }
2522     continue;
2523 
2524   no_val:
2525     CLEANUP_THREAD_INFO;
2526     *msg_id = kmp_i18n_str_MissingValCpuinfo;
2527     return false;
2528 
2529   dup_field:
2530     CLEANUP_THREAD_INFO;
2531     *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2532     return false;
2533   }
2534   *line = 0;
2535 
2536 #if KMP_MIC && REDUCE_TEAM_SIZE
2537   unsigned teamSize = 0;
2538 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2539 
2540   // check for num_records == __kmp_xproc ???
2541 
2542   // If it is configured to omit the package level when there is only a single
2543   // package, the logic at the end of this routine won't work if there is only a
2544   // single thread
2545   KMP_ASSERT(num_avail > 0);
2546   KMP_ASSERT(num_avail <= num_records);
2547 
2548   // Sort the threadInfo table by physical Id.
2549   qsort(threadInfo, num_avail, sizeof(*threadInfo),
2550         __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2551 
2552   // The table is now sorted by pkgId / coreId / threadId, but we really don't
2553   // know the radix of any of the fields. pkgId's may be sparsely assigned among
2554   // the chips on a system. Although coreId's are usually assigned
2555   // [0 .. coresPerPkg-1] and threadId's are usually assigned
2556   // [0..threadsPerCore-1], we don't want to make any such assumptions.
2557   //
2558   // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
2559   // total # packages) are at this point - we want to determine that now. We
2560   // only have an upper bound on the first two figures.
2561   unsigned *counts =
2562       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2563   unsigned *maxCt =
2564       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2565   unsigned *totals =
2566       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2567   unsigned *lastId =
2568       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2569 
2570   bool assign_thread_ids = false;
2571   unsigned threadIdCt;
2572   unsigned index;
2573 
2574 restart_radix_check:
2575   threadIdCt = 0;
2576 
2577   // Initialize the counter arrays with data from threadInfo[0].
2578   if (assign_thread_ids) {
2579     if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2580       threadInfo[0][threadIdIndex] = threadIdCt++;
2581     } else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2582       threadIdCt = threadInfo[0][threadIdIndex] + 1;
2583     }
2584   }
2585   for (index = 0; index <= maxIndex; index++) {
2586     counts[index] = 1;
2587     maxCt[index] = 1;
2588     totals[index] = 1;
2589     lastId[index] = threadInfo[0][index];
2591   }
2592 
2593   // Run through the rest of the OS procs.
2594   for (i = 1; i < num_avail; i++) {
2595     // Find the most significant index whose id differs from the id for the
2596     // previous OS proc.
2597     for (index = maxIndex; index >= threadIdIndex; index--) {
2598       if (assign_thread_ids && (index == threadIdIndex)) {
2599         // Auto-assign the thread id field if it wasn't specified.
2600         if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2601           threadInfo[i][threadIdIndex] = threadIdCt++;
2602         }
2603         // Apparently the thread id field was specified for some entries and not
2604         // others. Start the thread id counter off at the next higher thread id.
2605         else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2606           threadIdCt = threadInfo[i][threadIdIndex] + 1;
2607         }
2608       }
2609       if (threadInfo[i][index] != lastId[index]) {
2610         // Run through all indices which are less significant, and reset the
2611         // counts to 1. At all levels up to and including index, we need to
2612         // increment the totals and record the last id.
2613         unsigned index2;
2614         for (index2 = threadIdIndex; index2 < index; index2++) {
2615           totals[index2]++;
2616           if (counts[index2] > maxCt[index2]) {
2617             maxCt[index2] = counts[index2];
2618           }
2619           counts[index2] = 1;
2620           lastId[index2] = threadInfo[i][index2];
2621         }
2622         counts[index]++;
2623         totals[index]++;
2624         lastId[index] = threadInfo[i][index];
2625 
2626         if (assign_thread_ids && (index > threadIdIndex)) {
2627 
2628 #if KMP_MIC && REDUCE_TEAM_SIZE
2629           // The default team size is the total #threads in the machine
2630           // minus 1 thread for every core that has 3 or more threads.
2631           teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
2632 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2633 
2634           // Restart the thread counter, as we are on a new core.
2635           threadIdCt = 0;
2636 
2637           // Auto-assign the thread id field if it wasn't specified.
2638           if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2639             threadInfo[i][threadIdIndex] = threadIdCt++;
2640           }
2641 
2642           // Apparently the thread id field was specified for some entries and
2643           // not others. Start the thread id counter off at the next higher
2644           // thread id.
2645           else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2646             threadIdCt = threadInfo[i][threadIdIndex] + 1;
2647           }
2648         }
2649         break;
2650       }
2651     }
2652     if (index < threadIdIndex) {
2653       // If thread ids were specified, it is an error if they are not unique.
2654       // Also, check that we haven't already restarted the loop (to be safe -
2655       // shouldn't need to).
2656       if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
2657         __kmp_free(lastId);
2658         __kmp_free(totals);
2659         __kmp_free(maxCt);
2660         __kmp_free(counts);
2661         CLEANUP_THREAD_INFO;
2662         *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2663         return false;
2664       }
2665 
2666       // If the thread ids were not specified and we see entries that
2667       // are duplicates, start the loop over and assign the thread ids manually.
2668       assign_thread_ids = true;
2669       goto restart_radix_check;
2670     }
2671   }
2672 
2673 #if KMP_MIC && REDUCE_TEAM_SIZE
2674   // The default team size is the total #threads in the machine
2675   // minus 1 thread for every core that has 3 or more threads.
2676   teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
2677 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2678 
2679   for (index = threadIdIndex; index <= maxIndex; index++) {
2680     if (counts[index] > maxCt[index]) {
2681       maxCt[index] = counts[index];
2682     }
2683   }
2684 
2685   __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2686   nCoresPerPkg = maxCt[coreIdIndex];
2687   nPackages = totals[pkgIdIndex];
2688 
2689   // When affinity is off, this routine will still be called to set
2690   // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
2691   // Make sure all these vars are set correctly, and return now if affinity is
2692   // not enabled.
2693   __kmp_ncores = totals[coreIdIndex];
2694   if (!KMP_AFFINITY_CAPABLE()) {
2695     KMP_ASSERT(__kmp_affinity_type == affinity_none);
2696     return true;
2697   }
2698 
2699 #if KMP_MIC && REDUCE_TEAM_SIZE
2700   // Set the default team size.
2701   if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2702     __kmp_dflt_team_nth = teamSize;
2703     KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting "
2704                   "__kmp_dflt_team_nth = %d\n",
2705                   __kmp_dflt_team_nth));
2706   }
2707 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2708 
2709   KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc);
2710 
2711   // Count the number of levels which have more nodes at that level than at the
2712   // parent's level (with there being an implicit root node of the top level).
2713   // This is equivalent to saying that there is at least one node at this level
2714   // which has a sibling. These levels are in the map, and the package level is
2715   // always in the map.
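  //
  // For example (hypothetical counts): on a single-package machine with 8
  // cores, 2 threads per core, and no node_<n> fields, totals would be
  // 16 / 8 / 1 for the thread / core / package indices, so the comparisons
  // below set inMap for the thread and core levels and the package level is
  // forced to true, giving depth = 3.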
2716   bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2717   for (index = threadIdIndex; index < maxIndex; index++) {
2718     KMP_ASSERT(totals[index] >= totals[index + 1]);
2719     inMap[index] = (totals[index] > totals[index + 1]);
2720   }
2721   inMap[maxIndex] = (totals[maxIndex] > 1);
2722   inMap[pkgIdIndex] = true;
2723   inMap[coreIdIndex] = true;
2724   inMap[threadIdIndex] = true;
2725 
2726   int depth = 0;
2727   int idx = 0;
2728   kmp_hw_t types[KMP_HW_LAST];
2729   int pkgLevel = -1;
2730   int coreLevel = -1;
2731   int threadLevel = -1;
2732   for (index = threadIdIndex; index <= maxIndex; index++) {
2733     if (inMap[index]) {
2734       depth++;
2735     }
2736   }
2737   if (inMap[pkgIdIndex]) {
2738     pkgLevel = idx;
2739     types[idx++] = KMP_HW_SOCKET;
2740   }
2741   if (inMap[coreIdIndex]) {
2742     coreLevel = idx;
2743     types[idx++] = KMP_HW_CORE;
2744   }
2745   if (inMap[threadIdIndex]) {
2746     threadLevel = idx;
2747     types[idx++] = KMP_HW_THREAD;
2748   }
2749   KMP_ASSERT(depth > 0);
2750 
2751   // Construct the data structure that is to be returned.
2752   __kmp_topology = kmp_topology_t::allocate(num_avail, depth, types);
2753 
2754   for (i = 0; i < num_avail; ++i) {
2755     unsigned os = threadInfo[i][osIdIndex];
2756     int src_index;
2757     int dst_index = 0;
2758     kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
2759     hw_thread.clear();
2760     hw_thread.os_id = os;
2761 
2762     idx = 0;
2763     for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2764       if (!inMap[src_index]) {
2765         continue;
2766       }
2767       if (src_index == pkgIdIndex) {
2768         hw_thread.ids[pkgLevel] = threadInfo[i][src_index];
2769       } else if (src_index == coreIdIndex) {
2770         hw_thread.ids[coreLevel] = threadInfo[i][src_index];
2771       } else if (src_index == threadIdIndex) {
2772         hw_thread.ids[threadLevel] = threadInfo[i][src_index];
2773       }
2774       dst_index++;
2775     }
2776   }
2777 
2778   __kmp_free(inMap);
2779   __kmp_free(lastId);
2780   __kmp_free(totals);
2781   __kmp_free(maxCt);
2782   __kmp_free(counts);
2783   CLEANUP_THREAD_INFO;
2784   __kmp_topology->sort_ids();
2785   if (!__kmp_topology->check_ids()) {
2786     kmp_topology_t::deallocate(__kmp_topology);
2787     __kmp_topology = nullptr;
2788     *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2789     return false;
2790   }
2791   return true;
2792 }
2793 
2794 // Create and return a table of affinity masks, indexed by OS thread ID.
2795 // This routine handles OR'ing together all the affinity masks of threads
2796 // that are sufficiently close, if granularity > fine.
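// For example, with a hypothetical granularity=core setting on an SMT-2
// machine (__kmp_affinity_gran_levels == 1), two hardware threads on the same
// core with OS ids 3 and 11 are "sufficiently close", so both osId2Mask[3]
// and osId2Mask[11] end up holding the same mask with bits {3, 11} set.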
2797 static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex,
2798                                             unsigned *numUnique) {
2799   // First form a table of affinity masks in order of OS thread id.
2800   int maxOsId;
2801   int i;
2802   int numAddrs = __kmp_topology->get_num_hw_threads();
2803   int depth = __kmp_topology->get_depth();
2804   KMP_ASSERT(numAddrs);
2805   KMP_ASSERT(depth);
2806 
2807   maxOsId = 0;
2808   for (i = numAddrs - 1;; --i) {
2809     int osId = __kmp_topology->at(i).os_id;
2810     if (osId > maxOsId) {
2811       maxOsId = osId;
2812     }
2813     if (i == 0)
2814       break;
2815   }
2816   kmp_affin_mask_t *osId2Mask;
2817   KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1));
2818   KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2819   if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2820     KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
2821   }
2822   if (__kmp_affinity_gran_levels >= (int)depth) {
2823     if (__kmp_affinity_verbose ||
2824         (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
2825       KMP_WARNING(AffThreadsMayMigrate);
2826     }
2827   }
2828 
2829   // Run through the table, forming the masks for all threads on each core.
2830   // Threads on the same core will have identical kmp_hw_thread_t objects, not
2831   // considering the last level, which must be the thread id. All threads on a
2832   // core will appear consecutively.
2833   int unique = 0;
2834   int j = 0; // index of 1st thread on core
2835   int leader = 0;
2836   kmp_affin_mask_t *sum;
2837   KMP_CPU_ALLOC_ON_STACK(sum);
2838   KMP_CPU_ZERO(sum);
2839   KMP_CPU_SET(__kmp_topology->at(0).os_id, sum);
2840   for (i = 1; i < numAddrs; i++) {
2841     // If this thread is sufficiently close to the leader (within the
2842     // granularity setting), then set the bit for this os thread in the
2843     // affinity mask for this group, and go on to the next thread.
2844     if (__kmp_topology->is_close(leader, i, __kmp_affinity_gran_levels)) {
2845       KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
2846       continue;
2847     }
2848 
2849     // For every thread in this group, copy the mask to the thread's entry in
2850     // the osId2Mask table.  Mark the first address as a leader.
2851     for (; j < i; j++) {
2852       int osId = __kmp_topology->at(j).os_id;
2853       KMP_DEBUG_ASSERT(osId <= maxOsId);
2854       kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2855       KMP_CPU_COPY(mask, sum);
2856       __kmp_topology->at(j).leader = (j == leader);
2857     }
2858     unique++;
2859 
2860     // Start a new mask.
2861     leader = i;
2862     KMP_CPU_ZERO(sum);
2863     KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
2864   }
2865 
2866   // For every thread in last group, copy the mask to the thread's
2867   // entry in the osId2Mask table.
2868   for (; j < i; j++) {
2869     int osId = __kmp_topology->at(j).os_id;
2870     KMP_DEBUG_ASSERT(osId <= maxOsId);
2871     kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2872     KMP_CPU_COPY(mask, sum);
2873     __kmp_topology->at(j).leader = (j == leader);
2874   }
2875   unique++;
2876   KMP_CPU_FREE_FROM_STACK(sum);
2877 
2878   *maxIndex = maxOsId;
2879   *numUnique = unique;
2880   return osId2Mask;
2881 }
2882 
2883 // Stuff for the affinity proclist parsers.  It's easier to declare these vars
2884 // as file-static than to try and pass them through the calling sequence of
2885 // the recursive-descent OMP_PLACES parser.
2886 static kmp_affin_mask_t *newMasks;
2887 static int numNewMasks;
2888 static int nextNewMask;
2889 
2890 #define ADD_MASK(_mask)                                                        \
2891   {                                                                            \
2892     if (nextNewMask >= numNewMasks) {                                          \
2893       int i;                                                                   \
2894       numNewMasks *= 2;                                                        \
2895       kmp_affin_mask_t *temp;                                                  \
2896       KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks);                         \
2897       for (i = 0; i < numNewMasks / 2; i++) {                                  \
2898         kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);                    \
2899         kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i);                       \
2900         KMP_CPU_COPY(dest, src);                                               \
2901       }                                                                        \
2902       KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2);                  \
2903       newMasks = temp;                                                         \
2904     }                                                                          \
2905     KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));               \
2906     nextNewMask++;                                                             \
2907   }
2908 
2909 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId)                             \
2910   {                                                                            \
2911     if (((_osId) > _maxOsId) ||                                                \
2912         (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) {     \
2913       if (__kmp_affinity_verbose ||                                            \
2914           (__kmp_affinity_warnings &&                                          \
2915            (__kmp_affinity_type != affinity_none))) {                          \
2916         KMP_WARNING(AffIgnoreInvalidProcID, _osId);                            \
2917       }                                                                        \
2918     } else {                                                                   \
2919       ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));                            \
2920     }                                                                          \
2921   }
2922 
2923 // Re-parse the proclist (for the explicit affinity type), and form the list
2924 // of affinity newMasks indexed by gtid.
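// For illustration, a proclist such as "0,2-4,{6,7}" yields one mask per
// entry: {0}, then {2}, {3}, {4} (one mask per proc in the range), and a
// single combined mask {6,7} for the braced set. A range may also carry a
// stride, e.g. "0-6:2" yields {0}, {2}, {4}, {6}.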
2925 static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2926                                             unsigned int *out_numMasks,
2927                                             const char *proclist,
2928                                             kmp_affin_mask_t *osId2Mask,
2929                                             int maxOsId) {
2930   int i;
2931   const char *scan = proclist;
2932   const char *next = proclist;
2933 
  // The temporary newMasks array starts small and is grown (by doubling
  // inside ADD_MASK) as masks are appended to it.
2936   numNewMasks = 2;
2937   KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
2938   nextNewMask = 0;
2939   kmp_affin_mask_t *sumMask;
2940   KMP_CPU_ALLOC(sumMask);
2941   int setSize = 0;
2942 
2943   for (;;) {
2944     int start, end, stride;
2945 
2946     SKIP_WS(scan);
2947     next = scan;
2948     if (*next == '\0') {
2949       break;
2950     }
2951 
2952     if (*next == '{') {
2953       int num;
2954       setSize = 0;
2955       next++; // skip '{'
2956       SKIP_WS(next);
2957       scan = next;
2958 
2959       // Read the first integer in the set.
2960       KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist");
2961       SKIP_DIGITS(next);
2962       num = __kmp_str_to_int(scan, *next);
2963       KMP_ASSERT2(num >= 0, "bad explicit proc list");
2964 
2965       // Copy the mask for that osId to the sum (union) mask.
2966       if ((num > maxOsId) ||
2967           (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2968         if (__kmp_affinity_verbose ||
2969             (__kmp_affinity_warnings &&
2970              (__kmp_affinity_type != affinity_none))) {
2971           KMP_WARNING(AffIgnoreInvalidProcID, num);
2972         }
2973         KMP_CPU_ZERO(sumMask);
2974       } else {
2975         KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2976         setSize = 1;
2977       }
2978 
2979       for (;;) {
2980         // Check for end of set.
2981         SKIP_WS(next);
2982         if (*next == '}') {
2983           next++; // skip '}'
2984           break;
2985         }
2986 
2987         // Skip optional comma.
2988         if (*next == ',') {
2989           next++;
2990         }
2991         SKIP_WS(next);
2992 
2993         // Read the next integer in the set.
2994         scan = next;
2995         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2996 
2997         SKIP_DIGITS(next);
2998         num = __kmp_str_to_int(scan, *next);
2999         KMP_ASSERT2(num >= 0, "bad explicit proc list");
3000 
3001         // Add the mask for that osId to the sum mask.
3002         if ((num > maxOsId) ||
3003             (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3004           if (__kmp_affinity_verbose ||
3005               (__kmp_affinity_warnings &&
3006                (__kmp_affinity_type != affinity_none))) {
3007             KMP_WARNING(AffIgnoreInvalidProcID, num);
3008           }
3009         } else {
3010           KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
3011           setSize++;
3012         }
3013       }
3014       if (setSize > 0) {
3015         ADD_MASK(sumMask);
3016       }
3017 
3018       SKIP_WS(next);
3019       if (*next == ',') {
3020         next++;
3021       }
3022       scan = next;
3023       continue;
3024     }
3025 
3026     // Read the first integer.
3027     KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3028     SKIP_DIGITS(next);
3029     start = __kmp_str_to_int(scan, *next);
3030     KMP_ASSERT2(start >= 0, "bad explicit proc list");
3031     SKIP_WS(next);
3032 
3033     // If this isn't a range, then add a mask to the list and go on.
3034     if (*next != '-') {
3035       ADD_MASK_OSID(start, osId2Mask, maxOsId);
3036 
3037       // Skip optional comma.
3038       if (*next == ',') {
3039         next++;
3040       }
3041       scan = next;
3042       continue;
3043     }
3044 
3045     // This is a range.  Skip over the '-' and read in the 2nd int.
3046     next++; // skip '-'
3047     SKIP_WS(next);
3048     scan = next;
3049     KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3050     SKIP_DIGITS(next);
3051     end = __kmp_str_to_int(scan, *next);
3052     KMP_ASSERT2(end >= 0, "bad explicit proc list");
3053 
3054     // Check for a stride parameter
3055     stride = 1;
3056     SKIP_WS(next);
3057     if (*next == ':') {
      // A stride is specified.  Skip over the ':' and read the 3rd int.
3059       int sign = +1;
3060       next++; // skip ':'
3061       SKIP_WS(next);
3062       scan = next;
3063       if (*next == '-') {
3064         sign = -1;
3065         next++;
3066         SKIP_WS(next);
3067         scan = next;
3068       }
3069       KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3070       SKIP_DIGITS(next);
3071       stride = __kmp_str_to_int(scan, *next);
3072       KMP_ASSERT2(stride >= 0, "bad explicit proc list");
3073       stride *= sign;
3074     }
3075 
3076     // Do some range checks.
3077     KMP_ASSERT2(stride != 0, "bad explicit proc list");
3078     if (stride > 0) {
3079       KMP_ASSERT2(start <= end, "bad explicit proc list");
3080     } else {
3081       KMP_ASSERT2(start >= end, "bad explicit proc list");
3082     }
3083     KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
3084 
3085     // Add the mask for each OS proc # to the list.
3086     if (stride > 0) {
3087       do {
3088         ADD_MASK_OSID(start, osId2Mask, maxOsId);
3089         start += stride;
3090       } while (start <= end);
3091     } else {
3092       do {
3093         ADD_MASK_OSID(start, osId2Mask, maxOsId);
3094         start += stride;
3095       } while (start >= end);
3096     }
3097 
3098     // Skip optional comma.
3099     SKIP_WS(next);
3100     if (*next == ',') {
3101       next++;
3102     }
3103     scan = next;
3104   }
3105 
3106   *out_numMasks = nextNewMask;
3107   if (nextNewMask == 0) {
3108     *out_masks = NULL;
    KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
    KMP_CPU_FREE(sumMask);
    return;
3111   }
3112   KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3113   for (i = 0; i < nextNewMask; i++) {
3114     kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
3115     kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
3116     KMP_CPU_COPY(dest, src);
3117   }
3118   KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3119   KMP_CPU_FREE(sumMask);
3120 }
3121 
3122 /*-----------------------------------------------------------------------------
3123 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
places.  Here is the grammar again; a short example follows it:
3125 
3126 place_list := place
3127 place_list := place , place_list
3128 place := num
3129 place := place : num
3130 place := place : num : signed
place := { subplace_list }
3132 place := ! place                  // (lowest priority)
3133 subplace_list := subplace
3134 subplace_list := subplace , subplace_list
3135 subplace := num
3136 subplace := num : num
3137 subplace := num : num : signed
3138 signed := num
3139 signed := + signed
3140 signed := - signed
3141 -----------------------------------------------------------------------------*/
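/* For illustration under this grammar: "{0,1},{2,3}" describes two places,
   {0,1} and {2,3}; "{0:4}" is one place containing procs 0-3 (start 0,
   count 4); and "{0,1}:4:2" replicates the place {0,1} four times with an
   OS-proc stride of 2, giving {0,1},{2,3},{4,5},{6,7}. */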
3142 static void __kmp_process_subplace_list(const char **scan,
3143                                         kmp_affin_mask_t *osId2Mask,
3144                                         int maxOsId, kmp_affin_mask_t *tempMask,
3145                                         int *setSize) {
3146   const char *next;
3147 
3148   for (;;) {
3149     int start, count, stride, i;
3150 
3151     // Read in the starting proc id
3152     SKIP_WS(*scan);
3153     KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3154     next = *scan;
3155     SKIP_DIGITS(next);
3156     start = __kmp_str_to_int(*scan, *next);
3157     KMP_ASSERT(start >= 0);
3158     *scan = next;
3159 
3160     // valid follow sets are ',' ':' and '}'
3161     SKIP_WS(*scan);
3162     if (**scan == '}' || **scan == ',') {
3163       if ((start > maxOsId) ||
3164           (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3165         if (__kmp_affinity_verbose ||
3166             (__kmp_affinity_warnings &&
3167              (__kmp_affinity_type != affinity_none))) {
3168           KMP_WARNING(AffIgnoreInvalidProcID, start);
3169         }
3170       } else {
3171         KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3172         (*setSize)++;
3173       }
3174       if (**scan == '}') {
3175         break;
3176       }
3177       (*scan)++; // skip ','
3178       continue;
3179     }
3180     KMP_ASSERT2(**scan == ':', "bad explicit places list");
3181     (*scan)++; // skip ':'
3182 
3183     // Read count parameter
3184     SKIP_WS(*scan);
3185     KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3186     next = *scan;
3187     SKIP_DIGITS(next);
3188     count = __kmp_str_to_int(*scan, *next);
3189     KMP_ASSERT(count >= 0);
3190     *scan = next;
3191 
3192     // valid follow sets are ',' ':' and '}'
3193     SKIP_WS(*scan);
3194     if (**scan == '}' || **scan == ',') {
3195       for (i = 0; i < count; i++) {
3196         if ((start > maxOsId) ||
3197             (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3198           if (__kmp_affinity_verbose ||
3199               (__kmp_affinity_warnings &&
3200                (__kmp_affinity_type != affinity_none))) {
3201             KMP_WARNING(AffIgnoreInvalidProcID, start);
3202           }
3203           break; // don't proliferate warnings for large count
3204         } else {
3205           KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3206           start++;
3207           (*setSize)++;
3208         }
3209       }
3210       if (**scan == '}') {
3211         break;
3212       }
3213       (*scan)++; // skip ','
3214       continue;
3215     }
3216     KMP_ASSERT2(**scan == ':', "bad explicit places list");
3217     (*scan)++; // skip ':'
3218 
3219     // Read stride parameter
3220     int sign = +1;
3221     for (;;) {
3222       SKIP_WS(*scan);
3223       if (**scan == '+') {
3224         (*scan)++; // skip '+'
3225         continue;
3226       }
3227       if (**scan == '-') {
3228         sign *= -1;
3229         (*scan)++; // skip '-'
3230         continue;
3231       }
3232       break;
3233     }
3234     SKIP_WS(*scan);
3235     KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3236     next = *scan;
3237     SKIP_DIGITS(next);
3238     stride = __kmp_str_to_int(*scan, *next);
3239     KMP_ASSERT(stride >= 0);
3240     *scan = next;
3241     stride *= sign;
3242 
3243     // valid follow sets are ',' and '}'
3244     SKIP_WS(*scan);
3245     if (**scan == '}' || **scan == ',') {
3246       for (i = 0; i < count; i++) {
3247         if ((start > maxOsId) ||
3248             (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3249           if (__kmp_affinity_verbose ||
3250               (__kmp_affinity_warnings &&
3251                (__kmp_affinity_type != affinity_none))) {
3252             KMP_WARNING(AffIgnoreInvalidProcID, start);
3253           }
3254           break; // don't proliferate warnings for large count
3255         } else {
3256           KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3257           start += stride;
3258           (*setSize)++;
3259         }
3260       }
3261       if (**scan == '}') {
3262         break;
3263       }
3264       (*scan)++; // skip ','
3265       continue;
3266     }
3267 
3268     KMP_ASSERT2(0, "bad explicit places list");
3269   }
3270 }
3271 
3272 static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3273                                 int maxOsId, kmp_affin_mask_t *tempMask,
3274                                 int *setSize) {
3275   const char *next;
3276 
3277   // valid follow sets are '{' '!' and num
3278   SKIP_WS(*scan);
3279   if (**scan == '{') {
3280     (*scan)++; // skip '{'
3281     __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize);
3282     KMP_ASSERT2(**scan == '}', "bad explicit places list");
3283     (*scan)++; // skip '}'
3284   } else if (**scan == '!') {
3285     (*scan)++; // skip '!'
3286     __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
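    // The '!' operator negates the place just parsed: complement tempMask
    // over the OS proc ids up to maxOsId.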
3287     KMP_CPU_COMPLEMENT(maxOsId, tempMask);
3288   } else if ((**scan >= '0') && (**scan <= '9')) {
3289     next = *scan;
3290     SKIP_DIGITS(next);
3291     int num = __kmp_str_to_int(*scan, *next);
3292     KMP_ASSERT(num >= 0);
3293     if ((num > maxOsId) ||
3294         (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3295       if (__kmp_affinity_verbose ||
3296           (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
3297         KMP_WARNING(AffIgnoreInvalidProcID, num);
3298       }
3299     } else {
3300       KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3301       (*setSize)++;
3302     }
3303     *scan = next; // skip num
3304   } else {
3305     KMP_ASSERT2(0, "bad explicit places list");
3306   }
3307 }
3308 
3310 void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3311                                       unsigned int *out_numMasks,
3312                                       const char *placelist,
3313                                       kmp_affin_mask_t *osId2Mask,
3314                                       int maxOsId) {
3315   int i, j, count, stride, sign;
3316   const char *scan = placelist;
3317   const char *next = placelist;
3318 
3319   numNewMasks = 2;
3320   KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
3321   nextNewMask = 0;
3322 
3323   // tempMask is modified based on the previous or initial
3324   //   place to form the current place
3325   // previousMask contains the previous place
3326   kmp_affin_mask_t *tempMask;
3327   kmp_affin_mask_t *previousMask;
3328   KMP_CPU_ALLOC(tempMask);
3329   KMP_CPU_ZERO(tempMask);
3330   KMP_CPU_ALLOC(previousMask);
3331   KMP_CPU_ZERO(previousMask);
3332   int setSize = 0;
3333 
3334   for (;;) {
3335     __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3336 
3337     // valid follow sets are ',' ':' and EOL
3338     SKIP_WS(scan);
3339     if (*scan == '\0' || *scan == ',') {
3340       if (setSize > 0) {
3341         ADD_MASK(tempMask);
3342       }
3343       KMP_CPU_ZERO(tempMask);
3344       setSize = 0;
3345       if (*scan == '\0') {
3346         break;
3347       }
3348       scan++; // skip ','
3349       continue;
3350     }
3351 
3352     KMP_ASSERT2(*scan == ':', "bad explicit places list");
3353     scan++; // skip ':'
3354 
3355     // Read count parameter
3356     SKIP_WS(scan);
3357     KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
3358     next = scan;
3359     SKIP_DIGITS(next);
3360     count = __kmp_str_to_int(scan, *next);
3361     KMP_ASSERT(count >= 0);
3362     scan = next;
3363 
3364     // valid follow sets are ',' ':' and EOL
3365     SKIP_WS(scan);
3366     if (*scan == '\0' || *scan == ',') {
3367       stride = +1;
3368     } else {
3369       KMP_ASSERT2(*scan == ':', "bad explicit places list");
3370       scan++; // skip ':'
3371 
3372       // Read stride parameter
3373       sign = +1;
3374       for (;;) {
3375         SKIP_WS(scan);
3376         if (*scan == '+') {
3377           scan++; // skip '+'
3378           continue;
3379         }
3380         if (*scan == '-') {
3381           sign *= -1;
3382           scan++; // skip '-'
3383           continue;
3384         }
3385         break;
3386       }
3387       SKIP_WS(scan);
3388       KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
3389       next = scan;
3390       SKIP_DIGITS(next);
3391       stride = __kmp_str_to_int(scan, *next);
3392       KMP_DEBUG_ASSERT(stride >= 0);
3393       scan = next;
3394       stride *= sign;
3395     }
3396 
3397     // Add places determined by initial_place : count : stride
3398     for (i = 0; i < count; i++) {
3399       if (setSize == 0) {
3400         break;
3401       }
3402       // Add the current place, then build the next place (tempMask) from that
3403       KMP_CPU_COPY(previousMask, tempMask);
3404       ADD_MASK(previousMask);
3405       KMP_CPU_ZERO(tempMask);
3406       setSize = 0;
3407       KMP_CPU_SET_ITERATE(j, previousMask) {
3408         if (!KMP_CPU_ISSET(j, previousMask)) {
3409           continue;
3410         }
3411         if ((j + stride > maxOsId) || (j + stride < 0) ||
3412             (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
3413             (!KMP_CPU_ISSET(j + stride,
3414                             KMP_CPU_INDEX(osId2Mask, j + stride)))) {
3415           if ((__kmp_affinity_verbose ||
3416                (__kmp_affinity_warnings &&
3417                 (__kmp_affinity_type != affinity_none))) &&
3418               i < count - 1) {
3419             KMP_WARNING(AffIgnoreInvalidProcID, j + stride);
3420           }
3421           continue;
3422         }
3423         KMP_CPU_SET(j + stride, tempMask);
3424         setSize++;
3425       }
3426     }
3427     KMP_CPU_ZERO(tempMask);
3428     setSize = 0;
3429 
3430     // valid follow sets are ',' and EOL
3431     SKIP_WS(scan);
3432     if (*scan == '\0') {
3433       break;
3434     }
3435     if (*scan == ',') {
3436       scan++; // skip ','
3437       continue;
3438     }
3439 
3440     KMP_ASSERT2(0, "bad explicit places list");
3441   }
3442 
3443   *out_numMasks = nextNewMask;
3444   if (nextNewMask == 0) {
3445     *out_masks = NULL;
    KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
    KMP_CPU_FREE(tempMask);
    KMP_CPU_FREE(previousMask);
    return;
3448   }
3449   KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3450   KMP_CPU_FREE(tempMask);
3451   KMP_CPU_FREE(previousMask);
3452   for (i = 0; i < nextNewMask; i++) {
3453     kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
3454     kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
3455     KMP_CPU_COPY(dest, src);
3456   }
3457   KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3458 }
3459 
3460 #undef ADD_MASK
3461 #undef ADD_MASK_OSID
3462 
3463 // This function figures out the deepest level at which there is at least one
3464 // cluster/core with more than one processing unit bound to it.
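// For example, with a depth-3 topology (socket/core/thread), if any core has
// a second hardware thread (ids[bottom_level] > 0 for some PU), core_level
// becomes bottom_level - 1, i.e. the core level; otherwise it stays at 0,
// the socket level.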
3465 static int __kmp_affinity_find_core_level(int nprocs, int bottom_level) {
3466   int core_level = 0;
3467 
3468   for (int i = 0; i < nprocs; i++) {
3469     const kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
3470     for (int j = bottom_level; j > 0; j--) {
3471       if (hw_thread.ids[j] > 0) {
3472         if (core_level < (j - 1)) {
3473           core_level = j - 1;
3474         }
3475       }
3476     }
3477   }
3478   return core_level;
3479 }
3480 
// This function counts the number of clusters/cores at the given level.
3482 static int __kmp_affinity_compute_ncores(int nprocs, int bottom_level,
3483                                          int core_level) {
3484   return __kmp_topology->get_count(core_level);
3485 }
// This function finds which cluster/core the given processing unit is
// bound to.
3487 static int __kmp_affinity_find_core(int proc, int bottom_level,
3488                                     int core_level) {
3489   int core = 0;
3490   KMP_DEBUG_ASSERT(proc >= 0 && proc < __kmp_topology->get_num_hw_threads());
3491   for (int i = 0; i <= proc; ++i) {
3492     if (i + 1 <= proc) {
3493       for (int j = 0; j <= core_level; ++j) {
3494         if (__kmp_topology->at(i + 1).sub_ids[j] !=
3495             __kmp_topology->at(i).sub_ids[j]) {
3496           core++;
3497           break;
3498         }
3499       }
3500     }
3501   }
3502   return core;
3503 }
3504 
// This function finds the maximal number of processing units bound to a
// cluster/core at the given level.
3507 static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level,
3508                                             int core_level) {
3509   if (core_level >= bottom_level)
3510     return 1;
3511   int thread_level = __kmp_topology->get_level(KMP_HW_THREAD);
3512   return __kmp_topology->calculate_ratio(thread_level, core_level);
3513 }
3514 
3515 static int *procarr = NULL;
3516 static int __kmp_aff_depth = 0;
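// procarr, filled in by the balanced-affinity setup below, is laid out as an
// ncores x maxprocpercore table mapping [core][slot] to an OS proc id, with
// -1 marking unused slots on cores that have fewer processing units.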
3517 
// Create a one-element mask array (set of places) which contains only the
// initial process's affinity mask
3520 static void __kmp_create_affinity_none_places() {
3521   KMP_ASSERT(__kmp_affin_fullMask != NULL);
3522   KMP_ASSERT(__kmp_affinity_type == affinity_none);
3523   __kmp_affinity_num_masks = 1;
3524   KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
3525   kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, 0);
3526   KMP_CPU_COPY(dest, __kmp_affin_fullMask);
3527 }
3528 
3529 static void __kmp_aux_affinity_initialize(void) {
3530   if (__kmp_affinity_masks != NULL) {
3531     KMP_ASSERT(__kmp_affin_fullMask != NULL);
3532     return;
3533   }
3534 
3535   // Create the "full" mask - this defines all of the processors that we
3536   // consider to be in the machine model. If respect is set, then it is the
3537   // initialization thread's affinity mask. Otherwise, it is all processors that
3538   // we know about on the machine.
3539   if (__kmp_affin_fullMask == NULL) {
3540     KMP_CPU_ALLOC(__kmp_affin_fullMask);
3541   }
3542   if (KMP_AFFINITY_CAPABLE()) {
3543     __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
3544     if (__kmp_affinity_respect_mask) {
3545       // Count the number of available processors.
3546       unsigned i;
3547       __kmp_avail_proc = 0;
3548       KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
3549         if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
3550           continue;
3551         }
3552         __kmp_avail_proc++;
3553       }
3554       if (__kmp_avail_proc > __kmp_xproc) {
3555         if (__kmp_affinity_verbose ||
3556             (__kmp_affinity_warnings &&
3557              (__kmp_affinity_type != affinity_none))) {
3558           KMP_WARNING(ErrorInitializeAffinity);
3559         }
3560         __kmp_affinity_type = affinity_none;
3561         KMP_AFFINITY_DISABLE();
3562         return;
3563       }
3564 
3565       if (__kmp_affinity_verbose) {
3566         char buf[KMP_AFFIN_MASK_PRINT_LEN];
3567         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
3568                                   __kmp_affin_fullMask);
3569         KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
3570       }
3571     } else {
3572       if (__kmp_affinity_verbose) {
3573         char buf[KMP_AFFIN_MASK_PRINT_LEN];
3574         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
3575                                   __kmp_affin_fullMask);
3576         KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
3577       }
3578       __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
3579       __kmp_avail_proc = __kmp_xproc;
3580 #if KMP_OS_WINDOWS
3581       // Set the process affinity mask since threads' affinity
3582       // masks must be subset of process mask in Windows* OS
3583       __kmp_affin_fullMask->set_process_affinity(true);
3584 #endif
3585     }
3586   }
3587 
3588   kmp_i18n_id_t msg_id = kmp_i18n_null;
3589 
3590   // For backward compatibility, setting KMP_CPUINFO_FILE =>
3591   // KMP_TOPOLOGY_METHOD=cpuinfo
3592   if ((__kmp_cpuinfo_file != NULL) &&
3593       (__kmp_affinity_top_method == affinity_top_method_all)) {
3594     __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3595   }
3596 
3597   bool success = false;
3598   if (__kmp_affinity_top_method == affinity_top_method_all) {
// In the default code path, errors are not fatal - we just try using
// another method. We only emit a warning message if affinity is on or the
// verbose flag is set, and the nowarnings flag was not set.
3602 #if KMP_USE_HWLOC
3603     if (!success &&
3604         __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
3605       if (!__kmp_hwloc_error) {
3606         success = __kmp_affinity_create_hwloc_map(&msg_id);
3607         if (!success && __kmp_affinity_verbose) {
3608           KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
3609         }
3610       } else if (__kmp_affinity_verbose) {
3611         KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
3612       }
3613     }
3614 #endif
3615 
3616 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
3617     if (!success) {
3618       success = __kmp_affinity_create_x2apicid_map(&msg_id);
3619       if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
3620         KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
3621       }
3622     }
3623     if (!success) {
3624       success = __kmp_affinity_create_apicid_map(&msg_id);
3625       if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
3626         KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
3627       }
3628     }
3629 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3630 
3631 #if KMP_OS_LINUX
3632     if (!success) {
3633       int line = 0;
3634       success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
3635       if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
3636         KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
3637       }
3638     }
3639 #endif /* KMP_OS_LINUX */
3640 
3641 #if KMP_GROUP_AFFINITY
3642     if (!success && (__kmp_num_proc_groups > 1)) {
3643       success = __kmp_affinity_create_proc_group_map(&msg_id);
3644       if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
3645         KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
3646       }
3647     }
3648 #endif /* KMP_GROUP_AFFINITY */
3649 
3650     if (!success) {
3651       success = __kmp_affinity_create_flat_map(&msg_id);
3652       if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
3653         KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
3654       }
3655       KMP_ASSERT(success);
3656     }
3657   }
3658 
// If the user has specified that a particular topology discovery method is to
// be used, then we abort if that method fails. The exception is group
// affinity, which might have been implicitly set.
3662 #if KMP_USE_HWLOC
3663   else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
3664     KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
3665     success = __kmp_affinity_create_hwloc_map(&msg_id);
3666     if (!success) {
3667       KMP_ASSERT(msg_id != kmp_i18n_null);
3668       KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3669     }
3670   }
3671 #endif // KMP_USE_HWLOC
3672 
3673 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
3674   else if (__kmp_affinity_top_method == affinity_top_method_x2apicid ||
3675            __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
3676     success = __kmp_affinity_create_x2apicid_map(&msg_id);
3677     if (!success) {
3678       KMP_ASSERT(msg_id != kmp_i18n_null);
3679       KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3680     }
3681   } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3682     success = __kmp_affinity_create_apicid_map(&msg_id);
3683     if (!success) {
3684       KMP_ASSERT(msg_id != kmp_i18n_null);
3685       KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3686     }
3687   }
3688 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3689 
3690   else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3691     int line = 0;
3692     success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
3693     if (!success) {
3694       KMP_ASSERT(msg_id != kmp_i18n_null);
3695       const char *filename = __kmp_cpuinfo_get_filename();
3696       if (line > 0) {
3697         KMP_FATAL(FileLineMsgExiting, filename, line,
3698                   __kmp_i18n_catgets(msg_id));
3699       } else {
3700         KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3701       }
3702     }
3703   }
3704 
3705 #if KMP_GROUP_AFFINITY
3706   else if (__kmp_affinity_top_method == affinity_top_method_group) {
3707     success = __kmp_affinity_create_proc_group_map(&msg_id);
3708     KMP_ASSERT(success);
3709     if (!success) {
3710       KMP_ASSERT(msg_id != kmp_i18n_null);
3711       KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3712     }
3713   }
3714 #endif /* KMP_GROUP_AFFINITY */
3715 
3716   else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3717     success = __kmp_affinity_create_flat_map(&msg_id);
3718     // should not fail
3719     KMP_ASSERT(success);
3720   }
3721 
3722   // Early exit if topology could not be created
3723   if (!__kmp_topology) {
3724     if (KMP_AFFINITY_CAPABLE() &&
3725         (__kmp_affinity_verbose ||
3726          (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) {
3727       KMP_WARNING(ErrorInitializeAffinity);
3728     }
3729     if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 &&
3730         __kmp_ncores > 0) {
3731       __kmp_topology = kmp_topology_t::allocate(0, 0, NULL);
3732       __kmp_topology->canonicalize(nPackages, nCoresPerPkg,
3733                                    __kmp_nThreadsPerCore, __kmp_ncores);
3734       if (__kmp_affinity_verbose) {
3735         __kmp_topology->print("KMP_AFFINITY");
3736       }
3737     }
3738     __kmp_affinity_type = affinity_none;
3739     __kmp_create_affinity_none_places();
3740 #if KMP_USE_HIER_SCHED
3741     __kmp_dispatch_set_hierarchy_values();
3742 #endif
3743     KMP_AFFINITY_DISABLE();
3744     return;
3745   }
3746 
3747   // Canonicalize, print (if requested), apply KMP_HW_SUBSET, and
3748   // initialize other data structures which depend on the topology
3749   __kmp_topology->canonicalize();
3750   if (__kmp_affinity_verbose)
3751     __kmp_topology->print("KMP_AFFINITY");
3752   bool filtered = __kmp_topology->filter_hw_subset();
3753   if (filtered && __kmp_affinity_verbose)
3754     __kmp_topology->print("KMP_HW_SUBSET");
3755   machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
3756   KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
  // If KMP_AFFINITY=none, then only create the single "none" place,
  // which is either the process's initial affinity mask or all of the
  // hardware threads, depending on respect/norespect.
3760   if (__kmp_affinity_type == affinity_none) {
3761     __kmp_create_affinity_none_places();
3762 #if KMP_USE_HIER_SCHED
3763     __kmp_dispatch_set_hierarchy_values();
3764 #endif
3765     return;
3766   }
3767   int depth = __kmp_topology->get_depth();
3768 
3769   // Create the table of masks, indexed by thread Id.
3770   unsigned maxIndex;
3771   unsigned numUnique;
3772   kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique);
3773   if (__kmp_affinity_gran_levels == 0) {
3774     KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
3775   }
3776 
3777   switch (__kmp_affinity_type) {
3778 
3779   case affinity_explicit:
3780     KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3781     if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) {
3782       __kmp_affinity_process_proclist(
3783           &__kmp_affinity_masks, &__kmp_affinity_num_masks,
3784           __kmp_affinity_proclist, osId2Mask, maxIndex);
3785     } else {
3786       __kmp_affinity_process_placelist(
3787           &__kmp_affinity_masks, &__kmp_affinity_num_masks,
3788           __kmp_affinity_proclist, osId2Mask, maxIndex);
3789     }
3790     if (__kmp_affinity_num_masks == 0) {
3791       if (__kmp_affinity_verbose ||
3792           (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
3793         KMP_WARNING(AffNoValidProcID);
3794       }
3795       __kmp_affinity_type = affinity_none;
3796       __kmp_create_affinity_none_places();
3797       return;
3798     }
3799     break;
3800 
3801   // The other affinity types rely on sorting the hardware threads according to
3802   // some permutation of the machine topology tree. Set __kmp_affinity_compact
3803   // and __kmp_affinity_offset appropriately, then jump to a common code
3804   // fragment to do the sort and create the array of affinity masks.
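  // Intuitively, "compact" packs consecutive thread ids onto hardware threads
  // that are close together in the topology (same core, then same package),
  // while "scatter" reverses that order so consecutive thread ids are spread
  // as far apart as possible.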
3805   case affinity_logical:
3806     __kmp_affinity_compact = 0;
3807     if (__kmp_affinity_offset) {
3808       __kmp_affinity_offset =
3809           __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
3810     }
3811     goto sortTopology;
3812 
3813   case affinity_physical:
3814     if (__kmp_nThreadsPerCore > 1) {
3815       __kmp_affinity_compact = 1;
3816       if (__kmp_affinity_compact >= depth) {
3817         __kmp_affinity_compact = 0;
3818       }
3819     } else {
3820       __kmp_affinity_compact = 0;
3821     }
3822     if (__kmp_affinity_offset) {
3823       __kmp_affinity_offset =
3824           __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
3825     }
3826     goto sortTopology;
3827 
3828   case affinity_scatter:
3829     if (__kmp_affinity_compact >= depth) {
3830       __kmp_affinity_compact = 0;
3831     } else {
3832       __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
3833     }
3834     goto sortTopology;
3835 
3836   case affinity_compact:
3837     if (__kmp_affinity_compact >= depth) {
3838       __kmp_affinity_compact = depth - 1;
3839     }
3840     goto sortTopology;
3841 
3842   case affinity_balanced:
3843     if (depth <= 1) {
3844       if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
3845         KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
3846       }
3847       __kmp_affinity_type = affinity_none;
3848       __kmp_create_affinity_none_places();
3849       return;
3850     } else if (!__kmp_topology->is_uniform()) {
3851       // Save the depth for further usage
3852       __kmp_aff_depth = depth;
3853 
3854       int core_level =
3855           __kmp_affinity_find_core_level(__kmp_avail_proc, depth - 1);
3856       int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, depth - 1,
3857                                                  core_level);
3858       int maxprocpercore = __kmp_affinity_max_proc_per_core(
3859           __kmp_avail_proc, depth - 1, core_level);
3860 
3861       int nproc = ncores * maxprocpercore;
3862       if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
3863         if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
3864           KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
3865         }
3866         __kmp_affinity_type = affinity_none;
3867         return;
3868       }
3869 
3870       procarr = (int *)__kmp_allocate(sizeof(int) * nproc);
3871       for (int i = 0; i < nproc; i++) {
3872         procarr[i] = -1;
3873       }
3874 
3875       int lastcore = -1;
3876       int inlastcore = 0;
3877       for (int i = 0; i < __kmp_avail_proc; i++) {
3878         int proc = __kmp_topology->at(i).os_id;
3879         int core = __kmp_affinity_find_core(i, depth - 1, core_level);
3880 
3881         if (core == lastcore) {
3882           inlastcore++;
3883         } else {
3884           inlastcore = 0;
3885         }
3886         lastcore = core;
3887 
3888         procarr[core * maxprocpercore + inlastcore] = proc;
3889       }
3890     }
3891     if (__kmp_affinity_compact >= depth) {
3892       __kmp_affinity_compact = depth - 1;
3893     }
3894 
3895   sortTopology:
3896     // Allocate the gtid->affinity mask table.
3897     if (__kmp_affinity_dups) {
3898       __kmp_affinity_num_masks = __kmp_avail_proc;
3899     } else {
3900       __kmp_affinity_num_masks = numUnique;
3901     }
3902 
3903     if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
3904         (__kmp_affinity_num_places > 0) &&
3905         ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) {
3906       __kmp_affinity_num_masks = __kmp_affinity_num_places;
3907     }
3908 
3909     KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
3910 
3911     // Sort the topology table according to the current setting of
3912     // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
3913     __kmp_topology->sort_compact();
3914     {
3915       int i;
3916       unsigned j;
3917       int num_hw_threads = __kmp_topology->get_num_hw_threads();
3918       for (i = 0, j = 0; i < num_hw_threads; i++) {
3919         if ((!__kmp_affinity_dups) && (!__kmp_topology->at(i).leader)) {
3920           continue;
3921         }
3922         int osId = __kmp_topology->at(i).os_id;
3923 
3924         kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
3925         kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j);
3926         KMP_ASSERT(KMP_CPU_ISSET(osId, src));
3927         KMP_CPU_COPY(dest, src);
3928         if (++j >= __kmp_affinity_num_masks) {
3929           break;
3930         }
3931       }
3932       KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
3933     }
3934     // Sort the topology back using ids
3935     __kmp_topology->sort_ids();
3936     break;
3937 
3938   default:
3939     KMP_ASSERT2(0, "Unexpected affinity setting");
3940   }
3941 
3942   KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1);
3943 }
3944 
3945 void __kmp_affinity_initialize(void) {
3946   // Much of the code above was written assuming that if a machine was not
3947   // affinity capable, then __kmp_affinity_type == affinity_none.  We now
3948   // explicitly represent this as __kmp_affinity_type == affinity_disabled.
3949   // There are too many checks for __kmp_affinity_type == affinity_none
3950   // in this code.  Instead of trying to change them all, check if
3951   // __kmp_affinity_type == affinity_disabled, and if so, slam it with
3952   // affinity_none, call the real initialization routine, then restore
3953   // __kmp_affinity_type to affinity_disabled.
3954   int disabled = (__kmp_affinity_type == affinity_disabled);
3955   if (!KMP_AFFINITY_CAPABLE()) {
3956     KMP_ASSERT(disabled);
3957   }
3958   if (disabled) {
3959     __kmp_affinity_type = affinity_none;
3960   }
3961   __kmp_aux_affinity_initialize();
3962   if (disabled) {
3963     __kmp_affinity_type = affinity_disabled;
3964   }
3965 }
3966 
3967 void __kmp_affinity_uninitialize(void) {
3968   if (__kmp_affinity_masks != NULL) {
3969     KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
3970     __kmp_affinity_masks = NULL;
3971   }
3972   if (__kmp_affin_fullMask != NULL) {
3973     KMP_CPU_FREE(__kmp_affin_fullMask);
3974     __kmp_affin_fullMask = NULL;
3975   }
3976   __kmp_affinity_num_masks = 0;
3977   __kmp_affinity_type = affinity_default;
3978   __kmp_affinity_num_places = 0;
3979   if (__kmp_affinity_proclist != NULL) {
3980     __kmp_free(__kmp_affinity_proclist);
3981     __kmp_affinity_proclist = NULL;
3982   }
3983   if (procarr != NULL) {
3984     __kmp_free(procarr);
3985     procarr = NULL;
3986   }
3987 #if KMP_USE_HWLOC
3988   if (__kmp_hwloc_topology != NULL) {
3989     hwloc_topology_destroy(__kmp_hwloc_topology);
3990     __kmp_hwloc_topology = NULL;
3991   }
3992 #endif
3993   if (__kmp_hw_subset) {
3994     kmp_hw_subset_t::deallocate(__kmp_hw_subset);
3995     __kmp_hw_subset = nullptr;
3996   }
3997   if (__kmp_topology) {
3998     kmp_topology_t::deallocate(__kmp_topology);
3999     __kmp_topology = nullptr;
4000   }
4001   KMPAffinity::destroy_api();
4002 }
4003 
4004 void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
4005   if (!KMP_AFFINITY_CAPABLE()) {
4006     return;
4007   }
4008 
4009   kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4010   if (th->th.th_affin_mask == NULL) {
4011     KMP_CPU_ALLOC(th->th.th_affin_mask);
4012   } else {
4013     KMP_CPU_ZERO(th->th.th_affin_mask);
4014   }
4015 
  // Copy the thread mask to the kmp_info_t structure. If
  // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that
  // has all of the OS proc ids set; if __kmp_affinity_respect_mask is set,
  // the full mask is the same as the initialization thread's mask.
4020   kmp_affin_mask_t *mask;
4021   int i;
4022 
4023   if (KMP_AFFINITY_NON_PROC_BIND) {
4024     if ((__kmp_affinity_type == affinity_none) ||
4025         (__kmp_affinity_type == affinity_balanced) ||
4026         KMP_HIDDEN_HELPER_THREAD(gtid)) {
4027 #if KMP_GROUP_AFFINITY
4028       if (__kmp_num_proc_groups > 1) {
4029         return;
4030       }
4031 #endif
4032       KMP_ASSERT(__kmp_affin_fullMask != NULL);
4033       i = 0;
4034       mask = __kmp_affin_fullMask;
4035     } else {
4036       int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
4037       KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
4038       i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4039       mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4040     }
4041   } else {
4042     if ((!isa_root) || KMP_HIDDEN_HELPER_THREAD(gtid) ||
4043         (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
4044 #if KMP_GROUP_AFFINITY
4045       if (__kmp_num_proc_groups > 1) {
4046         return;
4047       }
4048 #endif
4049       KMP_ASSERT(__kmp_affin_fullMask != NULL);
4050       i = KMP_PLACE_ALL;
4051       mask = __kmp_affin_fullMask;
4052     } else {
4053       // int i = some hash function or just a counter that doesn't
4054       // always start at 0.  Use adjusted gtid for now.
4055       int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
4056       KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
4057       i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4058       mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4059     }
4060   }
4061 
4062   th->th.th_current_place = i;
4063   if (isa_root || KMP_HIDDEN_HELPER_THREAD(gtid)) {
4064     th->th.th_new_place = i;
4065     th->th.th_first_place = 0;
4066     th->th.th_last_place = __kmp_affinity_num_masks - 1;
4067   } else if (KMP_AFFINITY_NON_PROC_BIND) {
4068     // When using a Non-OMP_PROC_BIND affinity method,
4069     // set all threads' place-partition-var to the entire place list
4070     th->th.th_first_place = 0;
4071     th->th.th_last_place = __kmp_affinity_num_masks - 1;
4072   }
4073 
4074   if (i == KMP_PLACE_ALL) {
4075     KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4076                    gtid));
4077   } else {
4078     KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4079                    gtid, i));
4080   }
4081 
4082   KMP_CPU_COPY(th->th.th_affin_mask, mask);
4083 
4084   if (__kmp_affinity_verbose && !KMP_HIDDEN_HELPER_THREAD(gtid)
4085       /* to avoid duplicate printing (will be correctly printed on barrier) */
4086       && (__kmp_affinity_type == affinity_none ||
4087           (i != KMP_PLACE_ALL && __kmp_affinity_type != affinity_balanced))) {
4088     char buf[KMP_AFFIN_MASK_PRINT_LEN];
4089     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4090                               th->th.th_affin_mask);
4091     KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4092                __kmp_gettid(), gtid, buf);
4093   }
4094 
4095 #if KMP_DEBUG
4096   // Hidden helper thread affinity only printed for debug builds
4097   if (__kmp_affinity_verbose && KMP_HIDDEN_HELPER_THREAD(gtid)) {
4098     char buf[KMP_AFFIN_MASK_PRINT_LEN];
4099     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4100                               th->th.th_affin_mask);
4101     KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY (hidden helper thread)",
4102                (kmp_int32)getpid(), __kmp_gettid(), gtid, buf);
4103   }
4104 #endif
4105 
4106 #if KMP_OS_WINDOWS
4107   // On Windows* OS, the process affinity mask might have changed. If the user
4108   // didn't request affinity and this call fails, just continue silently.
4109   // See CQ171393.
4110   if (__kmp_affinity_type == affinity_none) {
4111     __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4112   } else
4113 #endif
4114     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4115 }
4116 
4117 void __kmp_affinity_set_place(int gtid) {
4118   if (!KMP_AFFINITY_CAPABLE()) {
4119     return;
4120   }
4121 
4122   kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4123 
4124   KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current "
4125                  "place = %d)\n",
4126                  gtid, th->th.th_new_place, th->th.th_current_place));
4127 
4128   // Check that the new place is within this thread's partition.
4129   KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4130   KMP_ASSERT(th->th.th_new_place >= 0);
4131   KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4132   if (th->th.th_first_place <= th->th.th_last_place) {
4133     KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
4134                (th->th.th_new_place <= th->th.th_last_place));
4135   } else {
4136     KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
4137                (th->th.th_new_place >= th->th.th_last_place));
4138   }
4139 
4140   // Copy the thread mask to the kmp_info_t structure,
4141   // and set this thread's affinity.
4142   kmp_affin_mask_t *mask =
4143       KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
4144   KMP_CPU_COPY(th->th.th_affin_mask, mask);
4145   th->th.th_current_place = th->th.th_new_place;
4146 
4147   if (__kmp_affinity_verbose) {
4148     char buf[KMP_AFFIN_MASK_PRINT_LEN];
4149     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4150                               th->th.th_affin_mask);
4151     KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4152                __kmp_gettid(), gtid, buf);
4153   }
4154   __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4155 }
4156 
4157 int __kmp_aux_set_affinity(void **mask) {
4158   int gtid;
4159   kmp_info_t *th;
4160   int retval;
4161 
4162   if (!KMP_AFFINITY_CAPABLE()) {
4163     return -1;
4164   }
4165 
4166   gtid = __kmp_entry_gtid();
4167   KA_TRACE(
4168       1000, (""); {
4169         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4170         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4171                                   (kmp_affin_mask_t *)(*mask));
4172         __kmp_debug_printf(
4173             "kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4174             gtid, buf);
4175       });
4176 
4177   if (__kmp_env_consistency_check) {
4178     if ((mask == NULL) || (*mask == NULL)) {
4179       KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4180     } else {
4181       unsigned proc;
4182       int num_procs = 0;
4183 
4184       KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
4185         if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4186           KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4187         }
4188         if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4189           continue;
4190         }
4191         num_procs++;
4192       }
4193       if (num_procs == 0) {
4194         KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4195       }
4196 
4197 #if KMP_GROUP_AFFINITY
4198       if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4199         KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4200       }
4201 #endif /* KMP_GROUP_AFFINITY */
4202     }
4203   }
4204 
4205   th = __kmp_threads[gtid];
4206   KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4207   retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4208   if (retval == 0) {
4209     KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4210   }
4211 
4212   th->th.th_current_place = KMP_PLACE_UNDEFINED;
4213   th->th.th_new_place = KMP_PLACE_UNDEFINED;
4214   th->th.th_first_place = 0;
4215   th->th.th_last_place = __kmp_affinity_num_masks - 1;
4216 
  // Turn off 4.0 affinity for the current thread at this parallel level.
4218   th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
4219 
4220   return retval;
4221 }
4222 
4223 int __kmp_aux_get_affinity(void **mask) {
4224   int gtid;
4225   int retval;
4226 #if KMP_OS_WINDOWS || KMP_DEBUG
4227   kmp_info_t *th;
4228 #endif
4229   if (!KMP_AFFINITY_CAPABLE()) {
4230     return -1;
4231   }
4232 
4233   gtid = __kmp_entry_gtid();
4234 #if KMP_OS_WINDOWS || KMP_DEBUG
4235   th = __kmp_threads[gtid];
4236 #else
4237   (void)gtid; // unused variable
4238 #endif
4239   KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4240 
4241   KA_TRACE(
4242       1000, (""); {
4243         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4244         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4245                                   th->th.th_affin_mask);
4246         __kmp_printf(
4247             "kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid,
4248             buf);
4249       });
4250 
4251   if (__kmp_env_consistency_check) {
4252     if ((mask == NULL) || (*mask == NULL)) {
4253       KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4254     }
4255   }
4256 
4257 #if !KMP_OS_WINDOWS
4258 
4259   retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4260   KA_TRACE(
4261       1000, (""); {
4262         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4263         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4264                                   (kmp_affin_mask_t *)(*mask));
4265         __kmp_printf(
4266             "kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid,
4267             buf);
4268       });
4269   return retval;
4270 
4271 #else
4272   (void)retval;
4273 
4274   KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4275   return 0;
4276 
4277 #endif /* KMP_OS_WINDOWS */
4278 }
4279 
4280 int __kmp_aux_get_affinity_max_proc() {
4281   if (!KMP_AFFINITY_CAPABLE()) {
4282     return 0;
4283   }
4284 #if KMP_GROUP_AFFINITY
4285   if (__kmp_num_proc_groups > 1) {
4286     return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT);
4287   }
4288 #endif
4289   return __kmp_xproc;
4290 }
4291 
4292 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
4293   if (!KMP_AFFINITY_CAPABLE()) {
4294     return -1;
4295   }
4296 
4297   KA_TRACE(
4298       1000, (""); {
4299         int gtid = __kmp_entry_gtid();
4300         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4301         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4302                                   (kmp_affin_mask_t *)(*mask));
4303         __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
4304                            "affinity mask for thread %d = %s\n",
4305                            proc, gtid, buf);
4306       });
4307 
4308   if (__kmp_env_consistency_check) {
4309     if ((mask == NULL) || (*mask == NULL)) {
4310       KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4311     }
4312   }
4313 
4314   if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
4315     return -1;
4316   }
4317   if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4318     return -2;
4319   }
4320 
4321   KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4322   return 0;
4323 }
4324 
4325 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
4326   if (!KMP_AFFINITY_CAPABLE()) {
4327     return -1;
4328   }
4329 
4330   KA_TRACE(
4331       1000, (""); {
4332         int gtid = __kmp_entry_gtid();
4333         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4334         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4335                                   (kmp_affin_mask_t *)(*mask));
4336         __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
4337                            "affinity mask for thread %d = %s\n",
4338                            proc, gtid, buf);
4339       });
4340 
4341   if (__kmp_env_consistency_check) {
4342     if ((mask == NULL) || (*mask == NULL)) {
4343       KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4344     }
4345   }
4346 
4347   if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
4348     return -1;
4349   }
4350   if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4351     return -2;
4352   }
4353 
4354   KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4355   return 0;
4356 }
4357 
4358 int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
4359   if (!KMP_AFFINITY_CAPABLE()) {
4360     return -1;
4361   }
4362 
4363   KA_TRACE(
4364       1000, (""); {
4365         int gtid = __kmp_entry_gtid();
4366         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4367         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4368                                   (kmp_affin_mask_t *)(*mask));
4369         __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
4370                            "affinity mask for thread %d = %s\n",
4371                            proc, gtid, buf);
4372       });
4373 
4374   if (__kmp_env_consistency_check) {
4375     if ((mask == NULL) || (*mask == NULL)) {
4376       KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
4377     }
4378   }
4379 
4380   if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
4381     return -1;
4382   }
4383   if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4384     return 0;
4385   }
4386 
4387   return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4388 }
4389 
4390 // Dynamic affinity settings - Affinity balanced
4391 void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
4392   KMP_DEBUG_ASSERT(th);
4393   bool fine_gran = true;
4394   int tid = th->th.th_info.ds.ds_tid;
4395 
4396   // Do not perform balanced affinity for the hidden helper threads
4397   if (KMP_HIDDEN_HELPER_THREAD(__kmp_gtid_from_thread(th)))
4398     return;
4399 
4400   switch (__kmp_affinity_gran) {
4401   case KMP_HW_THREAD:
4402     break;
4403   case KMP_HW_CORE:
4404     if (__kmp_nThreadsPerCore > 1) {
4405       fine_gran = false;
4406     }
4407     break;
4408   case KMP_HW_SOCKET:
4409     if (nCoresPerPkg > 1) {
4410       fine_gran = false;
4411     }
4412     break;
4413   default:
4414     fine_gran = false;
4415   }
4416 
4417   if (__kmp_topology->is_uniform()) {
4418     int coreID;
4419     int threadID;
4420     // Number of hyper threads per core in HT machine
4421     int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4422     // Number of cores
4423     int ncores = __kmp_ncores;
4424     if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) {
4425       __kmp_nth_per_core = __kmp_avail_proc / nPackages;
4426       ncores = nPackages;
4427     }
4428     // How many threads will be bound to each core
4429     int chunk = nthreads / ncores;
    // How many cores get an additional thread bound to them - "big cores"
4431     int big_cores = nthreads % ncores;
4432     // Number of threads on the big cores
4433     int big_nth = (chunk + 1) * big_cores;
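    // For example, nthreads = 10 on ncores = 4 gives chunk = 2, big_cores = 2,
    // big_nth = 6: tids 0-5 are packed three per core onto cores 0 and 1, and
    // tids 6-9 two per core onto cores 2 and 3.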
    if (tid < big_nth) {
      coreID = tid / (chunk + 1);
      threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
    } else { // tid >= big_nth
      coreID = (tid - big_cores) / chunk;
      threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
    }
    KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
                      "Illegal set affinity operation when not capable");

    kmp_affin_mask_t *mask = th->th.th_affin_mask;
    KMP_CPU_ZERO(mask);

    if (fine_gran) {
      int osID =
          __kmp_topology->at(coreID * __kmp_nth_per_core + threadID).os_id;
      KMP_CPU_SET(osID, mask);
    } else {
      for (int i = 0; i < __kmp_nth_per_core; i++) {
        int osID = __kmp_topology->at(coreID * __kmp_nth_per_core + i).os_id;
        KMP_CPU_SET(osID, mask);
      }
    }
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
                 __kmp_gettid(), tid, buf);
    }
    __kmp_set_system_affinity(mask, TRUE);
  } else { // Non-uniform topology

    kmp_affin_mask_t *mask = th->th.th_affin_mask;
    KMP_CPU_ZERO(mask);

    int core_level =
        __kmp_affinity_find_core_level(__kmp_avail_proc, __kmp_aff_depth - 1);
    int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc,
                                               __kmp_aff_depth - 1, core_level);
    int nth_per_core = __kmp_affinity_max_proc_per_core(
        __kmp_avail_proc, __kmp_aff_depth - 1, core_level);

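    // Three cases follow: nthreads == __kmp_avail_proc (one thread per
    // context), nthreads <= ncores (at most one thread per core), and
    // nthreads > ncores (threads are spread over the cores as evenly as the
    // available contexts allow).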
    // As a performance optimization, handle the special case nthreads ==
    // __kmp_avail_proc separately.
    if (nthreads == __kmp_avail_proc) {
      if (fine_gran) {
        int osID = __kmp_topology->at(tid).os_id;
        KMP_CPU_SET(osID, mask);
      } else {
        int core =
            __kmp_affinity_find_core(tid, __kmp_aff_depth - 1, core_level);
        for (int i = 0; i < __kmp_avail_proc; i++) {
          int osID = __kmp_topology->at(i).os_id;
          if (__kmp_affinity_find_core(i, __kmp_aff_depth - 1, core_level) ==
              core) {
            KMP_CPU_SET(osID, mask);
          }
        }
      }
    } else if (nthreads <= ncores) {

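      // At most one thread per core: walk the cores that have at least one
      // available context in procarr[] and bind this thread to the tid-th
      // such core (for fine granularity, to its first available context).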
      int core = 0;
      for (int i = 0; i < ncores; i++) {
        // Check whether this core has any available processors in procarr[]
        int in_mask = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            in_mask = 1;
            break;
          }
        }
        if (in_mask) {
          if (tid == core) {
            for (int j = 0; j < nth_per_core; j++) {
              int osID = procarr[i * nth_per_core + j];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
                // For fine granularity it is enough to set the first available
                // osID for this core
                if (fine_gran) {
                  break;
                }
              }
            }
            break;
          } else {
            core++;
          }
        }
      }
    } else { // nthreads > ncores
      // Array to save the number of processors at each core
      int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
      // Array to save the number of cores with "x" available processors
      int *ncores_with_x_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
      // Array to save the number of cores with at least "x" available
      // processors
      int *ncores_with_x_to_max_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));

      for (int i = 0; i <= nth_per_core; i++) {
        ncores_with_x_procs[i] = 0;
        ncores_with_x_to_max_procs[i] = 0;
      }

      for (int i = 0; i < ncores; i++) {
        int cnt = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            cnt++;
          }
        }
        nproc_at_core[i] = cnt;
        ncores_with_x_procs[cnt]++;
      }

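      // Suffix sums: ncores_with_x_to_max_procs[i] is the number of cores
      // that have at least i available processors.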
      for (int i = 0; i <= nth_per_core; i++) {
        for (int j = i; j <= nth_per_core; j++) {
          ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
        }
      }

      // Maximum possible number of processors (nth_per_core per core)
      int nproc = nth_per_core * ncores;
      // Array to keep the number of threads assigned to each context
      int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
      for (int i = 0; i < nproc; i++) {
        newarr[i] = 0;
      }

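      // Greedily distribute the nthreads threads over the available contexts:
      // the first pass over the cores (flag == 0) only fills contexts that do
      // not have a thread yet; later passes (flag == 1) may place additional
      // threads on already-used contexts until all nthreads are placed.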
      int nth = nthreads;
      int flag = 0;
      while (nth > 0) {
        for (int j = 1; j <= nth_per_core; j++) {
          int cnt = ncores_with_x_to_max_procs[j];
          for (int i = 0; i < ncores; i++) {
            // Skip cores with no available processors
            if (nproc_at_core[i] == 0) {
              continue;
            }
            for (int k = 0; k < nth_per_core; k++) {
              if (procarr[i * nth_per_core + k] != -1) {
                if (newarr[i * nth_per_core + k] == 0) {
                  newarr[i * nth_per_core + k] = 1;
                  cnt--;
                  nth--;
                  break;
                } else {
                  if (flag != 0) {
                    newarr[i * nth_per_core + k]++;
                    cnt--;
                    nth--;
                    break;
                  }
                }
              }
            }
            if (cnt == 0 || nth == 0) {
              break;
            }
          }
          if (nth == 0) {
            break;
          }
        }
        flag = 1;
      }
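      // The running sum over newarr[] first exceeds tid at the context this
      // thread is assigned to; bind to that context (fine granularity) or to
      // all available contexts of its core.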
      int sum = 0;
      for (int i = 0; i < nproc; i++) {
        sum += newarr[i];
        if (sum > tid) {
          if (fine_gran) {
            int osID = procarr[i];
            KMP_CPU_SET(osID, mask);
          } else {
            int coreID = i / nth_per_core;
            for (int ii = 0; ii < nth_per_core; ii++) {
              int osID = procarr[coreID * nth_per_core + ii];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
              }
            }
          }
          break;
        }
      }
      __kmp_free(newarr);
    }

    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
                 __kmp_gettid(), tid, buf);
    }
    __kmp_set_system_affinity(mask, TRUE);
  }
}

#if KMP_OS_LINUX || KMP_OS_FREEBSD
// We do not need this entry point on Windows because the
// GetProcessAffinityMask() API is available there.
//
// The intended usage, sketched in the example below, is:
// 1) The user gets the current affinity mask
// 2) Then sets the affinity by calling this function
// 3) Checks the return value for errors
// 4) Runs the non-OpenMP parallel work
// 5) Resets the affinity to the mask saved in step 1)
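// An illustrative sketch of that sequence on Linux (error handling elided;
// do_non_openmp_work() is a placeholder for the user's code):
//
//   #define _GNU_SOURCE
//   #include <sched.h>
//
//   cpu_set_t saved;
//   CPU_ZERO(&saved);
//   sched_getaffinity(0, sizeof(saved), &saved);        // step 1
//   if (kmp_set_thread_affinity_mask_initial() == 0) {  // steps 2-3
//     do_non_openmp_work();                             // step 4
//   }
//   sched_setaffinity(0, sizeof(saved), &saved);        // step 5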
#ifdef __cplusplus
extern "C"
#endif
    int
    kmp_set_thread_affinity_mask_initial()
// the function returns 0 on success,
//   -1 if the thread cannot be bound,
//   >0 (errno) if an error occurred during binding
{
  int gtid = __kmp_get_gtid();
  if (gtid < 0) {
    // Do not touch non-omp threads
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "non-omp thread, returning\n"));
    return -1;
  }
  if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "affinity not initialized, returning\n"));
    return -1;
  }
  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                "set full mask for thread %d\n",
                gtid));
  KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
  return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
}
#endif

#endif // KMP_AFFINITY_SUPPORTED