LLVM OpenMP* Runtime Library
kmp_affinity.cpp
1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_io.h"
19 #include "kmp_str.h"
20 #include "kmp_wrapper_getpid.h"
21 #include "kmp_affinity.h"
22 
23 // Store the real or imagined machine hierarchy here
24 static hierarchy_info machine_hierarchy;
25 
26 void __kmp_cleanup_hierarchy() {
27  machine_hierarchy.fini();
28 }
29 
30 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
31  kmp_uint32 depth;
32  // The test below is true if affinity is available, but set to "none". Need to init on first use of hierarchical barrier.
33  if (TCR_1(machine_hierarchy.uninitialized))
34  machine_hierarchy.init(NULL, nproc);
35 
36  // Adjust the hierarchy in case num threads exceeds original
37  if (nproc > machine_hierarchy.base_num_threads)
38  machine_hierarchy.resize(nproc);
39 
40  depth = machine_hierarchy.depth;
41  KMP_DEBUG_ASSERT(depth > 0);
42 
43  thr_bar->depth = depth;
44  thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
45  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
46 }
47 
48 #if KMP_AFFINITY_SUPPORTED
49 
50 bool KMPAffinity::picked_api = false;
51 
52 void* KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
53 void* KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
54 void KMPAffinity::Mask::operator delete(void* p) { __kmp_free(p); }
55 void KMPAffinity::Mask::operator delete[](void* p) { __kmp_free(p); }
56 void* KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
57 void KMPAffinity::operator delete(void* p) { __kmp_free(p); }
58 
59 void KMPAffinity::pick_api() {
60  KMPAffinity* affinity_dispatch;
61  if (picked_api)
62  return;
63 #if KMP_USE_HWLOC
64  if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
65  affinity_dispatch = new KMPHwlocAffinity();
66  } else
67 #endif
68  {
69  affinity_dispatch = new KMPNativeAffinity();
70  }
71  __kmp_affinity_dispatch = affinity_dispatch;
72  picked_api = true;
73 }
74 
75 void KMPAffinity::destroy_api() {
76  if (__kmp_affinity_dispatch != NULL) {
77  delete __kmp_affinity_dispatch;
78  __kmp_affinity_dispatch = NULL;
79  picked_api = false;
80  }
81 }
82 
83 //
84 // Print the affinity mask to the character array in a pretty format.
85 //
86 char *
87 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
88 {
89  KMP_ASSERT(buf_len >= 40);
90  char *scan = buf;
91  char *end = buf + buf_len - 1;
92 
93  //
94  // Find first element / check for empty set.
95  //
96  size_t i;
97  i = mask->begin();
98  if (i == mask->end()) {
99  KMP_SNPRINTF(scan, end-scan+1, "{<empty>}");
100  while (*scan != '\0') scan++;
101  KMP_ASSERT(scan <= end);
102  return buf;
103  }
104 
105  KMP_SNPRINTF(scan, end-scan+1, "{%ld", (long)i);
106  while (*scan != '\0') scan++;
107  i++;
108  for (; i != mask->end(); i = mask->next(i)) {
109  if (! KMP_CPU_ISSET(i, mask)) {
110  continue;
111  }
112 
113  //
114  // Check for buffer overflow. A string of the form ",<n>" will have
115  // at most 10 characters, plus we want to leave room to print ",...}"
116  // if the set is too large to print for a total of 15 characters.
117  // We already left room for '\0' in setting end.
118  //
119  if (end - scan < 15) {
120  break;
121  }
122  KMP_SNPRINTF(scan, end-scan+1, ",%-ld", (long)i);
123  while (*scan != '\0') scan++;
124  }
125  if (i != mask->end()) {
126  KMP_SNPRINTF(scan, end-scan+1, ",...");
127  while (*scan != '\0') scan++;
128  }
129  KMP_SNPRINTF(scan, end-scan+1, "}");
130  while (*scan != '\0') scan++;
131  KMP_ASSERT(scan <= end);
132  return buf;
133 }
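//
// Usage sketch (illustrative only, not part of the original source): how a
// caller might print the current thread's affinity mask with the routine
// above. Everything referenced here is declared in the kmp headers and used
// elsewhere in this file; the helper name itself is hypothetical.
//
#if 0
static void __kmp_affinity_print_mask_example() { // hypothetical helper
 kmp_affin_mask_t *mask;
 KMP_CPU_ALLOC(mask);
 __kmp_get_system_affinity(mask, TRUE);
 char buf[KMP_AFFIN_MASK_PRINT_LEN];
 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
 // Typical result: "{0,1,2,3}"; "{<empty>}" for an empty mask; ",..." is
 // appended when the set does not fit in the buffer.
 KMP_CPU_FREE(mask);
}
#endif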
134 
135 
136 void
137 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
138 {
139  KMP_CPU_ZERO(mask);
140 
141 # if KMP_GROUP_AFFINITY
142 
143  if (__kmp_num_proc_groups > 1) {
144  int group;
145  KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
146  for (group = 0; group < __kmp_num_proc_groups; group++) {
147  int i;
148  int num = __kmp_GetActiveProcessorCount(group);
149  for (i = 0; i < num; i++) {
150  KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
151  }
152  }
153  }
154  else
155 
156 # endif /* KMP_GROUP_AFFINITY */
157 
158  {
159  int proc;
160  for (proc = 0; proc < __kmp_xproc; proc++) {
161  KMP_CPU_SET(proc, mask);
162  }
163  }
164 }
165 
166 //
167 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
168 // called to renumber the labels from [0..n] and place them into the child_num
169 // vector of the address object. This is done in case the labels used for
170 // the children at one node of the hierarchy differ from those used for
171 // another node at the same level. Example: suppose the machine has 2 nodes
172 // with 2 packages each. The first node contains packages 601 and 602, and
173 // the second node contains packages 603 and 604. If we try to sort the table
174 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
175 // because we are paying attention to the labels themselves, not the ordinal
176 // child numbers. By using the child numbers in the sort, the result is
177 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
178 //
179 static void
180 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
181  int numAddrs)
182 {
183  KMP_DEBUG_ASSERT(numAddrs > 0);
184  int depth = address2os->first.depth;
185  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
186  unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
187  * sizeof(unsigned));
188  int labCt;
189  for (labCt = 0; labCt < depth; labCt++) {
190  address2os[0].first.childNums[labCt] = counts[labCt] = 0;
191  lastLabel[labCt] = address2os[0].first.labels[labCt];
192  }
193  int i;
194  for (i = 1; i < numAddrs; i++) {
195  for (labCt = 0; labCt < depth; labCt++) {
196  if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
197  int labCt2;
198  for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
199  counts[labCt2] = 0;
200  lastLabel[labCt2] = address2os[i].first.labels[labCt2];
201  }
202  counts[labCt]++;
203  lastLabel[labCt] = address2os[i].first.labels[labCt];
204  break;
205  }
206  }
207  for (labCt = 0; labCt < depth; labCt++) {
208  address2os[i].first.childNums[labCt] = counts[labCt];
209  }
210  for (; labCt < (int)Address::maxDepth; labCt++) {
211  address2os[i].first.childNums[labCt] = 0;
212  }
213  }
214  __kmp_free(lastLabel);
215  __kmp_free(counts);
216 }
217 
218 
219 //
220 // All of the __kmp_affinity_create_*_map() routines should set
221 // __kmp_affinity_masks to a vector of affinity mask objects of length
222 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
223 // return the number of levels in the machine topology tree (zero if
224 // __kmp_affinity_type == affinity_none).
225 //
226 // All of the __kmp_affinity_create_*_map() routines should set *__kmp_affin_fullMask
227 // to the affinity mask for the initialization thread. They need to save and
228 // restore the mask, and it could be needed later, so saving it is just an
229 // optimization to avoid calling kmp_get_system_affinity() again.
230 //
231 kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
232 
233 static int nCoresPerPkg, nPackages;
234 static int __kmp_nThreadsPerCore;
235 #ifndef KMP_DFLT_NTH_CORES
236 static int __kmp_ncores;
237 #endif
238 static int *__kmp_pu_os_idx = NULL;
239 
240 //
241 // __kmp_affinity_uniform_topology() doesn't work when called from
242 // places which support arbitrarily many levels in the machine topology
243 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
244 // or __kmp_affinity_create_x2apicid_map().
245 //
246 inline static bool
247 __kmp_affinity_uniform_topology()
248 {
249  return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
250 }
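//
// Worked example (hypothetical numbers): with 2 packages, 8 cores per package,
// and 2 threads per core, the product is 2 * 8 * 2 = 32; if __kmp_avail_proc
// is also 32 the topology is uniform. If, say, one core were inaccessible
// (__kmp_avail_proc == 30), the product no longer matches and the topology is
// treated as non-uniform.
//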
251 
252 
253 //
254 // Print out the detailed machine topology map, i.e. the physical locations
255 // of each OS proc.
256 //
257 static void
258 __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
259  int pkgLevel, int coreLevel, int threadLevel)
260 {
261  int proc;
262 
263  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
264  for (proc = 0; proc < len; proc++) {
265  int level;
266  kmp_str_buf_t buf;
267  __kmp_str_buf_init(&buf);
268  for (level = 0; level < depth; level++) {
269  if (level == threadLevel) {
270  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
271  }
272  else if (level == coreLevel) {
273  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
274  }
275  else if (level == pkgLevel) {
276  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
277  }
278  else if (level > pkgLevel) {
279  __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
280  level - pkgLevel - 1);
281  }
282  else {
283  __kmp_str_buf_print(&buf, "L%d ", level);
284  }
285  __kmp_str_buf_print(&buf, "%d ",
286  address2os[proc].first.labels[level]);
287  }
288  KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
289  buf.str);
290  __kmp_str_buf_free(&buf);
291  }
292 }
293 
294 #if KMP_USE_HWLOC
295 
296 // This function removes the topology levels that are radix 1 and don't offer
297 // further information about the topology. The most common example is a machine
298 // with one thread context per core: the extra thread-context level offers no
299 // unique labels, so it is removed.
300 // return value: the new depth of address2os
301 static int
302 __kmp_affinity_remove_radix_one_levels(AddrUnsPair *address2os, int nActiveThreads, int depth, int* pkgLevel, int* coreLevel, int* threadLevel) {
303  int level;
304  int i;
305  int radix1_detected;
306 
307  for (level = depth-1; level >= 0; --level) {
308  // Always keep the package level
309  if (level == *pkgLevel)
310  continue;
311  // Detect if this level is radix 1
312  radix1_detected = 1;
313  for (i = 1; i < nActiveThreads; ++i) {
314  if (address2os[0].first.labels[level] != address2os[i].first.labels[level]) {
315  // There are differing label values for this level so it stays
316  radix1_detected = 0;
317  break;
318  }
319  }
320  if (!radix1_detected)
321  continue;
322  // Radix 1 was detected
323  if (level == *threadLevel) {
324  // If only one thread per core, then just decrement
325  // the depth which removes the threadlevel from address2os
326  for (i = 0; i < nActiveThreads; ++i) {
327  address2os[i].first.depth--;
328  }
329  *threadLevel = -1;
330  } else if (level == *coreLevel) {
331  // For core level, we move the thread labels over if they are still
332  // valid (*threadLevel != -1), and also reduce the depth another level
333  for (i = 0; i < nActiveThreads; ++i) {
334  if (*threadLevel != -1) {
335  address2os[i].first.labels[*coreLevel] = address2os[i].first.labels[*threadLevel];
336  }
337  address2os[i].first.depth--;
338  }
339  *coreLevel = -1;
340  }
341  }
342  return address2os[0].first.depth;
343 }
344 
345 // Returns the number of objects of type 'type' below 'obj' within the topology tree structure.
346 // e.g., if obj is a HWLOC_OBJ_SOCKET object, and type is HWLOC_OBJ_PU, then
347 // this will return the number of PU's under the SOCKET object.
348 static int
349 __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj, hwloc_obj_type_t type) {
350  int retval = 0;
351  hwloc_obj_t first;
352  for(first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type, obj->logical_index, type, 0);
353  first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) == obj;
354  first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type, first))
355  {
356  ++retval;
357  }
358  return retval;
359 }
360 
361 static int
362 __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
363  kmp_i18n_id_t *const msg_id)
364 {
365  *address2os = NULL;
366  *msg_id = kmp_i18n_null;
367 
368  //
369  // Save the affinity mask for the current thread.
370  //
371  kmp_affin_mask_t *oldMask;
372  KMP_CPU_ALLOC(oldMask);
373  __kmp_get_system_affinity(oldMask, TRUE);
374 
375  int depth = 3;
376  int pkgLevel = 0;
377  int coreLevel = 1;
378  int threadLevel = 2;
379 
380  if (! KMP_AFFINITY_CAPABLE())
381  {
382  //
383  // Hack to try and infer the machine topology using only the data
384  // available from cpuid on the current thread, and __kmp_xproc.
385  //
386  KMP_ASSERT(__kmp_affinity_type == affinity_none);
387 
388  nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, 0), HWLOC_OBJ_CORE);
389  __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, 0), HWLOC_OBJ_PU);
390  __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
391  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
392  if (__kmp_affinity_verbose) {
393  KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
394  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
395  if (__kmp_affinity_uniform_topology()) {
396  KMP_INFORM(Uniform, "KMP_AFFINITY");
397  } else {
398  KMP_INFORM(NonUniform, "KMP_AFFINITY");
399  }
400  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
401  __kmp_nThreadsPerCore, __kmp_ncores);
402  }
403  KMP_CPU_FREE(oldMask);
404  return 0;
405  }
406 
407  //
408  // Allocate the data structure to be returned.
409  //
410  AddrUnsPair *retval = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
411  __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
412 
413  //
414  // When affinity is off, this routine will still be called to set
415  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
416  // nCoresPerPkg, & nPackages. Make sure all these vars are set
417  // correctly, and return if affinity is not enabled.
418  //
419 
420  hwloc_obj_t pu;
421  hwloc_obj_t core;
422  hwloc_obj_t socket;
423  int nActiveThreads = 0;
424  int socket_identifier = 0;
425  // re-calculate globals to count only accessible resources
426  __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
427  for(socket = hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, 0);
428  socket != NULL;
429  socket = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, socket),
430  socket_identifier++)
431  {
432  int core_identifier = 0;
433  int num_active_cores = 0;
434  for(core = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, socket->type, socket->logical_index, HWLOC_OBJ_CORE, 0);
435  core != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, socket->type, core) == socket;
436  core = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, core),
437  core_identifier++)
438  {
439  int pu_identifier = 0;
440  int num_active_threads = 0;
441  for(pu = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, core->type, core->logical_index, HWLOC_OBJ_PU, 0);
442  pu != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, core->type, pu) == core;
443  pu = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PU, pu),
444  pu_identifier++)
445  {
446  Address addr(3);
447  if(! KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
448  continue; // skip inactive (inaccessible) unit
449  KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
450  socket->os_index, socket->logical_index, core->os_index, core->logical_index, pu->os_index,pu->logical_index));
451  addr.labels[0] = socket_identifier; // package
452  addr.labels[1] = core_identifier; // core
453  addr.labels[2] = pu_identifier; // pu
454  retval[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
455  __kmp_pu_os_idx[nActiveThreads] = pu->os_index; // keep os index for each active pu
456  nActiveThreads++;
457  ++num_active_threads; // count active threads per core
458  }
459  if (num_active_threads) { // were there any active threads on the core?
460  ++__kmp_ncores; // count total active cores
461  ++num_active_cores; // count active cores per socket
462  if (num_active_threads > __kmp_nThreadsPerCore)
463  __kmp_nThreadsPerCore = num_active_threads; // calc maximum
464  }
465  }
466  if (num_active_cores) { // were there any active cores on the socket?
467  ++nPackages; // count total active packages
468  if (num_active_cores > nCoresPerPkg)
469  nCoresPerPkg = num_active_cores; // calc maximum
470  }
471  }
472 
473  //
474  // If there's only one thread context to bind to, return now.
475  //
476  KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
477  KMP_ASSERT(nActiveThreads > 0);
478  if (nActiveThreads == 1) {
479  __kmp_ncores = nPackages = 1;
480  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
481  if (__kmp_affinity_verbose) {
482  char buf[KMP_AFFIN_MASK_PRINT_LEN];
483  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
484 
485  KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
486  if (__kmp_affinity_respect_mask) {
487  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
488  } else {
489  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
490  }
491  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
492  KMP_INFORM(Uniform, "KMP_AFFINITY");
493  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
494  __kmp_nThreadsPerCore, __kmp_ncores);
495  }
496 
497  if (__kmp_affinity_type == affinity_none) {
498  __kmp_free(retval);
499  KMP_CPU_FREE(oldMask);
500  return 0;
501  }
502 
503  //
504  // Form an Address object which only includes the package level.
505  //
506  Address addr(1);
507  addr.labels[0] = retval[0].first.labels[pkgLevel];
508  retval[0].first = addr;
509 
510  if (__kmp_affinity_gran_levels < 0) {
511  __kmp_affinity_gran_levels = 0;
512  }
513 
514  if (__kmp_affinity_verbose) {
515  __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
516  }
517 
518  *address2os = retval;
519  KMP_CPU_FREE(oldMask);
520  return 1;
521  }
522 
523  //
524  // Sort the table by physical Id.
525  //
526  qsort(retval, nActiveThreads, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
527 
528  //
529  // Check to see if the machine topology is uniform
530  //
531  unsigned uniform = (nPackages * nCoresPerPkg * __kmp_nThreadsPerCore == nActiveThreads);
532 
533  //
534  // Print the machine topology summary.
535  //
536  if (__kmp_affinity_verbose) {
537  char mask[KMP_AFFIN_MASK_PRINT_LEN];
538  __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
539 
540  KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
541  if (__kmp_affinity_respect_mask) {
542  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
543  } else {
544  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
545  }
546  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
547  if (uniform) {
548  KMP_INFORM(Uniform, "KMP_AFFINITY");
549  } else {
550  KMP_INFORM(NonUniform, "KMP_AFFINITY");
551  }
552 
553  kmp_str_buf_t buf;
554  __kmp_str_buf_init(&buf);
555 
556  __kmp_str_buf_print(&buf, "%d", nPackages);
557  //for (level = 1; level <= pkgLevel; level++) {
558  // __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
559  // }
560  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
561  __kmp_nThreadsPerCore, __kmp_ncores);
562 
563  __kmp_str_buf_free(&buf);
564  }
565 
566  if (__kmp_affinity_type == affinity_none) {
567  __kmp_free(retval);
568  KMP_CPU_FREE(oldMask);
569  return 0;
570  }
571 
572  //
573  // Find any levels with radix 1, and remove them from the map
574  // (except for the package level).
575  //
576  depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth, &pkgLevel, &coreLevel, &threadLevel);
577 
578  if (__kmp_affinity_gran_levels < 0) {
579  //
580  // Set the granularity level based on what levels are modeled
581  // in the machine topology map.
582  //
583  __kmp_affinity_gran_levels = 0;
584  if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
585  __kmp_affinity_gran_levels++;
586  }
587  if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
588  __kmp_affinity_gran_levels++;
589  }
590  if (__kmp_affinity_gran > affinity_gran_package) {
591  __kmp_affinity_gran_levels++;
592  }
593  }
594 
595  if (__kmp_affinity_verbose) {
596  __kmp_affinity_print_topology(retval, nActiveThreads, depth, pkgLevel,
597  coreLevel, threadLevel);
598  }
599 
600  KMP_CPU_FREE(oldMask);
601  *address2os = retval;
602  return depth;
603 }
604 #endif // KMP_USE_HWLOC
605 
606 //
607 // If we don't know how to retrieve the machine's processor topology, or
608 // encounter an error in doing so, this routine is called to form a "flat"
609 // mapping of os thread id's <-> processor id's.
610 //
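// For example (hypothetical): on a 4-proc machine the flat map is simply
// {0}->0, {1}->1, {2}->2, {3}->3, i.e. a depth-1 topology in which each OS
// proc is modeled as its own "package".
//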
611 static int
612 __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
613  kmp_i18n_id_t *const msg_id)
614 {
615  *address2os = NULL;
616  *msg_id = kmp_i18n_null;
617 
618  //
619  // Even if __kmp_affinity_type == affinity_none, this routine might still be
620  // called to set __kmp_ncores, as well as
621  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
622  //
623  if (! KMP_AFFINITY_CAPABLE()) {
624  KMP_ASSERT(__kmp_affinity_type == affinity_none);
625  __kmp_ncores = nPackages = __kmp_xproc;
626  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
627  if (__kmp_affinity_verbose) {
628  KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
629  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
630  KMP_INFORM(Uniform, "KMP_AFFINITY");
631  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
632  __kmp_nThreadsPerCore, __kmp_ncores);
633  }
634  return 0;
635  }
636 
637  //
638  // When affinity is off, this routine will still be called to set
639  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
640  // nCoresPerPkg, & nPackages. Make sure all these vars are set
641  // correctly, and return now if affinity is not enabled.
642  //
643  __kmp_ncores = nPackages = __kmp_avail_proc;
644  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
645  if (__kmp_affinity_verbose) {
646  char buf[KMP_AFFIN_MASK_PRINT_LEN];
647  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, __kmp_affin_fullMask);
648 
649  KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
650  if (__kmp_affinity_respect_mask) {
651  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
652  } else {
653  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
654  }
655  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
656  KMP_INFORM(Uniform, "KMP_AFFINITY");
657  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
658  __kmp_nThreadsPerCore, __kmp_ncores);
659  }
660  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
661  __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
662  if (__kmp_affinity_type == affinity_none) {
663  int avail_ct = 0;
664  int i;
665  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
666  if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask))
667  continue;
668  __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
669  }
670  return 0;
671  }
672 
673  //
674  // Construct the data structure to be returned.
675  //
676  *address2os = (AddrUnsPair*)
677  __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
678  int avail_ct = 0;
679  unsigned int i;
680  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
681  //
682  // Skip this proc if it is not included in the machine model.
683  //
684  if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
685  continue;
686  }
687  __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
688  Address addr(1);
689  addr.labels[0] = i;
690  (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
691  }
692  if (__kmp_affinity_verbose) {
693  KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
694  }
695 
696  if (__kmp_affinity_gran_levels < 0) {
697  //
698  // Only the package level is modeled in the machine topology map,
699  // so the #levels of granularity is either 0 or 1.
700  //
701  if (__kmp_affinity_gran > affinity_gran_package) {
702  __kmp_affinity_gran_levels = 1;
703  }
704  else {
705  __kmp_affinity_gran_levels = 0;
706  }
707  }
708  return 1;
709 }
710 
711 
712 # if KMP_GROUP_AFFINITY
713 
714 //
715 // If multiple Windows* OS processor groups exist, we can create a 2-level
716 // topology map with the groups at level 0 and the individual procs at
717 // level 1.
718 //
719 // This facilitates letting the threads float among all procs in a group,
720 // if granularity=group (the default when there are multiple groups).
721 //
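//
// For example (hypothetical): on 64-bit Windows* OS a processor group holds
// CHAR_BIT * sizeof(DWORD_PTR) = 64 procs, so OS proc 65 is given the address
// {group 1, proc-within-group 1} by the map constructed below.
//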
722 static int
723 __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
724  kmp_i18n_id_t *const msg_id)
725 {
726  *address2os = NULL;
727  *msg_id = kmp_i18n_null;
728 
729  //
730  // If we don't have multiple processor groups, return now.
731  // The flat mapping will be used.
732  //
733  if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(__kmp_affin_fullMask) >= 0)) {
734  // FIXME set *msg_id
735  return -1;
736  }
737 
738  //
739  // Construct the data structure to be returned.
740  //
741  *address2os = (AddrUnsPair*)
742  __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
743  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
744  __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
745  int avail_ct = 0;
746  int i;
747  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
748  //
749  // Skip this proc if it is not included in the machine model.
750  //
751  if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
752  continue;
753  }
754  __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
755  Address addr(2);
756  addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
757  addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
758  (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
759 
760  if (__kmp_affinity_verbose) {
761  KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
762  addr.labels[1]);
763  }
764  }
765 
766  if (__kmp_affinity_gran_levels < 0) {
767  if (__kmp_affinity_gran == affinity_gran_group) {
768  __kmp_affinity_gran_levels = 1;
769  }
770  else if ((__kmp_affinity_gran == affinity_gran_fine)
771  || (__kmp_affinity_gran == affinity_gran_thread)) {
772  __kmp_affinity_gran_levels = 0;
773  }
774  else {
775  const char *gran_str = NULL;
776  if (__kmp_affinity_gran == affinity_gran_core) {
777  gran_str = "core";
778  }
779  else if (__kmp_affinity_gran == affinity_gran_package) {
780  gran_str = "package";
781  }
782  else if (__kmp_affinity_gran == affinity_gran_node) {
783  gran_str = "node";
784  }
785  else {
786  KMP_ASSERT(0);
787  }
788 
789  // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
790  __kmp_affinity_gran_levels = 0;
791  }
792  }
793  return 2;
794 }
795 
796 # endif /* KMP_GROUP_AFFINITY */
797 
798 
799 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
800 
801 static int
802 __kmp_cpuid_mask_width(int count) {
803  int r = 0;
804 
805  while((1<<r) < count)
806  ++r;
807  return r;
808 }
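//
// Note: __kmp_cpuid_mask_width(count) returns the smallest r such that
// (1 << r) >= count, i.e. the number of Apic Id bits needed to encode "count"
// distinct values. For example, __kmp_cpuid_mask_width(16) == 4 and
// __kmp_cpuid_mask_width(6) == 3.
//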
809 
810 
811 class apicThreadInfo {
812 public:
813  unsigned osId; // param to __kmp_affinity_bind_thread
814  unsigned apicId; // from cpuid after binding
815  unsigned maxCoresPerPkg; // ""
816  unsigned maxThreadsPerPkg; // ""
817  unsigned pkgId; // inferred from above values
818  unsigned coreId; // ""
819  unsigned threadId; // ""
820 };
821 
822 
823 static int
824 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
825 {
826  const apicThreadInfo *aa = (const apicThreadInfo *)a;
827  const apicThreadInfo *bb = (const apicThreadInfo *)b;
828  if (aa->osId < bb->osId) return -1;
829  if (aa->osId > bb->osId) return 1;
830  return 0;
831 }
832 
833 
834 static int
835 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
836 {
837  const apicThreadInfo *aa = (const apicThreadInfo *)a;
838  const apicThreadInfo *bb = (const apicThreadInfo *)b;
839  if (aa->pkgId < bb->pkgId) return -1;
840  if (aa->pkgId > bb->pkgId) return 1;
841  if (aa->coreId < bb->coreId) return -1;
842  if (aa->coreId > bb->coreId) return 1;
843  if (aa->threadId < bb->threadId) return -1;
844  if (aa->threadId > bb->threadId) return 1;
845  return 0;
846 }
847 
848 
849 //
850 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
851 // an algorithm which cycles through the available os threads, setting
852 // the current thread's affinity mask to that thread, and then retrieving
853 // the Apic Id for each thread context using the cpuid instruction.
854 //
855 static int
856 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
857  kmp_i18n_id_t *const msg_id)
858 {
859  kmp_cpuid buf;
860  int rc;
861  *address2os = NULL;
862  *msg_id = kmp_i18n_null;
863 
864  //
865  // Check if cpuid leaf 4 is supported.
866  //
867  __kmp_x86_cpuid(0, 0, &buf);
868  if (buf.eax < 4) {
869  *msg_id = kmp_i18n_str_NoLeaf4Support;
870  return -1;
871  }
872 
873  //
874  // The algorithm used starts by setting the affinity to each available
875  // thread and retrieving info from the cpuid instruction, so if we are
876  // not capable of calling __kmp_get_system_affinity() and
877  // __kmp_set_system_affinity(), then we need to do something else - use
878  // the defaults that we calculated from issuing cpuid without binding
879  // to each proc.
880  //
881  if (! KMP_AFFINITY_CAPABLE()) {
882  //
883  // Hack to try and infer the machine topology using only the data
884  // available from cpuid on the current thread, and __kmp_xproc.
885  //
886  KMP_ASSERT(__kmp_affinity_type == affinity_none);
887 
888  //
889  // Get an upper bound on the number of threads per package using
890  // cpuid(1).
891  //
892  // On some OS/chip combinations where HT is supported by the chip
893  // but is disabled, this value will be 2 on a single core chip.
894  // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
895  //
896  __kmp_x86_cpuid(1, 0, &buf);
897  int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
898  if (maxThreadsPerPkg == 0) {
899  maxThreadsPerPkg = 1;
900  }
901 
902  //
903  // The num cores per pkg comes from cpuid(4).
904  // 1 must be added to the encoded value.
905  //
906  // The author of cpu_count.cpp treated this only as an upper bound
907  // on the number of cores, but I haven't seen any cases where it
908  // was greater than the actual number of cores, so we will treat
909  // it as exact in this block of code.
910  //
911  // First, we need to check if cpuid(4) is supported on this chip.
912  // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
913  // has the value n or greater.
914  //
915  __kmp_x86_cpuid(0, 0, &buf);
916  if (buf.eax >= 4) {
917  __kmp_x86_cpuid(4, 0, &buf);
918  nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
919  }
920  else {
921  nCoresPerPkg = 1;
922  }
923 
924  //
925  // There is no way to reliably tell if HT is enabled without issuing
926  // the cpuid instruction from every thread, and correlating the cpuid
927  // info, so if the machine is not affinity capable, we assume that HT
928  // is off. We have seen quite a few machines where maxThreadsPerPkg
929  // is 2, yet the machine does not support HT.
930  //
931  // - Older OSes are usually found on machines with older chips, which
932  // do not support HT.
933  //
934  // - The performance penalty for mistakenly identifying a machine as
935  // HT when it isn't (which results in blocktime being incorrectly set
936  // to 0) is greater than the penalty for mistakenly identifying
937  // a machine as being 1 thread/core when it is really HT enabled
938  // (which results in blocktime being incorrectly set to a positive
939  // value).
940  //
941  __kmp_ncores = __kmp_xproc;
942  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
943  __kmp_nThreadsPerCore = 1;
944  if (__kmp_affinity_verbose) {
945  KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
946  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
947  if (__kmp_affinity_uniform_topology()) {
948  KMP_INFORM(Uniform, "KMP_AFFINITY");
949  } else {
950  KMP_INFORM(NonUniform, "KMP_AFFINITY");
951  }
952  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
953  __kmp_nThreadsPerCore, __kmp_ncores);
954  }
955  return 0;
956  }
957 
958  //
959  //
960  // From here on, we can assume that it is safe to call
961  // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
962  // even if __kmp_affinity_type = affinity_none.
963  //
964 
965  //
966  // Save the affinity mask for the current thread.
967  //
968  kmp_affin_mask_t *oldMask;
969  KMP_CPU_ALLOC(oldMask);
970  KMP_ASSERT(oldMask != NULL);
971  __kmp_get_system_affinity(oldMask, TRUE);
972 
973  //
974  // Run through each of the available contexts, binding the current thread
975  // to it, and obtaining the pertinent information using the cpuid instr.
976  //
977  // The relevant information is:
978  //
979  // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
980  // has a unique Apic Id, which is of the form pkg# : core# : thread#.
981  //
982  // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The
983  // value of this field determines the width of the core# + thread#
984  // fields in the Apic Id. It is also an upper bound on the number
985  // of threads per package, but it has been verified that situations
986  // happen where it is not exact. In particular, on certain OS/chip
987  // combinations where Intel(R) Hyper-Threading Technology is supported
988  // by the chip but has
989  // been disabled, the value of this field will be 2 (for a single core
990  // chip). On other OS/chip combinations supporting
991  // Intel(R) Hyper-Threading Technology, the value of
992  // this field will be 1 when Intel(R) Hyper-Threading Technology is
993  // disabled and 2 when it is enabled.
994  //
995  // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The
996  // value of this field (+1) determines the width of the core# field in
997  // the Apic Id. The comments in "cpucount.cpp" say that this value is
998  // an upper bound, but the IA-32 architecture manual says that it is
999  // exactly the number of cores per package, and I haven't seen any
1000  // case where it wasn't.
1001  //
1002  // From this information, deduce the package Id, core Id, and thread Id,
1003  // and set the corresponding fields in the apicThreadInfo struct.
1004  //
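 //
 // Worked example (hypothetical values): if cpuid(1) reports
 // maxThreadsPerPkg = 16 and cpuid(4) reports maxCoresPerPkg = 8, then
 // widthCT = __kmp_cpuid_mask_width(16) = 4 and widthC = __kmp_cpuid_mask_width(8) = 3,
 // so widthT = 1. An Apic Id of 0x35 (binary 110101) then decomposes as
 // pkgId = 0x35 >> 4 = 3, coreId = (0x35 >> 1) & 0x7 = 2, threadId = 0x35 & 0x1 = 1.
 //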
1005  unsigned i;
1006  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
1007  __kmp_avail_proc * sizeof(apicThreadInfo));
1008  unsigned nApics = 0;
1009  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
1010  //
1011  // Skip this proc if it is not included in the machine model.
1012  //
1013  if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
1014  continue;
1015  }
1016  KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
1017 
1018  __kmp_affinity_dispatch->bind_thread(i);
1019  threadInfo[nApics].osId = i;
1020 
1021  //
1022  // The apic id and max threads per pkg come from cpuid(1).
1023  //
1024  __kmp_x86_cpuid(1, 0, &buf);
1025  if (((buf.edx >> 9) & 1) == 0) {
1026  __kmp_set_system_affinity(oldMask, TRUE);
1027  __kmp_free(threadInfo);
1028  KMP_CPU_FREE(oldMask);
1029  *msg_id = kmp_i18n_str_ApicNotPresent;
1030  return -1;
1031  }
1032  threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
1033  threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
1034  if (threadInfo[nApics].maxThreadsPerPkg == 0) {
1035  threadInfo[nApics].maxThreadsPerPkg = 1;
1036  }
1037 
1038  //
1039  // Max cores per pkg comes from cpuid(4).
1040  // 1 must be added to the encoded value.
1041  //
1042  // First, we need to check if cpuid(4) is supported on this chip.
1043  // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
1044  // has the value n or greater.
1045  //
1046  __kmp_x86_cpuid(0, 0, &buf);
1047  if (buf.eax >= 4) {
1048  __kmp_x86_cpuid(4, 0, &buf);
1049  threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1050  }
1051  else {
1052  threadInfo[nApics].maxCoresPerPkg = 1;
1053  }
1054 
1055  //
1056  // Infer the pkgId / coreId / threadId using only the info
1057  // obtained locally.
1058  //
1059  int widthCT = __kmp_cpuid_mask_width(
1060  threadInfo[nApics].maxThreadsPerPkg);
1061  threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
1062 
1063  int widthC = __kmp_cpuid_mask_width(
1064  threadInfo[nApics].maxCoresPerPkg);
1065  int widthT = widthCT - widthC;
1066  if (widthT < 0) {
1067  //
1068  // I've never seen this one happen, but I suppose it could, if
1069  // the cpuid instruction on a chip was really screwed up.
1070  // Make sure to restore the affinity mask before the tail call.
1071  //
1072  __kmp_set_system_affinity(oldMask, TRUE);
1073  __kmp_free(threadInfo);
1074  KMP_CPU_FREE(oldMask);
1075  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1076  return -1;
1077  }
1078 
1079  int maskC = (1 << widthC) - 1;
1080  threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
1081  &maskC;
1082 
1083  int maskT = (1 << widthT) - 1;
1084  threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT;
1085 
1086  nApics++;
1087  }
1088 
1089  //
1090  // We've collected all the info we need.
1091  // Restore the old affinity mask for this thread.
1092  //
1093  __kmp_set_system_affinity(oldMask, TRUE);
1094 
1095  //
1096  // If there's only one thread context to bind to, form an Address object
1097  // with depth 1 and return immediately (or, if affinity is off, set
1098  // address2os to NULL and return).
1099  //
1100  // If it is configured to omit the package level when there is only a
1101  // single package, the logic at the end of this routine won't work if
1102  // there is only a single thread - it would try to form an Address
1103  // object with depth 0.
1104  //
1105  KMP_ASSERT(nApics > 0);
1106  if (nApics == 1) {
1107  __kmp_ncores = nPackages = 1;
1108  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1109  if (__kmp_affinity_verbose) {
1110  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1111  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1112 
1113  KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1114  if (__kmp_affinity_respect_mask) {
1115  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1116  } else {
1117  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1118  }
1119  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1120  KMP_INFORM(Uniform, "KMP_AFFINITY");
1121  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1122  __kmp_nThreadsPerCore, __kmp_ncores);
1123  }
1124 
1125  if (__kmp_affinity_type == affinity_none) {
1126  __kmp_free(threadInfo);
1127  KMP_CPU_FREE(oldMask);
1128  return 0;
1129  }
1130 
1131  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
1132  Address addr(1);
1133  addr.labels[0] = threadInfo[0].pkgId;
1134  (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
1135 
1136  if (__kmp_affinity_gran_levels < 0) {
1137  __kmp_affinity_gran_levels = 0;
1138  }
1139 
1140  if (__kmp_affinity_verbose) {
1141  __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1142  }
1143 
1144  __kmp_free(threadInfo);
1145  KMP_CPU_FREE(oldMask);
1146  return 1;
1147  }
1148 
1149  //
1150  // Sort the threadInfo table by physical Id.
1151  //
1152  qsort(threadInfo, nApics, sizeof(*threadInfo),
1153  __kmp_affinity_cmp_apicThreadInfo_phys_id);
1154 
1155  //
1156  // The table is now sorted by pkgId / coreId / threadId, but we really
1157  // don't know the radix of any of the fields. pkgId's may be sparsely
1158  // assigned among the chips on a system. Although coreId's are usually
1159  // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1160  // [0..threadsPerCore-1], we don't want to make any such assumptions.
1161  //
1162  // For that matter, we don't know what coresPerPkg and threadsPerCore
1163  // (or the total # packages) are at this point - we want to determine
1164  // that now. We only have an upper bound on the first two figures.
1165  //
1166  // We also perform a consistency check at this point: the values returned
1167  // by the cpuid instruction for any thread bound to a given package had
1168  // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1169  //
1170  nPackages = 1;
1171  nCoresPerPkg = 1;
1172  __kmp_nThreadsPerCore = 1;
1173  unsigned nCores = 1;
1174 
1175  unsigned pkgCt = 1; // to determine radii
1176  unsigned lastPkgId = threadInfo[0].pkgId;
1177  unsigned coreCt = 1;
1178  unsigned lastCoreId = threadInfo[0].coreId;
1179  unsigned threadCt = 1;
1180  unsigned lastThreadId = threadInfo[0].threadId;
1181 
1182  // intra-pkg consistency checks
1183  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1184  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1185 
1186  for (i = 1; i < nApics; i++) {
1187  if (threadInfo[i].pkgId != lastPkgId) {
1188  nCores++;
1189  pkgCt++;
1190  lastPkgId = threadInfo[i].pkgId;
1191  if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1192  coreCt = 1;
1193  lastCoreId = threadInfo[i].coreId;
1194  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1195  threadCt = 1;
1196  lastThreadId = threadInfo[i].threadId;
1197 
1198  //
1199  // This is a different package, so go on to the next iteration
1200  // without doing any consistency checks. Reset the consistency
1201  // check vars, though.
1202  //
1203  prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1204  prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1205  continue;
1206  }
1207 
1208  if (threadInfo[i].coreId != lastCoreId) {
1209  nCores++;
1210  coreCt++;
1211  lastCoreId = threadInfo[i].coreId;
1212  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1213  threadCt = 1;
1214  lastThreadId = threadInfo[i].threadId;
1215  }
1216  else if (threadInfo[i].threadId != lastThreadId) {
1217  threadCt++;
1218  lastThreadId = threadInfo[i].threadId;
1219  }
1220  else {
1221  __kmp_free(threadInfo);
1222  KMP_CPU_FREE(oldMask);
1223  *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1224  return -1;
1225  }
1226 
1227  //
1228  // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
1229  // fields agree between all the threads bound to a given package.
1230  //
1231  if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
1232  || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1233  __kmp_free(threadInfo);
1234  KMP_CPU_FREE(oldMask);
1235  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1236  return -1;
1237  }
1238  }
1239  nPackages = pkgCt;
1240  if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1241  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1242 
1243  //
1244  // When affinity is off, this routine will still be called to set
1245  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1246  // nCoresPerPkg, & nPackages. Make sure all these vars are set
1247  // correctly, and return now if affinity is not enabled.
1248  //
1249  __kmp_ncores = nCores;
1250  if (__kmp_affinity_verbose) {
1251  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1252  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1253 
1254  KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1255  if (__kmp_affinity_respect_mask) {
1256  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1257  } else {
1258  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1259  }
1260  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1261  if (__kmp_affinity_uniform_topology()) {
1262  KMP_INFORM(Uniform, "KMP_AFFINITY");
1263  } else {
1264  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1265  }
1266  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1267  __kmp_nThreadsPerCore, __kmp_ncores);
1268 
1269  }
1270  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
1271  KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
1272  __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
1273  for (i = 0; i < nApics; ++i) {
1274  __kmp_pu_os_idx[i] = threadInfo[i].osId;
1275  }
1276  if (__kmp_affinity_type == affinity_none) {
1277  __kmp_free(threadInfo);
1278  KMP_CPU_FREE(oldMask);
1279  return 0;
1280  }
1281 
1282  //
1283  // Now that we've determined the number of packages, the number of cores
1284  // per package, and the number of threads per core, we can construct the
1285  // data structure that is to be returned.
1286  //
1287  int pkgLevel = 0;
1288  int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1289  int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1290  unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
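 // For example (hypothetical machine): 2 packages with 8 cores each and HT
 // disabled gives pkgLevel = 0, coreLevel = 1, threadLevel = -1, so depth = 2
 // and each Address carries just {pkgId, coreId}.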
1291 
1292  KMP_ASSERT(depth > 0);
1293  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1294 
1295  for (i = 0; i < nApics; ++i) {
1296  Address addr(depth);
1297  unsigned os = threadInfo[i].osId;
1298  int d = 0;
1299 
1300  if (pkgLevel >= 0) {
1301  addr.labels[d++] = threadInfo[i].pkgId;
1302  }
1303  if (coreLevel >= 0) {
1304  addr.labels[d++] = threadInfo[i].coreId;
1305  }
1306  if (threadLevel >= 0) {
1307  addr.labels[d++] = threadInfo[i].threadId;
1308  }
1309  (*address2os)[i] = AddrUnsPair(addr, os);
1310  }
1311 
1312  if (__kmp_affinity_gran_levels < 0) {
1313  //
1314  // Set the granularity level based on what levels are modeled
1315  // in the machine topology map.
1316  //
1317  __kmp_affinity_gran_levels = 0;
1318  if ((threadLevel >= 0)
1319  && (__kmp_affinity_gran > affinity_gran_thread)) {
1320  __kmp_affinity_gran_levels++;
1321  }
1322  if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1323  __kmp_affinity_gran_levels++;
1324  }
1325  if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1326  __kmp_affinity_gran_levels++;
1327  }
1328  }
1329 
1330  if (__kmp_affinity_verbose) {
1331  __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1332  coreLevel, threadLevel);
1333  }
1334 
1335  __kmp_free(threadInfo);
1336  KMP_CPU_FREE(oldMask);
1337  return depth;
1338 }
1339 
1340 
1341 //
1342 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1343 // architectures support a newer interface for specifying the x2APIC Ids,
1344 // based on cpuid leaf 11.
1345 //
1346 static int
1347 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1348  kmp_i18n_id_t *const msg_id)
1349 {
1350  kmp_cpuid buf;
1351 
1352  *address2os = NULL;
1353  *msg_id = kmp_i18n_null;
1354 
1355  //
1356  // Check to see if cpuid leaf 11 is supported.
1357  //
1358  __kmp_x86_cpuid(0, 0, &buf);
1359  if (buf.eax < 11) {
1360  *msg_id = kmp_i18n_str_NoLeaf11Support;
1361  return -1;
1362  }
1363  __kmp_x86_cpuid(11, 0, &buf);
1364  if (buf.ebx == 0) {
1365  *msg_id = kmp_i18n_str_NoLeaf11Support;
1366  return -1;
1367  }
1368 
1369  //
1370  // Find the number of levels in the machine topology. While we're at it,
1371  // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will
1372  // try to get more accurate values later by explicitly counting them,
1373  // but get reasonable defaults now, in case we return early.
1374  //
1375  int level;
1376  int threadLevel = -1;
1377  int coreLevel = -1;
1378  int pkgLevel = -1;
1379  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1380 
1381  for (level = 0;; level++) {
1382  if (level > 31) {
1383  //
1384  // FIXME: Hack for DPD200163180
1385  //
1386  // If level is big then something went wrong -> exiting
1387  //
1388  // There could actually be 32 valid levels in the machine topology,
1389  // but so far, the only machine we have seen which does not exit
1390  // this loop before iteration 32 has fubar x2APIC settings.
1391  //
1392  // For now, just reject this case based upon loop trip count.
1393  //
1394  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1395  return -1;
1396  }
1397  __kmp_x86_cpuid(11, level, &buf);
1398  if (buf.ebx == 0) {
1399  if (pkgLevel < 0) {
1400  //
1401  // Will infer nPackages from __kmp_xproc
1402  //
1403  pkgLevel = level;
1404  level++;
1405  }
1406  break;
1407  }
1408  int kind = (buf.ecx >> 8) & 0xff;
1409  if (kind == 1) {
1410  //
1411  // SMT level
1412  //
1413  threadLevel = level;
1414  coreLevel = -1;
1415  pkgLevel = -1;
1416  __kmp_nThreadsPerCore = buf.ebx & 0xffff;
1417  if (__kmp_nThreadsPerCore == 0) {
1418  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1419  return -1;
1420  }
1421  }
1422  else if (kind == 2) {
1423  //
1424  // core level
1425  //
1426  coreLevel = level;
1427  pkgLevel = -1;
1428  nCoresPerPkg = buf.ebx & 0xffff;
1429  if (nCoresPerPkg == 0) {
1430  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1431  return -1;
1432  }
1433  }
1434  else {
1435  if (level <= 0) {
1436  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1437  return -1;
1438  }
1439  if (pkgLevel >= 0) {
1440  continue;
1441  }
1442  pkgLevel = level;
1443  nPackages = buf.ebx & 0xffff;
1444  if (nPackages == 0) {
1445  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1446  return -1;
1447  }
1448  }
1449  }
1450  int depth = level;
1451 
1452  //
1453  // In the above loop, "level" was counted from the finest level (usually
1454  // thread) to the coarsest. The caller expects that we will place the
1455  // labels in (*address2os)[].first.labels[] in the inverse order, so
1456  // we need to invert the vars saying which level means what.
1457  //
1458  if (threadLevel >= 0) {
1459  threadLevel = depth - threadLevel - 1;
1460  }
1461  if (coreLevel >= 0) {
1462  coreLevel = depth - coreLevel - 1;
1463  }
1464  KMP_DEBUG_ASSERT(pkgLevel >= 0);
1465  pkgLevel = depth - pkgLevel - 1;
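 // Example of the inversion (hypothetical depth-3 machine): if the SMT level
 // is reported at sub-leaf 0, the core level at sub-leaf 1, and the package is
 // inferred at sub-leaf 2, then depth = 3 and after inverting threadLevel = 2,
 // coreLevel = 1, pkgLevel = 0, matching the label order
 // labels[0] = package, ..., labels[depth - 1] = thread.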
1466 
1467  //
1468  // The algorithm used starts by setting the affinity to each available
1469  // thread and retrieving info from the cpuid instruction, so if we are
1470  // not capable of calling __kmp_get_system_affinity() and
1471  // __kmp_set_system_affinity(), then we need to do something else - use
1472  // the defaults that we calculated from issuing cpuid without binding
1473  // to each proc.
1474  //
1475  if (! KMP_AFFINITY_CAPABLE())
1476  {
1477  //
1478  // Hack to try and infer the machine topology using only the data
1479  // available from cpuid on the current thread, and __kmp_xproc.
1480  //
1481  KMP_ASSERT(__kmp_affinity_type == affinity_none);
1482 
1483  __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1484  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1485  if (__kmp_affinity_verbose) {
1486  KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1487  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1488  if (__kmp_affinity_uniform_topology()) {
1489  KMP_INFORM(Uniform, "KMP_AFFINITY");
1490  } else {
1491  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1492  }
1493  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1494  __kmp_nThreadsPerCore, __kmp_ncores);
1495  }
1496  return 0;
1497  }
1498 
1499  //
1500  //
1501  // From here on, we can assume that it is safe to call
1502  // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1503  // even if __kmp_affinity_type = affinity_none.
1504  //
1505 
1506  //
1507  // Save the affinity mask for the current thread.
1508  //
1509  kmp_affin_mask_t *oldMask;
1510  KMP_CPU_ALLOC(oldMask);
1511  __kmp_get_system_affinity(oldMask, TRUE);
1512 
1513  //
1514  // Allocate the data structure to be returned.
1515  //
1516  AddrUnsPair *retval = (AddrUnsPair *)
1517  __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1518 
1519  //
1520  // Run through each of the available contexts, binding the current thread
1521  // to it, and obtaining the pertinent information using the cpuid instr.
1522  //
1523  unsigned int proc;
1524  int nApics = 0;
1525  KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
1526  //
1527  // Skip this proc if it is not included in the machine model.
1528  //
1529  if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
1530  continue;
1531  }
1532  KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1533 
1534  __kmp_affinity_dispatch->bind_thread(proc);
1535 
1536  //
1537  // Extract the labels for each level in the machine topology map
1538  // from the Apic ID.
1539  //
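 // Worked example (hypothetical shifts): if sub-leaf 0 (SMT) reports shift 1
 // and sub-leaf 1 (core) reports shift 5, the labels extracted below are
 // thread = apicId & 0x1, core = (apicId & 0x1f) >> 1, and
 // package = apicId >> 5 (taken when the terminating sub-leaf is reached).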
1540  Address addr(depth);
1541  int prev_shift = 0;
1542 
1543  for (level = 0; level < depth; level++) {
1544  __kmp_x86_cpuid(11, level, &buf);
1545  unsigned apicId = buf.edx;
1546  if (buf.ebx == 0) {
1547  if (level != depth - 1) {
1548  KMP_CPU_FREE(oldMask);
1549  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1550  return -1;
1551  }
1552  addr.labels[depth - level - 1] = apicId >> prev_shift;
1553  level++;
1554  break;
1555  }
1556  int shift = buf.eax & 0x1f;
1557  int mask = (1 << shift) - 1;
1558  addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1559  prev_shift = shift;
1560  }
1561  if (level != depth) {
1562  KMP_CPU_FREE(oldMask);
1563  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1564  return -1;
1565  }
1566 
1567  retval[nApics] = AddrUnsPair(addr, proc);
1568  nApics++;
1569  }
1570 
1571  //
1572  // We've collected all the info we need.
1573  // Restore the old affinity mask for this thread.
1574  //
1575  __kmp_set_system_affinity(oldMask, TRUE);
1576 
1577  //
1578  // If there's only one thread context to bind to, return now.
1579  //
1580  KMP_ASSERT(nApics > 0);
1581  if (nApics == 1) {
1582  __kmp_ncores = nPackages = 1;
1583  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1584  if (__kmp_affinity_verbose) {
1585  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1586  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1587 
1588  KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1589  if (__kmp_affinity_respect_mask) {
1590  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1591  } else {
1592  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1593  }
1594  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1595  KMP_INFORM(Uniform, "KMP_AFFINITY");
1596  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1597  __kmp_nThreadsPerCore, __kmp_ncores);
1598  }
1599 
1600  if (__kmp_affinity_type == affinity_none) {
1601  __kmp_free(retval);
1602  KMP_CPU_FREE(oldMask);
1603  return 0;
1604  }
1605 
1606  //
1607  // Form an Address object which only includes the package level.
1608  //
1609  Address addr(1);
1610  addr.labels[0] = retval[0].first.labels[pkgLevel];
1611  retval[0].first = addr;
1612 
1613  if (__kmp_affinity_gran_levels < 0) {
1614  __kmp_affinity_gran_levels = 0;
1615  }
1616 
1617  if (__kmp_affinity_verbose) {
1618  __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1619  }
1620 
1621  *address2os = retval;
1622  KMP_CPU_FREE(oldMask);
1623  return 1;
1624  }
1625 
1626  //
1627  // Sort the table by physical Id.
1628  //
1629  qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1630 
1631  //
1632  // Find the radix at each of the levels.
1633  //
1634  unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1635  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1636  unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1637  unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1638  for (level = 0; level < depth; level++) {
1639  totals[level] = 1;
1640  maxCt[level] = 1;
1641  counts[level] = 1;
1642  last[level] = retval[0].first.labels[level];
1643  }
1644 
1645  //
1646  // From here on, the iteration variable "level" runs from the finest
1647  // level to the coarsest, i.e. we iterate forward through
1648  // (*address2os)[].first.labels[] - in the previous loops, we iterated
1649  // backwards.
1650  //
1651  for (proc = 1; (int)proc < nApics; proc++) {
1652  int level;
1653  for (level = 0; level < depth; level++) {
1654  if (retval[proc].first.labels[level] != last[level]) {
1655  int j;
1656  for (j = level + 1; j < depth; j++) {
1657  totals[j]++;
1658  counts[j] = 1;
1659  // The line below causes incorrect topology information to be printed
1660  // in case the max value for some level (maxCt[level]) is encountered earlier than
1661  // some smaller value while going through the array.
1662  // For example, suppose pkg0 has 4 cores and pkg1 has 2 cores. Then maxCt[1] == 2
1663  // whereas it must be 4.
1664  // TODO!!! Check if it can be commented safely
1665  //maxCt[j] = 1;
1666  last[j] = retval[proc].first.labels[j];
1667  }
1668  totals[level]++;
1669  counts[level]++;
1670  if (counts[level] > maxCt[level]) {
1671  maxCt[level] = counts[level];
1672  }
1673  last[level] = retval[proc].first.labels[level];
1674  break;
1675  }
1676  else if (level == depth - 1) {
1677  __kmp_free(last);
1678  __kmp_free(maxCt);
1679  __kmp_free(counts);
1680  __kmp_free(totals);
1681  __kmp_free(retval);
1682  KMP_CPU_FREE(oldMask);
1683  *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1684  return -1;
1685  }
1686  }
1687  }
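  // Note added for exposition (not in the original source): after the loop
  // above, totals[level] holds the total number of distinct objects seen at
  // that level across the machine, counts[level] the number seen so far under
  // the current parent, and maxCt[level] the largest such group seen, e.g.
  // maxCt at the core level is the largest number of cores observed in any
  // one package.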
1688 
1689  //
1690  // When affinity is off, this routine will still be called to set
1691  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1692  // nCoresPerPkg, & nPackages. Make sure all these vars are set
1693  // correctly, and return if affinity is not enabled.
1694  //
1695  if (threadLevel >= 0) {
1696  __kmp_nThreadsPerCore = maxCt[threadLevel];
1697  }
1698  else {
1699  __kmp_nThreadsPerCore = 1;
1700  }
1701  nPackages = totals[pkgLevel];
1702 
1703  if (coreLevel >= 0) {
1704  __kmp_ncores = totals[coreLevel];
1705  nCoresPerPkg = maxCt[coreLevel];
1706  }
1707  else {
1708  __kmp_ncores = nPackages;
1709  nCoresPerPkg = 1;
1710  }
1711 
1712  //
1713  // Check to see if the machine topology is uniform
1714  //
1715  unsigned prod = maxCt[0];
1716  for (level = 1; level < depth; level++) {
1717  prod *= maxCt[level];
1718  }
1719  bool uniform = (prod == totals[level - 1]);
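  // Worked example (added for exposition, not in the original source): on a
  // hypothetical three-level machine with 2 packages, 4 cores per package and
  // 2 threads per core, the per-level maxima multiply to prod = 2 * 4 * 2 = 16,
  // which matches the 16 leaf entries, so the topology is uniform. If one
  // package had only 2 cores there would be just 12 leaves while prod stayed
  // 16, and the topology would be reported as non-uniform.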
1720 
1721  //
1722  // Print the machine topology summary.
1723  //
1724  if (__kmp_affinity_verbose) {
1725  char mask[KMP_AFFIN_MASK_PRINT_LEN];
1726  __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1727 
1728  KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1729  if (__kmp_affinity_respect_mask) {
1730  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1731  } else {
1732  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1733  }
1734  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1735  if (uniform) {
1736  KMP_INFORM(Uniform, "KMP_AFFINITY");
1737  } else {
1738  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1739  }
1740 
1741  kmp_str_buf_t buf;
1742  __kmp_str_buf_init(&buf);
1743 
1744  __kmp_str_buf_print(&buf, "%d", totals[0]);
1745  for (level = 1; level <= pkgLevel; level++) {
1746  __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1747  }
1748  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1749  __kmp_nThreadsPerCore, __kmp_ncores);
1750 
1751  __kmp_str_buf_free(&buf);
1752  }
1753  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
1754  KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
1755  __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
1756  for (proc = 0; (int)proc < nApics; ++proc) {
1757  __kmp_pu_os_idx[proc] = retval[proc].second;
1758  }
1759  if (__kmp_affinity_type == affinity_none) {
1760  __kmp_free(last);
1761  __kmp_free(maxCt);
1762  __kmp_free(counts);
1763  __kmp_free(totals);
1764  __kmp_free(retval);
1765  KMP_CPU_FREE(oldMask);
1766  return 0;
1767  }
1768 
1769  //
1770  // Find any levels with radix 1, and remove them from the map
1771  // (except for the package level).
1772  //
1773  int new_depth = 0;
1774  for (level = 0; level < depth; level++) {
1775  if ((maxCt[level] == 1) && (level != pkgLevel)) {
1776  continue;
1777  }
1778  new_depth++;
1779  }
1780 
1781  //
1782  // If we are removing any levels, allocate a new vector to return,
1783  // and copy the relevant information to it.
1784  //
1785  if (new_depth != depth) {
1786  AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1787  sizeof(AddrUnsPair) * nApics);
1788  for (proc = 0; (int)proc < nApics; proc++) {
1789  Address addr(new_depth);
1790  new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1791  }
1792  int new_level = 0;
1793  int newPkgLevel = -1;
1794  int newCoreLevel = -1;
1795  int newThreadLevel = -1;
1796  int i;
1797  for (level = 0; level < depth; level++) {
1798  if ((maxCt[level] == 1)
1799  && (level != pkgLevel)) {
1800  //
1801  // Remove this level. Never remove the package level
1802  //
1803  continue;
1804  }
1805  if (level == pkgLevel) {
1806  newPkgLevel = level;
1807  }
1808  if (level == coreLevel) {
1809  newCoreLevel = level;
1810  }
1811  if (level == threadLevel) {
1812  newThreadLevel = level;
1813  }
1814  for (proc = 0; (int)proc < nApics; proc++) {
1815  new_retval[proc].first.labels[new_level]
1816  = retval[proc].first.labels[level];
1817  }
1818  new_level++;
1819  }
1820 
1821  __kmp_free(retval);
1822  retval = new_retval;
1823  depth = new_depth;
1824  pkgLevel = newPkgLevel;
1825  coreLevel = newCoreLevel;
1826  threadLevel = newThreadLevel;
1827  }
1828 
1829  if (__kmp_affinity_gran_levels < 0) {
1830  //
1831  // Set the granularity level based on what levels are modeled
1832  // in the machine topology map.
1833  //
1834  __kmp_affinity_gran_levels = 0;
1835  if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1836  __kmp_affinity_gran_levels++;
1837  }
1838  if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1839  __kmp_affinity_gran_levels++;
1840  }
1841  if (__kmp_affinity_gran > affinity_gran_package) {
1842  __kmp_affinity_gran_levels++;
1843  }
1844  }
1845 
1846  if (__kmp_affinity_verbose) {
1847  __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1848  coreLevel, threadLevel);
1849  }
1850 
1851  __kmp_free(last);
1852  __kmp_free(maxCt);
1853  __kmp_free(counts);
1854  __kmp_free(totals);
1855  KMP_CPU_FREE(oldMask);
1856  *address2os = retval;
1857  return depth;
1858 }
1859 
1860 
1861 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1862 
1863 
1864 #define osIdIndex 0
1865 #define threadIdIndex 1
1866 #define coreIdIndex 2
1867 #define pkgIdIndex 3
1868 #define nodeIdIndex 4
1869 
1870 typedef unsigned *ProcCpuInfo;
1871 static unsigned maxIndex = pkgIdIndex;
1872 
1873 
1874 static int
1875 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1876 {
1877  const unsigned *aa = (const unsigned *)a;
1878  const unsigned *bb = (const unsigned *)b;
1879  if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1880  if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1881  return 0;
1882 }
1883 
1884 
1885 static int
1886 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1887 {
1888  unsigned i;
1889  const unsigned *aa = *((const unsigned **)a);
1890  const unsigned *bb = *((const unsigned **)b);
1891  for (i = maxIndex; ; i--) {
1892  if (aa[i] < bb[i]) return -1;
1893  if (aa[i] > bb[i]) return 1;
1894  if (i == osIdIndex) break;
1895  }
1896  return 0;
1897 }
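// Note added for exposition: because the comparison above starts at maxIndex
// (the node/package fields) and works down toward osIdIndex, qsort() with this
// comparator groups the records by package, then core, then thread id, then
// OS id - e.g. a record with (pkg 0, core 3) sorts before one with
// (pkg 1, core 0).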
1898 
1899 
1900 //
1901 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1902 // affinity map.
1903 //
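// As an illustration only (hypothetical values), a record in the expected
// format looks like:
//
//   processor       : 0
//   physical id     : 0
//   core id         : 0
//
// Records are separated by blank lines. The parser below also accepts optional
// "thread id" and "node_<n> id" fields, which may appear in an alternate file
// supplied in the same format.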
1904 static int
1905 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1906  kmp_i18n_id_t *const msg_id, FILE *f)
1907 {
1908  *address2os = NULL;
1909  *msg_id = kmp_i18n_null;
1910 
1911  //
1912  // Scan the file, count the number of "processor" (osId) fields,
1913  // and find the highest value of <n> for a node_<n> field.
1914  //
1915  char buf[256];
1916  unsigned num_records = 0;
1917  while (! feof(f)) {
1918  buf[sizeof(buf) - 1] = 1;
1919  if (! fgets(buf, sizeof(buf), f)) {
1920  //
1921  // Read errors presumably because of EOF
1922  //
1923  break;
1924  }
1925 
1926  char s1[] = "processor";
1927  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1928  num_records++;
1929  continue;
1930  }
1931 
1932  //
1933  // FIXME - this will match "node_<n> <garbage>"
1934  //
1935  unsigned level;
1936  if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
1937  if (nodeIdIndex + level >= maxIndex) {
1938  maxIndex = nodeIdIndex + level;
1939  }
1940  continue;
1941  }
1942  }
1943 
1944  //
1945  // Check for empty file / no valid processor records, or too many.
1946  // The number of records can't exceed the number of valid bits in the
1947  // affinity mask.
1948  //
1949  if (num_records == 0) {
1950  *line = 0;
1951  *msg_id = kmp_i18n_str_NoProcRecords;
1952  return -1;
1953  }
1954  if (num_records > (unsigned)__kmp_xproc) {
1955  *line = 0;
1956  *msg_id = kmp_i18n_str_TooManyProcRecords;
1957  return -1;
1958  }
1959 
1960  //
1961  // Set the file pointer back to the beginning, so that we can scan the
1962  // file again, this time performing a full parse of the data.
1963  // Allocate a vector of ProcCpuInfo objects, where we will place the data.
1964  // Adding an extra element at the end allows us to remove a lot of extra
1965  // checks for termination conditions.
1966  //
1967  if (fseek(f, 0, SEEK_SET) != 0) {
1968  *line = 0;
1969  *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1970  return -1;
1971  }
1972 
1973  //
1974  // Allocate the array of records to store the proc info in. The dummy
1975  // element at the end makes the logic in filling them out easier to code.
1976  //
1977  unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1978  * sizeof(unsigned *));
1979  unsigned i;
1980  for (i = 0; i <= num_records; i++) {
1981  threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1982  * sizeof(unsigned));
1983  }
1984 
1985 #define CLEANUP_THREAD_INFO \
1986  for (i = 0; i <= num_records; i++) { \
1987  __kmp_free(threadInfo[i]); \
1988  } \
1989  __kmp_free(threadInfo);
1990 
1991  //
1992  // A value of UINT_MAX means that we didn't find the field
1993  //
1994  unsigned __index;
1995 
1996 #define INIT_PROC_INFO(p) \
1997  for (__index = 0; __index <= maxIndex; __index++) { \
1998  (p)[__index] = UINT_MAX; \
1999  }
2000 
2001  for (i = 0; i <= num_records; i++) {
2002  INIT_PROC_INFO(threadInfo[i]);
2003  }
2004 
2005  unsigned num_avail = 0;
2006  *line = 0;
2007  while (! feof(f)) {
2008  //
2009  // Create an inner scoping level, so that all the goto targets at the
2010  // end of the loop appear in an outer scoping level. This avoids
2011  // warnings about jumping past an initialization to a target in the
2012  // same block.
2013  //
2014  {
2015  buf[sizeof(buf) - 1] = 1;
2016  bool long_line = false;
2017  if (! fgets(buf, sizeof(buf), f)) {
2018  //
2019  // Read errors presumably because of EOF
2020  //
2021  // If there is valid data in threadInfo[num_avail], then fake
2022  // a blank line to ensure that the last address gets parsed.
2023  //
2024  bool valid = false;
2025  for (i = 0; i <= maxIndex; i++) {
2026  if (threadInfo[num_avail][i] != UINT_MAX) {
2027  valid = true;
2028  }
2029  }
2030  if (! valid) {
2031  break;
2032  }
2033  buf[0] = 0;
2034  } else if (!buf[sizeof(buf) - 1]) {
2035  //
2036  // The line is longer than the buffer. Set a flag and don't
2037  // emit an error if we were going to ignore the line, anyway.
2038  //
2039  long_line = true;
2040 
2041 #define CHECK_LINE \
2042  if (long_line) { \
2043  CLEANUP_THREAD_INFO; \
2044  *msg_id = kmp_i18n_str_LongLineCpuinfo; \
2045  return -1; \
2046  }
2047  }
2048  (*line)++;
2049 
2050  char s1[] = "processor";
2051  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2052  CHECK_LINE;
2053  char *p = strchr(buf + sizeof(s1) - 1, ':');
2054  unsigned val;
2055  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2056  if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
2057  threadInfo[num_avail][osIdIndex] = val;
2058 #if KMP_OS_LINUX && USE_SYSFS_INFO
2059  char path[256];
2060  KMP_SNPRINTF(path, sizeof(path),
2061  "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
2062  threadInfo[num_avail][osIdIndex]);
2063  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
2064 
2065  KMP_SNPRINTF(path, sizeof(path),
2066  "/sys/devices/system/cpu/cpu%u/topology/core_id",
2067  threadInfo[num_avail][osIdIndex]);
2068  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
2069  continue;
2070 #else
2071  }
2072  char s2[] = "physical id";
2073  if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
2074  CHECK_LINE;
2075  char *p = strchr(buf + sizeof(s2) - 1, ':');
2076  unsigned val;
2077  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2078  if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
2079  threadInfo[num_avail][pkgIdIndex] = val;
2080  continue;
2081  }
2082  char s3[] = "core id";
2083  if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2084  CHECK_LINE;
2085  char *p = strchr(buf + sizeof(s3) - 1, ':');
2086  unsigned val;
2087  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2088  if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
2089  threadInfo[num_avail][coreIdIndex] = val;
2090  continue;
2091 #endif // KMP_OS_LINUX && USE_SYSFS_INFO
2092  }
2093  char s4[] = "thread id";
2094  if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2095  CHECK_LINE;
2096  char *p = strchr(buf + sizeof(s4) - 1, ':');
2097  unsigned val;
2098  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2099  if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
2100  threadInfo[num_avail][threadIdIndex] = val;
2101  continue;
2102  }
2103  unsigned level;
2104  if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
2105  CHECK_LINE;
2106  char *p = strchr(buf + sizeof(s4) - 1, ':');
2107  unsigned val;
2108  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2109  KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2110  if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
2111  threadInfo[num_avail][nodeIdIndex + level] = val;
2112  continue;
2113  }
2114 
2115  //
2116  // We didn't recognize the leading token on the line.
2117  // There are lots of leading tokens that we don't recognize -
2118  // if the line isn't empty, go on to the next line.
2119  //
2120  if ((*buf != 0) && (*buf != '\n')) {
2121  //
2122  // If the line is longer than the buffer, read characters
2123  // until we find a newline.
2124  //
2125  if (long_line) {
2126  int ch;
2127  while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
2128  }
2129  continue;
2130  }
2131 
2132  //
2133  // A newline has signalled the end of the processor record.
2134  // Check that there aren't too many procs specified.
2135  //
2136  if ((int)num_avail == __kmp_xproc) {
2137  CLEANUP_THREAD_INFO;
2138  *msg_id = kmp_i18n_str_TooManyEntries;
2139  return -1;
2140  }
2141 
2142  //
2143  // Check for missing fields. The osId field must be there, and we
2144  // currently require that the physical id field is specified, also.
2145  //
2146  if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2147  CLEANUP_THREAD_INFO;
2148  *msg_id = kmp_i18n_str_MissingProcField;
2149  return -1;
2150  }
2151  if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2152  CLEANUP_THREAD_INFO;
2153  *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2154  return -1;
2155  }
2156 
2157  //
2158  // Skip this proc if it is not included in the machine model.
2159  //
2160  if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], __kmp_affin_fullMask)) {
2161  INIT_PROC_INFO(threadInfo[num_avail]);
2162  continue;
2163  }
2164 
2165  //
2166  // We have a successful parse of this proc's info.
2167  // Increment the counter, and prepare for the next proc.
2168  //
2169  num_avail++;
2170  KMP_ASSERT(num_avail <= num_records);
2171  INIT_PROC_INFO(threadInfo[num_avail]);
2172  }
2173  continue;
2174 
2175  no_val:
2176  CLEANUP_THREAD_INFO;
2177  *msg_id = kmp_i18n_str_MissingValCpuinfo;
2178  return -1;
2179 
2180  dup_field:
2181  CLEANUP_THREAD_INFO;
2182  *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2183  return -1;
2184  }
2185  *line = 0;
2186 
2187 # if KMP_MIC && REDUCE_TEAM_SIZE
2188  unsigned teamSize = 0;
2189 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2190 
2191  // check for num_records == __kmp_xproc ???
2192 
2193  //
2194  // If there's only one thread context to bind to, form an Address object
2195  // with depth 1 and return immediately (or, if affinity is off, set
2196  // address2os to NULL and return).
2197  //
2198  // If it is configured to omit the package level when there is only a
2199  // single package, the logic at the end of this routine won't work if
2200  // there is only a single thread - it would try to form an Address
2201  // object with depth 0.
2202  //
2203  KMP_ASSERT(num_avail > 0);
2204  KMP_ASSERT(num_avail <= num_records);
2205  if (num_avail == 1) {
2206  __kmp_ncores = 1;
2207  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2208  if (__kmp_affinity_verbose) {
2209  if (! KMP_AFFINITY_CAPABLE()) {
2210  KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2211  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2212  KMP_INFORM(Uniform, "KMP_AFFINITY");
2213  }
2214  else {
2215  char buf[KMP_AFFIN_MASK_PRINT_LEN];
2216  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2217  __kmp_affin_fullMask);
2218  KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2219  if (__kmp_affinity_respect_mask) {
2220  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2221  } else {
2222  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2223  }
2224  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2225  KMP_INFORM(Uniform, "KMP_AFFINITY");
2226  }
2227  int index;
2228  kmp_str_buf_t buf;
2229  __kmp_str_buf_init(&buf);
2230  __kmp_str_buf_print(&buf, "1");
2231  for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2232  __kmp_str_buf_print(&buf, " x 1");
2233  }
2234  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2235  __kmp_str_buf_free(&buf);
2236  }
2237 
2238  if (__kmp_affinity_type == affinity_none) {
2239  CLEANUP_THREAD_INFO;
2240  return 0;
2241  }
2242 
2243  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2244  Address addr(1);
2245  addr.labels[0] = threadInfo[0][pkgIdIndex];
2246  (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2247 
2248  if (__kmp_affinity_gran_levels < 0) {
2249  __kmp_affinity_gran_levels = 0;
2250  }
2251 
2252  if (__kmp_affinity_verbose) {
2253  __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2254  }
2255 
2256  CLEANUP_THREAD_INFO;
2257  return 1;
2258  }
2259 
2260  //
2261  // Sort the threadInfo table by physical Id.
2262  //
2263  qsort(threadInfo, num_avail, sizeof(*threadInfo),
2264  __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2265 
2266  //
2267  // The table is now sorted by pkgId / coreId / threadId, but we really
2268  // don't know the radix of any of the fields. pkgId's may be sparsely
2269  // assigned among the chips on a system. Although coreId's are usually
2270  // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2271  // [0..threadsPerCore-1], we don't want to make any such assumptions.
2272  //
2273  // For that matter, we don't know what coresPerPkg and threadsPerCore
2274  // (or the total # packages) are at this point - we want to determine
2275  // that now. We only have an upper bound on the first two figures.
2276  //
2277  unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2278  * sizeof(unsigned));
2279  unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2280  * sizeof(unsigned));
2281  unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2282  * sizeof(unsigned));
2283  unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2284  * sizeof(unsigned));
2285 
2286  bool assign_thread_ids = false;
2287  unsigned threadIdCt;
2288  unsigned index;
2289 
2290  restart_radix_check:
2291  threadIdCt = 0;
2292 
2293  //
2294  // Initialize the counter arrays with data from threadInfo[0].
2295  //
2296  if (assign_thread_ids) {
2297  if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2298  threadInfo[0][threadIdIndex] = threadIdCt++;
2299  }
2300  else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2301  threadIdCt = threadInfo[0][threadIdIndex] + 1;
2302  }
2303  }
2304  for (index = 0; index <= maxIndex; index++) {
2305  counts[index] = 1;
2306  maxCt[index] = 1;
2307  totals[index] = 1;
2308  lastId[index] = threadInfo[0][index];
2309  }
2310 
2311  //
2312  // Run through the rest of the OS procs.
2313  //
2314  for (i = 1; i < num_avail; i++) {
2315  //
2316  // Find the most significant index whose id differs
2317  // from the id for the previous OS proc.
2318  //
2319  for (index = maxIndex; index >= threadIdIndex; index--) {
2320  if (assign_thread_ids && (index == threadIdIndex)) {
2321  //
2322  // Auto-assign the thread id field if it wasn't specified.
2323  //
2324  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2325  threadInfo[i][threadIdIndex] = threadIdCt++;
2326  }
2327 
2328  //
2329  // Apparently the thread id field was specified for some
2330  // entries and not others. Start the thread id counter
2331  // off at the next higher thread id.
2332  //
2333  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2334  threadIdCt = threadInfo[i][threadIdIndex] + 1;
2335  }
2336  }
2337  if (threadInfo[i][index] != lastId[index]) {
2338  //
2339  // Run through all indices which are less significant,
2340  // and reset the counts to 1.
2341  //
2342  // At all levels up to and including index, we need to
2343  // increment the totals and record the last id.
2344  //
2345  unsigned index2;
2346  for (index2 = threadIdIndex; index2 < index; index2++) {
2347  totals[index2]++;
2348  if (counts[index2] > maxCt[index2]) {
2349  maxCt[index2] = counts[index2];
2350  }
2351  counts[index2] = 1;
2352  lastId[index2] = threadInfo[i][index2];
2353  }
2354  counts[index]++;
2355  totals[index]++;
2356  lastId[index] = threadInfo[i][index];
2357 
2358  if (assign_thread_ids && (index > threadIdIndex)) {
2359 
2360 # if KMP_MIC && REDUCE_TEAM_SIZE
2361  //
2362  // The default team size is the total #threads in the machine
2363  // minus 1 thread for every core that has 3 or more threads.
2364  //
2365  teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2366 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2367 
2368  //
2369  // Restart the thread counter, as we are on a new core.
2370  //
2371  threadIdCt = 0;
2372 
2373  //
2374  // Auto-assign the thread id field if it wasn't specified.
2375  //
2376  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2377  threadInfo[i][threadIdIndex] = threadIdCt++;
2378  }
2379 
2380  //
2381  // Apparently the thread id field was specified for some
2382  // entries and not others. Start the thread id counter
2383  // off at the next higher thread id.
2384  //
2385  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2386  threadIdCt = threadInfo[i][threadIdIndex] + 1;
2387  }
2388  }
2389  break;
2390  }
2391  }
2392  if (index < threadIdIndex) {
2393  //
2394  // If thread ids were specified, it is an error if they are not
2395  // unique. Also, check that we haven't already restarted the
2396  // loop (to be safe - shouldn't need to).
2397  //
2398  if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2399  || assign_thread_ids) {
2400  __kmp_free(lastId);
2401  __kmp_free(totals);
2402  __kmp_free(maxCt);
2403  __kmp_free(counts);
2404  CLEANUP_THREAD_INFO;
2405  *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2406  return -1;
2407  }
2408 
2409  //
2410  // If the thread ids were not specified and we see entries
2411  // that are duplicates, start the loop over and
2412  // assign the thread ids manually.
2413  //
2414  assign_thread_ids = true;
2415  goto restart_radix_check;
2416  }
2417  }
2418 
2419 # if KMP_MIC && REDUCE_TEAM_SIZE
2420  //
2421  // The default team size is the total #threads in the machine
2422  // minus 1 thread for every core that has 3 or more threads.
2423  //
2424  teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2425 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2426 
2427  for (index = threadIdIndex; index <= maxIndex; index++) {
2428  if (counts[index] > maxCt[index]) {
2429  maxCt[index] = counts[index];
2430  }
2431  }
2432 
2433  __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2434  nCoresPerPkg = maxCt[coreIdIndex];
2435  nPackages = totals[pkgIdIndex];
2436 
2437  //
2438  // Check to see if the machine topology is uniform
2439  //
2440  unsigned prod = totals[maxIndex];
2441  for (index = threadIdIndex; index < maxIndex; index++) {
2442  prod *= maxCt[index];
2443  }
2444  bool uniform = (prod == totals[threadIdIndex]);
2445 
2446  //
2447  // When affinity is off, this routine will still be called to set
2448  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
2449  // nCoresPerPkg, & nPackages. Make sure all these vars are set
2450  // correctly, and return now if affinity is not enabled.
2451  //
2452  __kmp_ncores = totals[coreIdIndex];
2453 
2454  if (__kmp_affinity_verbose) {
2455  if (! KMP_AFFINITY_CAPABLE()) {
2456  KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2457  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2458  if (uniform) {
2459  KMP_INFORM(Uniform, "KMP_AFFINITY");
2460  } else {
2461  KMP_INFORM(NonUniform, "KMP_AFFINITY");
2462  }
2463  }
2464  else {
2465  char buf[KMP_AFFIN_MASK_PRINT_LEN];
2466  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, __kmp_affin_fullMask);
2467  KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2468  if (__kmp_affinity_respect_mask) {
2469  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2470  } else {
2471  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2472  }
2473  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2474  if (uniform) {
2475  KMP_INFORM(Uniform, "KMP_AFFINITY");
2476  } else {
2477  KMP_INFORM(NonUniform, "KMP_AFFINITY");
2478  }
2479  }
2480  kmp_str_buf_t buf;
2481  __kmp_str_buf_init(&buf);
2482 
2483  __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2484  for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2485  __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2486  }
2487  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
2488  maxCt[threadIdIndex], __kmp_ncores);
2489 
2490  __kmp_str_buf_free(&buf);
2491  }
2492 
2493 # if KMP_MIC && REDUCE_TEAM_SIZE
2494  //
2495  // Set the default team size.
2496  //
2497  if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2498  __kmp_dflt_team_nth = teamSize;
2499  KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2500  __kmp_dflt_team_nth));
2501  }
2502 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2503 
2504  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
2505  KMP_DEBUG_ASSERT(num_avail == __kmp_avail_proc);
2506  __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
2507  for (i = 0; i < num_avail; ++i) { // fill the os indices
2508  __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex];
2509  }
2510 
2511  if (__kmp_affinity_type == affinity_none) {
2512  __kmp_free(lastId);
2513  __kmp_free(totals);
2514  __kmp_free(maxCt);
2515  __kmp_free(counts);
2516  CLEANUP_THREAD_INFO;
2517  return 0;
2518  }
2519 
2520  //
2521  // Count the number of levels which have more nodes at that level than
2522  // at the parent's level (with there being an implicit root node above
2523  // the top level). This is equivalent to saying that there is at least
2524  // one node at this level which has a sibling. These levels are in the
2525  // map, and the package level is always in the map.
2526  //
2527  bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2528  int level = 0;
2529  for (index = threadIdIndex; index < maxIndex; index++) {
2530  KMP_ASSERT(totals[index] >= totals[index + 1]);
2531  inMap[index] = (totals[index] > totals[index + 1]);
2532  }
2533  inMap[maxIndex] = (totals[maxIndex] > 1);
2534  inMap[pkgIdIndex] = true;
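  // Illustrative example (not in the original source): on a single-package
  // machine with 4 single-threaded cores, totals[] at the thread, core and
  // package indices are 4, 4 and 1, so the thread level (4 > 4 is false) is
  // dropped, the core level (4 > 1) is kept, and the package level is kept
  // unconditionally, giving a depth-2 map.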
2535 
2536  int depth = 0;
2537  for (index = threadIdIndex; index <= maxIndex; index++) {
2538  if (inMap[index]) {
2539  depth++;
2540  }
2541  }
2542  KMP_ASSERT(depth > 0);
2543 
2544  //
2545  // Construct the data structure that is to be returned.
2546  //
2547  *address2os = (AddrUnsPair*)
2548  __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2549  int pkgLevel = -1;
2550  int coreLevel = -1;
2551  int threadLevel = -1;
2552 
2553  for (i = 0; i < num_avail; ++i) {
2554  Address addr(depth);
2555  unsigned os = threadInfo[i][osIdIndex];
2556  int src_index;
2557  int dst_index = 0;
2558 
2559  for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2560  if (! inMap[src_index]) {
2561  continue;
2562  }
2563  addr.labels[dst_index] = threadInfo[i][src_index];
2564  if (src_index == pkgIdIndex) {
2565  pkgLevel = dst_index;
2566  }
2567  else if (src_index == coreIdIndex) {
2568  coreLevel = dst_index;
2569  }
2570  else if (src_index == threadIdIndex) {
2571  threadLevel = dst_index;
2572  }
2573  dst_index++;
2574  }
2575  (*address2os)[i] = AddrUnsPair(addr, os);
2576  }
2577 
2578  if (__kmp_affinity_gran_levels < 0) {
2579  //
2580  // Set the granularity level based on what levels are modeled
2581  // in the machine topology map.
2582  //
2583  unsigned src_index;
2584  __kmp_affinity_gran_levels = 0;
2585  for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2586  if (! inMap[src_index]) {
2587  continue;
2588  }
2589  switch (src_index) {
2590  case threadIdIndex:
2591  if (__kmp_affinity_gran > affinity_gran_thread) {
2592  __kmp_affinity_gran_levels++;
2593  }
2594 
2595  break;
2596  case coreIdIndex:
2597  if (__kmp_affinity_gran > affinity_gran_core) {
2598  __kmp_affinity_gran_levels++;
2599  }
2600  break;
2601 
2602  case pkgIdIndex:
2603  if (__kmp_affinity_gran > affinity_gran_package) {
2604  __kmp_affinity_gran_levels++;
2605  }
2606  break;
2607  }
2608  }
2609  }
2610 
2611  if (__kmp_affinity_verbose) {
2612  __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2613  coreLevel, threadLevel);
2614  }
2615 
2616  __kmp_free(inMap);
2617  __kmp_free(lastId);
2618  __kmp_free(totals);
2619  __kmp_free(maxCt);
2620  __kmp_free(counts);
2621  CLEANUP_THREAD_INFO;
2622  return depth;
2623 }
2624 
2625 
2626 //
2627 // Create and return a table of affinity masks, indexed by OS thread ID.
2628 // This routine handles OR'ing together all the affinity masks of threads
2629 // that are sufficiently close, if granularity > fine.
2630 //
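// For example (illustrative, assuming a granularity of "core" on a machine
// with 2 hardware threads per core): if OS procs 0 and 4 are the two threads
// of one core, the table entries for OS ids 0 and 4 both end up holding the
// same mask {0,4}, so a thread bound to either entry may float between those
// two OS procs.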
2631 static kmp_affin_mask_t *
2632 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2633  AddrUnsPair *address2os, unsigned numAddrs)
2634 {
2635  //
2636  // First form a table of affinity masks in order of OS thread id.
2637  //
2638  unsigned depth;
2639  unsigned maxOsId;
2640  unsigned i;
2641 
2642  KMP_ASSERT(numAddrs > 0);
2643  depth = address2os[0].first.depth;
2644 
2645  maxOsId = 0;
2646  for (i = 0; i < numAddrs; i++) {
2647  unsigned osId = address2os[i].second;
2648  if (osId > maxOsId) {
2649  maxOsId = osId;
2650  }
2651  }
2652  kmp_affin_mask_t *osId2Mask;
2653  KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId+1));
2654 
2655  //
2656  // Sort the address2os table according to physical order. Doing so
2657  // will put all threads on the same core/package/node in consecutive
2658  // locations.
2659  //
2660  qsort(address2os, numAddrs, sizeof(*address2os),
2661  __kmp_affinity_cmp_Address_labels);
2662 
2663  KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2664  if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2665  KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
2666  }
2667  if (__kmp_affinity_gran_levels >= (int)depth) {
2668  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2669  && (__kmp_affinity_type != affinity_none))) {
2670  KMP_WARNING(AffThreadsMayMigrate);
2671  }
2672  }
2673 
2674  //
2675  // Run through the table, forming the masks for all threads on each
2676  // core. Threads on the same core will have identical "Address"
2677  // objects, not considering the last level, which must be the thread
2678  // id. All threads on a core will appear consecutively.
2679  //
2680  unsigned unique = 0;
2681  unsigned j = 0; // index of 1st thread on core
2682  unsigned leader = 0;
2683  Address *leaderAddr = &(address2os[0].first);
2684  kmp_affin_mask_t *sum;
2685  KMP_CPU_ALLOC_ON_STACK(sum);
2686  KMP_CPU_ZERO(sum);
2687  KMP_CPU_SET(address2os[0].second, sum);
2688  for (i = 1; i < numAddrs; i++) {
2689  //
2690  // If this thread is sufficiently close to the leader (within the
2691  // granularity setting), then set the bit for this os thread in the
2692  // affinity mask for this group, and go on to the next thread.
2693  //
2694  if (leaderAddr->isClose(address2os[i].first,
2695  __kmp_affinity_gran_levels)) {
2696  KMP_CPU_SET(address2os[i].second, sum);
2697  continue;
2698  }
2699 
2700  //
2701  // For every thread in this group, copy the mask to the thread's
2702  // entry in the osId2Mask table. Mark the first address as a
2703  // leader.
2704  //
2705  for (; j < i; j++) {
2706  unsigned osId = address2os[j].second;
2707  KMP_DEBUG_ASSERT(osId <= maxOsId);
2708  kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2709  KMP_CPU_COPY(mask, sum);
2710  address2os[j].first.leader = (j == leader);
2711  }
2712  unique++;
2713 
2714  //
2715  // Start a new mask.
2716  //
2717  leader = i;
2718  leaderAddr = &(address2os[i].first);
2719  KMP_CPU_ZERO(sum);
2720  KMP_CPU_SET(address2os[i].second, sum);
2721  }
2722 
2723  //
2724  // For every thread in the last group, copy the mask to the thread's
2725  // entry in the osId2Mask table.
2726  //
2727  for (; j < i; j++) {
2728  unsigned osId = address2os[j].second;
2729  KMP_DEBUG_ASSERT(osId <= maxOsId);
2730  kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2731  KMP_CPU_COPY(mask, sum);
2732  address2os[j].first.leader = (j == leader);
2733  }
2734  unique++;
2735  KMP_CPU_FREE_FROM_STACK(sum);
2736 
2737  *maxIndex = maxOsId;
2738  *numUnique = unique;
2739  return osId2Mask;
2740 }
2741 
2742 
2743 //
2744 // Stuff for the affinity proclist parsers. It's easier to declare these vars
2745  // as file-static than to try to pass them through the calling sequence of
2746 // the recursive-descent OMP_PLACES parser.
2747 //
2748 static kmp_affin_mask_t *newMasks;
2749 static int numNewMasks;
2750 static int nextNewMask;
2751 
2752 #define ADD_MASK(_mask) \
2753  { \
2754  if (nextNewMask >= numNewMasks) { \
2755  int i; \
2756  numNewMasks *= 2; \
2757  kmp_affin_mask_t* temp; \
2758  KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \
2759  for(i=0;i<numNewMasks/2;i++) { \
2760  kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i); \
2761  kmp_affin_mask_t* dest = KMP_CPU_INDEX(temp, i); \
2762  KMP_CPU_COPY(dest, src); \
2763  } \
2764  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks/2); \
2765  newMasks = temp; \
2766  } \
2767  KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
2768  nextNewMask++; \
2769  }
2770 
2771 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2772  { \
2773  if (((_osId) > _maxOsId) || \
2774  (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
2775  if (__kmp_affinity_verbose || (__kmp_affinity_warnings \
2776  && (__kmp_affinity_type != affinity_none))) { \
2777  KMP_WARNING(AffIgnoreInvalidProcID, _osId); \
2778  } \
2779  } \
2780  else { \
2781  ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
2782  } \
2783  }
2784 
2785 
2786 //
2787 // Re-parse the proclist (for the explicit affinity type), and form the list
2788 // of affinity newMasks indexed by gtid.
2789 //
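// As an illustration only (hypothetical proc ids, fine granularity assumed so
// that each osId2Mask entry contains just that proc), a proclist such as
//
//   "0,2-6:2,{8,9}"
//
// yields one mask per top-level item: {0}, then {2}, {4}, {6} from the
// stride-2 range, and a single union mask {8,9} from the brace-enclosed set.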
2790 static void
2791 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2792  unsigned int *out_numMasks, const char *proclist,
2793  kmp_affin_mask_t *osId2Mask, int maxOsId)
2794 {
2795  int i;
2796  const char *scan = proclist;
2797  const char *next = proclist;
2798 
2799  //
2800  // Allocate a temporary vector of masks that we can grow;
2801  // the ADD_MASK macro doubles its size as needed.
2802  //
2803  numNewMasks = 2;
2804  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
2805  nextNewMask = 0;
2806  kmp_affin_mask_t *sumMask;
2807  KMP_CPU_ALLOC(sumMask);
2808  int setSize = 0;
2809 
2810  for (;;) {
2811  int start, end, stride;
2812 
2813  SKIP_WS(scan);
2814  next = scan;
2815  if (*next == '\0') {
2816  break;
2817  }
2818 
2819  if (*next == '{') {
2820  int num;
2821  setSize = 0;
2822  next++; // skip '{'
2823  SKIP_WS(next);
2824  scan = next;
2825 
2826  //
2827  // Read the first integer in the set.
2828  //
2829  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2830  "bad proclist");
2831  SKIP_DIGITS(next);
2832  num = __kmp_str_to_int(scan, *next);
2833  KMP_ASSERT2(num >= 0, "bad explicit proc list");
2834 
2835  //
2836  // Copy the mask for that osId to the sum (union) mask.
2837  //
2838  if ((num > maxOsId) ||
2839  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2840  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2841  && (__kmp_affinity_type != affinity_none))) {
2842  KMP_WARNING(AffIgnoreInvalidProcID, num);
2843  }
2844  KMP_CPU_ZERO(sumMask);
2845  }
2846  else {
2847  KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2848  setSize = 1;
2849  }
2850 
2851  for (;;) {
2852  //
2853  // Check for end of set.
2854  //
2855  SKIP_WS(next);
2856  if (*next == '}') {
2857  next++; // skip '}'
2858  break;
2859  }
2860 
2861  //
2862  // Skip optional comma.
2863  //
2864  if (*next == ',') {
2865  next++;
2866  }
2867  SKIP_WS(next);
2868 
2869  //
2870  // Read the next integer in the set.
2871  //
2872  scan = next;
2873  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2874  "bad explicit proc list");
2875 
2876  SKIP_DIGITS(next);
2877  num = __kmp_str_to_int(scan, *next);
2878  KMP_ASSERT2(num >= 0, "bad explicit proc list");
2879 
2880  //
2881  // Add the mask for that osId to the sum mask.
2882  //
2883  if ((num > maxOsId) ||
2884  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2885  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2886  && (__kmp_affinity_type != affinity_none))) {
2887  KMP_WARNING(AffIgnoreInvalidProcID, num);
2888  }
2889  }
2890  else {
2891  KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2892  setSize++;
2893  }
2894  }
2895  if (setSize > 0) {
2896  ADD_MASK(sumMask);
2897  }
2898 
2899  SKIP_WS(next);
2900  if (*next == ',') {
2901  next++;
2902  }
2903  scan = next;
2904  continue;
2905  }
2906 
2907  //
2908  // Read the first integer.
2909  //
2910  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2911  SKIP_DIGITS(next);
2912  start = __kmp_str_to_int(scan, *next);
2913  KMP_ASSERT2(start >= 0, "bad explicit proc list");
2914  SKIP_WS(next);
2915 
2916  //
2917  // If this isn't a range, then add a mask to the list and go on.
2918  //
2919  if (*next != '-') {
2920  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2921 
2922  //
2923  // Skip optional comma.
2924  //
2925  if (*next == ',') {
2926  next++;
2927  }
2928  scan = next;
2929  continue;
2930  }
2931 
2932  //
2933  // This is a range. Skip over the '-' and read in the 2nd int.
2934  //
2935  next++; // skip '-'
2936  SKIP_WS(next);
2937  scan = next;
2938  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2939  SKIP_DIGITS(next);
2940  end = __kmp_str_to_int(scan, *next);
2941  KMP_ASSERT2(end >= 0, "bad explicit proc list");
2942 
2943  //
2944  // Check for a stride parameter
2945  //
2946  stride = 1;
2947  SKIP_WS(next);
2948  if (*next == ':') {
2949  //
2950  // A stride is specified. Skip over the ':' and read the 3rd int.
2951  //
2952  int sign = +1;
2953  next++; // skip ':'
2954  SKIP_WS(next);
2955  scan = next;
2956  if (*next == '-') {
2957  sign = -1;
2958  next++;
2959  SKIP_WS(next);
2960  scan = next;
2961  }
2962  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2963  "bad explicit proc list");
2964  SKIP_DIGITS(next);
2965  stride = __kmp_str_to_int(scan, *next);
2966  KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2967  stride *= sign;
2968  }
2969 
2970  //
2971  // Do some range checks.
2972  //
2973  KMP_ASSERT2(stride != 0, "bad explicit proc list");
2974  if (stride > 0) {
2975  KMP_ASSERT2(start <= end, "bad explicit proc list");
2976  }
2977  else {
2978  KMP_ASSERT2(start >= end, "bad explicit proc list");
2979  }
2980  KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2981 
2982  //
2983  // Add the mask for each OS proc # to the list.
2984  //
2985  if (stride > 0) {
2986  do {
2987  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2988  start += stride;
2989  } while (start <= end);
2990  }
2991  else {
2992  do {
2993  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2994  start += stride;
2995  } while (start >= end);
2996  }
2997 
2998  //
2999  // Skip optional comma.
3000  //
3001  SKIP_WS(next);
3002  if (*next == ',') {
3003  next++;
3004  }
3005  scan = next;
3006  }
3007 
3008  *out_numMasks = nextNewMask;
3009  if (nextNewMask == 0) {
3010  *out_masks = NULL;
3011  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3012  return;
3013  }
3014  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3015  for(i = 0; i < nextNewMask; i++) {
3016  kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i);
3017  kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i);
3018  KMP_CPU_COPY(dest, src);
3019  }
3020  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3021  KMP_CPU_FREE(sumMask);
3022 }
3023 
3024 
3025 # if OMP_40_ENABLED
3026 
3027 /*-----------------------------------------------------------------------------
3028 
3029 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
3030 places. Again, here is the grammar:
3031 
3032 place_list := place
3033 place_list := place , place_list
3034 place := num
3035 place := place : num
3036 place := place : num : signed
3037 place := { subplace_list }
3038 place := ! place // (lowest priority)
3039 subplace_list := subplace
3040 subplace_list := subplace , subplace_list
3041 subplace := num
3042 subplace := num : num
3043 subplace := num : num : signed
3044 signed := num
3045 signed := + signed
3046 signed := - signed
3047 
3048 -----------------------------------------------------------------------------*/
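// Illustrative examples only (hypothetical proc ids): the grammar above
// accepts place lists such as
//
//   {0,1},{2,3},{4,5}     three explicit places of two OS procs each
//   {0:4}:2:4             place {0,1,2,3} replicated twice with stride 4,
//                         i.e. {0,1,2,3} and {4,5,6,7}
//   !{0}                  the complement of the place containing OS proc 0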
3049 
3050 static void
3051 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
3052  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3053 {
3054  const char *next;
3055 
3056  for (;;) {
3057  int start, count, stride, i;
3058 
3059  //
3060  // Read in the starting proc id
3061  //
3062  SKIP_WS(*scan);
3063  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3064  "bad explicit places list");
3065  next = *scan;
3066  SKIP_DIGITS(next);
3067  start = __kmp_str_to_int(*scan, *next);
3068  KMP_ASSERT(start >= 0);
3069  *scan = next;
3070 
3071  //
3072  // valid follow sets are ',' ':' and '}'
3073  //
3074  SKIP_WS(*scan);
3075  if (**scan == '}' || **scan == ',') {
3076  if ((start > maxOsId) ||
3077  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3078  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3079  && (__kmp_affinity_type != affinity_none))) {
3080  KMP_WARNING(AffIgnoreInvalidProcID, start);
3081  }
3082  }
3083  else {
3084  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3085  (*setSize)++;
3086  }
3087  if (**scan == '}') {
3088  break;
3089  }
3090  (*scan)++; // skip ','
3091  continue;
3092  }
3093  KMP_ASSERT2(**scan == ':', "bad explicit places list");
3094  (*scan)++; // skip ':'
3095 
3096  //
3097  // Read count parameter
3098  //
3099  SKIP_WS(*scan);
3100  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3101  "bad explicit places list");
3102  next = *scan;
3103  SKIP_DIGITS(next);
3104  count = __kmp_str_to_int(*scan, *next);
3105  KMP_ASSERT(count >= 0);
3106  *scan = next;
3107 
3108  //
3109  // valid follow sets are ',' ':' and '}'
3110  //
3111  SKIP_WS(*scan);
3112  if (**scan == '}' || **scan == ',') {
3113  for (i = 0; i < count; i++) {
3114  if ((start > maxOsId) ||
3115  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3116  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3117  && (__kmp_affinity_type != affinity_none))) {
3118  KMP_WARNING(AffIgnoreInvalidProcID, start);
3119  }
3120  break; // don't proliferate warnings for large count
3121  }
3122  else {
3123  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3124  start++;
3125  (*setSize)++;
3126  }
3127  }
3128  if (**scan == '}') {
3129  break;
3130  }
3131  (*scan)++; // skip ','
3132  continue;
3133  }
3134  KMP_ASSERT2(**scan == ':', "bad explicit places list");
3135  (*scan)++; // skip ':'
3136 
3137  //
3138  // Read stride parameter
3139  //
3140  int sign = +1;
3141  for (;;) {
3142  SKIP_WS(*scan);
3143  if (**scan == '+') {
3144  (*scan)++; // skip '+'
3145  continue;
3146  }
3147  if (**scan == '-') {
3148  sign *= -1;
3149  (*scan)++; // skip '-'
3150  continue;
3151  }
3152  break;
3153  }
3154  SKIP_WS(*scan);
3155  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3156  "bad explicit places list");
3157  next = *scan;
3158  SKIP_DIGITS(next);
3159  stride = __kmp_str_to_int(*scan, *next);
3160  KMP_ASSERT(stride >= 0);
3161  *scan = next;
3162  stride *= sign;
3163 
3164  //
3165  // valid follow sets are ',' and '}'
3166  //
3167  SKIP_WS(*scan);
3168  if (**scan == '}' || **scan == ',') {
3169  for (i = 0; i < count; i++) {
3170  if ((start > maxOsId) ||
3171  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3172  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3173  && (__kmp_affinity_type != affinity_none))) {
3174  KMP_WARNING(AffIgnoreInvalidProcID, start);
3175  }
3176  break; // don't proliferate warnings for large count
3177  }
3178  else {
3179  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3180  start += stride;
3181  (*setSize)++;
3182  }
3183  }
3184  if (**scan == '}') {
3185  break;
3186  }
3187  (*scan)++; // skip ','
3188  continue;
3189  }
3190 
3191  KMP_ASSERT2(0, "bad explicit places list");
3192  }
3193 }
3194 
3195 
3196 static void
3197 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3198  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3199 {
3200  const char *next;
3201 
3202  //
3203  // valid follow sets are '{' '!' and num
3204  //
3205  SKIP_WS(*scan);
3206  if (**scan == '{') {
3207  (*scan)++; // skip '{'
3208  __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
3209  setSize);
3210  KMP_ASSERT2(**scan == '}', "bad explicit places list");
3211  (*scan)++; // skip '}'
3212  }
3213  else if (**scan == '!') {
3214  (*scan)++; // skip '!'
3215  __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3216  KMP_CPU_COMPLEMENT(maxOsId, tempMask);
3217  }
3218  else if ((**scan >= '0') && (**scan <= '9')) {
3219  next = *scan;
3220  SKIP_DIGITS(next);
3221  int num = __kmp_str_to_int(*scan, *next);
3222  KMP_ASSERT(num >= 0);
3223  if ((num > maxOsId) ||
3224  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3225  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3226  && (__kmp_affinity_type != affinity_none))) {
3227  KMP_WARNING(AffIgnoreInvalidProcID, num);
3228  }
3229  }
3230  else {
3231  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3232  (*setSize)++;
3233  }
3234  *scan = next; // skip num
3235  }
3236  else {
3237  KMP_ASSERT2(0, "bad explicit places list");
3238  }
3239 }
3240 
3241 
3242 //static void
3243 void
3244 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3245  unsigned int *out_numMasks, const char *placelist,
3246  kmp_affin_mask_t *osId2Mask, int maxOsId)
3247 {
3248  int i,j,count,stride,sign;
3249  const char *scan = placelist;
3250  const char *next = placelist;
3251 
3252  numNewMasks = 2;
3253  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
3254  nextNewMask = 0;
3255 
3256  // tempMask is modified based on the previous or initial
3257  // place to form the current place
3258  // previousMask contains the previous place
3259  kmp_affin_mask_t *tempMask;
3260  kmp_affin_mask_t *previousMask;
3261  KMP_CPU_ALLOC(tempMask);
3262  KMP_CPU_ZERO(tempMask);
3263  KMP_CPU_ALLOC(previousMask);
3264  KMP_CPU_ZERO(previousMask);
3265  int setSize = 0;
3266 
3267  for (;;) {
3268  __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3269 
3270  //
3271  // valid follow sets are ',' ':' and EOL
3272  //
3273  SKIP_WS(scan);
3274  if (*scan == '\0' || *scan == ',') {
3275  if (setSize > 0) {
3276  ADD_MASK(tempMask);
3277  }
3278  KMP_CPU_ZERO(tempMask);
3279  setSize = 0;
3280  if (*scan == '\0') {
3281  break;
3282  }
3283  scan++; // skip ','
3284  continue;
3285  }
3286 
3287  KMP_ASSERT2(*scan == ':', "bad explicit places list");
3288  scan++; // skip ':'
3289 
3290  //
3291  // Read count parameter
3292  //
3293  SKIP_WS(scan);
3294  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3295  "bad explicit places list");
3296  next = scan;
3297  SKIP_DIGITS(next);
3298  count = __kmp_str_to_int(scan, *next);
3299  KMP_ASSERT(count >= 0);
3300  scan = next;
3301 
3302  //
3303  // valid follow sets are ',' ':' and EOL
3304  //
3305  SKIP_WS(scan);
3306  if (*scan == '\0' || *scan == ',') {
3307  stride = +1;
3308  }
3309  else {
3310  KMP_ASSERT2(*scan == ':', "bad explicit places list");
3311  scan++; // skip ':'
3312 
3313  //
3314  // Read stride parameter
3315  //
3316  sign = +1;
3317  for (;;) {
3318  SKIP_WS(scan);
3319  if (*scan == '+') {
3320  scan++; // skip '+'
3321  continue;
3322  }
3323  if (*scan == '-') {
3324  sign *= -1;
3325  scan++; // skip '-'
3326  continue;
3327  }
3328  break;
3329  }
3330  SKIP_WS(scan);
3331  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3332  "bad explicit places list");
3333  next = scan;
3334  SKIP_DIGITS(next);
3335  stride = __kmp_str_to_int(scan, *next);
3336  KMP_DEBUG_ASSERT(stride >= 0);
3337  scan = next;
3338  stride *= sign;
3339  }
3340 
3341  // Add places determined by initial_place : count : stride
3342  for (i = 0; i < count; i++) {
3343  if (setSize == 0) {
3344  break;
3345  }
3346  // Add the current place, then build the next place (tempMask) from that
3347  KMP_CPU_COPY(previousMask, tempMask);
3348  ADD_MASK(previousMask);
3349  KMP_CPU_ZERO(tempMask);
3350  setSize = 0;
3351  KMP_CPU_SET_ITERATE(j, previousMask) {
3352  if (! KMP_CPU_ISSET(j, previousMask)) {
3353  continue;
3354  }
3355  if ((j+stride > maxOsId) || (j+stride < 0) ||
3356  (! KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
3357  (! KMP_CPU_ISSET(j+stride, KMP_CPU_INDEX(osId2Mask, j+stride)))) {
3358  if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3359  && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
3360  KMP_WARNING(AffIgnoreInvalidProcID, j+stride);
3361  }
3362  continue;
3363  }
3364  KMP_CPU_SET(j+stride, tempMask);
3365  setSize++;
3366  }
3367  }
3368  KMP_CPU_ZERO(tempMask);
3369  setSize = 0;
3370 
3371  //
3372  // valid follow sets are ',' and EOL
3373  //
3374  SKIP_WS(scan);
3375  if (*scan == '\0') {
3376  break;
3377  }
3378  if (*scan == ',') {
3379  scan++; // skip ','
3380  continue;
3381  }
3382 
3383  KMP_ASSERT2(0, "bad explicit places list");
3384  }
3385 
3386  *out_numMasks = nextNewMask;
3387  if (nextNewMask == 0) {
3388  *out_masks = NULL;
3389  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3390  return;
3391  }
3392  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3393  KMP_CPU_FREE(tempMask);
3394  KMP_CPU_FREE(previousMask);
3395  for(i = 0; i < nextNewMask; i++) {
3396  kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i);
3397  kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i);
3398  KMP_CPU_COPY(dest, src);
3399  }
3400  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3401 }
3402 
3403 # endif /* OMP_40_ENABLED */
3404 
3405 #undef ADD_MASK
3406 #undef ADD_MASK_OSID
3407 
3408 static void
3409 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3410 {
3411  int i, j, k, n_old = 0, n_new = 0, proc_num = 0;
3412  if (__kmp_place_num_sockets == 0 &&
3413  __kmp_place_num_cores == 0 &&
3414  __kmp_place_num_threads_per_core == 0 )
3415  goto _exit; // no topology limiting actions requested, exit
3416  if (__kmp_place_num_sockets == 0)
3417  __kmp_place_num_sockets = nPackages; // use all available sockets
3418  if (__kmp_place_num_cores == 0)
3419  __kmp_place_num_cores = nCoresPerPkg; // use all available cores
3420  if (__kmp_place_num_threads_per_core == 0 ||
3421  __kmp_place_num_threads_per_core > __kmp_nThreadsPerCore)
3422  __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts
3423 
3424  if ( !__kmp_affinity_uniform_topology() ) {
3425  KMP_WARNING( AffHWSubsetNonUniform );
3426  goto _exit; // don't support non-uniform topology
3427  }
3428  if ( depth > 3 ) {
3429  KMP_WARNING( AffHWSubsetNonThreeLevel );
3430  goto _exit; // don't support non-3-level topology
3431  }
3432  if (__kmp_place_socket_offset + __kmp_place_num_sockets > nPackages) {
3433  KMP_WARNING(AffHWSubsetManySockets);
3434  goto _exit;
3435  }
3436  if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
3437  KMP_WARNING( AffHWSubsetManyCores );
3438  goto _exit;
3439  }
3440 
3441  AddrUnsPair *newAddr;
3442  if (pAddr) // pAddr is NULL in case of affinity_none
3443  newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3444  __kmp_place_num_sockets * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3445 
3446  for (i = 0; i < nPackages; ++i) {
3447  if (i < __kmp_place_socket_offset ||
3448  i >= __kmp_place_socket_offset + __kmp_place_num_sockets) {
3449  n_old += nCoresPerPkg * __kmp_nThreadsPerCore; // skip not-requested socket
3450  if (__kmp_pu_os_idx != NULL) {
3451  for (j = 0; j < nCoresPerPkg; ++j) { // walk through skipped socket
3452  for (k = 0; k < __kmp_nThreadsPerCore; ++k) {
3453  KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3454  ++proc_num;
3455  }
3456  }
3457  }
3458  } else {
3459  for (j = 0; j < nCoresPerPkg; ++j) { // walk through requested socket
3460  if (j < __kmp_place_core_offset ||
3461  j >= __kmp_place_core_offset + __kmp_place_num_cores) {
3462  n_old += __kmp_nThreadsPerCore; // skip not-requested core
3463  if (__kmp_pu_os_idx != NULL) {
3464  for (k = 0; k < __kmp_nThreadsPerCore; ++k) { // walk through skipped core
3465  KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3466  ++proc_num;
3467  }
3468  }
3469  } else {
3470  for (k = 0; k < __kmp_nThreadsPerCore; ++k) { // walk through requested core
3471  if (k < __kmp_place_num_threads_per_core) {
3472  if (pAddr)
3473  newAddr[n_new] = (*pAddr)[n_old]; // collect requested thread's data
3474  n_new++;
3475  } else {
3476  if (__kmp_pu_os_idx != NULL)
3477  KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3478  }
3479  n_old++;
3480  ++proc_num;
3481  }
3482  }
3483  }
3484  }
3485  }
3486  KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore);
3487  KMP_DEBUG_ASSERT(n_new == __kmp_place_num_sockets * __kmp_place_num_cores *
3488  __kmp_place_num_threads_per_core);
3489 
3490  nPackages = __kmp_place_num_sockets; // correct nPackages
3491  nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg
3492  __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3493  __kmp_avail_proc = n_new; // correct avail_proc
3494  __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores
3495 
3496  if (pAddr) {
3497  __kmp_free( *pAddr );
3498  *pAddr = newAddr; // replace old topology with new one
3499  }
3500 _exit:
3501  if (__kmp_pu_os_idx != NULL) {
3502  __kmp_free(__kmp_pu_os_idx);
3503  __kmp_pu_os_idx = NULL;
3504  }
3505 }
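// Worked example (illustrative, not from the source): on a uniform machine
// with 2 packages x 8 cores x 2 threads, requesting 1 socket, 4 cores and
// 2 threads per core (all offsets 0) keeps the first 8 of the 32 topology
// entries, clears the OS procs of every skipped socket/core/thread from
// __kmp_affin_fullMask, and rewrites nPackages, nCoresPerPkg,
// __kmp_nThreadsPerCore, __kmp_avail_proc and __kmp_ncores accordingly.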
3506 
3507 //
3508 // This function figures out the deepest level at which there is at least one cluster/core
3509 // with more than one processing unit bound to it.
3510 //
3511 static int
3512 __kmp_affinity_find_core_level(const AddrUnsPair *address2os, int nprocs, int bottom_level)
3513 {
3514  int core_level = 0;
3515 
3516  for( int i = 0; i < nprocs; i++ ) {
3517  for( int j = bottom_level; j > 0; j-- ) {
3518  if( address2os[i].first.labels[j] > 0 ) {
3519  if( core_level < ( j - 1 ) ) {
3520  core_level = j - 1;
3521  }
3522  }
3523  }
3524  }
3525  return core_level;
3526 }
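// For example (illustrative): with a three-level (package, core, thread) map
// and bottom_level == 2, any nonzero thread label yields core_level == 1 (the
// core level); if every core has a single thread but some package has more
// than one core, the nonzero core labels yield core_level == 0, i.e. the
// package level is the deepest level whose objects hold more than one
// processing unit.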
3527 
3528 //
3529  // This function counts the number of clusters/cores at a given level.
3530 //
3531 static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os, int nprocs, int bottom_level, int core_level)
3532 {
3533  int ncores = 0;
3534  int i, j;
3535 
3536  j = bottom_level;
3537  for( i = 0; i < nprocs; i++ ) {
3538  for ( j = bottom_level; j > core_level; j-- ) {
3539  if( ( i + 1 ) < nprocs ) {
3540  if( address2os[i + 1].first.labels[j] > 0 ) {
3541  break;
3542  }
3543  }
3544  }
3545  if( j == core_level ) {
3546  ncores++;
3547  }
3548  }
3549  if( j > core_level ) {
3550  //
3551  // In case of ( nprocs < __kmp_avail_proc ) we may end up too deep and miss one core.
3552  // May occur when called from __kmp_affinity_find_core().
3553  //
3554  ncores++;
3555  }
3556  return ncores;
3557 }
3558 
3559 //
3560  // This function finds which cluster/core a given processing unit is bound to.
3561 //
3562 static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc, int bottom_level, int core_level)
3563 {
3564  return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level, core_level) - 1;
3565 }
3566 
3567 //
3568 // This function finds the maximal number of processing units bound to a cluster/core at the given level.
3569 //
3570 static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os, int nprocs, int bottom_level, int core_level)
3571 {
3572  int maxprocpercore = 0;
3573 
3574  if( core_level < bottom_level ) {
3575  for( int i = 0; i < nprocs; i++ ) {
3576  int percore = address2os[i].first.labels[core_level + 1] + 1;
3577 
3578  if( percore > maxprocpercore ) {
3579  maxprocpercore = percore;
3580  }
3581  }
3582  } else {
3583  maxprocpercore = 1;
3584  }
3585  return maxprocpercore;
3586 }
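//
// Note (editorial): labels[core_level + 1] is the zero-based index of a PU
// within its core/cluster, so the maximum of (labels[core_level + 1] + 1)
// over all PUs is the size of the most populated core. If core_level is
// already the bottom level, each core maps to a single PU.
//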
3587 
3588 static AddrUnsPair *address2os = NULL;
3589 static int * procarr = NULL;
3590 static int __kmp_aff_depth = 0;
3591 
3592 #define KMP_EXIT_AFF_NONE \
3593  KMP_ASSERT(__kmp_affinity_type == affinity_none); \
3594  KMP_ASSERT(address2os == NULL); \
3595  __kmp_apply_thread_places(NULL, 0); \
3596  return;
3597 
3598 static int
3599 __kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
3600 {
3601  const Address *aa = (const Address *)&(((AddrUnsPair *)a)
3602  ->first);
3603  const Address *bb = (const Address *)&(((AddrUnsPair *)b)
3604  ->first);
3605  unsigned depth = aa->depth;
3606  unsigned i;
3607  KMP_DEBUG_ASSERT(depth == bb->depth);
3608  KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
3609  KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
3610  for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
3611  int j = depth - i - 1;
3612  if (aa->childNums[j] < bb->childNums[j]) return -1;
3613  if (aa->childNums[j] > bb->childNums[j]) return 1;
3614  }
3615  for (; i < depth; i++) {
3616  int j = i - __kmp_affinity_compact;
3617  if (aa->childNums[j] < bb->childNums[j]) return -1;
3618  if (aa->childNums[j] > bb->childNums[j]) return 1;
3619  }
3620  return 0;
3621 }
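//
// Worked example (editorial, assuming a 3-level package/core/thread tree,
// i.e. depth == 3): with __kmp_affinity_compact == 1 the first loop compares
// childNums[2] (the thread index within a core) and the second loop then
// compares childNums[0] (package) and childNums[1] (core). Sorting on that
// key groups the 0-th thread of every core first, then the 1-st thread of
// every core, and so on, while __kmp_affinity_compact == 0 keeps the plain
// package/core/thread order.
//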
3622 
3623 static void
3624 __kmp_aux_affinity_initialize(void)
3625 {
3626  if (__kmp_affinity_masks != NULL) {
3627  KMP_ASSERT(__kmp_affin_fullMask != NULL);
3628  return;
3629  }
3630 
3631  //
3632  // Create the "full" mask - this defines all of the processors that we
3633  // consider to be in the machine model. If respect is set, then it is
3634  // the initialization thread's affinity mask. Otherwise, it is all
3635  // processors that we know about on the machine.
3636  //
3637  if (__kmp_affin_fullMask == NULL) {
3638  KMP_CPU_ALLOC(__kmp_affin_fullMask);
3639  }
3640  if (KMP_AFFINITY_CAPABLE()) {
3641  if (__kmp_affinity_respect_mask) {
3642  __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
3643 
3644  //
3645  // Count the number of available processors.
3646  //
3647  unsigned i;
3648  __kmp_avail_proc = 0;
3649  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
3650  if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
3651  continue;
3652  }
3653  __kmp_avail_proc++;
3654  }
3655  if (__kmp_avail_proc > __kmp_xproc) {
3656  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3657  && (__kmp_affinity_type != affinity_none))) {
3658  KMP_WARNING(ErrorInitializeAffinity);
3659  }
3660  __kmp_affinity_type = affinity_none;
3661  KMP_AFFINITY_DISABLE();
3662  return;
3663  }
3664  }
3665  else {
3666  __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
3667  __kmp_avail_proc = __kmp_xproc;
3668  }
3669  }
3670 
3671  int depth = -1;
3672  kmp_i18n_id_t msg_id = kmp_i18n_null;
3673 
3674  //
3675  // For backward compatibility, setting KMP_CPUINFO_FILE =>
3676  // KMP_TOPOLOGY_METHOD=cpuinfo
3677  //
3678  if ((__kmp_cpuinfo_file != NULL) &&
3679  (__kmp_affinity_top_method == affinity_top_method_all)) {
3680  __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3681  }
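 // For example (editorial note), running with KMP_CPUINFO_FILE=/path/to/cpuinfo
 // and no explicit KMP_TOPOLOGY_METHOD behaves as if
 // KMP_TOPOLOGY_METHOD=cpuinfo had been set; the named file is then parsed
 // instead of /proc/cpuinfo.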
3682 
3683  if (__kmp_affinity_top_method == affinity_top_method_all) {
3684  //
3685  // In the default code path, errors are not fatal - we just try using
3686  // another method. We only emit a warning message if affinity is on,
3687  // or the verbose flag is set, and the nowarnings flag was not set.
3688  //
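 // (Editorial summary) The probing order below is: hwloc when the hwloc
 // dispatch layer is active, then x2APIC and legacy APIC decoding on
 // x86/x86_64, then /proc/cpuinfo on Linux, then Windows processor groups,
 // and finally the flat OS-proc map as a last resort.
 //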
3689  const char *file_name = NULL;
3690  int line = 0;
3691 # if KMP_USE_HWLOC
3692  if (depth < 0 && __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
3693  if (__kmp_affinity_verbose) {
3694  KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
3695  }
3696  if(!__kmp_hwloc_error) {
3697  depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
3698  if (depth == 0) {
3699  KMP_EXIT_AFF_NONE;
3700  } else if(depth < 0 && __kmp_affinity_verbose) {
3701  KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
3702  }
3703  } else if(__kmp_affinity_verbose) {
3704  KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
3705  }
3706  }
3707 # endif
3708 
3709 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3710 
3711  if (depth < 0) {
3712  if (__kmp_affinity_verbose) {
3713  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3714  }
3715 
3716  file_name = NULL;
3717  depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3718  if (depth == 0) {
3719  KMP_EXIT_AFF_NONE;
3720  }
3721 
3722  if (depth < 0) {
3723  if (__kmp_affinity_verbose) {
3724  if (msg_id != kmp_i18n_null) {
3725  KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3726  KMP_I18N_STR(DecodingLegacyAPIC));
3727  }
3728  else {
3729  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
3730  }
3731  }
3732 
3733  file_name = NULL;
3734  depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3735  if (depth == 0) {
3736  KMP_EXIT_AFF_NONE;
3737  }
3738  }
3739  }
3740 
3741 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3742 
3743 # if KMP_OS_LINUX
3744 
3745  if (depth < 0) {
3746  if (__kmp_affinity_verbose) {
3747  if (msg_id != kmp_i18n_null) {
3748  KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3749  }
3750  else {
3751  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3752  }
3753  }
3754 
3755  FILE *f = fopen("/proc/cpuinfo", "r");
3756  if (f == NULL) {
3757  msg_id = kmp_i18n_str_CantOpenCpuinfo;
3758  }
3759  else {
3760  file_name = "/proc/cpuinfo";
3761  depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3762  fclose(f);
3763  if (depth == 0) {
3764  KMP_EXIT_AFF_NONE;
3765  }
3766  }
3767  }
3768 
3769 # endif /* KMP_OS_LINUX */
3770 
3771 # if KMP_GROUP_AFFINITY
3772 
3773  if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
3774  if (__kmp_affinity_verbose) {
3775  KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3776  }
3777 
3778  depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3779  KMP_ASSERT(depth != 0);
3780  }
3781 
3782 # endif /* KMP_GROUP_AFFINITY */
3783 
3784  if (depth < 0) {
3785  if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
3786  if (file_name == NULL) {
3787  KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
3788  }
3789  else if (line == 0) {
3790  KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
3791  }
3792  else {
3793  KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
3794  }
3795  }
3796  // FIXME - print msg if msg_id = kmp_i18n_null ???
3797 
3798  file_name = "";
3799  depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3800  if (depth == 0) {
3801  KMP_EXIT_AFF_NONE;
3802  }
3803  KMP_ASSERT(depth > 0);
3804  KMP_ASSERT(address2os != NULL);
3805  }
3806  }
3807 
3808  //
3809  // If the user has specified that a particular topology discovery method
3810  // is to be used, then we abort if that method fails. The exception is
3811  // group affinity, which might have been implicitly set.
3812  //
3813 
3814 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3815 
3816  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3817  if (__kmp_affinity_verbose) {
3818  KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3819  KMP_I18N_STR(Decodingx2APIC));
3820  }
3821 
3822  depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3823  if (depth == 0) {
3824  KMP_EXIT_AFF_NONE;
3825  }
3826  if (depth < 0) {
3827  KMP_ASSERT(msg_id != kmp_i18n_null);
3828  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3829  }
3830  }
3831  else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3832  if (__kmp_affinity_verbose) {
3833  KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3834  KMP_I18N_STR(DecodingLegacyAPIC));
3835  }
3836 
3837  depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3838  if (depth == 0) {
3839  KMP_EXIT_AFF_NONE;
3840  }
3841  if (depth < 0) {
3842  KMP_ASSERT(msg_id != kmp_i18n_null);
3843  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3844  }
3845  }
3846 
3847 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3848 
3849  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3850  const char *filename;
3851  if (__kmp_cpuinfo_file != NULL) {
3852  filename = __kmp_cpuinfo_file;
3853  }
3854  else {
3855  filename = "/proc/cpuinfo";
3856  }
3857 
3858  if (__kmp_affinity_verbose) {
3859  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
3860  }
3861 
3862  FILE *f = fopen(filename, "r");
3863  if (f == NULL) {
3864  int code = errno;
3865  if (__kmp_cpuinfo_file != NULL) {
3866  __kmp_msg(
3867  kmp_ms_fatal,
3868  KMP_MSG(CantOpenFileForReading, filename),
3869  KMP_ERR(code),
3870  KMP_HNT(NameComesFrom_CPUINFO_FILE),
3871  __kmp_msg_null
3872  );
3873  }
3874  else {
3875  __kmp_msg(
3876  kmp_ms_fatal,
3877  KMP_MSG(CantOpenFileForReading, filename),
3878  KMP_ERR(code),
3879  __kmp_msg_null
3880  );
3881  }
3882  }
3883  int line = 0;
3884  depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3885  fclose(f);
3886  if (depth < 0) {
3887  KMP_ASSERT(msg_id != kmp_i18n_null);
3888  if (line > 0) {
3889  KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
3890  }
3891  else {
3892  KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3893  }
3894  }
3895  if (__kmp_affinity_type == affinity_none) {
3896  KMP_ASSERT(depth == 0);
3897  KMP_EXIT_AFF_NONE;
3898  }
3899  }
3900 
3901 # if KMP_GROUP_AFFINITY
3902 
3903  else if (__kmp_affinity_top_method == affinity_top_method_group) {
3904  if (__kmp_affinity_verbose) {
3905  KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3906  }
3907 
3908  depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3909  KMP_ASSERT(depth != 0);
3910  if (depth < 0) {
3911  KMP_ASSERT(msg_id != kmp_i18n_null);
3912  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3913  }
3914  }
3915 
3916 # endif /* KMP_GROUP_AFFINITY */
3917 
3918  else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3919  if (__kmp_affinity_verbose) {
3920  KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
3921  }
3922 
3923  depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3924  if (depth == 0) {
3925  KMP_EXIT_AFF_NONE;
3926  }
3927  // should not fail
3928  KMP_ASSERT(depth > 0);
3929  KMP_ASSERT(address2os != NULL);
3930  }
3931 
3932 # if KMP_USE_HWLOC
3933  else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
3934  KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
3935  if (__kmp_affinity_verbose) {
3936  KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
3937  }
3938  depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
3939  if (depth == 0) {
3940  KMP_EXIT_AFF_NONE;
3941  }
3942  }
3943 # endif // KMP_USE_HWLOC
3944 
3945  if (address2os == NULL) {
3946  if (KMP_AFFINITY_CAPABLE()
3947  && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3948  && (__kmp_affinity_type != affinity_none)))) {
3949  KMP_WARNING(ErrorInitializeAffinity);
3950  }
3951  __kmp_affinity_type = affinity_none;
3952  KMP_AFFINITY_DISABLE();
3953  return;
3954  }
3955 
3956  __kmp_apply_thread_places(&address2os, depth);
3957 
3958  //
3959  // Create the table of masks, indexed by thread Id.
3960  //
3961  unsigned maxIndex;
3962  unsigned numUnique;
3963  kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
3964  address2os, __kmp_avail_proc);
3965  if (__kmp_affinity_gran_levels == 0) {
3966  KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
3967  }
3968 
3969  //
3970  // Set the childNums vector in all Address objects. This must be done
3971  // before we can sort using __kmp_affinity_cmp_Address_child_num(),
3972  // which takes into account the setting of __kmp_affinity_compact.
3973  //
3974  __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
3975 
3976  switch (__kmp_affinity_type) {
3977 
3978  case affinity_explicit:
3979  KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3980 # if OMP_40_ENABLED
3981  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3982 # endif
3983  {
3984  __kmp_affinity_process_proclist(&__kmp_affinity_masks,
3985  &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3986  maxIndex);
3987  }
3988 # if OMP_40_ENABLED
3989  else {
3990  __kmp_affinity_process_placelist(&__kmp_affinity_masks,
3991  &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3992  maxIndex);
3993  }
3994 # endif
3995  if (__kmp_affinity_num_masks == 0) {
3996  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3997  && (__kmp_affinity_type != affinity_none))) {
3998  KMP_WARNING(AffNoValidProcID);
3999  }
4000  __kmp_affinity_type = affinity_none;
4001  return;
4002  }
4003  break;
4004 
4005  //
4006  // The other affinity types rely on sorting the Addresses according
4007  // to some permutation of the machine topology tree. Set
4008  // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
4009  // then jump to a common code fragment to do the sort and create
4010  // the array of affinity masks.
4011  //
4012 
4013  case affinity_logical:
4014  __kmp_affinity_compact = 0;
4015  if (__kmp_affinity_offset) {
4016  __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
4017  % __kmp_avail_proc;
4018  }
4019  goto sortAddresses;
4020 
4021  case affinity_physical:
4022  if (__kmp_nThreadsPerCore > 1) {
4023  __kmp_affinity_compact = 1;
4024  if (__kmp_affinity_compact >= depth) {
4025  __kmp_affinity_compact = 0;
4026  }
4027  } else {
4028  __kmp_affinity_compact = 0;
4029  }
4030  if (__kmp_affinity_offset) {
4031  __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
4032  % __kmp_avail_proc;
4033  }
4034  goto sortAddresses;
4035 
4036  case affinity_scatter:
4037  if (__kmp_affinity_compact >= depth) {
4038  __kmp_affinity_compact = 0;
4039  }
4040  else {
4041  __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
4042  }
4043  goto sortAddresses;
4044 
4045  case affinity_compact:
4046  if (__kmp_affinity_compact >= depth) {
4047  __kmp_affinity_compact = depth - 1;
4048  }
4049  goto sortAddresses;
4050 
4051  case affinity_balanced:
4052  if( depth <= 1 ) {
4053  if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
4054  KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
4055  }
4056  __kmp_affinity_type = affinity_none;
4057  return;
4058  } else if( __kmp_affinity_uniform_topology() ) {
4059  break;
4060  } else { // Non-uniform topology
4061 
4062  // Save the depth for further usage
4063  __kmp_aff_depth = depth;
4064 
4065  int core_level = __kmp_affinity_find_core_level(address2os, __kmp_avail_proc, depth - 1);
4066  int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, depth - 1, core_level);
4067  int maxprocpercore = __kmp_affinity_max_proc_per_core(address2os, __kmp_avail_proc, depth - 1, core_level);
4068 
4069  int nproc = ncores * maxprocpercore;
4070  if( ( nproc < 2 ) || ( nproc < __kmp_avail_proc ) ) {
4071  if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
4072  KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
4073  }
4074  __kmp_affinity_type = affinity_none;
4075  return;
4076  }
4077 
4078  procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4079  for( int i = 0; i < nproc; i++ ) {
4080  procarr[ i ] = -1;
4081  }
4082 
4083  int lastcore = -1;
4084  int inlastcore = 0;
4085  for( int i = 0; i < __kmp_avail_proc; i++ ) {
4086  int proc = address2os[ i ].second;
4087  int core = __kmp_affinity_find_core(address2os, i, depth - 1, core_level);
4088 
4089  if ( core == lastcore ) {
4090  inlastcore++;
4091  } else {
4092  inlastcore = 0;
4093  }
4094  lastcore = core;
4095 
4096  procarr[ core * maxprocpercore + inlastcore ] = proc;
4097  }
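 // (Editorial note) procarr now holds maxprocpercore slots per core:
 // slot core * maxprocpercore + k is the OS proc id of the k-th PU found
 // on that core, or -1 if the core has fewer PUs than the most populated
 // one. __kmp_balanced_affinity() consumes this layout.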
4098 
4099  break;
4100  }
4101 
4102  sortAddresses:
4103  //
4104  // Allocate the gtid->affinity mask table.
4105  //
4106  if (__kmp_affinity_dups) {
4107  __kmp_affinity_num_masks = __kmp_avail_proc;
4108  }
4109  else {
4110  __kmp_affinity_num_masks = numUnique;
4111  }
4112 
4113 # if OMP_40_ENABLED
4114  if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
4115  && ( __kmp_affinity_num_places > 0 )
4116  && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
4117  __kmp_affinity_num_masks = __kmp_affinity_num_places;
4118  }
4119 # endif
4120 
4121  KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
4122 
4123  //
4124  // Sort the address2os table according to the current setting of
4125  // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
4126  //
4127  qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
4128  __kmp_affinity_cmp_Address_child_num);
4129  {
4130  int i;
4131  unsigned j;
4132  for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
4133  if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
4134  continue;
4135  }
4136  unsigned osId = address2os[i].second;
4137  kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
4138  kmp_affin_mask_t *dest
4139  = KMP_CPU_INDEX(__kmp_affinity_masks, j);
4140  KMP_ASSERT(KMP_CPU_ISSET(osId, src));
4141  KMP_CPU_COPY(dest, src);
4142  if (++j >= __kmp_affinity_num_masks) {
4143  break;
4144  }
4145  }
4146  KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
4147  }
4148  break;
4149 
4150  default:
4151  KMP_ASSERT2(0, "Unexpected affinity setting");
4152  }
4153 
4154  KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex+1);
4155  machine_hierarchy.init(address2os, __kmp_avail_proc);
4156 }
4157 #undef KMP_EXIT_AFF_NONE
4158 
4159 
4160 void
4161 __kmp_affinity_initialize(void)
4162 {
4163  //
4164  // Much of the code above was written assuming that if a machine was not
4165  // affinity capable, then __kmp_affinity_type == affinity_none. We now
4166  // explicitly represent this as __kmp_affinity_type == affinity_disabled.
4167  //
4168  // There are too many checks for __kmp_affinity_type == affinity_none
4169  // in this code. Instead of trying to change them all, check if
4170  // __kmp_affinity_type == affinity_disabled, and if so, slam it with
4171  // affinity_none, call the real initialization routine, then restore
4172  // __kmp_affinity_type to affinity_disabled.
4173  //
4174  int disabled = (__kmp_affinity_type == affinity_disabled);
4175  if (! KMP_AFFINITY_CAPABLE()) {
4176  KMP_ASSERT(disabled);
4177  }
4178  if (disabled) {
4179  __kmp_affinity_type = affinity_none;
4180  }
4181  __kmp_aux_affinity_initialize();
4182  if (disabled) {
4183  __kmp_affinity_type = affinity_disabled;
4184  }
4185 }
4186 
4187 
4188 void
4189 __kmp_affinity_uninitialize(void)
4190 {
4191  if (__kmp_affinity_masks != NULL) {
4192  KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
4193  __kmp_affinity_masks = NULL;
4194  }
4195  if (__kmp_affin_fullMask != NULL) {
4196  KMP_CPU_FREE(__kmp_affin_fullMask);
4197  __kmp_affin_fullMask = NULL;
4198  }
4199  __kmp_affinity_num_masks = 0;
4200 # if OMP_40_ENABLED
4201  __kmp_affinity_num_places = 0;
4202 # endif
4203  if (__kmp_affinity_proclist != NULL) {
4204  __kmp_free(__kmp_affinity_proclist);
4205  __kmp_affinity_proclist = NULL;
4206  }
4207  if( address2os != NULL ) {
4208  __kmp_free( address2os );
4209  address2os = NULL;
4210  }
4211  if( procarr != NULL ) {
4212  __kmp_free( procarr );
4213  procarr = NULL;
4214  }
4215 # if KMP_USE_HWLOC
4216  if (__kmp_hwloc_topology != NULL) {
4217  hwloc_topology_destroy(__kmp_hwloc_topology);
4218  __kmp_hwloc_topology = NULL;
4219  }
4220 # endif
4221  KMPAffinity::destroy_api();
4222 }
4223 
4224 
4225 void
4226 __kmp_affinity_set_init_mask(int gtid, int isa_root)
4227 {
4228  if (! KMP_AFFINITY_CAPABLE()) {
4229  return;
4230  }
4231 
4232  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4233  if (th->th.th_affin_mask == NULL) {
4234  KMP_CPU_ALLOC(th->th.th_affin_mask);
4235  }
4236  else {
4237  KMP_CPU_ZERO(th->th.th_affin_mask);
4238  }
4239 
4240  //
4241  // Copy the thread mask to the kmp_info_t structure.
4242  // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
4243  // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
4244  // is set, then the full mask is the same as the mask of the initialization
4245  // thread.
4246  //
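 // (Editorial note) In the branches below the thread either inherits the
 // full mask (place == KMP_PLACE_ALL), or is assigned place
 // (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks from the
 // precomputed __kmp_affinity_masks table.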
4247  kmp_affin_mask_t *mask;
4248  int i;
4249 
4250 # if OMP_40_ENABLED
4251  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
4252 # endif
4253  {
4254  if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)
4255  ) {
4256 # if KMP_GROUP_AFFINITY
4257  if (__kmp_num_proc_groups > 1) {
4258  return;
4259  }
4260 # endif
4261  KMP_ASSERT(__kmp_affin_fullMask != NULL);
4262  i = KMP_PLACE_ALL;
4263  mask = __kmp_affin_fullMask;
4264  }
4265  else {
4266  KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4267  i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4268  mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4269  }
4270  }
4271 # if OMP_40_ENABLED
4272  else {
4273  if ((! isa_root)
4274  || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
4275 # if KMP_GROUP_AFFINITY
4276  if (__kmp_num_proc_groups > 1) {
4277  return;
4278  }
4279 # endif
4280  KMP_ASSERT(__kmp_affin_fullMask != NULL);
4281  i = KMP_PLACE_ALL;
4282  mask = __kmp_affin_fullMask;
4283  }
4284  else {
4285  //
4286  // int i = some hash function or just a counter that doesn't
4287  // always start at 0. Use gtid for now.
4288  //
4289  KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4290  i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4291  mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4292  }
4293  }
4294 # endif
4295 
4296 # if OMP_40_ENABLED
4297  th->th.th_current_place = i;
4298  if (isa_root) {
4299  th->th.th_new_place = i;
4300  th->th.th_first_place = 0;
4301  th->th.th_last_place = __kmp_affinity_num_masks - 1;
4302  }
4303 
4304  if (i == KMP_PLACE_ALL) {
4305  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4306  gtid));
4307  }
4308  else {
4309  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4310  gtid, i));
4311  }
4312 # else
4313  if (i == -1) {
4314  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to __kmp_affin_fullMask\n",
4315  gtid));
4316  }
4317  else {
4318  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
4319  gtid, i));
4320  }
4321 # endif /* OMP_40_ENABLED */
4322 
4323  KMP_CPU_COPY(th->th.th_affin_mask, mask);
4324 
4325  if (__kmp_affinity_verbose) {
4326  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4327  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4328  th->th.th_affin_mask);
4329  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
4330  buf);
4331  }
4332 
4333 # if KMP_OS_WINDOWS
4334  //
4335  // On Windows* OS, the process affinity mask might have changed.
4336  // If the user didn't request affinity and this call fails,
4337  // just continue silently. See CQ171393.
4338  //
4339  if ( __kmp_affinity_type == affinity_none ) {
4340  __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4341  }
4342  else
4343 # endif
4344  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4345 }
4346 
4347 
4348 # if OMP_40_ENABLED
4349 
4350 void
4351 __kmp_affinity_set_place(int gtid)
4352 {
4353  int retval;
4354 
4355  if (! KMP_AFFINITY_CAPABLE()) {
4356  return;
4357  }
4358 
4359  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4360 
4361  KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
4362  gtid, th->th.th_new_place, th->th.th_current_place));
4363 
4364  //
4365  // Check that the new place is within this thread's partition.
4366  //
4367  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4368  KMP_ASSERT(th->th.th_new_place >= 0);
4369  KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4370  if (th->th.th_first_place <= th->th.th_last_place) {
4371  KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
4372  && (th->th.th_new_place <= th->th.th_last_place));
4373  }
4374  else {
4375  KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
4376  || (th->th.th_new_place >= th->th.th_last_place));
4377  }
4378 
4379  //
4380  // Copy the thread mask to the kmp_info_t structure,
4381  // and set this thread's affinity.
4382  //
4383  kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
4384  th->th.th_new_place);
4385  KMP_CPU_COPY(th->th.th_affin_mask, mask);
4386  th->th.th_current_place = th->th.th_new_place;
4387 
4388  if (__kmp_affinity_verbose) {
4389  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4390  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4391  th->th.th_affin_mask);
4392  KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4393  gtid, buf);
4394  }
4395  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4396 }
4397 
4398 # endif /* OMP_40_ENABLED */
4399 
4400 
4401 int
4402 __kmp_aux_set_affinity(void **mask)
4403 {
4404  int gtid;
4405  kmp_info_t *th;
4406  int retval;
4407 
4408  if (! KMP_AFFINITY_CAPABLE()) {
4409  return -1;
4410  }
4411 
4412  gtid = __kmp_entry_gtid();
4413  KA_TRACE(1000, ;{
4414  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4415  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4416  (kmp_affin_mask_t *)(*mask));
4417  __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4418  gtid, buf);
4419  });
4420 
4421  if (__kmp_env_consistency_check) {
4422  if ((mask == NULL) || (*mask == NULL)) {
4423  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4424  }
4425  else {
4426  unsigned proc;
4427  int num_procs = 0;
4428 
4429  KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t*)(*mask))) {
4430  if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4431  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4432  }
4433  if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4434  continue;
4435  }
4436  num_procs++;
4437  }
4438  if (num_procs == 0) {
4439  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4440  }
4441 
4442 # if KMP_GROUP_AFFINITY
4443  if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4444  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4445  }
4446 # endif /* KMP_GROUP_AFFINITY */
4447 
4448  }
4449  }
4450 
4451  th = __kmp_threads[gtid];
4452  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4453  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4454  if (retval == 0) {
4455  KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4456  }
4457 
4458 # if OMP_40_ENABLED
4459  th->th.th_current_place = KMP_PLACE_UNDEFINED;
4460  th->th.th_new_place = KMP_PLACE_UNDEFINED;
4461  th->th.th_first_place = 0;
4462  th->th.th_last_place = __kmp_affinity_num_masks - 1;
4463 
4464  //
4465  // Turn off 4.0 affinity for the current thread at this parallel level.
4466  //
4467  th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
4468 # endif
4469 
4470  return retval;
4471 }
4472 
4473 
4474 int
4475 __kmp_aux_get_affinity(void **mask)
4476 {
4477  int gtid;
4478  int retval;
4479  kmp_info_t *th;
4480 
4481  if (! KMP_AFFINITY_CAPABLE()) {
4482  return -1;
4483  }
4484 
4485  gtid = __kmp_entry_gtid();
4486  th = __kmp_threads[gtid];
4487  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4488 
4489  KA_TRACE(1000, ;{
4490  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4491  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4492  th->th.th_affin_mask);
4493  __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
4494  });
4495 
4496  if (__kmp_env_consistency_check) {
4497  if ((mask == NULL) || (*mask == NULL)) {
4498  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4499  }
4500  }
4501 
4502 # if !KMP_OS_WINDOWS
4503 
4504  retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4505  KA_TRACE(1000, ;{
4506  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4507  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4508  (kmp_affin_mask_t *)(*mask));
4509  __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
4510  });
4511  return retval;
4512 
4513 # else
4514 
4515  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4516  return 0;
4517 
4518 # endif /* KMP_OS_WINDOWS */
4519 
4520 }
4521 
4522 int
4523 __kmp_aux_get_affinity_max_proc() {
4524  if (! KMP_AFFINITY_CAPABLE()) {
4525  return 0;
4526  }
4527 #if KMP_GROUP_AFFINITY
4528  if ( __kmp_num_proc_groups > 1 ) {
4529  return (int)(__kmp_num_proc_groups*sizeof(DWORD_PTR)*CHAR_BIT);
4530  }
4531 #endif
4532  return __kmp_xproc;
4533 }
4534 
4535 int
4536 __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
4537 {
4538  int retval;
4539 
4540  if (! KMP_AFFINITY_CAPABLE()) {
4541  return -1;
4542  }
4543 
4544  KA_TRACE(1000, ;{
4545  int gtid = __kmp_entry_gtid();
4546  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4547  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4548  (kmp_affin_mask_t *)(*mask));
4549  __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4550  proc, gtid, buf);
4551  });
4552 
4553  if (__kmp_env_consistency_check) {
4554  if ((mask == NULL) || (*mask == NULL)) {
4555  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4556  }
4557  }
4558 
4559  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
4560  return -1;
4561  }
4562  if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4563  return -2;
4564  }
4565 
4566  KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4567  return 0;
4568 }
4569 
4570 
4571 int
4572 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4573 {
4574  int retval;
4575 
4576  if (! KMP_AFFINITY_CAPABLE()) {
4577  return -1;
4578  }
4579 
4580  KA_TRACE(1000, ;{
4581  int gtid = __kmp_entry_gtid();
4582  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4583  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4584  (kmp_affin_mask_t *)(*mask));
4585  __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4586  proc, gtid, buf);
4587  });
4588 
4589  if (__kmp_env_consistency_check) {
4590  if ((mask == NULL) || (*mask == NULL)) {
4591  KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4592  }
4593  }
4594 
4595  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
4596  return -1;
4597  }
4598  if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4599  return -2;
4600  }
4601 
4602  KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4603  return 0;
4604 }
4605 
4606 
4607 int
4608 __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4609 {
4610  int retval;
4611 
4612  if (! KMP_AFFINITY_CAPABLE()) {
4613  return -1;
4614  }
4615 
4616  KA_TRACE(1000, ;{
4617  int gtid = __kmp_entry_gtid();
4618  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4619  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4620  (kmp_affin_mask_t *)(*mask));
4621  __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4622  proc, gtid, buf);
4623  });
4624 
4625  if (__kmp_env_consistency_check) {
4626  if ((mask == NULL) || (*mask == NULL)) {
4627  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
4628  }
4629  }
4630 
4631  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
4632  return -1;
4633  }
4634  if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4635  return 0;
4636  }
4637 
4638  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4639 }
4640 
4641 
4642 // Dynamic affinity settings - Affinity balanced
4643 void __kmp_balanced_affinity( int tid, int nthreads )
4644 {
4645  bool fine_gran = true;
4646 
4647  switch (__kmp_affinity_gran) {
4648  case affinity_gran_fine:
4649  case affinity_gran_thread:
4650  break;
4651  case affinity_gran_core:
4652  if( __kmp_nThreadsPerCore > 1) {
4653  fine_gran = false;
4654  }
4655  break;
4656  case affinity_gran_package:
4657  if( nCoresPerPkg > 1) {
4658  fine_gran = false;
4659  }
4660  break;
4661  default:
4662  fine_gran = false;
4663  }
4664 
4665  if( __kmp_affinity_uniform_topology() ) {
4666  int coreID;
4667  int threadID;
4668  // Number of hyper threads per core in HT machine
4669  int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4670  // Number of cores
4671  int ncores = __kmp_ncores;
4672  if( ( nPackages > 1 ) && ( __kmp_nth_per_core <= 1 ) ) {
4673  __kmp_nth_per_core = __kmp_avail_proc / nPackages;
4674  ncores = nPackages;
4675  }
4676  // How many threads will be bound to each core
4677  int chunk = nthreads / ncores;
4678  // How many cores will have an additional thread bound to them - the "big cores"
4679  int big_cores = nthreads % ncores;
4680  // Number of threads on the big cores
4681  int big_nth = ( chunk + 1 ) * big_cores;
4682  if( tid < big_nth ) {
4683  coreID = tid / (chunk + 1 );
4684  threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
4685  } else { //tid >= big_nth
4686  coreID = ( tid - big_cores ) / chunk;
4687  threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
4688  }
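 // (Editorial worked example) nthreads == 10 on 4 cores gives chunk == 2,
 // big_cores == 2 and big_nth == 6: tids 0-5 land on the two "big" cores
 // (3 threads each, coreID = tid / 3), and tids 6-9 land on the remaining
 // cores (2 threads each, coreID = (tid - 2) / 2).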
4689 
4690  KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4691  "Illegal set affinity operation when not capable");
4692 
4693  kmp_affin_mask_t *mask;
4694  KMP_CPU_ALLOC_ON_STACK(mask);
4695  KMP_CPU_ZERO(mask);
4696 
4697  if( fine_gran ) {
4698  int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
4699  KMP_CPU_SET( osID, mask);
4700  } else {
4701  for( int i = 0; i < __kmp_nth_per_core; i++ ) {
4702  int osID;
4703  osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
4704  KMP_CPU_SET( osID, mask);
4705  }
4706  }
4707  if (__kmp_affinity_verbose) {
4708  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4709  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4710  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4711  tid, buf);
4712  }
4713  __kmp_set_system_affinity( mask, TRUE );
4714  KMP_CPU_FREE_FROM_STACK(mask);
4715  } else { // Non-uniform topology
4716 
4717  kmp_affin_mask_t *mask;
4718  KMP_CPU_ALLOC_ON_STACK(mask);
4719  KMP_CPU_ZERO(mask);
4720 
4721  int core_level = __kmp_affinity_find_core_level(address2os, __kmp_avail_proc, __kmp_aff_depth - 1);
4722  int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
4723  int nth_per_core = __kmp_affinity_max_proc_per_core(address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
4724 
4725  // For performance gain consider the special case nthreads == __kmp_avail_proc
4726  if( nthreads == __kmp_avail_proc ) {
4727  if( fine_gran ) {
4728  int osID = address2os[ tid ].second;
4729  KMP_CPU_SET( osID, mask);
4730  } else {
4731  int core = __kmp_affinity_find_core(address2os, tid, __kmp_aff_depth - 1, core_level);
4732  for( int i = 0; i < __kmp_avail_proc; i++ ) {
4733  int osID = address2os[ i ].second;
4734  if( __kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1, core_level) == core ) {
4735  KMP_CPU_SET( osID, mask);
4736  }
4737  }
4738  }
4739  } else if( nthreads <= ncores ) {
4740 
4741  int core = 0;
4742  for( int i = 0; i < ncores; i++ ) {
4743  // Check if this core from procarr[] is in the mask
4744  int in_mask = 0;
4745  for( int j = 0; j < nth_per_core; j++ ) {
4746  if( procarr[ i * nth_per_core + j ] != - 1 ) {
4747  in_mask = 1;
4748  break;
4749  }
4750  }
4751  if( in_mask ) {
4752  if( tid == core ) {
4753  for( int j = 0; j < nth_per_core; j++ ) {
4754  int osID = procarr[ i * nth_per_core + j ];
4755  if( osID != -1 ) {
4756  KMP_CPU_SET( osID, mask );
4757  // For fine granularity it is enough to set the first available osID for this core
4758  if( fine_gran) {
4759  break;
4760  }
4761  }
4762  }
4763  break;
4764  } else {
4765  core++;
4766  }
4767  }
4768  }
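 // (Editorial note) At this point the tid-th populated core has been
 // selected: with fine granularity only its first available PU is set in
 // the mask, otherwise all of its PUs are.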
4769 
4770  } else { // nthreads > ncores
4771 
4772  // Array to save the number of processors at each core
4773  int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores);
4774  // Array to save the number of cores with exactly "x" available processors
4775  int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
4776  // Array to save the number of cores with at least "x" available processors (x..nth_per_core)
4777  int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
4778 
4779  for( int i = 0; i <= nth_per_core; i++ ) {
4780  ncores_with_x_procs[ i ] = 0;
4781  ncores_with_x_to_max_procs[ i ] = 0;
4782  }
4783 
4784  for( int i = 0; i < ncores; i++ ) {
4785  int cnt = 0;
4786  for( int j = 0; j < nth_per_core; j++ ) {
4787  if( procarr[ i * nth_per_core + j ] != -1 ) {
4788  cnt++;
4789  }
4790  }
4791  nproc_at_core[ i ] = cnt;
4792  ncores_with_x_procs[ cnt ]++;
4793  }
4794 
4795  for( int i = 0; i <= nth_per_core; i++ ) {
4796  for( int j = i; j <= nth_per_core; j++ ) {
4797  ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4798  }
4799  }
4800 
4801  // Max number of processors
4802  int nproc = nth_per_core * ncores;
4803  // An array to keep number of threads per each context
4804  int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4805  for( int i = 0; i < nproc; i++ ) {
4806  newarr[ i ] = 0;
4807  }
4808 
4809  int nth = nthreads;
4810  int flag = 0;
4811  while( nth > 0 ) {
4812  for( int j = 1; j <= nth_per_core; j++ ) {
4813  int cnt = ncores_with_x_to_max_procs[ j ];
4814  for( int i = 0; i < ncores; i++ ) {
4815  // Skip the core with 0 processors
4816  if( nproc_at_core[ i ] == 0 ) {
4817  continue;
4818  }
4819  for( int k = 0; k < nth_per_core; k++ ) {
4820  if( procarr[ i * nth_per_core + k ] != -1 ) {
4821  if( newarr[ i * nth_per_core + k ] == 0 ) {
4822  newarr[ i * nth_per_core + k ] = 1;
4823  cnt--;
4824  nth--;
4825  break;
4826  } else {
4827  if( flag != 0 ) {
4828  newarr[ i * nth_per_core + k ] ++;
4829  cnt--;
4830  nth--;
4831  break;
4832  }
4833  }
4834  }
4835  }
4836  if( cnt == 0 || nth == 0 ) {
4837  break;
4838  }
4839  }
4840  if( nth == 0 ) {
4841  break;
4842  }
4843  }
4844  flag = 1;
4845  }
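 // (Editorial note) newarr[ i * nth_per_core + k ] now holds how many of
 // the nthreads threads were assigned to the k-th PU of core i by the
 // greedy distribution above; the prefix-sum scan below picks the context
 // whose running total first exceeds this thread's tid.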
4846  int sum = 0;
4847  for( int i = 0; i < nproc; i++ ) {
4848  sum += newarr[ i ];
4849  if( sum > tid ) {
4850  if( fine_gran) {
4851  int osID = procarr[ i ];
4852  KMP_CPU_SET( osID, mask);
4853  } else {
4854  int coreID = i / nth_per_core;
4855  for( int ii = 0; ii < nth_per_core; ii++ ) {
4856  int osID = procarr[ coreID * nth_per_core + ii ];
4857  if( osID != -1 ) {
4858  KMP_CPU_SET( osID, mask);
4859  }
4860  }
4861  }
4862  break;
4863  }
4864  }
4865  __kmp_free( newarr );
4866  }
4867 
4868  if (__kmp_affinity_verbose) {
4869  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4870  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4871  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4872  tid, buf);
4873  }
4874  __kmp_set_system_affinity( mask, TRUE );
4875  KMP_CPU_FREE_FROM_STACK(mask);
4876  }
4877 }
4878 
4879 #if KMP_OS_LINUX
4880 // We don't need this entry for Windows because
4881 // there is a GetProcessAffinityMask() API
4882 //
4883 // The intended usage is indicated by these steps:
4884 // 1) The user gets the current affinity mask
4885 // 2) Then sets the affinity by calling this function
4886 // 3) Error check the return value
4887 // 4) Use non-OpenMP parallelization
4888 // 5) Reset the affinity to what was stored in step 1)
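//
// A usage sketch (editorial; run_external_threaded_code() is a hypothetical
// placeholder, the sched_* calls are standard Linux APIs):
//
//   cpu_set_t saved;
//   sched_getaffinity(0, sizeof(saved), &saved);          // step 1
//   if (kmp_set_thread_affinity_mask_initial() == 0) {    // steps 2 and 3
//       run_external_threaded_code();                     // step 4
//   }
//   sched_setaffinity(0, sizeof(saved), &saved);          // step 5
//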
4889 #ifdef __cplusplus
4890 extern "C"
4891 #endif
4892 int
4893 kmp_set_thread_affinity_mask_initial()
4894 // the function returns 0 on success,
4895 // -1 if we cannot bind thread
4896 // >0 (errno) if an error happened during binding
4897 {
4898  int gtid = __kmp_get_gtid();
4899  if (gtid < 0) {
4900  // Do not touch non-omp threads
4901  KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
4902  "non-omp thread, returning\n"));
4903  return -1;
4904  }
4905  if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
4906  KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
4907  "affinity not initialized, returning\n"));
4908  return -1;
4909  }
4910  KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
4911  "set full mask for thread %d\n", gtid));
4912  KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
4913  return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
4914 }
4915 #endif
4916 
4917 #endif // KMP_AFFINITY_SUPPORTED