LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_affinity.h"
18 #include "kmp_atomic.h"
19 #include "kmp_environment.h"
20 #include "kmp_error.h"
21 #include "kmp_i18n.h"
22 #include "kmp_io.h"
23 #include "kmp_itt.h"
24 #include "kmp_settings.h"
25 #include "kmp_stats.h"
26 #include "kmp_str.h"
27 #include "kmp_wait_release.h"
28 #include "kmp_wrapper_getpid.h"
29 
30 #if OMPT_SUPPORT
31 #include "ompt-specific.h"
32 #endif
33 
34 /* these are temporary issues to be dealt with */
35 #define KMP_USE_PRCTL 0
36 
37 #if KMP_OS_WINDOWS
38 #include <process.h>
39 #endif
40 
41 #include "tsan_annotations.h"
42 
43 #if defined(KMP_GOMP_COMPAT)
44 char const __kmp_version_alt_comp[] =
45  KMP_VERSION_PREFIX "alternative compiler support: yes";
46 #endif /* defined(KMP_GOMP_COMPAT) */
47 
48 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
49 #if OMP_50_ENABLED
50  "5.0 (201611)";
51 #elif OMP_45_ENABLED
52  "4.5 (201511)";
53 #elif OMP_40_ENABLED
54  "4.0 (201307)";
55 #else
56  "3.1 (201107)";
57 #endif
58 
59 #ifdef KMP_DEBUG
60 char const __kmp_version_lock[] =
61  KMP_VERSION_PREFIX "lock type: run time selectable";
62 #endif /* KMP_DEBUG */
63 
64 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
65 
66 /* ------------------------------------------------------------------------ */
67 
68 kmp_info_t __kmp_monitor;
69 
70 /* Forward declarations */
71 
72 void __kmp_cleanup(void);
73 
74 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
75  int gtid);
76 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
77  kmp_internal_control_t *new_icvs,
78  ident_t *loc);
79 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
80 static void __kmp_partition_places(kmp_team_t *team,
81  int update_master_only = 0);
82 #endif
83 static void __kmp_do_serial_initialize(void);
84 void __kmp_fork_barrier(int gtid, int tid);
85 void __kmp_join_barrier(int gtid);
86 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
87  kmp_internal_control_t *new_icvs, ident_t *loc);
88 
89 #ifdef USE_LOAD_BALANCE
90 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
91 #endif
92 
93 static int __kmp_expand_threads(int nWish, int nNeed);
94 #if KMP_OS_WINDOWS
95 static int __kmp_unregister_root_other_thread(int gtid);
96 #endif
97 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
98 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
99 static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
100 
101 /* Calculate the identifier of the current thread */
102 /* A fast (and somewhat portable) way to get a unique identifier for the
103  executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
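/* Illustrative sketch (annotation, not part of the original source): the
   fallback algorithm below maps the address of a local variable back to a
   gtid by stack containment. Assuming a registered thread whose recorded
   ds_stackbase is B and ds_stacksize is S, any automatic variable address A
   on that thread's downward-growing stack satisfies
       B - S <= A <= B,
   so testing 0 <= B - A <= S identifies the owning thread without any TLS
   lookup. */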
104 int __kmp_get_global_thread_id() {
105  int i;
106  kmp_info_t **other_threads;
107  size_t stack_data;
108  char *stack_addr;
109  size_t stack_size;
110  char *stack_base;
111 
112  KA_TRACE(
113  1000,
114  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
115  __kmp_nth, __kmp_all_nth));
116 
117  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior
118  to a parallel region, made this return KMP_GTID_DNE to force
119  serial_initialize by the caller. We had to handle KMP_GTID_DNE at all
120  call-sites, or else guarantee __kmp_init_gtid, for this to work. */
121 
122  if (!TCR_4(__kmp_init_gtid))
123  return KMP_GTID_DNE;
124 
125 #ifdef KMP_TDATA_GTID
126  if (TCR_4(__kmp_gtid_mode) >= 3) {
127  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
128  return __kmp_gtid;
129  }
130 #endif
131  if (TCR_4(__kmp_gtid_mode) >= 2) {
132  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
133  return __kmp_gtid_get_specific();
134  }
135  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
136 
137  stack_addr = (char *)&stack_data;
138  other_threads = __kmp_threads;
139 
140  /* ATT: The code below is a source of potential bugs due to unsynchronized
141  access to __kmp_threads array. For example:
142  1. Current thread loads other_threads[i] to thr and checks it, it is
143  non-NULL.
144  2. Current thread is suspended by OS.
145  3. Another thread unregisters and finishes (debug versions of free()
146  may fill memory with something like 0xEF).
147  4. Current thread is resumed.
148  5. Current thread reads junk from *thr.
149  TODO: Fix it. --ln */
150 
151  for (i = 0; i < __kmp_threads_capacity; i++) {
152 
153  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
154  if (!thr)
155  continue;
156 
157  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
158  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
159 
160  /* stack grows down -- search through all of the active threads */
161 
162  if (stack_addr <= stack_base) {
163  size_t stack_diff = stack_base - stack_addr;
164 
165  if (stack_diff <= stack_size) {
166  /* The only way we can be closer than the allocated */
167  /* stack size is if we are running on this thread. */
168  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
169  return i;
170  }
171  }
172  }
173 
174  /* use thread-specific (TLS) data to try to determine our gtid */
175  KA_TRACE(1000,
176  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
177  "thread, using TLS\n"));
178  i = __kmp_gtid_get_specific();
179 
180  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
181 
182  /* if we haven't been assigned a gtid, then return the negative code */
183  if (i < 0)
184  return i;
185 
186  /* dynamically updated stack window for uber threads to avoid get_specific
187  call */
188  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
189  KMP_FATAL(StackOverflow, i);
190  }
191 
192  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
193  if (stack_addr > stack_base) {
194  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
195  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
196  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
197  stack_base);
198  } else {
199  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
200  stack_base - stack_addr);
201  }
202 
203  /* Reprint stack bounds for ubermaster since they have been refined */
204  if (__kmp_storage_map) {
205  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
206  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
207  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
208  other_threads[i]->th.th_info.ds.ds_stacksize,
209  "th_%d stack (refinement)", i);
210  }
211  return i;
212 }
213 
214 int __kmp_get_global_thread_id_reg() {
215  int gtid;
216 
217  if (!__kmp_init_serial) {
218  gtid = KMP_GTID_DNE;
219  } else
220 #ifdef KMP_TDATA_GTID
221  if (TCR_4(__kmp_gtid_mode) >= 3) {
222  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
223  gtid = __kmp_gtid;
224  } else
225 #endif
226  if (TCR_4(__kmp_gtid_mode) >= 2) {
227  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
228  gtid = __kmp_gtid_get_specific();
229  } else {
230  KA_TRACE(1000,
231  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
232  gtid = __kmp_get_global_thread_id();
233  }
234 
235  /* we must be a new uber master sibling thread */
236  if (gtid == KMP_GTID_DNE) {
237  KA_TRACE(10,
238  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
239  "Registering a new gtid.\n"));
240  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
241  if (!__kmp_init_serial) {
242  __kmp_do_serial_initialize();
243  gtid = __kmp_gtid_get_specific();
244  } else {
245  gtid = __kmp_register_root(FALSE);
246  }
247  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
248  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
249  }
250 
251  KMP_DEBUG_ASSERT(gtid >= 0);
252 
253  return gtid;
254 }
255 
256 /* caller must hold forkjoin_lock */
257 void __kmp_check_stack_overlap(kmp_info_t *th) {
258  int f;
259  char *stack_beg = NULL;
260  char *stack_end = NULL;
261  int gtid;
262 
263  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
264  if (__kmp_storage_map) {
265  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
266  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
267 
268  gtid = __kmp_gtid_from_thread(th);
269 
270  if (gtid == KMP_GTID_MONITOR) {
271  __kmp_print_storage_map_gtid(
272  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
273  "th_%s stack (%s)", "mon",
274  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
275  } else {
276  __kmp_print_storage_map_gtid(
277  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
278  "th_%d stack (%s)", gtid,
279  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
280  }
281  }
282 
283  /* No point in checking ubermaster threads since they use refinement and
284  * cannot overlap */
285  gtid = __kmp_gtid_from_thread(th);
286  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
287  KA_TRACE(10,
288  ("__kmp_check_stack_overlap: performing extensive checking\n"));
289  if (stack_beg == NULL) {
290  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
291  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
292  }
293 
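 /* Annotation (not in the original source): two stacks [beg, end) and
  [other_beg, other_end) overlap exactly when one of our endpoints falls
  strictly inside the other thread's range, which is the pair of
  comparisons performed in the loop below. */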
294  for (f = 0; f < __kmp_threads_capacity; f++) {
295  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
296 
297  if (f_th && f_th != th) {
298  char *other_stack_end =
299  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
300  char *other_stack_beg =
301  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
302  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
303  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
304 
305  /* Print the other stack values before the abort */
306  if (__kmp_storage_map)
307  __kmp_print_storage_map_gtid(
308  -1, other_stack_beg, other_stack_end,
309  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
310  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
311 
312  __kmp_msg(kmp_ms_fatal, KMP_MSG(StackOverlap),
313  KMP_HNT(ChangeStackLimit), __kmp_msg_null);
314  }
315  }
316  }
317  }
318  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
319 }
320 
321 /* ------------------------------------------------------------------------ */
322 
323 void __kmp_infinite_loop(void) {
324  static int done = FALSE;
325 
326  while (!done) {
327  KMP_YIELD(1);
328  }
329 }
330 
331 #define MAX_MESSAGE 512
332 
333 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
334  char const *format, ...) {
335  char buffer[MAX_MESSAGE];
336  va_list ap;
337 
338  va_start(ap, format);
339  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
340  p2, (unsigned long)size, format);
341  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
342  __kmp_vprintf(kmp_err, buffer, ap);
343 #if KMP_PRINT_DATA_PLACEMENT
344  int node;
345  if (gtid >= 0) {
346  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
347  if (__kmp_storage_map_verbose) {
348  node = __kmp_get_host_node(p1);
349  if (node < 0) /* doesn't work, so don't try this next time */
350  __kmp_storage_map_verbose = FALSE;
351  else {
352  char *last;
353  int lastNode;
354  int localProc = __kmp_get_cpu_from_gtid(gtid);
355 
356  const int page_size = KMP_GET_PAGE_SIZE();
357 
358  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
359  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
360  if (localProc >= 0)
361  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
362  localProc >> 1);
363  else
364  __kmp_printf_no_lock(" GTID %d\n", gtid);
365 #if KMP_USE_PRCTL
366  /* The more elaborate format is disabled for now because of the prctl
367  * hanging bug. */
368  do {
369  last = p1;
370  lastNode = node;
371  /* This loop collates adjacent pages with the same host node. */
372  do {
373  p1 = (char *)p1 + page_size;
374  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
375  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
376  lastNode);
377  } while (p1 <= p2);
378 #else
379  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
380  (char *)p1 + (page_size - 1),
381  __kmp_get_host_node(p1));
382  if (p1 < p2) {
383  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
384  (char *)p2 + (page_size - 1),
385  __kmp_get_host_node(p2));
386  }
387 #endif
388  }
389  }
390  } else
391  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
392  }
393 #endif /* KMP_PRINT_DATA_PLACEMENT */
394  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
395 }
396 
397 void __kmp_warn(char const *format, ...) {
398  char buffer[MAX_MESSAGE];
399  va_list ap;
400 
401  if (__kmp_generate_warnings == kmp_warnings_off) {
402  return;
403  }
404 
405  va_start(ap, format);
406 
407  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
408  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
409  __kmp_vprintf(kmp_err, buffer, ap);
410  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
411 
412  va_end(ap);
413 }
414 
415 void __kmp_abort_process() {
416  // Later threads may stall here, but that's ok because abort() will kill them.
417  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
418 
419  if (__kmp_debug_buf) {
420  __kmp_dump_debug_buffer();
421  }; // if
422 
423  if (KMP_OS_WINDOWS) {
424  // Let other threads know of abnormal termination and prevent deadlock
425  // if abort happened during library initialization or shutdown
426  __kmp_global.g.g_abort = SIGABRT;
427 
428  /* On Windows* OS, abort() by default raises a pop-up error box, which
429  stalls nightly testing. Unfortunately, we cannot reliably suppress pop-up
430  error boxes. _set_abort_behavior() works well, but this function is not
431  available in VS7 (this is not a problem for the DLL, but it is a problem
432  for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility)
433  does not help, at least in some versions of the MS C RTL.
434 
435  It seems the following sequence is the only way to simulate abort() and
436  avoid the pop-up error box. */
437  raise(SIGABRT);
438  _exit(3); // Just in case, if signal ignored, exit anyway.
439  } else {
440  abort();
441  }; // if
442 
443  __kmp_infinite_loop();
444  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
445 
446 } // __kmp_abort_process
447 
448 void __kmp_abort_thread(void) {
449  // TODO: Eliminate the g_abort global variable and this function.
450  // In case of an abort, just call abort(); it will kill all the threads.
451  __kmp_infinite_loop();
452 } // __kmp_abort_thread
453 
454 /* Print out the storage map for the major kmp_info_t thread data structures
455  that are allocated together. */
456 
457 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
458  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
459  gtid);
460 
461  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
462  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
463 
464  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
465  sizeof(kmp_local_t), "th_%d.th_local", gtid);
466 
467  __kmp_print_storage_map_gtid(
468  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
469  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
470 
471  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
472  &thr->th.th_bar[bs_plain_barrier + 1],
473  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
474  gtid);
475 
476  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
477  &thr->th.th_bar[bs_forkjoin_barrier + 1],
478  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
479  gtid);
480 
481 #if KMP_FAST_REDUCTION_BARRIER
482  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
483  &thr->th.th_bar[bs_reduction_barrier + 1],
484  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
485  gtid);
486 #endif // KMP_FAST_REDUCTION_BARRIER
487 }
488 
489 /* Print out the storage map for the major kmp_team_t team data structures
490  that are allocated together. */
491 
492 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
493  int team_id, int num_thr) {
494  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
495  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
496  header, team_id);
497 
498  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
499  &team->t.t_bar[bs_last_barrier],
500  sizeof(kmp_balign_team_t) * bs_last_barrier,
501  "%s_%d.t_bar", header, team_id);
502 
503  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
504  &team->t.t_bar[bs_plain_barrier + 1],
505  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
506  header, team_id);
507 
508  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
509  &team->t.t_bar[bs_forkjoin_barrier + 1],
510  sizeof(kmp_balign_team_t),
511  "%s_%d.t_bar[forkjoin]", header, team_id);
512 
513 #if KMP_FAST_REDUCTION_BARRIER
514  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
515  &team->t.t_bar[bs_reduction_barrier + 1],
516  sizeof(kmp_balign_team_t),
517  "%s_%d.t_bar[reduction]", header, team_id);
518 #endif // KMP_FAST_REDUCTION_BARRIER
519 
520  __kmp_print_storage_map_gtid(
521  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
522  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
523 
524  __kmp_print_storage_map_gtid(
525  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
526  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
527 
528  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
529  &team->t.t_disp_buffer[num_disp_buff],
530  sizeof(dispatch_shared_info_t) * num_disp_buff,
531  "%s_%d.t_disp_buffer", header, team_id);
532 
533  __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
534  sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
535  team_id);
536 }
537 
538 static void __kmp_init_allocator() {}
539 static void __kmp_fini_allocator() {}
540 
541 /* ------------------------------------------------------------------------ */
542 
543 #ifdef KMP_DYNAMIC_LIB
544 #if KMP_OS_WINDOWS
545 
546 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
547  // TODO: Change to __kmp_break_bootstrap_lock().
548  __kmp_init_bootstrap_lock(lck); // make the lock released
549 }
550 
551 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
552  int i;
553  int thread_count;
554 
555  // PROCESS_DETACH is expected to be called by a thread that executes
556  // ProcessExit() or FreeLibrary(). The OS terminates the other threads (except
557  // the one calling ProcessExit or FreeLibrary), so it might be safe to access
558  // __kmp_threads[] without taking the forkjoin_lock. However, some threads can
559  // in fact still be alive here, although they are about to be terminated. The
560  // threads in the array with ds_thread==0 are the most suspicious. So it may
561  // actually not be safe to access __kmp_threads[].
562 
563  // TODO: does it make sense to check __kmp_roots[] ?
564 
565  // Let's check that there are no other alive threads registered with the OMP
566  // lib.
567  while (1) {
568  thread_count = 0;
569  for (i = 0; i < __kmp_threads_capacity; ++i) {
570  if (!__kmp_threads)
571  continue;
572  kmp_info_t *th = __kmp_threads[i];
573  if (th == NULL)
574  continue;
575  int gtid = th->th.th_info.ds.ds_gtid;
576  if (gtid == gtid_req)
577  continue;
578  if (gtid < 0)
579  continue;
580  DWORD exit_val;
581  int alive = __kmp_is_thread_alive(th, &exit_val);
582  if (alive) {
583  ++thread_count;
584  }
585  }
586  if (thread_count == 0)
587  break; // success
588  }
589 
590  // Assume that I'm alone. Now it might be safe to check and reset locks.
591  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
592  __kmp_reset_lock(&__kmp_forkjoin_lock);
593 #ifdef KMP_DEBUG
594  __kmp_reset_lock(&__kmp_stdio_lock);
595 #endif // KMP_DEBUG
596 }
597 
598 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
599  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
600 
601  switch (fdwReason) {
602 
603  case DLL_PROCESS_ATTACH:
604  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
605 
606  return TRUE;
607 
608  case DLL_PROCESS_DETACH:
609  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
610 
611  if (lpReserved != NULL) {
612  // lpReserved is used for telling the difference:
613  // lpReserved == NULL when FreeLibrary() was called,
614  // lpReserved != NULL when the process terminates.
615  // When FreeLibrary() is called, worker threads remain alive. So they will
616  // release the forkjoin lock by themselves. When the process terminates,
617  // worker threads disappear triggering the problem of unreleased forkjoin
618  // lock as described below.
619 
620  // A worker thread can take the forkjoin lock. The problem comes up if
621  // that worker thread becomes dead before it releases the forkjoin lock.
622  // The forkjoin lock remains taken, while the thread executing
623  // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
624  // to take the forkjoin lock and will always fail, so that the application
625  // will never finish [normally]. This scenario is possible if
626  // __kmpc_end() has not been executed. These turn out not to be corner
627  // cases but common ones:
628  // - the main function was compiled by an alternative compiler;
629  // - the main function was compiled by icl but without /Qopenmp
630  // (application with plugins);
631  // - application terminates by calling C exit(), Fortran CALL EXIT() or
632  // Fortran STOP.
633  // - alive foreign thread prevented __kmpc_end from doing cleanup.
634  //
635  // This is a hack to work around the problem.
636  // TODO: !!! figure out something better.
637  __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
638  }
639 
640  __kmp_internal_end_library(__kmp_gtid_get_specific());
641 
642  return TRUE;
643 
644  case DLL_THREAD_ATTACH:
645  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
646 
647  /* if we wanted to register new siblings all the time, we would call
648  * __kmp_get_gtid() here */
649  return TRUE;
650 
651  case DLL_THREAD_DETACH:
652  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
653 
654  __kmp_internal_end_thread(__kmp_gtid_get_specific());
655  return TRUE;
656  }
657 
658  return TRUE;
659 }
660 
661 #endif /* KMP_OS_WINDOWS */
662 #endif /* KMP_DYNAMIC_LIB */
663 
664 /* Change the library type to "status" and return the old type */
665 /* called from within initialization routines where __kmp_initz_lock is held */
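/* Annotation (not in the original source): the library mode is encoded in
   the low bit of __kmp_yield_init -- an even value corresponds to
   KMP_LIBRARY=throughput and an odd value to turnaround. For example,
   "|= 1" below turns an even count such as 8 into 9 (turnaround), and
   "&= ~1" clears it back to 8 (throughput). */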
666 int __kmp_change_library(int status) {
667  int old_status;
668 
669  old_status = __kmp_yield_init &
670  1; // check whether KMP_LIBRARY=throughput (even init count)
671 
672  if (status) {
673  __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
674  } else {
675  __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
676  }
677 
678  return old_status; // return previous setting of whether
679  // KMP_LIBRARY=throughput
680 }
681 
682 /* __kmp_parallel_deo -- Wait until it's our turn. */
683 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
684  int gtid = *gtid_ref;
685 #ifdef BUILD_PARALLEL_ORDERED
686  kmp_team_t *team = __kmp_team_from_gtid(gtid);
687 #endif /* BUILD_PARALLEL_ORDERED */
688 
689  if (__kmp_env_consistency_check) {
690  if (__kmp_threads[gtid]->th.th_root->r.r_active)
691 #if KMP_USE_DYNAMIC_LOCK
692  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
693 #else
694  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
695 #endif
696  }
697 #ifdef BUILD_PARALLEL_ORDERED
698  if (!team->t.t_serialized) {
699  KMP_MB();
700  KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
701  KMP_EQ, NULL);
702  KMP_MB();
703  }
704 #endif /* BUILD_PARALLEL_ORDERED */
705 }
706 
707 /* __kmp_parallel_dxo -- Signal the next task. */
708 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
709  int gtid = *gtid_ref;
710 #ifdef BUILD_PARALLEL_ORDERED
711  int tid = __kmp_tid_from_gtid(gtid);
712  kmp_team_t *team = __kmp_team_from_gtid(gtid);
713 #endif /* BUILD_PARALLEL_ORDERED */
714 
715  if (__kmp_env_consistency_check) {
716  if (__kmp_threads[gtid]->th.th_root->r.r_active)
717  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
718  }
719 #ifdef BUILD_PARALLEL_ORDERED
720  if (!team->t.t_serialized) {
721  KMP_MB(); /* Flush all pending memory write invalidates. */
722 
723  /* use the tid of the next thread in this team */
724  /* TODO replace with general release procedure */
725  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
726 
727 #if OMPT_SUPPORT && OMPT_BLAME
728  if (ompt_enabled &&
729  ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
730  /* accept blame for "ordered" waiting */
731  kmp_info_t *this_thread = __kmp_threads[gtid];
732  ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
733  this_thread->th.ompt_thread_info.wait_id);
734  }
735 #endif
736 
737  KMP_MB(); /* Flush all pending memory write invalidates. */
738  }
739 #endif /* BUILD_PARALLEL_ORDERED */
740 }
741 
742 /* ------------------------------------------------------------------------ */
743 /* The BARRIER for a SINGLE process section is always explicit */
744 
745 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
746  int status;
747  kmp_info_t *th;
748  kmp_team_t *team;
749 
750  if (!TCR_4(__kmp_init_parallel))
751  __kmp_parallel_initialize();
752 
753  th = __kmp_threads[gtid];
754  team = th->th.th_team;
755  status = 0;
756 
757  th->th.th_ident = id_ref;
758 
759  if (team->t.t_serialized) {
760  status = 1;
761  } else {
762  kmp_int32 old_this = th->th.th_local.this_construct;
763 
764  ++th->th.th_local.this_construct;
765  /* try to advance the team's construct count to this thread's count --
766  success means this thread got the single block */
767  /* TODO: Should this be acquire or release? */
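 /* Annotation (not in the original source): each thread bumps its private
  this_construct counter and then races to compare-and-swap the shared
  team->t.t_construct from the old value to the new one. Exactly one
  thread in the team wins the CAS and executes the single block; the
  others see a mismatch (or a failed CAS) and skip it. */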
768  if (team->t.t_construct == old_this) {
769  status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
770  th->th.th_local.this_construct);
771  }
772 #if USE_ITT_BUILD
773  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
774  KMP_MASTER_GTID(gtid) &&
775 #if OMP_40_ENABLED
776  th->th.th_teams_microtask == NULL &&
777 #endif
778  team->t.t_active_level ==
779  1) { // Only report metadata by master of active team at level 1
780  __kmp_itt_metadata_single(id_ref);
781  }
782 #endif /* USE_ITT_BUILD */
783  }
784 
785  if (__kmp_env_consistency_check) {
786  if (status && push_ws) {
787  __kmp_push_workshare(gtid, ct_psingle, id_ref);
788  } else {
789  __kmp_check_workshare(gtid, ct_psingle, id_ref);
790  }
791  }
792 #if USE_ITT_BUILD
793  if (status) {
794  __kmp_itt_single_start(gtid);
795  }
796 #endif /* USE_ITT_BUILD */
797  return status;
798 }
799 
800 void __kmp_exit_single(int gtid) {
801 #if USE_ITT_BUILD
802  __kmp_itt_single_end(gtid);
803 #endif /* USE_ITT_BUILD */
804  if (__kmp_env_consistency_check)
805  __kmp_pop_workshare(gtid, ct_psingle, NULL);
806 }
807 
808 /* Determine if we can go parallel or must use a serialized parallel region,
809  * and how many threads we can use.
810  * set_nthreads is the number of threads requested for the team.
811  * Returns 1 if we should serialize or only use one thread,
812  * otherwise the number of threads to use.
813  * The forkjoin lock is held by the caller. */
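/* Worked example (annotation, not part of the original source): with
   dynamic_mode == dynamic_thread_limit the reservation below becomes
       new_nthreads = __kmp_avail_proc - __kmp_nth
                      + (root active ? 1 : hot-team nproc),
   so on a 16-processor machine with 10 threads already registered and an
   active root, a request for 12 threads is trimmed to 16 - 10 + 1 = 7. */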
814 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
815  int master_tid, int set_nthreads
816 #if OMP_40_ENABLED
817  ,
818  int enter_teams
819 #endif /* OMP_40_ENABLED */
820  ) {
821  int capacity;
822  int new_nthreads;
823  KMP_DEBUG_ASSERT(__kmp_init_serial);
824  KMP_DEBUG_ASSERT(root && parent_team);
825 
826  // If dyn-var is set, dynamically adjust the number of desired threads,
827  // according to the method specified by dynamic_mode.
828  new_nthreads = set_nthreads;
829  if (!get__dynamic_2(parent_team, master_tid)) {
830  ;
831  }
832 #ifdef USE_LOAD_BALANCE
833  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
834  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
835  if (new_nthreads == 1) {
836  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
837  "reservation to 1 thread\n",
838  master_tid));
839  return 1;
840  }
841  if (new_nthreads < set_nthreads) {
842  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
843  "reservation to %d threads\n",
844  master_tid, new_nthreads));
845  }
846  }
847 #endif /* USE_LOAD_BALANCE */
848  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
849  new_nthreads = __kmp_avail_proc - __kmp_nth +
850  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
851  if (new_nthreads <= 1) {
852  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
853  "reservation to 1 thread\n",
854  master_tid));
855  return 1;
856  }
857  if (new_nthreads < set_nthreads) {
858  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
859  "reservation to %d threads\n",
860  master_tid, new_nthreads));
861  } else {
862  new_nthreads = set_nthreads;
863  }
864  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
865  if (set_nthreads > 2) {
866  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
867  new_nthreads = (new_nthreads % set_nthreads) + 1;
868  if (new_nthreads == 1) {
869  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
870  "reservation to 1 thread\n",
871  master_tid));
872  return 1;
873  }
874  if (new_nthreads < set_nthreads) {
875  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
876  "reservation to %d threads\n",
877  master_tid, new_nthreads));
878  }
879  }
880  } else {
881  KMP_ASSERT(0);
882  }
883 
884  // Respect KMP_ALL_THREADS, KMP_MAX_THREADS, OMP_THREAD_LIMIT.
885  if (__kmp_nth + new_nthreads -
886  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
887  __kmp_max_nth) {
888  int tl_nthreads = __kmp_max_nth - __kmp_nth +
889  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
890  if (tl_nthreads <= 0) {
891  tl_nthreads = 1;
892  }
893 
894  // If dyn-var is false, emit a 1-time warning.
895  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
896  __kmp_reserve_warn = 1;
897  __kmp_msg(kmp_ms_warning,
898  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
899  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
900  }
901  if (tl_nthreads == 1) {
902  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced "
903  "reservation to 1 thread\n",
904  master_tid));
905  return 1;
906  }
907  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced "
908  "reservation to %d threads\n",
909  master_tid, tl_nthreads));
910  new_nthreads = tl_nthreads;
911  }
912 
913  // Check if the threads array is large enough, or needs expanding.
914  // See comment in __kmp_register_root() about the adjustment if
915  // __kmp_threads[0] == NULL.
916  capacity = __kmp_threads_capacity;
917  if (TCR_PTR(__kmp_threads[0]) == NULL) {
918  --capacity;
919  }
920  if (__kmp_nth + new_nthreads -
921  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
922  capacity) {
923  // Expand the threads array.
924  int slotsRequired = __kmp_nth + new_nthreads -
925  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
926  capacity;
927  int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
928  if (slotsAdded < slotsRequired) {
929  // The threads array was not expanded enough.
930  new_nthreads -= (slotsRequired - slotsAdded);
931  KMP_ASSERT(new_nthreads >= 1);
932 
933  // If dyn-var is false, emit a 1-time warning.
934  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
935  __kmp_reserve_warn = 1;
936  if (__kmp_tp_cached) {
937  __kmp_msg(kmp_ms_warning,
938  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
939  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
940  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
941  } else {
942  __kmp_msg(kmp_ms_warning,
943  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
944  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
945  }
946  }
947  }
948  }
949 
950 #ifdef KMP_DEBUG
951  if (new_nthreads == 1) {
952  KC_TRACE(10,
953  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
954  "dead roots and rechecking; requested %d threads\n",
955  __kmp_get_gtid(), set_nthreads));
956  } else {
957  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
958  " %d threads\n",
959  __kmp_get_gtid(), new_nthreads, set_nthreads));
960  }
961 #endif // KMP_DEBUG
962  return new_nthreads;
963 }
964 
965 /* Allocate threads from the thread pool and assign them to the new team. We
966  are assured that there are enough threads available, because we checked on
967  that earlier within the forkjoin critical section. */
968 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
969  kmp_info_t *master_th, int master_gtid) {
970  int i;
971  int use_hot_team;
972 
973  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
974  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
975  KMP_MB();
976 
977  /* first, let's set up the master thread */
978  master_th->th.th_info.ds.ds_tid = 0;
979  master_th->th.th_team = team;
980  master_th->th.th_team_nproc = team->t.t_nproc;
981  master_th->th.th_team_master = master_th;
982  master_th->th.th_team_serialized = FALSE;
983  master_th->th.th_dispatch = &team->t.t_dispatch[0];
984 
985 /* make sure we are not the optimized hot team */
986 #if KMP_NESTED_HOT_TEAMS
987  use_hot_team = 0;
988  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
989  if (hot_teams) { // hot teams array is not allocated if
990  // KMP_HOT_TEAMS_MAX_LEVEL=0
991  int level = team->t.t_active_level - 1; // index in array of hot teams
992  if (master_th->th.th_teams_microtask) { // are we inside the teams?
993  if (master_th->th.th_teams_size.nteams > 1) {
994  ++level; // level was not increased in teams construct for
995  // team_of_masters
996  }
997  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
998  master_th->th.th_teams_level == team->t.t_level) {
999  ++level; // level was not increased in teams construct for
1000  // team_of_workers before the parallel
1001  } // team->t.t_level will be increased inside parallel
1002  }
1003  if (level < __kmp_hot_teams_max_level) {
1004  if (hot_teams[level].hot_team) {
1005  // hot team has already been allocated for given level
1006  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1007  use_hot_team = 1; // the team is ready to use
1008  } else {
1009  use_hot_team = 0; // AC: threads are not allocated yet
1010  hot_teams[level].hot_team = team; // remember new hot team
1011  hot_teams[level].hot_team_nth = team->t.t_nproc;
1012  }
1013  } else {
1014  use_hot_team = 0;
1015  }
1016  }
1017 #else
1018  use_hot_team = team == root->r.r_hot_team;
1019 #endif
1020  if (!use_hot_team) {
1021 
1022  /* install the master thread */
1023  team->t.t_threads[0] = master_th;
1024  __kmp_initialize_info(master_th, team, 0, master_gtid);
1025 
1026  /* now, install the worker threads */
1027  for (i = 1; i < team->t.t_nproc; i++) {
1028 
1029  /* fork or reallocate a new thread and install it in team */
1030  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1031  team->t.t_threads[i] = thr;
1032  KMP_DEBUG_ASSERT(thr);
1033  KMP_DEBUG_ASSERT(thr->th.th_team == team);
1034  /* align team and thread arrived states */
1035  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1036  "T#%d(%d:%d) join =%llu, plain=%llu\n",
1037  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1038  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1039  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1040  team->t.t_bar[bs_plain_barrier].b_arrived));
1041 #if OMP_40_ENABLED
1042  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1043  thr->th.th_teams_level = master_th->th.th_teams_level;
1044  thr->th.th_teams_size = master_th->th.th_teams_size;
1045 #endif
1046  { // Initialize threads' barrier data.
1047  int b;
1048  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1049  for (b = 0; b < bs_last_barrier; ++b) {
1050  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1051  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1052 #if USE_DEBUGGER
1053  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1054 #endif
1055  }; // for b
1056  }
1057  }
1058 
1059 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1060  __kmp_partition_places(team);
1061 #endif
1062  }
1063 
1064  KMP_MB();
1065 }
1066 
1067 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1068 // Propagate any changes to the floating point control registers out to the
1069 // team. We try to avoid unnecessary writes to the relevant cache line in the
1070 // team structure, so we don't make changes unless they are needed.
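// Annotation (not in the original source): the KMP_CHECK_UPDATE idiom used
// below is a read-compare-write guard, roughly
//     if (team_field != new_value) team_field = new_value;
// so a team whose saved FP state already matches the master's values never
// dirties the shared cache line that all workers read.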
1071 inline static void propagateFPControl(kmp_team_t *team) {
1072  if (__kmp_inherit_fp_control) {
1073  kmp_int16 x87_fpu_control_word;
1074  kmp_uint32 mxcsr;
1075 
1076  // Get master values of FPU control flags (both X87 and vector)
1077  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1078  __kmp_store_mxcsr(&mxcsr);
1079  mxcsr &= KMP_X86_MXCSR_MASK;
1080 
1081 // There is no point looking at t_fp_control_saved here.
1082 // If it is TRUE, we still have to update the values if they are different from
1083 // those we now have.
1084 // If it is FALSE we didn't save anything yet, but our objective is the same. We
1085 // have to ensure that the values in the team are the same as those we have.
1086 // So, this code achieves what we need whether or not t_fp_control_saved is
1087 // true. By checking whether the value needs updating we avoid unnecessary
1088 // writes that would put the cache-line into a written state, causing all
1089 // threads in the team to have to read it again.
1090  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1091  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1092  // Although we don't use this value, other code in the runtime wants to know
1093  // whether it should restore them. So we must ensure it is correct.
1094  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1095  } else {
1096  // Similarly here. Don't write to this cache-line in the team structure
1097  // unless we have to.
1098  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1099  }
1100 }
1101 
1102 // Do the opposite, setting the hardware registers to the updated values from
1103 // the team.
1104 inline static void updateHWFPControl(kmp_team_t *team) {
1105  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1106  // Only reset the fp control regs if they have been changed in the team,
1107  // i.e. in the parallel region that we are exiting.
1108  kmp_int16 x87_fpu_control_word;
1109  kmp_uint32 mxcsr;
1110  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1111  __kmp_store_mxcsr(&mxcsr);
1112  mxcsr &= KMP_X86_MXCSR_MASK;
1113 
1114  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1115  __kmp_clear_x87_fpu_status_word();
1116  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1117  }
1118 
1119  if (team->t.t_mxcsr != mxcsr) {
1120  __kmp_load_mxcsr(&team->t.t_mxcsr);
1121  }
1122  }
1123 }
1124 #else
1125 #define propagateFPControl(x) ((void)0)
1126 #define updateHWFPControl(x) ((void)0)
1127 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1128 
1129 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1130  int realloc); // forward declaration
1131 
1132 /* Run a parallel region that has been serialized, so it runs only in a team
1133  consisting of the single master thread. */
1134 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1135  kmp_info_t *this_thr;
1136  kmp_team_t *serial_team;
1137 
1138  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1139 
1140  /* Skip all this code for autopar serialized loops since it results in
1141  unacceptable overhead */
1142  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1143  return;
1144 
1145  if (!TCR_4(__kmp_init_parallel))
1146  __kmp_parallel_initialize();
1147 
1148  this_thr = __kmp_threads[global_tid];
1149  serial_team = this_thr->th.th_serial_team;
1150 
1151  /* utilize the serialized team held by this thread */
1152  KMP_DEBUG_ASSERT(serial_team);
1153  KMP_MB();
1154 
1155  if (__kmp_tasking_mode != tskm_immediate_exec) {
1156  KMP_DEBUG_ASSERT(
1157  this_thr->th.th_task_team ==
1158  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1159  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1160  NULL);
1161  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1162  "team %p, new task_team = NULL\n",
1163  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1164  this_thr->th.th_task_team = NULL;
1165  }
1166 
1167 #if OMP_40_ENABLED
1168  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1169  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1170  proc_bind = proc_bind_false;
1171  } else if (proc_bind == proc_bind_default) {
1172  // No proc_bind clause was specified, so use the current value
1173  // of proc-bind-var for this parallel region.
1174  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1175  }
1176  // Reset for next parallel region
1177  this_thr->th.th_set_proc_bind = proc_bind_default;
1178 #endif /* OMP_40_ENABLED */
1179 
1180  if (this_thr->th.th_team != serial_team) {
1181  // Nested level will be an index in the nested nthreads array
1182  int level = this_thr->th.th_team->t.t_level;
1183 
1184  if (serial_team->t.t_serialized) {
1185  /* this serial team was already used
1186  TODO: increase performance by making these locks more specific */
1187  kmp_team_t *new_team;
1188 
1189  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1190 
1191 #if OMPT_SUPPORT
1192  ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1193 #endif
1194 
1195  new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1196 #if OMPT_SUPPORT
1197  ompt_parallel_id,
1198 #endif
1199 #if OMP_40_ENABLED
1200  proc_bind,
1201 #endif
1202  &this_thr->th.th_current_task->td_icvs,
1203  0 USE_NESTED_HOT_ARG(NULL));
1204  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1205  KMP_ASSERT(new_team);
1206 
1207  /* setup new serialized team and install it */
1208  new_team->t.t_threads[0] = this_thr;
1209  new_team->t.t_parent = this_thr->th.th_team;
1210  serial_team = new_team;
1211  this_thr->th.th_serial_team = serial_team;
1212 
1213  KF_TRACE(
1214  10,
1215  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1216  global_tid, serial_team));
1217 
1218  /* TODO the above breaks the requirement that if we run out of resources,
1219  then we can still guarantee that serialized teams are ok, since we may
1220  need to allocate a new one */
1221  } else {
1222  KF_TRACE(
1223  10,
1224  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1225  global_tid, serial_team));
1226  }
1227 
1228  /* we have to initialize this serial team */
1229  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1230  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1231  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1232  serial_team->t.t_ident = loc;
1233  serial_team->t.t_serialized = 1;
1234  serial_team->t.t_nproc = 1;
1235  serial_team->t.t_parent = this_thr->th.th_team;
1236  serial_team->t.t_sched = this_thr->th.th_team->t.t_sched;
1237  this_thr->th.th_team = serial_team;
1238  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1239 
1240  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1241  this_thr->th.th_current_task));
1242  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1243  this_thr->th.th_current_task->td_flags.executing = 0;
1244 
1245  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1246 
1247  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1248  implicit task for each serialized task represented by
1249  team->t.t_serialized? */
1250  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1251  &this_thr->th.th_current_task->td_parent->td_icvs);
1252 
1253  // Thread value exists in the nested nthreads array for the next nested
1254  // level
1255  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1256  this_thr->th.th_current_task->td_icvs.nproc =
1257  __kmp_nested_nth.nth[level + 1];
1258  }
1259 
1260 #if OMP_40_ENABLED
1261  if (__kmp_nested_proc_bind.used &&
1262  (level + 1 < __kmp_nested_proc_bind.used)) {
1263  this_thr->th.th_current_task->td_icvs.proc_bind =
1264  __kmp_nested_proc_bind.bind_types[level + 1];
1265  }
1266 #endif /* OMP_40_ENABLED */
1267 
1268 #if USE_DEBUGGER
1269  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1270 #endif
1271  this_thr->th.th_info.ds.ds_tid = 0;
1272 
1273  /* set thread cache values */
1274  this_thr->th.th_team_nproc = 1;
1275  this_thr->th.th_team_master = this_thr;
1276  this_thr->th.th_team_serialized = 1;
1277 
1278  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1279  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1280 
1281  propagateFPControl(serial_team);
1282 
1283  /* check if we need to allocate dispatch buffers stack */
1284  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1285  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1286  serial_team->t.t_dispatch->th_disp_buffer =
1287  (dispatch_private_info_t *)__kmp_allocate(
1288  sizeof(dispatch_private_info_t));
1289  }
1290  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1291 
1292 #if OMPT_SUPPORT
1293  ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1294  __ompt_team_assign_id(serial_team, ompt_parallel_id);
1295 #endif
1296 
1297  KMP_MB();
1298 
1299  } else {
1300  /* this serialized team is already being used,
1301  * that's fine, just add another nested level */
1302  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1303  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1304  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1305  ++serial_team->t.t_serialized;
1306  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1307 
1308  // Nested level will be an index in the nested nthreads array
1309  int level = this_thr->th.th_team->t.t_level;
1310  // Thread value exists in the nested nthreads array for the next nested
1311  // level
1312  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1313  this_thr->th.th_current_task->td_icvs.nproc =
1314  __kmp_nested_nth.nth[level + 1];
1315  }
1316  serial_team->t.t_level++;
1317  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1318  "of serial team %p to %d\n",
1319  global_tid, serial_team, serial_team->t.t_level));
1320 
1321  /* allocate/push dispatch buffers stack */
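 /* Annotation (not in the original source): each additional nested
  serialized level pushes one dispatch_private_info_t onto a singly-linked
  stack headed by serial_team->t.t_dispatch->th_disp_buffer; the matching
  entry is popped when that serialized level ends. */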
1322  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1323  {
1324  dispatch_private_info_t *disp_buffer =
1325  (dispatch_private_info_t *)__kmp_allocate(
1326  sizeof(dispatch_private_info_t));
1327  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1328  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1329  }
1330  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1331 
1332  KMP_MB();
1333  }
1334 #if OMP_40_ENABLED
1335  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1336 #endif
1337 
1338  if (__kmp_env_consistency_check)
1339  __kmp_push_parallel(global_tid, NULL);
1340 }
1341 
1342 /* most of the work for a fork */
1343 /* return true if we really went parallel, false if serialized */
1344 int __kmp_fork_call(ident_t *loc, int gtid,
1345  enum fork_context_e call_context, // Intel, GNU, ...
1346  kmp_int32 argc,
1347 #if OMPT_SUPPORT
1348  void *unwrapped_task,
1349 #endif
1350  microtask_t microtask, launch_t invoker,
1351 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1352 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1353  va_list *ap
1354 #else
1355  va_list ap
1356 #endif
1357  ) {
1358  void **argv;
1359  int i;
1360  int master_tid;
1361  int master_this_cons;
1362  kmp_team_t *team;
1363  kmp_team_t *parent_team;
1364  kmp_info_t *master_th;
1365  kmp_root_t *root;
1366  int nthreads;
1367  int master_active;
1368  int master_set_numthreads;
1369  int level;
1370 #if OMP_40_ENABLED
1371  int active_level;
1372  int teams_level;
1373 #endif
1374 #if KMP_NESTED_HOT_TEAMS
1375  kmp_hot_team_ptr_t **p_hot_teams;
1376 #endif
1377  { // KMP_TIME_BLOCK
1378  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1379  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1380 
1381  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1382  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1383  /* Some systems prefer the stack for the root thread(s) to start with */
1384  /* some gap from the parent stack to prevent false sharing. */
1385  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1386  /* These 2 lines below are so this does not get optimized out */
1387  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1388  __kmp_stkpadding += (short)((kmp_int64)dummy);
1389  }
1390 
1391  /* initialize if needed */
1392  KMP_DEBUG_ASSERT(
1393  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1394  if (!TCR_4(__kmp_init_parallel))
1395  __kmp_parallel_initialize();
1396 
1397  /* setup current data */
1398  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1399  // shutdown
1400  parent_team = master_th->th.th_team;
1401  master_tid = master_th->th.th_info.ds.ds_tid;
1402  master_this_cons = master_th->th.th_local.this_construct;
1403  root = master_th->th.th_root;
1404  master_active = root->r.r_active;
1405  master_set_numthreads = master_th->th.th_set_nproc;
1406 
1407 #if OMPT_SUPPORT
1408  ompt_parallel_id_t ompt_parallel_id;
1409  ompt_task_id_t ompt_task_id;
1410  ompt_frame_t *ompt_frame;
1411  ompt_task_id_t my_task_id;
1412  ompt_parallel_id_t my_parallel_id;
1413 
1414  if (ompt_enabled) {
1415  ompt_parallel_id = __ompt_parallel_id_new(gtid);
1416  ompt_task_id = __ompt_get_task_id_internal(0);
1417  ompt_frame = __ompt_get_task_frame_internal(0);
1418  }
1419 #endif
1420 
1421  // Nested level will be an index in the nested nthreads array
1422  level = parent_team->t.t_level;
1423  // used to launch non-serial teams even if nested is not allowed
1424  active_level = parent_team->t.t_active_level;
1425 #if OMP_40_ENABLED
1426  // needed to check nesting inside the teams
1427  teams_level = master_th->th.th_teams_level;
1428 #endif
1429 #if KMP_NESTED_HOT_TEAMS
1430  p_hot_teams = &master_th->th.th_hot_teams;
1431  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1432  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1433  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1434  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1435  // it is either actual or not needed (when active_level > 0)
1436  (*p_hot_teams)[0].hot_team_nth = 1;
1437  }
1438 #endif
1439 
1440 #if OMPT_SUPPORT
1441  if (ompt_enabled &&
1442  ompt_callbacks.ompt_callback(ompt_event_parallel_begin)) {
1443  int team_size = master_set_numthreads;
1444 
1445  ompt_callbacks.ompt_callback(ompt_event_parallel_begin)(
1446  ompt_task_id, ompt_frame, ompt_parallel_id, team_size, unwrapped_task,
1447  OMPT_INVOKER(call_context));
1448  }
1449 #endif
1450 
1451  master_th->th.th_ident = loc;
1452 
1453 #if OMP_40_ENABLED
1454  if (master_th->th.th_teams_microtask && ap &&
1455  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1456  // AC: This is the start of a parallel region nested inside a teams construct.
1457  // The team is actual (hot); all workers are ready at the fork barrier.
1458  // No lock is needed to initialize the team a bit and then release the workers.
1459  parent_team->t.t_ident = loc;
1460  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1461  parent_team->t.t_argc = argc;
1462  argv = (void **)parent_team->t.t_argv;
1463  for (i = argc - 1; i >= 0; --i)
1464 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1465 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1466  *argv++ = va_arg(*ap, void *);
1467 #else
1468  *argv++ = va_arg(ap, void *);
1469 #endif
1470  // Increment our nested depth level, but do not increase the serialization
1471  if (parent_team == master_th->th.th_serial_team) {
1472  // AC: we are in serialized parallel
1473  __kmpc_serialized_parallel(loc, gtid);
1474  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1475  // AC: need this so that enquiry functions work
1476  // correctly; will restore at join time
1477  parent_team->t.t_serialized--;
1478 #if OMPT_SUPPORT
1479  void *dummy;
1480  void **exit_runtime_p;
1481 
1482  ompt_lw_taskteam_t lw_taskteam;
1483 
1484  if (ompt_enabled) {
1485  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, unwrapped_task,
1486  ompt_parallel_id);
1487  lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1488  exit_runtime_p =
1489  &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1490 
1491  __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1492 
1493 #if OMPT_TRACE
1494  /* OMPT implicit task begin */
1495  my_task_id = lw_taskteam.ompt_task_info.task_id;
1496  my_parallel_id = parent_team->t.ompt_team_info.parallel_id;
1497  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1498  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1499  my_parallel_id, my_task_id);
1500  }
1501 #endif
1502 
1503  /* OMPT state */
1504  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1505  } else {
1506  exit_runtime_p = &dummy;
1507  }
1508 #endif
1509 
1510  {
1511  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1512  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1513  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1514 #if OMPT_SUPPORT
1515  ,
1516  exit_runtime_p
1517 #endif
1518  );
1519  }
1520 
1521 #if OMPT_SUPPORT
1522  *exit_runtime_p = NULL;
1523  if (ompt_enabled) {
1524 #if OMPT_TRACE
1525  lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1526 
1527  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1528  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1529  ompt_parallel_id, ompt_task_id);
1530  }
1531 
1532  __ompt_lw_taskteam_unlink(master_th);
1533  // reset (clear) the task id only after unlinking the task
1534  lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1535 #endif
1536 
1537  if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1538  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1539  ompt_parallel_id, ompt_task_id, OMPT_INVOKER(call_context));
1540  }
1541  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1542  }
1543 #endif
1544  return TRUE;
1545  }
1546 
1547  parent_team->t.t_pkfn = microtask;
1548 #if OMPT_SUPPORT
1549  parent_team->t.ompt_team_info.microtask = unwrapped_task;
1550 #endif
1551  parent_team->t.t_invoke = invoker;
1552  KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1553  parent_team->t.t_active_level++;
1554  parent_team->t.t_level++;
1555 
1556  /* Change number of threads in the team if requested */
1557  if (master_set_numthreads) { // The parallel has num_threads clause
1558  if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1559  // AC: we can only reduce the number of threads dynamically, not increase it
1560  kmp_info_t **other_threads = parent_team->t.t_threads;
1561  parent_team->t.t_nproc = master_set_numthreads;
1562  for (i = 0; i < master_set_numthreads; ++i) {
1563  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1564  }
1565  // Keep extra threads hot in the team for possible next parallels
1566  }
1567  master_th->th.th_set_nproc = 0;
1568  }
1569 
1570 #if USE_DEBUGGER
1571  if (__kmp_debugging) { // Let debugger override number of threads.
1572  int nth = __kmp_omp_num_threads(loc);
1573  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1574  master_set_numthreads = nth;
1575  }; // if
1576  }; // if
1577 #endif
1578 
1579  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1580  "master_th=%p, gtid=%d\n",
1581  root, parent_team, master_th, gtid));
1582  __kmp_internal_fork(loc, gtid, parent_team);
1583  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1584  "master_th=%p, gtid=%d\n",
1585  root, parent_team, master_th, gtid));
1586 
1587  /* Invoke microtask for MASTER thread */
1588  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1589  parent_team->t.t_id, parent_team->t.t_pkfn));
1590 
1591  {
1592  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1593  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1594  if (!parent_team->t.t_invoke(gtid)) {
1595  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1596  }
1597  }
1598  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1599  parent_team->t.t_id, parent_team->t.t_pkfn));
1600  KMP_MB(); /* Flush all pending memory write invalidates. */
1601 
1602  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1603 
1604  return TRUE;
1605  } // Parallel closely nested in teams construct
1606 #endif /* OMP_40_ENABLED */
1607 
1608 #if KMP_DEBUG
1609  if (__kmp_tasking_mode != tskm_immediate_exec) {
1610  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1611  parent_team->t.t_task_team[master_th->th.th_task_state]);
1612  }
1613 #endif
1614 
1615  if (parent_team->t.t_active_level >=
1616  master_th->th.th_current_task->td_icvs.max_active_levels) {
1617  nthreads = 1;
1618  } else {
1619 #if OMP_40_ENABLED
1620  int enter_teams = ((ap == NULL && active_level == 0) ||
1621  (ap && teams_level > 0 && teams_level == level));
1622 #endif
1623  nthreads =
1624  master_set_numthreads
1625  ? master_set_numthreads
1626  : get__nproc_2(
1627  parent_team,
1628  master_tid); // TODO: get nproc directly from current task
1629 
1630  // Check if we need to take the forkjoin lock (no need for a serialized
1631  // parallel outside of a teams construct). This code was moved here from
1632  // __kmp_reserve_threads() to speed up nested serialized parallels.
1633  if (nthreads > 1) {
1634  if ((!get__nested(master_th) && (root->r.r_in_parallel
1635 #if OMP_40_ENABLED
1636  && !enter_teams
1637 #endif /* OMP_40_ENABLED */
1638  )) ||
1639  (__kmp_library == library_serial)) {
1640  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1641  " threads\n",
1642  gtid, nthreads));
1643  nthreads = 1;
1644  }
1645  }
1646  if (nthreads > 1) {
1647  /* determine how many new threads we can use */
1648  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1649  nthreads = __kmp_reserve_threads(
1650  root, parent_team, master_tid, nthreads
1651 #if OMP_40_ENABLED
1652  /* AC: If we execute teams from a parallel region (on the host), then
1653  the teams should be created, but each can have only 1 thread if
1654  nesting is disabled. If teams is called from a serial region, then the
1655  teams and their threads should be created regardless of the
1656  nesting setting. */
1657  ,
1658  enter_teams
1659 #endif /* OMP_40_ENABLED */
1660  );
1661  if (nthreads == 1) {
1662  // Free the lock for single-thread execution here; for multi-thread
1663  // execution it will be freed later, after the team of threads has been
1664  // created and initialized
1665  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1666  }
1667  }
1668  }
1669  KMP_DEBUG_ASSERT(nthreads > 0);
1670 
1671  // If we temporarily changed the set number of threads then restore it now
1672  master_th->th.th_set_nproc = 0;
1673 
1674  /* create a serialized parallel region? */
1675  if (nthreads == 1) {
1676 /* josh todo: hypothetical question: what do we do for OS X*? */
1677 #if KMP_OS_LINUX && \
1678  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1679  void *args[argc];
1680 #else
1681  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1682 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1683  KMP_ARCH_AARCH64) */
1684 
1685  KA_TRACE(20,
1686  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1687 
1688  __kmpc_serialized_parallel(loc, gtid);
1689 
1690  if (call_context == fork_context_intel) {
1691  /* TODO this sucks, use the compiler itself to pass args! :) */
1692  master_th->th.th_serial_team->t.t_ident = loc;
1693 #if OMP_40_ENABLED
1694  if (!ap) {
1695  // revert change made in __kmpc_serialized_parallel()
1696  master_th->th.th_serial_team->t.t_level--;
1697 // Get args from parent team for teams construct
1698 
1699 #if OMPT_SUPPORT
1700  void *dummy;
1701  void **exit_runtime_p;
1702 
1703  ompt_lw_taskteam_t lw_taskteam;
1704 
1705  if (ompt_enabled) {
1706  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1707  unwrapped_task, ompt_parallel_id);
1708  lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1709  exit_runtime_p =
1710  &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1711 
1712  __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1713 
1714 #if OMPT_TRACE
1715  my_task_id = lw_taskteam.ompt_task_info.task_id;
1716  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1717  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1718  ompt_parallel_id, my_task_id);
1719  }
1720 #endif
1721 
1722  /* OMPT state */
1723  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1724  } else {
1725  exit_runtime_p = &dummy;
1726  }
1727 #endif
1728 
1729  {
1730  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1731  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1732  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1733  parent_team->t.t_argv
1734 #if OMPT_SUPPORT
1735  ,
1736  exit_runtime_p
1737 #endif
1738  );
1739  }
1740 
1741 #if OMPT_SUPPORT
1742  *exit_runtime_p = NULL;
1743  if (ompt_enabled) {
1744  lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1745 
1746 #if OMPT_TRACE
1747  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1748  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1749  ompt_parallel_id, ompt_task_id);
1750  }
1751 #endif
1752 
1753  __ompt_lw_taskteam_unlink(master_th);
1754  // clear the task id only after unlinking the task
1755  lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1756 
1757  if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1758  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1759  ompt_parallel_id, ompt_task_id, OMPT_INVOKER(call_context));
1760  }
1761  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1762  }
1763 #endif
1764  } else if (microtask == (microtask_t)__kmp_teams_master) {
1765  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1766  master_th->th.th_serial_team);
1767  team = master_th->th.th_team;
1768  // team->t.t_pkfn = microtask;
1769  team->t.t_invoke = invoker;
1770  __kmp_alloc_argv_entries(argc, team, TRUE);
1771  team->t.t_argc = argc;
1772  argv = (void **)team->t.t_argv;
1773  if (ap) {
1774  for (i = argc - 1; i >= 0; --i)
1775 // TODO: revert workaround for Intel(R) 64 tracker #96
1776 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1777  *argv++ = va_arg(*ap, void *);
1778 #else
1779  *argv++ = va_arg(ap, void *);
1780 #endif
1781  } else {
1782  for (i = 0; i < argc; ++i)
1783  // Get args from parent team for teams construct
1784  argv[i] = parent_team->t.t_argv[i];
1785  }
1786  // AC: revert change made in __kmpc_serialized_parallel()
1787  // because initial code in teams should have level=0
1788  team->t.t_level--;
1789  // AC: call special invoker for outer "parallel" of teams construct
1790  {
1791  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1792  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1793  invoker(gtid);
1794  }
1795  } else {
1796 #endif /* OMP_40_ENABLED */
1797  argv = args;
1798  for (i = argc - 1; i >= 0; --i)
1799 // TODO: revert workaround for Intel(R) 64 tracker #96
1800 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1801  *argv++ = va_arg(*ap, void *);
1802 #else
1803  *argv++ = va_arg(ap, void *);
1804 #endif
1805  KMP_MB();
1806 
1807 #if OMPT_SUPPORT
1808  void *dummy;
1809  void **exit_runtime_p;
1810 
1811  ompt_lw_taskteam_t lw_taskteam;
1812 
1813  if (ompt_enabled) {
1814  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1815  unwrapped_task, ompt_parallel_id);
1816  lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1817  exit_runtime_p =
1818  &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1819 
1820  __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1821 
1822 #if OMPT_TRACE
1823  /* OMPT implicit task begin */
1824  my_task_id = lw_taskteam.ompt_task_info.task_id;
1825  my_parallel_id = ompt_parallel_id;
1826  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1827  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1828  my_parallel_id, my_task_id);
1829  }
1830 #endif
1831 
1832  /* OMPT state */
1833  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1834  } else {
1835  exit_runtime_p = &dummy;
1836  }
1837 #endif
1838 
1839  {
1840  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1841  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1842  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1843 #if OMPT_SUPPORT
1844  ,
1845  exit_runtime_p
1846 #endif
1847  );
1848  }
1849 
1850 #if OMPT_SUPPORT
1851  *exit_runtime_p = NULL;
1852  if (ompt_enabled) {
1853 #if OMPT_TRACE
1854  lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1855 
1856  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1857  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1858  my_parallel_id, my_task_id);
1859  }
1860 #endif
1861 
1862  __ompt_lw_taskteam_unlink(master_th);
1863  // clear the task id only after unlinking the task
1864  lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1865 
1866  if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1867  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1868  ompt_parallel_id, ompt_task_id, OMPT_INVOKER(call_context));
1869  }
1870  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1871  }
1872 #endif
1873 #if OMP_40_ENABLED
1874  }
1875 #endif /* OMP_40_ENABLED */
1876  } else if (call_context == fork_context_gnu) {
1877 #if OMPT_SUPPORT
1878  ompt_lw_taskteam_t *lwt =
1879  (ompt_lw_taskteam_t *)__kmp_allocate(sizeof(ompt_lw_taskteam_t));
1880  __ompt_lw_taskteam_init(lwt, master_th, gtid, unwrapped_task,
1881  ompt_parallel_id);
1882 
1883  lwt->ompt_task_info.task_id = __ompt_task_id_new(gtid);
1884  lwt->ompt_task_info.frame.exit_runtime_frame = NULL;
1885  __ompt_lw_taskteam_link(lwt, master_th);
1886 #endif
1887 
1888  // we were called from GNU native code
1889  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1890  return FALSE;
1891  } else {
1892  KMP_ASSERT2(call_context < fork_context_last,
1893  "__kmp_fork_call: unknown fork_context parameter");
1894  }
1895 
1896  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1897  KMP_MB();
1898  return FALSE;
1899  }
1900 
1901  // GEH: only modify the executing flag in the case when not serialized;
1902  // the serialized case is handled in __kmpc_serialized_parallel
1903  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1904  "curtask=%p, curtask_max_aclevel=%d\n",
1905  parent_team->t.t_active_level, master_th,
1906  master_th->th.th_current_task,
1907  master_th->th.th_current_task->td_icvs.max_active_levels));
1908  // TODO: GEH - cannot do this assertion because root thread not set up as
1909  // executing
1910  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1911  master_th->th.th_current_task->td_flags.executing = 0;
1912 
1913 #if OMP_40_ENABLED
1914  if (!master_th->th.th_teams_microtask || level > teams_level)
1915 #endif /* OMP_40_ENABLED */
1916  {
1917  /* Increment our nested depth level */
1918  KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1919  }
1920 
1921  // See if we need to make a copy of the ICVs.
1922  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1923  if ((level + 1 < __kmp_nested_nth.used) &&
1924  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1925  nthreads_icv = __kmp_nested_nth.nth[level + 1];
1926  } else {
1927  nthreads_icv = 0; // don't update
1928  }
1929 
1930 #if OMP_40_ENABLED
1931  // Figure out the proc_bind_policy for the new team.
1932  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1933  kmp_proc_bind_t proc_bind_icv =
1934  proc_bind_default; // proc_bind_default means don't update
1935  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1936  proc_bind = proc_bind_false;
1937  } else {
1938  if (proc_bind == proc_bind_default) {
1939  // No proc_bind clause specified; use current proc-bind-var for this
1940  // parallel region
1941  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1942  }
1943  /* else: The proc_bind policy was specified explicitly on parallel clause.
1944  This overrides proc-bind-var for this parallel region, but does not
1945  change proc-bind-var. */
1946  // Figure the value of proc-bind-var for the child threads.
1947  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1948  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1949  master_th->th.th_current_task->td_icvs.proc_bind)) {
1950  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1951  }
1952  }
1953 
1954  // Reset for next parallel region
1955  master_th->th.th_set_proc_bind = proc_bind_default;
1956 #endif /* OMP_40_ENABLED */
1957 
1958  if ((nthreads_icv > 0)
1959 #if OMP_40_ENABLED
1960  || (proc_bind_icv != proc_bind_default)
1961 #endif /* OMP_40_ENABLED */
1962  ) {
1963  kmp_internal_control_t new_icvs;
1964  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1965  new_icvs.next = NULL;
1966  if (nthreads_icv > 0) {
1967  new_icvs.nproc = nthreads_icv;
1968  }
1969 
1970 #if OMP_40_ENABLED
1971  if (proc_bind_icv != proc_bind_default) {
1972  new_icvs.proc_bind = proc_bind_icv;
1973  }
1974 #endif /* OMP_40_ENABLED */
1975 
1976  /* allocate a new parallel team */
1977  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1978  team = __kmp_allocate_team(root, nthreads, nthreads,
1979 #if OMPT_SUPPORT
1980  ompt_parallel_id,
1981 #endif
1982 #if OMP_40_ENABLED
1983  proc_bind,
1984 #endif
1985  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
1986  } else {
1987  /* allocate a new parallel team */
1988  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1989  team = __kmp_allocate_team(root, nthreads, nthreads,
1990 #if OMPT_SUPPORT
1991  ompt_parallel_id,
1992 #endif
1993 #if OMP_40_ENABLED
1994  proc_bind,
1995 #endif
1996  &master_th->th.th_current_task->td_icvs,
1997  argc USE_NESTED_HOT_ARG(master_th));
1998  }
1999  KF_TRACE(
2000  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2001 
2002  /* setup the new team */
2003  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2004  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2005  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2006  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2007  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2008 #if OMPT_SUPPORT
2009  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.microtask, unwrapped_task);
2010 #endif
2011  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2012 // TODO: parent_team->t.t_level == INT_MAX ???
2013 #if OMP_40_ENABLED
2014  if (!master_th->th.th_teams_microtask || level > teams_level) {
2015 #endif /* OMP_40_ENABLED */
2016  int new_level = parent_team->t.t_level + 1;
2017  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2018  new_level = parent_team->t.t_active_level + 1;
2019  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2020 #if OMP_40_ENABLED
2021  } else {
2022  // AC: Do not increase parallel level at start of the teams construct
2023  int new_level = parent_team->t.t_level;
2024  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2025  new_level = parent_team->t.t_active_level;
2026  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2027  }
2028 #endif /* OMP_40_ENABLED */
2029  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2030  if (team->t.t_sched.r_sched_type != new_sched.r_sched_type ||
2031  team->t.t_sched.chunk != new_sched.chunk)
2032  team->t.t_sched =
2033  new_sched; // set master's schedule as new run-time schedule
2034 
2035 #if OMP_40_ENABLED
2036  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2037 #endif
2038 
2039  // Update the floating point rounding in the team if required.
2040  propagateFPControl(team);
2041 
2042  if (__kmp_tasking_mode != tskm_immediate_exec) {
2043  // Set the master's task team to the team's task team. Unless this is a hot
2044  // team, it should be NULL.
2045  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2046  parent_team->t.t_task_team[master_th->th.th_task_state]);
2047  KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2048  "%p, new task_team %p / team %p\n",
2049  __kmp_gtid_from_thread(master_th),
2050  master_th->th.th_task_team, parent_team,
2051  team->t.t_task_team[master_th->th.th_task_state], team));
2052 
2053  if (active_level || master_th->th.th_task_team) {
2054  // Take a memo of master's task_state
2055  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2056  if (master_th->th.th_task_state_top >=
2057  master_th->th.th_task_state_stack_sz) { // increase size
2058  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2059  kmp_uint8 *old_stack, *new_stack;
2060  kmp_uint32 i;
2061  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2062  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2063  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2064  }
2065  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2066  ++i) { // zero-init rest of stack
2067  new_stack[i] = 0;
2068  }
2069  old_stack = master_th->th.th_task_state_memo_stack;
2070  master_th->th.th_task_state_memo_stack = new_stack;
2071  master_th->th.th_task_state_stack_sz = new_size;
2072  __kmp_free(old_stack);
2073  }
2074  // Store master's task_state on stack
2075  master_th->th
2076  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2077  master_th->th.th_task_state;
2078  master_th->th.th_task_state_top++;
2079 #if KMP_NESTED_HOT_TEAMS
2080  if (team == master_th->th.th_hot_teams[active_level].hot_team) {
2081  // Restore master's nested state if nested hot team
2082  master_th->th.th_task_state =
2083  master_th->th
2084  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2085  } else {
2086 #endif
2087  master_th->th.th_task_state = 0;
2088 #if KMP_NESTED_HOT_TEAMS
2089  }
2090 #endif
2091  }
2092 #if !KMP_NESTED_HOT_TEAMS
2093  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2094  (team == root->r.r_hot_team));
2095 #endif
2096  }
2097 
2098  KA_TRACE(
2099  20,
2100  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2101  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2102  team->t.t_nproc));
2103  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2104  (team->t.t_master_tid == 0 &&
2105  (team->t.t_parent == root->r.r_root_team ||
2106  team->t.t_parent->t.t_serialized)));
2107  KMP_MB();
2108 
2109  /* now, setup the arguments */
2110  argv = (void **)team->t.t_argv;
2111 #if OMP_40_ENABLED
2112  if (ap) {
2113 #endif /* OMP_40_ENABLED */
2114  for (i = argc - 1; i >= 0; --i) {
2115 // TODO: revert workaround for Intel(R) 64 tracker #96
2116 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2117  void *new_argv = va_arg(*ap, void *);
2118 #else
2119  void *new_argv = va_arg(ap, void *);
2120 #endif
2121  KMP_CHECK_UPDATE(*argv, new_argv);
2122  argv++;
2123  }
2124 #if OMP_40_ENABLED
2125  } else {
2126  for (i = 0; i < argc; ++i) {
2127  // Get args from parent team for teams construct
2128  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2129  }
2130  }
2131 #endif /* OMP_40_ENABLED */
2132 
2133  /* now actually fork the threads */
2134  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2135  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2136  root->r.r_active = TRUE;
2137 
2138  __kmp_fork_team_threads(root, team, master_th, gtid);
2139  __kmp_setup_icv_copy(team, nthreads,
2140  &master_th->th.th_current_task->td_icvs, loc);
2141 
2142 #if OMPT_SUPPORT
2143  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2144 #endif
2145 
2146  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2147 
2148 #if USE_ITT_BUILD
2149  if (team->t.t_active_level == 1 // only report frames at level 1
2150 #if OMP_40_ENABLED
2151  && !master_th->th.th_teams_microtask // not in teams construct
2152 #endif /* OMP_40_ENABLED */
2153  ) {
2154 #if USE_ITT_NOTIFY
2155  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2156  (__kmp_forkjoin_frames_mode == 3 ||
2157  __kmp_forkjoin_frames_mode == 1)) {
2158  kmp_uint64 tmp_time = 0;
2159  if (__itt_get_timestamp_ptr)
2160  tmp_time = __itt_get_timestamp();
2161  // Internal fork - report frame begin
2162  master_th->th.th_frame_time = tmp_time;
2163  if (__kmp_forkjoin_frames_mode == 3)
2164  team->t.t_region_time = tmp_time;
2165  } else
2166 // only one notification scheme (either "submit" or "forking/joined", not both)
2167 #endif /* USE_ITT_NOTIFY */
2168  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2169  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2170  // Mark start of "parallel" region for VTune.
2171  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2172  }
2173  }
2174 #endif /* USE_ITT_BUILD */
2175 
2176  /* now go on and do the work */
2177  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2178  KMP_MB();
2179  KF_TRACE(10,
2180  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2181  root, team, master_th, gtid));
2182 
2183 #if USE_ITT_BUILD
2184  if (__itt_stack_caller_create_ptr) {
2185  team->t.t_stack_id =
2186  __kmp_itt_stack_caller_create(); // create new stack stitching id
2187  // before entering fork barrier
2188  }
2189 #endif /* USE_ITT_BUILD */
2190 
2191 #if OMP_40_ENABLED
2192  // AC: skip __kmp_internal_fork for the teams construct; let only the master
2193  // threads execute
2194  if (ap)
2195 #endif /* OMP_40_ENABLED */
2196  {
2197  __kmp_internal_fork(loc, gtid, team);
2198  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2199  "master_th=%p, gtid=%d\n",
2200  root, team, master_th, gtid));
2201  }
2202 
2203  if (call_context == fork_context_gnu) {
2204  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2205  return TRUE;
2206  }
2207 
2208  /* Invoke microtask for MASTER thread */
2209  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2210  team->t.t_id, team->t.t_pkfn));
2211  } // END of timer KMP_fork_call block
2212 
2213  {
2214  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
2215  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
2216  if (!team->t.t_invoke(gtid)) {
2217  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2218  }
2219  }
2220  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2221  team->t.t_id, team->t.t_pkfn));
2222  KMP_MB(); /* Flush all pending memory write invalidates. */
2223 
2224  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2225 
2226 #if OMPT_SUPPORT
2227  if (ompt_enabled) {
2228  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2229  }
2230 #endif
2231 
2232  return TRUE;
2233 }
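
// Illustrative sketch (not part of the runtime): the master's task-state memo
// stack used above (th_task_state_memo_stack, grown in __kmp_fork_call and
// popped in __kmp_join_call) is expanded by doubling its capacity, copying the
// old contents and zero-filling the new tail. A minimal, self-contained version
// of that growth-and-push pattern is shown below with hypothetical names
// (state_stack_t, state_stack_push) and plain malloc/free instead of
// __kmp_allocate/__kmp_free; guarded out so this translation unit is unaffected.
#if 0
#include <stdlib.h>
#include <string.h>

typedef struct state_stack {
  unsigned char *data; // backing storage (plays the role of the memo stack)
  unsigned top;        // next free slot
  unsigned size;       // current capacity
} state_stack_t;

static void state_stack_push(state_stack_t *s, unsigned char v) {
  if (s->top >= s->size) { // full: double the capacity
    unsigned new_size = s->size ? 2 * s->size : 8;
    unsigned char *new_data = (unsigned char *)malloc(new_size);
    memcpy(new_data, s->data, s->size);                // keep the saved states
    memset(new_data + s->size, 0, new_size - s->size); // zero-init the rest
    free(s->data);
    s->data = new_data;
    s->size = new_size;
  }
  s->data[s->top++] = v; // store the state and push
}
#endif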
2234 
2235 #if OMPT_SUPPORT
2236 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2237  kmp_team_t *team) {
2238  // restore state outside the region
2239  thread->th.ompt_thread_info.state =
2240  ((team->t.t_serialized) ? ompt_state_work_serial
2241  : ompt_state_work_parallel);
2242 }
2243 
2244 static inline void __kmp_join_ompt(kmp_info_t *thread, kmp_team_t *team,
2245  ompt_parallel_id_t parallel_id,
2246  fork_context_e fork_context) {
2247  ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2248  if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
2249  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
2250  parallel_id, task_info->task_id, OMPT_INVOKER(fork_context));
2251  }
2252 
2253  task_info->frame.reenter_runtime_frame = NULL;
2254  __kmp_join_restore_state(thread, team);
2255 }
2256 #endif
2257 
2258 void __kmp_join_call(ident_t *loc, int gtid
2259 #if OMPT_SUPPORT
2260  ,
2261  enum fork_context_e fork_context
2262 #endif
2263 #if OMP_40_ENABLED
2264  ,
2265  int exit_teams
2266 #endif /* OMP_40_ENABLED */
2267  ) {
2268  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2269  kmp_team_t *team;
2270  kmp_team_t *parent_team;
2271  kmp_info_t *master_th;
2272  kmp_root_t *root;
2273  int master_active;
2274  int i;
2275 
2276  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2277 
2278  /* setup current data */
2279  master_th = __kmp_threads[gtid];
2280  root = master_th->th.th_root;
2281  team = master_th->th.th_team;
2282  parent_team = team->t.t_parent;
2283 
2284  master_th->th.th_ident = loc;
2285 
2286 #if OMPT_SUPPORT
2287  if (ompt_enabled) {
2288  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2289  }
2290 #endif
2291 
2292 #if KMP_DEBUG
2293  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2294  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2295  "th_task_team = %p\n",
2296  __kmp_gtid_from_thread(master_th), team,
2297  team->t.t_task_team[master_th->th.th_task_state],
2298  master_th->th.th_task_team));
2299  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2300  team->t.t_task_team[master_th->th.th_task_state]);
2301  }
2302 #endif
2303 
2304  if (team->t.t_serialized) {
2305 #if OMP_40_ENABLED
2306  if (master_th->th.th_teams_microtask) {
2307  // We are in teams construct
2308  int level = team->t.t_level;
2309  int tlevel = master_th->th.th_teams_level;
2310  if (level == tlevel) {
2311  // AC: we haven't incremented it earlier at start of teams construct,
2312  // so do it here - at the end of teams construct
2313  team->t.t_level++;
2314  } else if (level == tlevel + 1) {
2315  // AC: we are exiting parallel inside teams, need to increment
2316  // serialization in order to restore it in the next call to
2317  // __kmpc_end_serialized_parallel
2318  team->t.t_serialized++;
2319  }
2320  }
2321 #endif /* OMP_40_ENABLED */
2322  __kmpc_end_serialized_parallel(loc, gtid);
2323 
2324 #if OMPT_SUPPORT
2325  if (ompt_enabled) {
2326  __kmp_join_restore_state(master_th, parent_team);
2327  }
2328 #endif
2329 
2330  return;
2331  }
2332 
2333  master_active = team->t.t_master_active;
2334 
2335 #if OMP_40_ENABLED
2336  if (!exit_teams)
2337 #endif /* OMP_40_ENABLED */
2338  {
2339  // AC: No barrier for internal teams at exit from the teams construct,
2340  // but there is a barrier for the external team (league).
2341  __kmp_internal_join(loc, gtid, team);
2342  }
2343 #if OMP_40_ENABLED
2344  else {
2345  master_th->th.th_task_state =
2346  0; // AC: no tasking in teams (out of any parallel)
2347  }
2348 #endif /* OMP_40_ENABLED */
2349 
2350  KMP_MB();
2351 
2352 #if OMPT_SUPPORT
2353  ompt_parallel_id_t parallel_id = team->t.ompt_team_info.parallel_id;
2354 #endif
2355 
2356 #if USE_ITT_BUILD
2357  if (__itt_stack_caller_create_ptr) {
2358  __kmp_itt_stack_caller_destroy(
2359  (__itt_caller)team->t
2360  .t_stack_id); // destroy the stack stitching id after join barrier
2361  }
2362 
2363  // Mark end of "parallel" region for VTune.
2364  if (team->t.t_active_level == 1
2365 #if OMP_40_ENABLED
2366  && !master_th->th.th_teams_microtask /* not in teams construct */
2367 #endif /* OMP_40_ENABLED */
2368  ) {
2369  master_th->th.th_ident = loc;
2370  // only one notification scheme (either "submit" or "forking/joined", not
2371  // both)
2372  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2373  __kmp_forkjoin_frames_mode == 3)
2374  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2375  master_th->th.th_frame_time, 0, loc,
2376  master_th->th.th_team_nproc, 1);
2377  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2378  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2379  __kmp_itt_region_joined(gtid);
2380  } // active_level == 1
2381 #endif /* USE_ITT_BUILD */
2382 
2383 #if OMP_40_ENABLED
2384  if (master_th->th.th_teams_microtask && !exit_teams &&
2385  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2386  team->t.t_level == master_th->th.th_teams_level + 1) {
2387  // AC: We need to leave the team structure intact at the end of a parallel
2388  // inside the teams construct, so that the same (hot) team works at the next
2389  // parallel; only adjust the nesting levels
2390 
2391  /* Decrement our nested depth level */
2392  team->t.t_level--;
2393  team->t.t_active_level--;
2394  KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2395 
2396  /* Restore number of threads in the team if needed */
2397  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2398  int old_num = master_th->th.th_team_nproc;
2399  int new_num = master_th->th.th_teams_size.nth;
2400  kmp_info_t **other_threads = team->t.t_threads;
2401  team->t.t_nproc = new_num;
2402  for (i = 0; i < old_num; ++i) {
2403  other_threads[i]->th.th_team_nproc = new_num;
2404  }
2405  // Adjust the state of the unused threads of the team
2406  for (i = old_num; i < new_num; ++i) {
2407  // Re-initialize thread's barrier data.
2408  int b;
2409  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2410  for (b = 0; b < bs_last_barrier; ++b) {
2411  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2412  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2413 #if USE_DEBUGGER
2414  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2415 #endif
2416  }
2417  if (__kmp_tasking_mode != tskm_immediate_exec) {
2418  // Synchronize thread's task state
2419  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2420  }
2421  }
2422  }
2423 
2424 #if OMPT_SUPPORT
2425  if (ompt_enabled) {
2426  __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2427  }
2428 #endif
2429 
2430  return;
2431  }
2432 #endif /* OMP_40_ENABLED */
2433 
2434  /* do cleanup and restore the parent team */
2435  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2436  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2437 
2438  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2439 
2440  /* jc: The following lock has instructions with REL and ACQ semantics,
2441  separating the parallel user code called in this parallel region
2442  from the serial user code called after this function returns. */
2443  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2444 
2445 #if OMP_40_ENABLED
2446  if (!master_th->th.th_teams_microtask ||
2447  team->t.t_level > master_th->th.th_teams_level)
2448 #endif /* OMP_40_ENABLED */
2449  {
2450  /* Decrement our nested depth level */
2451  KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2452  }
2453  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2454 
2455 #if OMPT_SUPPORT && OMPT_TRACE
2456  if (ompt_enabled) {
2457  ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2458  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
2459  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
2460  parallel_id, task_info->task_id);
2461  }
2462  task_info->frame.exit_runtime_frame = NULL;
2463  task_info->task_id = 0;
2464  }
2465 #endif
2466 
2467  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2468  master_th, team));
2469  __kmp_pop_current_task_from_thread(master_th);
2470 
2471 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2472  // Restore master thread's partition.
2473  master_th->th.th_first_place = team->t.t_first_place;
2474  master_th->th.th_last_place = team->t.t_last_place;
2475 #endif /* OMP_40_ENABLED */
2476 
2477  updateHWFPControl(team);
2478 
2479  if (root->r.r_active != master_active)
2480  root->r.r_active = master_active;
2481 
2482  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2483  master_th)); // this will free worker threads
2484 
2485  /* This race was fun to find. Make sure the following is in the critical
2486  region; otherwise assertions may fail occasionally since the old team may be
2487  reallocated and the hierarchy appears inconsistent. It is actually safe to
2488  run and won't cause any bugs, but will cause those assertion failures. It's
2489  only one deref & assign, so we might as well put this in the critical region. */
2490  master_th->th.th_team = parent_team;
2491  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2492  master_th->th.th_team_master = parent_team->t.t_threads[0];
2493  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2494 
2495  /* restore serialized team, if need be */
2496  if (parent_team->t.t_serialized &&
2497  parent_team != master_th->th.th_serial_team &&
2498  parent_team != root->r.r_root_team) {
2499  __kmp_free_team(root,
2500  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2501  master_th->th.th_serial_team = parent_team;
2502  }
2503 
2504  if (__kmp_tasking_mode != tskm_immediate_exec) {
2505  if (master_th->th.th_task_state_top >
2506  0) { // Restore task state from memo stack
2507  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2508  // Remember master's state if we re-use this nested hot team
2509  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2510  master_th->th.th_task_state;
2511  --master_th->th.th_task_state_top; // pop
2512  // Now restore state at this level
2513  master_th->th.th_task_state =
2514  master_th->th
2515  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2516  }
2517  // Copy the task team from the parent team to the master thread
2518  master_th->th.th_task_team =
2519  parent_team->t.t_task_team[master_th->th.th_task_state];
2520  KA_TRACE(20,
2521  ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2522  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2523  parent_team));
2524  }
2525 
2526  // TODO: GEH - cannot do this assertion because root thread not set up as
2527  // executing
2528  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2529  master_th->th.th_current_task->td_flags.executing = 1;
2530 
2531  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2532 
2533 #if OMPT_SUPPORT
2534  if (ompt_enabled) {
2535  __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2536  }
2537 #endif
2538 
2539  KMP_MB();
2540  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2541 }
2542 
2543 /* Check whether we should push an internal control record onto the
2544  serial team stack. If so, do it. */
2545 void __kmp_save_internal_controls(kmp_info_t *thread) {
2546 
2547  if (thread->th.th_team != thread->th.th_serial_team) {
2548  return;
2549  }
2550  if (thread->th.th_team->t.t_serialized > 1) {
2551  int push = 0;
2552 
2553  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2554  push = 1;
2555  } else {
2556  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2557  thread->th.th_team->t.t_serialized) {
2558  push = 1;
2559  }
2560  }
2561  if (push) { /* push a record on the serial team's stack */
2562  kmp_internal_control_t *control =
2563  (kmp_internal_control_t *)__kmp_allocate(
2564  sizeof(kmp_internal_control_t));
2565 
2566  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2567 
2568  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2569 
2570  control->next = thread->th.th_team->t.t_control_stack_top;
2571  thread->th.th_team->t.t_control_stack_top = control;
2572  }
2573  }
2574 }
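
// Illustrative sketch (not part of the runtime): __kmp_save_internal_controls()
// above pushes a snapshot of the current ICVs onto the serial team's control
// stack at most once per serialized nesting level. A stripped-down version of
// that "push only if the top record belongs to a different nesting level"
// logic, using hypothetical names (control_rec_t, save_controls) and a single
// representative ICV, is sketched below; guarded out of compilation.
#if 0
#include <stdlib.h>

typedef struct control_rec {
  int serial_nesting_level; // which nesting level this snapshot belongs to
  int nproc;                // one representative ICV; the real record holds many
  struct control_rec *next; // next (outer) record on the stack
} control_rec_t;

static void save_controls(control_rec_t **stack_top, int serialized_level,
                          int current_nproc) {
  if (serialized_level <= 1)
    return; // only nested serialized regions need a record
  if (*stack_top == NULL ||
      (*stack_top)->serial_nesting_level != serialized_level) {
    control_rec_t *rec = (control_rec_t *)malloc(sizeof(*rec));
    rec->serial_nesting_level = serialized_level;
    rec->nproc = current_nproc;
    rec->next = *stack_top; // push on top of the stack
    *stack_top = rec;
  }
}
#endif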
2575 
2576 /* Changes set_nproc */
2577 void __kmp_set_num_threads(int new_nth, int gtid) {
2578  kmp_info_t *thread;
2579  kmp_root_t *root;
2580 
2581  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2582  KMP_DEBUG_ASSERT(__kmp_init_serial);
2583 
2584  if (new_nth < 1)
2585  new_nth = 1;
2586  else if (new_nth > __kmp_max_nth)
2587  new_nth = __kmp_max_nth;
2588 
2589  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2590  thread = __kmp_threads[gtid];
2591 
2592  __kmp_save_internal_controls(thread);
2593 
2594  set__nproc(thread, new_nth);
2595 
2596  // If this omp_set_num_threads() call will cause the hot team size to be
2597  // reduced (in the absence of a num_threads clause), then reduce it now,
2598  // rather than waiting for the next parallel region.
2599  root = thread->th.th_root;
2600  if (__kmp_init_parallel && (!root->r.r_active) &&
2601  (root->r.r_hot_team->t.t_nproc > new_nth)
2602 #if KMP_NESTED_HOT_TEAMS
2603  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2604 #endif
2605  ) {
2606  kmp_team_t *hot_team = root->r.r_hot_team;
2607  int f;
2608 
2609  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2610 
2611  // Release the extra threads we don't need any more.
2612  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2613  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2614  if (__kmp_tasking_mode != tskm_immediate_exec) {
2615  // When decreasing the team size, threads no longer in the team should
2616  // unreference the task team.
2617  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2618  }
2619  __kmp_free_thread(hot_team->t.t_threads[f]);
2620  hot_team->t.t_threads[f] = NULL;
2621  }
2622  hot_team->t.t_nproc = new_nth;
2623 #if KMP_NESTED_HOT_TEAMS
2624  if (thread->th.th_hot_teams) {
2625  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2626  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2627  }
2628 #endif
2629 
2630  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2631 
2632  // Update the th_team_nproc field in the threads that are still active.
2633  for (f = 0; f < new_nth; f++) {
2634  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2635  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2636  }
2637  // Special flag to indicate an omp_set_num_threads() call
2638  hot_team->t.t_size_changed = -1;
2639  }
2640 }
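
// Illustrative usage example (not part of the runtime): __kmp_set_num_threads()
// above backs omp_set_num_threads(); the value is clamped to [1, __kmp_max_nth]
// and, when the root is idle, the hot team may be shrunk immediately. A minimal
// standalone program exercising the user-visible behavior is shown below
// (guarded out of compilation; requires an OpenMP-enabled compiler).
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  omp_set_num_threads(4); // sets the nproc ICV for subsequent parallel regions
#pragma omp parallel
  {
#pragma omp master
    printf("team size: %d\n", omp_get_num_threads()); // typically prints 4
  }

  omp_set_num_threads(2); // a smaller value may shrink the hot team right away
#pragma omp parallel
  {
#pragma omp master
    printf("team size: %d\n", omp_get_num_threads()); // typically prints 2
  }
  return 0;
}
#endif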
2641 
2642 /* Changes max_active_levels */
2643 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2644  kmp_info_t *thread;
2645 
2646  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2647  "%d = (%d)\n",
2648  gtid, max_active_levels));
2649  KMP_DEBUG_ASSERT(__kmp_init_serial);
2650 
2651  // validate max_active_levels
2652  if (max_active_levels < 0) {
2653  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2654  // We ignore this call if the user has specified a negative value.
2655  // The current setting won't be changed. The last valid setting will be
2656  // used. A warning will be issued (if warnings are allowed as controlled by
2657  // the KMP_WARNINGS env var).
2658  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2659  "max_active_levels for thread %d = (%d)\n",
2660  gtid, max_active_levels));
2661  return;
2662  }
2663  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2664  // it's OK, the max_active_levels is within the valid range: [ 0;
2665  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2666  // We allow a zero value. (implementation defined behavior)
2667  } else {
2668  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2669  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2670  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2671  // Current upper limit is MAX_INT. (implementation defined behavior)
2672  // If the input exceeds the upper limit, we correct the input to be the
2673  // upper limit. (implementation defined behavior)
2674  // Actually, the flow should never get here as long as the upper limit is MAX_INT.
2675  }
2676  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2677  "max_active_levels for thread %d = (%d)\n",
2678  gtid, max_active_levels));
2679 
2680  thread = __kmp_threads[gtid];
2681 
2682  __kmp_save_internal_controls(thread);
2683 
2684  set__max_active_levels(thread, max_active_levels);
2685 }
2686 
2687 /* Gets max_active_levels */
2688 int __kmp_get_max_active_levels(int gtid) {
2689  kmp_info_t *thread;
2690 
2691  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2692  KMP_DEBUG_ASSERT(__kmp_init_serial);
2693 
2694  thread = __kmp_threads[gtid];
2695  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2696  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2697  "curtask_maxaclevel=%d\n",
2698  gtid, thread->th.th_current_task,
2699  thread->th.th_current_task->td_icvs.max_active_levels));
2700  return thread->th.th_current_task->td_icvs.max_active_levels;
2701 }
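
// Illustrative usage example (not part of the runtime): the two routines above
// back omp_set_max_active_levels()/omp_get_max_active_levels(). With the limit
// set to 1, nested parallel regions beyond the first active level run with a
// team of one thread (the nthreads == 1 path in __kmp_fork_call). Standalone
// sketch, guarded out of compilation; the exact output depends on the
// implementation and on available threads.
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  omp_set_nested(1);            // allow nested parallelism
  omp_set_max_active_levels(1); // but cap the active (parallel) nesting depth
#pragma omp parallel num_threads(2)
  {
#pragma omp parallel num_threads(2) // exceeds the cap: runs serialized
    {
#pragma omp master
      printf("inner team size = %d, active level = %d\n",
             omp_get_num_threads(), omp_get_active_level()); // typically 1, 1
    }
  }
  printf("max_active_levels = %d\n", omp_get_max_active_levels()); // 1
  return 0;
}
#endif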
2702 
2703 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2704 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2705  kmp_info_t *thread;
2706  // kmp_team_t *team;
2707 
2708  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2709  gtid, (int)kind, chunk));
2710  KMP_DEBUG_ASSERT(__kmp_init_serial);
2711 
2712  // Check if the kind parameter is valid, correct if needed.
2713  // Valid parameters should fit in one of two intervals - standard or extended:
2714  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2715  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2716  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2717  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2718  // TODO: Hint needs attention in case we change the default schedule.
2719  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2720  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2721  __kmp_msg_null);
2722  kind = kmp_sched_default;
2723  chunk = 0; // ignore chunk value in case of bad kind
2724  }
2725 
2726  thread = __kmp_threads[gtid];
2727 
2728  __kmp_save_internal_controls(thread);
2729 
2730  if (kind < kmp_sched_upper_std) {
2731  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2732  // distinguish static chunked vs. unchunked: the chunk should be invalid to
2733  // indicate an unchunked schedule (which is the default)
2734  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2735  } else {
2736  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2737  __kmp_sch_map[kind - kmp_sched_lower - 1];
2738  }
2739  } else {
2740  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2741  // kmp_sched_lower - 2 ];
2742  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2743  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2744  kmp_sched_lower - 2];
2745  }
2746  if (kind == kmp_sched_auto || chunk < 1) {
2747  // ignore parameter chunk for schedule auto
2748  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2749  } else {
2750  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2751  }
2752 }
2753 
2754 /* Gets def_sched_var ICV values */
2755 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2756  kmp_info_t *thread;
2757  enum sched_type th_type;
2758 
2759  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2760  KMP_DEBUG_ASSERT(__kmp_init_serial);
2761 
2762  thread = __kmp_threads[gtid];
2763 
2764  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2765 
2766  switch (th_type) {
2767  case kmp_sch_static:
2768  case kmp_sch_static_greedy:
2769  case kmp_sch_static_balanced:
2770  *kind = kmp_sched_static;
2771  *chunk = 0; // chunk was not set, try to show this fact via zero value
2772  return;
2773  case kmp_sch_static_chunked:
2774  *kind = kmp_sched_static;
2775  break;
2776  case kmp_sch_dynamic_chunked:
2777  *kind = kmp_sched_dynamic;
2778  break;
2779  case kmp_sch_guided_chunked:
2780  case kmp_sch_guided_iterative_chunked:
2781  case kmp_sch_guided_analytical_chunked:
2782  *kind = kmp_sched_guided;
2783  break;
2784  case kmp_sch_auto:
2785  *kind = kmp_sched_auto;
2786  break;
2787  case kmp_sch_trapezoidal:
2788  *kind = kmp_sched_trapezoidal;
2789  break;
2790 #if KMP_STATIC_STEAL_ENABLED
2791  case kmp_sch_static_steal:
2792  *kind = kmp_sched_static_steal;
2793  break;
2794 #endif
2795  default:
2796  KMP_FATAL(UnknownSchedulingType, th_type);
2797  }
2798 
2799  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2800 }
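
// Illustrative usage example (not part of the runtime): __kmp_set_schedule()
// and __kmp_get_schedule() above back omp_set_schedule()/omp_get_schedule(),
// which manipulate the run-time schedule used by schedule(runtime) loops.
// Standalone sketch, guarded out of compilation.
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  omp_sched_t kind;
  int chunk;

  omp_set_schedule(omp_sched_dynamic, 4); // run-time schedule: dynamic, chunk 4
  omp_get_schedule(&kind, &chunk);
  printf("kind=%d chunk=%d\n", (int)kind, chunk); // typically dynamic (2) and 4

  int sum = 0;
#pragma omp parallel for schedule(runtime) reduction(+ : sum)
  for (int i = 0; i < 100; ++i) // iterations handed out per the runtime schedule
    sum += i;
  printf("sum=%d\n", sum); // 4950
  return 0;
}
#endif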
2801 
2802 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2803 
2804  int ii, dd;
2805  kmp_team_t *team;
2806  kmp_info_t *thr;
2807 
2808  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2809  KMP_DEBUG_ASSERT(__kmp_init_serial);
2810 
2811  // validate level
2812  if (level == 0)
2813  return 0;
2814  if (level < 0)
2815  return -1;
2816  thr = __kmp_threads[gtid];
2817  team = thr->th.th_team;
2818  ii = team->t.t_level;
2819  if (level > ii)
2820  return -1;
2821 
2822 #if OMP_40_ENABLED
2823  if (thr->th.th_teams_microtask) {
2824  // AC: we are in teams region where multiple nested teams have same level
2825  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2826  if (level <=
2827  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2828  KMP_DEBUG_ASSERT(ii >= tlevel);
2829  // AC: As we need to pass through the teams league, we need to artificially
2830  // increase ii
2831  if (ii == tlevel) {
2832  ii += 2; // three teams have same level
2833  } else {
2834  ii++; // two teams have same level
2835  }
2836  }
2837  }
2838 #endif
2839 
2840  if (ii == level)
2841  return __kmp_tid_from_gtid(gtid);
2842 
2843  dd = team->t.t_serialized;
2844  level++;
2845  while (ii > level) {
2846  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2847  }
2848  if ((team->t.t_serialized) && (!dd)) {
2849  team = team->t.t_parent;
2850  continue;
2851  }
2852  if (ii > level) {
2853  team = team->t.t_parent;
2854  dd = team->t.t_serialized;
2855  ii--;
2856  }
2857  }
2858 
2859  return (dd > 1) ? (0) : (team->t.t_master_tid);
2860 }
2861 
2862 int __kmp_get_team_size(int gtid, int level) {
2863 
2864  int ii, dd;
2865  kmp_team_t *team;
2866  kmp_info_t *thr;
2867 
2868  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2869  KMP_DEBUG_ASSERT(__kmp_init_serial);
2870 
2871  // validate level
2872  if (level == 0)
2873  return 1;
2874  if (level < 0)
2875  return -1;
2876  thr = __kmp_threads[gtid];
2877  team = thr->th.th_team;
2878  ii = team->t.t_level;
2879  if (level > ii)
2880  return -1;
2881 
2882 #if OMP_40_ENABLED
2883  if (thr->th.th_teams_microtask) {
2884  // AC: we are in teams region where multiple nested teams have same level
2885  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2886  if (level <=
2887  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2888  KMP_DEBUG_ASSERT(ii >= tlevel);
2889  // AC: As we need to pass through the teams league, we need to artificially
2890  // increase ii
2891  if (ii == tlevel) {
2892  ii += 2; // three teams have same level
2893  } else {
2894  ii++; // two teams have same level
2895  }
2896  }
2897  }
2898 #endif
2899 
2900  while (ii > level) {
2901  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2902  }
2903  if (team->t.t_serialized && (!dd)) {
2904  team = team->t.t_parent;
2905  continue;
2906  }
2907  if (ii > level) {
2908  team = team->t.t_parent;
2909  ii--;
2910  }
2911  }
2912 
2913  return team->t.t_nproc;
2914 }
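
// Illustrative usage example (not part of the runtime): the two routines above
// back omp_get_ancestor_thread_num() and omp_get_team_size(), which walk up
// the team hierarchy while skipping serialized levels. Standalone sketch,
// guarded out of compilation; output order is nondeterministic, and the team
// sizes depend on available threads and the nesting settings.
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  omp_set_nested(1);
#pragma omp parallel num_threads(2)
#pragma omp parallel num_threads(3)
  {
#pragma omp critical
    printf("level=%d ancestor(1)=%d team_size(1)=%d team_size(2)=%d\n",
           omp_get_level(),                // nesting level of this region: 2
           omp_get_ancestor_thread_num(1), // thread id in the outer team
           omp_get_team_size(1),           // outer team size: typically 2
           omp_get_team_size(2));          // inner team size: typically 3
  }
  return 0;
}
#endif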
2915 
2916 kmp_r_sched_t __kmp_get_schedule_global() {
2917  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
2918  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2919  // independently, so the updated schedule can be obtained here.
2920 
2921  kmp_r_sched_t r_sched;
2922 
2923  // create the schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2924  // __kmp_guided. __kmp_sched should keep its original value, so that the user
2925  // can set KMP_SCHEDULE multiple times and thus have different run-time
2926  // schedules in different roots (even in OMP 2.5)
2927  if (__kmp_sched == kmp_sch_static) {
2928  r_sched.r_sched_type = __kmp_static; // replace STATIC with more detailed
2929  // schedule (balanced or greedy)
2930  } else if (__kmp_sched == kmp_sch_guided_chunked) {
2931  r_sched.r_sched_type = __kmp_guided; // replace GUIDED with more detailed
2932  // schedule (iterative or analytical)
2933  } else {
2934  r_sched.r_sched_type =
2935  __kmp_sched; // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2936  }
2937 
2938  if (__kmp_chunk < KMP_DEFAULT_CHUNK) { // __kmp_chunk may be wrong here (if it
2939  // was not ever set)
2940  r_sched.chunk = KMP_DEFAULT_CHUNK;
2941  } else {
2942  r_sched.chunk = __kmp_chunk;
2943  }
2944 
2945  return r_sched;
2946 }
2947 
2948 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
2949  at least argc number of *t_argv entries for the requested team. */
2950 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2951 
2952  KMP_DEBUG_ASSERT(team);
2953  if (!realloc || argc > team->t.t_max_argc) {
2954 
2955  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
2956  "current entries=%d\n",
2957  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
2958  /* if previously allocated heap space for args, free them */
2959  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
2960  __kmp_free((void *)team->t.t_argv);
2961 
2962  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
2963  /* use unused space in the cache line for arguments */
2964  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2965  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
2966  "argv entries\n",
2967  team->t.t_id, team->t.t_max_argc));
2968  team->t.t_argv = &team->t.t_inline_argv[0];
2969  if (__kmp_storage_map) {
2970  __kmp_print_storage_map_gtid(
2971  -1, &team->t.t_inline_argv[0],
2972  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
2973  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
2974  team->t.t_id);
2975  }
2976  } else {
2977  /* allocate space for arguments in the heap */
2978  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
2979  ? KMP_MIN_MALLOC_ARGV_ENTRIES
2980  : 2 * argc;
2981  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
2982  "argv entries\n",
2983  team->t.t_id, team->t.t_max_argc));
2984  team->t.t_argv =
2985  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
2986  if (__kmp_storage_map) {
2987  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
2988  &team->t.t_argv[team->t.t_max_argc],
2989  sizeof(void *) * team->t.t_max_argc,
2990  "team_%d.t_argv", team->t.t_id);
2991  }
2992  }
2993  }
2994 }
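
// Illustrative sketch (not part of the runtime): __kmp_alloc_argv_entries()
// above uses a small-buffer scheme: argument lists of up to
// KMP_INLINE_ARGV_ENTRIES reuse storage inside the team structure, while larger
// ones are heap-allocated with head room (a fixed minimum, or twice the
// request). A self-contained version of that decision, with hypothetical names
// and constants standing in for the KMP_* macros, is sketched below; guarded
// out of compilation.
#if 0
#include <stdlib.h>

#define INLINE_ENTRIES 10      // stand-in for KMP_INLINE_ARGV_ENTRIES
#define MIN_MALLOC_ENTRIES 100 // stand-in for KMP_MIN_MALLOC_ARGV_ENTRIES

typedef struct argv_box {
  void **argv;  // points either at inline_argv or at a heap block
  int max_argc; // capacity of argv
  void *inline_argv[INLINE_ENTRIES];
} argv_box_t;

static void reserve_argv(argv_box_t *box, int argc) {
  if (argc <= INLINE_ENTRIES) {
    // small request: reuse the in-object storage, no allocation
    box->argv = box->inline_argv;
    box->max_argc = INLINE_ENTRIES;
  } else {
    // large request: allocate with room to grow so later reallocations are rare
    box->max_argc =
        (argc <= MIN_MALLOC_ENTRIES / 2) ? MIN_MALLOC_ENTRIES : 2 * argc;
    box->argv = (void **)malloc(sizeof(void *) * box->max_argc);
  }
}
#endif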
2995 
2996 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
2997  int i;
2998  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
2999  team->t.t_threads =
3000  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3001  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3002  sizeof(dispatch_shared_info_t) * num_disp_buff);
3003  team->t.t_dispatch =
3004  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3005  team->t.t_implicit_task_taskdata =
3006  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3007  team->t.t_max_nproc = max_nth;
3008 
3009  /* setup dispatch buffers */
3010  for (i = 0; i < num_disp_buff; ++i) {
3011  team->t.t_disp_buffer[i].buffer_index = i;
3012 #if OMP_45_ENABLED
3013  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3014 #endif
3015  }
3016 }
3017 
3018 static void __kmp_free_team_arrays(kmp_team_t *team) {
3019  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3020  int i;
3021  for (i = 0; i < team->t.t_max_nproc; ++i) {
3022  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3023  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3024  team->t.t_dispatch[i].th_disp_buffer = NULL;
3025  }; // if
3026  }; // for
3027  __kmp_free(team->t.t_threads);
3028  __kmp_free(team->t.t_disp_buffer);
3029  __kmp_free(team->t.t_dispatch);
3030  __kmp_free(team->t.t_implicit_task_taskdata);
3031  team->t.t_threads = NULL;
3032  team->t.t_disp_buffer = NULL;
3033  team->t.t_dispatch = NULL;
3034  team->t.t_implicit_task_taskdata = 0;
3035 }
3036 
3037 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3038  kmp_info_t **oldThreads = team->t.t_threads;
3039 
3040  __kmp_free(team->t.t_disp_buffer);
3041  __kmp_free(team->t.t_dispatch);
3042  __kmp_free(team->t.t_implicit_task_taskdata);
3043  __kmp_allocate_team_arrays(team, max_nth);
3044 
3045  KMP_MEMCPY(team->t.t_threads, oldThreads,
3046  team->t.t_nproc * sizeof(kmp_info_t *));
3047 
3048  __kmp_free(oldThreads);
3049 }
3050 
3051 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3052 
3053  kmp_r_sched_t r_sched =
3054  __kmp_get_schedule_global(); // get current state of scheduling globals
3055 
3056 #if OMP_40_ENABLED
3057  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3058 #endif /* OMP_40_ENABLED */
3059 
3060  kmp_internal_control_t g_icvs = {
3061  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3062  (kmp_int8)__kmp_dflt_nested, // int nested; //internal control
3063  // for nested parallelism (per thread)
3064  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3065  // adjustment of threads (per thread)
3066  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3067  // whether blocktime is explicitly set
3068  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3069 #if KMP_USE_MONITOR
3070  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3071 // intervals
3072 #endif
3073  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3074  // next parallel region (per thread)
3075  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3076  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3077  // for max_active_levels
3078  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3079 // {sched,chunk} pair
3080 #if OMP_40_ENABLED
3081  __kmp_nested_proc_bind.bind_types[0],
3082  __kmp_default_device,
3083 #endif /* OMP_40_ENABLED */
3084  NULL // struct kmp_internal_control *next;
3085  };
3086 
3087  return g_icvs;
3088 }
3089 
3090 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3091 
3092  kmp_internal_control_t gx_icvs;
3093  gx_icvs.serial_nesting_level =
3094  0; // probably =team->t.t_serial like in save_inter_controls
3095  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3096  gx_icvs.next = NULL;
3097 
3098  return gx_icvs;
3099 }
3100 
3101 static void __kmp_initialize_root(kmp_root_t *root) {
3102  int f;
3103  kmp_team_t *root_team;
3104  kmp_team_t *hot_team;
3105  int hot_team_max_nth;
3106  kmp_r_sched_t r_sched =
3107  __kmp_get_schedule_global(); // get current state of scheduling globals
3108  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3109  KMP_DEBUG_ASSERT(root);
3110  KMP_ASSERT(!root->r.r_begin);
3111 
3112  /* setup the root state structure */
3113  __kmp_init_lock(&root->r.r_begin_lock);
3114  root->r.r_begin = FALSE;
3115  root->r.r_active = FALSE;
3116  root->r.r_in_parallel = 0;
3117  root->r.r_blocktime = __kmp_dflt_blocktime;
3118  root->r.r_nested = __kmp_dflt_nested;
3119 
3120  /* setup the root team for this task */
3121  /* allocate the root team structure */
3122  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3123 
3124  root_team =
3125  __kmp_allocate_team(root,
3126  1, // new_nproc
3127  1, // max_nproc
3128 #if OMPT_SUPPORT
3129  0, // root parallel id
3130 #endif
3131 #if OMP_40_ENABLED
3132  __kmp_nested_proc_bind.bind_types[0],
3133 #endif
3134  &r_icvs,
3135  0 // argc
3136  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3137  );
3138 #if USE_DEBUGGER
3139  // Non-NULL value should be assigned to make the debugger display the root
3140  // team.
3141  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3142 #endif
3143 
3144  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3145 
3146  root->r.r_root_team = root_team;
3147  root_team->t.t_control_stack_top = NULL;
3148 
3149  /* initialize root team */
3150  root_team->t.t_threads[0] = NULL;
3151  root_team->t.t_nproc = 1;
3152  root_team->t.t_serialized = 1;
3153  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3154  root_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3155  root_team->t.t_sched.chunk = r_sched.chunk;
3156  KA_TRACE(
3157  20,
3158  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3159  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3160 
3161  /* setup the hot team for this task */
3162  /* allocate the hot team structure */
3163  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3164 
3165  hot_team =
3166  __kmp_allocate_team(root,
3167  1, // new_nproc
3168  __kmp_dflt_team_nth_ub * 2, // max_nproc
3169 #if OMPT_SUPPORT
3170  0, // root parallel id
3171 #endif
3172 #if OMP_40_ENABLED
3173  __kmp_nested_proc_bind.bind_types[0],
3174 #endif
3175  &r_icvs,
3176  0 // argc
3177  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3178  );
3179  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3180 
3181  root->r.r_hot_team = hot_team;
3182  root_team->t.t_control_stack_top = NULL;
3183 
3184  /* first-time initialization */
3185  hot_team->t.t_parent = root_team;
3186 
3187  /* initialize hot team */
3188  hot_team_max_nth = hot_team->t.t_max_nproc;
3189  for (f = 0; f < hot_team_max_nth; ++f) {
3190  hot_team->t.t_threads[f] = NULL;
3191  }; // for
3192  hot_team->t.t_nproc = 1;
3193  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3194  hot_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3195  hot_team->t.t_sched.chunk = r_sched.chunk;
3196  hot_team->t.t_size_changed = 0;
3197 }
3198 
3199 #ifdef KMP_DEBUG
3200 
3201 typedef struct kmp_team_list_item {
3202  kmp_team_p const *entry;
3203  struct kmp_team_list_item *next;
3204 } kmp_team_list_item_t;
3205 typedef kmp_team_list_item_t *kmp_team_list_t;
3206 
3207 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3208  kmp_team_list_t list, // List of teams.
3209  kmp_team_p const *team // Team to add.
3210  ) {
3211 
3212  // List must terminate with item where both entry and next are NULL.
3213  // Team is added to the list only once.
3214  // List is sorted in ascending order by team id.
3215  // Team id is *not* a key.
3216 
3217  kmp_team_list_t l;
3218 
3219  KMP_DEBUG_ASSERT(list != NULL);
3220  if (team == NULL) {
3221  return;
3222  }; // if
3223 
3224  __kmp_print_structure_team_accum(list, team->t.t_parent);
3225  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3226 
3227  // Search list for the team.
3228  l = list;
3229  while (l->next != NULL && l->entry != team) {
3230  l = l->next;
3231  }; // while
3232  if (l->next != NULL) {
3233  return; // Team has been added before, exit.
3234  }; // if
3235 
3236  // Team is not found. Search list again for insertion point.
3237  l = list;
3238  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3239  l = l->next;
3240  }; // while
3241 
3242  // Insert team.
3243  {
3244  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3245  sizeof(kmp_team_list_item_t));
3246  *item = *l;
3247  l->entry = team;
3248  l->next = item;
3249  }
3250 }
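// A minimal, self-contained sketch of the list discipline used by
// __kmp_print_structure_team_accum above, kept out of the build on purpose.
// The list always ends with a sentinel item whose entry and next are both
// "empty"; insertion keeps ascending order by id and skips duplicates. The
// demo_* names below are hypothetical and exist only for illustration.
#if 0
#include <cstdlib>

typedef struct demo_item {
  int entry;              // 0 plays the role of the sentinel's NULL entry
  struct demo_item *next; // NULL only in the sentinel item
} demo_item_t;

static void demo_insert_sorted(demo_item_t *list, int id /* id > 0 */) {
  // First pass: stop at the sentinel or at a matching id.
  demo_item_t *l = list;
  while (l->next != NULL && l->entry != id)
    l = l->next;
  if (l->next != NULL)
    return; // id already present -- added only once
  // Second pass: find the insertion point (first item with a larger id).
  l = list;
  while (l->next != NULL && l->entry <= id)
    l = l->next;
  // Insert in place: copy the current item forward, then overwrite it,
  // mirroring the "*item = *l" trick in the runtime code above.
  demo_item_t *item = (demo_item_t *)malloc(sizeof(demo_item_t));
  *item = *l;
  l->entry = id;
  l->next = item;
}
#endif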
3251 
3252 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3253 
3254  ) {
3255  __kmp_printf("%s", title);
3256  if (team != NULL) {
3257  __kmp_printf("%2x %p\n", team->t.t_id, team);
3258  } else {
3259  __kmp_printf(" - (nil)\n");
3260  }; // if
3261 }
3262 
3263 static void __kmp_print_structure_thread(char const *title,
3264  kmp_info_p const *thread) {
3265  __kmp_printf("%s", title);
3266  if (thread != NULL) {
3267  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3268  } else {
3269  __kmp_printf(" - (nil)\n");
3270  }; // if
3271 }
3272 
3273 void __kmp_print_structure(void) {
3274 
3275  kmp_team_list_t list;
3276 
3277  // Initialize list of teams.
3278  list =
3279  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3280  list->entry = NULL;
3281  list->next = NULL;
3282 
3283  __kmp_printf("\n------------------------------\nGlobal Thread "
3284  "Table\n------------------------------\n");
3285  {
3286  int gtid;
3287  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3288  __kmp_printf("%2d", gtid);
3289  if (__kmp_threads != NULL) {
3290  __kmp_printf(" %p", __kmp_threads[gtid]);
3291  }; // if
3292  if (__kmp_root != NULL) {
3293  __kmp_printf(" %p", __kmp_root[gtid]);
3294  }; // if
3295  __kmp_printf("\n");
3296  }; // for gtid
3297  }
3298 
3299  // Print out __kmp_threads array.
3300  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3301  "----------\n");
3302  if (__kmp_threads != NULL) {
3303  int gtid;
3304  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3305  kmp_info_t const *thread = __kmp_threads[gtid];
3306  if (thread != NULL) {
3307  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3308  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3309  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3310  __kmp_print_structure_team(" Serial Team: ",
3311  thread->th.th_serial_team);
3312  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3313  __kmp_print_structure_thread(" Master: ",
3314  thread->th.th_team_master);
3315  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3316  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3317 #if OMP_40_ENABLED
3318  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3319 #endif
3320  __kmp_print_structure_thread(" Next in pool: ",
3321  thread->th.th_next_pool);
3322  __kmp_printf("\n");
3323  __kmp_print_structure_team_accum(list, thread->th.th_team);
3324  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3325  }; // if
3326  }; // for gtid
3327  } else {
3328  __kmp_printf("Threads array is not allocated.\n");
3329  }; // if
3330 
3331  // Print out __kmp_root array.
3332  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3333  "--------\n");
3334  if (__kmp_root != NULL) {
3335  int gtid;
3336  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3337  kmp_root_t const *root = __kmp_root[gtid];
3338  if (root != NULL) {
3339  __kmp_printf("GTID %2d %p:\n", gtid, root);
3340  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3341  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3342  __kmp_print_structure_thread(" Uber Thread: ",
3343  root->r.r_uber_thread);
3344  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3345  __kmp_printf(" Nested?: %2d\n", root->r.r_nested);
3346  __kmp_printf(" In Parallel: %2d\n", root->r.r_in_parallel);
3347  __kmp_printf("\n");
3348  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3349  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3350  }; // if
3351  }; // for gtid
3352  } else {
3353  __kmp_printf("Ubers array is not allocated.\n");
3354  }; // if
3355 
3356  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3357  "--------\n");
3358  while (list->next != NULL) {
3359  kmp_team_p const *team = list->entry;
3360  int i;
3361  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3362  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3363  __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid);
3364  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3365  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3366  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3367  for (i = 0; i < team->t.t_nproc; ++i) {
3368  __kmp_printf(" Thread %2d: ", i);
3369  __kmp_print_structure_thread("", team->t.t_threads[i]);
3370  }; // for i
3371  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3372  __kmp_printf("\n");
3373  list = list->next;
3374  }; // while
3375 
3376  // Print out __kmp_thread_pool and __kmp_team_pool.
3377  __kmp_printf("\n------------------------------\nPools\n----------------------"
3378  "--------\n");
3379  __kmp_print_structure_thread("Thread pool: ",
3380  CCAST(kmp_info_t *, __kmp_thread_pool));
3381  __kmp_print_structure_team("Team pool: ",
3382  CCAST(kmp_team_t *, __kmp_team_pool));
3383  __kmp_printf("\n");
3384 
3385  // Free team list.
3386  while (list != NULL) {
3387  kmp_team_list_item_t *item = list;
3388  list = list->next;
3389  KMP_INTERNAL_FREE(item);
3390  }; // while
3391 }
3392 
3393 #endif
3394 
3395 //---------------------------------------------------------------------------
3396 // Stuff for per-thread fast random number generator
3397 // Table of primes
3398 static const unsigned __kmp_primes[] = {
3399  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3400  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3401  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3402  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3403  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3404  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3405  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3406  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3407  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3408  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3409  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3410 
3411 //---------------------------------------------------------------------------
3412 // __kmp_get_random: Get a random number using a linear congruential method.
3413 unsigned short __kmp_get_random(kmp_info_t *thread) {
3414  unsigned x = thread->th.th_x;
3415  unsigned short r = x >> 16;
3416 
3417  thread->th.th_x = x * thread->th.th_a + 1;
3418 
3419  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3420  thread->th.th_info.ds.ds_tid, r));
3421 
3422  return r;
3423 }
3424 //--------------------------------------------------------
3425 // __kmp_init_random: Initialize a random number generator
3426 void __kmp_init_random(kmp_info_t *thread) {
3427  unsigned seed = thread->th.th_info.ds.ds_tid;
3428 
3429  thread->th.th_a =
3430  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3431  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3432  KA_TRACE(30,
3433  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3434 }
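// A minimal, self-contained sketch (excluded from the build) of the
// per-thread generator above: the state advances as x = x * a + 1, taken
// modulo 2^32 via unsigned wrap-around where "unsigned" is 32 bits, and each
// call returns the high 16 bits, which cycle with a much longer period than
// the low bits of an LCG. The fast_rng/rng_* names are hypothetical.
#if 0
struct fast_rng {
  unsigned a; // per-thread multiplier, picked from a table of primes by tid
  unsigned x; // current state
};

static void rng_init(fast_rng *r, unsigned tid, const unsigned *primes,
                     unsigned nprimes) {
  r->a = primes[tid % nprimes];
  r->x = (tid + 1) * r->a + 1;
}

static unsigned short rng_next(fast_rng *r) {
  unsigned short result = (unsigned short)(r->x >> 16); // take the high bits
  r->x = r->x * r->a + 1; // linear congruential step
  return result;
}
#endif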
3435 
3436 #if KMP_OS_WINDOWS
3437 /* reclaim array entries for root threads that are already dead, returns number
3438  * reclaimed */
3439 static int __kmp_reclaim_dead_roots(void) {
3440  int i, r = 0;
3441 
3442  for (i = 0; i < __kmp_threads_capacity; ++i) {
3443  if (KMP_UBER_GTID(i) &&
3444  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3445  !__kmp_root[i]
3446        ->r.r_active) { // AC: only reclaim roots that died while non-active
3447  r += __kmp_unregister_root_other_thread(i);
3448  }
3449  }
3450  return r;
3451 }
3452 #endif
3453 
3454 /* This function attempts to create free entries in __kmp_threads and
3455  __kmp_root, and returns the number of free entries generated.
3456 
3457  For Windows* OS static library, the first mechanism used is to reclaim array
3458  entries for root threads that are already dead.
3459 
3460    On all platforms, expansion is attempted on the arrays __kmp_threads and
3461  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3462  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3463  threadprivate cache array has been created. Synchronization with
3464  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3465 
3466  After any dead root reclamation, if the clipping value allows array expansion
3467  to result in the generation of a total of nWish free slots, the function does
3468  that expansion. If not, but the clipping value allows array expansion to
3469  result in the generation of a total of nNeed free slots, the function does
3470  that expansion. Otherwise, nothing is done beyond the possible initial root
3471  thread reclamation. However, if nNeed is zero, a best-effort attempt is made
3472  to fulfil nWish as far as possible, i.e. the function will attempt to create
3473  as many free slots as possible up to nWish.
3474 
3475  If any argument is negative, the behavior is undefined. */
3476 static int __kmp_expand_threads(int nWish, int nNeed) {
3477  int added = 0;
3478  int old_tp_cached;
3479  int __kmp_actual_max_nth;
3480 
3481  if (nNeed > nWish) /* normalize the arguments */
3482  nWish = nNeed;
3483 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3484  /* only for Windows static library */
3485  /* reclaim array entries for root threads that are already dead */
3486  added = __kmp_reclaim_dead_roots();
3487 
3488  if (nNeed) {
3489  nNeed -= added;
3490  if (nNeed < 0)
3491  nNeed = 0;
3492  }
3493  if (nWish) {
3494  nWish -= added;
3495  if (nWish < 0)
3496  nWish = 0;
3497  }
3498 #endif
3499  if (nWish <= 0)
3500  return added;
3501 
3502  while (1) {
3503  int nTarget;
3504  int minimumRequiredCapacity;
3505  int newCapacity;
3506  kmp_info_t **newThreads;
3507  kmp_root_t **newRoot;
3508 
3509  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3510  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3511  // user via OMP_THREAD_LIMIT, then __kmp_threads_capacity may become
3512  // > __kmp_max_nth in one of two ways:
3513  //
3514  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3515  //    may not be reused by another thread, so we may need to increase
3516  // __kmp_threads_capacity to __kmp_max_threads + 1.
3517  //
3518  // 2) New foreign root(s) are encountered. We always register new foreign
3519  // roots. This may cause a smaller # of threads to be allocated at
3520  // subsequent parallel regions, but the worker threads hang around (and
3521  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3522  //
3523  // Anyway, that is the reason for moving the check to see if
3524  //    __kmp_max_threads was exceeded into __kmp_reserve_threads()
3525  // instead of having it performed here. -BB
3526  old_tp_cached = __kmp_tp_cached;
3527  __kmp_actual_max_nth =
3528  old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
3529  KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
3530 
3531  /* compute expansion headroom to check if we can expand and whether to aim
3532  for nWish or nNeed */
3533  nTarget = nWish;
3534  if (__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3535  /* can't fulfil nWish, so try nNeed */
3536  if (nNeed) {
3537  nTarget = nNeed;
3538  if (__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3539  /* possible expansion too small -- give up */
3540  break;
3541  }
3542  } else {
3543  /* best-effort */
3544  nTarget = __kmp_actual_max_nth - __kmp_threads_capacity;
3545  if (!nTarget) {
3546          /* can't expand at all -- give up */
3547  break;
3548  }
3549  }
3550  }
3551  minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
3552 
3553  newCapacity = __kmp_threads_capacity;
3554  do {
3555  newCapacity = newCapacity <= (__kmp_actual_max_nth >> 1)
3556  ? (newCapacity << 1)
3557  : __kmp_actual_max_nth;
3558  } while (newCapacity < minimumRequiredCapacity);
3559  newThreads = (kmp_info_t **)__kmp_allocate(
3560  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity +
3561  CACHE_LINE);
3562  newRoot = (kmp_root_t **)((char *)newThreads +
3563  sizeof(kmp_info_t *) * newCapacity);
3564  KMP_MEMCPY(newThreads, __kmp_threads,
3565  __kmp_threads_capacity * sizeof(kmp_info_t *));
3566  KMP_MEMCPY(newRoot, __kmp_root,
3567  __kmp_threads_capacity * sizeof(kmp_root_t *));
3568  memset(newThreads + __kmp_threads_capacity, 0,
3569  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t *));
3570  memset(newRoot + __kmp_threads_capacity, 0,
3571  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t *));
3572 
3573  if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3574  /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has
3575  allocated a threadprivate cache while we were allocating the expanded
3576  array, and our new capacity is larger than the threadprivate cache
3577  capacity, so we should deallocate the expanded arrays and try again.
3578  This is the first check of a double-check pair. */
3579  __kmp_free(newThreads);
3580  continue; /* start over and try again */
3581  }
3582  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3583  if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3584  /* Same check as above, but this time with the lock so we can be sure if
3585  we can succeed. */
3586  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3587  __kmp_free(newThreads);
3588  continue; /* start over and try again */
3589  } else {
3590  /* success */
3591  // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be
3592  // investigated.
3593  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3594  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3595  added += newCapacity - __kmp_threads_capacity;
3596  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3597  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3598  break; /* succeeded, so we can exit the loop */
3599  }
3600  }
3601  return added;
3602 }
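// A minimal sketch (excluded from the build) of the capacity growth rule in
// __kmp_expand_threads above, in isolation: capacity doubles until it covers
// the required minimum, but is clipped to the effective maximum
// (__kmp_tp_capacity once a threadprivate cache exists, __kmp_sys_max_nth
// otherwise). The caller must guarantee required <= actual_max, as the
// nTarget headroom check above does; grow_capacity is a hypothetical name.
#if 0
static int grow_capacity(int current, int required, int actual_max) {
  int new_capacity = current;
  do {
    new_capacity = (new_capacity <= (actual_max >> 1)) ? (new_capacity << 1)
                                                       : actual_max;
  } while (new_capacity < required);
  return new_capacity;
}
// Examples: grow_capacity(32, 100, 512) == 128 (plain doubling),
//           grow_capacity(32, 100, 120) == 120 (clipped to the maximum).
#endif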
3603 
3604 /* Register the current thread as a root thread and obtain our gtid. We must
3605    have the __kmp_initz_lock held at this point. Argument TRUE only if we are
3606    the thread that calls this from __kmp_do_serial_initialize() */
3607 int __kmp_register_root(int initial_thread) {
3608  kmp_info_t *root_thread;
3609  kmp_root_t *root;
3610  int gtid;
3611  int capacity;
3612  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3613  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3614  KMP_MB();
3615 
3616  /* 2007-03-02:
3617    If the initial thread has not invoked the OpenMP RTL yet, and this thread is
3618    not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition
3619    does not work as expected -- it may return false (meaning there is at least
3620    one empty slot in the __kmp_threads array), but it is possible that the only
3621    free slot is #0, which is reserved for the initial thread and so cannot be
3622    used for this one. The following code works around this bug.
3623 
3624    However, the right solution seems to be not reserving slot #0 for the
3625    initial thread because:
3626    (1) there is no magic in slot #0,
3627    (2) we cannot detect the initial thread reliably (the first thread that
3628    does serial initialization may not be the real initial thread).
3629  */
3630  capacity = __kmp_threads_capacity;
3631  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3632  --capacity;
3633  }; // if
3634 
3635  /* see if there are too many threads */
3636  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1, 1)) {
3637  if (__kmp_tp_cached) {
3638  __kmp_msg(kmp_ms_fatal, KMP_MSG(CantRegisterNewThread),
3639  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3640  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3641  } else {
3642  __kmp_msg(kmp_ms_fatal, KMP_MSG(CantRegisterNewThread),
3643  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
3644  }
3645  }; // if
3646 
3647  /* find an available thread slot */
3648  /* Don't reassign the zero slot since we need that to only be used by initial
3649  thread */
3650  for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3651  gtid++)
3652  ;
3653  KA_TRACE(1,
3654  ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3655  KMP_ASSERT(gtid < __kmp_threads_capacity);
3656 
3657  /* update global accounting */
3658  __kmp_all_nth++;
3659  TCW_4(__kmp_nth, __kmp_nth + 1);
3660 
3661  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3662  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3663  if (__kmp_adjust_gtid_mode) {
3664  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3665  if (TCR_4(__kmp_gtid_mode) != 2) {
3666  TCW_4(__kmp_gtid_mode, 2);
3667  }
3668  } else {
3669  if (TCR_4(__kmp_gtid_mode) != 1) {
3670  TCW_4(__kmp_gtid_mode, 1);
3671  }
3672  }
3673  }
3674 
3675 #ifdef KMP_ADJUST_BLOCKTIME
3676  /* Adjust blocktime to zero if necessary */
3677  /* Middle initialization might not have occurred yet */
3678  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3679  if (__kmp_nth > __kmp_avail_proc) {
3680  __kmp_zero_bt = TRUE;
3681  }
3682  }
3683 #endif /* KMP_ADJUST_BLOCKTIME */
3684 
3685  /* setup this new hierarchy */
3686  if (!(root = __kmp_root[gtid])) {
3687  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3688  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3689  }
3690 
3691 #if KMP_STATS_ENABLED
3692  // Initialize stats as soon as possible (right after gtid assignment).
3693  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3694  KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
3695  KMP_SET_THREAD_STATE(SERIAL_REGION);
3696  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3697 #endif
3698  __kmp_initialize_root(root);
3699 
3700  /* setup new root thread structure */
3701  if (root->r.r_uber_thread) {
3702  root_thread = root->r.r_uber_thread;
3703  } else {
3704  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3705  if (__kmp_storage_map) {
3706  __kmp_print_thread_storage_map(root_thread, gtid);
3707  }
3708  root_thread->th.th_info.ds.ds_gtid = gtid;
3709  root_thread->th.th_root = root;
3710  if (__kmp_env_consistency_check) {
3711  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3712  }
3713 #if USE_FAST_MEMORY
3714  __kmp_initialize_fast_memory(root_thread);
3715 #endif /* USE_FAST_MEMORY */
3716 
3717 #if KMP_USE_BGET
3718  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3719  __kmp_initialize_bget(root_thread);
3720 #endif
3721  __kmp_init_random(root_thread); // Initialize random number generator
3722  }
3723 
3724  /* setup the serial team held in reserve by the root thread */
3725  if (!root_thread->th.th_serial_team) {
3726  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3727  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3728  root_thread->th.th_serial_team =
3729  __kmp_allocate_team(root, 1, 1,
3730 #if OMPT_SUPPORT
3731  0, // root parallel id
3732 #endif
3733 #if OMP_40_ENABLED
3734  proc_bind_default,
3735 #endif
3736  &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3737  }
3738  KMP_ASSERT(root_thread->th.th_serial_team);
3739  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3740  root_thread->th.th_serial_team));
3741 
3742  /* drop root_thread into place */
3743  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3744 
3745  root->r.r_root_team->t.t_threads[0] = root_thread;
3746  root->r.r_hot_team->t.t_threads[0] = root_thread;
3747  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3748  // AC: the team created in reserve, not for execution (it is unused for now).
3749  root_thread->th.th_serial_team->t.t_serialized = 0;
3750  root->r.r_uber_thread = root_thread;
3751 
3752  /* initialize the thread, get it ready to go */
3753  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3754  TCW_4(__kmp_init_gtid, TRUE);
3755 
3756  /* prepare the master thread for get_gtid() */
3757  __kmp_gtid_set_specific(gtid);
3758 
3759 #if USE_ITT_BUILD
3760  __kmp_itt_thread_name(gtid);
3761 #endif /* USE_ITT_BUILD */
3762 
3763 #ifdef KMP_TDATA_GTID
3764  __kmp_gtid = gtid;
3765 #endif
3766  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3767  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3768 
3769  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3770  "plain=%u\n",
3771  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3772  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3773  KMP_INIT_BARRIER_STATE));
3774  { // Initialize barrier data.
3775  int b;
3776  for (b = 0; b < bs_last_barrier; ++b) {
3777  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3778 #if USE_DEBUGGER
3779  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3780 #endif
3781  }; // for
3782  }
3783  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3784  KMP_INIT_BARRIER_STATE);
3785 
3786 #if KMP_AFFINITY_SUPPORTED
3787 #if OMP_40_ENABLED
3788  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3789  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3790  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3791  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3792 #endif
3793 
3794  if (TCR_4(__kmp_init_middle)) {
3795  __kmp_affinity_set_init_mask(gtid, TRUE);
3796  }
3797 #endif /* KMP_AFFINITY_SUPPORTED */
3798 
3799  __kmp_root_counter++;
3800 
3801  KMP_MB();
3802  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3803 
3804  return gtid;
3805 }
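// A minimal sketch (excluded from the build) of the slot-selection rule used
// by __kmp_register_root above: slot 0 is reserved for the initial thread, so
// a foreign root starts its search at 1 and must also discount a free slot 0
// when judging whether the arrays are full. find_root_slot and its parameters
// are hypothetical; the real code expands the arrays instead of failing.
#if 0
#include <cstddef>

static int find_root_slot(void **threads, int capacity, int initial_thread) {
  int usable = capacity;
  if (!initial_thread && threads[0] == NULL)
    --usable; // slot 0 may be the only free one, but we may not take it
  int taken = 0;
  for (int i = 0; i < capacity; ++i)
    if (threads[i] != NULL)
      ++taken;
  if (taken >= usable)
    return -1; // would call __kmp_expand_threads() here instead
  int gtid = initial_thread ? 0 : 1;
  while (threads[gtid] != NULL)
    ++gtid; // guaranteed to find a free, non-reserved slot
  return gtid;
}
#endif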
3806 
3807 #if KMP_NESTED_HOT_TEAMS
3808 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3809  const int max_level) {
3810  int i, n, nth;
3811  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3812  if (!hot_teams || !hot_teams[level].hot_team) {
3813  return 0;
3814  }
3815  KMP_DEBUG_ASSERT(level < max_level);
3816  kmp_team_t *team = hot_teams[level].hot_team;
3817  nth = hot_teams[level].hot_team_nth;
3818  n = nth - 1; // master is not freed
3819  if (level < max_level - 1) {
3820  for (i = 0; i < nth; ++i) {
3821  kmp_info_t *th = team->t.t_threads[i];
3822  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3823  if (i > 0 && th->th.th_hot_teams) {
3824  __kmp_free(th->th.th_hot_teams);
3825  th->th.th_hot_teams = NULL;
3826  }
3827  }
3828  }
3829  __kmp_free_team(root, team, NULL);
3830  return n;
3831 }
3832 #endif
3833 
3834 // Resets a root thread and clears its root and hot teams.
3835 // Returns the number of __kmp_threads entries directly and indirectly freed.
3836 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3837  kmp_team_t *root_team = root->r.r_root_team;
3838  kmp_team_t *hot_team = root->r.r_hot_team;
3839  int n = hot_team->t.t_nproc;
3840  int i;
3841 
3842  KMP_DEBUG_ASSERT(!root->r.r_active);
3843 
3844  root->r.r_root_team = NULL;
3845  root->r.r_hot_team = NULL;
3846  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3847  // before call to __kmp_free_team().
3848  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3849 #if KMP_NESTED_HOT_TEAMS
3850  if (__kmp_hot_teams_max_level >
3851  0) { // need to free nested hot teams and their threads if any
3852  for (i = 0; i < hot_team->t.t_nproc; ++i) {
3853  kmp_info_t *th = hot_team->t.t_threads[i];
3854  if (__kmp_hot_teams_max_level > 1) {
3855  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3856  }
3857  if (th->th.th_hot_teams) {
3858  __kmp_free(th->th.th_hot_teams);
3859  th->th.th_hot_teams = NULL;
3860  }
3861  }
3862  }
3863 #endif
3864  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3865 
3866  // Before we can reap the thread, we need to make certain that all other
3867  // threads in the teams that had this root as ancestor have stopped trying to
3868  // steal tasks.
3869  if (__kmp_tasking_mode != tskm_immediate_exec) {
3870  __kmp_wait_to_unref_task_teams();
3871  }
3872 
3873 #if KMP_OS_WINDOWS
3874  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3875  KA_TRACE(
3876  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3877  "\n",
3878  (LPVOID) & (root->r.r_uber_thread->th),
3879  root->r.r_uber_thread->th.th_info.ds.ds_thread));
3880  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3881 #endif /* KMP_OS_WINDOWS */
3882 
3883 #if OMPT_SUPPORT
3884  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
3885  int gtid = __kmp_get_gtid();
3886  __ompt_thread_end(ompt_thread_initial, gtid);
3887  }
3888 #endif
3889 
3890  TCW_4(__kmp_nth,
3891  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3892  __kmp_reap_thread(root->r.r_uber_thread, 1);
3893 
3894  // We cannot put the root thread into __kmp_thread_pool, so we have to reap it
3895  // instead of freeing it.
3896  root->r.r_uber_thread = NULL;
3897  /* mark root as no longer in use */
3898  root->r.r_begin = FALSE;
3899 
3900  return n;
3901 }
3902 
3903 void __kmp_unregister_root_current_thread(int gtid) {
3904  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3905  /* this lock should be ok, since unregister_root_current_thread is never
3906  called during an abort, only during a normal close. furthermore, if you
3907  have the forkjoin lock, you should never try to get the initz lock */
3908  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3909  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3910  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3911  "exiting T#%d\n",
3912  gtid));
3913  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3914  return;
3915  }
3916  kmp_root_t *root = __kmp_root[gtid];
3917 
3918  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3919  KMP_ASSERT(KMP_UBER_GTID(gtid));
3920  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3921  KMP_ASSERT(root->r.r_active == FALSE);
3922 
3923  KMP_MB();
3924 
3925 #if OMP_45_ENABLED
3926  kmp_info_t *thread = __kmp_threads[gtid];
3927  kmp_team_t *team = thread->th.th_team;
3928  kmp_task_team_t *task_team = thread->th.th_task_team;
3929 
3930  // we need to wait for the proxy tasks before finishing the thread
3931  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3932 #if OMPT_SUPPORT
3933  // the runtime is shutting down so we won't report any events
3934  thread->th.ompt_thread_info.state = ompt_state_undefined;
3935 #endif
3936  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3937  }
3938 #endif
3939 
3940  __kmp_reset_root(gtid, root);
3941 
3942  /* free up this thread slot */
3943  __kmp_gtid_set_specific(KMP_GTID_DNE);
3944 #ifdef KMP_TDATA_GTID
3945  __kmp_gtid = KMP_GTID_DNE;
3946 #endif
3947 
3948  KMP_MB();
3949  KC_TRACE(10,
3950  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3951 
3952  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3953 }
3954 
3955 #if KMP_OS_WINDOWS
3956 /* __kmp_forkjoin_lock must be already held
3957  Unregisters a root thread that is not the current thread. Returns the number
3958  of __kmp_threads entries freed as a result. */
3959 static int __kmp_unregister_root_other_thread(int gtid) {
3960  kmp_root_t *root = __kmp_root[gtid];
3961  int r;
3962 
3963  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
3964  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3965  KMP_ASSERT(KMP_UBER_GTID(gtid));
3966  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3967  KMP_ASSERT(root->r.r_active == FALSE);
3968 
3969  r = __kmp_reset_root(gtid, root);
3970  KC_TRACE(10,
3971  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
3972  return r;
3973 }
3974 #endif
3975 
3976 #if KMP_DEBUG
3977 void __kmp_task_info() {
3978 
3979  kmp_int32 gtid = __kmp_entry_gtid();
3980  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
3981  kmp_info_t *this_thr = __kmp_threads[gtid];
3982  kmp_team_t *steam = this_thr->th.th_serial_team;
3983  kmp_team_t *team = this_thr->th.th_team;
3984 
3985  __kmp_printf("__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p "
3986  "ptask=%p\n",
3987  gtid, tid, this_thr, team, this_thr->th.th_current_task,
3988  team->t.t_implicit_task_taskdata[tid].td_parent);
3989 }
3990 #endif // KMP_DEBUG
3991 
3992 /* TODO optimize with one big memclr, take out what isn't needed, split
3993  responsibility to workers as much as possible, and delay initialization of
3994  features as much as possible */
3995 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
3996  int tid, int gtid) {
3997  /* this_thr->th.th_info.ds.ds_gtid is setup in
3998  kmp_allocate_thread/create_worker.
3999  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4000  kmp_info_t *master = team->t.t_threads[0];
4001  KMP_DEBUG_ASSERT(this_thr != NULL);
4002  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4003  KMP_DEBUG_ASSERT(team);
4004  KMP_DEBUG_ASSERT(team->t.t_threads);
4005  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4006  KMP_DEBUG_ASSERT(master);
4007  KMP_DEBUG_ASSERT(master->th.th_root);
4008 
4009  KMP_MB();
4010 
4011  TCW_SYNC_PTR(this_thr->th.th_team, team);
4012 
4013  this_thr->th.th_info.ds.ds_tid = tid;
4014  this_thr->th.th_set_nproc = 0;
4015  if (__kmp_tasking_mode != tskm_immediate_exec)
4016  // When tasking is possible, threads are not safe to reap until they are
4017  // done tasking; this will be set when tasking code is exited in wait
4018  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4019  else // no tasking --> always safe to reap
4020  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4021 #if OMP_40_ENABLED
4022  this_thr->th.th_set_proc_bind = proc_bind_default;
4023 #if KMP_AFFINITY_SUPPORTED
4024  this_thr->th.th_new_place = this_thr->th.th_current_place;
4025 #endif
4026 #endif
4027  this_thr->th.th_root = master->th.th_root;
4028 
4029  /* setup the thread's cache of the team structure */
4030  this_thr->th.th_team_nproc = team->t.t_nproc;
4031  this_thr->th.th_team_master = master;
4032  this_thr->th.th_team_serialized = team->t.t_serialized;
4033  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4034 
4035  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4036 
4037  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4038  tid, gtid, this_thr, this_thr->th.th_current_task));
4039 
4040  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4041  team, tid, TRUE);
4042 
4043  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4044  tid, gtid, this_thr, this_thr->th.th_current_task));
4045  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4046  // __kmp_initialize_team()?
4047 
4048  /* TODO no worksharing in speculative threads */
4049  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4050 
4051  this_thr->th.th_local.this_construct = 0;
4052 
4053 #ifdef BUILD_TV
4054  this_thr->th.th_local.tv_data = 0;
4055 #endif
4056 
4057  if (!this_thr->th.th_pri_common) {
4058  this_thr->th.th_pri_common =
4059  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4060  if (__kmp_storage_map) {
4061  __kmp_print_storage_map_gtid(
4062  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4063  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4064  }; // if
4065  this_thr->th.th_pri_head = NULL;
4066  }; // if
4067 
4068  /* Initialize dynamic dispatch */
4069  {
4070  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4071  // Use team max_nproc since this will never change for the team.
4072  size_t disp_size =
4073  sizeof(dispatch_private_info_t) *
4074  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4075  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4076  team->t.t_max_nproc));
4077  KMP_ASSERT(dispatch);
4078  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4079  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4080 
4081  dispatch->th_disp_index = 0;
4082 #if OMP_45_ENABLED
4083  dispatch->th_doacross_buf_idx = 0;
4084 #endif
4085  if (!dispatch->th_disp_buffer) {
4086  dispatch->th_disp_buffer =
4087  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4088 
4089  if (__kmp_storage_map) {
4090  __kmp_print_storage_map_gtid(
4091  gtid, &dispatch->th_disp_buffer[0],
4092  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4093  ? 1
4094  : __kmp_dispatch_num_buffers],
4095  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4096  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4097  gtid, team->t.t_id, gtid);
4098  }
4099  } else {
4100  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4101  }
4102 
4103  dispatch->th_dispatch_pr_current = 0;
4104  dispatch->th_dispatch_sh_current = 0;
4105 
4106  dispatch->th_deo_fcn = 0; /* ORDERED */
4107  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4108  }
4109 
4110  this_thr->th.th_next_pool = NULL;
4111 
4112  if (!this_thr->th.th_task_state_memo_stack) {
4113  size_t i;
4114  this_thr->th.th_task_state_memo_stack =
4115  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4116  this_thr->th.th_task_state_top = 0;
4117  this_thr->th.th_task_state_stack_sz = 4;
4118  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4119  ++i) // zero init the stack
4120  this_thr->th.th_task_state_memo_stack[i] = 0;
4121  }
4122 
4123  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4124  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4125 
4126  KMP_MB();
4127 }
4128 
4129 /* allocate a new thread for the requesting team. this is only called from
4130  within a forkjoin critical section. we will first try to get an available
4131    thread from the thread pool. if none is available, we will fork a new one,
4132    assuming we are able to create one; this should be assured, as the
4133    caller should have checked this first. */
4134 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4135  int new_tid) {
4136  kmp_team_t *serial_team;
4137  kmp_info_t *new_thr;
4138  int new_gtid;
4139 
4140  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4141  KMP_DEBUG_ASSERT(root && team);
4142 #if !KMP_NESTED_HOT_TEAMS
4143  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4144 #endif
4145  KMP_MB();
4146 
4147  /* first, try to get one from the thread pool */
4148  if (__kmp_thread_pool) {
4149 
4150  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4151  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4152  if (new_thr == __kmp_thread_pool_insert_pt) {
4153  __kmp_thread_pool_insert_pt = NULL;
4154  }
4155  TCW_4(new_thr->th.th_in_pool, FALSE);
4156  // Don't touch th_active_in_pool or th_active.
4157  // The worker thread adjusts those flags as it sleeps/awakens.
4158  __kmp_thread_pool_nth--;
4159 
4160  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4161  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4162  KMP_ASSERT(!new_thr->th.th_team);
4163  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4164  KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0);
4165 
4166  /* setup the thread structure */
4167  __kmp_initialize_info(new_thr, team, new_tid,
4168  new_thr->th.th_info.ds.ds_gtid);
4169  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4170 
4171  TCW_4(__kmp_nth, __kmp_nth + 1);
4172 
4173  new_thr->th.th_task_state = 0;
4174  new_thr->th.th_task_state_top = 0;
4175  new_thr->th.th_task_state_stack_sz = 4;
4176 
4177 #ifdef KMP_ADJUST_BLOCKTIME
4178  /* Adjust blocktime back to zero if necessary */
4179  /* Middle initialization might not have occurred yet */
4180  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4181  if (__kmp_nth > __kmp_avail_proc) {
4182  __kmp_zero_bt = TRUE;
4183  }
4184  }
4185 #endif /* KMP_ADJUST_BLOCKTIME */
4186 
4187 #if KMP_DEBUG
4188  // If thread entered pool via __kmp_free_thread, wait_flag should !=
4189  // KMP_BARRIER_PARENT_FLAG.
4190  int b;
4191  kmp_balign_t *balign = new_thr->th.th_bar;
4192  for (b = 0; b < bs_last_barrier; ++b)
4193  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4194 #endif
4195 
4196  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4197  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4198 
4199  KMP_MB();
4200  return new_thr;
4201  }
4202 
4203  /* no thread available from the pool, so fork a new one */
4204  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4205  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4206 
4207 #if KMP_USE_MONITOR
4208  // If this is the first worker thread the RTL is creating, then also
4209  // launch the monitor thread. We try to do this as early as possible.
4210  if (!TCR_4(__kmp_init_monitor)) {
4211  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4212  if (!TCR_4(__kmp_init_monitor)) {
4213  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4214  TCW_4(__kmp_init_monitor, 1);
4215  __kmp_create_monitor(&__kmp_monitor);
4216  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4217 #if KMP_OS_WINDOWS
4218  // AC: wait until monitor has started. This is a fix for CQ232808.
4219  // The reason is that if the library is loaded/unloaded in a loop with
4220      // small (parallel) work in between, then there is a high probability that
4221      // the monitor thread will not start until after the library has shut down.
4222      // At shutdown it is too late to cope with the problem, because when the
4223      // master is in DllMain (process detach) the monitor has no chance to start
4224      // (it is blocked), and the master has no means to inform the monitor that
4225      // the library has gone, because all the memory which the monitor can
4226      // access is going to be released/reset.
4227  while (TCR_4(__kmp_init_monitor) < 2) {
4228  KMP_YIELD(TRUE);
4229  }
4230  KF_TRACE(10, ("after monitor thread has started\n"));
4231 #endif
4232  }
4233  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4234  }
4235 #endif
4236 
4237  KMP_MB();
4238  for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4239  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4240  }
4241 
4242  /* allocate space for it. */
4243  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4244 
4245  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4246 
4247  if (__kmp_storage_map) {
4248  __kmp_print_thread_storage_map(new_thr, new_gtid);
4249  }
4250 
4251  // add the reserve serialized team, initialized from the team's master thread
4252  {
4253  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4254  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4255  new_thr->th.th_serial_team = serial_team =
4256  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4257 #if OMPT_SUPPORT
4258  0, // root parallel id
4259 #endif
4260 #if OMP_40_ENABLED
4261  proc_bind_default,
4262 #endif
4263  &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4264  }
4265  KMP_ASSERT(serial_team);
4266  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4267  // execution (it is unused for now).
4268  serial_team->t.t_threads[0] = new_thr;
4269  KF_TRACE(10,
4270  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4271  new_thr));
4272 
4273  /* setup the thread structures */
4274  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4275 
4276 #if USE_FAST_MEMORY
4277  __kmp_initialize_fast_memory(new_thr);
4278 #endif /* USE_FAST_MEMORY */
4279 
4280 #if KMP_USE_BGET
4281  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4282  __kmp_initialize_bget(new_thr);
4283 #endif
4284 
4285  __kmp_init_random(new_thr); // Initialize random number generator
4286 
4287  /* Initialize these only once when thread is grabbed for a team allocation */
4288  KA_TRACE(20,
4289  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4290  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4291 
4292  int b;
4293  kmp_balign_t *balign = new_thr->th.th_bar;
4294  for (b = 0; b < bs_last_barrier; ++b) {
4295  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4296  balign[b].bb.team = NULL;
4297  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4298  balign[b].bb.use_oncore_barrier = 0;
4299  }
4300 
4301  new_thr->th.th_spin_here = FALSE;
4302  new_thr->th.th_next_waiting = 0;
4303 
4304 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4305  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4306  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4307  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4308  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4309 #endif
4310 
4311  TCW_4(new_thr->th.th_in_pool, FALSE);
4312  new_thr->th.th_active_in_pool = FALSE;
4313  TCW_4(new_thr->th.th_active, TRUE);
4314 
4315  /* adjust the global counters */
4316  __kmp_all_nth++;
4317  __kmp_nth++;
4318 
4319  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4320  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4321  if (__kmp_adjust_gtid_mode) {
4322  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4323  if (TCR_4(__kmp_gtid_mode) != 2) {
4324  TCW_4(__kmp_gtid_mode, 2);
4325  }
4326  } else {
4327  if (TCR_4(__kmp_gtid_mode) != 1) {
4328  TCW_4(__kmp_gtid_mode, 1);
4329  }
4330  }
4331  }
4332 
4333 #ifdef KMP_ADJUST_BLOCKTIME
4334  /* Adjust blocktime back to zero if necessary */
4335  /* Middle initialization might not have occurred yet */
4336  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4337  if (__kmp_nth > __kmp_avail_proc) {
4338  __kmp_zero_bt = TRUE;
4339  }
4340  }
4341 #endif /* KMP_ADJUST_BLOCKTIME */
4342 
4343  /* actually fork it and create the new worker thread */
4344  KF_TRACE(
4345  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4346  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4347  KF_TRACE(10,
4348  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4349 
4350  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4351  new_gtid));
4352  KMP_MB();
4353  return new_thr;
4354 }
4355 
4356 /* Reinitialize team for reuse.
4357    The hot team code calls this routine at every fork barrier, so EPCC barrier
4358    tests are extremely sensitive to changes in it, esp. writes to the team
4359  struct, which cause a cache invalidation in all threads.
4360  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4361 static void __kmp_reinitialize_team(kmp_team_t *team,
4362  kmp_internal_control_t *new_icvs,
4363  ident_t *loc) {
4364  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4365  team->t.t_threads[0], team));
4366  KMP_DEBUG_ASSERT(team && new_icvs);
4367  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4368  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4369 
4370  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4371  // Copy ICVs to the master thread's implicit taskdata
4372  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4373  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4374 
4375  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4376  team->t.t_threads[0], team));
4377 }
4378 
4379 /* Initialize the team data structure.
4380  This assumes the t_threads and t_max_nproc are already set.
4381  Also, we don't touch the arguments */
4382 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4383  kmp_internal_control_t *new_icvs,
4384  ident_t *loc) {
4385  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4386 
4387  /* verify */
4388  KMP_DEBUG_ASSERT(team);
4389  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4390  KMP_DEBUG_ASSERT(team->t.t_threads);
4391  KMP_MB();
4392 
4393  team->t.t_master_tid = 0; /* not needed */
4394  /* team->t.t_master_bar; not needed */
4395  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4396  team->t.t_nproc = new_nproc;
4397 
4398  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4399  team->t.t_next_pool = NULL;
4400  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4401  * up hot team */
4402 
4403  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4404  team->t.t_invoke = NULL; /* not needed */
4405 
4406  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4407  team->t.t_sched = new_icvs->sched;
4408 
4409 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4410  team->t.t_fp_control_saved = FALSE; /* not needed */
4411  team->t.t_x87_fpu_control_word = 0; /* not needed */
4412  team->t.t_mxcsr = 0; /* not needed */
4413 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4414 
4415  team->t.t_construct = 0;
4416  __kmp_init_lock(&team->t.t_single_lock);
4417 
4418  team->t.t_ordered.dt.t_value = 0;
4419  team->t.t_master_active = FALSE;
4420 
4421  memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));
4422 
4423 #ifdef KMP_DEBUG
4424  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4425 #endif
4426  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4427 
4428  team->t.t_control_stack_top = NULL;
4429 
4430  __kmp_reinitialize_team(team, new_icvs, loc);
4431 
4432  KMP_MB();
4433  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4434 }
4435 
4436 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4437 /* Sets full mask for thread and returns old mask, no changes to structures. */
4438 static void
4439 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4440  if (KMP_AFFINITY_CAPABLE()) {
4441  int status;
4442  if (old_mask != NULL) {
4443  status = __kmp_get_system_affinity(old_mask, TRUE);
4444  int error = errno;
4445  if (status != 0) {
4446  __kmp_msg(kmp_ms_fatal, KMP_MSG(ChangeThreadAffMaskError),
4447  KMP_ERR(error), __kmp_msg_null);
4448  }
4449  }
4450  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4451  }
4452 }
4453 #endif
4454 
4455 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4456 
4457 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4458 // It calculates the worker + master thread's partition based upon the parent
4459 // thread's partition, and binds each worker to a place in its partition.
4460 // The master thread's partition should already include its current binding.
4461 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4462  // Copy the master thread's place partition to the team struct
4463  kmp_info_t *master_th = team->t.t_threads[0];
4464  KMP_DEBUG_ASSERT(master_th != NULL);
4465  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4466  int first_place = master_th->th.th_first_place;
4467  int last_place = master_th->th.th_last_place;
4468  int masters_place = master_th->th.th_current_place;
4469  team->t.t_first_place = first_place;
4470  team->t.t_last_place = last_place;
4471 
4472  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4473  "bound to place %d partition = [%d,%d]\n",
4474  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4475  team->t.t_id, masters_place, first_place, last_place));
4476 
4477  switch (proc_bind) {
4478 
4479  case proc_bind_default:
4480  // serial teams might have the proc_bind policy set to proc_bind_default. It
4481  // doesn't matter, as we don't rebind master thread for any proc_bind policy
4482  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4483  break;
4484 
4485  case proc_bind_master: {
4486  int f;
4487  int n_th = team->t.t_nproc;
4488  for (f = 1; f < n_th; f++) {
4489  kmp_info_t *th = team->t.t_threads[f];
4490  KMP_DEBUG_ASSERT(th != NULL);
4491  th->th.th_first_place = first_place;
4492  th->th.th_last_place = last_place;
4493  th->th.th_new_place = masters_place;
4494 
4495  KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4496  "partition = [%d,%d]\n",
4497  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4498  f, masters_place, first_place, last_place));
4499  }
4500  } break;
4501 
4502  case proc_bind_close: {
4503  int f;
4504  int n_th = team->t.t_nproc;
4505  int n_places;
4506  if (first_place <= last_place) {
4507  n_places = last_place - first_place + 1;
4508  } else {
4509  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4510  }
4511  if (n_th <= n_places) {
4512  int place = masters_place;
4513  for (f = 1; f < n_th; f++) {
4514  kmp_info_t *th = team->t.t_threads[f];
4515  KMP_DEBUG_ASSERT(th != NULL);
4516 
4517  if (place == last_place) {
4518  place = first_place;
4519  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4520  place = 0;
4521  } else {
4522  place++;
4523  }
4524  th->th.th_first_place = first_place;
4525  th->th.th_last_place = last_place;
4526  th->th.th_new_place = place;
4527 
4528  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4529  "partition = [%d,%d]\n",
4530  __kmp_gtid_from_thread(team->t.t_threads[f]),
4531  team->t.t_id, f, place, first_place, last_place));
4532  }
4533  } else {
4534  int S, rem, gap, s_count;
4535  S = n_th / n_places;
4536  s_count = 0;
4537  rem = n_th - (S * n_places);
4538  gap = rem > 0 ? n_places / rem : n_places;
4539  int place = masters_place;
4540  int gap_ct = gap;
4541  for (f = 0; f < n_th; f++) {
4542  kmp_info_t *th = team->t.t_threads[f];
4543  KMP_DEBUG_ASSERT(th != NULL);
4544 
4545  th->th.th_first_place = first_place;
4546  th->th.th_last_place = last_place;
4547  th->th.th_new_place = place;
4548  s_count++;
4549 
4550  if ((s_count == S) && rem && (gap_ct == gap)) {
4551  // do nothing, add an extra thread to place on next iteration
4552  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4553  // we added an extra thread to this place; move to next place
4554  if (place == last_place) {
4555  place = first_place;
4556  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4557  place = 0;
4558  } else {
4559  place++;
4560  }
4561  s_count = 0;
4562  gap_ct = 1;
4563  rem--;
4564  } else if (s_count == S) { // place full; don't add extra
4565  if (place == last_place) {
4566  place = first_place;
4567  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4568  place = 0;
4569  } else {
4570  place++;
4571  }
4572  gap_ct++;
4573  s_count = 0;
4574  }
4575 
4576  KA_TRACE(100,
4577  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4578  "partition = [%d,%d]\n",
4579  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4580  th->th.th_new_place, first_place, last_place));
4581  }
4582  KMP_DEBUG_ASSERT(place == masters_place);
4583  }
4584  } break;
4585 
4586  case proc_bind_spread: {
4587  int f;
4588  int n_th = team->t.t_nproc;
4589  int n_places;
4590  int thidx;
4591  if (first_place <= last_place) {
4592  n_places = last_place - first_place + 1;
4593  } else {
4594  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4595  }
4596  if (n_th <= n_places) {
4597  int place = masters_place;
4598  int S = n_places / n_th;
4599  int s_count, rem, gap, gap_ct;
4600  rem = n_places - n_th * S;
4601  gap = rem ? n_th / rem : 1;
4602  gap_ct = gap;
4603  thidx = n_th;
4604  if (update_master_only == 1)
4605  thidx = 1;
4606  for (f = 0; f < thidx; f++) {
4607  kmp_info_t *th = team->t.t_threads[f];
4608  KMP_DEBUG_ASSERT(th != NULL);
4609 
4610  th->th.th_first_place = place;
4611  th->th.th_new_place = place;
4612  s_count = 1;
4613  while (s_count < S) {
4614  if (place == last_place) {
4615  place = first_place;
4616  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4617  place = 0;
4618  } else {
4619  place++;
4620  }
4621  s_count++;
4622  }
4623  if (rem && (gap_ct == gap)) {
4624  if (place == last_place) {
4625  place = first_place;
4626  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4627  place = 0;
4628  } else {
4629  place++;
4630  }
4631  rem--;
4632  gap_ct = 0;
4633  }
4634  th->th.th_last_place = place;
4635  gap_ct++;
4636 
4637  if (place == last_place) {
4638  place = first_place;
4639  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4640  place = 0;
4641  } else {
4642  place++;
4643  }
4644 
4645  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4646  "partition = [%d,%d]\n",
4647  __kmp_gtid_from_thread(team->t.t_threads[f]),
4648  team->t.t_id, f, th->th.th_new_place,
4649  th->th.th_first_place, th->th.th_last_place));
4650  }
4651  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4652  } else {
4653  int S, rem, gap, s_count;
4654  S = n_th / n_places;
4655  s_count = 0;
4656  rem = n_th - (S * n_places);
4657  gap = rem > 0 ? n_places / rem : n_places;
4658  int place = masters_place;
4659  int gap_ct = gap;
4660  thidx = n_th;
4661  if (update_master_only == 1)
4662  thidx = 1;
4663  for (f = 0; f < thidx; f++) {
4664  kmp_info_t *th = team->t.t_threads[f];
4665  KMP_DEBUG_ASSERT(th != NULL);
4666 
4667  th->th.th_first_place = place;
4668  th->th.th_last_place = place;
4669  th->th.th_new_place = place;
4670  s_count++;
4671 
4672  if ((s_count == S) && rem && (gap_ct == gap)) {
4673  // do nothing, add an extra thread to place on next iteration
4674  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4675  // we added an extra thread to this place; move on to next place
4676  if (place == last_place) {
4677  place = first_place;
4678  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4679  place = 0;
4680  } else {
4681  place++;
4682  }
4683  s_count = 0;
4684  gap_ct = 1;
4685  rem--;
4686  } else if (s_count == S) { // place is full; don't add extra thread
4687  if (place == last_place) {
4688  place = first_place;
4689  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4690  place = 0;
4691  } else {
4692  place++;
4693  }
4694  gap_ct++;
4695  s_count = 0;
4696  }
4697 
4698  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4699  "partition = [%d,%d]\n",
4700  __kmp_gtid_from_thread(team->t.t_threads[f]),
4701  team->t.t_id, f, th->th.th_new_place,
4702  th->th.th_first_place, th->th.th_last_place));
4703  }
4704  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4705  }
4706  } break;
4707 
4708  default:
4709  break;
4710  }
4711 
4712  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4713 }
4714 
4715 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4716 
4717 /* allocate a new team data structure to use. take one off of the free pool if
4718  available */
4719 kmp_team_t *
4720 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4721 #if OMPT_SUPPORT
4722  ompt_parallel_id_t ompt_parallel_id,
4723 #endif
4724 #if OMP_40_ENABLED
4725  kmp_proc_bind_t new_proc_bind,
4726 #endif
4727  kmp_internal_control_t *new_icvs,
4728  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4729  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4730  int f;
4731  kmp_team_t *team;
4732  int use_hot_team = !root->r.r_active;
4733  int level = 0;
4734 
4735  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4736  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4737  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4738  KMP_MB();
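  /* Allocation strategy: first try to reuse the root's "hot" team (or the nested
     hot team for this level), then try to take a sufficiently large team from the
     team free pool, and only as a last resort allocate and initialize a brand new
     team. */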
4739 
4740 #if KMP_NESTED_HOT_TEAMS
4741  kmp_hot_team_ptr_t *hot_teams;
4742  if (master) {
4743  team = master->th.th_team;
4744  level = team->t.t_active_level;
4745  if (master->th.th_teams_microtask) { // in teams construct?
4746  if (master->th.th_teams_size.nteams > 1 &&
4747  ( // #teams > 1
4748  team->t.t_pkfn ==
4749  (microtask_t)__kmp_teams_master || // inner fork of the teams
4750  master->th.th_teams_level <
4751  team->t.t_level)) { // or nested parallel inside the teams
4752  ++level; // do not increment if #teams==1 or for the outer fork of the
4753  // teams; increment otherwise
4754  }
4755  }
4756  hot_teams = master->th.th_hot_teams;
4757  if (level < __kmp_hot_teams_max_level && hot_teams &&
4758  hot_teams[level]
4759  .hot_team) { // hot team has already been allocated for given level
4760  use_hot_team = 1;
4761  } else {
4762  use_hot_team = 0;
4763  }
4764  }
4765 #endif
4766  // Optimization to use a "hot" team
4767  if (use_hot_team && new_nproc > 1) {
4768  KMP_DEBUG_ASSERT(new_nproc == max_nproc);
4769 #if KMP_NESTED_HOT_TEAMS
4770  team = hot_teams[level].hot_team;
4771 #else
4772  team = root->r.r_hot_team;
4773 #endif
4774 #if KMP_DEBUG
4775  if (__kmp_tasking_mode != tskm_immediate_exec) {
4776  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4777  "task_team[1] = %p before reinit\n",
4778  team->t.t_task_team[0], team->t.t_task_team[1]));
4779  }
4780 #endif
4781 
4782  // Has the number of threads changed?
4783  /* Let's assume the most common case is that the number of threads is
4784  unchanged, and put that case first. */
4785  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4786  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4787  // This case can mean that omp_set_num_threads() was called and the hot
4788  // team size was already reduced, so we check the special flag
4789  if (team->t.t_size_changed == -1) {
4790  team->t.t_size_changed = 1;
4791  } else {
4792  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4793  }
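  // Note: KMP_CHECK_UPDATE stores only when the value actually changes, which
  // avoids dirtying cache lines needlessly when the hot team is reused with
  // unchanged settings.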
4794 
4795  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4796  kmp_r_sched_t new_sched = new_icvs->sched;
4797  if (team->t.t_sched.r_sched_type != new_sched.r_sched_type ||
4798  team->t.t_sched.chunk != new_sched.chunk)
4799  team->t.t_sched =
4800  new_sched; // set master's schedule as new run-time schedule
4801 
4802  __kmp_reinitialize_team(team, new_icvs,
4803  root->r.r_uber_thread->th.th_ident);
4804 
4805  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4806  team->t.t_threads[0], team));
4807  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4808 
4809 #if OMP_40_ENABLED
4810 #if KMP_AFFINITY_SUPPORTED
4811  if ((team->t.t_size_changed == 0) &&
4812  (team->t.t_proc_bind == new_proc_bind)) {
4813  if (new_proc_bind == proc_bind_spread) {
4814  __kmp_partition_places(
4815  team, 1); // add flag to update only master for spread
4816  }
4817  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4818  "proc_bind = %d, partition = [%d,%d]\n",
4819  team->t.t_id, new_proc_bind, team->t.t_first_place,
4820  team->t.t_last_place));
4821  } else {
4822  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4823  __kmp_partition_places(team);
4824  }
4825 #else
4826  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4827 #endif /* KMP_AFFINITY_SUPPORTED */
4828 #endif /* OMP_40_ENABLED */
4829  } else if (team->t.t_nproc > new_nproc) {
4830  KA_TRACE(20,
4831  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4832  new_nproc));
4833 
4834  team->t.t_size_changed = 1;
4835 #if KMP_NESTED_HOT_TEAMS
4836  if (__kmp_hot_teams_mode == 0) {
4837  // AC: the saved number of threads should correspond to the team's value in
4838  // this mode; it can be bigger in mode 1 when the hot team has threads in reserve
4839  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4840  hot_teams[level].hot_team_nth = new_nproc;
4841 #endif // KMP_NESTED_HOT_TEAMS
4842  /* release the extra threads we don't need any more */
4843  for (f = new_nproc; f < team->t.t_nproc; f++) {
4844  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4845  if (__kmp_tasking_mode != tskm_immediate_exec) {
4846  // When decreasing team size, threads no longer in the team should
4847  // unref task team.
4848  team->t.t_threads[f]->th.th_task_team = NULL;
4849  }
4850  __kmp_free_thread(team->t.t_threads[f]);
4851  team->t.t_threads[f] = NULL;
4852  }
4853 #if KMP_NESTED_HOT_TEAMS
4854  } // (__kmp_hot_teams_mode == 0)
4855  else {
4856  // When keeping extra threads in team, switch threads to wait on own
4857  // b_go flag
4858  for (f = new_nproc; f < team->t.t_nproc; ++f) {
4859  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4860  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4861  for (int b = 0; b < bs_last_barrier; ++b) {
4862  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4863  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4864  }
4865  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4866  }
4867  }
4868  }
4869 #endif // KMP_NESTED_HOT_TEAMS
4870  team->t.t_nproc = new_nproc;
4871  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4872  if (team->t.t_sched.r_sched_type != new_icvs->sched.r_sched_type ||
4873  team->t.t_sched.chunk != new_icvs->sched.chunk)
4874  team->t.t_sched = new_icvs->sched;
4875  __kmp_reinitialize_team(team, new_icvs,
4876  root->r.r_uber_thread->th.th_ident);
4877 
4878  /* update the remaining threads */
4879  for (f = 0; f < new_nproc; ++f) {
4880  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4881  }
4882  // restore the current task state of the master thread: should be the
4883  // implicit task
4884  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
4885  team->t.t_threads[0], team));
4886 
4887  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4888 
4889 #ifdef KMP_DEBUG
4890  for (f = 0; f < team->t.t_nproc; f++) {
4891  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
4892  team->t.t_threads[f]->th.th_team_nproc ==
4893  team->t.t_nproc);
4894  }
4895 #endif
4896 
4897 #if OMP_40_ENABLED
4898  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4899 #if KMP_AFFINITY_SUPPORTED
4900  __kmp_partition_places(team);
4901 #endif
4902 #endif
4903  } else { // team->t.t_nproc < new_nproc
4904 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4905  kmp_affin_mask_t *old_mask;
4906  if (KMP_AFFINITY_CAPABLE()) {
4907  KMP_CPU_ALLOC(old_mask);
4908  }
4909 #endif
4910 
4911  KA_TRACE(20,
4912  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
4913  new_nproc));
4914 
4915  team->t.t_size_changed = 1;
4916 
4917 #if KMP_NESTED_HOT_TEAMS
4918  int avail_threads = hot_teams[level].hot_team_nth;
4919  if (new_nproc < avail_threads)
4920  avail_threads = new_nproc;
4921  kmp_info_t **other_threads = team->t.t_threads;
4922  for (f = team->t.t_nproc; f < avail_threads; ++f) {
4923  // Adjust barrier data of reserved threads (if any) of the team
4924  // Other data will be set in __kmp_initialize_info() below.
4925  int b;
4926  kmp_balign_t *balign = other_threads[f]->th.th_bar;
4927  for (b = 0; b < bs_last_barrier; ++b) {
4928  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
4929  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4930 #if USE_DEBUGGER
4931  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
4932 #endif
4933  }
4934  }
4935  if (hot_teams[level].hot_team_nth >= new_nproc) {
4936  // we have all needed threads in reserve, no need to allocate any
4937  // this is only possible in mode 1; there cannot be reserved threads in mode 0
4938  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
4939  team->t.t_nproc = new_nproc; // just get reserved threads involved
4940  } else {
4941  // we may have some threads in reserve, but not enough
4942  team->t.t_nproc =
4943  hot_teams[level]
4944  .hot_team_nth; // get reserved threads involved if any
4945  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
4946 #endif // KMP_NESTED_HOT_TEAMS
4947  if (team->t.t_max_nproc < new_nproc) {
4948  /* reallocate larger arrays */
4949  __kmp_reallocate_team_arrays(team, new_nproc);
4950  __kmp_reinitialize_team(team, new_icvs, NULL);
4951  }
4952 
4953 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4954  /* Temporarily set full mask for master thread before creation of
4955  workers. The reason is that workers inherit the affinity from master,
4956  so if a lot of workers are created on the single core quickly, they
4957  don't get a chance to set their own affinity for a long time. */
4958  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
4959 #endif
4960 
4961  /* allocate new threads for the hot team */
4962  for (f = team->t.t_nproc; f < new_nproc; f++) {
4963  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
4964  KMP_DEBUG_ASSERT(new_worker);
4965  team->t.t_threads[f] = new_worker;
4966 
4967  KA_TRACE(20,
4968  ("__kmp_allocate_team: team %d init T#%d arrived: "
4969  "join=%llu, plain=%llu\n",
4970  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
4971  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
4972  team->t.t_bar[bs_plain_barrier].b_arrived));
4973 
4974  { // Initialize barrier data for new threads.
4975  int b;
4976  kmp_balign_t *balign = new_worker->th.th_bar;
4977  for (b = 0; b < bs_last_barrier; ++b) {
4978  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
4979  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
4980  KMP_BARRIER_PARENT_FLAG);
4981 #if USE_DEBUGGER
4982  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
4983 #endif
4984  }
4985  }
4986  }
4987 
4988 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4989  if (KMP_AFFINITY_CAPABLE()) {
4990  /* Restore initial master thread's affinity mask */
4991  __kmp_set_system_affinity(old_mask, TRUE);
4992  KMP_CPU_FREE(old_mask);
4993  }
4994 #endif
4995 #if KMP_NESTED_HOT_TEAMS
4996  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
4997 #endif // KMP_NESTED_HOT_TEAMS
4998  /* make sure everyone is synchronized */
4999  int old_nproc = team->t.t_nproc; // save old value and use to update only
5000  // new threads below
5001  __kmp_initialize_team(team, new_nproc, new_icvs,
5002  root->r.r_uber_thread->th.th_ident);
5003 
5004  /* reinitialize the threads */
5005  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5006  for (f = 0; f < team->t.t_nproc; ++f)
5007  __kmp_initialize_info(team->t.t_threads[f], team, f,
5008  __kmp_gtid_from_tid(f, team));
5009  if (level) { // set th_task_state for new threads in nested hot team
5010  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5011  // only need to set the th_task_state for the new threads. th_task_state
5012  // for master thread will not be accurate until after this in
5013  // __kmp_fork_call(), so we look to the master's memo_stack to get the
5014  // correct value.
5015  for (f = old_nproc; f < team->t.t_nproc; ++f)
5016  team->t.t_threads[f]->th.th_task_state =
5017  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5018  } else { // set th_task_state for new threads in non-nested hot team
5019  int old_state =
5020  team->t.t_threads[0]->th.th_task_state; // copy master's state
5021  for (f = old_nproc; f < team->t.t_nproc; ++f)
5022  team->t.t_threads[f]->th.th_task_state = old_state;
5023  }
5024 
5025 #ifdef KMP_DEBUG
5026  for (f = 0; f < team->t.t_nproc; ++f) {
5027  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5028  team->t.t_threads[f]->th.th_team_nproc ==
5029  team->t.t_nproc);
5030  }
5031 #endif
5032 
5033 #if OMP_40_ENABLED
5034  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5035 #if KMP_AFFINITY_SUPPORTED
5036  __kmp_partition_places(team);
5037 #endif
5038 #endif
5039  } // Check changes in number of threads
5040 
5041 #if OMP_40_ENABLED
5042  kmp_info_t *master = team->t.t_threads[0];
5043  if (master->th.th_teams_microtask) {
5044  for (f = 1; f < new_nproc; ++f) {
5045  // propagate teams construct specific info to workers
5046  kmp_info_t *thr = team->t.t_threads[f];
5047  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5048  thr->th.th_teams_level = master->th.th_teams_level;
5049  thr->th.th_teams_size = master->th.th_teams_size;
5050  }
5051  }
5052 #endif /* OMP_40_ENABLED */
5053 #if KMP_NESTED_HOT_TEAMS
5054  if (level) {
5055  // Sync barrier state for nested hot teams, not needed for outermost hot
5056  // team.
5057  for (f = 1; f < new_nproc; ++f) {
5058  kmp_info_t *thr = team->t.t_threads[f];
5059  int b;
5060  kmp_balign_t *balign = thr->th.th_bar;
5061  for (b = 0; b < bs_last_barrier; ++b) {
5062  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5063  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5064 #if USE_DEBUGGER
5065  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5066 #endif
5067  }
5068  }
5069  }
5070 #endif // KMP_NESTED_HOT_TEAMS
5071 
5072  /* reallocate space for arguments if necessary */
5073  __kmp_alloc_argv_entries(argc, team, TRUE);
5074  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5075  // The hot team re-uses the previous task team,
5076  // if untouched during the previous release->gather phase.
5077 
5078  KF_TRACE(10, (" hot_team = %p\n", team));
5079 
5080 #if KMP_DEBUG
5081  if (__kmp_tasking_mode != tskm_immediate_exec) {
5082  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5083  "task_team[1] = %p after reinit\n",
5084  team->t.t_task_team[0], team->t.t_task_team[1]));
5085  }
5086 #endif
5087 
5088 #if OMPT_SUPPORT
5089  __ompt_team_assign_id(team, ompt_parallel_id);
5090 #endif
5091 
5092  KMP_MB();
5093 
5094  return team;
5095  }
5096 
5097  /* next, let's try to take one from the team pool */
5098  KMP_MB();
5099  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5100  /* TODO: consider resizing undersized teams instead of reaping them, now
5101  that we have a resizing mechanism */
5102  if (team->t.t_max_nproc >= max_nproc) {
5103  /* take this team from the team pool */
5104  __kmp_team_pool = team->t.t_next_pool;
5105 
5106  /* setup the team for fresh use */
5107  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5108 
5109  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5110  "task_team[1] %p to NULL\n",
5111  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5112  team->t.t_task_team[0] = NULL;
5113  team->t.t_task_team[1] = NULL;
5114 
5115  /* reallocate space for arguments if necessary */
5116  __kmp_alloc_argv_entries(argc, team, TRUE);
5117  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5118 
5119  KA_TRACE(
5120  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5121  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5122  { // Initialize barrier data.
5123  int b;
5124  for (b = 0; b < bs_last_barrier; ++b) {
5125  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5126 #if USE_DEBUGGER
5127  team->t.t_bar[b].b_master_arrived = 0;
5128  team->t.t_bar[b].b_team_arrived = 0;
5129 #endif
5130  }
5131  }
5132 
5133 #if OMP_40_ENABLED
5134  team->t.t_proc_bind = new_proc_bind;
5135 #endif
5136 
5137  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5138  team->t.t_id));
5139 
5140 #if OMPT_SUPPORT
5141  __ompt_team_assign_id(team, ompt_parallel_id);
5142 #endif
5143 
5144  KMP_MB();
5145 
5146  return team;
5147  }
5148 
5149 /* reap team if it is too small, then loop back and check the next one */
5150 // not sure if this is wise, but it will be redone during the hot-teams rewrite.
5151 /* TODO: use a technique to find the right-size hot team; don't reap them */
5152  team = __kmp_reap_team(team);
5153  __kmp_team_pool = team;
5154  }
5155 
5156  /* nothing available in the pool, no matter, make a new team! */
5157  KMP_MB();
5158  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5159 
5160  /* and set it up */
5161  team->t.t_max_nproc = max_nproc;
5162  /* NOTE well: for some reason, allocating one big buffer and dividing it up
5163  seems to hurt performance a lot on the P4, so let's not use this approach */
5164  __kmp_allocate_team_arrays(team, max_nproc);
5165 
5166  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5167  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5168 
5169  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5170  "%p to NULL\n",
5171  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5172  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5173  // memory, no need to duplicate
5174  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5175  // memory, no need to duplicate
5176 
5177  if (__kmp_storage_map) {
5178  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5179  }
5180 
5181  /* allocate space for arguments */
5182  __kmp_alloc_argv_entries(argc, team, FALSE);
5183  team->t.t_argc = argc;
5184 
5185  KA_TRACE(20,
5186  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5187  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5188  { // Initialize barrier data.
5189  int b;
5190  for (b = 0; b < bs_last_barrier; ++b) {
5191  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5192 #if USE_DEBUGGER
5193  team->t.t_bar[b].b_master_arrived = 0;
5194  team->t.t_bar[b].b_team_arrived = 0;
5195 #endif
5196  }
5197  }
5198 
5199 #if OMP_40_ENABLED
5200  team->t.t_proc_bind = new_proc_bind;
5201 #endif
5202 
5203 #if OMPT_SUPPORT
5204  __ompt_team_assign_id(team, ompt_parallel_id);
5205  team->t.ompt_serialized_team_info = NULL;
5206 #endif
5207 
5208  KMP_MB();
5209 
5210  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5211  team->t.t_id));
5212 
5213  return team;
5214 }
5215 
5216 /* TODO implement hot-teams at all levels */
5217 /* TODO implement lazy thread release on demand (disband request) */
5218 
5219 /* free the team. return it to the team pool. release all the threads
5220  * associated with it */
5221 void __kmp_free_team(kmp_root_t *root,
5222  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5223  int f;
5224  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5225  team->t.t_id));
5226 
5227  /* verify state */
5228  KMP_DEBUG_ASSERT(root);
5229  KMP_DEBUG_ASSERT(team);
5230  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5231  KMP_DEBUG_ASSERT(team->t.t_threads);
5232 
5233  int use_hot_team = team == root->r.r_hot_team;
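  /* The hot team is kept intact between parallel regions so that its threads can
     be reused; only non-hot teams release their worker threads and are returned
     to the team pool below. */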
5234 #if KMP_NESTED_HOT_TEAMS
5235  int level;
5236  kmp_hot_team_ptr_t *hot_teams;
5237  if (master) {
5238  level = team->t.t_active_level - 1;
5239  if (master->th.th_teams_microtask) { // in teams construct?
5240  if (master->th.th_teams_size.nteams > 1) {
5241  ++level; // level was not increased in teams construct for
5242  // team_of_masters
5243  }
5244  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5245  master->th.th_teams_level == team->t.t_level) {
5246  ++level; // level was not increased in teams construct for
5247  // team_of_workers before the parallel
5248  } // team->t.t_level will be increased inside parallel
5249  }
5250  hot_teams = master->th.th_hot_teams;
5251  if (level < __kmp_hot_teams_max_level) {
5252  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5253  use_hot_team = 1;
5254  }
5255  }
5256 #endif // KMP_NESTED_HOT_TEAMS
5257 
5258  /* team is done working */
5259  TCW_SYNC_PTR(team->t.t_pkfn,
5260  NULL); // Important for Debugging Support Library.
5261  team->t.t_copyin_counter = 0; // init counter for possible reuse
5262  // Do not reset pointer to parent team to NULL for hot teams.
5263 
5264  /* if we are non-hot team, release our threads */
5265  if (!use_hot_team) {
5266  if (__kmp_tasking_mode != tskm_immediate_exec) {
5267  // Wait for threads to reach reapable state
5268  for (f = 1; f < team->t.t_nproc; ++f) {
5269  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5270  kmp_info_t *th = team->t.t_threads[f];
5271  volatile kmp_uint32 *state = &th->th.th_reap_state;
5272  while (*state != KMP_SAFE_TO_REAP) {
5273 #if KMP_OS_WINDOWS
5274  // On Windows a thread can be killed at any time, check this
5275  DWORD ecode;
5276  if (!__kmp_is_thread_alive(th, &ecode)) {
5277  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5278  break;
5279  }
5280 #endif
5281  // first check if thread is sleeping
5282  kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5283  if (fl.is_sleeping())
5284  fl.resume(__kmp_gtid_from_thread(th));
5285  KMP_CPU_PAUSE();
5286  }
5287  }
5288 
5289  // Delete task teams
5290  int tt_idx;
5291  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5292  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5293  if (task_team != NULL) {
5294  for (f = 0; f < team->t.t_nproc;
5295  ++f) { // Have all threads unref task teams
5296  team->t.t_threads[f]->th.th_task_team = NULL;
5297  }
5298  KA_TRACE(
5299  20,
5300  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5301  __kmp_get_gtid(), task_team, team->t.t_id));
5302 #if KMP_NESTED_HOT_TEAMS
5303  __kmp_free_task_team(master, task_team);
5304 #endif
5305  team->t.t_task_team[tt_idx] = NULL;
5306  }
5307  }
5308  }
5309 
5310  // Reset pointer to parent team only for non-hot teams.
5311  team->t.t_parent = NULL;
5312  team->t.t_level = 0;
5313  team->t.t_active_level = 0;
5314 
5315  /* free the worker threads */
5316  for (f = 1; f < team->t.t_nproc; ++f) {
5317  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5318  __kmp_free_thread(team->t.t_threads[f]);
5319  team->t.t_threads[f] = NULL;
5320  }
5321 
5322  /* put the team back in the team pool */
5323  /* TODO limit size of team pool, call reap_team if pool too large */
5324  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5325  __kmp_team_pool = (volatile kmp_team_t *)team;
5326  }
5327 
5328  KMP_MB();
5329 }
5330 
5331 /* reap the team. destroy it, reclaim all its resources and free its memory */
5332 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5333  kmp_team_t *next_pool = team->t.t_next_pool;
5334 
5335  KMP_DEBUG_ASSERT(team);
5336  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5337  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5338  KMP_DEBUG_ASSERT(team->t.t_threads);
5339  KMP_DEBUG_ASSERT(team->t.t_argv);
5340 
5341  /* TODO clean the threads that are a part of this? */
5342 
5343  /* free stuff */
5344  __kmp_free_team_arrays(team);
5345  if (team->t.t_argv != &team->t.t_inline_argv[0])
5346  __kmp_free((void *)team->t.t_argv);
5347  __kmp_free(team);
5348 
5349  KMP_MB();
5350  return next_pool;
5351 }
5352 
5353 // Free the thread. Don't reap it, just place it on the pool of available
5354 // threads.
5355 //
5356 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5357 // binding for the affinity mechanism to be useful.
5358 //
5359 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5360 // However, we want to avoid a potential performance problem by always
5361 // scanning through the list to find the correct point at which to insert
5362 // the thread (potential N**2 behavior). To do this we keep track of the
5363 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5364 // With single-level parallelism, threads will always be added to the tail
5365 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5366 // parallelism, all bets are off and we may need to scan through the entire
5367 // free list.
5368 //
5369 // This change also has a potentially large performance benefit, for some
5370 // applications. Previously, as threads were freed from the hot team, they
5371 // would be placed back on the free list in inverse order. If the hot team
5372 // grew back to its original size, then the freed threads would be placed
5373 // back on the hot team in reverse order. This could cause bad cache
5374 // locality problems on programs where the size of the hot team regularly
5375 // grew and shrunk.
5376 //
5377 // Now, for single-level parallelism, the OMP tid is always == gtid.
5378 void __kmp_free_thread(kmp_info_t *this_th) {
5379  int gtid;
5380  kmp_info_t **scan;
5381 
5382  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5383  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5384 
5385  KMP_DEBUG_ASSERT(this_th);
5386 
5387  // When moving thread to pool, switch thread to wait on own b_go flag, and
5388  // uninitialized (NULL team).
5389  int b;
5390  kmp_balign_t *balign = this_th->th.th_bar;
5391  for (b = 0; b < bs_last_barrier; ++b) {
5392  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5393  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5394  balign[b].bb.team = NULL;
5395  balign[b].bb.leaf_kids = 0;
5396  }
5397  this_th->th.th_task_state = 0;
5398 
5399  /* put thread back on the free pool */
5400  TCW_PTR(this_th->th.th_team, NULL);
5401  TCW_PTR(this_th->th.th_root, NULL);
5402  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5403 
5404  // If the __kmp_thread_pool_insert_pt is already past the new insert
5405  // point, then we need to re-scan the entire list.
5406  gtid = this_th->th.th_info.ds.ds_gtid;
5407  if (__kmp_thread_pool_insert_pt != NULL) {
5408  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5409  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5410  __kmp_thread_pool_insert_pt = NULL;
5411  }
5412  }
5413 
5414  // Scan down the list to find the place to insert the thread.
5415  // scan is the address of a link in the list, possibly the address of
5416  // __kmp_thread_pool itself.
5417  //
5418  // In the absence of nested parallelism, the for loop will have 0 iterations.
5419  if (__kmp_thread_pool_insert_pt != NULL) {
5420  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5421  } else {
5422  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5423  }
5424  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5425  scan = &((*scan)->th.th_next_pool))
5426  ;
5427 
5428  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5429  // to its address.
5430  TCW_PTR(this_th->th.th_next_pool, *scan);
5431  __kmp_thread_pool_insert_pt = *scan = this_th;
5432  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5433  (this_th->th.th_info.ds.ds_gtid <
5434  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5435  TCW_4(this_th->th.th_in_pool, TRUE);
5436  __kmp_thread_pool_nth++;
5437 
5438  TCW_4(__kmp_nth, __kmp_nth - 1);
5439 
5440 #ifdef KMP_ADJUST_BLOCKTIME
5441  /* Adjust blocktime back to user setting or default if necessary */
5442  /* Middle initialization might never have occurred */
5443  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5444  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5445  if (__kmp_nth <= __kmp_avail_proc) {
5446  __kmp_zero_bt = FALSE;
5447  }
5448  }
5449 #endif /* KMP_ADJUST_BLOCKTIME */
5450 
5451  KMP_MB();
5452 }
5453 
5454 /* ------------------------------------------------------------------------ */
5455 
5456 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5457  int gtid = this_thr->th.th_info.ds.ds_gtid;
5458  /* void *stack_data;*/
5459  kmp_team_t *(*volatile pteam);
5460 
5461  KMP_MB();
5462  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5463 
5464  if (__kmp_env_consistency_check) {
5465  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5466  }
5467 
5468 #if OMPT_SUPPORT
5469  if (ompt_enabled) {
5470  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5471  this_thr->th.ompt_thread_info.wait_id = 0;
5472  this_thr->th.ompt_thread_info.idle_frame = __builtin_frame_address(0);
5473  if (ompt_callbacks.ompt_callback(ompt_event_thread_begin)) {
5474  __ompt_thread_begin(ompt_thread_worker, gtid);
5475  }
5476  }
5477 #endif
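  /* Worker thread lifecycle: block on the fork barrier until the master hands us
     a team, run that team's microtask, pass through the join barrier, and go back
     to waiting -- until global shutdown is signaled via __kmp_global.g.g_done. */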
5478 
5479  /* This is the place where threads wait for work */
5480  while (!TCR_4(__kmp_global.g.g_done)) {
5481  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5482  KMP_MB();
5483 
5484  /* wait for work to do */
5485  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5486 
5487 #if OMPT_SUPPORT
5488  if (ompt_enabled) {
5489  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5490  }
5491 #endif
5492 
5493  /* No tid yet since not part of a team */
5494  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5495 
5496 #if OMPT_SUPPORT
5497  if (ompt_enabled) {
5498  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5499  }
5500 #endif
5501 
5502  pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
5503 
5504  /* have we been allocated? */
5505  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5506 #if OMPT_SUPPORT
5507  ompt_task_info_t *task_info;
5508  ompt_parallel_id_t my_parallel_id;
5509  if (ompt_enabled) {
5510  task_info = __ompt_get_taskinfo(0);
5511  my_parallel_id = (*pteam)->t.ompt_team_info.parallel_id;
5512  }
5513 #endif
5514  /* we were just woken up, so run our new task */
5515  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5516  int rc;
5517  KA_TRACE(20,
5518  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5519  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5520  (*pteam)->t.t_pkfn));
5521 
5522  updateHWFPControl(*pteam);
5523 
5524 #if OMPT_SUPPORT
5525  if (ompt_enabled) {
5526  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5527  // Initialize OMPT task id for implicit task.
5528  int tid = __kmp_tid_from_gtid(gtid);
5529  task_info->task_id = __ompt_task_id_new(tid);
5530  }
5531 #endif
5532 
5533  {
5534  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
5535  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
5536  rc = (*pteam)->t.t_invoke(gtid);
5537  }
5538  KMP_ASSERT(rc);
5539 
5540 #if OMPT_SUPPORT
5541  if (ompt_enabled) {
5542  /* no frame set while outside task */
5543  task_info->frame.exit_runtime_frame = NULL;
5544 
5545  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5546  }
5547 #endif
5548  KMP_MB();
5549  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5550  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5551  (*pteam)->t.t_pkfn));
5552  }
5553  /* join barrier after parallel region */
5554  __kmp_join_barrier(gtid);
5555 #if OMPT_SUPPORT && OMPT_TRACE
5556  if (ompt_enabled) {
5557  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
5558  // don't access *pteam here: it may have already been freed
5559  // by the master thread behind the barrier (possible race)
5560  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
5561  my_parallel_id, task_info->task_id);
5562  }
5563  task_info->frame.exit_runtime_frame = NULL;
5564  task_info->task_id = 0;
5565  }
5566 #endif
5567  }
5568  }
5569  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5570 
5571 #if OMPT_SUPPORT
5572  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
5573  __ompt_thread_end(ompt_thread_worker, gtid);
5574  }
5575 #endif
5576 
5577  this_thr->th.th_task_team = NULL;
5578  /* run the destructors for the threadprivate data for this thread */
5579  __kmp_common_destroy_gtid(gtid);
5580 
5581  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5582  KMP_MB();
5583  return this_thr;
5584 }
5585 
5586 /* ------------------------------------------------------------------------ */
5587 
5588 void __kmp_internal_end_dest(void *specific_gtid) {
5589 #if KMP_COMPILER_ICC
5590 #pragma warning(push)
5591 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5592 // significant bits
5593 #endif
5594  // Make sure no significant bits are lost
5595  int gtid = (kmp_intptr_t)specific_gtid - 1;
5596 #if KMP_COMPILER_ICC
5597 #pragma warning(pop)
5598 #endif
5599 
5600  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5601  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
5602  * this is because 0 is reserved for the nothing-stored case */
5603 
5604  /* josh: One reason for setting the gtid specific data even when it is being
5605  destroyed by pthread is to allow gtid lookup through thread specific data
5606  (__kmp_gtid_get_specific). Some of the code, especially stat code,
5607  that gets executed in the call to __kmp_internal_end_thread, actually
5608  gets the gtid through the thread specific data. Setting it here seems
5609  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5610  to run smoothly.
5611  todo: get rid of this after we remove the dependence on
5612  __kmp_gtid_get_specific */
5613  if (gtid >= 0 && KMP_UBER_GTID(gtid))
5614  __kmp_gtid_set_specific(gtid);
5615 #ifdef KMP_TDATA_GTID
5616  __kmp_gtid = gtid;
5617 #endif
5618  __kmp_internal_end_thread(gtid);
5619 }
5620 
5621 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5622 
5623 // 2009-09-08 (lev): It looks like the destructor does not work. In simple test
5624 // cases destructors work perfectly, but in the real libomp.so I have no evidence
5625 // it is ever called. However, the -fini linker option in makefile.mk works fine.
5626 
5627 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5628  __kmp_internal_end_atexit();
5629 }
5630 
5631 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5632 
5633 #endif
5634 
5635 /* [Windows] josh: when the atexit handler is called, there may still be more
5636  than one thread alive */
5637 void __kmp_internal_end_atexit(void) {
5638  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5639  /* [Windows]
5640  josh: ideally, we want to completely shut down the library in this atexit
5641  handler, but stat code that depends on thread specific data for gtid fails
5642  because that data becomes unavailable at some point during the shutdown, so
5643  we call __kmp_internal_end_thread instead. We should eventually remove the
5644  dependency on __kmp_get_specific_gtid in the stat code and use
5645  __kmp_internal_end_library to cleanly shut down the library.
5646 
5647  // TODO: Can some of this comment about GVS be removed?
5648  I suspect that the offending stat code is executed when the calling thread
5649  tries to clean up a dead root thread's data structures, resulting in GVS
5650  code trying to close the GVS structures for that thread, but since the stat
5651  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5652  the calling thread is cleaning up itself instead of another thread, it gets
5653  confused. This happens because allowing a thread to unregister and clean up
5654  another thread is a recent modification for addressing an issue.
5655  Based on the current design (20050722), a thread may end up
5656  trying to unregister another thread only if thread death does not trigger
5657  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5658  thread specific data destructor function to detect thread death. For
5659  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5660  is nothing. Thus, the workaround is applicable only for Windows static
5661  stat library. */
5662  __kmp_internal_end_library(-1);
5663 #if KMP_OS_WINDOWS
5664  __kmp_close_console();
5665 #endif
5666 }
5667 
5668 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5669  // It is assumed __kmp_forkjoin_lock is acquired.
5670 
5671  int gtid;
5672 
5673  KMP_DEBUG_ASSERT(thread != NULL);
5674 
5675  gtid = thread->th.th_info.ds.ds_gtid;
5676 
5677  if (!is_root) {
5678 
5679  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5680  /* Assume the threads are at the fork barrier here */
5681  KA_TRACE(
5682  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5683  gtid));
5684  /* Need release fence here to prevent seg faults for tree forkjoin barrier
5685  * (GEH) */
5686  ANNOTATE_HAPPENS_BEFORE(thread);
5687  kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5688  __kmp_release_64(&flag);
5689  }; // if
5690 
5691  // Terminate OS thread.
5692  __kmp_reap_worker(thread);
5693 
5694  // The thread was killed asynchronously. If it was actively
5695  // spinning in the thread pool, decrement the global count.
5696  //
5697  // There is a small timing hole here - if the worker thread was just waking
5698  // up after sleeping in the pool, had reset its th_active_in_pool flag but
5699  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5700  // the global counter might not get updated.
5701  //
5702  // Currently, this can only happen as the library is unloaded,
5703  // so there are no harmful side effects.
5704  if (thread->th.th_active_in_pool) {
5705  thread->th.th_active_in_pool = FALSE;
5706  KMP_TEST_THEN_DEC32(&__kmp_thread_pool_active_nth);
5707  KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
5708  }
5709 
5710  // Decrement # of [worker] threads in the pool.
5711  KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0);
5712  --__kmp_thread_pool_nth;
5713  }; // if
5714 
5715  __kmp_free_implicit_task(thread);
5716 
5717 // Free the fast memory for tasking
5718 #if USE_FAST_MEMORY
5719  __kmp_free_fast_memory(thread);
5720 #endif /* USE_FAST_MEMORY */
5721 
5722  __kmp_suspend_uninitialize_thread(thread);
5723 
5724  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5725  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5726 
5727  --__kmp_all_nth;
5728 // __kmp_nth was decremented when thread is added to the pool.
5729 
5730 #ifdef KMP_ADJUST_BLOCKTIME
5731  /* Adjust blocktime back to user setting or default if necessary */
5732  /* Middle initialization might never have occurred */
5733  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5734  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5735  if (__kmp_nth <= __kmp_avail_proc) {
5736  __kmp_zero_bt = FALSE;
5737  }
5738  }
5739 #endif /* KMP_ADJUST_BLOCKTIME */
5740 
5741  /* free the memory being used */
5742  if (__kmp_env_consistency_check) {
5743  if (thread->th.th_cons) {
5744  __kmp_free_cons_stack(thread->th.th_cons);
5745  thread->th.th_cons = NULL;
5746  }; // if
5747  }
5748 
5749  if (thread->th.th_pri_common != NULL) {
5750  __kmp_free(thread->th.th_pri_common);
5751  thread->th.th_pri_common = NULL;
5752  }; // if
5753 
5754  if (thread->th.th_task_state_memo_stack != NULL) {
5755  __kmp_free(thread->th.th_task_state_memo_stack);
5756  thread->th.th_task_state_memo_stack = NULL;
5757  }
5758 
5759 #if KMP_USE_BGET
5760  if (thread->th.th_local.bget_data != NULL) {
5761  __kmp_finalize_bget(thread);
5762  }; // if
5763 #endif
5764 
5765 #if KMP_AFFINITY_SUPPORTED
5766  if (thread->th.th_affin_mask != NULL) {
5767  KMP_CPU_FREE(thread->th.th_affin_mask);
5768  thread->th.th_affin_mask = NULL;
5769  }; // if
5770 #endif /* KMP_AFFINITY_SUPPORTED */
5771 
5772  __kmp_reap_team(thread->th.th_serial_team);
5773  thread->th.th_serial_team = NULL;
5774  __kmp_free(thread);
5775 
5776  KMP_MB();
5777 
5778 } // __kmp_reap_thread
5779 
5780 static void __kmp_internal_end(void) {
5781  int i;
5782 
5783  /* First, unregister the library */
5784  __kmp_unregister_library();
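  /* If any root thread is still active we only mark the library as done (and,
     when the monitor is used, reap the monitor thread); otherwise all pooled
     worker threads, pooled teams and task teams are reaped before the final
     cleanup below. */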
5785 
5786 #if KMP_OS_WINDOWS
5787  /* In Win static library, we can't tell when a root actually dies, so we
5788  reclaim the data structures for any root threads that have died but not
5789  unregistered themselves, in order to shut down cleanly.
5790  In Win dynamic library we also can't tell when a thread dies. */
5791  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5792 // dead roots
5793 #endif
5794 
5795  for (i = 0; i < __kmp_threads_capacity; i++)
5796  if (__kmp_root[i])
5797  if (__kmp_root[i]->r.r_active)
5798  break;
5799  KMP_MB(); /* Flush all pending memory write invalidates. */
5800  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5801 
5802  if (i < __kmp_threads_capacity) {
5803 #if KMP_USE_MONITOR
5804  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5805  KMP_MB(); /* Flush all pending memory write invalidates. */
5806 
5807 // Need to check that monitor was initialized before reaping it. If we are
5808 // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
5809 // __kmp_monitor will appear to contain valid data, but it is only valid in the
5810 // parent process, not the child.
5811  // New behavior (201008): instead of keying off of the flag
5812  // __kmp_init_parallel, the monitor thread creation is keyed off
5813  // of the new flag __kmp_init_monitor.
5814  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5815  if (TCR_4(__kmp_init_monitor)) {
5816  __kmp_reap_monitor(&__kmp_monitor);
5817  TCW_4(__kmp_init_monitor, 0);
5818  }
5819  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5820  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5821 #endif // KMP_USE_MONITOR
5822  } else {
5823 /* TODO move this to cleanup code */
5824 #ifdef KMP_DEBUG
5825  /* make sure that everything has properly ended */
5826  for (i = 0; i < __kmp_threads_capacity; i++) {
5827  if (__kmp_root[i]) {
5828  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
5829  // there can be uber threads alive here
5830  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
5831  }
5832  }
5833 #endif
5834 
5835  KMP_MB();
5836 
5837  // Reap the worker threads.
5838  // This is valid for now, but be careful if threads are reaped sooner.
5839  while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
5840  // Get the next thread from the pool.
5841  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
5842  __kmp_thread_pool = thread->th.th_next_pool;
5843  // Reap it.
5844  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5845  thread->th.th_next_pool = NULL;
5846  thread->th.th_in_pool = FALSE;
5847  __kmp_reap_thread(thread, 0);
5848  }; // while
5849  __kmp_thread_pool_insert_pt = NULL;
5850 
5851  // Reap teams.
5852  while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
5853  // Get the next team from the pool.
5854  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
5855  __kmp_team_pool = team->t.t_next_pool;
5856  // Reap it.
5857  team->t.t_next_pool = NULL;
5858  __kmp_reap_team(team);
5859  }; // while
5860 
5861  __kmp_reap_task_teams();
5862 
5863  for (i = 0; i < __kmp_threads_capacity; ++i) {
5864  // TBD: Add some checking...
5865  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5866  }
5867 
5868  /* Make sure all threadprivate destructors get run by joining with all
5869  worker threads before resetting this flag */
5870  TCW_SYNC_4(__kmp_init_common, FALSE);
5871 
5872  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
5873  KMP_MB();
5874 
5875 #if KMP_USE_MONITOR
5876  // See note above: One of the possible fixes for CQ138434 / CQ140126
5877  //
5878  // FIXME: push both code fragments down and CSE them?
5879  // push them into __kmp_cleanup() ?
5880  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5881  if (TCR_4(__kmp_init_monitor)) {
5882  __kmp_reap_monitor(&__kmp_monitor);
5883  TCW_4(__kmp_init_monitor, 0);
5884  }
5885  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5886  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5887 #endif
5888  } /* else !__kmp_global.t_active */
5889  TCW_4(__kmp_init_gtid, FALSE);
5890  KMP_MB(); /* Flush all pending memory write invalidates. */
5891 
5892  __kmp_cleanup();
5893 #if OMPT_SUPPORT
5894  ompt_fini();
5895 #endif
5896 }
5897 
5898 void __kmp_internal_end_library(int gtid_req) {
5899  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5900  /* this shouldn't be a race condition because __kmp_internal_end() is the
5901  only place to clear __kmp_serial_init */
5902  /* we'll check this later too, after we get the lock */
5903  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
5904  // redundant, because the next check will work in any case.
5905  if (__kmp_global.g.g_abort) {
5906  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
5907  /* TODO abort? */
5908  return;
5909  }
5910  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
5911  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
5912  return;
5913  }
5914 
5915  KMP_MB(); /* Flush all pending memory write invalidates. */
5916 
5917  /* find out who we are and what we should do */
5918  {
5919  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
5920  KA_TRACE(
5921  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
5922  if (gtid == KMP_GTID_SHUTDOWN) {
5923  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
5924  "already shutdown\n"));
5925  return;
5926  } else if (gtid == KMP_GTID_MONITOR) {
5927  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
5928  "registered, or system shutdown\n"));
5929  return;
5930  } else if (gtid == KMP_GTID_DNE) {
5931  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
5932  "shutdown\n"));
5933  /* we don't know who we are, but we may still shut down the library */
5934  } else if (KMP_UBER_GTID(gtid)) {
5935  /* unregister ourselves as an uber thread. gtid is no longer valid */
5936  if (__kmp_root[gtid]->r.r_active) {
5937  __kmp_global.g.g_abort = -1;
5938  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5939  KA_TRACE(10,
5940  ("__kmp_internal_end_library: root still active, abort T#%d\n",
5941  gtid));
5942  return;
5943  } else {
5944  KA_TRACE(
5945  10,
5946  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
5947  __kmp_unregister_root_current_thread(gtid);
5948  }
5949  } else {
5950 /* worker threads may call this function through the atexit handler, if they
5951  * call exit() */
5952 /* For now, skip the usual subsequent processing and just dump the debug buffer.
5953  TODO: do a thorough shutdown instead */
5954 #ifdef DUMP_DEBUG_ON_EXIT
5955  if (__kmp_debug_buf)
5956  __kmp_dump_debug_buffer();
5957 #endif
5958  return;
5959  }
5960  }
5961  /* synchronize the termination process */
5962  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
5963 
5964  /* have we already finished */
5965  if (__kmp_global.g.g_abort) {
5966  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
5967  /* TODO abort? */
5968  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
5969  return;
5970  }
5971  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
5972  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
5973  return;
5974  }
5975 
5976  /* We need this lock to enforce mutex between this reading of
5977  __kmp_threads_capacity and the writing by __kmp_register_root.
5978  Alternatively, we can use a counter of roots that is atomically updated by
5979  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
5980  __kmp_internal_end_*. */
5981  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
5982 
5983  /* now we can safely conduct the actual termination */
5984  __kmp_internal_end();
5985 
5986  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
5987  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
5988 
5989  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
5990 
5991 #ifdef DUMP_DEBUG_ON_EXIT
5992  if (__kmp_debug_buf)
5993  __kmp_dump_debug_buffer();
5994 #endif
5995 
5996 #if KMP_OS_WINDOWS
5997  __kmp_close_console();
5998 #endif
5999 
6000  __kmp_fini_allocator();
6001 
6002 } // __kmp_internal_end_library
6003 
6004 void __kmp_internal_end_thread(int gtid_req) {
6005  int i;
6006 
6007  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6008  /* this shouldn't be a race condition because __kmp_internal_end() is the
6009  * only place to clear __kmp_serial_init */
6010  /* we'll check this later too, after we get the lock */
6011  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6012  // redundant, because the next check will work in any case.
6013  if (__kmp_global.g.g_abort) {
6014  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6015  /* TODO abort? */
6016  return;
6017  }
6018  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6019  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6020  return;
6021  }
6022 
6023  KMP_MB(); /* Flush all pending memory write invalidates. */
6024 
6025  /* find out who we are and what we should do */
6026  {
6027  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6028  KA_TRACE(10,
6029  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6030  if (gtid == KMP_GTID_SHUTDOWN) {
6031  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6032  "already shutdown\n"));
6033  return;
6034  } else if (gtid == KMP_GTID_MONITOR) {
6035  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6036  "registered, or system shutdown\n"));
6037  return;
6038  } else if (gtid == KMP_GTID_DNE) {
6039  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6040  "shutdown\n"));
6041  return;
6042  /* we don't know who we are */
6043  } else if (KMP_UBER_GTID(gtid)) {
6044  /* unregister ourselves as an uber thread. gtid is no longer valid */
6045  if (__kmp_root[gtid]->r.r_active) {
6046  __kmp_global.g.g_abort = -1;
6047  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6048  KA_TRACE(10,
6049  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6050  gtid));
6051  return;
6052  } else {
6053  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6054  gtid));
6055  __kmp_unregister_root_current_thread(gtid);
6056  }
6057  } else {
6058  /* just a worker thread, let's leave */
6059  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6060 
6061  if (gtid >= 0) {
6062  __kmp_threads[gtid]->th.th_task_team = NULL;
6063  }
6064 
6065  KA_TRACE(10,
6066  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6067  gtid));
6068  return;
6069  }
6070  }
6071 #if defined KMP_DYNAMIC_LIB
6072  // AC: let's not shut down the Linux* OS dynamic library at the exit of an uber
6073  // thread, because it is better to shut down later in the library destructor.
6074  // The reason for this change is a performance problem when a non-OpenMP thread
6075  // in a loop forks and joins many OpenMP threads. We can save a lot of time by
6076  // keeping worker threads alive until the program shuts down.
6077  // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966)
6078  // and Windows(DPD200287443) that occurs when using critical sections from
6079  // foreign threads.
6080  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6081  return;
6082 #endif
6083  /* synchronize the termination process */
6084  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6085 
6086  /* have we already finished */
6087  if (__kmp_global.g.g_abort) {
6088  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6089  /* TODO abort? */
6090  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6091  return;
6092  }
6093  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6094  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6095  return;
6096  }
6097 
6098  /* We need this lock to enforce mutex between this reading of
6099  __kmp_threads_capacity and the writing by __kmp_register_root.
6100  Alternatively, we can use a counter of roots that is atomically updated by
6101  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6102  __kmp_internal_end_*. */
6103 
6104  /* should we finish the run-time? are all siblings done? */
6105  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6106 
6107  for (i = 0; i < __kmp_threads_capacity; ++i) {
6108  if (KMP_UBER_GTID(i)) {
6109  KA_TRACE(
6110  10,
6111  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6112  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6113  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6114  return;
6115  };
6116  }
6117 
6118  /* now we can safely conduct the actual termination */
6119 
6120  __kmp_internal_end();
6121 
6122  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6123  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6124 
6125  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6126 
6127 #ifdef DUMP_DEBUG_ON_EXIT
6128  if (__kmp_debug_buf)
6129  __kmp_dump_debug_buffer();
6130 #endif
6131 } // __kmp_internal_end_thread
6132 
6133 // -----------------------------------------------------------------------------
6134 // Library registration stuff.
6135 
6136 static long __kmp_registration_flag = 0;
6137 // Random value used to indicate library initialization.
6138 static char *__kmp_registration_str = NULL;
6139 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6140 
6141 static inline char *__kmp_reg_status_name() {
6142  /* On RHEL 3u5 if linked statically, getpid() returns different values in
6143  each thread. If registration and unregistration go in different threads
6144  (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6145  env var cannot be found, because the name will contain a different pid. */
6146  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6147 } // __kmp_reg_status_name
6148 
6149 void __kmp_register_library_startup(void) {
6150 
6151  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6152  int done = 0;
6153  union {
6154  double dtime;
6155  long ltime;
6156  } time;
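  // The union lets us read the system time as a double and then reuse the same
  // bytes as a long, providing a cheap quasi-random seed for the registration
  // flag computed below.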
6157 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6158  __kmp_initialize_system_tick();
6159 #endif
6160  __kmp_read_system_time(&time.dtime);
6161  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6162  __kmp_registration_str =
6163  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6164  __kmp_registration_flag, KMP_LIBRARY_FILE);
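  // The registration string encodes "<flag address>-<flag value in hex>-<library
  // file name>". Another copy of the runtime can parse this value and probe the
  // encoded address to decide whether the copy that registered it is still alive.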
6165 
6166  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6167  __kmp_registration_str));
6168 
6169  while (!done) {
6170 
6171  char *value = NULL; // Actual value of the environment variable.
6172 
6173  // Set the environment variable, but do not overwrite it if it already exists.
6174  __kmp_env_set(name, __kmp_registration_str, 0);
6175  // Check that the variable was written.
6176  value = __kmp_env_get(name);
6177  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6178 
6179  done = 1; // Ok, environment variable set successfully, exit the loop.
6180 
6181  } else {
6182 
6183  // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
6184  // Check whether it is alive or dead.
6185  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6186  char *tail = value;
6187  char *flag_addr_str = NULL;
6188  char *flag_val_str = NULL;
6189  char const *file_name = NULL;
6190  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6191  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6192  file_name = tail;
6193  if (tail != NULL) {
6194  long *flag_addr = 0;
6195  long flag_val = 0;
6196  KMP_SSCANF(flag_addr_str, "%p", &flag_addr);
6197  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6198  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6199  // First, check whether environment-encoded address is mapped into
6200  // addr space.
6201  // If so, dereference it to see if it still has the right value.
6202  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6203  neighbor = 1;
6204  } else {
6205  // If not, then we know the other copy of the library is no longer
6206  // running.
6207  neighbor = 2;
6208  }; // if
6209  }; // if
6210  }; // if
6211  switch (neighbor) {
6212  case 0: // Cannot parse environment variable -- neighbor status unknown.
6213  // Assume it is an incompatible format from a future version of the
6214  // library. Assume the other library is alive.
6215  // WARN( ... ); // TODO: Issue a warning.
6216  file_name = "unknown library";
6217  // Attention! Falling through to the next case is intentional.
6218  case 1: { // Neighbor is alive.
6219  // Check whether that is allowed.
6220  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6221  if (!__kmp_str_match_true(duplicate_ok)) {
6222  // That's not allowed. Issue fatal error.
6223  __kmp_msg(kmp_ms_fatal,
6224  KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6225  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6226  }; // if
6227  KMP_INTERNAL_FREE(duplicate_ok);
6228  __kmp_duplicate_library_ok = 1;
6229  done = 1; // Exit the loop.
6230  } break;
6231  case 2: { // Neighbor is dead.
6232  // Clear the variable and try to register library again.
6233  __kmp_env_unset(name);
6234  } break;
6235  default: { KMP_DEBUG_ASSERT(0); } break;
6236  }; // switch
6237 
6238  }; // if
6239  KMP_INTERNAL_FREE((void *)value);
6240 
6241  }; // while
6242  KMP_INTERNAL_FREE((void *)name);
6243 
6244 } // func __kmp_register_library_startup
6245 
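/* Illustrative sketch (not part of the runtime): the value stored in
   __KMP_REGISTERED_LIB_<pid> has the form "<flag address>-<flag value>-<library
   file>", e.g. "0x7f0000001000-cafe1a2b-libomp.so" (hypothetical). A second copy
   of the runtime classifies that record by parsing it and checking that the
   encoded address is still mapped and still holds the encoded value, as the loop
   above does with __kmp_str_split / KMP_SSCANF / __kmp_is_address_mapped. The
   code below only mirrors the parsing step with plain C; error handling and the
   address-mapped check are omitted. */
#include <stdio.h>
#include <string.h>
static void example_parse_registration_record(char *record /* writable copy */) {
  char *addr_str = strtok(record, "-"); // "<flag address>"
  char *val_str = strtok(NULL, "-");    // "<flag value>", hex
  char *file_name = strtok(NULL, "");   // remainder: library file name
  void *flag_addr = NULL;
  unsigned long flag_val = 0;
  if (addr_str != NULL && val_str != NULL && file_name != NULL) {
    sscanf(addr_str, "%p", &flag_addr);
    sscanf(val_str, "%lx", &flag_val);
    printf("neighbor flag at %p, expected value 0x%lx, file \"%s\"\n", flag_addr,
           flag_val, file_name);
    // The runtime would now check that flag_addr is mapped and that *flag_addr
    // still equals flag_val: a match means the neighbor is alive; otherwise the
    // stale record is cleared and registration is retried.
  }
}
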
6246 void __kmp_unregister_library(void) {
6247 
6248  char *name = __kmp_reg_status_name();
6249  char *value = __kmp_env_get(name);
6250 
6251  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6252  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6253  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6254  // Ok, this is our variable. Delete it.
6255  __kmp_env_unset(name);
6256  }; // if
6257 
6258  KMP_INTERNAL_FREE(__kmp_registration_str);
6259  KMP_INTERNAL_FREE(value);
6260  KMP_INTERNAL_FREE(name);
6261 
6262  __kmp_registration_flag = 0;
6263  __kmp_registration_str = NULL;
6264 
6265 } // __kmp_unregister_library
6266 
6267 // End of Library registration stuff.
6268 // -----------------------------------------------------------------------------
6269 
6270 #if KMP_MIC_SUPPORTED
6271 
6272 static void __kmp_check_mic_type() {
6273  kmp_cpuid_t cpuid_state = {0};
6274  kmp_cpuid_t *cs_p = &cpuid_state;
6275  __kmp_x86_cpuid(1, 0, cs_p);
6276  // We don't support mic1 at the moment
6277  if ((cs_p->eax & 0xff0) == 0xB10) {
6278  __kmp_mic_type = mic2;
6279  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6280  __kmp_mic_type = mic3;
6281  } else {
6282  __kmp_mic_type = non_mic;
6283  }
6284 }
6285 
6286 #endif /* KMP_MIC_SUPPORTED */
6287 
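/* Illustrative sketch (not part of the runtime): how the masks above map onto the
   CPUID leaf-1 EAX signature layout (stepping in bits 3:0, model in 7:4, family in
   11:8, extended model in 19:16). Masking with 0xff0 keeps family/model, so 0xB10
   matches family 0x0B, model 1 (KNC-generation parts); masking with 0xf0ff0 also
   keeps the extended model, so 0x50670 matches family 6 with display model 0x57
   (KNL-generation parts). The decoder below only illustrates that bit arithmetic
   and assumes the standard signature layout; it is not the runtime's detection
   path. */
#include <stdio.h>
static void example_decode_cpuid_signature(unsigned eax) {
  unsigned model = (eax >> 4) & 0xF;
  unsigned family = (eax >> 8) & 0xF;
  unsigned ext_model = (eax >> 16) & 0xF;
  // For family 6 (and 15) the display model combines the extended model nibble.
  unsigned display_model =
      (family == 6 || family == 15) ? ((ext_model << 4) | model) : model;
  // e.g. eax == 0x50671 -> family 6, display model 0x57;
  //      eax == 0x00B11 -> family 11, display model 1.
  printf("family=%u display_model=0x%X\n", family, display_model);
}
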
6288 static void __kmp_do_serial_initialize(void) {
6289  int i, gtid;
6290  int size;
6291 
6292  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6293 
6294  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6295  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6296  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6297  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6298  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6299 
6300 #if OMPT_SUPPORT
6301  ompt_pre_init();
6302 #endif
6303 
6304  __kmp_validate_locks();
6305 
6306  /* Initialize internal memory allocator */
6307  __kmp_init_allocator();
6308 
6309  /* Register the library startup via an environment variable and check to see
6310  whether another copy of the library is already registered. */
6311 
6312  __kmp_register_library_startup();
6313 
6314  /* TODO reinitialization of library */
6315  if (TCR_4(__kmp_global.g.g_done)) {
6316  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6317  }
6318 
6319  __kmp_global.g.g_abort = 0;
6320  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6321 
6322 /* initialize the locks */
6323 #if KMP_USE_ADAPTIVE_LOCKS
6324 #if KMP_DEBUG_ADAPTIVE_LOCKS
6325  __kmp_init_speculative_stats();
6326 #endif
6327 #endif
6328 #if KMP_STATS_ENABLED
6329  __kmp_stats_init();
6330 #endif
6331  __kmp_init_lock(&__kmp_global_lock);
6332  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6333  __kmp_init_lock(&__kmp_debug_lock);
6334  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6335  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6336  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6337  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6338  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6339  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6340  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6341  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6342  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6343  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6344  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6345  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6346  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6347  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6348  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6349 #if KMP_USE_MONITOR
6350  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6351 #endif
6352  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6353 
6354  /* conduct initialization and initial setup of configuration */
6355 
6356  __kmp_runtime_initialize();
6357 
6358 #if KMP_MIC_SUPPORTED
6359  __kmp_check_mic_type();
6360 #endif
6361 
6362 // Some global variable initialization moved here from kmp_env_initialize()
6363 #ifdef KMP_DEBUG
6364  kmp_diag = 0;
6365 #endif
6366  __kmp_abort_delay = 0;
6367 
6368  // From __kmp_init_dflt_team_nth()
6369  /* assume the entire machine will be used */
6370  __kmp_dflt_team_nth_ub = __kmp_xproc;
6371  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6372  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6373  }
6374  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6375  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6376  }
6377  __kmp_max_nth = __kmp_sys_max_nth;
6378 
6379  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6380  // part
6381  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6382 #if KMP_USE_MONITOR
6383  __kmp_monitor_wakeups =
6384  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6385  __kmp_bt_intervals =
6386  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6387 #endif
6388  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6389  __kmp_library = library_throughput;
6390  // From KMP_SCHEDULE initialization
6391  __kmp_static = kmp_sch_static_balanced;
6392 // AC: do not use analytical here, because it is non-monotonic
6393 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6394 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6395 // need to repeat assignment
6396 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6397 // bit control and barrier method control parts
6398 #if KMP_FAST_REDUCTION_BARRIER
6399 #define kmp_reduction_barrier_gather_bb ((int)1)
6400 #define kmp_reduction_barrier_release_bb ((int)1)
6401 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6402 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6403 #endif // KMP_FAST_REDUCTION_BARRIER
6404  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6405  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6406  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6407  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6408  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6409 #if KMP_FAST_REDUCTION_BARRIER
6410  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6411  // lin_64 ): hyper,1
6412  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6413  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6414  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6415  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6416  }
6417 #endif // KMP_FAST_REDUCTION_BARRIER
6418  }
6419 #if KMP_FAST_REDUCTION_BARRIER
6420 #undef kmp_reduction_barrier_release_pat
6421 #undef kmp_reduction_barrier_gather_pat
6422 #undef kmp_reduction_barrier_release_bb
6423 #undef kmp_reduction_barrier_gather_bb
6424 #endif // KMP_FAST_REDUCTION_BARRIER
6425 #if KMP_MIC_SUPPORTED
6426  if (__kmp_mic_type == mic2) { // KNC
6427  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6428  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6429  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6430  1; // forkjoin release
6431  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6432  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6433  }
6434 #if KMP_FAST_REDUCTION_BARRIER
6435  if (__kmp_mic_type == mic2) { // KNC
6436  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6437  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6438  }
6439 #endif // KMP_FAST_REDUCTION_BARRIER
6440 #endif // KMP_MIC_SUPPORTED
6441 
6442 // From KMP_CHECKS initialization
6443 #ifdef KMP_DEBUG
6444  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6445 #else
6446  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6447 #endif
6448 
6449  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6450  __kmp_foreign_tp = TRUE;
6451 
6452  __kmp_global.g.g_dynamic = FALSE;
6453  __kmp_global.g.g_dynamic_mode = dynamic_default;
6454 
6455  __kmp_env_initialize(NULL);
6456 
6457 // Print all messages in message catalog for testing purposes.
6458 #ifdef KMP_DEBUG
6459  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6460  if (__kmp_str_match_true(val)) {
6461  kmp_str_buf_t buffer;
6462  __kmp_str_buf_init(&buffer);
6463  __kmp_i18n_dump_catalog(&buffer);
6464  __kmp_printf("%s", buffer.str);
6465  __kmp_str_buf_free(&buffer);
6466  }; // if
6467  __kmp_env_free(&val);
6468 #endif
6469 
6470  __kmp_threads_capacity =
6471  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6472  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6473  __kmp_tp_capacity = __kmp_default_tp_capacity(
6474  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6475 
6476  // If the library is shut down properly, both pools must be NULL. Just in
6477  // case, set them to NULL -- some memory may leak, but subsequent code will
6478  // work even if pools are not freed.
6479  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6480  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6481  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6482  __kmp_thread_pool = NULL;
6483  __kmp_thread_pool_insert_pt = NULL;
6484  __kmp_team_pool = NULL;
6485 
6486  /* Allocate all of the variable sized records */
6487  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6488  * expandable */
6489  /* Since allocation is cache-aligned, just add extra padding at the end */
6490  size =
6491  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6492  CACHE_LINE;
6493  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6494  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6495  sizeof(kmp_info_t *) * __kmp_threads_capacity);
6496 
6497  /* init thread counts */
6498  KMP_DEBUG_ASSERT(__kmp_all_nth ==
6499  0); // Asserts fail if the library is reinitializing and
6500  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6501  __kmp_all_nth = 0;
6502  __kmp_nth = 0;
6503 
6504  /* setup the uber master thread and hierarchy */
6505  gtid = __kmp_register_root(TRUE);
6506  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
6507  KMP_ASSERT(KMP_UBER_GTID(gtid));
6508  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6509 
6510  KMP_MB(); /* Flush all pending memory write invalidates. */
6511 
6512  __kmp_common_initialize();
6513 
6514 #if KMP_OS_UNIX
6515  /* invoke the child fork handler */
6516  __kmp_register_atfork();
6517 #endif
6518 
6519 #if !defined KMP_DYNAMIC_LIB
6520  {
6521  /* Invoke the exit handler when the program finishes; this is needed only for
6522  the static library. The dynamic library already has _fini and DllMain. */
6523  int rc = atexit(__kmp_internal_end_atexit);
6524  if (rc != 0) {
6525  __kmp_msg(kmp_ms_fatal, KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6526  __kmp_msg_null);
6527  }; // if
6528  }
6529 #endif
6530 
6531 #if KMP_HANDLE_SIGNALS
6532 #if KMP_OS_UNIX
6533  /* NOTE: make sure that this is called before the user installs their own
6534  signal handlers so that the user handlers are called first. This way they
6535  can return false, not call our handler, avoid terminating the library, and
6536  continue execution where they left off. */
6537  __kmp_install_signals(FALSE);
6538 #endif /* KMP_OS_UNIX */
6539 #if KMP_OS_WINDOWS
6540  __kmp_install_signals(TRUE);
6541 #endif /* KMP_OS_WINDOWS */
6542 #endif
6543 
6544  /* we have finished the serial initialization */
6545  __kmp_init_counter++;
6546 
6547  __kmp_init_serial = TRUE;
6548 
6549  if (__kmp_settings) {
6550  __kmp_env_print();
6551  }
6552 
6553 #if OMP_40_ENABLED
6554  if (__kmp_display_env || __kmp_display_env_verbose) {
6555  __kmp_env_print_2();
6556  }
6557 #endif // OMP_40_ENABLED
6558 
6559 #if OMPT_SUPPORT
6560  ompt_post_init();
6561 #endif
6562 
6563  KMP_MB();
6564 
6565  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6566 }
6567 
6568 void __kmp_serial_initialize(void) {
6569  if (__kmp_init_serial) {
6570  return;
6571  }
6572  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6573  if (__kmp_init_serial) {
6574  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6575  return;
6576  }
6577  __kmp_do_serial_initialize();
6578  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6579 }
6580 
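/* Illustrative sketch (not part of the runtime): __kmp_serial_initialize above,
   like __kmp_middle_initialize and __kmp_parallel_initialize below, uses a
   double-checked pattern: an unlocked fast-path test, then the bootstrap lock,
   then a second test under the lock so a thread that lost the race does not run
   the expensive initializer again. Generic form of the same pattern with
   hypothetical names (the runtime uses its bootstrap locks and TCR_4/TCW_SYNC_4
   macros rather than a pthread mutex and a volatile flag): */
#include <pthread.h>
static pthread_mutex_t example_init_lock = PTHREAD_MUTEX_INITIALIZER;
static volatile int example_init_done = 0; // stands in for __kmp_init_serial
static void example_do_init(void) { /* expensive one-time setup */ }
static void example_initialize_once(void) {
  if (example_init_done)
    return; // fast path: already initialized, no lock taken
  pthread_mutex_lock(&example_init_lock);
  if (!example_init_done) { // re-check: another thread may have initialized first
    example_do_init();
    example_init_done = 1;
  }
  pthread_mutex_unlock(&example_init_lock);
}
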
6581 static void __kmp_do_middle_initialize(void) {
6582  int i, j;
6583  int prev_dflt_team_nth;
6584 
6585  if (!__kmp_init_serial) {
6586  __kmp_do_serial_initialize();
6587  }
6588 
6589  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6590 
6591  // Save the previous value of __kmp_dflt_team_nth so that
6592  // we can avoid some reinitialization if it hasn't changed.
6593  prev_dflt_team_nth = __kmp_dflt_team_nth;
6594 
6595 #if KMP_AFFINITY_SUPPORTED
6596  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6597  // number of cores on the machine.
6598  __kmp_affinity_initialize();
6599 
6600  // Run through the __kmp_threads array and set the affinity mask
6601  // for each root thread that is currently registered with the RTL.
6602  for (i = 0; i < __kmp_threads_capacity; i++) {
6603  if (TCR_PTR(__kmp_threads[i]) != NULL) {
6604  __kmp_affinity_set_init_mask(i, TRUE);
6605  }
6606  }
6607 #endif /* KMP_AFFINITY_SUPPORTED */
6608 
6609  KMP_ASSERT(__kmp_xproc > 0);
6610  if (__kmp_avail_proc == 0) {
6611  __kmp_avail_proc = __kmp_xproc;
6612  }
6613 
6614  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6615  // correct them now
6616  j = 0;
6617  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6618  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6619  __kmp_avail_proc;
6620  j++;
6621  }
6622 
6623  if (__kmp_dflt_team_nth == 0) {
6624 #ifdef KMP_DFLT_NTH_CORES
6625  // Default #threads = #cores
6626  __kmp_dflt_team_nth = __kmp_ncores;
6627  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6628  "__kmp_ncores (%d)\n",
6629  __kmp_dflt_team_nth));
6630 #else
6631  // Default #threads = #available OS procs
6632  __kmp_dflt_team_nth = __kmp_avail_proc;
6633  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6634  "__kmp_avail_proc(%d)\n",
6635  __kmp_dflt_team_nth));
6636 #endif /* KMP_DFLT_NTH_CORES */
6637  }
6638 
6639  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6640  __kmp_dflt_team_nth = KMP_MIN_NTH;
6641  }
6642  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6643  __kmp_dflt_team_nth = __kmp_sys_max_nth;
6644  }
6645 
6646  // There's no harm in continuing if the following check fails,
6647  // but it indicates an error in the previous logic.
6648  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6649 
6650  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6651  // Run through the __kmp_threads array and set the num threads icv for each
6652  // root thread that is currently registered with the RTL (which has not
6653  // already explicitly set its nthreads-var with a call to
6654  // omp_set_num_threads()).
6655  for (i = 0; i < __kmp_threads_capacity; i++) {
6656  kmp_info_t *thread = __kmp_threads[i];
6657  if (thread == NULL)
6658  continue;
6659  if (thread->th.th_current_task->td_icvs.nproc != 0)
6660  continue;
6661 
6662  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6663  }
6664  }
6665  KA_TRACE(
6666  20,
6667  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6668  __kmp_dflt_team_nth));
6669 
6670 #ifdef KMP_ADJUST_BLOCKTIME
6671  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
6672  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6673  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6674  if (__kmp_nth > __kmp_avail_proc) {
6675  __kmp_zero_bt = TRUE;
6676  }
6677  }
6678 #endif /* KMP_ADJUST_BLOCKTIME */
6679 
6680  /* we have finished middle initialization */
6681  TCW_SYNC_4(__kmp_init_middle, TRUE);
6682 
6683  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6684 }
6685 
6686 void __kmp_middle_initialize(void) {
6687  if (__kmp_init_middle) {
6688  return;
6689  }
6690  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6691  if (__kmp_init_middle) {
6692  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6693  return;
6694  }
6695  __kmp_do_middle_initialize();
6696  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6697 }
6698 
6699 void __kmp_parallel_initialize(void) {
6700  int gtid = __kmp_entry_gtid(); // this might be a new root
6701 
6702  /* synchronize parallel initialization (for sibling threads) */
6703  if (TCR_4(__kmp_init_parallel))
6704  return;
6705  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6706  if (TCR_4(__kmp_init_parallel)) {
6707  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6708  return;
6709  }
6710 
6711  /* TODO reinitialization after we have already shut down */
6712  if (TCR_4(__kmp_global.g.g_done)) {
6713  KA_TRACE(
6714  10,
6715  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6716  __kmp_infinite_loop();
6717  }
6718 
6719  /* jc: The lock __kmp_initz_lock is already held, so calling
6720  __kmp_serial_initialize would cause a deadlock. So we call
6721  __kmp_do_serial_initialize directly. */
6722  if (!__kmp_init_middle) {
6723  __kmp_do_middle_initialize();
6724  }
6725 
6726  /* begin initialization */
6727  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6728  KMP_ASSERT(KMP_UBER_GTID(gtid));
6729 
6730 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6731  // Save the FP control regs.
6732  // Worker threads will set theirs to these values at thread startup.
6733  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6734  __kmp_store_mxcsr(&__kmp_init_mxcsr);
6735  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6736 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6737 
6738 #if KMP_OS_UNIX
6739 #if KMP_HANDLE_SIGNALS
6740  /* must be after __kmp_serial_initialize */
6741  __kmp_install_signals(TRUE);
6742 #endif
6743 #endif
6744 
6745  __kmp_suspend_initialize();
6746 
6747 #if defined(USE_LOAD_BALANCE)
6748  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6749  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6750  }
6751 #else
6752  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6753  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6754  }
6755 #endif
6756 
6757  if (__kmp_version) {
6758  __kmp_print_version_2();
6759  }
6760 
6761  /* we have finished parallel initialization */
6762  TCW_SYNC_4(__kmp_init_parallel, TRUE);
6763 
6764  KMP_MB();
6765  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6766 
6767  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6768 }
6769 
6770 /* ------------------------------------------------------------------------ */
6771 
6772 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6773  kmp_team_t *team) {
6774  kmp_disp_t *dispatch;
6775 
6776  KMP_MB();
6777 
6778  /* none of the threads have encountered any constructs, yet. */
6779  this_thr->th.th_local.this_construct = 0;
6780 #if KMP_CACHE_MANAGE
6781  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6782 #endif /* KMP_CACHE_MANAGE */
6783  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6784  KMP_DEBUG_ASSERT(dispatch);
6785  KMP_DEBUG_ASSERT(team->t.t_dispatch);
6786  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6787  // this_thr->th.th_info.ds.ds_tid ] );
6788 
6789  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6790 #if OMP_45_ENABLED
6791  dispatch->th_doacross_buf_idx =
6792  0; /* reset the doacross dispatch buffer counter */
6793 #endif
6794  if (__kmp_env_consistency_check)
6795  __kmp_push_parallel(gtid, team->t.t_ident);
6796 
6797  KMP_MB(); /* Flush all pending memory write invalidates. */
6798 }
6799 
6800 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6801  kmp_team_t *team) {
6802  if (__kmp_env_consistency_check)
6803  __kmp_pop_parallel(gtid, team->t.t_ident);
6804 
6805  __kmp_finish_implicit_task(this_thr);
6806 }
6807 
6808 int __kmp_invoke_task_func(int gtid) {
6809  int rc;
6810  int tid = __kmp_tid_from_gtid(gtid);
6811  kmp_info_t *this_thr = __kmp_threads[gtid];
6812  kmp_team_t *team = this_thr->th.th_team;
6813 
6814  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
6815 #if USE_ITT_BUILD
6816  if (__itt_stack_caller_create_ptr) {
6817  __kmp_itt_stack_callee_enter(
6818  (__itt_caller)
6819  team->t.t_stack_id); // inform ittnotify about entering user's code
6820  }
6821 #endif /* USE_ITT_BUILD */
6822 #if INCLUDE_SSC_MARKS
6823  SSC_MARK_INVOKING();
6824 #endif
6825 
6826 #if OMPT_SUPPORT
6827  void *dummy;
6828  void **exit_runtime_p;
6829  ompt_task_id_t my_task_id;
6830  ompt_parallel_id_t my_parallel_id;
6831 
6832  if (ompt_enabled) {
6833  exit_runtime_p = &(team->t.t_implicit_task_taskdata[tid]
6834  .ompt_task_info.frame.exit_runtime_frame);
6835  } else {
6836  exit_runtime_p = &dummy;
6837  }
6838 
6839 #if OMPT_TRACE
6840  my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
6841  my_parallel_id = team->t.ompt_team_info.parallel_id;
6842  if (ompt_enabled &&
6843  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
6844  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(my_parallel_id,
6845  my_task_id);
6846  }
6847 #endif
6848 #endif
6849 
6850  {
6851  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
6852  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
6853  rc =
6854  __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
6855  tid, (int)team->t.t_argc, (void **)team->t.t_argv
6856 #if OMPT_SUPPORT
6857  ,
6858  exit_runtime_p
6859 #endif
6860  );
6861 #if OMPT_SUPPORT
6862  *exit_runtime_p = NULL;
6863 #endif
6864  }
6865 
6866 #if USE_ITT_BUILD
6867  if (__itt_stack_caller_create_ptr) {
6868  __kmp_itt_stack_callee_leave(
6869  (__itt_caller)
6870  team->t.t_stack_id); // inform ittnotify about leaving user's code
6871  }
6872 #endif /* USE_ITT_BUILD */
6873  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
6874 
6875  return rc;
6876 }
6877 
6878 #if OMP_40_ENABLED
6879 void __kmp_teams_master(int gtid) {
6880  // This routine is called by all master threads in the teams construct
6881  kmp_info_t *thr = __kmp_threads[gtid];
6882  kmp_team_t *team = thr->th.th_team;
6883  ident_t *loc = team->t.t_ident;
6884  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
6885  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
6886  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
6887  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
6888  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
6889 // Launch the league of teams now, but do not let the workers execute
6890 // (they hang on the fork barrier until the next parallel region)
6891 #if INCLUDE_SSC_MARKS
6892  SSC_MARK_FORKING();
6893 #endif
6894  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
6895 #if OMPT_SUPPORT
6896  (void *)thr->th.th_teams_microtask, // "unwrapped" task
6897 #endif
6898  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
6899  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
6900 #if INCLUDE_SSC_MARKS
6901  SSC_MARK_JOINING();
6902 #endif
6903 
6904  // AC: the last parameter "1" eliminates the join barrier, which won't work
6905  // because the worker threads are in a fork barrier awaiting more parallel regions
6906  __kmp_join_call(loc, gtid
6907 #if OMPT_SUPPORT
6908  ,
6909  fork_context_intel
6910 #endif
6911  ,
6912  1);
6913 }
6914 
6915 int __kmp_invoke_teams_master(int gtid) {
6916  kmp_info_t *this_thr = __kmp_threads[gtid];
6917  kmp_team_t *team = this_thr->th.th_team;
6918 #if KMP_DEBUG
6919  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
6920  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
6921  (void *)__kmp_teams_master);
6922 #endif
6923  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
6924  __kmp_teams_master(gtid);
6925  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
6926  return 1;
6927 }
6928 #endif /* OMP_40_ENABLED */
6929 
6930 /* This sets the requested number of threads for the next parallel region
6931  encountered by this team. Since this should be enclosed in the forkjoin
6932  critical section, it should avoid race conditions with asymmetrical nested
6933  parallelism. */
6934 
6935 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
6936  kmp_info_t *thr = __kmp_threads[gtid];
6937 
6938  if (num_threads > 0)
6939  thr->th.th_set_nproc = num_threads;
6940 }
6941 
6942 #if OMP_40_ENABLED
6943 
6944 /* this sets the requested number of teams for the teams region and/or
6945  the number of threads for the next parallel region encountered */
6946 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
6947  int num_threads) {
6948  kmp_info_t *thr = __kmp_threads[gtid];
6949  KMP_DEBUG_ASSERT(num_teams >= 0);
6950  KMP_DEBUG_ASSERT(num_threads >= 0);
6951 
6952  if (num_teams == 0)
6953  num_teams = 1; // default number of teams is 1.
6954  if (num_teams > __kmp_max_nth) { // too many teams requested?
6955  if (!__kmp_reserve_warn) {
6956  __kmp_reserve_warn = 1;
6957  __kmp_msg(kmp_ms_warning,
6958  KMP_MSG(CantFormThrTeam, num_teams, __kmp_max_nth),
6959  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
6960  }
6961  num_teams = __kmp_max_nth;
6962  }
6963  // Set number of teams (number of threads in the outer "parallel" of the
6964  // teams)
6965  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
6966 
6967  // Remember the number of threads for inner parallel regions
6968  if (num_threads == 0) {
6969  if (!TCR_4(__kmp_init_middle))
6970  __kmp_middle_initialize(); // get __kmp_avail_proc calculated
6971  num_threads = __kmp_avail_proc / num_teams;
6972  if (num_teams * num_threads > __kmp_max_nth) {
6973  // adjust num_threads w/o warning as it is not a user setting
6974  num_threads = __kmp_max_nth / num_teams;
6975  }
6976  } else {
6977  if (num_teams * num_threads > __kmp_max_nth) {
6978  int new_threads = __kmp_max_nth / num_teams;
6979  if (!__kmp_reserve_warn) { // user asked for too many threads
6980  __kmp_reserve_warn = 1; // that conflicts with OMP_THREAD_LIMIT
6981  __kmp_msg(kmp_ms_warning,
6982  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
6983  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
6984  }
6985  num_threads = new_threads;
6986  }
6987  }
6988  thr->th.th_teams_size.nth = num_threads;
6989 }
6990 
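/* Illustrative sketch (not part of the runtime): the thread-count arithmetic in
   __kmp_push_num_teams above, with hypothetical values. Assume __kmp_avail_proc ==
   64 and __kmp_max_nth == 64. For num_teams == 4 with no explicit thread count,
   each team gets 64 / 4 == 16 threads. If the caller instead asked for 32 threads
   per team, 4 * 32 == 128 exceeds the 64-thread cap, so the request is trimmed to
   64 / 4 == 16 (with a one-time warning in the real code). */
static int example_threads_per_team(int avail_proc, int max_nth, int num_teams,
                                    int requested /* 0 == not specified */) {
  int num_threads = requested;
  if (num_threads == 0) {
    num_threads = avail_proc / num_teams; // spread available procs across teams
    if (num_teams * num_threads > max_nth)
      num_threads = max_nth / num_teams; // silent trim: not a user setting
  } else if (num_teams * num_threads > max_nth) {
    num_threads = max_nth / num_teams; // trim the explicit request to the cap
  }
  return num_threads;
}
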
6991 // Set the proc_bind var to use in the following parallel region.
6992 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
6993  kmp_info_t *thr = __kmp_threads[gtid];
6994  thr->th.th_set_proc_bind = proc_bind;
6995 }
6996 
6997 #endif /* OMP_40_ENABLED */
6998 
6999 /* Launch the worker threads into the microtask. */
7000 
7001 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7002  kmp_info_t *this_thr = __kmp_threads[gtid];
7003 
7004 #ifdef KMP_DEBUG
7005  int f;
7006 #endif /* KMP_DEBUG */
7007 
7008  KMP_DEBUG_ASSERT(team);
7009  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7010  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7011  KMP_MB(); /* Flush all pending memory write invalidates. */
7012 
7013  team->t.t_construct = 0; /* no single directives seen yet */
7014  team->t.t_ordered.dt.t_value =
7015  0; /* thread 0 enters the ordered section first */
7016 
7017  /* Reset the identifiers on the dispatch buffer */
7018  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7019  if (team->t.t_max_nproc > 1) {
7020  int i;
7021  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7022  team->t.t_disp_buffer[i].buffer_index = i;
7023 #if OMP_45_ENABLED
7024  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7025 #endif
7026  }
7027  } else {
7028  team->t.t_disp_buffer[0].buffer_index = 0;
7029 #if OMP_45_ENABLED
7030  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7031 #endif
7032  }
7033 
7034  KMP_MB(); /* Flush all pending memory write invalidates. */
7035  KMP_ASSERT(this_thr->th.th_team == team);
7036 
7037 #ifdef KMP_DEBUG
7038  for (f = 0; f < team->t.t_nproc; f++) {
7039  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7040  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7041  }
7042 #endif /* KMP_DEBUG */
7043 
7044  /* release the worker threads so they may begin working */
7045  __kmp_fork_barrier(gtid, 0);
7046 }
7047 
7048 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7049  kmp_info_t *this_thr = __kmp_threads[gtid];
7050 
7051  KMP_DEBUG_ASSERT(team);
7052  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7053  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7054  KMP_MB(); /* Flush all pending memory write invalidates. */
7055 
7056 /* Join barrier after fork */
7057 
7058 #ifdef KMP_DEBUG
7059  if (__kmp_threads[gtid] &&
7060  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7061  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7062  __kmp_threads[gtid]);
7063  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7064  "team->t.t_nproc=%d\n",
7065  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7066  team->t.t_nproc);
7067  __kmp_print_structure();
7068  }
7069  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7070  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7071 #endif /* KMP_DEBUG */
7072 
7073  __kmp_join_barrier(gtid); /* wait for everyone */
7074 
7075  KMP_MB(); /* Flush all pending memory write invalidates. */
7076  KMP_ASSERT(this_thr->th.th_team == team);
7077 }
7078 
7079 /* ------------------------------------------------------------------------ */
7080 
7081 #ifdef USE_LOAD_BALANCE
7082 
7083 // Return the number of worker threads actively spinning in the hot team, if
7084 // we are at the outermost level of parallelism. Otherwise, return 0.
7085 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7086  int i;
7087  int retval;
7088  kmp_team_t *hot_team;
7089 
7090  if (root->r.r_active) {
7091  return 0;
7092  }
7093  hot_team = root->r.r_hot_team;
7094  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7095  return hot_team->t.t_nproc - 1; // Don't count master thread
7096  }
7097 
7098  // Skip the master thread - it is accounted for elsewhere.
7099  retval = 0;
7100  for (i = 1; i < hot_team->t.t_nproc; i++) {
7101  if (hot_team->t.t_threads[i]->th.th_active) {
7102  retval++;
7103  }
7104  }
7105  return retval;
7106 }
7107 
7108 // Perform an automatic adjustment to the number of
7109 // threads used by the next parallel region.
7110 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7111  int retval;
7112  int pool_active;
7113  int hot_team_active;
7114  int team_curr_active;
7115  int system_active;
7116 
7117  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7118  set_nproc));
7119  KMP_DEBUG_ASSERT(root);
7120  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7121  ->th.th_current_task->td_icvs.dynamic == TRUE);
7122  KMP_DEBUG_ASSERT(set_nproc > 1);
7123 
7124  if (set_nproc == 1) {
7125  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7126  return 1;
7127  }
7128 
7129  // Threads that are active in the thread pool, active in the hot team for this
7130  // particular root (if we are at the outer par level), and the currently
7131  // executing thread (to become the master) are available to add to the new
7132  // team, but are currently contributing to the system load, and must be
7133  // accounted for.
7134  pool_active = TCR_4(__kmp_thread_pool_active_nth);
7135  hot_team_active = __kmp_active_hot_team_nproc(root);
7136  team_curr_active = pool_active + hot_team_active + 1;
7137 
7138  // Check the system load.
7139  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7140  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7141  "hot team active = %d\n",
7142  system_active, pool_active, hot_team_active));
7143 
7144  if (system_active < 0) {
7145  // There was an error reading the necessary info from /proc, so use the
7146  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7147  // = dynamic_thread_limit, we shouldn't wind up getting back here.
7148  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7149  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7150 
7151  // Make this call behave like the thread limit algorithm.
7152  retval = __kmp_avail_proc - __kmp_nth +
7153  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7154  if (retval > set_nproc) {
7155  retval = set_nproc;
7156  }
7157  if (retval < KMP_MIN_NTH) {
7158  retval = KMP_MIN_NTH;
7159  }
7160 
7161  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7162  retval));
7163  return retval;
7164  }
7165 
7166  // There is a slight delay in the load balance algorithm in detecting new
7167  // running procs. The real system load at this instant should be at least as
7168  // large as the number of active OMP threads available to add to the team.
7169  if (system_active < team_curr_active) {
7170  system_active = team_curr_active;
7171  }
7172  retval = __kmp_avail_proc - system_active + team_curr_active;
7173  if (retval > set_nproc) {
7174  retval = set_nproc;
7175  }
7176  if (retval < KMP_MIN_NTH) {
7177  retval = KMP_MIN_NTH;
7178  }
7179 
7180  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7181  return retval;
7182 } // __kmp_load_balance_nproc()
7183 
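/* Illustrative sketch (not part of the runtime): the formula above with
   hypothetical numbers. Suppose __kmp_avail_proc == 16, the thread pool has 2
   active threads, the hot team has 5 active workers, and the system reports 12
   running tasks. Then team_curr_active = 2 + 5 + 1 = 8, system_active stays 12,
   and retval = 16 - 12 + 8 = 12, which is then clipped to set_nproc on the high
   side and to KMP_MIN_NTH on the low side. */
static int example_load_balance_nproc(int avail_proc, int system_active,
                                      int pool_active, int hot_team_active,
                                      int set_nproc, int min_nth) {
  int team_curr_active = pool_active + hot_team_active + 1; // + current thread
  if (system_active < team_curr_active)
    system_active = team_curr_active; // /proc may lag behind our own activity
  int retval = avail_proc - system_active + team_curr_active;
  if (retval > set_nproc)
    retval = set_nproc;
  if (retval < min_nth)
    retval = min_nth;
  return retval;
}
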
7184 #endif /* USE_LOAD_BALANCE */
7185 
7186 /* ------------------------------------------------------------------------ */
7187 
7188 /* NOTE: this is called with the __kmp_init_lock held */
7189 void __kmp_cleanup(void) {
7190  int f;
7191 
7192  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7193 
7194  if (TCR_4(__kmp_init_parallel)) {
7195 #if KMP_HANDLE_SIGNALS
7196  __kmp_remove_signals();
7197 #endif
7198  TCW_4(__kmp_init_parallel, FALSE);
7199  }
7200 
7201  if (TCR_4(__kmp_init_middle)) {
7202 #if KMP_AFFINITY_SUPPORTED
7203  __kmp_affinity_uninitialize();
7204 #endif /* KMP_AFFINITY_SUPPORTED */
7205  __kmp_cleanup_hierarchy();
7206  TCW_4(__kmp_init_middle, FALSE);
7207  }
7208 
7209  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7210 
7211  if (__kmp_init_serial) {
7212  __kmp_runtime_destroy();
7213  __kmp_init_serial = FALSE;
7214  }
7215 
7216  for (f = 0; f < __kmp_threads_capacity; f++) {
7217  if (__kmp_root[f] != NULL) {
7218  __kmp_free(__kmp_root[f]);
7219  __kmp_root[f] = NULL;
7220  }
7221  }
7222  __kmp_free(__kmp_threads);
7223  // __kmp_threads and __kmp_root were allocated together as a single block, so
7224  // there is no need to free __kmp_root separately.
7225  __kmp_threads = NULL;
7226  __kmp_root = NULL;
7227  __kmp_threads_capacity = 0;
7228 
7229 #if KMP_USE_DYNAMIC_LOCK
7230  __kmp_cleanup_indirect_user_locks();
7231 #else
7232  __kmp_cleanup_user_locks();
7233 #endif
7234 
7235 #if KMP_AFFINITY_SUPPORTED
7236  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7237  __kmp_cpuinfo_file = NULL;
7238 #endif /* KMP_AFFINITY_SUPPORTED */
7239 
7240 #if KMP_USE_ADAPTIVE_LOCKS
7241 #if KMP_DEBUG_ADAPTIVE_LOCKS
7242  __kmp_print_speculative_stats();
7243 #endif
7244 #endif
7245  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7246  __kmp_nested_nth.nth = NULL;
7247  __kmp_nested_nth.size = 0;
7248  __kmp_nested_nth.used = 0;
7249  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7250  __kmp_nested_proc_bind.bind_types = NULL;
7251  __kmp_nested_proc_bind.size = 0;
7252  __kmp_nested_proc_bind.used = 0;
7253 
7254  __kmp_i18n_catclose();
7255 
7256 #if KMP_STATS_ENABLED
7257  __kmp_stats_fini();
7258 #endif
7259 
7260  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7261 }
7262 
7263 /* ------------------------------------------------------------------------ */
7264 
7265 int __kmp_ignore_mppbeg(void) {
7266  char *env;
7267 
7268  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7269  if (__kmp_str_match_false(env))
7270  return FALSE;
7271  }
7272  // By default __kmpc_begin() is a no-op.
7273  return TRUE;
7274 }
7275 
7276 int __kmp_ignore_mppend(void) {
7277  char *env;
7278 
7279  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7280  if (__kmp_str_match_false(env))
7281  return FALSE;
7282  }
7283  // By default __kmpc_end() is a no-op.
7284  return TRUE;
7285 }
7286 
7287 void __kmp_internal_begin(void) {
7288  int gtid;
7289  kmp_root_t *root;
7290 
7291  /* this is a very important step as it will register new sibling threads
7292  and assign these new uber threads a new gtid */
7293  gtid = __kmp_entry_gtid();
7294  root = __kmp_threads[gtid]->th.th_root;
7295  KMP_ASSERT(KMP_UBER_GTID(gtid));
7296 
7297  if (root->r.r_begin)
7298  return;
7299  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7300  if (root->r.r_begin) {
7301  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7302  return;
7303  }
7304 
7305  root->r.r_begin = TRUE;
7306 
7307  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7308 }
7309 
7310 /* ------------------------------------------------------------------------ */
7311 
7312 void __kmp_user_set_library(enum library_type arg) {
7313  int gtid;
7314  kmp_root_t *root;
7315  kmp_info_t *thread;
7316 
7317  /* first, make sure we are initialized so we can get our gtid */
7318 
7319  gtid = __kmp_entry_gtid();
7320  thread = __kmp_threads[gtid];
7321 
7322  root = thread->th.th_root;
7323 
7324  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7325  library_serial));
7326  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7327  thread */
7328  KMP_WARNING(SetLibraryIncorrectCall);
7329  return;
7330  }
7331 
7332  switch (arg) {
7333  case library_serial:
7334  thread->th.th_set_nproc = 0;
7335  set__nproc(thread, 1);
7336  break;
7337  case library_turnaround:
7338  thread->th.th_set_nproc = 0;
7339  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7340  : __kmp_dflt_team_nth_ub);
7341  break;
7342  case library_throughput:
7343  thread->th.th_set_nproc = 0;
7344  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7345  : __kmp_dflt_team_nth_ub);
7346  break;
7347  default:
7348  KMP_FATAL(UnknownLibraryType, arg);
7349  }
7350 
7351  __kmp_aux_set_library(arg);
7352 }
7353 
7354 void __kmp_aux_set_stacksize(size_t arg) {
7355  if (!__kmp_init_serial)
7356  __kmp_serial_initialize();
7357 
7358 #if KMP_OS_DARWIN
7359  if (arg & (0x1000 - 1)) {
7360  arg &= ~(0x1000 - 1);
7361  if (arg + 0x1000) /* check for overflow if we round up */
7362  arg += 0x1000;
7363  }
7364 #endif
7365  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7366 
7367  /* only change the default stacksize before the first parallel region */
7368  if (!TCR_4(__kmp_init_parallel)) {
7369  size_t value = arg; /* argument is in bytes */
7370 
7371  if (value < __kmp_sys_min_stksize)
7372  value = __kmp_sys_min_stksize;
7373  else if (value > KMP_MAX_STKSIZE)
7374  value = KMP_MAX_STKSIZE;
7375 
7376  __kmp_stksize = value;
7377 
7378  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7379  }
7380 
7381  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7382 }
7383 
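/* Illustrative sketch (not part of the runtime): the Darwin branch above rounds
   the requested stack size up to a multiple of 0x1000 (4 KiB) before it is
   clipped to the [__kmp_sys_min_stksize, KMP_MAX_STKSIZE] range. For a
   hypothetical request of 0x12345 bytes, the rounded value is 0x13000. A compact
   equivalent of just the rounding step (the overflow guard is omitted): */
#include <stddef.h>
static size_t example_round_up_to_page(size_t bytes) {
  const size_t page = 0x1000;             // 4 KiB, as in the code above
  if (bytes & (page - 1))                 // not already page-aligned?
    bytes = (bytes & ~(page - 1)) + page; // drop the remainder, add one page
  return bytes;                           // e.g. 0x12345 -> 0x13000
}
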
7384 /* set the behaviour of the runtime library */
7385 /* TODO this can cause some odd behaviour with sibling parallelism... */
7386 void __kmp_aux_set_library(enum library_type arg) {
7387  __kmp_library = arg;
7388 
7389  switch (__kmp_library) {
7390  case library_serial: {
7391  KMP_INFORM(LibraryIsSerial);
7392  (void)__kmp_change_library(TRUE);
7393  } break;
7394  case library_turnaround:
7395  (void)__kmp_change_library(TRUE);
7396  break;
7397  case library_throughput:
7398  (void)__kmp_change_library(FALSE);
7399  break;
7400  default:
7401  KMP_FATAL(UnknownLibraryType, arg);
7402  }
7403 }
7404 
7405 /* ------------------------------------------------------------------------ */
7406 
7407 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7408  int blocktime = arg; /* argument is in milliseconds */
7409 #if KMP_USE_MONITOR
7410  int bt_intervals;
7411 #endif
7412  int bt_set;
7413 
7414  __kmp_save_internal_controls(thread);
7415 
7416  /* Normalize and set blocktime for the teams */
7417  if (blocktime < KMP_MIN_BLOCKTIME)
7418  blocktime = KMP_MIN_BLOCKTIME;
7419  else if (blocktime > KMP_MAX_BLOCKTIME)
7420  blocktime = KMP_MAX_BLOCKTIME;
7421 
7422  set__blocktime_team(thread->th.th_team, tid, blocktime);
7423  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
7424 
7425 #if KMP_USE_MONITOR
7426  /* Calculate and set blocktime intervals for the teams */
7427  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7428 
7429  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
7430  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
7431 #endif
7432 
7433  /* Record that blocktime has been explicitly set */
7434  bt_set = TRUE;
7435 
7436  set__bt_set_team(thread->th.th_team, tid, bt_set);
7437  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
7438 #if KMP_USE_MONITOR
7439  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7440  "bt_intervals=%d, monitor_updates=%d\n",
7441  __kmp_gtid_from_tid(tid, thread->th.th_team),
7442  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7443  __kmp_monitor_wakeups));
7444 #else
7445  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7446  __kmp_gtid_from_tid(tid, thread->th.th_team),
7447  thread->th.th_team->t.t_id, tid, blocktime));
7448 #endif
7449 }
7450 
7451 void __kmp_aux_set_defaults(char const *str, int len) {
7452  if (!__kmp_init_serial) {
7453  __kmp_serial_initialize();
7454  };
7455  __kmp_env_initialize(str);
7456 
7457  if (__kmp_settings
7458 #if OMP_40_ENABLED
7459  || __kmp_display_env || __kmp_display_env_verbose
7460 #endif // OMP_40_ENABLED
7461  ) {
7462  __kmp_env_print();
7463  }
7464 } // __kmp_aux_set_defaults
7465 
7466 /* ------------------------------------------------------------------------ */
7467 /* internal fast reduction routines */
7468 
7469 PACKED_REDUCTION_METHOD_T
7470 __kmp_determine_reduction_method(
7471  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
7472  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7473  kmp_critical_name *lck) {
7474 
7475  // Default reduction method: critical construct ( lck != NULL, like in current
7476  // PAROPT )
7477  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
7478  // can be selected by RTL
7479  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
7480  // can be selected by RTL
7481  // Finally, it's up to the OpenMP RTL to decide which method to select among
7482  // those generated by PAROPT.
7483 
7484  PACKED_REDUCTION_METHOD_T retval;
7485 
7486  int team_size;
7487 
7488  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
7489  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
7490 
7491 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
7492  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
7493 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
7494 
7495  retval = critical_reduce_block;
7496 
7497  // another way of getting the team size (with one dynamic dereference) is slower
7498  team_size = __kmp_get_team_num_threads(global_tid);
7499  if (team_size == 1) {
7500 
7501  retval = empty_reduce_block;
7502 
7503  } else {
7504 
7505  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7506  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7507 
7508 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7509 
7510 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || \
7511  KMP_OS_DARWIN
7512 
7513  int teamsize_cutoff = 4;
7514 
7515 #if KMP_MIC_SUPPORTED
7516  if (__kmp_mic_type != non_mic) {
7517  teamsize_cutoff = 8;
7518  }
7519 #endif
7520  if (tree_available) {
7521  if (team_size <= teamsize_cutoff) {
7522  if (atomic_available) {
7523  retval = atomic_reduce_block;
7524  }
7525  } else {
7526  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7527  }
7528  } else if (atomic_available) {
7529  retval = atomic_reduce_block;
7530  }
7531 #else
7532 #error "Unknown or unsupported OS"
7533 #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||
7534 // KMP_OS_DARWIN
7535 
7536 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7537 
7538 #if KMP_OS_LINUX || KMP_OS_WINDOWS
7539 
7540  // basic tuning
7541 
7542  if (atomic_available) {
7543  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
7544  retval = atomic_reduce_block;
7545  }
7546  } // otherwise: use critical section
7547 
7548 #elif KMP_OS_DARWIN
7549 
7550  if (atomic_available && (num_vars <= 3)) {
7551  retval = atomic_reduce_block;
7552  } else if (tree_available) {
7553  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
7554  (reduce_size < (2000 * sizeof(kmp_real64)))) {
7555  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7556  }
7557  } // otherwise: use critical section
7558 
7559 #else
7560 #error "Unknown or unsupported OS"
7561 #endif
7562 
7563 #else
7564 #error "Unknown or unsupported architecture"
7565 #endif
7566  }
7567 
7568  // KMP_FORCE_REDUCTION
7569 
7570  // If the team is serialized (team_size == 1), ignore the forced reduction
7571  // method and stay with the unsynchronized method (empty_reduce_block)
7572  if (__kmp_force_reduction_method != reduction_method_not_defined &&
7573  team_size != 1) {
7574 
7575  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
7576 
7577  int atomic_available, tree_available;
7578 
7579  switch ((forced_retval = __kmp_force_reduction_method)) {
7580  case critical_reduce_block:
7581  KMP_ASSERT(lck); // lck should be != 0
7582  break;
7583 
7584  case atomic_reduce_block:
7585  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7586  if (!atomic_available) {
7587  KMP_WARNING(RedMethodNotSupported, "atomic");
7588  forced_retval = critical_reduce_block;
7589  }
7590  break;
7591 
7592  case tree_reduce_block:
7593  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7594  if (!tree_available) {
7595  KMP_WARNING(RedMethodNotSupported, "tree");
7596  forced_retval = critical_reduce_block;
7597  } else {
7598 #if KMP_FAST_REDUCTION_BARRIER
7599  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7600 #endif
7601  }
7602  break;
7603 
7604  default:
7605  KMP_ASSERT(0); // "unsupported method specified"
7606  }
7607 
7608  retval = forced_retval;
7609  }
7610 
7611  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
7612 
7613 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7614 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7615 
7616  return (retval);
7617 }
7618 
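/* Illustrative sketch (not part of the runtime): the x86_64 / Linux branch of the
   selection above, restated with hypothetical inputs (teamsize_cutoff == 4, i.e. a
   non-MIC host):
     team_size == 1                              -> empty_reduce_block
     team_size == 4,  tree and atomic generated  -> atomic_reduce_block
     team_size == 16, tree generated             -> tree reduction (reduction barrier)
     team_size == 16, only atomic generated      -> atomic_reduce_block
     neither generated                           -> critical_reduce_block
   The function below mirrors only that decision order; the enum values are
   stand-ins, not the runtime's packed method constants. */
typedef enum {
  EXAMPLE_CRITICAL,
  EXAMPLE_EMPTY,
  EXAMPLE_ATOMIC,
  EXAMPLE_TREE
} example_reduce_method_t;
static example_reduce_method_t
example_pick_reduction_method(int team_size, int atomic_available,
                              int tree_available, int teamsize_cutoff) {
  if (team_size == 1)
    return EXAMPLE_EMPTY; // serialized team: no synchronization needed
  if (tree_available) {
    if (team_size <= teamsize_cutoff)
      return atomic_available ? EXAMPLE_ATOMIC : EXAMPLE_CRITICAL;
    return EXAMPLE_TREE; // large team: tree reduction with the reduction barrier
  }
  return atomic_available ? EXAMPLE_ATOMIC : EXAMPLE_CRITICAL;
}
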
7619 // this function is for testing set/get/determine reduce method
7620 kmp_int32 __kmp_get_reduce_method(void) {
7621  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
7622 }