LLVM OpenMP* Runtime Library
kmp_runtime.c
1 /*
2  * kmp_runtime.c -- KPTS runtime support library
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_atomic.h"
18 #include "kmp_wrapper_getpid.h"
19 #include "kmp_environment.h"
20 #include "kmp_itt.h"
21 #include "kmp_str.h"
22 #include "kmp_settings.h"
23 #include "kmp_i18n.h"
24 #include "kmp_io.h"
25 #include "kmp_error.h"
26 #include "kmp_stats.h"
27 #include "kmp_wait_release.h"
28 
29 #if OMPT_SUPPORT
30 #include "ompt-specific.h"
31 #endif
32 
33 /* these are temporary issues to be dealt with */
34 #define KMP_USE_PRCTL 0
35 #define KMP_USE_POOLED_ALLOC 0
36 
37 #if KMP_OS_WINDOWS
38 #include <process.h>
39 #endif
40 
41 
42 #if defined(KMP_GOMP_COMPAT)
43 char const __kmp_version_alt_comp[] = KMP_VERSION_PREFIX "alternative compiler support: yes";
44 #endif /* defined(KMP_GOMP_COMPAT) */
45 
46 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
47 #if OMP_40_ENABLED
48  "4.0 (201307)";
49 #else
50  "3.1 (201107)";
51 #endif
52 
53 #ifdef KMP_DEBUG
54 char const __kmp_version_lock[] = KMP_VERSION_PREFIX "lock type: run time selectable";
55 #endif /* KMP_DEBUG */
56 
57 
58 #define KMP_MIN( x, y ) ( (x) < (y) ? (x) : (y) )
59 
60 /* ------------------------------------------------------------------------ */
61 /* ------------------------------------------------------------------------ */
62 
63 kmp_info_t __kmp_monitor;
64 
65 /* ------------------------------------------------------------------------ */
66 /* ------------------------------------------------------------------------ */
67 
68 /* Forward declarations */
69 
70 void __kmp_cleanup( void );
71 
72 static void __kmp_initialize_info( kmp_info_t *, kmp_team_t *, int tid, int gtid );
73 static void __kmp_initialize_team( kmp_team_t * team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t * loc );
74 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
75 static void __kmp_partition_places( kmp_team_t *team );
76 #endif
77 static void __kmp_do_serial_initialize( void );
78 void __kmp_fork_barrier( int gtid, int tid );
79 void __kmp_join_barrier( int gtid );
80 void __kmp_setup_icv_copy( kmp_team_t *team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t *loc );
81 
82 
83 #ifdef USE_LOAD_BALANCE
84 static int __kmp_load_balance_nproc( kmp_root_t * root, int set_nproc );
85 #endif
86 
87 static int __kmp_expand_threads(int nWish, int nNeed);
88 #if KMP_OS_WINDOWS
89 static int __kmp_unregister_root_other_thread( int gtid );
90 #endif
91 static void __kmp_unregister_library( void ); // called by __kmp_internal_end()
92 static void __kmp_reap_thread( kmp_info_t * thread, int is_root );
93 static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
94 
95 /* ------------------------------------------------------------------------ */
96 /* ------------------------------------------------------------------------ */
97 
98 /* Calculate the identifier of the current thread */
99 /* fast (and somewhat portable) way to get unique */
100 /* identifier of executing thread. */
101 /* returns KMP_GTID_DNE if we haven't been assigned a gtid */
102 
103 int
104 __kmp_get_global_thread_id( )
105 {
106  int i;
107  kmp_info_t **other_threads;
108  size_t stack_data;
109  char *stack_addr;
110  size_t stack_size;
111  char *stack_base;
112 
113  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
114  __kmp_nth, __kmp_all_nth ));
115 
116  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to a
117  parallel region, made it return KMP_GTID_DNE to force serial_initialize by
118  caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
119  __kmp_init_gtid for this to work. */
120 
121  if ( !TCR_4(__kmp_init_gtid) ) return KMP_GTID_DNE;
122 
123 #ifdef KMP_TDATA_GTID
124  if ( TCR_4(__kmp_gtid_mode) >= 3) {
125  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using TDATA\n" ));
126  return __kmp_gtid;
127  }
128 #endif
129  if ( TCR_4(__kmp_gtid_mode) >= 2) {
130  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using keyed TLS\n" ));
131  return __kmp_gtid_get_specific();
132  }
133  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using internal alg.\n" ));
134 
135  stack_addr = (char*) & stack_data;
136  other_threads = __kmp_threads;
137 
138  /*
139  ATT: The code below is a source of potential bugs due to unsynchronized access to
140  __kmp_threads array. For example:
141  1. Current thread loads other_threads[i] to thr and checks it, it is non-NULL.
142  2. Current thread is suspended by OS.
143  3. Another thread unregisters and finishes (debug versions of free() may fill memory
144  with something like 0xEF).
145  4. Current thread is resumed.
146  5. Current thread reads junk from *thr.
147  TODO: Fix it.
148  --ln
149  */
150 
151  for( i = 0 ; i < __kmp_threads_capacity ; i++ ) {
152 
153  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
154  if( !thr ) continue;
155 
156  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
157  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
158 
159  /* stack grows down -- search through all of the active threads */
160 
161  if( stack_addr <= stack_base ) {
162  size_t stack_diff = stack_base - stack_addr;
163 
164  if( stack_diff <= stack_size ) {
165  /* The only way we can be closer than the allocated */
166  /* stack size is if we are running on this thread. */
167  KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == i );
168  return i;
169  }
170  }
171  }
172 
173  /* fall back to keyed TLS (get_specific) to try to determine our gtid */
174  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: internal alg. failed to find "
175  "thread, using TLS\n" ));
176  i = __kmp_gtid_get_specific();
177 
178  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
179 
180  /* if we haven't been assigned a gtid, return the (negative) code */
181  if( i<0 ) return i;
182 
183  /* dynamically updated stack window for uber threads to avoid get_specific call */
184  if( ! TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow) ) {
185  KMP_FATAL( StackOverflow, i );
186  }
187 
188  stack_base = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
189  if( stack_addr > stack_base ) {
190  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
191  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
192  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - stack_base);
193  } else {
194  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, stack_base - stack_addr);
195  }
196 
197  /* Reprint stack bounds for ubermaster since they have been refined */
198  if ( __kmp_storage_map ) {
199  char *stack_end = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
200  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
201  __kmp_print_storage_map_gtid( i, stack_beg, stack_end,
202  other_threads[i]->th.th_info.ds.ds_stacksize,
203  "th_%d stack (refinement)", i );
204  }
205  return i;
206 }
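
/*
 * A minimal sketch (comment only, not compiled) of the stack-window test used in
 * the loop above, shown in isolation. A thread's stack grows down from
 * ds_stackbase, so an address belongs to that thread exactly when it lies in
 * [ stackbase - stacksize, stackbase ]. The helper name is local to this sketch.
 *
 *     static int __addr_on_stack( char *addr, char *base, size_t size ) {
 *         // stack grows down: addr must be at or below the base, and no more
 *         // than the allocated stack size away from it
 *         return addr <= base && (size_t)( base - addr ) <= size;
 *     }
 */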
207 
208 int
209 __kmp_get_global_thread_id_reg( )
210 {
211  int gtid;
212 
213  if ( !__kmp_init_serial ) {
214  gtid = KMP_GTID_DNE;
215  } else
216 #ifdef KMP_TDATA_GTID
217  if ( TCR_4(__kmp_gtid_mode) >= 3 ) {
218  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using TDATA\n" ));
219  gtid = __kmp_gtid;
220  } else
221 #endif
222  if ( TCR_4(__kmp_gtid_mode) >= 2 ) {
223  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using keyed TLS\n" ));
224  gtid = __kmp_gtid_get_specific();
225  } else {
226  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using internal alg.\n" ));
227  gtid = __kmp_get_global_thread_id();
228  }
229 
230  /* we must be a new uber master sibling thread */
231  if( gtid == KMP_GTID_DNE ) {
232  KA_TRACE( 10, ( "__kmp_get_global_thread_id_reg: Encountered new root thread. "
233  "Registering a new gtid.\n" ));
234  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
235  if( !__kmp_init_serial ) {
236  __kmp_do_serial_initialize();
237  gtid = __kmp_gtid_get_specific();
238  } else {
239  gtid = __kmp_register_root(FALSE);
240  }
241  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
242  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
243  }
244 
245  KMP_DEBUG_ASSERT( gtid >=0 );
246 
247  return gtid;
248 }
249 
250 /* caller must hold forkjoin_lock */
251 void
252 __kmp_check_stack_overlap( kmp_info_t *th )
253 {
254  int f;
255  char *stack_beg = NULL;
256  char *stack_end = NULL;
257  int gtid;
258 
259  KA_TRACE(10,("__kmp_check_stack_overlap: called\n"));
260  if ( __kmp_storage_map ) {
261  stack_end = (char *) th->th.th_info.ds.ds_stackbase;
262  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
263 
264  gtid = __kmp_gtid_from_thread( th );
265 
266  if (gtid == KMP_GTID_MONITOR) {
267  __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
268  "th_%s stack (%s)", "mon",
269  ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
270  } else {
271  __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
272  "th_%d stack (%s)", gtid,
273  ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
274  }
275  }
276 
277  /* No point in checking ubermaster threads since they use refinement and cannot overlap */
278  gtid = __kmp_gtid_from_thread( th );
279  if ( __kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid))
280  {
281  KA_TRACE(10,("__kmp_check_stack_overlap: performing extensive checking\n"));
282  if ( stack_beg == NULL ) {
283  stack_end = (char *) th->th.th_info.ds.ds_stackbase;
284  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
285  }
286 
287  for( f=0 ; f < __kmp_threads_capacity ; f++ ) {
288  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
289 
290  if( f_th && f_th != th ) {
291  char *other_stack_end = (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
292  char *other_stack_beg = other_stack_end -
293  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
294  if((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
295  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
296 
297  /* Print the other stack values before the abort */
298  if ( __kmp_storage_map )
299  __kmp_print_storage_map_gtid( -1, other_stack_beg, other_stack_end,
300  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
301  "th_%d stack (overlapped)",
302  __kmp_gtid_from_thread( f_th ) );
303 
304  __kmp_msg( kmp_ms_fatal, KMP_MSG( StackOverlap ), KMP_HNT( ChangeStackLimit ), __kmp_msg_null );
305  }
306  }
307  }
308  }
309  KA_TRACE(10,("__kmp_check_stack_overlap: returning\n"));
310 }
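
/*
 * A minimal sketch (comment only, not compiled) of the overlap test in the loop
 * above, reduced to two address ranges: two stacks overlap when either end of
 * this thread's range falls strictly inside the other thread's range. The helper
 * name is local to this sketch.
 *
 *     static int __stacks_overlap( char *beg, char *end,
 *                                  char *other_beg, char *other_end ) {
 *         return ( beg > other_beg && beg < other_end ) ||
 *                ( end > other_beg && end < other_end );
 *     }
 */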
311 
312 
313 /* ------------------------------------------------------------------------ */
314 
315 /* ------------------------------------------------------------------------ */
316 
317 void
318 __kmp_infinite_loop( void )
319 {
320  static int done = FALSE;
321 
322  while (! done) {
323  KMP_YIELD( 1 );
324  }
325 }
326 
327 #define MAX_MESSAGE 512
328 
329 void
330 __kmp_print_storage_map_gtid( int gtid, void *p1, void *p2, size_t size, char const *format, ...) {
331  char buffer[MAX_MESSAGE];
332  va_list ap;
333 
334  va_start( ap, format);
335  KMP_SNPRINTF( buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, p2, (unsigned long) size, format );
336  __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
337  __kmp_vprintf( kmp_err, buffer, ap );
338 #if KMP_PRINT_DATA_PLACEMENT
339  int node;
340  if(gtid >= 0) {
341  if(p1 <= p2 && (char*)p2 - (char*)p1 == size) {
342  if( __kmp_storage_map_verbose ) {
343  node = __kmp_get_host_node(p1);
344  if(node < 0) /* doesn't work, so don't try this next time */
345  __kmp_storage_map_verbose = FALSE;
346  else {
347  char *last;
348  int lastNode;
349  int localProc = __kmp_get_cpu_from_gtid(gtid);
350 
351  p1 = (void *)( (size_t)p1 & ~((size_t)PAGE_SIZE - 1) );
352  p2 = (void *)( ((size_t) p2 - 1) & ~((size_t)PAGE_SIZE - 1) );
353  if(localProc >= 0)
354  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, localProc>>1);
355  else
356  __kmp_printf_no_lock(" GTID %d\n", gtid);
357 # if KMP_USE_PRCTL
358 /* The more elaborate format is disabled for now because of the prctl hanging bug. */
359  do {
360  last = p1;
361  lastNode = node;
362  /* This loop collates adjacent pages with the same host node. */
363  do {
364  (char*)p1 += PAGE_SIZE;
365  } while(p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
366  __kmp_printf_no_lock(" %p-%p memNode %d\n", last,
367  (char*)p1 - 1, lastNode);
368  } while(p1 <= p2);
369 # else
370  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
371  (char*)p1 + (PAGE_SIZE - 1), __kmp_get_host_node(p1));
372  if(p1 < p2) {
373  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
374  (char*)p2 + (PAGE_SIZE - 1), __kmp_get_host_node(p2));
375  }
376 # endif
377  }
378  }
379  } else
380  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR( StorageMapWarning ) );
381  }
382 #endif /* KMP_PRINT_DATA_PLACEMENT */
383  __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
384 }
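
/*
 * A minimal sketch (comment only, not compiled) of the page rounding used in the
 * data-placement branch above: masking with ~(PAGE_SIZE - 1) rounds an address
 * down to the start of its page, assuming PAGE_SIZE is a power of two.
 *
 *     static void *__page_align_down( void *p, size_t page_size ) {
 *         return (void *)( (size_t)p & ~( page_size - 1 ) );
 *     }
 *
 *     e.g. with a 4096-byte page:  0x7f0012345678 & ~0xfff  ==  0x7f0012345000
 */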
385 
386 void
387 __kmp_warn( char const * format, ... )
388 {
389  char buffer[MAX_MESSAGE];
390  va_list ap;
391 
392  if ( __kmp_generate_warnings == kmp_warnings_off ) {
393  return;
394  }
395 
396  va_start( ap, format );
397 
398  KMP_SNPRINTF( buffer, sizeof(buffer) , "OMP warning: %s\n", format );
399  __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
400  __kmp_vprintf( kmp_err, buffer, ap );
401  __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
402 
403  va_end( ap );
404 }
405 
406 void
407 __kmp_abort_process()
408 {
409 
410  // Later threads may stall here, but that's ok because abort() will kill them.
411  __kmp_acquire_bootstrap_lock( & __kmp_exit_lock );
412 
413  if ( __kmp_debug_buf ) {
414  __kmp_dump_debug_buffer();
415  }; // if
416 
417  if ( KMP_OS_WINDOWS ) {
418  // Let other threads know of abnormal termination and prevent deadlock
419  // if abort happened during library initialization or shutdown
420  __kmp_global.g.g_abort = SIGABRT;
421 
422  /*
423  On Windows* OS, abort() by default raises a pop-up error box, which stalls nightly testing.
424  Unfortunately, we cannot reliably suppress pop-up error boxes. _set_abort_behavior()
425  works well, but this function is not available in VS7 (this is not a problem for the DLL,
426  but it is a problem for the static OpenMP RTL). SetErrorMode (and so the timelimit utility)
427  does not help, at least in some versions of the MS C RTL.
428 
429  It seems the following sequence is the only way to simulate abort() and avoid the pop-up
430  error box.
431  */
432  raise( SIGABRT );
433  _exit( 3 ); // Just in case, if signal ignored, exit anyway.
434  } else {
435  abort();
436  }; // if
437 
438  __kmp_infinite_loop();
439  __kmp_release_bootstrap_lock( & __kmp_exit_lock );
440 
441 } // __kmp_abort_process
442 
443 void
444 __kmp_abort_thread( void )
445 {
446  // TODO: Eliminate g_abort global variable and this function.
447  // In case of abort just call abort(), it will kill all the threads.
448  __kmp_infinite_loop();
449 } // __kmp_abort_thread
450 
451 /* ------------------------------------------------------------------------ */
452 
453 /*
454  * Print out the storage map for the major kmp_info_t thread data structures
455  * that are allocated together.
456  */
457 
458 static void
459 __kmp_print_thread_storage_map( kmp_info_t *thr, int gtid )
460 {
461  __kmp_print_storage_map_gtid( gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", gtid );
462 
463  __kmp_print_storage_map_gtid( gtid, &thr->th.th_info, &thr->th.th_team, sizeof(kmp_desc_t),
464  "th_%d.th_info", gtid );
465 
466  __kmp_print_storage_map_gtid( gtid, &thr->th.th_local, &thr->th.th_pri_head, sizeof(kmp_local_t),
467  "th_%d.th_local", gtid );
468 
469  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
470  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid );
471 
472  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_plain_barrier],
473  &thr->th.th_bar[bs_plain_barrier+1],
474  sizeof(kmp_balign_t), "th_%d.th_bar[plain]", gtid);
475 
476  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_forkjoin_barrier],
477  &thr->th.th_bar[bs_forkjoin_barrier+1],
478  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", gtid);
479 
480  #if KMP_FAST_REDUCTION_BARRIER
481  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_reduction_barrier],
482  &thr->th.th_bar[bs_reduction_barrier+1],
483  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", gtid);
484  #endif // KMP_FAST_REDUCTION_BARRIER
485 }
486 
487 /*
488  * Print out the storage map for the major kmp_team_t team data structures
489  * that are allocated together.
490  */
491 
492 static void
493 __kmp_print_team_storage_map( const char *header, kmp_team_t *team, int team_id, int num_thr )
494 {
495  int num_disp_buff = team->t.t_max_nproc > 1 ? KMP_MAX_DISP_BUF : 2;
496  __kmp_print_storage_map_gtid( -1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
497  header, team_id );
498 
499  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[0], &team->t.t_bar[bs_last_barrier],
500  sizeof(kmp_balign_team_t) * bs_last_barrier, "%s_%d.t_bar", header, team_id );
501 
502 
503  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_plain_barrier], &team->t.t_bar[bs_plain_barrier+1],
504  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", header, team_id );
505 
506  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_forkjoin_barrier], &team->t.t_bar[bs_forkjoin_barrier+1],
507  sizeof(kmp_balign_team_t), "%s_%d.t_bar[forkjoin]", header, team_id );
508 
509  #if KMP_FAST_REDUCTION_BARRIER
510  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_reduction_barrier], &team->t.t_bar[bs_reduction_barrier+1],
511  sizeof(kmp_balign_team_t), "%s_%d.t_bar[reduction]", header, team_id );
512  #endif // KMP_FAST_REDUCTION_BARRIER
513 
514  __kmp_print_storage_map_gtid( -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
515  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id );
516 
517  __kmp_print_storage_map_gtid( -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
518  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id );
519 
520  __kmp_print_storage_map_gtid( -1, &team->t.t_disp_buffer[0], &team->t.t_disp_buffer[num_disp_buff],
521  sizeof(dispatch_shared_info_t) * num_disp_buff, "%s_%d.t_disp_buffer",
522  header, team_id );
523 
524  /*
525  __kmp_print_storage_map_gtid( -1, &team->t.t_set_nproc[0], &team->t.t_set_nproc[num_thr],
526  sizeof(int) * num_thr, "%s_%d.t_set_nproc", header, team_id );
527 
528  __kmp_print_storage_map_gtid( -1, &team->t.t_set_dynamic[0], &team->t.t_set_dynamic[num_thr],
529  sizeof(int) * num_thr, "%s_%d.t_set_dynamic", header, team_id );
530 
531  __kmp_print_storage_map_gtid( -1, &team->t.t_set_nested[0], &team->t.t_set_nested[num_thr],
532  sizeof(int) * num_thr, "%s_%d.t_set_nested", header, team_id );
533 
534  __kmp_print_storage_map_gtid( -1, &team->t.t_set_blocktime[0], &team->t.t_set_blocktime[num_thr],
535  sizeof(int) * num_thr, "%s_%d.t_set_nproc", header, team_id );
536 
537  __kmp_print_storage_map_gtid( -1, &team->t.t_set_bt_intervals[0], &team->t.t_set_bt_intervals[num_thr],
538  sizeof(int) * num_thr, "%s_%d.t_set_dynamic", header, team_id );
539 
540  __kmp_print_storage_map_gtid( -1, &team->t.t_set_bt_set[0], &team->t.t_set_bt_set[num_thr],
541  sizeof(int) * num_thr, "%s_%d.t_set_nested", header, team_id );
542 
543  //__kmp_print_storage_map_gtid( -1, &team->t.t_set_max_active_levels[0], &team->t.t_set_max_active_levels[num_thr],
544  // sizeof(int) * num_thr, "%s_%d.t_set_max_active_levels", header, team_id );
545 
546  __kmp_print_storage_map_gtid( -1, &team->t.t_set_sched[0], &team->t.t_set_sched[num_thr],
547  sizeof(kmp_r_sched_t) * num_thr, "%s_%d.t_set_sched", header, team_id );
548 #if OMP_40_ENABLED
549  __kmp_print_storage_map_gtid( -1, &team->t.t_set_proc_bind[0], &team->t.t_set_proc_bind[num_thr],
550  sizeof(kmp_proc_bind_t) * num_thr, "%s_%d.t_set_proc_bind", header, team_id );
551 #endif
552  */
553 
554  __kmp_print_storage_map_gtid( -1, &team->t.t_taskq, &team->t.t_copypriv_data,
555  sizeof(kmp_taskq_t), "%s_%d.t_taskq", header, team_id );
556 }
557 
558 static void __kmp_init_allocator() {}
559 static void __kmp_fini_allocator() {}
560 
561 /* ------------------------------------------------------------------------ */
562 
563 #ifdef KMP_DYNAMIC_LIB
564 # if KMP_OS_WINDOWS
565 
566 
567 static void
568 __kmp_reset_lock( kmp_bootstrap_lock_t* lck ) {
569  // TODO: Change to __kmp_break_bootstrap_lock().
570  __kmp_init_bootstrap_lock( lck ); // make the lock released
571 }
572 
573 static void
574 __kmp_reset_locks_on_process_detach( int gtid_req ) {
575  int i;
576  int thread_count;
577 
578  // PROCESS_DETACH is expected to be called by a thread
579  // that executes ProcessExit() or FreeLibrary().
580  // OS terminates other threads (except the one calling ProcessExit or FreeLibrary).
581  // So, it might seem safe to access __kmp_threads[] without taking the forkjoin_lock.
582  // However, some threads can still be alive here, even though they are about to be terminated.
583  // The threads in the array with ds_thread==0 are the most suspicious.
584  // So accessing __kmp_threads[] may actually be unsafe.
585 
586  // TODO: does it make sense to check __kmp_roots[] ?
587 
588  // Let's check that there are no other alive threads registered with the OMP lib.
589  while( 1 ) {
590  thread_count = 0;
591  for( i = 0; i < __kmp_threads_capacity; ++i ) {
592  if( !__kmp_threads ) continue;
593  kmp_info_t* th = __kmp_threads[ i ];
594  if( th == NULL ) continue;
595  int gtid = th->th.th_info.ds.ds_gtid;
596  if( gtid == gtid_req ) continue;
597  if( gtid < 0 ) continue;
598  DWORD exit_val;
599  int alive = __kmp_is_thread_alive( th, &exit_val );
600  if( alive ) {
601  ++thread_count;
602  }
603  }
604  if( thread_count == 0 ) break; // success
605  }
606 
607  // Assume that I'm alone.
608 
609  // Now it is probably safe to check and reset the locks.
610  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
611  __kmp_reset_lock( &__kmp_forkjoin_lock );
612  #ifdef KMP_DEBUG
613  __kmp_reset_lock( &__kmp_stdio_lock );
614  #endif // KMP_DEBUG
615 
616 
617 }
618 
619 BOOL WINAPI
620 DllMain( HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved ) {
621  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
622 
623  switch( fdwReason ) {
624 
625  case DLL_PROCESS_ATTACH:
626  KA_TRACE( 10, ("DllMain: PROCESS_ATTACH\n" ));
627 
628  return TRUE;
629 
630  case DLL_PROCESS_DETACH:
631  KA_TRACE( 10, ("DllMain: PROCESS_DETACH T#%d\n",
632  __kmp_gtid_get_specific() ));
633 
634  if( lpReserved != NULL )
635  {
636  // lpReserved is used for telling the difference:
637  // lpReserved == NULL when FreeLibrary() was called,
638  // lpReserved != NULL when the process terminates.
639  // When FreeLibrary() is called, worker threads remain alive.
640  // So they will release the forkjoin lock by themselves.
641  // When the process terminates, worker threads disappear triggering
642  // the problem of unreleased forkjoin lock as described below.
643 
644  // A worker thread can take the forkjoin lock
645  // in __kmp_suspend_template()->__kmp_rml_decrease_load_before_sleep().
646  // The problem comes up if that worker thread becomes dead
647  // before it releases the forkjoin lock.
648  // The forkjoin lock remains taken, while the thread
649  // executing DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below
650  // will try to take the forkjoin lock and will always fail,
651  // so that the application will never finish [normally].
652  // This scenario is possible if __kmpc_end() has not been executed.
653  // This is not just a corner case; it happens in common situations:
654  // - the main function was compiled by an alternative compiler;
655  // - the main function was compiled by icl but without /Qopenmp (application with plugins);
656  // - application terminates by calling C exit(), Fortran CALL EXIT() or Fortran STOP.
657  // - alive foreign thread prevented __kmpc_end from doing cleanup.
658 
659  // This is a hack to work around the problem.
660  // TODO: !!! to figure out something better.
661  __kmp_reset_locks_on_process_detach( __kmp_gtid_get_specific() );
662  }
663 
664  __kmp_internal_end_library( __kmp_gtid_get_specific() );
665 
666  return TRUE;
667 
668  case DLL_THREAD_ATTACH:
669  KA_TRACE( 10, ("DllMain: THREAD_ATTACH\n" ));
670 
671  /* if we wanted to register new siblings all the time here call
672  * __kmp_get_gtid(); */
673  return TRUE;
674 
675  case DLL_THREAD_DETACH:
676  KA_TRACE( 10, ("DllMain: THREAD_DETACH T#%d\n",
677  __kmp_gtid_get_specific() ));
678 
679  __kmp_internal_end_thread( __kmp_gtid_get_specific() );
680  return TRUE;
681  }
682 
683  return TRUE;
684 }
685 
686 # endif /* KMP_OS_WINDOWS */
687 #endif /* KMP_DYNAMIC_LIB */
688 
689 
690 /* ------------------------------------------------------------------------ */
691 
692 /* Change the library type to "status" and return the old type */
693 /* called from within initialization routines where __kmp_initz_lock is held */
694 int
695 __kmp_change_library( int status )
696 {
697  int old_status;
698 
699  old_status = __kmp_yield_init & 1; // check whether KMP_LIBRARY=throughput (even init count)
700 
701  if (status) {
702  __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
703  }
704  else {
705  __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
706  }
707 
708  return old_status; // return previous setting of whether KMP_LIBRARY=throughput
709 }
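
/*
 * A minimal sketch (comment only, not compiled) of the encoding used above: only
 * bit 0 of __kmp_yield_init carries the library mode, the remaining bits hold
 * the yield init count. Assuming that layout:
 *
 *     even count (bit 0 == 0)  ->  KMP_LIBRARY=throughput
 *     odd  count (bit 0 == 1)  ->  KMP_LIBRARY=turnaround
 *
 *     int old = counter & 1;    // read the current mode
 *     counter |=  1;            // throughput  -> turnaround
 *     counter &= ~1;            // turnaround  -> throughput
 */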
710 
711 /* ------------------------------------------------------------------------ */
712 /* ------------------------------------------------------------------------ */
713 
714 /* __kmp_parallel_deo --
715  * Wait until it's our turn.
716  */
717 void
718 __kmp_parallel_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
719 {
720  int gtid = *gtid_ref;
721 #ifdef BUILD_PARALLEL_ORDERED
722  kmp_team_t *team = __kmp_team_from_gtid( gtid );
723 #endif /* BUILD_PARALLEL_ORDERED */
724 
725  if( __kmp_env_consistency_check ) {
726  if( __kmp_threads[gtid]->th.th_root->r.r_active )
727 #if KMP_USE_DYNAMIC_LOCK
728  __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL, 0 );
729 #else
730  __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL );
731 #endif
732  }
733 #ifdef BUILD_PARALLEL_ORDERED
734  if( !team->t.t_serialized ) {
735  KMP_MB();
736  KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid( gtid ), KMP_EQ, NULL);
737  KMP_MB();
738  }
739 #endif /* BUILD_PARALLEL_ORDERED */
740 }
741 
742 /* __kmp_parallel_dxo --
743  * Signal the next task.
744  */
745 
746 void
747 __kmp_parallel_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
748 {
749  int gtid = *gtid_ref;
750 #ifdef BUILD_PARALLEL_ORDERED
751  int tid = __kmp_tid_from_gtid( gtid );
752  kmp_team_t *team = __kmp_team_from_gtid( gtid );
753 #endif /* BUILD_PARALLEL_ORDERED */
754 
755  if( __kmp_env_consistency_check ) {
756  if( __kmp_threads[gtid]->th.th_root->r.r_active )
757  __kmp_pop_sync( gtid, ct_ordered_in_parallel, loc_ref );
758  }
759 #ifdef BUILD_PARALLEL_ORDERED
760  if ( ! team->t.t_serialized ) {
761  KMP_MB(); /* Flush all pending memory write invalidates. */
762 
763  /* use the tid of the next thread in this team */
764  /* TODO: replace with a general release procedure */
765  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc );
766 
767 #if OMPT_SUPPORT && OMPT_BLAME
768  if ((ompt_status == ompt_status_track_callback) &&
769  ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
770  /* accept blame for "ordered" waiting */
771  kmp_info_t *this_thread = __kmp_threads[gtid];
772  ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
773  this_thread->th.ompt_thread_info.wait_id);
774  }
775 #endif
776 
777  KMP_MB(); /* Flush all pending memory write invalidates. */
778  }
779 #endif /* BUILD_PARALLEL_ORDERED */
780 }
781 
782 /* ------------------------------------------------------------------------ */
783 /* ------------------------------------------------------------------------ */
784 
785 /* ------------------------------------------------------------------------ */
786 /* ------------------------------------------------------------------------ */
787 
788 /* The BARRIER for a SINGLE process section is always explicit */
789 
790 int
791 __kmp_enter_single( int gtid, ident_t *id_ref, int push_ws )
792 {
793  int status;
794  kmp_info_t *th;
795  kmp_team_t *team;
796 
797  if( ! TCR_4(__kmp_init_parallel) )
798  __kmp_parallel_initialize();
799 
800  th = __kmp_threads[ gtid ];
801  team = th->th.th_team;
802  status = 0;
803 
804  th->th.th_ident = id_ref;
805 
806  if ( team->t.t_serialized ) {
807  status = 1;
808  } else {
809  kmp_int32 old_this = th->th.th_local.this_construct;
810 
811  ++th->th.th_local.this_construct;
812  /* try to set team count to thread count--success means thread got the
813  single block
814  */
815  /* TODO: Should this be acquire or release? */
816  status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
817  th->th.th_local.this_construct);
818 #if USE_ITT_BUILD
819  if ( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && KMP_MASTER_GTID(gtid) &&
820 #if OMP_40_ENABLED
821  th->th.th_teams_microtask == NULL &&
822 #endif
823  team->t.t_active_level == 1 )
824  { // Only report metadata by master of active team at level 1
825  __kmp_itt_metadata_single( id_ref );
826  }
827 #endif /* USE_ITT_BUILD */
828  }
829 
830  if( __kmp_env_consistency_check ) {
831  if (status && push_ws) {
832  __kmp_push_workshare( gtid, ct_psingle, id_ref );
833  } else {
834  __kmp_check_workshare( gtid, ct_psingle, id_ref );
835  }
836  }
837 #if USE_ITT_BUILD
838  if ( status ) {
839  __kmp_itt_single_start( gtid );
840  }
841 #endif /* USE_ITT_BUILD */
842  return status;
843 }
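
/*
 * A minimal sketch (comment only, not compiled) of how the compare-and-swap
 * above elects exactly one winner for the SINGLE block. Each thread expects the
 * shared per-team counter to still hold its own previous construct number; only
 * the first thread to arrive sees that value and advances it, so only that
 * thread gets status == 1. The cas32() below is a stand-in for
 * KMP_COMPARE_AND_STORE_ACQ32 and is named only for this sketch.
 *
 *     int won = cas32( &team_construct_count,        // shared, one per team
 *                      my_previous_construct,        // expected old value
 *                      my_previous_construct + 1 );  // new value
 *     if ( won ) {
 *         // this thread executes the single region; the others skip it
 *     }
 */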
844 
845 void
846 __kmp_exit_single( int gtid )
847 {
848 #if USE_ITT_BUILD
849  __kmp_itt_single_end( gtid );
850 #endif /* USE_ITT_BUILD */
851  if( __kmp_env_consistency_check )
852  __kmp_pop_workshare( gtid, ct_psingle, NULL );
853 }
854 
855 
856 /*
857  * determine if we can go parallel or must use a serialized parallel region and
858  * how many threads we can use
859  * set_nthreads is the number of threads requested for the team
860  * returns 1 if we should serialize or only use one thread,
861  * otherwise the number of threads to use
862  * The forkjoin lock is held by the caller.
863  */
864 static int
865 __kmp_reserve_threads( kmp_root_t *root, kmp_team_t *parent_team,
866  int master_tid, int set_nthreads
867 #if OMP_40_ENABLED
868  , int enter_teams
869 #endif /* OMP_40_ENABLED */
870 )
871 {
872  int capacity;
873  int new_nthreads;
874  KMP_DEBUG_ASSERT( __kmp_init_serial );
875  KMP_DEBUG_ASSERT( root && parent_team );
876 
877  //
878  // Initial check to see if we should use a serialized team.
879  //
880  if ( set_nthreads == 1 ) {
881  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d reserving 1 thread; requested %d threads\n",
882  __kmp_get_gtid(), set_nthreads ));
883  return 1;
884  }
885  if ( ( !get__nested_2(parent_team,master_tid) && (root->r.r_in_parallel
886 #if OMP_40_ENABLED
887  && !enter_teams
888 #endif /* OMP_40_ENABLED */
889  ) ) || ( __kmp_library == library_serial ) ) {
890  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team; requested %d threads\n",
891  __kmp_get_gtid(), set_nthreads ));
892  return 1;
893  }
894 
895  //
896  // If dyn-var is set, dynamically adjust the number of desired threads,
897  // according to the method specified by dynamic_mode.
898  //
899  new_nthreads = set_nthreads;
900  if ( ! get__dynamic_2( parent_team, master_tid ) ) {
901  ;
902  }
903 #ifdef USE_LOAD_BALANCE
904  else if ( __kmp_global.g.g_dynamic_mode == dynamic_load_balance ) {
905  new_nthreads = __kmp_load_balance_nproc( root, set_nthreads );
906  if ( new_nthreads == 1 ) {
907  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to 1 thread\n",
908  master_tid ));
909  return 1;
910  }
911  if ( new_nthreads < set_nthreads ) {
912  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to %d threads\n",
913  master_tid, new_nthreads ));
914  }
915  }
916 #endif /* USE_LOAD_BALANCE */
917  else if ( __kmp_global.g.g_dynamic_mode == dynamic_thread_limit ) {
918  new_nthreads = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
919  : root->r.r_hot_team->t.t_nproc);
920  if ( new_nthreads <= 1 ) {
921  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to 1 thread\n",
922  master_tid ));
923  return 1;
924  }
925  if ( new_nthreads < set_nthreads ) {
926  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to %d threads\n",
927  master_tid, new_nthreads ));
928  }
929  else {
930  new_nthreads = set_nthreads;
931  }
932  }
933  else if ( __kmp_global.g.g_dynamic_mode == dynamic_random ) {
934  if ( set_nthreads > 2 ) {
935  new_nthreads = __kmp_get_random( parent_team->t.t_threads[master_tid] );
936  new_nthreads = ( new_nthreads % set_nthreads ) + 1;
937  if ( new_nthreads == 1 ) {
938  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to 1 thread\n",
939  master_tid ));
940  return 1;
941  }
942  if ( new_nthreads < set_nthreads ) {
943  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to %d threads\n",
944  master_tid, new_nthreads ));
945  }
946  }
947  }
948  else {
949  KMP_ASSERT( 0 );
950  }
951 
952  //
953  // Respect KMP_ALL_THREADS, KMP_MAX_THREADS, OMP_THREAD_LIMIT.
954  //
955  if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
956  root->r.r_hot_team->t.t_nproc ) > __kmp_max_nth ) {
957  int tl_nthreads = __kmp_max_nth - __kmp_nth + ( root->r.r_active ? 1 :
958  root->r.r_hot_team->t.t_nproc );
959  if ( tl_nthreads <= 0 ) {
960  tl_nthreads = 1;
961  }
962 
963  //
964  // If dyn-var is false, emit a 1-time warning.
965  //
966  if ( ! get__dynamic_2( parent_team, master_tid )
967  && ( ! __kmp_reserve_warn ) ) {
968  __kmp_reserve_warn = 1;
969  __kmp_msg(
970  kmp_ms_warning,
971  KMP_MSG( CantFormThrTeam, set_nthreads, tl_nthreads ),
972  KMP_HNT( Unset_ALL_THREADS ),
973  __kmp_msg_null
974  );
975  }
976  if ( tl_nthreads == 1 ) {
977  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to 1 thread\n",
978  master_tid ));
979  return 1;
980  }
981  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to %d threads\n",
982  master_tid, tl_nthreads ));
983  new_nthreads = tl_nthreads;
984  }
985 
986 
987  //
988  // Check if the threads array is large enough, or needs expanding.
989  //
990  // See comment in __kmp_register_root() about the adjustment if
991  // __kmp_threads[0] == NULL.
992  //
993  capacity = __kmp_threads_capacity;
994  if ( TCR_PTR(__kmp_threads[0]) == NULL ) {
995  --capacity;
996  }
997  if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
998  root->r.r_hot_team->t.t_nproc ) > capacity ) {
999  //
1000  // Expand the threads array.
1001  //
1002  int slotsRequired = __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
1003  root->r.r_hot_team->t.t_nproc ) - capacity;
1004  int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
1005  if ( slotsAdded < slotsRequired ) {
1006  //
1007  // The threads array was not expanded enough.
1008  //
1009  new_nthreads -= ( slotsRequired - slotsAdded );
1010  KMP_ASSERT( new_nthreads >= 1 );
1011 
1012  //
1013  // If dyn-var is false, emit a 1-time warning.
1014  //
1015  if ( ! get__dynamic_2( parent_team, master_tid )
1016  && ( ! __kmp_reserve_warn ) ) {
1017  __kmp_reserve_warn = 1;
1018  if ( __kmp_tp_cached ) {
1019  __kmp_msg(
1020  kmp_ms_warning,
1021  KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
1022  KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
1023  KMP_HNT( PossibleSystemLimitOnThreads ),
1024  __kmp_msg_null
1025  );
1026  }
1027  else {
1028  __kmp_msg(
1029  kmp_ms_warning,
1030  KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
1031  KMP_HNT( SystemLimitOnThreads ),
1032  __kmp_msg_null
1033  );
1034  }
1035  }
1036  }
1037  }
1038 
1039  if ( new_nthreads == 1 ) {
1040  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team after reclaiming dead roots and rechecking; requested %d threads\n",
1041  __kmp_get_gtid(), set_nthreads ) );
1042  return 1;
1043  }
1044 
1045  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d allocating %d threads; requested %d threads\n",
1046  __kmp_get_gtid(), new_nthreads, set_nthreads ));
1047  return new_nthreads;
1048 }
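
/*
 * Worked example (comment only, made-up numbers) of the KMP_ALL_THREADS /
 * OMP_THREAD_LIMIT clamp above: suppose __kmp_max_nth == 8, __kmp_nth == 5, the
 * root is active (so the master counts once), and set_nthreads == 6. Then
 *
 *     5 + 6 - 1 = 10 > 8,  so  tl_nthreads = 8 - 5 + 1 = 4
 *
 * and the reservation is reduced from 6 to 4 threads, with a one-time warning
 * if dyn-var is false.
 */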
1049 
1050 /* ------------------------------------------------------------------------ */
1051 /* ------------------------------------------------------------------------ */
1052 
1053 /* allocate threads from the thread pool and assign them to the new team */
1054 /* we are assured that there are enough threads available, because we
1055  * checked on that earlier while holding the forkjoin lock */
1056 
1057 static void
1058 __kmp_fork_team_threads( kmp_root_t *root, kmp_team_t *team,
1059  kmp_info_t *master_th, int master_gtid )
1060 {
1061  int i;
1062  int use_hot_team;
1063 
1064  KA_TRACE( 10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc ) );
1065  KMP_DEBUG_ASSERT( master_gtid == __kmp_get_gtid() );
1066  KMP_MB();
1067 
1068  /* first, let's setup the master thread */
1069  master_th->th.th_info.ds.ds_tid = 0;
1070  master_th->th.th_team = team;
1071  master_th->th.th_team_nproc = team->t.t_nproc;
1072  master_th->th.th_team_master = master_th;
1073  master_th->th.th_team_serialized = FALSE;
1074  master_th->th.th_dispatch = & team->t.t_dispatch[ 0 ];
1075 
1076  /* make sure we are not the optimized hot team */
1077 #if KMP_NESTED_HOT_TEAMS
1078  use_hot_team = 0;
1079  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1080  if( hot_teams ) { // hot teams array is not allocated if KMP_HOT_TEAMS_MAX_LEVEL=0
1081  int level = team->t.t_active_level - 1; // index in array of hot teams
1082  if( master_th->th.th_teams_microtask ) { // are we inside the teams?
1083  if( master_th->th.th_teams_size.nteams > 1 ) {
1084  ++level; // level was not increased in teams construct for team_of_masters
1085  }
1086  if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1087  master_th->th.th_teams_level == team->t.t_level ) {
1088  ++level; // level was not increased in teams construct for team_of_workers before the parallel
1089  } // team->t.t_level will be increased inside parallel
1090  }
1091  if( level < __kmp_hot_teams_max_level ) {
1092  if( hot_teams[level].hot_team ) {
1093  // hot team has already been allocated for given level
1094  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1095  use_hot_team = 1; // the team is ready to use
1096  } else {
1097  use_hot_team = 0; // AC: threads are not allocated yet
1098  hot_teams[level].hot_team = team; // remember new hot team
1099  hot_teams[level].hot_team_nth = team->t.t_nproc;
1100  }
1101  } else {
1102  use_hot_team = 0;
1103  }
1104  }
1105 #else
1106  use_hot_team = team == root->r.r_hot_team;
1107 #endif
1108  if ( !use_hot_team ) {
1109 
1110  /* install the master thread */
1111  team->t.t_threads[ 0 ] = master_th;
1112  __kmp_initialize_info( master_th, team, 0, master_gtid );
1113 
1114  /* now, install the worker threads */
1115  for ( i=1 ; i < team->t.t_nproc ; i++ ) {
1116 
1117  /* fork or reallocate a new thread and install it in team */
1118  kmp_info_t *thr = __kmp_allocate_thread( root, team, i );
1119  team->t.t_threads[ i ] = thr;
1120  KMP_DEBUG_ASSERT( thr );
1121  KMP_DEBUG_ASSERT( thr->th.th_team == team );
1122  /* align team and thread arrived states */
1123  KA_TRACE( 20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived T#%d(%d:%d) join =%u, plain=%u\n",
1124  __kmp_gtid_from_tid( 0, team ), team->t.t_id, 0,
1125  __kmp_gtid_from_tid( i, team ), team->t.t_id, i,
1126  team->t.t_bar[ bs_forkjoin_barrier ].b_arrived,
1127  team->t.t_bar[ bs_plain_barrier ].b_arrived ) );
1128 #if OMP_40_ENABLED
1129  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1130  thr->th.th_teams_level = master_th->th.th_teams_level;
1131  thr->th.th_teams_size = master_th->th.th_teams_size;
1132 #endif
1133  { // Initialize threads' barrier data.
1134  int b;
1135  kmp_balign_t * balign = team->t.t_threads[ i ]->th.th_bar;
1136  for ( b = 0; b < bs_last_barrier; ++ b ) {
1137  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
1138  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1139 #if USE_DEBUGGER
1140  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
1141 #endif
1142  }; // for b
1143  }
1144  }
1145 
1146 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1147  __kmp_partition_places( team );
1148 #endif
1149 
1150  }
1151 
1152  KMP_MB();
1153 }
1154 
1155 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1156 //
1157 // Propagate any changes to the floating point control registers out to the team
1158 // We try to avoid unnecessary writes to the relevant cache line in the team structure,
1159 // so we don't make changes unless they are needed.
1160 //
1161 inline static void
1162 propagateFPControl(kmp_team_t * team)
1163 {
1164  if ( __kmp_inherit_fp_control ) {
1165  kmp_int16 x87_fpu_control_word;
1166  kmp_uint32 mxcsr;
1167 
1168  // Get master values of FPU control flags (both X87 and vector)
1169  __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1170  __kmp_store_mxcsr( &mxcsr );
1171  mxcsr &= KMP_X86_MXCSR_MASK;
1172 
1173  // There is no point looking at t_fp_control_saved here.
1174  // If it is TRUE, we still have to update the values if they are different from those we now have.
1175  // If it is FALSE we didn't save anything yet, but our objective is the same. We have to ensure
1176  // that the values in the team are the same as those we have.
1177  // So, this code achieves what we need whether or not t_fp_control_saved is true.
1178  // By checking whether the value needs updating we avoid unnecessary writes that would put the
1179  // cache-line into a written state, causing all threads in the team to have to read it again.
1180  if ( team->t.t_x87_fpu_control_word != x87_fpu_control_word ) {
1181  team->t.t_x87_fpu_control_word = x87_fpu_control_word;
1182  }
1183  if ( team->t.t_mxcsr != mxcsr ) {
1184  team->t.t_mxcsr = mxcsr;
1185  }
1186  // Although we don't use this value, other code in the runtime wants to know whether it should restore them.
1187  // So we must ensure it is correct.
1188  if (!team->t.t_fp_control_saved) {
1189  team->t.t_fp_control_saved = TRUE;
1190  }
1191  }
1192  else {
1193  // Similarly here. Don't write to this cache-line in the team structure unless we have to.
1194  if (team->t.t_fp_control_saved)
1195  team->t.t_fp_control_saved = FALSE;
1196  }
1197 }
1198 
1199 // Do the opposite, setting the hardware registers to the updated values from the team.
1200 inline static void
1201 updateHWFPControl(kmp_team_t * team)
1202 {
1203  if ( __kmp_inherit_fp_control && team->t.t_fp_control_saved ) {
1204  //
1205  // Only reset the fp control regs if they have been changed in the team.
1206  // Only reset the fp control regs if they have been changed in the team,
1207  // i.e., in the parallel region that we are exiting.
1208  kmp_int16 x87_fpu_control_word;
1209  kmp_uint32 mxcsr;
1210  __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1211  __kmp_store_mxcsr( &mxcsr );
1212  mxcsr &= KMP_X86_MXCSR_MASK;
1213 
1214  if ( team->t.t_x87_fpu_control_word != x87_fpu_control_word ) {
1215  __kmp_clear_x87_fpu_status_word();
1216  __kmp_load_x87_fpu_control_word( &team->t.t_x87_fpu_control_word );
1217  }
1218 
1219  if ( team->t.t_mxcsr != mxcsr ) {
1220  __kmp_load_mxcsr( &team->t.t_mxcsr );
1221  }
1222  }
1223 }
1224 #else
1225 # define propagateFPControl(x) ((void)0)
1226 # define updateHWFPControl(x) ((void)0)
1227 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
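
/*
 * A minimal sketch (comment only, not compiled) of the save/compare/restore
 * pattern that propagateFPControl/updateHWFPControl implement with the
 * runtime's own primitives. The standard SSE intrinsics _mm_getcsr/_mm_setcsr
 * are used here purely as an illustration; the runtime itself uses
 * __kmp_store_mxcsr/__kmp_load_mxcsr plus KMP_X86_MXCSR_MASK.
 *
 *     #include <xmmintrin.h>
 *
 *     unsigned int saved = _mm_getcsr();   // capture the master's MXCSR at fork
 *     // ... parallel region runs ...
 *     if ( _mm_getcsr() != saved )         // only touch the register if it
 *         _mm_setcsr( saved );             // actually differs (avoid needless writes)
 */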
1228 
1229 static void
1230 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc ); // forward declaration
1231 
1232 /*
1233  * Run a parallel region that has been serialized, so it runs only in a team of the single master thread.
1234  */
1235 void
1236 __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
1237 {
1238  kmp_info_t *this_thr;
1239  kmp_team_t *serial_team;
1240 
1241  KC_TRACE( 10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid ) );
1242 
1243  /* Skip all this code for autopar serialized loops since it results in
1244  unacceptable overhead */
1245  if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
1246  return;
1247 
1248  if( ! TCR_4( __kmp_init_parallel ) )
1249  __kmp_parallel_initialize();
1250 
1251  this_thr = __kmp_threads[ global_tid ];
1252  serial_team = this_thr->th.th_serial_team;
1253 
1254  /* utilize the serialized team held by this thread */
1255  KMP_DEBUG_ASSERT( serial_team );
1256  KMP_MB();
1257 
1258  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1259  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1260  KMP_DEBUG_ASSERT( serial_team->t.t_task_team[this_thr->th.th_task_state] == NULL );
1261  KA_TRACE( 20, ( "__kmpc_serialized_parallel: T#%d pushing task_team %p / team %p, new task_team = NULL\n",
1262  global_tid, this_thr->th.th_task_team, this_thr->th.th_team ) );
1263  this_thr->th.th_task_team = NULL;
1264  }
1265 
1266 #if OMP_40_ENABLED
1267  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1268  if ( this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
1269  proc_bind = proc_bind_false;
1270  }
1271  else if ( proc_bind == proc_bind_default ) {
1272  //
1273  // No proc_bind clause was specified, so use the current value
1274  // of proc-bind-var for this parallel region.
1275  //
1276  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1277  }
1278  //
1279  // Reset for next parallel region
1280  //
1281  this_thr->th.th_set_proc_bind = proc_bind_default;
1282 #endif /* OMP_40_ENABLED */
1283 
1284  if( this_thr->th.th_team != serial_team ) {
1285  // Nested level will be an index in the nested nthreads array
1286  int level = this_thr->th.th_team->t.t_level;
1287 
1288  if( serial_team->t.t_serialized ) {
1289  /* this serial team was already used
1290  * TODO: increase performance by making these locks more specific */
1291  kmp_team_t *new_team;
1292 
1293  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1294 
1295 #if OMPT_SUPPORT
1296  ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1297 #endif
1298 
1299  new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1300 #if OMPT_SUPPORT
1301  ompt_parallel_id,
1302 #endif
1303 #if OMP_40_ENABLED
1304  proc_bind,
1305 #endif
1306  & this_thr->th.th_current_task->td_icvs,
1307  0 USE_NESTED_HOT_ARG(NULL) );
1308  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1309  KMP_ASSERT( new_team );
1310 
1311  /* setup new serialized team and install it */
1312  new_team->t.t_threads[0] = this_thr;
1313  new_team->t.t_parent = this_thr->th.th_team;
1314  serial_team = new_team;
1315  this_thr->th.th_serial_team = serial_team;
1316 
1317  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1318  global_tid, serial_team ) );
1319 
1320 
1321  /* TODO the above breaks the requirement that if we run out of
1322  * resources, then we can still guarantee that serialized teams
1323  * are ok, since we may need to allocate a new one */
1324  } else {
1325  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1326  global_tid, serial_team ) );
1327  }
1328 
1329  /* we have to initialize this serial team */
1330  KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1331  KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1332  KMP_DEBUG_ASSERT( this_thr->th.th_team != serial_team );
1333  serial_team->t.t_ident = loc;
1334  serial_team->t.t_serialized = 1;
1335  serial_team->t.t_nproc = 1;
1336  serial_team->t.t_parent = this_thr->th.th_team;
1337  serial_team->t.t_sched = this_thr->th.th_team->t.t_sched;
1338  this_thr->th.th_team = serial_team;
1339  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1340 
1341  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d curtask=%p\n",
1342  global_tid, this_thr->th.th_current_task ) );
1343  KMP_ASSERT( this_thr->th.th_current_task->td_flags.executing == 1 );
1344  this_thr->th.th_current_task->td_flags.executing = 0;
1345 
1346  __kmp_push_current_task_to_thread( this_thr, serial_team, 0 );
1347 
1348  /* TODO: GEH: do the ICVs work for nested serialized teams? Don't we need an implicit task for
1349  each serialized task represented by team->t.t_serialized? */
1350  copy_icvs(
1351  & this_thr->th.th_current_task->td_icvs,
1352  & this_thr->th.th_current_task->td_parent->td_icvs );
1353 
1354  // Thread value exists in the nested nthreads array for the next nested level
1355  if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1356  this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1357  }
1358 
1359 #if OMP_40_ENABLED
1360  if ( __kmp_nested_proc_bind.used && ( level + 1 < __kmp_nested_proc_bind.used ) ) {
1361  this_thr->th.th_current_task->td_icvs.proc_bind
1362  = __kmp_nested_proc_bind.bind_types[ level + 1 ];
1363  }
1364 #endif /* OMP_40_ENABLED */
1365 
1366 #if USE_DEBUGGER
1367  serial_team->t.t_pkfn = (microtask_t)( ~0 ); // For the debugger.
1368 #endif
1369  this_thr->th.th_info.ds.ds_tid = 0;
1370 
1371  /* set thread cache values */
1372  this_thr->th.th_team_nproc = 1;
1373  this_thr->th.th_team_master = this_thr;
1374  this_thr->th.th_team_serialized = 1;
1375 
1376  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1377  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1378 
1379  propagateFPControl (serial_team);
1380 
1381  /* check if we need to allocate dispatch buffers stack */
1382  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1383  if ( !serial_team->t.t_dispatch->th_disp_buffer ) {
1384  serial_team->t.t_dispatch->th_disp_buffer = (dispatch_private_info_t *)
1385  __kmp_allocate( sizeof( dispatch_private_info_t ) );
1386  }
1387  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1388 
1389 #if OMPT_SUPPORT
1390  ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1391  __ompt_team_assign_id(serial_team, ompt_parallel_id);
1392 #endif
1393 
1394  KMP_MB();
1395 
1396  } else {
1397  /* this serialized team is already being used,
1398  * that's fine, just add another nested level */
1399  KMP_DEBUG_ASSERT( this_thr->th.th_team == serial_team );
1400  KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1401  KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1402  ++ serial_team->t.t_serialized;
1403  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1404 
1405  // Nested level will be an index in the nested nthreads array
1406  int level = this_thr->th.th_team->t.t_level;
1407  // Thread value exists in the nested nthreads array for the next nested level
1408  if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1409  this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1410  }
1411  serial_team->t.t_level++;
1412  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d increasing nesting level of serial team %p to %d\n",
1413  global_tid, serial_team, serial_team->t.t_level ) );
1414 
1415  /* allocate/push dispatch buffers stack */
1416  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1417  {
1418  dispatch_private_info_t * disp_buffer = (dispatch_private_info_t *)
1419  __kmp_allocate( sizeof( dispatch_private_info_t ) );
1420  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1421  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1422  }
1423  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1424 
1425  KMP_MB();
1426  }
1427 
1428  if ( __kmp_env_consistency_check )
1429  __kmp_push_parallel( global_tid, NULL );
1430 
1431 #if USE_ITT_BUILD
1432  // Mark the start of the "parallel" region for VTune. Only one frame notification scheme is used at a time.
1433  if ( serial_team->t.t_level == 1
1434 #if OMP_40_ENABLED
1435  && this_thr->th.th_teams_microtask == NULL
1436 #endif
1437  ) {
1438 #if USE_ITT_NOTIFY
1439  // Save the start of the "parallel" region for VTune; this also marks the frame begin.
1440  if ( ( __itt_get_timestamp_ptr || KMP_ITT_DEBUG ) &&
1441  ( __kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1 ) )
1442  {
1443  serial_team->t.t_region_time = this_thr->th.th_frame_time_serialized = __itt_get_timestamp();
1444  } else // only one notification scheme (either "submit" or "forking/joined", not both)
1445 #endif
1446  if ( ( __itt_frame_begin_v3_ptr || KMP_ITT_DEBUG ) &&
1447  __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode )
1448  {
1449  this_thr->th.th_ident = loc;
1450  // 0 - no barriers; 1 - serialized parallel
1451  __kmp_itt_region_forking( global_tid, this_thr->th.th_team_nproc, 0, 1 );
1452  }
1453  }
1454 #endif /* USE_ITT_BUILD */
1455 }
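
/*
 * A minimal sketch (comment only, not compiled) of the dispatch-buffer "stack"
 * pushed above for each nesting level of the serialized team: it is a plain
 * singly linked list pushed at the head and popped again when the serialized
 * region ends. Names are local to this sketch.
 *
 *     node->next = head;   // new buffer points at the previous top
 *     head       = node;   // and becomes the new top of the stack
 */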
1456 
1457 /* most of the work for a fork */
1458 /* return true if we really went parallel, false if serialized */
1459 int
1460 __kmp_fork_call(
1461  ident_t * loc,
1462  int gtid,
1463  enum fork_context_e call_context, // Intel, GNU, ...
1464  kmp_int32 argc,
1465 #if OMPT_SUPPORT
1466  void *unwrapped_task,
1467 #endif
1468  microtask_t microtask,
1469  launch_t invoker,
1470 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1471 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1472  va_list * ap
1473 #else
1474  va_list ap
1475 #endif
1476  )
1477 {
1478  void **argv;
1479  int i;
1480  int master_tid;
1481  int master_this_cons;
1482  kmp_team_t *team;
1483  kmp_team_t *parent_team;
1484  kmp_info_t *master_th;
1485  kmp_root_t *root;
1486  int nthreads;
1487  int master_active;
1488  int master_set_numthreads;
1489  int level;
1490 #if OMP_40_ENABLED
1491  int active_level;
1492  int teams_level;
1493 #endif
1494 #if KMP_NESTED_HOT_TEAMS
1495  kmp_hot_team_ptr_t **p_hot_teams;
1496 #endif
1497  { // KMP_TIME_BLOCK
1498  KMP_TIME_BLOCK(KMP_fork_call);
1499 
1500  KA_TRACE( 20, ("__kmp_fork_call: enter T#%d\n", gtid ));
1501  if ( __kmp_stkpadding > 0 && __kmp_root[gtid] != NULL ) {
1502  /* Some systems prefer the stack for the root thread(s) to start with */
1503  /* some gap from the parent stack to prevent false sharing. */
1504  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1505  /* These 2 lines below are so this does not get optimized out */
1506  if ( __kmp_stkpadding > KMP_MAX_STKPADDING )
1507  __kmp_stkpadding += (short)((kmp_int64)dummy);
1508  }
1509 
1510  /* initialize if needed */
1511  KMP_DEBUG_ASSERT( __kmp_init_serial ); // AC: potentially unsafe, not in sync with shutdown
1512  if( ! TCR_4(__kmp_init_parallel) )
1513  __kmp_parallel_initialize();
1514 
1515  /* setup current data */
1516  master_th = __kmp_threads[ gtid ]; // AC: potentially unsafe, not in sync with shutdown
1517  parent_team = master_th->th.th_team;
1518  master_tid = master_th->th.th_info.ds.ds_tid;
1519  master_this_cons = master_th->th.th_local.this_construct;
1520  root = master_th->th.th_root;
1521  master_active = root->r.r_active;
1522  master_set_numthreads = master_th->th.th_set_nproc;
1523 
1524 #if OMPT_SUPPORT
1525  ompt_parallel_id_t ompt_parallel_id;
1526  ompt_task_id_t ompt_task_id;
1527  ompt_frame_t *ompt_frame;
1528  ompt_task_id_t my_task_id;
1529  ompt_parallel_id_t my_parallel_id;
1530 
1531  if (ompt_status & ompt_status_track) {
1532  ompt_parallel_id = __ompt_parallel_id_new(gtid);
1533  ompt_task_id = __ompt_get_task_id_internal(0);
1534  ompt_frame = __ompt_get_task_frame_internal(0);
1535  }
1536 #endif
1537 
1538  // Nested level will be an index in the nested nthreads array
1539  level = parent_team->t.t_level;
1540 #if OMP_40_ENABLED
1541  active_level = parent_team->t.t_active_level; // is used to launch non-serial teams even if nested is not allowed
1542  teams_level = master_th->th.th_teams_level; // needed to check nesting inside the teams
1543 #endif
1544 #if KMP_NESTED_HOT_TEAMS
1545  p_hot_teams = &master_th->th.th_hot_teams;
1546  if( *p_hot_teams == NULL && __kmp_hot_teams_max_level > 0 ) {
1547  *p_hot_teams = (kmp_hot_team_ptr_t*)__kmp_allocate(
1548  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1549  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1550  (*p_hot_teams)[0].hot_team_nth = 1; // either the actual size, or not needed (when active_level > 0)
1551  }
1552 #endif
1553 
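 /* Notify an attached OMPT tool, if any, that a parallel region is beginning: the
    callback receives the encountering task id and frame, the id generated above for
    the new parallel region, the requested team size, and the unwrapped task. */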
1554 #if OMPT_SUPPORT
1555  if ((ompt_status == ompt_status_track_callback) &&
1556  ompt_callbacks.ompt_callback(ompt_event_parallel_begin)) {
1557  int team_size = master_set_numthreads;
1558 
1559  ompt_callbacks.ompt_callback(ompt_event_parallel_begin)(
1560  ompt_task_id, ompt_frame, ompt_parallel_id,
1561  team_size, unwrapped_task);
1562  }
1563 #endif
1564 
1565  master_th->th.th_ident = loc;
1566 
1567 #if OMP_40_ENABLED
1568  if ( master_th->th.th_teams_microtask &&
1569  ap && microtask != (microtask_t)__kmp_teams_master && level == teams_level ) {
1570  // AC: This is start of parallel that is nested inside teams construct.
1571  // The team is actual (hot), all workers are ready at the fork barrier.
1572  // No lock needed to initialize the team a bit, then free workers.
1573  parent_team->t.t_ident = loc;
1574  parent_team->t.t_argc = argc;
1575  argv = (void**)parent_team->t.t_argv;
1576  for( i=argc-1; i >= 0; --i )
1577 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1578 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1579  *argv++ = va_arg( *ap, void * );
1580 #else
1581  *argv++ = va_arg( ap, void * );
1582 #endif
1583  /* Increment our nested depth level, but do not increase the serialization level */
1584  if ( parent_team == master_th->th.th_serial_team ) {
1585  // AC: we are in serialized parallel
1586  __kmpc_serialized_parallel(loc, gtid);
1587  KMP_DEBUG_ASSERT( parent_team->t.t_serialized > 1 );
1588  parent_team->t.t_serialized--; // AC: need this so that enquiry functions
1589  // work correctly; this will be restored at join time
1590 
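 /* For this serialized nested parallel, set up an OMPT lightweight task team so that
    a tool still observes an implicit task even though no real team is allocated;
    exit_runtime_p lets the invoked microtask record its exit frame for the tool. */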
1591 #if OMPT_SUPPORT
1592  void *dummy;
1593  void **exit_runtime_p;
1594 
1595  ompt_lw_taskteam_t lw_taskteam;
1596 
1597  if (ompt_status & ompt_status_track) {
1598  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1599  unwrapped_task, ompt_parallel_id);
1600  lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1601  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1602 
1603  __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1604 
1605 #if OMPT_TRACE
1606  /* OMPT implicit task begin */
1607  my_task_id = lw_taskteam.ompt_task_info.task_id;
1608  my_parallel_id = parent_team->t.ompt_team_info.parallel_id;
1609  if ((ompt_status == ompt_status_track_callback) &&
1610  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1611  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1612  my_parallel_id, my_task_id);
1613  }
1614 #endif
1615 
1616  /* OMPT state */
1617  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1618  } else {
1619  exit_runtime_p = &dummy;
1620  }
1621 #endif
1622 
1623  KMP_TIME_BLOCK(OMP_work);
1624  __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
1625 #if OMPT_SUPPORT
1626  , exit_runtime_p
1627 #endif
1628  );
1629 
1630 #if OMPT_SUPPORT
1631  if (ompt_status & ompt_status_track) {
1632 #if OMPT_TRACE
1633  lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
1634 
1635  if ((ompt_status == ompt_status_track_callback) &&
1636  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1637  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1638  ompt_parallel_id, ompt_task_id);
1639  }
1640 
1641  __ompt_lw_taskteam_unlink(master_th);
1642  // clear the task id only after unlinking the task
1643  lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1644 #endif
1645 
1646  if ((ompt_status == ompt_status_track_callback) &&
1647  ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1648  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1649  ompt_parallel_id, ompt_task_id);
1650  }
1651  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1652  }
1653 #endif
1654  return TRUE;
1655  }
1656 
1657  parent_team->t.t_pkfn = microtask;
1658 #if OMPT_SUPPORT
1659  parent_team->t.ompt_team_info.microtask = unwrapped_task;
1660 #endif
1661  parent_team->t.t_invoke = invoker;
1662  KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
1663  parent_team->t.t_active_level ++;
1664  parent_team->t.t_level ++;
1665 
1666  /* Change number of threads in the team if requested */
1667  if ( master_set_numthreads ) { // The parallel has num_threads clause
1668  if ( master_set_numthreads < master_th->th.th_teams_size.nth ) {
1669  // AC: can only reduce the number of threads dynamically, cannot increase
1670  kmp_info_t **other_threads = parent_team->t.t_threads;
1671  parent_team->t.t_nproc = master_set_numthreads;
1672  for ( i = 0; i < master_set_numthreads; ++i ) {
1673  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1674  }
1675  // Keep extra threads hot in the team for possible next parallels
1676  }
1677  master_th->th.th_set_nproc = 0;
1678  }
1679 
1680 #if USE_DEBUGGER
1681  if ( __kmp_debugging ) { // Let debugger override number of threads.
1682  int nth = __kmp_omp_num_threads( loc );
1683  if ( nth > 0 ) { // 0 means debugger does not want to change number of threads.
1684  master_set_numthreads = nth;
1685  }; // if
1686  }; // if
1687 #endif
1688 
1689  KF_TRACE( 10, ( "__kmp_fork_call: before internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1690  __kmp_internal_fork( loc, gtid, parent_team );
1691  KF_TRACE( 10, ( "__kmp_fork_call: after internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1692 
1693  /* Invoke microtask for MASTER thread */
1694  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
1695  gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1696 
1697  {
1698  KMP_TIME_BLOCK(OMP_work);
1699  if (! parent_team->t.t_invoke( gtid )) {
1700  KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
1701  }
1702  }
1703  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
1704  gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1705  KMP_MB(); /* Flush all pending memory write invalidates. */
1706 
1707  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
1708 
1709  return TRUE;
1710  } // Parallel closely nested in teams construct
1711 #endif /* OMP_40_ENABLED */
1712 
1713 #if KMP_DEBUG
1714  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1715  KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
1716  }
1717 #endif
1718 
1719  /* determine how many new threads we can use */
1720  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1721 
1722  if ( parent_team->t.t_active_level >= master_th->th.th_current_task->td_icvs.max_active_levels ) {
1723  nthreads = 1;
1724  } else {
1725  nthreads = master_set_numthreads ?
1726  master_set_numthreads : get__nproc_2( parent_team, master_tid ); // TODO: get nproc directly from current task
1727  nthreads = __kmp_reserve_threads(root, parent_team, master_tid, nthreads
1728 #if OMP_40_ENABLED
1729 /* AC: If we execute teams from a parallel region (on the host), then the teams should be created
1730  but each can have only 1 thread if nesting is disabled. If teams is called from a serial region,
1731  then the teams and their threads should be created regardless of the nesting setting. */
1732  , ((ap==NULL && active_level==0) ||
1733  (ap && teams_level>0 && teams_level==level))
1734 #endif /* OMP_40_ENABLED */
1735  );
1736  }
1737  KMP_DEBUG_ASSERT( nthreads > 0 );
1738 
1739  /* If we temporarily changed the set number of threads then restore it now */
1740  master_th->th.th_set_nproc = 0;
1741 
1742 
1743  /* create a serialized parallel region? */
1744  if ( nthreads == 1 ) {
1745  /* josh todo: hypothetical question: what do we do for OS X*? */
1746 #if KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1747  void * args[ argc ];
1748 #else
1749  void * * args = (void**) KMP_ALLOCA( argc * sizeof( void * ) );
1750 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) */
1751 
1752  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1753  KA_TRACE( 20, ("__kmp_fork_call: T#%d serializing parallel region\n", gtid ));
1754 
1755  __kmpc_serialized_parallel(loc, gtid);
1756 
1757  if ( call_context == fork_context_intel ) {
1758  /* TODO this sucks, use the compiler itself to pass args! :) */
1759  master_th->th.th_serial_team->t.t_ident = loc;
1760 #if OMP_40_ENABLED
1761  if ( !ap ) {
1762  // revert change made in __kmpc_serialized_parallel()
1763  master_th->th.th_serial_team->t.t_level--;
1764  // Get args from parent team for teams construct
1765 
1766 #if OMPT_SUPPORT
1767  void *dummy;
1768  void **exit_runtime_p;
1769 
1770  ompt_lw_taskteam_t lw_taskteam;
1771 
1772  if (ompt_status & ompt_status_track) {
1773  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1774  unwrapped_task, ompt_parallel_id);
1775  lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1776  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1777 
1778  __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1779 
1780 #if OMPT_TRACE
1781  my_task_id = lw_taskteam.ompt_task_info.task_id;
1782  if ((ompt_status == ompt_status_track_callback) &&
1783  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1784  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1785  ompt_parallel_id, my_task_id);
1786  }
1787 #endif
1788 
1789  /* OMPT state */
1790  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1791  } else {
1792  exit_runtime_p = &dummy;
1793  }
1794 #endif
1795 
1796  {
1797  KMP_TIME_BLOCK(OMP_work);
1798  __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
1799 #if OMPT_SUPPORT
1800  , exit_runtime_p
1801 #endif
1802  );
1803  }
1804 
1805 #if OMPT_SUPPORT
1806  if (ompt_status & ompt_status_track) {
1807  lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
1808 
1809 #if OMPT_TRACE
1810  if ((ompt_status == ompt_status_track_callback) &&
1811  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1812  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1813  ompt_parallel_id, ompt_task_id);
1814  }
1815 #endif
1816 
1817  __ompt_lw_taskteam_unlink(master_th);
1818  // clear the task id only after unlinking the task
1819  lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1820 
1821  if ((ompt_status == ompt_status_track_callback) &&
1822  ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1823  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1824  ompt_parallel_id, ompt_task_id);
1825  }
1826  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1827  }
1828 #endif
1829  } else if ( microtask == (microtask_t)__kmp_teams_master ) {
1830  KMP_DEBUG_ASSERT( master_th->th.th_team == master_th->th.th_serial_team );
1831  team = master_th->th.th_team;
1832  //team->t.t_pkfn = microtask;
1833  team->t.t_invoke = invoker;
1834  __kmp_alloc_argv_entries( argc, team, TRUE );
1835  team->t.t_argc = argc;
1836  argv = (void**) team->t.t_argv;
1837  if ( ap ) {
1838  for( i=argc-1; i >= 0; --i )
1839 // TODO: revert workaround for Intel(R) 64 tracker #96
1840 # if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1841  *argv++ = va_arg( *ap, void * );
1842 # else
1843  *argv++ = va_arg( ap, void * );
1844 # endif
1845  } else {
1846  for( i=0; i < argc; ++i )
1847  // Get args from parent team for teams construct
1848  argv[i] = parent_team->t.t_argv[i];
1849  }
1850  // AC: revert change made in __kmpc_serialized_parallel()
1851  // because initial code in teams should have level=0
1852  team->t.t_level--;
1853  // AC: call special invoker for outer "parallel" of the teams construct
1854  {
1855  KMP_TIME_BLOCK(OMP_work);
1856  invoker(gtid);
1857  }
1858  } else {
1859 #endif /* OMP_40_ENABLED */
1860  argv = args;
1861  for( i=argc-1; i >= 0; --i )
1862 // TODO: revert workaround for Intel(R) 64 tracker #96
1863 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1864  *argv++ = va_arg( *ap, void * );
1865 #else
1866  *argv++ = va_arg( ap, void * );
1867 #endif
1868  KMP_MB();
1869 
1870 #if OMPT_SUPPORT
1871  void *dummy;
1872  void **exit_runtime_p;
1873 
1874  ompt_lw_taskteam_t lw_taskteam;
1875 
1876  if (ompt_status & ompt_status_track) {
1877  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1878  unwrapped_task, ompt_parallel_id);
1879  lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1880  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1881 
1882  __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1883 
1884 #if OMPT_TRACE
1885  /* OMPT implicit task begin */
1886  my_task_id = lw_taskteam.ompt_task_info.task_id;
1887  my_parallel_id = ompt_parallel_id;
1888  if ((ompt_status == ompt_status_track_callback) &&
1889  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1890  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1891  my_parallel_id, my_task_id);
1892  }
1893 #endif
1894 
1895  /* OMPT state */
1896  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1897  } else {
1898  exit_runtime_p = &dummy;
1899  }
1900 #endif
1901 
1902  {
1903  KMP_TIME_BLOCK(OMP_work);
1904  __kmp_invoke_microtask( microtask, gtid, 0, argc, args
1905 #if OMPT_SUPPORT
1906  , exit_runtime_p
1907 #endif
1908  );
1909  }
1910 
1911 #if OMPT_SUPPORT
1912  if (ompt_status & ompt_status_track) {
1913 #if OMPT_TRACE
1914  lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
1915 
1916  if ((ompt_status == ompt_status_track_callback) &&
1917  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1918  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1919  my_parallel_id, my_task_id);
1920  }
1921 #endif
1922 
1923  __ompt_lw_taskteam_unlink(master_th);
1924  // clear the task id only after unlinking the task
1925  lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1926 
1927  if ((ompt_status == ompt_status_track_callback) &&
1928  ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1929  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1930  ompt_parallel_id, ompt_task_id);
1931  }
1932  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1933  }
1934 #endif
1935 #if OMP_40_ENABLED
1936  }
1937 #endif /* OMP_40_ENABLED */
1938  }
1939  else if ( call_context == fork_context_gnu ) {
1940 #if OMPT_SUPPORT
1941  ompt_lw_taskteam_t *lwt = (ompt_lw_taskteam_t *)
1942  __kmp_allocate(sizeof(ompt_lw_taskteam_t));
1943  __ompt_lw_taskteam_init(lwt, master_th, gtid,
1944  unwrapped_task, ompt_parallel_id);
1945 
1946  lwt->ompt_task_info.task_id = __ompt_task_id_new(gtid);
1947  lwt->ompt_task_info.frame.exit_runtime_frame = 0;
1948  __ompt_lw_taskteam_link(lwt, master_th);
1949 #endif
1950 
1951  // we were called from GNU native code
1952  KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1953  return FALSE;
1954  }
1955  else {
1956  KMP_ASSERT2( call_context < fork_context_last, "__kmp_fork_call: unknown fork_context parameter" );
1957  }
1958 
1959 
1960  KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1961  KMP_MB();
1962  return FALSE;
1963  }
1964 
1965  // GEH: only modify the executing flag when the region is not serialized;
1966  // the serialized case is handled in __kmpc_serialized_parallel
1967  KF_TRACE( 10, ( "__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, curtask=%p, curtask_max_aclevel=%d\n",
1968  parent_team->t.t_active_level, master_th, master_th->th.th_current_task,
1969  master_th->th.th_current_task->td_icvs.max_active_levels ) );
1970  // TODO: GEH - cannot do this assertion because root thread not set up as executing
1971  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1972  master_th->th.th_current_task->td_flags.executing = 0;
1973 
1974 #if OMP_40_ENABLED
1975  if ( !master_th->th.th_teams_microtask || level > teams_level )
1976 #endif /* OMP_40_ENABLED */
1977  {
1978  /* Increment our nested depth level */
1979  KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
1980  }
1981 
1982  // See if we need to make a copy of the ICVs.
1983  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1984  if ((level+1 < __kmp_nested_nth.used) && (__kmp_nested_nth.nth[level+1] != nthreads_icv)) {
1985  nthreads_icv = __kmp_nested_nth.nth[level+1];
1986  }
1987  else {
1988  nthreads_icv = 0; // don't update
1989  }
1990 
1991 #if OMP_40_ENABLED
1992  // Figure out the proc_bind_policy for the new team.
1993  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1994  kmp_proc_bind_t proc_bind_icv = proc_bind_default; // proc_bind_default means don't update
1995  if ( master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
1996  proc_bind = proc_bind_false;
1997  }
1998  else {
1999  if (proc_bind == proc_bind_default) {
2000  // No proc_bind clause specified; use current proc-bind-var for this parallel region
2001  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2002  }
2003  /* else: The proc_bind policy was specified explicitly on parallel clause. This
2004  overrides proc-bind-var for this parallel region, but does not change proc-bind-var. */
2005  // Figure the value of proc-bind-var for the child threads.
2006  if ((level+1 < __kmp_nested_proc_bind.used)
2007  && (__kmp_nested_proc_bind.bind_types[level+1] != master_th->th.th_current_task->td_icvs.proc_bind)) {
2008  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level+1];
2009  }
2010  }
2011 
2012  // Reset for next parallel region
2013  master_th->th.th_set_proc_bind = proc_bind_default;
2014 #endif /* OMP_40_ENABLED */
2015 
2016  if ((nthreads_icv > 0)
2017 #if OMP_40_ENABLED
2018  || (proc_bind_icv != proc_bind_default)
2019 #endif /* OMP_40_ENABLED */
2020  ) {
2021  kmp_internal_control_t new_icvs;
2022  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2023  new_icvs.next = NULL;
2024  if (nthreads_icv > 0) {
2025  new_icvs.nproc = nthreads_icv;
2026  }
2027 
2028 #if OMP_40_ENABLED
2029  if (proc_bind_icv != proc_bind_default) {
2030  new_icvs.proc_bind = proc_bind_icv;
2031  }
2032 #endif /* OMP_40_ENABLED */
2033 
2034  /* allocate a new parallel team */
2035  KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
2036  team = __kmp_allocate_team(root, nthreads, nthreads,
2037 #if OMPT_SUPPORT
2038  ompt_parallel_id,
2039 #endif
2040 #if OMP_40_ENABLED
2041  proc_bind,
2042 #endif
2043  &new_icvs, argc USE_NESTED_HOT_ARG(master_th) );
2044  } else {
2045  /* allocate a new parallel team */
2046  KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
2047  team = __kmp_allocate_team(root, nthreads, nthreads,
2048 #if OMPT_SUPPORT
2049  ompt_parallel_id,
2050 #endif
2051 #if OMP_40_ENABLED
2052  proc_bind,
2053 #endif
2054  &master_th->th.th_current_task->td_icvs, argc
2055  USE_NESTED_HOT_ARG(master_th) );
2056  }
2057  KF_TRACE( 10, ( "__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team ) );
2058 
2059  /* setup the new team */
2060  team->t.t_master_tid = master_tid;
2061  team->t.t_master_this_cons = master_this_cons;
2062  team->t.t_ident = loc;
2063  team->t.t_parent = parent_team;
2064  TCW_SYNC_PTR(team->t.t_pkfn, microtask);
2065 #if OMPT_SUPPORT
2066  TCW_SYNC_PTR(team->t.ompt_team_info.microtask, unwrapped_task);
2067 #endif
2068  team->t.t_invoke = invoker; /* TODO move this to root, maybe */
2069  // TODO: parent_team->t.t_level == INT_MAX ???
2070 #if OMP_40_ENABLED
2071  if ( !master_th->th.th_teams_microtask || level > teams_level ) {
2072 #endif /* OMP_40_ENABLED */
2073  team->t.t_level = parent_team->t.t_level + 1;
2074  team->t.t_active_level = parent_team->t.t_active_level + 1;
2075 #if OMP_40_ENABLED
2076  } else {
2077  // AC: Do not increase parallel level at start of the teams construct
2078  team->t.t_level = parent_team->t.t_level;
2079  team->t.t_active_level = parent_team->t.t_active_level;
2080  }
2081 #endif /* OMP_40_ENABLED */
2082  team->t.t_sched = get__sched_2(parent_team, master_tid); // set master's schedule as new run-time schedule
2083 
2084  // Update the floating point rounding in the team if required.
2085  propagateFPControl(team);
2086 
2087  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2088  // Set master's task team to the team's task team. Unless this is a hot team, it should be NULL.
2089  KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
2090  KA_TRACE( 20, ( "__kmp_fork_call: Master T#%d pushing task_team %p / team %p, new task_team %p / team %p\n",
2091  __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
2092  parent_team, team->t.t_task_team[master_th->th.th_task_state], team ) );
2093  if (level) {
2094  // Take a memo of master's task_state
2095  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
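 // If the memo stack is full, double its capacity before pushing the current task_state.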
2096  if (master_th->th.th_task_state_top >= master_th->th.th_task_state_stack_sz) { // increase size
2097  kmp_uint8 *old_stack, *new_stack = (kmp_uint8 *) __kmp_allocate( 2*master_th->th.th_task_state_stack_sz );
2098  kmp_uint32 i;
2099  for (i=0; i<master_th->th.th_task_state_stack_sz; ++i) {
2100  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2101  }
2102  old_stack = master_th->th.th_task_state_memo_stack;
2103  master_th->th.th_task_state_memo_stack = new_stack;
2104  master_th->th.th_task_state_stack_sz *= 2;
2105  __kmp_free(old_stack);
2106  }
2107  // Store master's task_state on stack
2108  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state;
2109  master_th->th.th_task_state_top++;
2110  master_th->th.th_task_state = 0;
2111  }
2112  master_th->th.th_task_team = team->t.t_task_team[master_th->th.th_task_state];
2113 
2114 #if !KMP_NESTED_HOT_TEAMS
2115  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || (team == root->r.r_hot_team));
2116 #endif
2117  }
2118 
2119  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2120  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, team->t.t_nproc ));
2121  KMP_DEBUG_ASSERT( team != root->r.r_hot_team ||
2122  ( team->t.t_master_tid == 0 &&
2123  ( team->t.t_parent == root->r.r_root_team || team->t.t_parent->t.t_serialized ) ));
2124  KMP_MB();
2125 
2126  /* now, setup the arguments */
2127  argv = (void**)team->t.t_argv;
2128 #if OMP_40_ENABLED
2129  if ( ap ) {
2130 #endif /* OMP_40_ENABLED */
2131  for ( i=argc-1; i >= 0; --i )
2132 // TODO: revert workaround for Intel(R) 64 tracker #96
2133 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2134  *argv++ = va_arg( *ap, void * );
2135 #else
2136  *argv++ = va_arg( ap, void * );
2137 #endif
2138 #if OMP_40_ENABLED
2139  } else {
2140  for ( i=0; i < argc; ++i )
2141  // Get args from parent team for teams construct
2142  argv[i] = team->t.t_parent->t.t_argv[i];
2143  }
2144 #endif /* OMP_40_ENABLED */
2145 
2146  /* now actually fork the threads */
2147  team->t.t_master_active = master_active;
2148  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2149  root->r.r_active = TRUE;
2150 
2151  __kmp_fork_team_threads( root, team, master_th, gtid );
2152  __kmp_setup_icv_copy( team, nthreads, &master_th->th.th_current_task->td_icvs, loc );
2153 
2154 #if OMPT_SUPPORT
2155  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2156 #endif
2157 
2158  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2159 
2160 
2161 #if USE_ITT_BUILD
2162  if ( team->t.t_active_level == 1 // only report frames at level 1
2163 # if OMP_40_ENABLED
2164  && !master_th->th.th_teams_microtask // not in teams construct
2165 # endif /* OMP_40_ENABLED */
2166  ) {
2167 #if USE_ITT_NOTIFY
2168  if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) &&
2169  ( __kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1 ) )
2170  {
2171  kmp_uint64 tmp_time = 0;
2172  if ( __itt_get_timestamp_ptr )
2173  tmp_time = __itt_get_timestamp();
2174  // Internal fork - report frame begin
2175  master_th->th.th_frame_time = tmp_time;
2176  if ( __kmp_forkjoin_frames_mode == 3 )
2177  team->t.t_region_time = tmp_time;
2178  } else // only one notification scheme (either "submit" or "forking/joined", not both)
2179 #endif /* USE_ITT_NOTIFY */
2180  if ( ( __itt_frame_begin_v3_ptr || KMP_ITT_DEBUG ) &&
2181  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode )
2182  { // Mark start of "parallel" region for VTune.
2183  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2184  }
2185  }
2186 #endif /* USE_ITT_BUILD */
2187 
2188  /* now go on and do the work */
2189  KMP_DEBUG_ASSERT( team == __kmp_threads[gtid]->th.th_team );
2190  KMP_MB();
2191  KF_TRACE(10, ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2192  root, team, master_th, gtid));
2193 
2194 #if USE_ITT_BUILD
2195  if ( __itt_stack_caller_create_ptr ) {
2196  team->t.t_stack_id = __kmp_itt_stack_caller_create(); // create new stack stitching id before entering fork barrier
2197  }
2198 #endif /* USE_ITT_BUILD */
2199 
2200 #if OMP_40_ENABLED
2201  if ( ap ) // AC: skip __kmp_internal_fork at teams construct, let only master threads execute
2202 #endif /* OMP_40_ENABLED */
2203  {
2204  __kmp_internal_fork( loc, gtid, team );
2205  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, master_th=%p, gtid=%d\n",
2206  root, team, master_th, gtid));
2207  }
2208 
2209  if (call_context == fork_context_gnu) {
2210  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2211  return TRUE;
2212  }
2213 
2214  /* Invoke microtask for MASTER thread */
2215  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
2216  gtid, team->t.t_id, team->t.t_pkfn ) );
2217  } // END of timer KMP_fork_call block
2218 
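 /* The worker threads were released at the fork barrier above; the master now runs the
    same microtask through the team's t_invoke routine, and the matching join is
    performed later in __kmp_join_call(). */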
2219  {
2220  //KMP_TIME_BLOCK(OMP_work);
2221  KMP_TIME_BLOCK(USER_master_invoke);
2222  if (! team->t.t_invoke( gtid )) {
2223  KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
2224  }
2225  }
2226  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
2227  gtid, team->t.t_id, team->t.t_pkfn ) );
2228  KMP_MB(); /* Flush all pending memory write invalidates. */
2229 
2230  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2231 
2232 #if OMPT_SUPPORT
2233  if (ompt_status & ompt_status_track) {
2234  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2235  }
2236 #endif
2237 
2238  return TRUE;
2239 }
2240 
2241 #if OMPT_SUPPORT
2242 static inline void
2243 __kmp_join_restore_state(
2244  kmp_info_t *thread,
2245  kmp_team_t *team)
2246 {
2247  // restore state outside the region
2248  thread->th.ompt_thread_info.state = ((team->t.t_serialized) ?
2249  ompt_state_work_serial : ompt_state_work_parallel);
2250 }
2251 
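/* Fire the OMPT parallel_end callback for the completed region (if one is registered)
   and then restore the thread state to reflect the enclosing context. */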
2252 static inline void
2253 __kmp_join_ompt(
2254  kmp_info_t *thread,
2255  kmp_team_t *team,
2256  ompt_parallel_id_t parallel_id)
2257 {
2258  if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
2259  ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2260  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
2261  parallel_id, task_info->task_id);
2262  }
2263 
2264  __kmp_join_restore_state(thread,team);
2265 }
2266 #endif
2267 
2268 void
2269 __kmp_join_call(ident_t *loc, int gtid
2270 #if OMP_40_ENABLED
2271  , int exit_teams
2272 #endif /* OMP_40_ENABLED */
2273 )
2274 {
2275  KMP_TIME_BLOCK(KMP_join_call);
2276  kmp_team_t *team;
2277  kmp_team_t *parent_team;
2278  kmp_info_t *master_th;
2279  kmp_root_t *root;
2280  int master_active;
2281  int i;
2282 
2283  KA_TRACE( 20, ("__kmp_join_call: enter T#%d\n", gtid ));
2284 
2285  /* setup current data */
2286  master_th = __kmp_threads[ gtid ];
2287  root = master_th->th.th_root;
2288  team = master_th->th.th_team;
2289  parent_team = team->t.t_parent;
2290 
2291  master_th->th.th_ident = loc;
2292 
2293 #if OMPT_SUPPORT
2294  if (ompt_status & ompt_status_track) {
2295  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2296  }
2297 #endif
2298 
2299 #if KMP_DEBUG
2300  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2301  KA_TRACE( 20, ( "__kmp_join_call: T#%d, old team = %p old task_team = %p, th_task_team = %p\n",
2302  __kmp_gtid_from_thread( master_th ), team,
2303  team->t.t_task_team[master_th->th.th_task_state], master_th->th.th_task_team) );
2304  KMP_DEBUG_ASSERT( master_th->th.th_task_team == team->t.t_task_team[master_th->th.th_task_state] );
2305  }
2306 #endif
2307 
2308  if( team->t.t_serialized ) {
2309 #if OMP_40_ENABLED
2310  if ( master_th->th.th_teams_microtask ) {
2311  // We are in teams construct
2312  int level = team->t.t_level;
2313  int tlevel = master_th->th.th_teams_level;
2314  if ( level == tlevel ) {
2315  // AC: we haven't incremented it earlier at start of teams construct,
2316  // so do it here - at the end of teams construct
2317  team->t.t_level++;
2318  } else if ( level == tlevel + 1 ) {
2319  // AC: we are exiting parallel inside teams, need to increment serialization
2320  // in order to restore it in the next call to __kmpc_end_serialized_parallel
2321  team->t.t_serialized++;
2322  }
2323  }
2324 #endif /* OMP_40_ENABLED */
2325  __kmpc_end_serialized_parallel( loc, gtid );
2326 
2327 #if OMPT_SUPPORT
2328  if (ompt_status == ompt_status_track_callback) {
2329  __kmp_join_restore_state(master_th, parent_team);
2330  }
2331 #endif
2332 
2333  return;
2334  }
2335 
2336  master_active = team->t.t_master_active;
2337 
2338 #if OMP_40_ENABLED
2339  if (!exit_teams)
2340 #endif /* OMP_40_ENABLED */
2341  {
2342  // AC: No barrier for internal teams at exit from teams construct.
2343  // But there is a barrier for the external team (league).
2344  __kmp_internal_join( loc, gtid, team );
2345  }
2346  else {
2347  master_th->th.th_task_state = 0; // AC: no tasking in teams (outside of any parallel region)
2348  }
2349 
2350  KMP_MB();
2351 
2352 #if OMPT_SUPPORT
2353  ompt_parallel_id_t parallel_id = team->t.ompt_team_info.parallel_id;
2354 #endif
2355 
2356 #if USE_ITT_BUILD
2357  if ( __itt_stack_caller_create_ptr ) {
2358  __kmp_itt_stack_caller_destroy( (__itt_caller)team->t.t_stack_id ); // destroy the stack stitching id after join barrier
2359  }
2360 
2361  // Mark end of "parallel" region for VTune.
2362  if ( team->t.t_active_level == 1
2363 # if OMP_40_ENABLED
2364  && !master_th->th.th_teams_microtask /* not in teams construct */
2365 # endif /* OMP_40_ENABLED */
2366  ) {
2367  master_th->th.th_ident = loc;
2368  // only one notification scheme (either "submit" or "forking/joined", not both)
2369  if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) && __kmp_forkjoin_frames_mode == 3 )
2370  __kmp_itt_frame_submit( gtid, team->t.t_region_time, master_th->th.th_frame_time,
2371  0, loc, master_th->th.th_team_nproc, 1 );
2372  else if ( ( __itt_frame_end_v3_ptr || KMP_ITT_DEBUG ) &&
2373  ! __kmp_forkjoin_frames_mode && __kmp_forkjoin_frames )
2374  __kmp_itt_region_joined( gtid );
2375  } // active_level == 1
2376 #endif /* USE_ITT_BUILD */
2377 
2378 #if OMP_40_ENABLED
2379  if ( master_th->th.th_teams_microtask &&
2380  !exit_teams &&
2381  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2382  team->t.t_level == master_th->th.th_teams_level + 1 ) {
2383  // AC: We need to leave the team structure intact at the end of a
2384  // parallel inside the teams construct, so that the same (hot) team is
2385  // used for the next parallel region; only adjust the nesting levels.
2386 
2387  /* Decrement our nested depth level */
2388  team->t.t_level --;
2389  team->t.t_active_level --;
2390  KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2391 
2392  /* Restore number of threads in the team if needed */
2393  if ( master_th->th.th_team_nproc < master_th->th.th_teams_size.nth ) {
2394  int old_num = master_th->th.th_team_nproc;
2395  int new_num = master_th->th.th_teams_size.nth;
2396  kmp_info_t **other_threads = team->t.t_threads;
2397  kmp_task_team_t * task_team = master_th->th.th_task_team;
2398  team->t.t_nproc = new_num;
2399  if ( task_team ) { // task team might have smaller counter values
2400  task_team->tt.tt_ref_ct = new_num - 1;
2401  task_team->tt.tt_unfinished_threads = new_num;
2402  }
2403  for ( i = 0; i < old_num; ++i ) {
2404  other_threads[i]->th.th_team_nproc = new_num;
2405  }
2406  // Adjust the states of the unused threads of the team
2407  for ( i = old_num; i < new_num; ++i ) {
2408  // Re-initialize thread's barrier data.
2409  int b;
2410  kmp_balign_t * balign = other_threads[i]->th.th_bar;
2411  for ( b = 0; b < bs_last_barrier; ++ b ) {
2412  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
2413  KMP_DEBUG_ASSERT(balign[ b ].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2414 #if USE_DEBUGGER
2415  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
2416 #endif
2417  }
2418  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2419  // Synchronize thread's task state
2420  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2421  }
2422  }
2423  }
2424 
2425 #if OMPT_SUPPORT
2426  if (ompt_status == ompt_status_track_callback) {
2427  __kmp_join_ompt(master_th, parent_team, parallel_id);
2428  }
2429 #endif
2430 
2431  return;
2432  }
2433 #endif /* OMP_40_ENABLED */
2434 
2435  /* do cleanup and restore the parent team */
2436  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2437  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2438 
2439  master_th->th.th_dispatch =
2440  & parent_team->t.t_dispatch[ team->t.t_master_tid ];
2441 
2442  /* jc: The following lock has instructions with REL and ACQ semantics,
2443  separating the parallel user code called in this parallel region
2444  from the serial user code called after this function returns.
2445  */
2446  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2447 
2448 #if OMP_40_ENABLED
2449  if ( !master_th->th.th_teams_microtask || team->t.t_level > master_th->th.th_teams_level )
2450 #endif /* OMP_40_ENABLED */
2451  {
2452  /* Decrement our nested depth level */
2453  KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2454  }
2455  KMP_DEBUG_ASSERT( root->r.r_in_parallel >= 0 );
2456 
2457  KF_TRACE( 10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n",
2458  0, master_th, team ) );
2459  __kmp_pop_current_task_from_thread( master_th );
2460 
2461 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2462  //
2463  // Restore master thread's partition.
2464  //
2465  master_th->th.th_first_place = team->t.t_first_place;
2466  master_th->th.th_last_place = team->t.t_last_place;
2467 #endif /* OMP_40_ENABLED */
2468 
2469  updateHWFPControl (team);
2470 
2471  if ( root->r.r_active != master_active )
2472  root->r.r_active = master_active;
2473 
2474  __kmp_free_team( root, team USE_NESTED_HOT_ARG(master_th) ); // this will free worker threads
2475 
2476  /* This race was fun to find. Make sure the following stays inside the critical
2477  * region; otherwise assertions may fail occasionally, since the old team
2478  * may be reallocated and the hierarchy appears inconsistent. It is
2479  * actually safe to run and won't cause any bugs, but it will trigger those
2480  * assertion failures. It's only one dereference and assignment, so it might
2481  * as well go in the critical region. */
2482  master_th->th.th_team = parent_team;
2483  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2484  master_th->th.th_team_master = parent_team->t.t_threads[0];
2485  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2486 
2487  /* restore serialized team, if need be */
2488  if( parent_team->t.t_serialized &&
2489  parent_team != master_th->th.th_serial_team &&
2490  parent_team != root->r.r_root_team ) {
2491  __kmp_free_team( root, master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL) );
2492  master_th->th.th_serial_team = parent_team;
2493  }
2494 
2495  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2496  // Restore task state from memo stack
2497  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2498  if (master_th->th.th_task_state_top > 0) {
2499  --master_th->th.th_task_state_top; // pop
2500  master_th->th.th_task_state = master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top];
2501  }
2502  // Copy the first task team from the new child / old parent team to the thread and reset state flag.
2503  master_th->th.th_task_team = parent_team->t.t_task_team[master_th->th.th_task_state];
2504 
2505  KA_TRACE( 20, ( "__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2506  __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
2507  parent_team ) );
2508  }
2509 
2510  // TODO: GEH - cannot do this assertion because root thread not set up as executing
2511  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2512  master_th->th.th_current_task->td_flags.executing = 1;
2513 
2514  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2515 
2516 #if OMPT_SUPPORT
2517  if (ompt_status == ompt_status_track_callback) {
2518  __kmp_join_ompt(master_th, parent_team, parallel_id);
2519  }
2520 #endif
2521 
2522  KMP_MB();
2523  KA_TRACE( 20, ("__kmp_join_call: exit T#%d\n", gtid ));
2524 }
2525 
2526 /* ------------------------------------------------------------------------ */
2527 /* ------------------------------------------------------------------------ */
2528 
2529 /* Check whether we should push an internal control record onto the
2530  serial team stack. If so, do it. */
2531 void
2532 __kmp_save_internal_controls ( kmp_info_t * thread )
2533 {
2534 
2535  if ( thread->th.th_team != thread->th.th_serial_team ) {
2536  return;
2537  }
2538  if (thread->th.th_team->t.t_serialized > 1) {
2539  int push = 0;
2540 
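 /* Push a new record only if there is none yet, or if the record on top of the
    stack belongs to a different serialized nesting level than the current one. */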
2541  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2542  push = 1;
2543  } else {
2544  if ( thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2545  thread->th.th_team->t.t_serialized ) {
2546  push = 1;
2547  }
2548  }
2549  if (push) { /* push a record on the serial team's stack */
2550  kmp_internal_control_t * control = (kmp_internal_control_t *) __kmp_allocate(sizeof(kmp_internal_control_t));
2551 
2552  copy_icvs( control, & thread->th.th_current_task->td_icvs );
2553 
2554  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2555 
2556  control->next = thread->th.th_team->t.t_control_stack_top;
2557  thread->th.th_team->t.t_control_stack_top = control;
2558  }
2559  }
2560 }
2561 
2562 /* Changes set_nproc */
2563 void
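/* Presumably invoked on behalf of the omp_set_num_threads() API entry points (the call
   path is an assumption, not shown in this file). Updates the nproc ICV of the calling
   thread and, when safe, shrinks the hot team immediately instead of waiting for the
   next parallel region. */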
2564 __kmp_set_num_threads( int new_nth, int gtid )
2565 {
2566  kmp_info_t *thread;
2567  kmp_root_t *root;
2568 
2569  KF_TRACE( 10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth ));
2570  KMP_DEBUG_ASSERT( __kmp_init_serial );
2571 
2572  if (new_nth < 1)
2573  new_nth = 1;
2574  else if (new_nth > __kmp_max_nth)
2575  new_nth = __kmp_max_nth;
2576 
2577  thread = __kmp_threads[gtid];
2578 
2579  __kmp_save_internal_controls( thread );
2580 
2581  set__nproc( thread, new_nth );
2582 
2583  //
2584  // If this omp_set_num_threads() call will cause the hot team size to be
2585  // reduced (in the absence of a num_threads clause), then reduce it now,
2586  // rather than waiting for the next parallel region.
2587  //
2588  root = thread->th.th_root;
2589  if ( __kmp_init_parallel && ( ! root->r.r_active )
2590  && ( root->r.r_hot_team->t.t_nproc > new_nth )
2591 #if KMP_NESTED_HOT_TEAMS
2592  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2593 #endif
2594  ) {
2595  kmp_team_t *hot_team = root->r.r_hot_team;
2596  int f;
2597 
2598  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2599 
2600 
2601  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2602  int tt_idx;
2603  for (tt_idx=0; tt_idx<2; ++tt_idx) {
2604  kmp_task_team_t *task_team = hot_team->t.t_task_team[tt_idx];
2605  if ( ( task_team != NULL ) && TCR_SYNC_4(task_team->tt.tt_active) ) {
2606  // Signal worker threads (esp. the extra ones) to stop looking for tasks while spin waiting.
2607  // The task teams are reference counted and will be deallocated by the last worker thread.
2608  KMP_DEBUG_ASSERT( hot_team->t.t_nproc > 1 );
2609  TCW_SYNC_4( task_team->tt.tt_active, FALSE );
2610  KMP_MB();
2611  KA_TRACE( 20, ( "__kmp_set_num_threads: setting task_team %p to NULL\n",
2612  &hot_team->t.t_task_team[tt_idx] ) );
2613  hot_team->t.t_task_team[tt_idx] = NULL;
2614  }
2615  else {
2616  KMP_DEBUG_ASSERT( task_team == NULL );
2617  }
2618  }
2619  }
2620 
2621  //
2622  // Release the extra threads we don't need any more.
2623  //
2624  for ( f = new_nth; f < hot_team->t.t_nproc; f++ ) {
2625  KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2626  __kmp_free_thread( hot_team->t.t_threads[f] );
2627  hot_team->t.t_threads[f] = NULL;
2628  }
2629  hot_team->t.t_nproc = new_nth;
2630 #if KMP_NESTED_HOT_TEAMS
2631  if( thread->th.th_hot_teams ) {
2632  KMP_DEBUG_ASSERT( hot_team == thread->th.th_hot_teams[0].hot_team );
2633  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2634  }
2635 #endif
2636 
2637 
2638  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2639 
2640  //
2641  // Update the th_team_nproc field in the threads that are still active.
2642  //
2643  for( f=0 ; f < new_nth; f++ ) {
2644  KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2645  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2646  }
2647  // Special flag indicating an omp_set_num_threads() call
2648  hot_team->t.t_size_changed = -1;
2649  }
2650 
2651 }
2652 
2653 /* Changes max_active_levels */
2654 void
2655 __kmp_set_max_active_levels( int gtid, int max_active_levels )
2656 {
2657  kmp_info_t *thread;
2658 
2659  KF_TRACE( 10, ( "__kmp_set_max_active_levels: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2660  KMP_DEBUG_ASSERT( __kmp_init_serial );
2661 
2662  // validate max_active_levels
2663  if( max_active_levels < 0 ) {
2664  KMP_WARNING( ActiveLevelsNegative, max_active_levels );
2665  // We ignore this call if the user has specified a negative value.
2666  // The current setting won't be changed. The last valid setting will be used.
2667  // A warning will be issued (if warnings are allowed as controlled by the KMP_WARNINGS env var).
2668  KF_TRACE( 10, ( "__kmp_set_max_active_levels: the call is ignored: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2669  return;
2670  }
2671  if( max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT ) {
2672  // it's OK, the max_active_levels is within the valid range: [ 0; KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2673  // We allow a zero value. (implementation defined behavior)
2674  } else {
2675  KMP_WARNING( ActiveLevelsExceedLimit, max_active_levels, KMP_MAX_ACTIVE_LEVELS_LIMIT );
2676  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2677  // Current upper limit is MAX_INT. (implementation defined behavior)
2678  // If the input exceeds the upper limit, we correct the input to be the upper limit. (implementation defined behavior)
2679  // Actually, the flow should never reach this point while the upper limit is MAX_INT.
2680  }
2681  KF_TRACE( 10, ( "__kmp_set_max_active_levels: after validation: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2682 
2683  thread = __kmp_threads[ gtid ];
2684 
2685  __kmp_save_internal_controls( thread );
2686 
2687  set__max_active_levels( thread, max_active_levels );
2688 
2689 }
2690 
2691 /* Gets max_active_levels */
2692 int
2693 __kmp_get_max_active_levels( int gtid )
2694 {
2695  kmp_info_t *thread;
2696 
2697  KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d\n", gtid ) );
2698  KMP_DEBUG_ASSERT( __kmp_init_serial );
2699 
2700  thread = __kmp_threads[ gtid ];
2701  KMP_DEBUG_ASSERT( thread->th.th_current_task );
2702  KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d, curtask=%p, curtask_maxaclevel=%d\n",
2703  gtid, thread->th.th_current_task, thread->th.th_current_task->td_icvs.max_active_levels ) );
2704  return thread->th.th_current_task->td_icvs.max_active_levels;
2705 }
2706 
2707 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2708 void
2709 __kmp_set_schedule( int gtid, kmp_sched_t kind, int chunk )
2710 {
2711  kmp_info_t *thread;
2712 // kmp_team_t *team;
2713 
2714  KF_TRACE( 10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", gtid, (int)kind, chunk ));
2715  KMP_DEBUG_ASSERT( __kmp_init_serial );
2716 
2717  // Check if the kind parameter is valid, correct if needed.
2718  // Valid parameters should fit in one of two intervals - standard or extended:
2719  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2720  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2721  if ( kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2722  ( kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std ) )
2723  {
2724  // TODO: Hint needs attention in case we change the default schedule.
2725  __kmp_msg(
2726  kmp_ms_warning,
2727  KMP_MSG( ScheduleKindOutOfRange, kind ),
2728  KMP_HNT( DefaultScheduleKindUsed, "static, no chunk" ),
2729  __kmp_msg_null
2730  );
2731  kind = kmp_sched_default;
2732  chunk = 0; // ignore chunk value in case of bad kind
2733  }
2734 
2735  thread = __kmp_threads[ gtid ];
2736 
2737  __kmp_save_internal_controls( thread );
2738 
2739  if ( kind < kmp_sched_upper_std ) {
2740  if ( kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK ) {
2741  // differentiate static chunked vs. unchunked:
2742  // chunk should be invalid to indicate unchunked schedule (which is the default)
2743  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2744  } else {
2745  thread->th.th_current_task->td_icvs.sched.r_sched_type = __kmp_sch_map[ kind - kmp_sched_lower - 1 ];
2746  }
2747  } else {
2748  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
2749  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2750  __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
2751  }
2752  if ( kind == kmp_sched_auto ) {
2753  // ignore parameter chunk for schedule auto
2754  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2755  } else {
2756  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2757  }
2758 }
2759 
2760 /* Gets def_sched_var ICV values */
2761 void
2762 __kmp_get_schedule( int gtid, kmp_sched_t * kind, int * chunk )
2763 {
2764  kmp_info_t *thread;
2765  enum sched_type th_type;
2766 
2767  KF_TRACE( 10, ("__kmp_get_schedule: thread %d\n", gtid ));
2768  KMP_DEBUG_ASSERT( __kmp_init_serial );
2769 
2770  thread = __kmp_threads[ gtid ];
2771 
2772  //th_type = thread->th.th_team->t.t_set_sched[ thread->th.th_info.ds.ds_tid ].r_sched_type;
2773  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2774 
2775  switch ( th_type ) {
2776  case kmp_sch_static:
2777  case kmp_sch_static_greedy:
2778  case kmp_sch_static_balanced:
2779  *kind = kmp_sched_static;
2780  *chunk = 0; // chunk was not set; indicate this fact via a zero value
2781  return;
2782  case kmp_sch_static_chunked:
2783  *kind = kmp_sched_static;
2784  break;
2785  case kmp_sch_dynamic_chunked:
2786  *kind = kmp_sched_dynamic;
2787  break;
2789  case kmp_sch_guided_iterative_chunked:
2790  case kmp_sch_guided_analytical_chunked:
2791  *kind = kmp_sched_guided;
2792  break;
2793  case kmp_sch_auto:
2794  *kind = kmp_sched_auto;
2795  break;
2796  case kmp_sch_trapezoidal:
2797  *kind = kmp_sched_trapezoidal;
2798  break;
2799 /*
2800  case kmp_sch_static_steal:
2801  *kind = kmp_sched_static_steal;
2802  break;
2803 */
2804  default:
2805  KMP_FATAL( UnknownSchedulingType, th_type );
2806  }
2807 
2808  //*chunk = thread->th.th_team->t.t_set_sched[ thread->th.th_info.ds.ds_tid ].chunk;
2809  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2810 }
2811 
2812 int
2813 __kmp_get_ancestor_thread_num( int gtid, int level ) {
2814 
2815  int ii, dd;
2816  kmp_team_t *team;
2817  kmp_info_t *thr;
2818 
2819  KF_TRACE( 10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level ));
2820  KMP_DEBUG_ASSERT( __kmp_init_serial );
2821 
2822  // validate level
2823  if( level == 0 ) return 0;
2824  if( level < 0 ) return -1;
2825  thr = __kmp_threads[ gtid ];
2826  team = thr->th.th_team;
2827  ii = team->t.t_level;
2828  if( level > ii ) return -1;
2829 
2830 #if OMP_40_ENABLED
2831  if( thr->th.th_teams_microtask ) {
2832  // AC: we are in a teams region where multiple nested teams have the same level
2833  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2834  if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
2835  KMP_DEBUG_ASSERT( ii >= tlevel );
2836  // AC: Since we need to pass through the teams league, artificially increase ii
2837  if ( ii == tlevel ) {
2838  ii += 2; // three teams have same level
2839  } else {
2840  ii ++; // two teams have same level
2841  }
2842  }
2843  }
2844 #endif
2845 
2846  if( ii == level ) return __kmp_tid_from_gtid( gtid );
2847 
2848  dd = team->t.t_serialized;
2849  level++;
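 /* Walk up toward the requested level: within each team, first consume its serialized
    (inactive) nesting levels, then move on to the parent team. */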
2850  while( ii > level )
2851  {
2852  for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2853  {
2854  }
2855  if( ( team->t.t_serialized ) && ( !dd ) ) {
2856  team = team->t.t_parent;
2857  continue;
2858  }
2859  if( ii > level ) {
2860  team = team->t.t_parent;
2861  dd = team->t.t_serialized;
2862  ii--;
2863  }
2864  }
2865 
2866  return ( dd > 1 ) ? ( 0 ) : ( team->t.t_master_tid );
2867 }
2868 
2869 int
2870 __kmp_get_team_size( int gtid, int level ) {
2871 
2872  int ii, dd;
2873  kmp_team_t *team;
2874  kmp_info_t *thr;
2875 
2876  KF_TRACE( 10, ("__kmp_get_team_size: thread %d %d\n", gtid, level ));
2877  KMP_DEBUG_ASSERT( __kmp_init_serial );
2878 
2879  // validate level
2880  if( level == 0 ) return 1;
2881  if( level < 0 ) return -1;
2882  thr = __kmp_threads[ gtid ];
2883  team = thr->th.th_team;
2884  ii = team->t.t_level;
2885  if( level > ii ) return -1;
2886 
2887 #if OMP_40_ENABLED
2888  if( thr->th.th_teams_microtask ) {
2889  // AC: we are in a teams region where multiple nested teams have the same level
2890  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2891  if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
2892  KMP_DEBUG_ASSERT( ii >= tlevel );
2893  // AC: Since we need to pass through the teams league, artificially increase ii
2894  if ( ii == tlevel ) {
2895  ii += 2; // three teams have same level
2896  } else {
2897  ii ++; // two teams have same level
2898  }
2899  }
2900  }
2901 #endif
2902 
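 /* Same walk as in __kmp_get_ancestor_thread_num: consume the serialized levels of
    each team, then climb to its parent, until the requested level is reached. */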
2903  while( ii > level )
2904  {
2905  for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2906  {
2907  }
2908  if( team->t.t_serialized && ( !dd ) ) {
2909  team = team->t.t_parent;
2910  continue;
2911  }
2912  if( ii > level ) {
2913  team = team->t.t_parent;
2914  ii--;
2915  }
2916  }
2917 
2918  return team->t.t_nproc;
2919 }
2920 
2921 kmp_r_sched_t
2922 __kmp_get_schedule_global() {
2923 // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and (__kmp_static, __kmp_guided)
2924 // may be changed by kmp_set_defaults independently, so the updated schedule can be obtained here.
2925 
2926  kmp_r_sched_t r_sched;
2927 
2928  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, __kmp_guided
2929  // __kmp_sched should keep original value, so that user can set KMP_SCHEDULE multiple times,
2930  // and thus have different run-time schedules in different roots (even in OMP 2.5)
2931  if ( __kmp_sched == kmp_sch_static ) {
2932  r_sched.r_sched_type = __kmp_static; // replace STATIC with more detailed schedule (balanced or greedy)
2933  } else if ( __kmp_sched == kmp_sch_guided_chunked ) {
2934  r_sched.r_sched_type = __kmp_guided; // replace GUIDED with more detailed schedule (iterative or analytical)
2935  } else {
2936  r_sched.r_sched_type = __kmp_sched; // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2937  }
2938 
2939  if ( __kmp_chunk < KMP_DEFAULT_CHUNK ) { // __kmp_chunk may be wrong here (if it was not ever set)
2940  r_sched.chunk = KMP_DEFAULT_CHUNK;
2941  } else {
2942  r_sched.chunk = __kmp_chunk;
2943  }
2944 
2945  return r_sched;
2946 }
2947 
2948 /* ------------------------------------------------------------------------ */
2949 /* ------------------------------------------------------------------------ */
2950 
2951 
2952 /*
2953  * Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
2954  * at least argc number of *t_argv entries for the requested team.
2955  */
2956 static void
2957 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc )
2958 {
2959 
2960  KMP_DEBUG_ASSERT( team );
2961  if( !realloc || argc > team->t.t_max_argc ) {
2962 
2963  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: needed entries=%d, current entries=%d\n",
2964  team->t.t_id, argc, ( realloc ) ? team->t.t_max_argc : 0 ));
2965  /* if previously allocated heap space for args, free them */
2966  if ( realloc && team->t.t_argv != &team->t.t_inline_argv[0] )
2967  __kmp_free( (void *) team->t.t_argv );
2968 
2969  if ( argc <= KMP_INLINE_ARGV_ENTRIES ) {
2970  /* use unused space in the cache line for arguments */
2971  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2972  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: inline allocate %d argv entries\n",
2973  team->t.t_id, team->t.t_max_argc ));
2974  team->t.t_argv = &team->t.t_inline_argv[0];
2975  if ( __kmp_storage_map ) {
2976  __kmp_print_storage_map_gtid( -1, &team->t.t_inline_argv[0],
2977  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
2978  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES),
2979  "team_%d.t_inline_argv",
2980  team->t.t_id );
2981  }
2982  } else {
2983  /* allocate space for arguments in the heap */
2984  team->t.t_max_argc = ( argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1 )) ?
2985  KMP_MIN_MALLOC_ARGV_ENTRIES : 2 * argc;
2986  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: dynamic allocate %d argv entries\n",
2987  team->t.t_id, team->t.t_max_argc ));
2988  team->t.t_argv = (void**) __kmp_page_allocate( sizeof(void*) * team->t.t_max_argc );
2989  if ( __kmp_storage_map ) {
2990  __kmp_print_storage_map_gtid( -1, &team->t.t_argv[0], &team->t.t_argv[team->t.t_max_argc],
2991  sizeof(void *) * team->t.t_max_argc, "team_%d.t_argv",
2992  team->t.t_id );
2993  }
2994  }
2995  }
2996 }
2997 
2998 static void
2999 __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth)
3000 {
3001  int i;
3002  int num_disp_buff = max_nth > 1 ? KMP_MAX_DISP_BUF : 2;
3003 #if KMP_USE_POOLED_ALLOC
3004  // AC: TODO: fix bug here: size of t_disp_buffer should not be multiplied by max_nth!
3005  char *ptr = __kmp_allocate(max_nth *
3006  ( sizeof(kmp_info_t*) + sizeof(dispatch_shared_info_t)*num_disp_buff
3007  + sizeof(kmp_disp_t) + sizeof(int)*6
3008  //+ sizeof(int)
3009  + sizeof(kmp_r_sched_t)
3010  + sizeof(kmp_taskdata_t) ) );
3011 
3012  team->t.t_threads = (kmp_info_t**) ptr; ptr += sizeof(kmp_info_t*) * max_nth;
3013  team->t.t_disp_buffer = (dispatch_shared_info_t*) ptr;
3014  ptr += sizeof(dispatch_shared_info_t) * num_disp_buff;
3015  team->t.t_dispatch = (kmp_disp_t*) ptr; ptr += sizeof(kmp_disp_t) * max_nth;
3016  team->t.t_set_nproc = (int*) ptr; ptr += sizeof(int) * max_nth;
3017  team->t.t_set_dynamic = (int*) ptr; ptr += sizeof(int) * max_nth;
3018  team->t.t_set_nested = (int*) ptr; ptr += sizeof(int) * max_nth;
3019  team->t.t_set_blocktime = (int*) ptr; ptr += sizeof(int) * max_nth;
3020  team->t.t_set_bt_intervals = (int*) ptr; ptr += sizeof(int) * max_nth;
3021  team->t.t_set_bt_set = (int*) ptr;
3022  ptr += sizeof(int) * max_nth;
3023  //team->t.t_set_max_active_levels = (int*) ptr; ptr += sizeof(int) * max_nth;
3024  team->t.t_set_sched = (kmp_r_sched_t*) ptr;
3025  ptr += sizeof(kmp_r_sched_t) * max_nth;
3026  team->t.t_implicit_task_taskdata = (kmp_taskdata_t*) ptr;
3027  ptr += sizeof(kmp_taskdata_t) * max_nth;
3028 #else
3029 
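 /* Default (non-pooled) path: each per-team array is allocated separately from the
    runtime heap; the dispatch buffers are shared by all threads of the team. */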
3030  team->t.t_threads = (kmp_info_t**) __kmp_allocate( sizeof(kmp_info_t*) * max_nth );
3031  team->t.t_disp_buffer = (dispatch_shared_info_t*)
3032  __kmp_allocate( sizeof(dispatch_shared_info_t) * num_disp_buff );
3033  team->t.t_dispatch = (kmp_disp_t*) __kmp_allocate( sizeof(kmp_disp_t) * max_nth );
3034  //team->t.t_set_max_active_levels = (int*) __kmp_allocate( sizeof(int) * max_nth );
3035  //team->t.t_set_sched = (kmp_r_sched_t*) __kmp_allocate( sizeof(kmp_r_sched_t) * max_nth );
3036  team->t.t_implicit_task_taskdata = (kmp_taskdata_t*) __kmp_allocate( sizeof(kmp_taskdata_t) * max_nth );
3037 #endif
3038  team->t.t_max_nproc = max_nth;
3039 
3040  /* setup dispatch buffers */
3041  for(i = 0 ; i < num_disp_buff; ++i)
3042  team->t.t_disp_buffer[i].buffer_index = i;
3043 }
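/*
   Editorial note (hedged sketch, not part of the original source): num_disp_buff
   above picks how many dispatch_shared_info_t buffers the team cycles through.
   A team that can run more than one thread gets KMP_MAX_DISP_BUF buffers so that
   several dynamically scheduled worksharing constructs can be in flight before a
   buffer must be recycled; a single-thread team only needs 2.  Assuming the
   buffers are reused round-robin keyed off buffer_index (initialized in the loop
   above), selection looks roughly like:

       dispatch_shared_info_t *sh = &team->t.t_disp_buffer[ my_index % num_disp_buff ];
*/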
3044 
3045 static void
3046 __kmp_free_team_arrays(kmp_team_t *team) {
3047  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3048  int i;
3049  for ( i = 0; i < team->t.t_max_nproc; ++ i ) {
3050  if ( team->t.t_dispatch[ i ].th_disp_buffer != NULL ) {
3051  __kmp_free( team->t.t_dispatch[ i ].th_disp_buffer );
3052  team->t.t_dispatch[ i ].th_disp_buffer = NULL;
3053  }; // if
3054  }; // for
3055  __kmp_free(team->t.t_threads);
3056  #if !KMP_USE_POOLED_ALLOC
3057  __kmp_free(team->t.t_disp_buffer);
3058  __kmp_free(team->t.t_dispatch);
3059  //__kmp_free(team->t.t_set_max_active_levels);
3060  //__kmp_free(team->t.t_set_sched);
3061  __kmp_free(team->t.t_implicit_task_taskdata);
3062  #endif
3063  team->t.t_threads = NULL;
3064  team->t.t_disp_buffer = NULL;
3065  team->t.t_dispatch = NULL;
3066  //team->t.t_set_sched = 0;
3067  //team->t.t_set_max_active_levels = 0;
3068  team->t.t_implicit_task_taskdata = 0;
3069 }
3070 
3071 static void
3072 __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3073  kmp_info_t **oldThreads = team->t.t_threads;
3074 
3075  #if !KMP_USE_POOLED_ALLOC
3076  __kmp_free(team->t.t_disp_buffer);
3077  __kmp_free(team->t.t_dispatch);
3078  //__kmp_free(team->t.t_set_max_active_levels);
3079  //__kmp_free(team->t.t_set_sched);
3080  __kmp_free(team->t.t_implicit_task_taskdata);
3081  #endif
3082  __kmp_allocate_team_arrays(team, max_nth);
3083 
3084  KMP_MEMCPY(team->t.t_threads, oldThreads, team->t.t_nproc * sizeof (kmp_info_t*));
3085 
3086  __kmp_free(oldThreads);
3087 }
3088 
3089 static kmp_internal_control_t
3090 __kmp_get_global_icvs( void ) {
3091 
3092  kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3093 
3094 #if OMP_40_ENABLED
3095  KMP_DEBUG_ASSERT( __kmp_nested_proc_bind.used > 0 );
3096 #endif /* OMP_40_ENABLED */
3097 
3098  kmp_internal_control_t g_icvs = {
3099  0, //int serial_nesting_level; //corresponds to the value of the th_team_serialized field
3100  (kmp_int8)__kmp_dflt_nested, //int nested; //internal control for nested parallelism (per thread)
3101  (kmp_int8)__kmp_global.g.g_dynamic, //internal control for dynamic adjustment of threads (per thread)
3102  (kmp_int8)__kmp_env_blocktime, //int bt_set; //internal control for whether blocktime is explicitly set
3103  __kmp_dflt_blocktime, //int blocktime; //internal control for blocktime
3104  __kmp_bt_intervals, //int bt_intervals; //internal control for blocktime intervals
3105  __kmp_dflt_team_nth, //int nproc; //internal control for # of threads for next parallel region (per thread)
3106  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3107  __kmp_dflt_max_active_levels, //int max_active_levels; //internal control for max_active_levels
3108  r_sched, //kmp_r_sched_t sched; //internal control for runtime schedule {sched,chunk} pair
3109 #if OMP_40_ENABLED
3110  __kmp_nested_proc_bind.bind_types[0],
3111 #endif /* OMP_40_ENABLED */
3112  NULL //struct kmp_internal_control *next;
3113  };
3114 
3115  return g_icvs;
3116 }
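/*
   Editorial note (hedged summary, not part of the original source): the globals
   gathered into g_icvs above are normally filled in from the environment by the
   settings code (kmp_settings.c), e.g. __kmp_dflt_nested from OMP_NESTED,
   __kmp_global.g.g_dynamic from OMP_DYNAMIC, __kmp_dflt_blocktime and
   __kmp_env_blocktime from KMP_BLOCKTIME, __kmp_dflt_team_nth from
   OMP_NUM_THREADS, __kmp_dflt_max_active_levels from OMP_MAX_ACTIVE_LEVELS, and
   __kmp_nested_proc_bind.bind_types[0] from OMP_PROC_BIND.  The exact mapping is
   owned by the settings code and is only summarized here.
*/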
3117 
3118 static kmp_internal_control_t
3119 __kmp_get_x_global_icvs( const kmp_team_t *team ) {
3120 
3121  kmp_internal_control_t gx_icvs;
3122  gx_icvs.serial_nesting_level = 0; // probably =team->t.t_serial like in save_inter_controls
3123  copy_icvs( & gx_icvs, & team->t.t_threads[0]->th.th_current_task->td_icvs );
3124  gx_icvs.next = NULL;
3125 
3126  return gx_icvs;
3127 }
3128 
3129 static void
3130 __kmp_initialize_root( kmp_root_t *root )
3131 {
3132  int f;
3133  kmp_team_t *root_team;
3134  kmp_team_t *hot_team;
3135  int hot_team_max_nth;
3136  kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3137  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3138  KMP_DEBUG_ASSERT( root );
3139  KMP_ASSERT( ! root->r.r_begin );
3140 
3141  /* setup the root state structure */
3142  __kmp_init_lock( &root->r.r_begin_lock );
3143  root->r.r_begin = FALSE;
3144  root->r.r_active = FALSE;
3145  root->r.r_in_parallel = 0;
3146  root->r.r_blocktime = __kmp_dflt_blocktime;
3147  root->r.r_nested = __kmp_dflt_nested;
3148 
3149  /* setup the root team for this task */
3150  /* allocate the root team structure */
3151  KF_TRACE( 10, ( "__kmp_initialize_root: before root_team\n" ) );
3152 
3153  root_team =
3154  __kmp_allocate_team(
3155  root,
3156  1, // new_nproc
3157  1, // max_nproc
3158 #if OMPT_SUPPORT
3159  0, // root parallel id
3160 #endif
3161 #if OMP_40_ENABLED
3162  __kmp_nested_proc_bind.bind_types[0],
3163 #endif
3164  &r_icvs,
3165  0 // argc
3166  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3167  );
3168 #if USE_DEBUGGER
3169  // Non-NULL value should be assigned to make the debugger display the root team.
3170  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)( ~ 0 ));
3171 #endif
3172 
3173  KF_TRACE( 10, ( "__kmp_initialize_root: after root_team = %p\n", root_team ) );
3174 
3175  root->r.r_root_team = root_team;
3176  root_team->t.t_control_stack_top = NULL;
3177 
3178  /* initialize root team */
3179  root_team->t.t_threads[0] = NULL;
3180  root_team->t.t_nproc = 1;
3181  root_team->t.t_serialized = 1;
3182  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3183  root_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3184  root_team->t.t_sched.chunk = r_sched.chunk;
3185  KA_TRACE( 20, ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3186  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
3187 
3188  /* setup the hot team for this task */
3189  /* allocate the hot team structure */
3190  KF_TRACE( 10, ( "__kmp_initialize_root: before hot_team\n" ) );
3191 
3192  hot_team =
3193  __kmp_allocate_team(
3194  root,
3195  1, // new_nproc
3196  __kmp_dflt_team_nth_ub * 2, // max_nproc
3197 #if OMPT_SUPPORT
3198  0, // root parallel id
3199 #endif
3200 #if OMP_40_ENABLED
3201  __kmp_nested_proc_bind.bind_types[0],
3202 #endif
3203  &r_icvs,
3204  0 // argc
3205  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3206  );
3207  KF_TRACE( 10, ( "__kmp_initialize_root: after hot_team = %p\n", hot_team ) );
3208 
3209  root->r.r_hot_team = hot_team;
3210  root_team->t.t_control_stack_top = NULL;
3211 
3212  /* first-time initialization */
3213  hot_team->t.t_parent = root_team;
3214 
3215  /* initialize hot team */
3216  hot_team_max_nth = hot_team->t.t_max_nproc;
3217  for ( f = 0; f < hot_team_max_nth; ++ f ) {
3218  hot_team->t.t_threads[ f ] = NULL;
3219  }; // for
3220  hot_team->t.t_nproc = 1;
3221  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3222  hot_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3223  hot_team->t.t_sched.chunk = r_sched.chunk;
3224  hot_team->t.t_size_changed = 0;
3225 
3226 }
3227 
3228 #ifdef KMP_DEBUG
3229 
3230 
3231 typedef struct kmp_team_list_item {
3232  kmp_team_p const * entry;
3233  struct kmp_team_list_item * next;
3234 } kmp_team_list_item_t;
3235 typedef kmp_team_list_item_t * kmp_team_list_t;
3236 
3237 
3238 static void
3239 __kmp_print_structure_team_accum( // Add team to list of teams.
3240  kmp_team_list_t list, // List of teams.
3241  kmp_team_p const * team // Team to add.
3242 ) {
3243 
3244  // List must terminate with item where both entry and next are NULL.
3245  // Team is added to the list only once.
3246  // List is sorted in ascending order by team id.
3247  // Team id is *not* a key.
3248 
3249  kmp_team_list_t l;
3250 
3251  KMP_DEBUG_ASSERT( list != NULL );
3252  if ( team == NULL ) {
3253  return;
3254  }; // if
3255 
3256  __kmp_print_structure_team_accum( list, team->t.t_parent );
3257  __kmp_print_structure_team_accum( list, team->t.t_next_pool );
3258 
3259  // Search list for the team.
3260  l = list;
3261  while ( l->next != NULL && l->entry != team ) {
3262  l = l->next;
3263  }; // while
3264  if ( l->next != NULL ) {
3265  return; // Team has been added before, exit.
3266  }; // if
3267 
3268  // Team is not found. Search list again for insertion point.
3269  l = list;
3270  while ( l->next != NULL && l->entry->t.t_id <= team->t.t_id ) {
3271  l = l->next;
3272  }; // while
3273 
3274  // Insert team.
3275  {
3276  kmp_team_list_item_t * item =
3277  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
3278  * item = * l;
3279  l->entry = team;
3280  l->next = item;
3281  }
3282 
3283 }
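/*
   Editorial sketch (not part of the original source): the insertion above uses a
   sentinel trick so no "previous node" pointer is needed.  To insert `team`
   before node l, the new item takes over l's old contents and l is rewritten in
   place:

       *item    = *l;     // item now holds l's old (entry, next)
       l->entry = team;   // l becomes the node for `team`
       l->next  = item;   // l's old contents follow it

   As a result the list always ends with a node whose entry and next are both
   NULL, which is exactly the terminator the search loops above test for.
*/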
3284 
3285 static void
3286 __kmp_print_structure_team(
3287  char const * title,
3288  kmp_team_p const * team
3289 
3290 ) {
3291  __kmp_printf( "%s", title );
3292  if ( team != NULL ) {
3293  __kmp_printf( "%2x %p\n", team->t.t_id, team );
3294  } else {
3295  __kmp_printf( " - (nil)\n" );
3296  }; // if
3297 }
3298 
3299 static void
3300 __kmp_print_structure_thread(
3301  char const * title,
3302  kmp_info_p const * thread
3303 
3304 ) {
3305  __kmp_printf( "%s", title );
3306  if ( thread != NULL ) {
3307  __kmp_printf( "%2d %p\n", thread->th.th_info.ds.ds_gtid, thread );
3308  } else {
3309  __kmp_printf( " - (nil)\n" );
3310  }; // if
3311 }
3312 
3313 void
3314 __kmp_print_structure(
3315  void
3316 ) {
3317 
3318  kmp_team_list_t list;
3319 
3320  // Initialize list of teams.
3321  list = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
3322  list->entry = NULL;
3323  list->next = NULL;
3324 
3325  __kmp_printf( "\n------------------------------\nGlobal Thread Table\n------------------------------\n" );
3326  {
3327  int gtid;
3328  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3329  __kmp_printf( "%2d", gtid );
3330  if ( __kmp_threads != NULL ) {
3331  __kmp_printf( " %p", __kmp_threads[ gtid ] );
3332  }; // if
3333  if ( __kmp_root != NULL ) {
3334  __kmp_printf( " %p", __kmp_root[ gtid ] );
3335  }; // if
3336  __kmp_printf( "\n" );
3337  }; // for gtid
3338  }
3339 
3340  // Print out __kmp_threads array.
3341  __kmp_printf( "\n------------------------------\nThreads\n------------------------------\n" );
3342  if ( __kmp_threads != NULL ) {
3343  int gtid;
3344  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3345  kmp_info_t const * thread = __kmp_threads[ gtid ];
3346  if ( thread != NULL ) {
3347  __kmp_printf( "GTID %2d %p:\n", gtid, thread );
3348  __kmp_printf( " Our Root: %p\n", thread->th.th_root );
3349  __kmp_print_structure_team( " Our Team: ", thread->th.th_team );
3350  __kmp_print_structure_team( " Serial Team: ", thread->th.th_serial_team );
3351  __kmp_printf( " Threads: %2d\n", thread->th.th_team_nproc );
3352  __kmp_print_structure_thread( " Master: ", thread->th.th_team_master );
3353  __kmp_printf( " Serialized?: %2d\n", thread->th.th_team_serialized );
3354  __kmp_printf( " Set NProc: %2d\n", thread->th.th_set_nproc );
3355 #if OMP_40_ENABLED
3356  __kmp_printf( " Set Proc Bind: %2d\n", thread->th.th_set_proc_bind );
3357 #endif
3358  __kmp_print_structure_thread( " Next in pool: ", thread->th.th_next_pool );
3359  __kmp_printf( "\n" );
3360  __kmp_print_structure_team_accum( list, thread->th.th_team );
3361  __kmp_print_structure_team_accum( list, thread->th.th_serial_team );
3362  }; // if
3363  }; // for gtid
3364  } else {
3365  __kmp_printf( "Threads array is not allocated.\n" );
3366  }; // if
3367 
3368  // Print out __kmp_root array.
3369  __kmp_printf( "\n------------------------------\nUbers\n------------------------------\n" );
3370  if ( __kmp_root != NULL ) {
3371  int gtid;
3372  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3373  kmp_root_t const * root = __kmp_root[ gtid ];
3374  if ( root != NULL ) {
3375  __kmp_printf( "GTID %2d %p:\n", gtid, root );
3376  __kmp_print_structure_team( " Root Team: ", root->r.r_root_team );
3377  __kmp_print_structure_team( " Hot Team: ", root->r.r_hot_team );
3378  __kmp_print_structure_thread( " Uber Thread: ", root->r.r_uber_thread );
3379  __kmp_printf( " Active?: %2d\n", root->r.r_active );
3380  __kmp_printf( " Nested?: %2d\n", root->r.r_nested );
3381  __kmp_printf( " In Parallel: %2d\n", root->r.r_in_parallel );
3382  __kmp_printf( "\n" );
3383  __kmp_print_structure_team_accum( list, root->r.r_root_team );
3384  __kmp_print_structure_team_accum( list, root->r.r_hot_team );
3385  }; // if
3386  }; // for gtid
3387  } else {
3388  __kmp_printf( "Ubers array is not allocated.\n" );
3389  }; // if
3390 
3391  __kmp_printf( "\n------------------------------\nTeams\n------------------------------\n" );
3392  while ( list->next != NULL ) {
3393  kmp_team_p const * team = list->entry;
3394  int i;
3395  __kmp_printf( "Team %2x %p:\n", team->t.t_id, team );
3396  __kmp_print_structure_team( " Parent Team: ", team->t.t_parent );
3397  __kmp_printf( " Master TID: %2d\n", team->t.t_master_tid );
3398  __kmp_printf( " Max threads: %2d\n", team->t.t_max_nproc );
3399  __kmp_printf( " Levels of serial: %2d\n", team->t.t_serialized );
3400  __kmp_printf( " Number threads: %2d\n", team->t.t_nproc );
3401  for ( i = 0; i < team->t.t_nproc; ++ i ) {
3402  __kmp_printf( " Thread %2d: ", i );
3403  __kmp_print_structure_thread( "", team->t.t_threads[ i ] );
3404  }; // for i
3405  __kmp_print_structure_team( " Next in pool: ", team->t.t_next_pool );
3406  __kmp_printf( "\n" );
3407  list = list->next;
3408  }; // while
3409 
3410  // Print out __kmp_thread_pool and __kmp_team_pool.
3411  __kmp_printf( "\n------------------------------\nPools\n------------------------------\n" );
3412  __kmp_print_structure_thread( "Thread pool: ", (kmp_info_t *)__kmp_thread_pool );
3413  __kmp_print_structure_team( "Team pool: ", (kmp_team_t *)__kmp_team_pool );
3414  __kmp_printf( "\n" );
3415 
3416  // Free team list.
3417  while ( list != NULL ) {
3418  kmp_team_list_item_t * item = list;
3419  list = list->next;
3420  KMP_INTERNAL_FREE( item );
3421  }; // while
3422 
3423 }
3424 
3425 #endif
3426 
3427 
3428 //---------------------------------------------------------------------------
3429 // Stuff for per-thread fast random number generator
3430 // Table of primes
3431 
3432 static const unsigned __kmp_primes[] = {
3433  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5,
3434  0xba5703f5, 0xb495a877, 0xe1626741, 0x79695e6b,
3435  0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3436  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b,
3437  0xbe4d6fe9, 0x5f15e201, 0x99afc3fd, 0xf3f16801,
3438  0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3439  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed,
3440  0x085a3d61, 0x46eb5ea7, 0x3d9910ed, 0x2e687b5b,
3441  0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3442  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7,
3443  0x54581edb, 0xf2480f45, 0x0bb9288f, 0xef1affc7,
3444  0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3445  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b,
3446  0xfc411073, 0xc3749363, 0xb892d829, 0x3549366b,
3447  0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3448  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f
3449 };
3450 
3451 //---------------------------------------------------------------------------
3452 // __kmp_get_random: Get a random number using a linear congruential method.
3453 
3454 unsigned short
3455 __kmp_get_random( kmp_info_t * thread )
3456 {
3457  unsigned x = thread->th.th_x;
3458  unsigned short r = x>>16;
3459 
3460  thread->th.th_x = x*thread->th.th_a+1;
3461 
3462  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3463  thread->th.th_info.ds.ds_tid, r) );
3464 
3465  return r;
3466 }
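/*
   Editorial sketch (not part of the original source): the generator above is a
   per-thread linear congruential generator with modulus 2^32 (unsigned wrap),
   increment 1 and a per-thread multiplier th_a drawn from __kmp_primes:

       x[n+1] = th_a * x[n] + 1   (mod 2^32)

   Only the high 16 bits of the state are returned, because the low-order bits of
   a power-of-two-modulus LCG have short periods.  Two consecutive calls behave
   roughly like:

       unsigned x = thread->th.th_x;
       unsigned short r1 = x >> 16;                 // first value returned
       x = x * thread->th.th_a + 1;                 // state update
       unsigned short r2 = x >> 16;                 // second value returned
*/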
3467 //--------------------------------------------------------
3468 // __kmp_init_random: Initialize a random number generator
3469 
3470 void
3471 __kmp_init_random( kmp_info_t * thread )
3472 {
3473  unsigned seed = thread->th.th_info.ds.ds_tid;
3474 
3475  thread->th.th_a = __kmp_primes[seed%(sizeof(__kmp_primes)/sizeof(__kmp_primes[0]))];
3476  thread->th.th_x = (seed+1)*thread->th.th_a+1;
3477  KA_TRACE(30, ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a) );
3478 }
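/*
   Editorial worked example (not part of the original source): seeding keys both
   the multiplier and the initial state off the thread id, so workers draw from
   different LCG streams.  For tid == 0:

       th_a = __kmp_primes[0 % 64]   == 0x9e3779b1
       th_x = (0 + 1) * th_a + 1     == 0x9e3779b2

   Tids 0..63 get distinct multipliers; tids 64 apart reuse the same multiplier
   but start from different states.
*/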
3479 
3480 
3481 #if KMP_OS_WINDOWS
3482 /* reclaim array entries for root threads that are already dead, returns number reclaimed */
3483 static int
3484 __kmp_reclaim_dead_roots(void) {
3485  int i, r = 0;
3486 
3487  for(i = 0; i < __kmp_threads_capacity; ++i) {
3488  if( KMP_UBER_GTID( i ) &&
3489  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3490  !__kmp_root[i]->r.r_active ) { // AC: reclaim only roots died in non-active state
3491  r += __kmp_unregister_root_other_thread(i);
3492  }
3493  }
3494  return r;
3495 }
3496 #endif
3497 
3498 /*
3499  This function attempts to create free entries in __kmp_threads and __kmp_root, and returns the number of
3500  free entries generated.
3501 
3502  For Windows* OS static library, the first mechanism used is to reclaim array entries for root threads that are
3503  already dead.
3504 
3505  On all platforms, expansion is attempted on the arrays __kmp_threads and __kmp_root, with appropriate
3506  update to __kmp_threads_capacity. Array capacity is increased by doubling with clipping to
3507  __kmp_tp_capacity, if threadprivate cache array has been created.
3508  Synchronization with __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3509 
3510  After any dead root reclamation, if the clipping value allows array expansion to result in the generation
3511  of a total of nWish free slots, the function does that expansion. If not, but the clipping value allows
3512  array expansion to result in the generation of a total of nNeed free slots, the function does that expansion.
3513  Otherwise, nothing is done beyond the possible initial root thread reclamation. However, if nNeed is zero,
3514  a best-effort attempt is made to fulfil nWish as far as possible, i.e. the function will attempt to create
3515  as many free slots as possible up to nWish.
3516 
3517  If any argument is negative, the behavior is undefined.
3518 */
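/*
   Editorial worked example (not part of the original source), assuming
   __kmp_threads_capacity == 64, no threadprivate cache (so the clipping value is
   __kmp_sys_max_nth, say 32768), nWish == 3 and nNeed == 1: the headroom
   32768 - 64 easily covers nWish, so minimumRequiredCapacity becomes 67 and the
   capacity is doubled once, 64 -> 128, creating 64 new free slots.  If instead
   the clipping value were 65, nWish could not be satisfied but nNeed (1) still
   could, and the arrays would grow only to 65.
*/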
3519 static int
3520 __kmp_expand_threads(int nWish, int nNeed) {
3521  int added = 0;
3522  int old_tp_cached;
3523  int __kmp_actual_max_nth;
3524 
3525  if(nNeed > nWish) /* normalize the arguments */
3526  nWish = nNeed;
3527 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3528 /* only for Windows static library */
3529  /* reclaim array entries for root threads that are already dead */
3530  added = __kmp_reclaim_dead_roots();
3531 
3532  if(nNeed) {
3533  nNeed -= added;
3534  if(nNeed < 0)
3535  nNeed = 0;
3536  }
3537  if(nWish) {
3538  nWish -= added;
3539  if(nWish < 0)
3540  nWish = 0;
3541  }
3542 #endif
3543  if(nWish <= 0)
3544  return added;
3545 
3546  while(1) {
3547  int nTarget;
3548  int minimumRequiredCapacity;
3549  int newCapacity;
3550  kmp_info_t **newThreads;
3551  kmp_root_t **newRoot;
3552 
3553  //
3554  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth.
3555  // If __kmp_max_nth is set to some value less than __kmp_sys_max_nth
3556  // by the user via OMP_THREAD_LIMIT, then __kmp_threads_capacity may
3557  // become > __kmp_max_nth in one of two ways:
3558  //
3559  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3560  // may not be reused by another thread, so we may need to increase
3561  // __kmp_threads_capacity to __kmp_max_threads + 1.
3562  //
3563  // 2) New foreign root(s) are encountered. We always register new
3564  // foreign roots. This may cause a smaller # of threads to be
3565  // allocated at subsequent parallel regions, but the worker threads
3566  // hang around (and eventually go to sleep) and need slots in the
3567  // __kmp_threads[] array.
3568  //
3569  // Anyway, that is the reason for moving the check to see if
3570  // __kmp_max_threads was exceeded into __kmp_reserve_threads()
3571  // instead of having it performed here. -BB
3572  //
3573  old_tp_cached = __kmp_tp_cached;
3574  __kmp_actual_max_nth = old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
3575  KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
3576 
3577  /* compute expansion headroom to check if we can expand and whether to aim for nWish or nNeed */
3578  nTarget = nWish;
3579  if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3580  /* can't fulfil nWish, so try nNeed */
3581  if(nNeed) {
3582  nTarget = nNeed;
3583  if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3584  /* possible expansion too small -- give up */
3585  break;
3586  }
3587  } else {
3588  /* best-effort */
3589  nTarget = __kmp_actual_max_nth - __kmp_threads_capacity;
3590  if(!nTarget) {
3591  /* can't expand at all -- give up */
3592  break;
3593  }
3594  }
3595  }
3596  minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
3597 
3598  newCapacity = __kmp_threads_capacity;
3599  do{
3600  newCapacity =
3601  newCapacity <= (__kmp_actual_max_nth >> 1) ?
3602  (newCapacity << 1) :
3603  __kmp_actual_max_nth;
3604  } while(newCapacity < minimumRequiredCapacity);
3605  newThreads = (kmp_info_t**) __kmp_allocate((sizeof(kmp_info_t*) + sizeof(kmp_root_t*)) * newCapacity + CACHE_LINE);
3606  newRoot = (kmp_root_t**) ((char*)newThreads + sizeof(kmp_info_t*) * newCapacity );
3607  KMP_MEMCPY(newThreads, __kmp_threads, __kmp_threads_capacity * sizeof(kmp_info_t*));
3608  KMP_MEMCPY(newRoot, __kmp_root, __kmp_threads_capacity * sizeof(kmp_root_t*));
3609  memset(newThreads + __kmp_threads_capacity, 0,
3610  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t*));
3611  memset(newRoot + __kmp_threads_capacity, 0,
3612  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t*));
3613 
3614  if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3615  /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has allocated a threadprivate cache
3616  while we were allocating the expanded array, and our new capacity is larger than the threadprivate
3617  cache capacity, so we should deallocate the expanded arrays and try again. This is the first check
3618  of a double-check pair.
3619  */
3620  __kmp_free(newThreads);
3621  continue; /* start over and try again */
3622  }
3623  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3624  if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3625  /* Same check as above, but this time with the lock so we can be sure if we can succeed. */
3626  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3627  __kmp_free(newThreads);
3628  continue; /* start over and try again */
3629  } else {
3630  /* success */
3631  // __kmp_free( __kmp_threads ); // ATT: It leads to a crash. Needs to be investigated.
3632  //
3633  *(kmp_info_t**volatile*)&__kmp_threads = newThreads;
3634  *(kmp_root_t**volatile*)&__kmp_root = newRoot;
3635  added += newCapacity - __kmp_threads_capacity;
3636  *(volatile int*)&__kmp_threads_capacity = newCapacity;
3637  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3638  break; /* succeeded, so we can exit the loop */
3639  }
3640  }
3641  return added;
3642 }
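/*
   Editorial note (not part of the original source): the tail of the loop above is
   a double-checked guard against a race with __kmpc_threadprivate_cached.  The
   first __kmp_tp_cached test runs without __kmp_tp_cached_lock only to avoid
   taking the lock when a conflicting threadprivate cache has obviously appeared;
   the second test, under the lock, is the one that decides.  If a cache was
   created concurrently and the new capacity exceeds __kmp_tp_capacity, the
   freshly allocated arrays are freed and the sizing is recomputed from scratch.
*/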
3643 
3644 /* register the current thread as a root thread and obtain our gtid */
3645 /* we must have the __kmp_initz_lock held at this point */
3646 /* Argument TRUE only if we are the thread that calls from __kmp_do_serial_initialize() */
3647 int
3648 __kmp_register_root( int initial_thread )
3649 {
3650  kmp_info_t *root_thread;
3651  kmp_root_t *root;
3652  int gtid;
3653  int capacity;
3654  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3655  KA_TRACE( 20, ("__kmp_register_root: entered\n"));
3656  KMP_MB();
3657 
3658 
3659  /*
3660  2007-03-02:
3661 
3662  If the initial thread did not invoke the OpenMP RTL yet, and this thread is not the initial one,
3663  the "__kmp_all_nth >= __kmp_threads_capacity" condition does not work as expected -- it may
3664  return false (meaning there is at least one empty slot in the __kmp_threads array), but it
3665  is possible the only free slot is #0, which is reserved for the initial thread and so cannot be
3666  used for this one. The following code works around this bug.
3667 
3668  However, the right solution seems to be not reserving slot #0 for the initial thread, because:
3669  (1) there is no magic in slot #0, and
3670  (2) we cannot detect the initial thread reliably (the first thread that does serial
3671  initialization may not be the real initial thread).
3672  */
3673  capacity = __kmp_threads_capacity;
3674  if ( ! initial_thread && TCR_PTR(__kmp_threads[0]) == NULL ) {
3675  -- capacity;
3676  }; // if
3677 
3678  /* see if there are too many threads */
3679  if ( __kmp_all_nth >= capacity && !__kmp_expand_threads( 1, 1 ) ) {
3680  if ( __kmp_tp_cached ) {
3681  __kmp_msg(
3682  kmp_ms_fatal,
3683  KMP_MSG( CantRegisterNewThread ),
3684  KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
3685  KMP_HNT( PossibleSystemLimitOnThreads ),
3686  __kmp_msg_null
3687  );
3688  }
3689  else {
3690  __kmp_msg(
3691  kmp_ms_fatal,
3692  KMP_MSG( CantRegisterNewThread ),
3693  KMP_HNT( SystemLimitOnThreads ),
3694  __kmp_msg_null
3695  );
3696  }
3697  }; // if
3698 
3699  /* find an available thread slot */
3700  /* Don't reassign the zero slot since we need that to only be used by initial
3701  thread */
3702  for( gtid=(initial_thread ? 0 : 1) ; TCR_PTR(__kmp_threads[gtid]) != NULL ; gtid++ )
3703  ;
3704  KA_TRACE( 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid ));
3705  KMP_ASSERT( gtid < __kmp_threads_capacity );
3706 
3707  /* update global accounting */
3708  __kmp_all_nth ++;
3709  TCW_4(__kmp_nth, __kmp_nth + 1);
3710 
3711  //
3712  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
3713  // for low numbers of procs, and method #2 (keyed API call) for higher
3714  // numbers of procs.
3715  //
3716  if ( __kmp_adjust_gtid_mode ) {
3717  if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
3718  if ( TCR_4(__kmp_gtid_mode) != 2) {
3719  TCW_4(__kmp_gtid_mode, 2);
3720  }
3721  }
3722  else {
3723  if (TCR_4(__kmp_gtid_mode) != 1 ) {
3724  TCW_4(__kmp_gtid_mode, 1);
3725  }
3726  }
3727  }
3728 
3729 #ifdef KMP_ADJUST_BLOCKTIME
3730  /* Adjust blocktime to zero if necessary */
3731  /* Middle initialization might not have occurred yet */
3732  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
3733  if ( __kmp_nth > __kmp_avail_proc ) {
3734  __kmp_zero_bt = TRUE;
3735  }
3736  }
3737 #endif /* KMP_ADJUST_BLOCKTIME */
3738 
3739  /* setup this new hierarchy */
3740  if( ! ( root = __kmp_root[gtid] )) {
3741  root = __kmp_root[gtid] = (kmp_root_t*) __kmp_allocate( sizeof(kmp_root_t) );
3742  KMP_DEBUG_ASSERT( ! root->r.r_root_team );
3743  }
3744 
3745  __kmp_initialize_root( root );
3746 
3747  /* setup new root thread structure */
3748  if( root->r.r_uber_thread ) {
3749  root_thread = root->r.r_uber_thread;
3750  } else {
3751  root_thread = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
3752  if ( __kmp_storage_map ) {
3753  __kmp_print_thread_storage_map( root_thread, gtid );
3754  }
3755  root_thread->th.th_info .ds.ds_gtid = gtid;
3756  root_thread->th.th_root = root;
3757  if( __kmp_env_consistency_check ) {
3758  root_thread->th.th_cons = __kmp_allocate_cons_stack( gtid );
3759  }
3760  #if USE_FAST_MEMORY
3761  __kmp_initialize_fast_memory( root_thread );
3762  #endif /* USE_FAST_MEMORY */
3763 
3764  #if KMP_USE_BGET
3765  KMP_DEBUG_ASSERT( root_thread->th.th_local.bget_data == NULL );
3766  __kmp_initialize_bget( root_thread );
3767  #endif
3768  __kmp_init_random( root_thread ); // Initialize random number generator
3769  }
3770 
3771  /* setup the serial team held in reserve by the root thread */
3772  if( ! root_thread->th.th_serial_team ) {
3773  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3774  KF_TRACE( 10, ( "__kmp_register_root: before serial_team\n" ) );
3775 
3776  root_thread->th.th_serial_team = __kmp_allocate_team( root, 1, 1,
3777 #if OMPT_SUPPORT
3778  0, // root parallel id
3779 #endif
3780 #if OMP_40_ENABLED
3781  proc_bind_default,
3782 #endif
3783  &r_icvs,
3784  0 USE_NESTED_HOT_ARG(NULL) );
3785  }
3786  KMP_ASSERT( root_thread->th.th_serial_team );
3787  KF_TRACE( 10, ( "__kmp_register_root: after serial_team = %p\n",
3788  root_thread->th.th_serial_team ) );
3789 
3790  /* drop root_thread into place */
3791  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3792 
3793  root->r.r_root_team->t.t_threads[0] = root_thread;
3794  root->r.r_hot_team ->t.t_threads[0] = root_thread;
3795  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3796  root_thread->th.th_serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for execution (it is unused for now).
3797  root->r.r_uber_thread = root_thread;
3798 
3799  /* initialize the thread, get it ready to go */
3800  __kmp_initialize_info( root_thread, root->r.r_root_team, 0, gtid );
3801 
3802  /* prepare the master thread for get_gtid() */
3803  __kmp_gtid_set_specific( gtid );
3804 
3805  __kmp_itt_thread_name( gtid );
3806 
3807  #ifdef KMP_TDATA_GTID
3808  __kmp_gtid = gtid;
3809  #endif
3810  __kmp_create_worker( gtid, root_thread, __kmp_stksize );
3811  KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == gtid );
3812  TCW_4(__kmp_init_gtid, TRUE);
3813 
3814  KA_TRACE( 20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, plain=%u\n",
3815  gtid, __kmp_gtid_from_tid( 0, root->r.r_hot_team ),
3816  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3817  KMP_INIT_BARRIER_STATE ) );
3818  { // Initialize barrier data.
3819  int b;
3820  for ( b = 0; b < bs_last_barrier; ++ b ) {
3821  root_thread->th.th_bar[ b ].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3822 #if USE_DEBUGGER
3823  root_thread->th.th_bar[ b ].bb.b_worker_arrived = 0;
3824 #endif
3825  }; // for
3826  }
3827  KMP_DEBUG_ASSERT( root->r.r_hot_team->t.t_bar[ bs_forkjoin_barrier ].b_arrived == KMP_INIT_BARRIER_STATE );
3828 
3829 
3830 #if KMP_AFFINITY_SUPPORTED
3831  if ( TCR_4(__kmp_init_middle) ) {
3832  __kmp_affinity_set_init_mask( gtid, TRUE );
3833  }
3834 #endif /* KMP_AFFINITY_SUPPORTED */
3835 
3836  __kmp_root_counter ++;
3837 
3838  KMP_MB();
3839  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3840 
3841  return gtid;
3842 }
3843 
3844 #if KMP_NESTED_HOT_TEAMS
3845 static int
3846 __kmp_free_hot_teams( kmp_root_t *root, kmp_info_t *thr, int level, const int max_level )
3847 {
3848  int i, n, nth;
3849  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3850  if( !hot_teams || !hot_teams[level].hot_team ) {
3851  return 0;
3852  }
3853  KMP_DEBUG_ASSERT( level < max_level );
3854  kmp_team_t *team = hot_teams[level].hot_team;
3855  nth = hot_teams[level].hot_team_nth;
3856  n = nth - 1; // master is not freed
3857  if( level < max_level - 1 ) {
3858  for( i = 0; i < nth; ++i ) {
3859  kmp_info_t *th = team->t.t_threads[i];
3860  n += __kmp_free_hot_teams( root, th, level + 1, max_level );
3861  if( i > 0 && th->th.th_hot_teams ) {
3862  __kmp_free( th->th.th_hot_teams );
3863  th->th.th_hot_teams = NULL;
3864  }
3865  }
3866  }
3867  __kmp_free_team( root, team, NULL );
3868  return n;
3869 }
3870 #endif
3871 
3872 /* Resets a root thread and clears its root and hot teams.
3873  Returns the number of __kmp_threads entries directly and indirectly freed.
3874 */
3875 static int
3876 __kmp_reset_root(int gtid, kmp_root_t *root)
3877 {
3878  kmp_team_t * root_team = root->r.r_root_team;
3879  kmp_team_t * hot_team = root->r.r_hot_team;
3880  int n = hot_team->t.t_nproc;
3881  int i;
3882 
3883  KMP_DEBUG_ASSERT( ! root->r.r_active );
3884 
3885  root->r.r_root_team = NULL;
3886  root->r.r_hot_team = NULL;
3887  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team before call
3888  // to __kmp_free_team().
3889  __kmp_free_team( root, root_team USE_NESTED_HOT_ARG(NULL) );
3890 #if KMP_NESTED_HOT_TEAMS
3891  if( __kmp_hot_teams_max_level > 1 ) { // need to free nested hot teams and their threads if any
3892  for( i = 0; i < hot_team->t.t_nproc; ++i ) {
3893  kmp_info_t *th = hot_team->t.t_threads[i];
3894  n += __kmp_free_hot_teams( root, th, 1, __kmp_hot_teams_max_level );
3895  if( th->th.th_hot_teams ) {
3896  __kmp_free( th->th.th_hot_teams );
3897  th->th.th_hot_teams = NULL;
3898  }
3899  }
3900  }
3901 #endif
3902  __kmp_free_team( root, hot_team USE_NESTED_HOT_ARG(NULL) );
3903 
3904  //
3905  // Before we can reap the thread, we need to make certain that all
3906  // other threads in the teams that had this root as ancestor have stopped trying to steal tasks.
3907  //
3908  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
3909  __kmp_wait_to_unref_task_teams();
3910  }
3911 
3912  #if KMP_OS_WINDOWS
3913  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3914  KA_TRACE( 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC "\n",
3915  (LPVOID)&(root->r.r_uber_thread->th),
3916  root->r.r_uber_thread->th.th_info.ds.ds_thread ) );
3917  __kmp_free_handle( root->r.r_uber_thread->th.th_info.ds.ds_thread );
3918  #endif /* KMP_OS_WINDOWS */
3919 
3920 #if OMPT_SUPPORT
3921  if ((ompt_status == ompt_status_track_callback) &&
3922  ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
3923  int gtid = __kmp_get_gtid();
3924  __ompt_thread_end(ompt_thread_initial, gtid);
3925  }
3926 #endif
3927 
3928  TCW_4(__kmp_nth, __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3929  __kmp_reap_thread( root->r.r_uber_thread, 1 );
3930 
3931  // We cannot put the root thread into __kmp_thread_pool, so we have to reap it instead of freeing it.
3932  root->r.r_uber_thread = NULL;
3933  /* mark root as no longer in use */
3934  root->r.r_begin = FALSE;
3935 
3936  return n;
3937 }
3938 
3939 void
3940 __kmp_unregister_root_current_thread( int gtid )
3941 {
3942  KA_TRACE( 1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid ));
3943  /* this lock should be ok, since unregister_root_current_thread is never called during
3944  * an abort, only during a normal close. furthermore, if you have the
3945  * forkjoin lock, you should never try to get the initz lock */
3946 
3947  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3948  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
3949  KC_TRACE( 10, ("__kmp_unregister_root_current_thread: already finished, exiting T#%d\n", gtid ));
3950  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3951  return;
3952  }
3953  kmp_root_t *root = __kmp_root[gtid];
3954 
3955  KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
3956  KMP_ASSERT( KMP_UBER_GTID( gtid ));
3957  KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
3958  KMP_ASSERT( root->r.r_active == FALSE );
3959 
3960 
3961  KMP_MB();
3962 
3963 #if OMP_41_ENABLED
3964  kmp_info_t * thread = __kmp_threads[gtid];
3965  kmp_team_t * team = thread->th.th_team;
3966  kmp_task_team_t * task_team = thread->th.th_task_team;
3967 
3968  // we need to wait for the proxy tasks before finishing the thread
3969  if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks )
3970  __kmp_task_team_wait(thread, team, NULL );
3971 #endif
3972 
3973  __kmp_reset_root(gtid, root);
3974 
3975  /* free up this thread slot */
3976  __kmp_gtid_set_specific( KMP_GTID_DNE );
3977 #ifdef KMP_TDATA_GTID
3978  __kmp_gtid = KMP_GTID_DNE;
3979 #endif
3980 
3981  KMP_MB();
3982  KC_TRACE( 10, ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid ));
3983 
3984  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3985 }
3986 
3987 #if KMP_OS_WINDOWS
3988 /* __kmp_forkjoin_lock must be already held
3989  Unregisters a root thread that is not the current thread. Returns the number of
3990  __kmp_threads entries freed as a result.
3991  */
3992 static int
3993 __kmp_unregister_root_other_thread( int gtid )
3994 {
3995  kmp_root_t *root = __kmp_root[gtid];
3996  int r;
3997 
3998  KA_TRACE( 1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid ));
3999  KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
4000  KMP_ASSERT( KMP_UBER_GTID( gtid ));
4001  KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
4002  KMP_ASSERT( root->r.r_active == FALSE );
4003 
4004  r = __kmp_reset_root(gtid, root);
4005  KC_TRACE( 10, ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid ));
4006  return r;
4007 }
4008 #endif
4009 
4010 #if KMP_DEBUG
4011 void __kmp_task_info() {
4012 
4013  kmp_int32 gtid = __kmp_entry_gtid();
4014  kmp_int32 tid = __kmp_tid_from_gtid( gtid );
4015  kmp_info_t *this_thr = __kmp_threads[ gtid ];
4016  kmp_team_t *steam = this_thr->th.th_serial_team;
4017  kmp_team_t *team = this_thr->th.th_team;
4018 
4019  __kmp_printf( "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p ptask=%p\n",
4020  gtid, tid, this_thr, team, this_thr->th.th_current_task, team->t.t_implicit_task_taskdata[tid].td_parent );
4021 }
4022 #endif // KMP_DEBUG
4023 
4024 /* TODO optimize with one big memclr, take out what isn't needed,
4025  * split responsibility to workers as much as possible, and delay
4026  * initialization of features as much as possible */
4027 static void
4028 __kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid )
4029 {
4030  /* this_thr->th.th_info.ds.ds_gtid is setup in kmp_allocate_thread/create_worker
4031  * this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4032  kmp_info_t *master = team->t.t_threads[0];
4033  KMP_DEBUG_ASSERT( this_thr != NULL );
4034  KMP_DEBUG_ASSERT( this_thr->th.th_serial_team );
4035  KMP_DEBUG_ASSERT( team );
4036  KMP_DEBUG_ASSERT( team->t.t_threads );
4037  KMP_DEBUG_ASSERT( team->t.t_dispatch );
4038  KMP_DEBUG_ASSERT( master );
4039  KMP_DEBUG_ASSERT( master->th.th_root );
4040 
4041  KMP_MB();
4042 
4043  TCW_SYNC_PTR(this_thr->th.th_team, team);
4044 
4045  this_thr->th.th_info.ds.ds_tid = tid;
4046  this_thr->th.th_set_nproc = 0;
4047 #if OMP_40_ENABLED
4048  this_thr->th.th_set_proc_bind = proc_bind_default;
4049 # if KMP_AFFINITY_SUPPORTED
4050  this_thr->th.th_new_place = this_thr->th.th_current_place;
4051 # endif
4052 #endif
4053  this_thr->th.th_root = master->th.th_root;
4054 
4055  /* setup the thread's cache of the team structure */
4056  this_thr->th.th_team_nproc = team->t.t_nproc;
4057  this_thr->th.th_team_master = master;
4058  this_thr->th.th_team_serialized = team->t.t_serialized;
4059  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4060 
4061  KMP_DEBUG_ASSERT( team->t.t_implicit_task_taskdata );
4062  this_thr->th.th_task_state = 0;
4063 
4064  KF_TRACE( 10, ( "__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4065  tid, gtid, this_thr, this_thr->th.th_current_task ) );
4066 
4067  __kmp_init_implicit_task( this_thr->th.th_team_master->th.th_ident, this_thr, team, tid, TRUE );
4068 
4069  KF_TRACE( 10, ( "__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4070  tid, gtid, this_thr, this_thr->th.th_current_task ) );
4071  // TODO: Initialize ICVs from parent; GEH - isn't that already done in __kmp_initialize_team()?
4072 
4073  /* TODO no worksharing in speculative threads */
4074  this_thr->th.th_dispatch = &team->t.t_dispatch[ tid ];
4075 
4076  this_thr->th.th_local.this_construct = 0;
4077 
4078 #ifdef BUILD_TV
4079  this_thr->th.th_local.tv_data = 0;
4080 #endif
4081 
4082  if ( ! this_thr->th.th_pri_common ) {
4083  this_thr->th.th_pri_common = (struct common_table *) __kmp_allocate( sizeof(struct common_table) );
4084  if ( __kmp_storage_map ) {
4085  __kmp_print_storage_map_gtid(
4086  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4087  sizeof( struct common_table ), "th_%d.th_pri_common\n", gtid
4088  );
4089  }; // if
4090  this_thr->th.th_pri_head = NULL;
4091  }; // if
4092 
4093  /* Initialize dynamic dispatch */
4094  {
4095  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4096  /*
4097  * Use team max_nproc since this will never change for the team.
4098  */
4099  size_t disp_size = sizeof( dispatch_private_info_t ) *
4100  ( team->t.t_max_nproc == 1 ? 1 : KMP_MAX_DISP_BUF );
4101  KD_TRACE( 10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, team->t.t_max_nproc ) );
4102  KMP_ASSERT( dispatch );
4103  KMP_DEBUG_ASSERT( team->t.t_dispatch );
4104  KMP_DEBUG_ASSERT( dispatch == &team->t.t_dispatch[ tid ] );
4105 
4106  dispatch->th_disp_index = 0;
4107 
4108  if( ! dispatch->th_disp_buffer ) {
4109  dispatch->th_disp_buffer = (dispatch_private_info_t *) __kmp_allocate( disp_size );
4110 
4111  if ( __kmp_storage_map ) {
4112  __kmp_print_storage_map_gtid( gtid, &dispatch->th_disp_buffer[ 0 ],
4113  &dispatch->th_disp_buffer[ team->t.t_max_nproc == 1 ? 1 : KMP_MAX_DISP_BUF ],
4114  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4115  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4116  gtid, team->t.t_id, gtid );
4117  }
4118  } else {
4119  memset( & dispatch->th_disp_buffer[0], '\0', disp_size );
4120  }
4121 
4122  dispatch->th_dispatch_pr_current = 0;
4123  dispatch->th_dispatch_sh_current = 0;
4124 
4125  dispatch->th_deo_fcn = 0; /* ORDERED */
4126  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4127  }
4128 
4129  this_thr->th.th_next_pool = NULL;
4130 
4131  if (!this_thr->th.th_task_state_memo_stack) {
4132  this_thr->th.th_task_state_memo_stack = (kmp_uint8 *) __kmp_allocate( 4*sizeof(kmp_uint8) );
4133  this_thr->th.th_task_state_top = 0;
4134  this_thr->th.th_task_state_stack_sz = 4;
4135  }
4136 
4137  KMP_DEBUG_ASSERT( !this_thr->th.th_spin_here );
4138  KMP_DEBUG_ASSERT( this_thr->th.th_next_waiting == 0 );
4139 
4140  KMP_MB();
4141 }
4142 
4143 
4144 /* allocate a new thread for the requesting team. this is only called from within a
4145  * forkjoin critical section. we will first try to get an available thread from the
4146  * thread pool. if none is available, we will fork a new one assuming we are able
4147  * to create a new one. this should be assured, as the caller should check on this
4148  * first.
4149  */
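/*
   Editorial note (hedged sketch, not part of the original source): __kmp_thread_pool
   is a singly linked list threaded through th.th_next_pool, and allocation simply
   pops its head, roughly:

       new_thr           = (kmp_info_t *)__kmp_thread_pool;   // head of the pool
       __kmp_thread_pool = new_thr->th.th_next_pool;          // pop it

   __kmp_thread_pool_insert_pt (reset below when the popped thread happens to be
   the insertion hint) is presumably the hint the freeing path uses to keep the
   pool ordered; that code is outside this excerpt.
*/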
4150 kmp_info_t *
4151 __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid )
4152 {
4153  kmp_team_t *serial_team;
4154  kmp_info_t *new_thr;
4155  int new_gtid;
4156 
4157  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid() ));
4158  KMP_DEBUG_ASSERT( root && team );
4159 #if !KMP_NESTED_HOT_TEAMS
4160  KMP_DEBUG_ASSERT( KMP_MASTER_GTID( __kmp_get_gtid() ));
4161 #endif
4162  KMP_MB();
4163 
4164  /* first, try to get one from the thread pool */
4165  if ( __kmp_thread_pool ) {
4166 
4167  new_thr = (kmp_info_t*)__kmp_thread_pool;
4168  __kmp_thread_pool = (volatile kmp_info_t *) new_thr->th.th_next_pool;
4169  if ( new_thr == __kmp_thread_pool_insert_pt ) {
4170  __kmp_thread_pool_insert_pt = NULL;
4171  }
4172  TCW_4(new_thr->th.th_in_pool, FALSE);
4173  //
4174  // Don't touch th_active_in_pool or th_active.
4175  // The worker thread adjusts those flags as it sleeps/awakens.
4176  //
4177 
4178  __kmp_thread_pool_nth--;
4179 
4180  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4181  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid ));
4182  KMP_ASSERT( ! new_thr->th.th_team );
4183  KMP_DEBUG_ASSERT( __kmp_nth < __kmp_threads_capacity );
4184  KMP_DEBUG_ASSERT( __kmp_thread_pool_nth >= 0 );
4185 
4186  /* setup the thread structure */
4187  __kmp_initialize_info( new_thr, team, new_tid, new_thr->th.th_info.ds.ds_gtid );
4188  KMP_DEBUG_ASSERT( new_thr->th.th_serial_team );
4189 
4190  TCW_4(__kmp_nth, __kmp_nth + 1);
4191 
4192  new_thr->th.th_task_state_top = 0;
4193  new_thr->th.th_task_state_stack_sz = 4;
4194 
4195 #ifdef KMP_ADJUST_BLOCKTIME
4196  /* Adjust blocktime back to zero if necessary */
4197  /* Middle initialization might not have occurred yet */
4198  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4199  if ( __kmp_nth > __kmp_avail_proc ) {
4200  __kmp_zero_bt = TRUE;
4201  }
4202  }
4203 #endif /* KMP_ADJUST_BLOCKTIME */
4204 
4205 #if KMP_DEBUG
4206  // If thread entered pool via __kmp_free_thread, wait_flag should != KMP_BARRIER_PARENT_FLAG.
4207  int b;
4208  kmp_balign_t * balign = new_thr->th.th_bar;
4209  for( b = 0; b < bs_last_barrier; ++ b )
4210  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4211 #endif
4212 
4213  KF_TRACE( 10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4214  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid ));
4215 
4216  KMP_MB();
4217  return new_thr;
4218  }
4219 
4220 
4221  /* no, we'll fork a new one */
4222  KMP_ASSERT( __kmp_nth == __kmp_all_nth );
4223  KMP_ASSERT( __kmp_all_nth < __kmp_threads_capacity );
4224 
4225  //
4226  // If this is the first worker thread the RTL is creating, then also
4227  // launch the monitor thread. We try to do this as early as possible.
4228  //
4229  if ( ! TCR_4( __kmp_init_monitor ) ) {
4230  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
4231  if ( ! TCR_4( __kmp_init_monitor ) ) {
4232  KF_TRACE( 10, ( "before __kmp_create_monitor\n" ) );
4233  TCW_4( __kmp_init_monitor, 1 );
4234  __kmp_create_monitor( & __kmp_monitor );
4235  KF_TRACE( 10, ( "after __kmp_create_monitor\n" ) );
4236  #if KMP_OS_WINDOWS
4237  // AC: wait until monitor has started. This is a fix for CQ232808.
4238  // The reason is that if the library is loaded/unloaded in a loop with small (parallel)
4239  // work in between, then there is a high probability that the monitor thread starts after
4240  // the library shutdown. At shutdown it is too late to cope with the problem, because
4241  // when the master is in DllMain (process detach) the monitor has no chance to start
4242  // (it is blocked), and the master has no means to inform the monitor that the library has gone,
4243  // because all the memory which the monitor can access is going to be released/reset.
4244  while ( TCR_4(__kmp_init_monitor) < 2 ) {
4245  KMP_YIELD( TRUE );
4246  }
4247  KF_TRACE( 10, ( "after monitor thread has started\n" ) );
4248  #endif
4249  }
4250  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
4251  }
4252 
4253  KMP_MB();
4254  for( new_gtid=1 ; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid ) {
4255  KMP_DEBUG_ASSERT( new_gtid < __kmp_threads_capacity );
4256  }
4257 
4258  /* allocate space for it. */
4259  new_thr = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
4260 
4261  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4262 
4263  if ( __kmp_storage_map ) {
4264  __kmp_print_thread_storage_map( new_thr, new_gtid );
4265  }
4266 
4267  /* add the reserve serialized team, initialized from the team's master thread */
4268  {
4269  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs( team );
4270  KF_TRACE( 10, ( "__kmp_allocate_thread: before th_serial/serial_team\n" ) );
4271 
4272  new_thr->th.th_serial_team = serial_team =
4273  (kmp_team_t*) __kmp_allocate_team( root, 1, 1,
4274 #if OMPT_SUPPORT
4275  0, // root parallel id
4276 #endif
4277 #if OMP_40_ENABLED
4278  proc_bind_default,
4279 #endif
4280  &r_icvs,
4281  0 USE_NESTED_HOT_ARG(NULL) );
4282  }
4283  KMP_ASSERT ( serial_team );
4284  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for execution (it is unused for now).
4285  serial_team->t.t_threads[0] = new_thr;
4286  KF_TRACE( 10, ( "__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4287  new_thr ) );
4288 
4289  /* setup the thread structures */
4290  __kmp_initialize_info( new_thr, team, new_tid, new_gtid );
4291 
4292  #if USE_FAST_MEMORY
4293  __kmp_initialize_fast_memory( new_thr );
4294  #endif /* USE_FAST_MEMORY */
4295 
4296  #if KMP_USE_BGET
4297  KMP_DEBUG_ASSERT( new_thr->th.th_local.bget_data == NULL );
4298  __kmp_initialize_bget( new_thr );
4299  #endif
4300 
4301  __kmp_init_random( new_thr ); // Initialize random number generator
4302 
4303  /* Initialize these only once when thread is grabbed for a team allocation */
4304  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4305  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
4306 
4307  int b;
4308  kmp_balign_t * balign = new_thr->th.th_bar;
4309  for(b=0; b<bs_last_barrier; ++b) {
4310  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4311  balign[b].bb.team = NULL;
4312  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4313  balign[b].bb.use_oncore_barrier = 0;
4314  }
4315 
4316  new_thr->th.th_spin_here = FALSE;
4317  new_thr->th.th_next_waiting = 0;
4318 
4319 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4320  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4321  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4322  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4323  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4324 #endif
4325 
4326  TCW_4(new_thr->th.th_in_pool, FALSE);
4327  new_thr->th.th_active_in_pool = FALSE;
4328  TCW_4(new_thr->th.th_active, TRUE);
4329 
4330  /* adjust the global counters */
4331  __kmp_all_nth ++;
4332  __kmp_nth ++;
4333 
4334  //
4335  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
4336  // for low numbers of procs, and method #2 (keyed API call) for higher
4337  // numbers of procs.
4338  //
4339  if ( __kmp_adjust_gtid_mode ) {
4340  if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
4341  if ( TCR_4(__kmp_gtid_mode) != 2) {
4342  TCW_4(__kmp_gtid_mode, 2);
4343  }
4344  }
4345  else {
4346  if (TCR_4(__kmp_gtid_mode) != 1 ) {
4347  TCW_4(__kmp_gtid_mode, 1);
4348  }
4349  }
4350  }
4351 
4352 #ifdef KMP_ADJUST_BLOCKTIME
4353  /* Adjust blocktime back to zero if necessary */
4354  /* Middle initialization might not have occurred yet */
4355  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4356  if ( __kmp_nth > __kmp_avail_proc ) {
4357  __kmp_zero_bt = TRUE;
4358  }
4359  }
4360 #endif /* KMP_ADJUST_BLOCKTIME */
4361 
4362  /* actually fork it and create the new worker thread */
4363  KF_TRACE( 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr ));
4364  __kmp_create_worker( new_gtid, new_thr, __kmp_stksize );
4365  KF_TRACE( 10, ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr ));
4366 
4367 
4368  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), new_gtid ));
4369  KMP_MB();
4370  return new_thr;
4371 }
4372 
4373 /*
4374  * reinitialize team for reuse.
4375  *
4376  * The hot team code calls this case at every fork barrier, so EPCC barrier
4377  * tests are extremely sensitive to changes in it, esp. writes to the team
4378  * struct, which cause a cache invalidation in all threads.
4379  *
4380  * IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!!
4381  */
4382 static void
4383 __kmp_reinitialize_team( kmp_team_t *team, kmp_internal_control_t *new_icvs, ident_t *loc ) {
4384  KF_TRACE( 10, ( "__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4385  team->t.t_threads[0], team ) );
4386  KMP_DEBUG_ASSERT( team && new_icvs);
4387  KMP_DEBUG_ASSERT( ( ! TCR_4(__kmp_init_parallel) ) || new_icvs->nproc );
4388  team->t.t_ident = loc;
4389 
4390  team->t.t_id = KMP_GEN_TEAM_ID();
4391 
4392  // Copy ICVs to the master thread's implicit taskdata
4393  __kmp_init_implicit_task( loc, team->t.t_threads[0], team, 0, FALSE );
4394  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4395 
4396  KF_TRACE( 10, ( "__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4397  team->t.t_threads[0], team ) );
4398 }
4399 
4400 
4401 /* initialize the team data structure
4402  * this assumes the t_threads and t_max_nproc are already set
4403  * also, we don't touch the arguments */
4404 static void
4405 __kmp_initialize_team(
4406  kmp_team_t * team,
4407  int new_nproc,
4408  kmp_internal_control_t * new_icvs,
4409  ident_t * loc
4410 ) {
4411  KF_TRACE( 10, ( "__kmp_initialize_team: enter: team=%p\n", team ) );
4412 
4413  /* verify */
4414  KMP_DEBUG_ASSERT( team );
4415  KMP_DEBUG_ASSERT( new_nproc <= team->t.t_max_nproc );
4416  KMP_DEBUG_ASSERT( team->t.t_threads );
4417  KMP_MB();
4418 
4419  team->t.t_master_tid = 0; /* not needed */
4420  /* team->t.t_master_bar; not needed */
4421  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4422  team->t.t_nproc = new_nproc;
4423 
4424  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4425  team->t.t_next_pool = NULL;
4426  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess up hot team */
4427 
4428  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4429  team->t.t_invoke = NULL; /* not needed */
4430 
4431  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4432  team->t.t_sched = new_icvs->sched;
4433 
4434 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4435  team->t.t_fp_control_saved = FALSE; /* not needed */
4436  team->t.t_x87_fpu_control_word = 0; /* not needed */
4437  team->t.t_mxcsr = 0; /* not needed */
4438 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4439 
4440  team->t.t_construct = 0;
4441  __kmp_init_lock( & team->t.t_single_lock );
4442 
4443  team->t.t_ordered .dt.t_value = 0;
4444  team->t.t_master_active = FALSE;
4445 
4446  memset( & team->t.t_taskq, '\0', sizeof( kmp_taskq_t ));
4447 
4448 #ifdef KMP_DEBUG
4449  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4450 #endif
4451  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4452 
4453  team->t.t_control_stack_top = NULL;
4454 
4455  __kmp_reinitialize_team( team, new_icvs, loc );
4456 
4457  KMP_MB();
4458  KF_TRACE( 10, ( "__kmp_initialize_team: exit: team=%p\n", team ) );
4459 }
4460 
4461 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4462 /* Sets full mask for thread and returns old mask, no changes to structures. */
4463 static void
4464 __kmp_set_thread_affinity_mask_full_tmp( kmp_affin_mask_t *old_mask )
4465 {
4466  if ( KMP_AFFINITY_CAPABLE() ) {
4467  int status;
4468  if ( old_mask != NULL ) {
4469  status = __kmp_get_system_affinity( old_mask, TRUE );
4470  int error = errno;
4471  if ( status != 0 ) {
4472  __kmp_msg(
4473  kmp_ms_fatal,
4474  KMP_MSG( ChangeThreadAffMaskError ),
4475  KMP_ERR( error ),
4476  __kmp_msg_null
4477  );
4478  }
4479  }
4480  __kmp_set_system_affinity( __kmp_affinity_get_fullMask(), TRUE );
4481  }
4482 }
4483 #endif
4484 
4485 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4486 
4487 //
4488 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4489 // It calculates the worker + master thread's partition based upon the parent
4490 // thread's partition, and binds each worker to a place in its partition.
4491 // The master thread's partition should already include its current binding.
4492 //
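/*
   Editorial worked example (not part of the original source): with proc_bind_close,
   a partition of n_places = 4 places and n_th = 10 threads gives S = 10/4 = 2,
   rem = 2 and gap = 4/2 = 2, so the places receive 3, 2, 3, 2 threads (every
   gap-th place absorbs one of the rem extra threads).  With proc_bind_spread and
   n_th <= n_places, each thread instead receives a sub-partition of roughly
   n_places/n_th places and is bound to the first place of that sub-partition.
*/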
4493 static void
4494 __kmp_partition_places( kmp_team_t *team )
4495 {
4496  //
4497  // Copy the master thread's place partition to the team struct
4498  //
4499  kmp_info_t *master_th = team->t.t_threads[0];
4500  KMP_DEBUG_ASSERT( master_th != NULL );
4501  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4502  int first_place = master_th->th.th_first_place;
4503  int last_place = master_th->th.th_last_place;
4504  int masters_place = master_th->th.th_current_place;
4505  team->t.t_first_place = first_place;
4506  team->t.t_last_place = last_place;
4507 
4508  KA_TRACE( 20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) bound to place %d partition = [%d,%d]\n",
4509  proc_bind, __kmp_gtid_from_thread( team->t.t_threads[0] ), team->t.t_id,
4510  masters_place, first_place, last_place ) );
4511 
4512  switch ( proc_bind ) {
4513 
4514  case proc_bind_default:
4515  //
4516  // serial teams might have the proc_bind policy set to
4517  // proc_bind_default. It doesn't matter, as we don't
4518  // rebind the master thread for any proc_bind policy.
4519  //
4520  KMP_DEBUG_ASSERT( team->t.t_nproc == 1 );
4521  break;
4522 
4523  case proc_bind_master:
4524  {
4525  int f;
4526  int n_th = team->t.t_nproc;
4527  for ( f = 1; f < n_th; f++ ) {
4528  kmp_info_t *th = team->t.t_threads[f];
4529  KMP_DEBUG_ASSERT( th != NULL );
4530  th->th.th_first_place = first_place;
4531  th->th.th_last_place = last_place;
4532  th->th.th_new_place = masters_place;
4533 
4534  KA_TRACE( 100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4535  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4536  team->t.t_id, f, masters_place, first_place, last_place ) );
4537  }
4538  }
4539  break;
4540 
4541  case proc_bind_close:
4542  {
4543  int f;
4544  int n_th = team->t.t_nproc;
4545  int n_places;
4546  if ( first_place <= last_place ) {
4547  n_places = last_place - first_place + 1;
4548  }
4549  else {
4550  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4551  }
4552  if ( n_th <= n_places ) {
4553  int place = masters_place;
4554  for ( f = 1; f < n_th; f++ ) {
4555  kmp_info_t *th = team->t.t_threads[f];
4556  KMP_DEBUG_ASSERT( th != NULL );
4557 
4558  if ( place == last_place ) {
4559  place = first_place;
4560  }
4561  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4562  place = 0;
4563  }
4564  else {
4565  place++;
4566  }
4567  th->th.th_first_place = first_place;
4568  th->th.th_last_place = last_place;
4569  th->th.th_new_place = place;
4570 
4571  KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4572  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4573  team->t.t_id, f, place, first_place, last_place ) );
4574  }
4575  }
4576  else {
4577  int S, rem, gap, s_count;
4578  S = n_th / n_places;
4579  s_count = 0;
4580  rem = n_th - ( S * n_places );
4581  gap = rem > 0 ? n_places/rem : n_places;
4582  int place = masters_place;
4583  int gap_ct = gap;
4584  for ( f = 0; f < n_th; f++ ) {
4585  kmp_info_t *th = team->t.t_threads[f];
4586  KMP_DEBUG_ASSERT( th != NULL );
4587 
4588  th->th.th_first_place = first_place;
4589  th->th.th_last_place = last_place;
4590  th->th.th_new_place = place;
4591  s_count++;
4592 
4593  if ( (s_count == S) && rem && (gap_ct == gap) ) {
4594  // do nothing, add an extra thread to place on next iteration
4595  }
4596  else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4597  // we added an extra thread to this place; move to next place
4598  if ( place == last_place ) {
4599  place = first_place;
4600  }
4601  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4602  place = 0;
4603  }
4604  else {
4605  place++;
4606  }
4607  s_count = 0;
4608  gap_ct = 1;
4609  rem--;
4610  }
4611  else if (s_count == S) { // place full; don't add extra
4612  if ( place == last_place ) {
4613  place = first_place;
4614  }
4615  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4616  place = 0;
4617  }
4618  else {
4619  place++;
4620  }
4621  gap_ct++;
4622  s_count = 0;
4623  }
4624 
4625  KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4626  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4627  team->t.t_id, f, th->th.th_new_place, first_place,
4628  last_place ) );
4629  }
4630  KMP_DEBUG_ASSERT( place == masters_place );
4631  }
4632  }
4633  break;
4634 
4635  case proc_bind_spread:
4636  {
4637  int f;
4638  int n_th = team->t.t_nproc;
4639  int n_places;
4640  if ( first_place <= last_place ) {
4641  n_places = last_place - first_place + 1;
4642  }
4643  else {
4644  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4645  }
4646  if ( n_th <= n_places ) {
4647  int place = masters_place;
4648  int S = n_places/n_th;
4649  int s_count, rem, gap, gap_ct;
4650  rem = n_places - n_th*S;
4651  gap = rem ? n_th/rem : 1;
4652  gap_ct = gap;
4653  for ( f = 0; f < n_th; f++ ) {
4654  kmp_info_t *th = team->t.t_threads[f];
4655  KMP_DEBUG_ASSERT( th != NULL );
4656 
4657  th->th.th_first_place = place;
4658  th->th.th_new_place = place;
4659  s_count = 1;
4660  while (s_count < S) {
4661  if ( place == last_place ) {
4662  place = first_place;
4663  }
4664  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4665  place = 0;
4666  }
4667  else {
4668  place++;
4669  }
4670  s_count++;
4671  }
4672  if (rem && (gap_ct == gap)) {
4673  if ( place == last_place ) {
4674  place = first_place;
4675  }
4676  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4677  place = 0;
4678  }
4679  else {
4680  place++;
4681  }
4682  rem--;
4683  gap_ct = 0;
4684  }
4685  th->th.th_last_place = place;
4686  gap_ct++;
4687 
4688  if ( place == last_place ) {
4689  place = first_place;
4690  }
4691  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4692  place = 0;
4693  }
4694  else {
4695  place++;
4696  }
4697 
4698  KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4699  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4700  team->t.t_id, f, th->th.th_new_place,
4701  th->th.th_first_place, th->th.th_last_place ) );
4702  }
4703  KMP_DEBUG_ASSERT( place == masters_place );
4704  }
4705  else {
4706  int S, rem, gap, s_count;
4707  S = n_th / n_places;
4708  s_count = 0;
4709  rem = n_th - ( S * n_places );
4710  gap = rem > 0 ? n_places/rem : n_places;
4711  int place = masters_place;
4712  int gap_ct = gap;
4713  for ( f = 0; f < n_th; f++ ) {
4714  kmp_info_t *th = team->t.t_threads[f];
4715  KMP_DEBUG_ASSERT( th != NULL );
4716 
4717  th->th.th_first_place = place;
4718  th->th.th_last_place = place;
4719  th->th.th_new_place = place;
4720  s_count++;
4721 
4722  if ( (s_count == S) && rem && (gap_ct == gap) ) {
4723  // do nothing, add an extra thread to place on next iteration
4724  }
4725  else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4726  // we added an extra thread to this place; move on to next place
4727  if ( place == last_place ) {
4728  place = first_place;
4729  }
4730  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4731  place = 0;
4732  }
4733  else {
4734  place++;
4735  }
4736  s_count = 0;
4737  gap_ct = 1;
4738  rem--;
4739  }
4740  else if (s_count == S) { // place is full; don't add extra thread
4741  if ( place == last_place ) {
4742  place = first_place;
4743  }
4744  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4745  place = 0;
4746  }
4747  else {
4748  place++;
4749  }
4750  gap_ct++;
4751  s_count = 0;
4752  }
4753 
4754  KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4755  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4756  team->t.t_id, f, th->th.th_new_place,
4757  th->th.th_first_place, th->th.th_last_place) );
4758  }
4759  KMP_DEBUG_ASSERT( place == masters_place );
4760  }
4761  }
4762  break;
4763 
4764  default:
4765  break;
4766  }
4767 
4768  KA_TRACE( 20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id ) );
4769 }
4770 
4771 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
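
/*
 * A minimal standalone sketch (not part of the library): in the oversubscribed
 * proc_bind_close / proc_bind_spread branches above, n_th threads are laid out
 * over n_places places in blocks of S = n_th / n_places, and every gap-th place
 * in visiting order (starting at the master's place) absorbs one extra thread
 * until the remainder rem = n_th - S * n_places is exhausted. The helper below,
 * with the hypothetical name __kmp_sketch_place_count, reproduces only that
 * arithmetic so the distribution can be checked in isolation; visit_idx is the
 * 0-based position of a place in the visiting order.
 */
static int
__kmp_sketch_place_count( int n_th, int n_places, int visit_idx )
{
    int S   = n_th / n_places;                      /* base block of threads per place   */
    int rem = n_th - S * n_places;                  /* places that receive one extra     */
    int gap = rem > 0 ? n_places / rem : n_places;  /* spacing between the larger places */
    /* Places visited at positions 0, gap, 2*gap, ... get S+1 threads until rem
       such places have been used; summing over all places gives back n_th. */
    int extra = ( rem > 0 && visit_idx % gap == 0 && visit_idx / gap < rem ) ? 1 : 0;
    return S + extra;
}
/* For example, n_th = 7 and n_places = 3 gives S = 2, rem = 1, gap = 3, so the
   master's place holds 3 threads and the other two places hold 2 each. */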
4772 
4773 /* allocate a new team data structure to use. take one off of the free pool if available */
4774 kmp_team_t *
4775 __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
4776 #if OMPT_SUPPORT
4777  ompt_parallel_id_t ompt_parallel_id,
4778 #endif
4779 #if OMP_40_ENABLED
4780  kmp_proc_bind_t new_proc_bind,
4781 #endif
4782  kmp_internal_control_t *new_icvs,
4783  int argc USE_NESTED_HOT_ARG(kmp_info_t *master) )
4784 {
4785  KMP_TIME_BLOCK(KMP_allocate_team);
4786  int f;
4787  kmp_team_t *team;
4788  int use_hot_team = ! root->r.r_active;
4789  int level = 0;
4790 
4791  KA_TRACE( 20, ("__kmp_allocate_team: called\n"));
4792  KMP_DEBUG_ASSERT( new_nproc >=1 && argc >=0 );
4793  KMP_DEBUG_ASSERT( max_nproc >= new_nproc );
4794  KMP_MB();
4795 
4796 #if KMP_NESTED_HOT_TEAMS
4797  kmp_hot_team_ptr_t *hot_teams;
4798  if( master ) {
4799  team = master->th.th_team;
4800  level = team->t.t_active_level;
4801  if( master->th.th_teams_microtask ) { // in teams construct?
4802  if( master->th.th_teams_size.nteams > 1 && ( // #teams > 1
4803  team->t.t_pkfn == (microtask_t)__kmp_teams_master || // inner fork of the teams
4804  master->th.th_teams_level < team->t.t_level ) ) { // or nested parallel inside the teams
4805  ++level; // do not increment if #teams==1 or for the outer fork of the teams; increment otherwise
4806  }
4807  }
4808  hot_teams = master->th.th_hot_teams;
4809  if( level < __kmp_hot_teams_max_level && hot_teams && hot_teams[level].hot_team )
4810  { // hot team has already been allocated for given level
4811  use_hot_team = 1;
4812  } else {
4813  use_hot_team = 0;
4814  }
4815  }
4816 #endif
4817  // Optimization to use a "hot" team
4818  if( use_hot_team && new_nproc > 1 ) {
4819  KMP_DEBUG_ASSERT( new_nproc == max_nproc );
4820 #if KMP_NESTED_HOT_TEAMS
4821  team = hot_teams[level].hot_team;
4822 #else
4823  team = root->r.r_hot_team;
4824 #endif
4825 #if KMP_DEBUG
4826  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4827  KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p before reinit\n",
4828  team->t.t_task_team[0], team->t.t_task_team[1] ));
4829  }
4830 #endif
4831 
4832  // Has the number of threads changed?
4833  /* Let's assume the most common case is that the number of threads is unchanged, and
4834  put that case first. */
4835  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4836  KA_TRACE( 20, ("__kmp_allocate_team: reusing hot team\n" ));
4837  // This case can mean that omp_set_num_threads() was called and the hot team size
4838  // was already reduced, so we check the special flag
4839  if ( team->t.t_size_changed == -1 ) {
4840  team->t.t_size_changed = 1;
4841  } else {
4842  team->t.t_size_changed = 0;
4843  }
4844 
4845  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4846  team->t.t_sched = new_icvs->sched;
4847 
4848  __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4849 
4850  KF_TRACE( 10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n",
4851  0, team->t.t_threads[0], team ) );
4852  __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4853 
4854 #if OMP_40_ENABLED
4855 # if KMP_AFFINITY_SUPPORTED
4856  if ( team->t.t_proc_bind == new_proc_bind ) {
4857  KA_TRACE( 200, ("__kmp_allocate_team: reusing hot team #%d bindings: proc_bind = %d, partition = [%d,%d]\n",
4858  team->t.t_id, new_proc_bind, team->t.t_first_place,
4859  team->t.t_last_place ) );
4860  }
4861  else {
4862  team->t.t_proc_bind = new_proc_bind;
4863  __kmp_partition_places( team );
4864  }
4865 # else
4866  if ( team->t.t_proc_bind != new_proc_bind ) {
4867  team->t.t_proc_bind = new_proc_bind;
4868  }
4869 # endif /* KMP_AFFINITY_SUPPORTED */
4870 #endif /* OMP_40_ENABLED */
4871 
4872  if (level) {
4873  for(f = 0; f < new_nproc; ++f) {
4874  team->t.t_threads[f]->th.th_task_state = 0;
4875  }
4876  }
4877  }
4878  else if( team->t.t_nproc > new_nproc ) {
4879  KA_TRACE( 20, ("__kmp_allocate_team: decreasing hot team thread count to %d\n", new_nproc ));
4880 
4881  team->t.t_size_changed = 1;
4882  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4883  // Signal the worker threads (esp. extra ones) to stop looking for tasks while spin waiting.
4884  // The task teams are reference counted and will be deallocated by the last worker thread.
4885  int tt_idx;
4886  for (tt_idx=0; tt_idx<2; ++tt_idx) {
4887  // We don't know which of the two task teams workers are waiting on, so deactivate both.
4888  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
4889  if ( ( task_team != NULL ) && TCR_SYNC_4(task_team->tt.tt_active) ) {
4890  KMP_DEBUG_ASSERT( team->t.t_nproc > 1 );
4891  TCW_SYNC_4( task_team->tt.tt_active, FALSE );
4892  KMP_MB();
4893  KA_TRACE(20, ("__kmp_allocate_team: setting task_team %p to NULL\n",
4894  &team->t.t_task_team[tt_idx]));
4895  team->t.t_task_team[tt_idx] = NULL;
4896  }
4897  else {
4898  KMP_DEBUG_ASSERT( task_team == NULL );
4899  }
4900  }
4901  }
4902 #if KMP_NESTED_HOT_TEAMS
4903  if( __kmp_hot_teams_mode == 0 ) {
4904  // AC: saved number of threads should correspond to team's value in this mode,
4905  // it can be bigger in mode 1, when the hot team has some threads in reserve
4906  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4907  hot_teams[level].hot_team_nth = new_nproc;
4908 #endif // KMP_NESTED_HOT_TEAMS
4909  /* release the extra threads we don't need any more */
4910  for( f = new_nproc ; f < team->t.t_nproc ; f++ ) {
4911  KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
4912  __kmp_free_thread( team->t.t_threads[ f ] );
4913  team->t.t_threads[ f ] = NULL;
4914  }
4915 #if KMP_NESTED_HOT_TEAMS
4916  } // (__kmp_hot_teams_mode == 0)
4917 #endif // KMP_NESTED_HOT_TEAMS
4918  team->t.t_nproc = new_nproc;
4919  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4920  team->t.t_sched = new_icvs->sched;
4921  __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4922 
4923  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4924  // Init both task teams
4925  int tt_idx;
4926  for (tt_idx=0; tt_idx<2; ++tt_idx) {
4927  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
4928  if ( task_team != NULL ) {
4929  KMP_DEBUG_ASSERT( ! TCR_4(task_team->tt.tt_found_tasks) );
4930  task_team->tt.tt_nproc = new_nproc;
4931  task_team->tt.tt_unfinished_threads = new_nproc;
4932  task_team->tt.tt_ref_ct = new_nproc - 1;
4933  }
4934  }
4935  }
4936 
4937  /* update the remaining threads */
4938  if (level) {
4939  for(f = 0; f < new_nproc; ++f) {
4940  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4941  team->t.t_threads[f]->th.th_task_state = 0;
4942  }
4943  }
4944  else {
4945  for(f = 0; f < new_nproc; ++f) {
4946  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4947  }
4948  }
4949  // restore the current task state of the master thread: should be the implicit task
4950  KF_TRACE( 10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n",
4951  0, team->t.t_threads[0], team ) );
4952 
4953  __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4954 
4955 #ifdef KMP_DEBUG
4956  for ( f = 0; f < team->t.t_nproc; f++ ) {
4957  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
4958  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
4959  }
4960 #endif
4961 
4962 #if OMP_40_ENABLED
4963  team->t.t_proc_bind = new_proc_bind;
4964 # if KMP_AFFINITY_SUPPORTED
4965  __kmp_partition_places( team );
4966 # endif
4967 #endif
4968  }
4969  else { // team->t.t_nproc < new_nproc
4970 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4971  kmp_affin_mask_t *old_mask;
4972  if ( KMP_AFFINITY_CAPABLE() ) {
4973  KMP_CPU_ALLOC(old_mask);
4974  }
4975 #endif
4976 
4977  KA_TRACE( 20, ("__kmp_allocate_team: increasing hot team thread count to %d\n", new_nproc ));
4978 
4979  team->t.t_size_changed = 1;
4980 
4981 
4982 #if KMP_NESTED_HOT_TEAMS
4983  int avail_threads = hot_teams[level].hot_team_nth;
4984  if( new_nproc < avail_threads )
4985  avail_threads = new_nproc;
4986  kmp_info_t **other_threads = team->t.t_threads;
4987  for ( f = team->t.t_nproc; f < avail_threads; ++f ) {
4988  // Adjust barrier data of reserved threads (if any) of the team
4989  // Other data will be set in __kmp_initialize_info() below.
4990  int b;
4991  kmp_balign_t * balign = other_threads[f]->th.th_bar;
4992  for ( b = 0; b < bs_last_barrier; ++ b ) {
4993  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
4994  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4995 #if USE_DEBUGGER
4996  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
4997 #endif
4998  }
4999  }
5000  if( hot_teams[level].hot_team_nth >= new_nproc ) {
5001  // we have all needed threads in reserve, no need to allocate any
5002  // this is only possible in mode 1; there can be no reserved threads in mode 0
5003  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5004  team->t.t_nproc = new_nproc; // just get reserved threads involved
5005  } else {
5006  // we may have some threads in reserve, but not enough
5007  team->t.t_nproc = hot_teams[level].hot_team_nth; // get reserved threads involved if any
5008  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5009 #endif // KMP_NESTED_HOT_TEAMS
5010  if(team->t.t_max_nproc < new_nproc) {
5011  /* reallocate larger arrays */
5012  __kmp_reallocate_team_arrays(team, new_nproc);
5013  __kmp_reinitialize_team( team, new_icvs, NULL );
5014  }
5015 
5016 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5017  /* Temporarily set full mask for master thread before
5018  creation of workers. The reason is that workers inherit
5019  the affinity from the master, so if a lot of workers are
5020  created on a single core quickly, they don't get
5021  a chance to set their own affinity for a long time.
5022  */
5023  __kmp_set_thread_affinity_mask_full_tmp( old_mask );
5024 #endif
5025 
5026  /* allocate new threads for the hot team */
5027  for( f = team->t.t_nproc ; f < new_nproc ; f++ ) {
5028  kmp_info_t * new_worker = __kmp_allocate_thread( root, team, f );
5029  KMP_DEBUG_ASSERT( new_worker );
5030  team->t.t_threads[ f ] = new_worker;
5031  new_worker->th.th_team_nproc = team->t.t_nproc;
5032 
5033  KA_TRACE( 20, ("__kmp_allocate_team: team %d init T#%d arrived: join=%u, plain=%u\n",
5034  team->t.t_id, __kmp_gtid_from_tid( f, team ), team->t.t_id, f,
5035  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5036  team->t.t_bar[bs_plain_barrier].b_arrived ) );
5037 
5038  { // Initialize barrier data for new threads.
5039  int b;
5040  kmp_balign_t * balign = new_worker->th.th_bar;
5041  for( b = 0; b < bs_last_barrier; ++ b ) {
5042  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
5043  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5044 #if USE_DEBUGGER
5045  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
5046 #endif
5047  }
5048  }
5049  }
5050 
5051 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5052  if ( KMP_AFFINITY_CAPABLE() ) {
5053  /* Restore initial master thread's affinity mask */
5054  __kmp_set_system_affinity( old_mask, TRUE );
5055  KMP_CPU_FREE(old_mask);
5056  }
5057 #endif
5058 #if KMP_NESTED_HOT_TEAMS
5059  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5060 #endif // KMP_NESTED_HOT_TEAMS
5061  /* make sure everyone is synchronized */
5062  __kmp_initialize_team( team, new_nproc, new_icvs, root->r.r_uber_thread->th.th_ident );
5063 
5064  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5065  int tt_idx;
5066  for (tt_idx=0; tt_idx<2; ++tt_idx) {
5067  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5068  if ( task_team != NULL ) {
5069  KMP_DEBUG_ASSERT( ! TCR_4(task_team->tt.tt_found_tasks) );
5070  task_team->tt.tt_nproc = new_nproc;
5071  task_team->tt.tt_unfinished_threads = new_nproc;
5072  task_team->tt.tt_ref_ct = new_nproc - 1;
5073  }
5074  }
5075  }
5076 
5077  /* reinitialize the old threads */
5078  if (level) {
5079  for( f = 0 ; f < team->t.t_nproc ; f++ ) {
5080  __kmp_initialize_info( team->t.t_threads[ f ], team, f,
5081  __kmp_gtid_from_tid( f, team ) );
5082  }
5083  }
5084  else {
5085  int old_state = team->t.t_threads[0]->th.th_task_state;
5086  for (f=0; f < team->t.t_nproc; ++f) {
5087  __kmp_initialize_info( team->t.t_threads[ f ], team, f, __kmp_gtid_from_tid( f, team ) );
5088  team->t.t_threads[f]->th.th_task_state = old_state;
5089  team->t.t_threads[f]->th.th_task_team = team->t.t_task_team[old_state];
5090  }
5091  }
5092 
5093 #ifdef KMP_DEBUG
5094  for ( f = 0; f < team->t.t_nproc; ++ f ) {
5095  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
5096  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
5097  }
5098 #endif
5099 
5100 #if OMP_40_ENABLED
5101  team->t.t_proc_bind = new_proc_bind;
5102 # if KMP_AFFINITY_SUPPORTED
5103  __kmp_partition_places( team );
5104 # endif
5105 #endif
5106  } // Check changes in number of threads
5107 
5108 #if OMP_40_ENABLED
5109  kmp_info_t *master = team->t.t_threads[0];
5110  if( master->th.th_teams_microtask ) {
5111  for( f = 1; f < new_nproc; ++f ) {
5112  // propagate teams construct specific info to workers
5113  kmp_info_t *thr = team->t.t_threads[f];
5114  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5115  thr->th.th_teams_level = master->th.th_teams_level;
5116  thr->th.th_teams_size = master->th.th_teams_size;
5117  }
5118  }
5119 #endif /* OMP_40_ENABLED */
5120 #if KMP_NESTED_HOT_TEAMS
5121  if( level ) {
5122  // Sync task (TODO: and barrier?) state for nested hot teams, not needed for outermost hot team.
5123  for( f = 1; f < new_nproc; ++f ) {
5124  kmp_info_t *thr = team->t.t_threads[f];
5125  thr->th.th_task_state = 0;
5126  int b;
5127  kmp_balign_t * balign = thr->th.th_bar;
5128  for( b = 0; b < bs_last_barrier; ++ b ) {
5129  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
5130  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5131 #if USE_DEBUGGER
5132  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
5133 #endif
5134  }
5135  }
5136  }
5137 #endif // KMP_NESTED_HOT_TEAMS
5138 
5139  /* reallocate space for arguments if necessary */
5140  __kmp_alloc_argv_entries( argc, team, TRUE );
5141  team->t.t_argc = argc;
5142  //
5143  // The hot team re-uses the previous task team,
5144  // if untouched during the previous release->gather phase.
5145  //
5146 
5147  KF_TRACE( 10, ( " hot_team = %p\n", team ) );
5148 
5149 #if KMP_DEBUG
5150  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5151  KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p after reinit\n",
5152  team->t.t_task_team[0], team->t.t_task_team[1] ));
5153  }
5154 #endif
5155 
5156 #if OMPT_SUPPORT
5157  __ompt_team_assign_id(team, ompt_parallel_id);
5158 #endif
5159 
5160  KMP_MB();
5161 
5162  return team;
5163  }
5164 
5165  /* next, let's try to take one from the team pool */
5166  KMP_MB();
5167  for( team = (kmp_team_t*) __kmp_team_pool ; (team) ; )
5168  {
5169  /* TODO: consider resizing undersized teams instead of reaping them, now that we have a resizing mechanism */
5170  if ( team->t.t_max_nproc >= max_nproc ) {
5171  /* take this team from the team pool */
5172  __kmp_team_pool = team->t.t_next_pool;
5173 
5174  /* setup the team for fresh use */
5175  __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
5176 
5177  KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
5178  &team->t.t_task_team[0], &team->t.t_task_team[1]) );
5179  team->t.t_task_team[0] = NULL;
5180  team->t.t_task_team[1] = NULL;
5181 
5182  /* reallocate space for arguments if necessary */
5183  __kmp_alloc_argv_entries( argc, team, TRUE );
5184  team->t.t_argc = argc;
5185 
5186  KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5187  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5188  { // Initialize barrier data.
5189  int b;
5190  for ( b = 0; b < bs_last_barrier; ++ b) {
5191  team->t.t_bar[ b ].b_arrived = KMP_INIT_BARRIER_STATE;
5192 #if USE_DEBUGGER
5193  team->t.t_bar[ b ].b_master_arrived = 0;
5194  team->t.t_bar[ b ].b_team_arrived = 0;
5195 #endif
5196  }
5197  }
5198 
5199 #if OMP_40_ENABLED
5200  team->t.t_proc_bind = new_proc_bind;
5201 #endif
5202 
5203  KA_TRACE( 20, ("__kmp_allocate_team: using team from pool %d.\n", team->t.t_id ));
5204 
5205 #if OMPT_SUPPORT
5206  __ompt_team_assign_id(team, ompt_parallel_id);
5207 #endif
5208 
5209  KMP_MB();
5210 
5211  return team;
5212  }
5213 
5214  /* reap team if it is too small, then loop back and check the next one */
5215  /* not sure if this is wise, but it will be redone during the hot-teams rewrite. */
5216  /* TODO: Use technique to find the right size hot-team, don't reap them */
5217  team = __kmp_reap_team( team );
5218  __kmp_team_pool = team;
5219  }
5220 
5221  /* nothing available in the pool, no matter, make a new team! */
5222  KMP_MB();
5223  team = (kmp_team_t*) __kmp_allocate( sizeof( kmp_team_t ) );
5224 
5225  /* and set it up */
5226  team->t.t_max_nproc = max_nproc;
5227  /* NOTE well, for some reason allocating one big buffer and dividing it
5228  * up seems to really hurt performance a lot on the P4, so let's not use
5229  * this... */
5230  __kmp_allocate_team_arrays( team, max_nproc );
5231 
5232  KA_TRACE( 20, ( "__kmp_allocate_team: making a new team\n" ) );
5233  __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
5234 
5235  KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
5236  &team->t.t_task_team[0], &team->t.t_task_team[1] ) );
5237  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
5238  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
5239 
5240  if ( __kmp_storage_map ) {
5241  __kmp_print_team_storage_map( "team", team, team->t.t_id, new_nproc );
5242  }
5243 
5244  /* allocate space for arguments */
5245  __kmp_alloc_argv_entries( argc, team, FALSE );
5246  team->t.t_argc = argc;
5247 
5248  KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5249  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5250  { // Initialize barrier data.
5251  int b;
5252  for ( b = 0; b < bs_last_barrier; ++ b ) {
5253  team->t.t_bar[ b ].b_arrived = KMP_INIT_BARRIER_STATE;
5254 #if USE_DEBUGGER
5255  team->t.t_bar[ b ].b_master_arrived = 0;
5256  team->t.t_bar[ b ].b_team_arrived = 0;
5257 #endif
5258  }
5259  }
5260 
5261 #if OMP_40_ENABLED
5262  team->t.t_proc_bind = new_proc_bind;
5263 #endif
5264 
5265 #if OMPT_SUPPORT
5266  __ompt_team_assign_id(team, ompt_parallel_id);
5267  team->t.ompt_serialized_team_info = NULL;
5268 #endif
5269 
5270  KMP_MB();
5271 
5272  KA_TRACE( 20, ("__kmp_allocate_team: done creating a new team %d.\n", team->t.t_id ));
5273 
5274  return team;
5275 }
5276 
5277 /* TODO implement hot-teams at all levels */
5278 /* TODO implement lazy thread release on demand (disband request) */
5279 
5280 /* free the team. return it to the team pool. release all the threads
5281  * associated with it */
5282 void
5283 __kmp_free_team( kmp_root_t *root, kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master) )
5284 {
5285  int f;
5286  KA_TRACE( 20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), team->t.t_id ));
5287 
5288  /* verify state */
5289  KMP_DEBUG_ASSERT( root );
5290  KMP_DEBUG_ASSERT( team );
5291  KMP_DEBUG_ASSERT( team->t.t_nproc <= team->t.t_max_nproc );
5292  KMP_DEBUG_ASSERT( team->t.t_threads );
5293 
5294  int use_hot_team = team == root->r.r_hot_team;
5295 #if KMP_NESTED_HOT_TEAMS
5296  int level;
5297  kmp_hot_team_ptr_t *hot_teams;
5298  if( master ) {
5299  level = team->t.t_active_level - 1;
5300  if( master->th.th_teams_microtask ) { // in teams construct?
5301  if( master->th.th_teams_size.nteams > 1 ) {
5302  ++level; // level was not increased in teams construct for team_of_masters
5303  }
5304  if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5305  master->th.th_teams_level == team->t.t_level ) {
5306  ++level; // level was not increased in teams construct for team_of_workers before the parallel
5307  } // team->t.t_level will be increased inside parallel
5308  }
5309  hot_teams = master->th.th_hot_teams;
5310  if( level < __kmp_hot_teams_max_level ) {
5311  KMP_DEBUG_ASSERT( team == hot_teams[level].hot_team );
5312  use_hot_team = 1;
5313  }
5314  }
5315 #endif // KMP_NESTED_HOT_TEAMS
5316 
5317  /* team is done working */
5318  TCW_SYNC_PTR(team->t.t_pkfn, NULL); // Important for Debugging Support Library.
5319  team->t.t_copyin_counter = 0; // init counter for possible reuse
5320  // Do not reset pointer to parent team to NULL for hot teams.
5321 
5322  /* if we are non-hot team, release our threads */
5323  if( ! use_hot_team ) {
5324  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5325  int tt_idx;
5326  for (tt_idx=0; tt_idx<2; ++tt_idx) {
5327  // We don't know which of the two task teams workers are waiting on, so deactivate both.
5328  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5329  if ( task_team != NULL ) {
5330  // Signal the worker threads to stop looking for tasks while spin waiting. The task
5331  // teams are reference counted and will be deallocated by the last worker thread via the
5332  // thread's pointer to the task team.
5333  KA_TRACE( 20, ( "__kmp_free_team: deactivating task_team %p\n", task_team ) );
5334  KMP_DEBUG_ASSERT( team->t.t_nproc > 1 );
5335  TCW_SYNC_4( task_team->tt.tt_active, FALSE );
5336  KMP_MB();
5337  team->t.t_task_team[tt_idx] = NULL;
5338  }
5339  }
5340  }
5341 
5342  // Reset pointer to parent team only for non-hot teams.
5343  team->t.t_parent = NULL;
5344 
5345 
5346  /* free the worker threads */
5347  for ( f = 1; f < team->t.t_nproc; ++ f ) {
5348  KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
5349  __kmp_free_thread( team->t.t_threads[ f ] );
5350  team->t.t_threads[ f ] = NULL;
5351  }
5352 
5353 
5354  /* put the team back in the team pool */
5355  /* TODO limit size of team pool, call reap_team if pool too large */
5356  team->t.t_next_pool = (kmp_team_t*) __kmp_team_pool;
5357  __kmp_team_pool = (volatile kmp_team_t*) team;
5358  }
5359 
5360  KMP_MB();
5361 }
5362 
5363 
5364 /* reap the team. destroy it, reclaim all its resources and free its memory */
5365 kmp_team_t *
5366 __kmp_reap_team( kmp_team_t *team )
5367 {
5368  kmp_team_t *next_pool = team->t.t_next_pool;
5369 
5370  KMP_DEBUG_ASSERT( team );
5371  KMP_DEBUG_ASSERT( team->t.t_dispatch );
5372  KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
5373  KMP_DEBUG_ASSERT( team->t.t_threads );
5374  KMP_DEBUG_ASSERT( team->t.t_argv );
5375 
5376  /* TODO clean the threads that are a part of this? */
5377 
5378  /* free stuff */
5379 
5380  __kmp_free_team_arrays( team );
5381  if ( team->t.t_argv != &team->t.t_inline_argv[0] )
5382  __kmp_free( (void*) team->t.t_argv );
5383  __kmp_free( team );
5384 
5385  KMP_MB();
5386  return next_pool;
5387 }
5388 
5389 //
5390 // Free the thread. Don't reap it, just place it on the pool of available
5391 // threads.
5392 //
5393 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5394 // binding for the affinity mechanism to be useful.
5395 //
5396 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5397 // However, we want to avoid a potential performance problem by always
5398 // scanning through the list to find the correct point at which to insert
5399 // the thread (potential N**2 behavior). To do this we keep track of the
5400 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5401 // With single-level parallelism, threads will always be added to the tail
5402 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5403 // parallelism, all bets are off and we may need to scan through the entire
5404 // free list.
5405 //
5406 // This change also has a potentially large performance benefit, for some
5407 // applications. Previously, as threads were freed from the hot team, they
5408 // would be placed back on the free list in inverse order. If the hot team
5409 // grew back to it's original size, then the freed thread would be placed
5410 // grew back to its original size, then the freed thread would be placed
5411 // locality problems on programs where the size of the hot team regularly
5412 // grew and shrunk.
5413 // grew and shrank.
5414 // Now, for single-level parallelism, the OMP tid is alway == gtid.
5415 // Now, for single-level parallelism, the OMP tid is always == gtid.
5416 void
5417 __kmp_free_thread( kmp_info_t *this_th )
5418 {
5419  int gtid;
5420  kmp_info_t **scan;
5421 
5422  KA_TRACE( 20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5423  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid ));
5424 
5425  KMP_DEBUG_ASSERT( this_th );
5426 
5427  // When moving thread to pool, switch thread to wait on own b_go flag, and uninitialized (NULL team).
5428  int b;
5429  kmp_balign_t *balign = this_th->th.th_bar;
5430  for (b=0; b<bs_last_barrier; ++b) {
5431  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5432  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5433  balign[b].bb.team = NULL;
5434  }
5435 
5436 
5437  /* put thread back on the free pool */
5438  TCW_PTR(this_th->th.th_team, NULL);
5439  TCW_PTR(this_th->th.th_root, NULL);
5440  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5441 
5442  //
5443  // If the __kmp_thread_pool_insert_pt is already past the new insert
5444  // point, then we need to re-scan the entire list.
5445  //
5446  gtid = this_th->th.th_info.ds.ds_gtid;
5447  if ( __kmp_thread_pool_insert_pt != NULL ) {
5448  KMP_DEBUG_ASSERT( __kmp_thread_pool != NULL );
5449  if ( __kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid ) {
5450  __kmp_thread_pool_insert_pt = NULL;
5451  }
5452  }
5453 
5454  //
5455  // Scan down the list to find the place to insert the thread.
5456  // scan is the address of a link in the list, possibly the address of
5457  // __kmp_thread_pool itself.
5458  //
5459  // In the absence of nested parallism, the for loop will have 0 iterations.
5460  // In the absence of nested parallelism, the for loop will have 0 iterations.
5461  if ( __kmp_thread_pool_insert_pt != NULL ) {
5462  scan = &( __kmp_thread_pool_insert_pt->th.th_next_pool );
5463  }
5464  else {
5465  scan = (kmp_info_t **)&__kmp_thread_pool;
5466  }
5467  for (; ( *scan != NULL ) && ( (*scan)->th.th_info.ds.ds_gtid < gtid );
5468  scan = &( (*scan)->th.th_next_pool ) );
5469 
5470  //
5471  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5472  // to its address.
5473  //
5474  TCW_PTR(this_th->th.th_next_pool, *scan);
5475  __kmp_thread_pool_insert_pt = *scan = this_th;
5476  KMP_DEBUG_ASSERT( ( this_th->th.th_next_pool == NULL )
5477  || ( this_th->th.th_info.ds.ds_gtid
5478  < this_th->th.th_next_pool->th.th_info.ds.ds_gtid ) );
5479  TCW_4(this_th->th.th_in_pool, TRUE);
5480  __kmp_thread_pool_nth++;
5481 
5482  TCW_4(__kmp_nth, __kmp_nth - 1);
5483 
5484 #ifdef KMP_ADJUST_BLOCKTIME
5485  /* Adjust blocktime back to user setting or default if necessary */
5486  /* Middle initialization might never have occurred */
5487  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5488  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5489  if ( __kmp_nth <= __kmp_avail_proc ) {
5490  __kmp_zero_bt = FALSE;
5491  }
5492  }
5493 #endif /* KMP_ADJUST_BLOCKTIME */
5494 
5495  KMP_MB();
5496 }
5497 
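
/*
 * A minimal standalone sketch (not the runtime's data structures): the insert-
 * point optimization used by __kmp_free_thread above is a sorted singly linked
 * list insert with a cached "last insertion" pointer that is discarded whenever
 * it already lies past the new key. The node, list and function names below
 * (sketch_node_t, sketch_pool, sketch_pool_insert) are hypothetical and exist
 * only to illustrate the scan-from-cache idea in isolation.
 */
typedef struct sketch_node {
    int                  gtid;      /* sort key, mirrors th_info.ds.ds_gtid */
    struct sketch_node  *next;      /* mirrors th.th_next_pool              */
} sketch_node_t;

static sketch_node_t *sketch_pool      = NULL;   /* mirrors __kmp_thread_pool           */
static sketch_node_t *sketch_insert_pt = NULL;   /* mirrors __kmp_thread_pool_insert_pt */

static void
sketch_pool_insert( sketch_node_t *node )
{
    sketch_node_t **scan;

    /* If the cached insert point is already past the new key, re-scan from the head. */
    if ( sketch_insert_pt != NULL && sketch_insert_pt->gtid > node->gtid ) {
        sketch_insert_pt = NULL;
    }
    scan = ( sketch_insert_pt != NULL ) ? &sketch_insert_pt->next : &sketch_pool;

    /* Walk to the first element with a larger key; with single-level parallelism
       the cached pointer makes this loop run zero iterations. */
    while ( *scan != NULL && (*scan)->gtid < node->gtid ) {
        scan = &(*scan)->next;
    }
    node->next = *scan;
    sketch_insert_pt = *scan = node;    /* remember where this node was inserted */
}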
5498 
5499 /* ------------------------------------------------------------------------ */
5500 
5501 void *
5502 __kmp_launch_thread( kmp_info_t *this_thr )
5503 {
5504  int gtid = this_thr->th.th_info.ds.ds_gtid;
5505 /* void *stack_data;*/
5506  kmp_team_t *(*volatile pteam);
5507 
5508  KMP_MB();
5509  KA_TRACE( 10, ("__kmp_launch_thread: T#%d start\n", gtid ) );
5510 
5511  if( __kmp_env_consistency_check ) {
5512  this_thr->th.th_cons = __kmp_allocate_cons_stack( gtid ); // ATT: Memory leak?
5513  }
5514 
5515 #if OMPT_SUPPORT
5516  if (ompt_status & ompt_status_track) {
5517  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5518  this_thr->th.ompt_thread_info.wait_id = 0;
5519  this_thr->th.ompt_thread_info.idle_frame = __builtin_frame_address(0);
5520  if ((ompt_status == ompt_status_track_callback) &&
5521  ompt_callbacks.ompt_callback(ompt_event_thread_begin)) {
5522  __ompt_thread_begin(ompt_thread_worker, gtid);
5523  }
5524  }
5525 #endif
5526 
5527  /* This is the place where threads wait for work */
5528  while( ! TCR_4(__kmp_global.g.g_done) ) {
5529  KMP_DEBUG_ASSERT( this_thr == __kmp_threads[ gtid ] );
5530  KMP_MB();
5531 
5532  /* wait for work to do */
5533  KA_TRACE( 20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid ));
5534 
5535 #if OMPT_SUPPORT
5536  if (ompt_status & ompt_status_track) {
5537  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5538  }
5539 #endif
5540 
5541  /* No tid yet since not part of a team */
5542  __kmp_fork_barrier( gtid, KMP_GTID_DNE );
5543 
5544 #if OMPT_SUPPORT
5545  if (ompt_status & ompt_status_track) {
5546  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5547  }
5548 #endif
5549 
5550  pteam = (kmp_team_t *(*))(& this_thr->th.th_team);
5551 
5552  /* have we been allocated? */
5553  if ( TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done) ) {
5554  /* we were just woken up, so run our new task */
5555  if ( TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL ) {
5556  int rc;
5557  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5558  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5559 
5560  updateHWFPControl (*pteam);
5561 
5562 #if OMPT_SUPPORT
5563  if (ompt_status & ompt_status_track) {
5564  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5565  // Initialize OMPT task id for implicit task.
5566  int tid = __kmp_tid_from_gtid(gtid);
5567  (*pteam)->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id =
5568  __ompt_task_id_new(tid);
5569  }
5570 #endif
5571 
5572  KMP_STOP_EXPLICIT_TIMER(USER_launch_thread_loop);
5573  {
5574  KMP_TIME_BLOCK(USER_worker_invoke);
5575  rc = (*pteam)->t.t_invoke( gtid );
5576  }
5577  KMP_START_EXPLICIT_TIMER(USER_launch_thread_loop);
5578  KMP_ASSERT( rc );
5579 
5580 #if OMPT_SUPPORT
5581  if (ompt_status & ompt_status_track) {
5582  /* no frame set while outside task */
5583  int tid = __kmp_tid_from_gtid(gtid);
5584  (*pteam)->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_runtime_frame = 0;
5585 
5586  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5587  }
5588 #endif
5589  KMP_MB();
5590  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5591  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5592  }
5593  /* join barrier after parallel region */
5594  __kmp_join_barrier( gtid );
5595  }
5596  }
5597  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5598 
5599 #if OMPT_SUPPORT
5600  if ((ompt_status == ompt_status_track_callback) &&
5601  ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
5602  __ompt_thread_end(ompt_thread_worker, gtid);
5603  }
5604 #endif
5605 
5606  if ( TCR_PTR( this_thr->th.th_task_team ) != NULL ) {
5607  __kmp_unref_task_team( this_thr->th.th_task_team, this_thr );
5608  }
5609  /* run the destructors for the threadprivate data for this thread */
5610  __kmp_common_destroy_gtid( gtid );
5611 
5612  KA_TRACE( 10, ("__kmp_launch_thread: T#%d done\n", gtid ) );
5613  KMP_MB();
5614  return this_thr;
5615 }
5616 
5617 /* ------------------------------------------------------------------------ */
5618 /* ------------------------------------------------------------------------ */
5619 
5620 void
5621 __kmp_internal_end_dest( void *specific_gtid )
5622 {
5623  #if KMP_COMPILER_ICC
5624  #pragma warning( push )
5625  #pragma warning( disable: 810 ) // conversion from "void *" to "int" may lose significant bits
5626  #endif
5627  // Make sure no significant bits are lost
5628  int gtid = (kmp_intptr_t)specific_gtid - 1;
5629  #if KMP_COMPILER_ICC
5630  #pragma warning( pop )
5631  #endif
5632 
5633  KA_TRACE( 30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5634  /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
5635  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
5636 
5637  /* josh: One reason for setting the gtid specific data even when it is being
5638  destroyed by pthread is to allow gtid lookup through thread specific data
5639  (__kmp_gtid_get_specific). Some of the code, especially stat code,
5640  that gets executed in the call to __kmp_internal_end_thread, actually
5641  gets the gtid through the thread specific data. Setting it here seems
5642  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5643  to run smoothly.
5644  todo: get rid of this after we remove the dependence on
5645  __kmp_gtid_get_specific
5646  */
5647  if(gtid >= 0 && KMP_UBER_GTID(gtid))
5648  __kmp_gtid_set_specific( gtid );
5649  #ifdef KMP_TDATA_GTID
5650  __kmp_gtid = gtid;
5651  #endif
5652  __kmp_internal_end_thread( gtid );
5653 }
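
/*
 * A minimal sketch of the gtid+1 encoding noted above (hypothetical helper names,
 * assuming kmp_intptr_t from kmp.h): storing gtid+1 in thread-specific data lets
 * a zero/NULL value mean "nothing stored", and decoding simply subtracts 1 again.
 */
static void *
__kmp_sketch_gtid_encode( int gtid )
{
    return (void *)(kmp_intptr_t)( gtid + 1 );   /* 0 stays reserved for the nothing-stored case */
}

static int
__kmp_sketch_gtid_decode( void *specific )
{
    return (int)(kmp_intptr_t)specific - 1;      /* a NULL value decodes to -1, i.e. nothing stored */
}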
5654 
5655 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5656 
5657 // 2009-09-08 (lev): It looks the destructor does not work. In simple test cases destructors work
5657 // 2009-09-08 (lev): It looks like the destructor does not work. In simple test cases destructors work
5659 // option in makefile.mk works fine.
5660 
5661 __attribute__(( destructor ))
5662 void
5663 __kmp_internal_end_dtor( void )
5664 {
5665  __kmp_internal_end_atexit();
5666 }
5667 
5668 void
5669 __kmp_internal_end_fini( void )
5670 {
5671  __kmp_internal_end_atexit();
5672 }
5673 
5674 #endif
5675 
5676 /* [Windows] josh: when the atexit handler is called, there may still be more than one thread alive */
5677 void
5678 __kmp_internal_end_atexit( void )
5679 {
5680  KA_TRACE( 30, ( "__kmp_internal_end_atexit\n" ) );
5681  /* [Windows]
5682  josh: ideally, we want to completely shutdown the library in this atexit handler, but
5683  stat code that depends on thread specific data for gtid fails because that data becomes
5684  unavailable at some point during the shutdown, so we call __kmp_internal_end_thread
5685  instead. We should eventually remove the dependency on __kmp_get_specific_gtid in the
5686  stat code and use __kmp_internal_end_library to cleanly shutdown the library.
5687 
5688 // TODO: Can some of this comment about GVS be removed?
5689  I suspect that the offending stat code is executed when the calling thread tries to
5690  clean up a dead root thread's data structures, resulting in GVS code trying to close
5691  the GVS structures for that thread, but since the stat code uses
5692  __kmp_get_specific_gtid to get the gtid with the assumption that the calling thread is
5693  cleaning up itself instead of another thread, it gets confused. This happens because
5694  allowing a thread to unregister and cleanup another thread is a recent modification for
5695  addressing an issue with Maxon Cinema4D. Based on the current design (20050722), a
5696  thread may end up trying to unregister another thread only if thread death does not
5697  trigger the calling of __kmp_internal_end_thread. For Linux* OS, there is the thread
5698  specific data destructor function to detect thread death. For Windows dynamic, there
5699  is DllMain(THREAD_DETACH). For Windows static, there is nothing. Thus, the
5700  workaround is applicable only for Windows static stat library.
5701  */
5702  __kmp_internal_end_library( -1 );
5703  #if KMP_OS_WINDOWS
5704  __kmp_close_console();
5705  #endif
5706 }
5707 
5708 static void
5709 __kmp_reap_thread(
5710  kmp_info_t * thread,
5711  int is_root
5712 ) {
5713 
5714  // It is assumed __kmp_forkjoin_lock is acquired.
5715 
5716  int gtid;
5717 
5718  KMP_DEBUG_ASSERT( thread != NULL );
5719 
5720  gtid = thread->th.th_info.ds.ds_gtid;
5721 
5722  if ( ! is_root ) {
5723 
5724  if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
5725  /* Assume the threads are at the fork barrier here */
5726  KA_TRACE( 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", gtid ) );
5727  /* Need release fence here to prevent seg faults for tree forkjoin barrier (GEH) */
5728  kmp_flag_64 flag(&thread->th.th_bar[ bs_forkjoin_barrier ].bb.b_go, thread);
5729  __kmp_release_64(&flag);
5730  }; // if
5731 
5732 
5733  // Terminate OS thread.
5734  __kmp_reap_worker( thread );
5735 
5736  //
5737  // The thread was killed asynchronously. If it was actively
5738  // spinning in the thread pool, decrement the global count.
5739  //
5740  // There is a small timing hole here - if the worker thread was
5741  // just waking up after sleeping in the pool, had reset its
5742  // th_active_in_pool flag but not decremented the global counter
5743  // __kmp_thread_pool_active_nth yet, then the global counter
5744  // might not get updated.
5745  //
5746  // Currently, this can only happen as the library is unloaded,
5747  // so there are no harmful side effects.
5748  //
5749  if ( thread->th.th_active_in_pool ) {
5750  thread->th.th_active_in_pool = FALSE;
5751  KMP_TEST_THEN_DEC32(
5752  (kmp_int32 *) &__kmp_thread_pool_active_nth );
5753  KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth) >= 0 );
5754  }
5755 
5756  // Decrement # of [worker] threads in the pool.
5757  KMP_DEBUG_ASSERT( __kmp_thread_pool_nth > 0 );
5758  --__kmp_thread_pool_nth;
5759  }; // if
5760 
5761  // Free the fast memory for tasking
5762  #if USE_FAST_MEMORY
5763  __kmp_free_fast_memory( thread );
5764  #endif /* USE_FAST_MEMORY */
5765 
5766  __kmp_suspend_uninitialize_thread( thread );
5767 
5768  KMP_DEBUG_ASSERT( __kmp_threads[ gtid ] == thread );
5769  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5770 
5771  -- __kmp_all_nth;
5772  // __kmp_nth was decremented when thread is added to the pool.
5773 
5774 #ifdef KMP_ADJUST_BLOCKTIME
5775  /* Adjust blocktime back to user setting or default if necessary */
5776  /* Middle initialization might never have occurred */
5777  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5778  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5779  if ( __kmp_nth <= __kmp_avail_proc ) {
5780  __kmp_zero_bt = FALSE;
5781  }
5782  }
5783 #endif /* KMP_ADJUST_BLOCKTIME */
5784 
5785  /* free the memory being used */
5786  if( __kmp_env_consistency_check ) {
5787  if ( thread->th.th_cons ) {
5788  __kmp_free_cons_stack( thread->th.th_cons );
5789  thread->th.th_cons = NULL;
5790  }; // if
5791  }
5792 
5793  if ( thread->th.th_pri_common != NULL ) {
5794  __kmp_free( thread->th.th_pri_common );
5795  thread->th.th_pri_common = NULL;
5796  }; // if
5797 
5798  if (thread->th.th_task_state_memo_stack != NULL) {
5799  __kmp_free(thread->th.th_task_state_memo_stack);
5800  thread->th.th_task_state_memo_stack = NULL;
5801  }
5802 
5803  #if KMP_USE_BGET
5804  if ( thread->th.th_local.bget_data != NULL ) {
5805  __kmp_finalize_bget( thread );
5806  }; // if
5807  #endif
5808 
5809 #if KMP_AFFINITY_SUPPORTED
5810  if ( thread->th.th_affin_mask != NULL ) {
5811  KMP_CPU_FREE( thread->th.th_affin_mask );
5812  thread->th.th_affin_mask = NULL;
5813  }; // if
5814 #endif /* KMP_AFFINITY_SUPPORTED */
5815 
5816  __kmp_reap_team( thread->th.th_serial_team );
5817  thread->th.th_serial_team = NULL;
5818  __kmp_free( thread );
5819 
5820  KMP_MB();
5821 
5822 } // __kmp_reap_thread
5823 
5824 static void
5825 __kmp_internal_end(void)
5826 {
5827  int i;
5828 
5829  /* First, unregister the library */
5830  __kmp_unregister_library();
5831 
5832  #if KMP_OS_WINDOWS
5833  /* In Win static library, we can't tell when a root actually dies, so we
5834  reclaim the data structures for any root threads that have died but not
5835  unregistered themselves, in order to shut down cleanly.
5836  In Win dynamic library we also can't tell when a thread dies.
5837  */
5838  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of dead roots
5839  #endif
5840 
5841  for( i=0 ; i<__kmp_threads_capacity ; i++ )
5842  if( __kmp_root[i] )
5843  if( __kmp_root[i]->r.r_active )
5844  break;
5845  KMP_MB(); /* Flush all pending memory write invalidates. */
5846  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5847 
5848  if ( i < __kmp_threads_capacity ) {
5849  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5850  KMP_MB(); /* Flush all pending memory write invalidates. */
5851 
5852  //
5853  // Need to check that monitor was initialized before reaping it.
5854  // If we are called from __kmp_atfork_child (which sets
5855  // __kmp_init_parallel = 0), then __kmp_monitor will appear to
5856  // contain valid data, but it is only valid in the parent process,
5857  // not the child.
5858  //
5859  // One of the possible fixes for CQ138434 / CQ140126
5860  // (used in 20091103_dreamworks patch)
5861  //
5862  // New behavior (201008): instead of keying off of the flag
5863  // __kmp_init_parallel, the monitor thread creation is keyed off
5864  // of the new flag __kmp_init_monitor.
5865  //
5866  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5867  if ( TCR_4( __kmp_init_monitor ) ) {
5868  __kmp_reap_monitor( & __kmp_monitor );
5869  TCW_4( __kmp_init_monitor, 0 );
5870  }
5871  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5872  KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5873  } else {
5874  /* TODO move this to cleanup code */
5875  #ifdef KMP_DEBUG
5876  /* make sure that everything has properly ended */
5877  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
5878  if( __kmp_root[i] ) {
5879 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: there can be uber threads alive here
5880  KMP_ASSERT( ! __kmp_root[i]->r.r_active ); // TODO: can they be active?
5881  }
5882  }
5883  #endif
5884 
5885  KMP_MB();
5886 
5887  // Reap the worker threads.
5888  // This is valid for now, but be careful if threads are reaped sooner.
5889  while ( __kmp_thread_pool != NULL ) { // Loop through all the threads in the pool.
5890  // Get the next thread from the pool.
5891  kmp_info_t * thread = (kmp_info_t *) __kmp_thread_pool;
5892  __kmp_thread_pool = thread->th.th_next_pool;
5893  // Reap it.
5894  thread->th.th_next_pool = NULL;
5895  thread->th.th_in_pool = FALSE;
5896  __kmp_reap_thread( thread, 0 );
5897  }; // while
5898  __kmp_thread_pool_insert_pt = NULL;
5899 
5900  // Reap teams.
5901  while ( __kmp_team_pool != NULL ) { // Loop through all the teams in the pool.
5902  // Get the next team from the pool.
5903  kmp_team_t * team = (kmp_team_t *) __kmp_team_pool;
5904  __kmp_team_pool = team->t.t_next_pool;
5905  // Reap it.
5906  team->t.t_next_pool = NULL;
5907  __kmp_reap_team( team );
5908  }; // while
5909 
5910  __kmp_reap_task_teams( );
5911 
5912  for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
5913  // TBD: Add some checking...
5914  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5915  }
5916 
5917  /* Make sure all threadprivate destructors get run by joining with all worker
5918  threads before resetting this flag */
5919  TCW_SYNC_4(__kmp_init_common, FALSE);
5920 
5921  KA_TRACE( 10, ("__kmp_internal_end: all workers reaped\n" ) );
5922  KMP_MB();
5923 
5924  //
5925  // See note above: One of the possible fixes for CQ138434 / CQ140126
5926  //
5927  // FIXME: push both code fragments down and CSE them?
5928  // push them into __kmp_cleanup() ?
5929  //
5930  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5931  if ( TCR_4( __kmp_init_monitor ) ) {
5932  __kmp_reap_monitor( & __kmp_monitor );
5933  TCW_4( __kmp_init_monitor, 0 );
5934  }
5935  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5936  KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5937 
5938  } /* else !__kmp_global.t_active */
5939  TCW_4(__kmp_init_gtid, FALSE);
5940  KMP_MB(); /* Flush all pending memory write invalidates. */
5941 
5942 
5943  __kmp_cleanup();
5944 #if OMPT_SUPPORT
5945  ompt_fini();
5946 #endif
5947 }
5948 
5949 void
5950 __kmp_internal_end_library( int gtid_req )
5951 {
5952  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5953  /* this shouldn't be a race condition because __kmp_internal_end() is the
5954  * only place to clear __kmp_serial_init */
5955  /* we'll check this later too, after we get the lock */
5956  // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
5957  // because the next check will work in any case.
5958  if( __kmp_global.g.g_abort ) {
5959  KA_TRACE( 11, ("__kmp_internal_end_library: abort, exiting\n" ));
5960  /* TODO abort? */
5961  return;
5962  }
5963  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5964  KA_TRACE( 10, ("__kmp_internal_end_library: already finished\n" ));
5965  return;
5966  }
5967 
5968 
5969  KMP_MB(); /* Flush all pending memory write invalidates. */
5970 
5971  /* find out who we are and what we should do */
5972  {
5973  int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
5974  KA_TRACE( 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req ));
5975  if( gtid == KMP_GTID_SHUTDOWN ) {
5976  KA_TRACE( 10, ("__kmp_internal_end_library: !__kmp_init_runtime, system already shutdown\n" ));
5977  return;
5978  } else if( gtid == KMP_GTID_MONITOR ) {
5979  KA_TRACE( 10, ("__kmp_internal_end_library: monitor thread, gtid not registered, or system shutdown\n" ));
5980  return;
5981  } else if( gtid == KMP_GTID_DNE ) {
5982  KA_TRACE( 10, ("__kmp_internal_end_library: gtid not registered or system shutdown\n" ));
5983  /* we don't know who we are, but we may still shut down the library */
5984  } else if( KMP_UBER_GTID( gtid )) {
5985  /* unregister ourselves as an uber thread. gtid is no longer valid */
5986  if( __kmp_root[gtid]->r.r_active ) {
5987  __kmp_global.g.g_abort = -1;
5988  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5989  KA_TRACE( 10, ("__kmp_internal_end_library: root still active, abort T#%d\n", gtid ));
5990  return;
5991  } else {
5992  KA_TRACE( 10, ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid ));
5993  __kmp_unregister_root_current_thread( gtid );
5994  }
5995  } else {
5996  /* worker threads may call this function through the atexit handler, if they call exit() */
5997  /* For now, skip the usual subsequent processing and just dump the debug buffer.
5998  TODO: do a thorough shutdown instead
5999  */
6000  #ifdef DUMP_DEBUG_ON_EXIT
6001  if ( __kmp_debug_buf )
6002  __kmp_dump_debug_buffer( );
6003  #endif
6004  return;
6005  }
6006  }
6007  /* synchronize the termination process */
6008  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6009 
6010  /* have we already finished */
6011  if( __kmp_global.g.g_abort ) {
6012  KA_TRACE( 10, ("__kmp_internal_end_library: abort, exiting\n" ));
6013  /* TODO abort? */
6014  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6015  return;
6016  }
6017  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
6018  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6019  return;
6020  }
6021 
6022  /* We need this lock to enforce mutual exclusion between this reading of
6023  __kmp_threads_capacity and the writing by __kmp_register_root.
6024  Alternatively, we can use a counter of roots that is
6025  atomically updated by __kmp_get_global_thread_id_reg,
6026  __kmp_do_serial_initialize and __kmp_internal_end_*.
6027  */
6028  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
6029 
6030  /* now we can safely conduct the actual termination */
6031  __kmp_internal_end();
6032 
6033  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6034  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6035 
6036  KA_TRACE( 10, ("__kmp_internal_end_library: exit\n" ) );
6037 
6038  #ifdef DUMP_DEBUG_ON_EXIT
6039  if ( __kmp_debug_buf )
6040  __kmp_dump_debug_buffer();
6041  #endif
6042 
6043  #if KMP_OS_WINDOWS
6044  __kmp_close_console();
6045  #endif
6046 
6047  __kmp_fini_allocator();
6048 
6049 } // __kmp_internal_end_library
6050 
6051 void
6052 __kmp_internal_end_thread( int gtid_req )
6053 {
6054  int i;
6055 
6056  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6057  /* this shouldn't be a race condition because __kmp_internal_end() is the
6058  * only place to clear __kmp_serial_init */
6059  /* we'll check this later too, after we get the lock */
6060  // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
6061  // because the next check will work in any case.
6062  if( __kmp_global.g.g_abort ) {
6063  KA_TRACE( 11, ("__kmp_internal_end_thread: abort, exiting\n" ));
6064  /* TODO abort? */
6065  return;
6066  }
6067  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
6068  KA_TRACE( 10, ("__kmp_internal_end_thread: already finished\n" ));
6069  return;
6070  }
6071 
6072  KMP_MB(); /* Flush all pending memory write invalidates. */
6073 
6074  /* find out who we are and what we should do */
6075  {
6076  int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
6077  KA_TRACE( 10, ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req ));
6078  if( gtid == KMP_GTID_SHUTDOWN ) {
6079  KA_TRACE( 10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system already shutdown\n" ));
6080  return;
6081  } else if( gtid == KMP_GTID_MONITOR ) {
6082  KA_TRACE( 10, ("__kmp_internal_end_thread: monitor thread, gtid not registered, or system shutdown\n" ));
6083  return;
6084  } else if( gtid == KMP_GTID_DNE ) {
6085  KA_TRACE( 10, ("__kmp_internal_end_thread: gtid not registered or system shutdown\n" ));
6086  return;
6087  /* we don't know who we are */
6088  } else if( KMP_UBER_GTID( gtid )) {
6089  /* unregister ourselves as an uber thread. gtid is no longer valid */
6090  if( __kmp_root[gtid]->r.r_active ) {
6091  __kmp_global.g.g_abort = -1;
6092  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6093  KA_TRACE( 10, ("__kmp_internal_end_thread: root still active, abort T#%d\n", gtid ));
6094  return;
6095  } else {
6096  KA_TRACE( 10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", gtid ));
6097  __kmp_unregister_root_current_thread( gtid );
6098  }
6099  } else {
6100  /* just a worker thread, let's leave */
6101  KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid ));
6102 
6103  if ( gtid >= 0 ) {
6104  kmp_info_t *this_thr = __kmp_threads[ gtid ];
6105  if (TCR_PTR(this_thr->th.th_task_team) != NULL) {
6106  __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
6107  }
6108  }
6109 
6110  KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", gtid ));
6111  return;
6112  }
6113  }
6114  #if defined KMP_DYNAMIC_LIB
6115  // AC: let's not shut down the Linux* OS dynamic library at the exit of an uber thread,
6116  // because it is better to shut down later in the library destructor.
6117  // The reason for this change is a performance problem when a non-OpenMP thread
6118  // in a loop forks and joins many OpenMP threads. We can save a lot of time
6119  // keeping worker threads alive until the program shutdown.
6120  // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966) and
6121  // Windows(DPD200287443) that occurs when using critical sections from foreign threads.
6122  KA_TRACE( 10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req) );
6123  return;
6124  #endif
6125  /* synchronize the termination process */
6126  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6127 
6128  /* have we already finished */
6129  if( __kmp_global.g.g_abort ) {
6130  KA_TRACE( 10, ("__kmp_internal_end_thread: abort, exiting\n" ));
6131  /* TODO abort? */
6132  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6133  return;
6134  }
6135  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
6136  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6137  return;
6138  }
6139 
6140  /* We need this lock to enforce mutex between this reading of
6141  __kmp_threads_capacity and the writing by __kmp_register_root.
6142  Alternatively, we can use a counter of roots that is
6143  atomically updated by __kmp_get_global_thread_id_reg,
6144  __kmp_do_serial_initialize and __kmp_internal_end_*.
6145  */
6146 
6147  /* should we finish the run-time? are all siblings done? */
6148  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
6149 
6150  for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
6151  if ( KMP_UBER_GTID( i ) ) {
6152  KA_TRACE( 10, ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i ));
6153  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6154  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6155  return;
6156  };
6157  }
6158 
6159  /* now we can safely conduct the actual termination */
6160 
6161  __kmp_internal_end();
6162 
6163  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6164  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6165 
6166  KA_TRACE( 10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req ) );
6167 
6168  #ifdef DUMP_DEBUG_ON_EXIT
6169  if ( __kmp_debug_buf )
6170  __kmp_dump_debug_buffer();
6171  #endif
6172 } // __kmp_internal_end_thread
6173 
6174 // -------------------------------------------------------------------------------------------------
6175 // Library registration stuff.
6176 
6177 static long __kmp_registration_flag = 0;
6178  // Random value used to indicate library initialization.
6179 static char * __kmp_registration_str = NULL;
6180  // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6181 
6182 
6183 static inline
6184 char *
6185 __kmp_reg_status_name() {
6186  /*
6187  On RHEL 3u5, if linked statically, getpid() returns different values in each thread.
6188  If registration and unregistration happen in different threads (omp_misc_other_root_exit.cpp test case),
6189  the registered_lib_env environment variable cannot be found, because its name will contain a different pid.
6190  */
6191  return __kmp_str_format( "__KMP_REGISTERED_LIB_%d", (int) getpid() );
6192 } // __kmp_reg_status_name
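// For illustration (pid value is hypothetical): in a process with pid 12345 the name built
// above would be "__KMP_REGISTERED_LIB_12345"; the matching value is constructed in
// __kmp_register_library_startup() below.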
6193 
6194 
6195 void
6196 __kmp_register_library_startup(
6197  void
6198 ) {
6199 
6200  char * name = __kmp_reg_status_name(); // Name of the environment variable.
6201  int done = 0;
6202  union {
6203  double dtime;
6204  long ltime;
6205  } time;
6206  #if KMP_OS_WINDOWS
6207  __kmp_initialize_system_tick();
6208  #endif
6209  __kmp_read_system_time( & time.dtime );
6210  __kmp_registration_flag = 0xCAFE0000L | ( time.ltime & 0x0000FFFFL );
6211  __kmp_registration_str =
6212  __kmp_str_format(
6213  "%p-%lx-%s",
6214  & __kmp_registration_flag,
6215  __kmp_registration_flag,
6216  KMP_LIBRARY_FILE
6217  );
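// The registration value therefore has the form "<flag address>-<flag value>-<library file>",
// for example (illustrative values only) "0x7f0a2c3d5e10-cafe1234-libomp.so". A surviving
// neighbor can later be recognized by mapping that address back in and comparing the value.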
6218 
6219  KA_TRACE( 50, ( "__kmp_register_library_startup: %s=\"%s\"\n", name, __kmp_registration_str ) );
6220 
6221  while ( ! done ) {
6222 
6223  char * value = NULL; // Actual value of the environment variable.
6224 
6225  // Set the environment variable, but do not overwrite it if it already exists.
6226  __kmp_env_set( name, __kmp_registration_str, 0 );
6227  // Check that the variable was actually written.
6228  value = __kmp_env_get( name );
6229  if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
6230 
6231  done = 1; // Ok, environment variable set successfully, exit the loop.
6232 
6233  } else {
6234 
6235  // Oops. The write failed. Another copy of the OpenMP RTL is in memory.
6236  // Check whether it is alive or dead.
6237  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6238  char * tail = value;
6239  char * flag_addr_str = NULL;
6240  char * flag_val_str = NULL;
6241  char const * file_name = NULL;
6242  __kmp_str_split( tail, '-', & flag_addr_str, & tail );
6243  __kmp_str_split( tail, '-', & flag_val_str, & tail );
6244  file_name = tail;
6245  if ( tail != NULL ) {
6246  long * flag_addr = 0;
6247  long flag_val = 0;
6248  KMP_SSCANF( flag_addr_str, "%p", & flag_addr );
6249  KMP_SSCANF( flag_val_str, "%lx", & flag_val );
6250  if ( flag_addr != 0 && flag_val != 0 && strcmp( file_name, "" ) != 0 ) {
6251  // First, check whether environment-encoded address is mapped into addr space.
6252  // If so, dereference it to see if it still has the right value.
6253 
6254  if ( __kmp_is_address_mapped( flag_addr ) && * flag_addr == flag_val ) {
6255  neighbor = 1;
6256  } else {
6257  // If not, then we know the other copy of the library is no longer running.
6258  neighbor = 2;
6259  }; // if
6260  }; // if
6261  }; // if
6262  switch ( neighbor ) {
6263  case 0 : // Cannot parse environment variable -- neighbor status unknown.
6264  // Assume it is the incompatible format of a future version of the library.
6265  // Assume the other library is alive.
6266  // WARN( ... ); // TODO: Issue a warning.
6267  file_name = "unknown library";
6268  // Attention! Falling through to the next case. That's intentional.
6269  case 1 : { // Neighbor is alive.
6270  // Check whether a duplicate library is allowed.
6271  char * duplicate_ok = __kmp_env_get( "KMP_DUPLICATE_LIB_OK" );
6272  if ( ! __kmp_str_match_true( duplicate_ok ) ) {
6273  // That's not allowed. Issue fatal error.
6274  __kmp_msg(
6275  kmp_ms_fatal,
6276  KMP_MSG( DuplicateLibrary, KMP_LIBRARY_FILE, file_name ),
6277  KMP_HNT( DuplicateLibrary ),
6278  __kmp_msg_null
6279  );
6280  }; // if
6281  KMP_INTERNAL_FREE( duplicate_ok );
6282  __kmp_duplicate_library_ok = 1;
6283  done = 1; // Exit the loop.
6284  } break;
6285  case 2 : { // Neighbor is dead.
6286  // Clear the variable and try to register the library again.
6287  __kmp_env_unset( name );
6288  } break;
6289  default : {
6290  KMP_DEBUG_ASSERT( 0 );
6291  } break;
6292  }; // switch
6293 
6294  }; // if
6295  KMP_INTERNAL_FREE( (void *) value );
6296 
6297  }; // while
6298  KMP_INTERNAL_FREE( (void *) name );
6299 
6300 } // func __kmp_register_library_startup
6301 
6302 
6303 void
6304 __kmp_unregister_library( void ) {
6305 
6306  char * name = __kmp_reg_status_name();
6307  char * value = __kmp_env_get( name );
6308 
6309  KMP_DEBUG_ASSERT( __kmp_registration_flag != 0 );
6310  KMP_DEBUG_ASSERT( __kmp_registration_str != NULL );
6311  if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
6312  // Ok, this is our variable. Delete it.
6313  __kmp_env_unset( name );
6314  }; // if
6315 
6316  KMP_INTERNAL_FREE( __kmp_registration_str );
6317  KMP_INTERNAL_FREE( value );
6318  KMP_INTERNAL_FREE( name );
6319 
6320  __kmp_registration_flag = 0;
6321  __kmp_registration_str = NULL;
6322 
6323 } // __kmp_unregister_library
6324 
6325 
6326 // End of Library registration stuff.
6327 // -------------------------------------------------------------------------------------------------
6328 
6329 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6330 
6331 static void __kmp_check_mic_type()
6332 {
6333  kmp_cpuid_t cpuid_state = {0};
6334  kmp_cpuid_t * cs_p = &cpuid_state;
6335  __kmp_x86_cpuid(1, 0, cs_p);
6336  // We don't support mic1 at the moment
6337  if( (cs_p->eax & 0xff0) == 0xB10 ) {
6338  __kmp_mic_type = mic2;
6339  } else if( (cs_p->eax & 0xf0ff0) == 0x50670 ) {
6340  __kmp_mic_type = mic3;
6341  } else {
6342  __kmp_mic_type = non_mic;
6343  }
6344 }
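// Note: CPUID leaf 1 reports the family/model/stepping signature in EAX; the masks above pick
// out the family and (extended) model fields. Family 0x0B corresponds to Knights Corner (mic2)
// and family 6 / model 0x57 to Knights Landing (mic3).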
6345 
6346 #endif /* KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) */
6347 
6348 static void
6349 __kmp_do_serial_initialize( void )
6350 {
6351  int i, gtid;
6352  int size;
6353 
6354  KA_TRACE( 10, ("__kmp_do_serial_initialize: enter\n" ) );
6355 
6356  KMP_DEBUG_ASSERT( sizeof( kmp_int32 ) == 4 );
6357  KMP_DEBUG_ASSERT( sizeof( kmp_uint32 ) == 4 );
6358  KMP_DEBUG_ASSERT( sizeof( kmp_int64 ) == 8 );
6359  KMP_DEBUG_ASSERT( sizeof( kmp_uint64 ) == 8 );
6360  KMP_DEBUG_ASSERT( sizeof( kmp_intptr_t ) == sizeof( void * ) );
6361 
6362  __kmp_validate_locks();
6363 
6364  /* Initialize internal memory allocator */
6365  __kmp_init_allocator();
6366 
6367  /* Register the library startup via an environment variable
6368  and check to see whether another copy of the library is already
6369  registered. */
6370 
6371  __kmp_register_library_startup( );
6372 
6373  /* TODO reinitialization of library */
6374  if( TCR_4(__kmp_global.g.g_done) ) {
6375  KA_TRACE( 10, ("__kmp_do_serial_initialize: reinitialization of library\n" ) );
6376  }
6377 
6378  __kmp_global.g.g_abort = 0;
6379  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6380 
6381  /* initialize the locks */
6382 #if KMP_USE_ADAPTIVE_LOCKS
6383 #if KMP_DEBUG_ADAPTIVE_LOCKS
6384  __kmp_init_speculative_stats();
6385 #endif
6386 #endif
6387  __kmp_init_lock( & __kmp_global_lock );
6388  __kmp_init_queuing_lock( & __kmp_dispatch_lock );
6389  __kmp_init_lock( & __kmp_debug_lock );
6390  __kmp_init_atomic_lock( & __kmp_atomic_lock );
6391  __kmp_init_atomic_lock( & __kmp_atomic_lock_1i );
6392  __kmp_init_atomic_lock( & __kmp_atomic_lock_2i );
6393  __kmp_init_atomic_lock( & __kmp_atomic_lock_4i );
6394  __kmp_init_atomic_lock( & __kmp_atomic_lock_4r );
6395  __kmp_init_atomic_lock( & __kmp_atomic_lock_8i );
6396  __kmp_init_atomic_lock( & __kmp_atomic_lock_8r );
6397  __kmp_init_atomic_lock( & __kmp_atomic_lock_8c );
6398  __kmp_init_atomic_lock( & __kmp_atomic_lock_10r );
6399  __kmp_init_atomic_lock( & __kmp_atomic_lock_16r );
6400  __kmp_init_atomic_lock( & __kmp_atomic_lock_16c );
6401  __kmp_init_atomic_lock( & __kmp_atomic_lock_20c );
6402  __kmp_init_atomic_lock( & __kmp_atomic_lock_32c );
6403  __kmp_init_bootstrap_lock( & __kmp_forkjoin_lock );
6404  __kmp_init_bootstrap_lock( & __kmp_exit_lock );
6405  __kmp_init_bootstrap_lock( & __kmp_monitor_lock );
6406  __kmp_init_bootstrap_lock( & __kmp_tp_cached_lock );
6407 
6408  /* conduct initialization and initial setup of configuration */
6409 
6410  __kmp_runtime_initialize();
6411 
6412 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6413  __kmp_check_mic_type();
6414 #endif
6415 
6416  // Some global variable initialization moved here from kmp_env_initialize()
6417 #ifdef KMP_DEBUG
6418  kmp_diag = 0;
6419 #endif
6420  __kmp_abort_delay = 0;
6421 
6422  // From __kmp_init_dflt_team_nth()
6423  /* assume the entire machine will be used */
6424  __kmp_dflt_team_nth_ub = __kmp_xproc;
6425  if( __kmp_dflt_team_nth_ub < KMP_MIN_NTH ) {
6426  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6427  }
6428  if( __kmp_dflt_team_nth_ub > __kmp_sys_max_nth ) {
6429  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6430  }
6431  __kmp_max_nth = __kmp_sys_max_nth;
6432 
6433  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" part
6434  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6435  __kmp_monitor_wakeups = KMP_WAKEUPS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
6436  __kmp_bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
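// In short: the blocktime (in milliseconds) is translated first into a monitor wakeup
// frequency and then into the number of monitor intervals a worker may spin at a barrier
// before sleeping; both values are recomputed when blocktime changes, see
// __kmp_aux_set_blocktime() below.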
6437  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6438  __kmp_library = library_throughput;
6439  // From KMP_SCHEDULE initialization
6440  __kmp_static = kmp_sch_static_balanced;
6441  // AC: do not use analytical here, because it is non-monotonic
6442  //__kmp_guided = kmp_sch_guided_iterative_chunked;
6443  //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no need to repeat the assignment
6444  // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch bit control and barrier method
6445  // control parts
6446  #if KMP_FAST_REDUCTION_BARRIER
6447  #define kmp_reduction_barrier_gather_bb ((int)1)
6448  #define kmp_reduction_barrier_release_bb ((int)1)
6449  #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6450  #define kmp_reduction_barrier_release_pat bp_hyper_bar
6451  #endif // KMP_FAST_REDUCTION_BARRIER
6452  for ( i=bs_plain_barrier; i<bs_last_barrier; i++ ) {
6453  __kmp_barrier_gather_branch_bits [ i ] = __kmp_barrier_gather_bb_dflt;
6454  __kmp_barrier_release_branch_bits[ i ] = __kmp_barrier_release_bb_dflt;
6455  __kmp_barrier_gather_pattern [ i ] = __kmp_barrier_gather_pat_dflt;
6456  __kmp_barrier_release_pattern[ i ] = __kmp_barrier_release_pat_dflt;
6457  #if KMP_FAST_REDUCTION_BARRIER
6458  if( i == bs_reduction_barrier ) { // tested and confirmed on ALTIX only ( lin_64 ): hyper,1
6459  __kmp_barrier_gather_branch_bits [ i ] = kmp_reduction_barrier_gather_bb;
6460  __kmp_barrier_release_branch_bits[ i ] = kmp_reduction_barrier_release_bb;
6461  __kmp_barrier_gather_pattern [ i ] = kmp_reduction_barrier_gather_pat;
6462  __kmp_barrier_release_pattern[ i ] = kmp_reduction_barrier_release_pat;
6463  }
6464  #endif // KMP_FAST_REDUCTION_BARRIER
6465  }
6466  #if KMP_FAST_REDUCTION_BARRIER
6467  #undef kmp_reduction_barrier_release_pat
6468  #undef kmp_reduction_barrier_gather_pat
6469  #undef kmp_reduction_barrier_release_bb
6470  #undef kmp_reduction_barrier_gather_bb
6471  #endif // KMP_FAST_REDUCTION_BARRIER
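// Roughly speaking, the branch bits configured above control the fan-out of the barrier
// trees (a value of b gives a branching factor of 2^b), while the pattern selects the
// barrier algorithm (linear, tree, hyper, or hierarchical).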
6472 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6473  if( __kmp_mic_type != non_mic ) {
6474  // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6475  __kmp_barrier_gather_branch_bits [ bs_plain_barrier ] = 3; // plain gather
6476  __kmp_barrier_release_branch_bits[ bs_forkjoin_barrier ] = 1; // forkjoin release
6477  __kmp_barrier_gather_pattern [ bs_forkjoin_barrier ] = bp_hierarchical_bar;
6478  __kmp_barrier_release_pattern[ bs_forkjoin_barrier ] = bp_hierarchical_bar;
6479  }
6480 #if KMP_FAST_REDUCTION_BARRIER
6481  if( __kmp_mic_type != non_mic ) {
6482  __kmp_barrier_gather_pattern [ bs_reduction_barrier ] = bp_hierarchical_bar;
6483  __kmp_barrier_release_pattern[ bs_reduction_barrier ] = bp_hierarchical_bar;
6484  }
6485 #endif
6486 #endif
6487 
6488  // From KMP_CHECKS initialization
6489 #ifdef KMP_DEBUG
6490  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6491 #else
6492  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6493 #endif
6494 
6495  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6496  __kmp_foreign_tp = TRUE;
6497 
6498  __kmp_global.g.g_dynamic = FALSE;
6499  __kmp_global.g.g_dynamic_mode = dynamic_default;
6500 
6501  __kmp_env_initialize( NULL );
6502 
6503  // Print all messages in message catalog for testing purposes.
6504  #ifdef KMP_DEBUG
6505  char const * val = __kmp_env_get( "KMP_DUMP_CATALOG" );
6506  if ( __kmp_str_match_true( val ) ) {
6507  kmp_str_buf_t buffer;
6508  __kmp_str_buf_init( & buffer );
6509  __kmp_i18n_dump_catalog( & buffer );
6510  __kmp_printf( "%s", buffer.str );
6511  __kmp_str_buf_free( & buffer );
6512  }; // if
6513  __kmp_env_free( & val );
6514  #endif
6515 
6516  __kmp_threads_capacity = __kmp_initial_threads_capacity( __kmp_dflt_team_nth_ub );
6517  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6518  __kmp_tp_capacity = __kmp_default_tp_capacity(__kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6519 
6520 
6521  // If the library is shut down properly, both pools must be NULL. Just in case, set them
6522  // to NULL -- some memory may leak, but subsequent code will work even if pools are not freed.
6523  KMP_DEBUG_ASSERT( __kmp_thread_pool == NULL );
6524  KMP_DEBUG_ASSERT( __kmp_thread_pool_insert_pt == NULL );
6525  KMP_DEBUG_ASSERT( __kmp_team_pool == NULL );
6526  __kmp_thread_pool = NULL;
6527  __kmp_thread_pool_insert_pt = NULL;
6528  __kmp_team_pool = NULL;
6529 
6530  /* Allocate all of the variable sized records */
6531  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are expandable */
6532  /* Since allocation is cache-aligned, just add extra padding at the end */
6533  size = (sizeof(kmp_info_t*) + sizeof(kmp_root_t*))*__kmp_threads_capacity + CACHE_LINE;
6534  __kmp_threads = (kmp_info_t**) __kmp_allocate( size );
6535  __kmp_root = (kmp_root_t**) ((char*)__kmp_threads + sizeof(kmp_info_t*) * __kmp_threads_capacity );
6536 
6537  /* init thread counts */
6538  KMP_DEBUG_ASSERT( __kmp_all_nth == 0 ); // Asserts fail if the library is reinitializing and
6539  KMP_DEBUG_ASSERT( __kmp_nth == 0 ); // something was wrong in termination.
6540  __kmp_all_nth = 0;
6541  __kmp_nth = 0;
6542 
6543  /* setup the uber master thread and hierarchy */
6544  gtid = __kmp_register_root( TRUE );
6545  KA_TRACE( 10, ("__kmp_do_serial_initialize T#%d\n", gtid ));
6546  KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6547  KMP_ASSERT( KMP_INITIAL_GTID( gtid ) );
6548 
6549  KMP_MB(); /* Flush all pending memory write invalidates. */
6550 
6551  __kmp_common_initialize();
6552 
6553  #if KMP_OS_UNIX
6554  /* invoke the child fork handler */
6555  __kmp_register_atfork();
6556  #endif
6557 
6558  #if ! defined KMP_DYNAMIC_LIB
6559  {
6560  /* Invoke the exit handler when the program finishes, but only for the static library.
6561  For the dynamic library, we already have _fini and DllMain.
6562  */
6563  int rc = atexit( __kmp_internal_end_atexit );
6564  if ( rc != 0 ) {
6565  __kmp_msg( kmp_ms_fatal, KMP_MSG( FunctionError, "atexit()" ), KMP_ERR( rc ), __kmp_msg_null );
6566  }; // if
6567  }
6568  #endif
6569 
6570  #if KMP_HANDLE_SIGNALS
6571  #if KMP_OS_UNIX
6572  /* NOTE: make sure that this is called before the user installs
6573  * their own signal handlers so that the user handlers
6574  * are called first. this way they can return false,
6575  * not call our handler, avoid terminating the library,
6576  * and continue execution where they left off. */
6577  __kmp_install_signals( FALSE );
6578  #endif /* KMP_OS_UNIX */
6579  #if KMP_OS_WINDOWS
6580  __kmp_install_signals( TRUE );
6581  #endif /* KMP_OS_WINDOWS */
6582  #endif
6583 
6584  /* we have finished the serial initialization */
6585  __kmp_init_counter ++;
6586 
6587  __kmp_init_serial = TRUE;
6588 
6589  if (__kmp_settings) {
6590  __kmp_env_print();
6591  }
6592 
6593 #if OMP_40_ENABLED
6594  if (__kmp_display_env || __kmp_display_env_verbose) {
6595  __kmp_env_print_2();
6596  }
6597 #endif // OMP_40_ENABLED
6598 
6599  KMP_MB();
6600 
6601  KA_TRACE( 10, ("__kmp_do_serial_initialize: exit\n" ) );
6602 #if OMPT_SUPPORT
6603  ompt_init();
6604 #endif
6605 }
6606 
6607 void
6608 __kmp_serial_initialize( void )
6609 {
6610  if ( __kmp_init_serial ) {
6611  return;
6612  }
6613  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6614  if ( __kmp_init_serial ) {
6615  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6616  return;
6617  }
6618  __kmp_do_serial_initialize();
6619  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6620 }
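// Note: __kmp_serial_initialize(), __kmp_middle_initialize() and __kmp_parallel_initialize()
// all follow the same double-checked pattern: test the init flag, acquire __kmp_initz_lock,
// re-test under the lock, and only then call the corresponding __kmp_do_*_initialize() worker.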
6621 
6622 static void
6623 __kmp_do_middle_initialize( void )
6624 {
6625  int i, j;
6626  int prev_dflt_team_nth;
6627 
6628  if( !__kmp_init_serial ) {
6629  __kmp_do_serial_initialize();
6630  }
6631 
6632  KA_TRACE( 10, ("__kmp_middle_initialize: enter\n" ) );
6633 
6634  //
6635  // Save the previous value for the __kmp_dflt_team_nth so that
6636  // we can avoid some reinitialization if it hasn't changed.
6637  //
6638  prev_dflt_team_nth = __kmp_dflt_team_nth;
6639 
6640 #if KMP_AFFINITY_SUPPORTED
6641  //
6642  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6643  // number of cores on the machine.
6644  //
6645  __kmp_affinity_initialize();
6646 
6647  //
6648  // Run through the __kmp_threads array and set the affinity mask
6649  // for each root thread that is currently registered with the RTL.
6650  //
6651  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6652  if ( TCR_PTR( __kmp_threads[ i ] ) != NULL ) {
6653  __kmp_affinity_set_init_mask( i, TRUE );
6654  }
6655  }
6656 #endif /* KMP_AFFINITY_SUPPORTED */
6657 
6658  KMP_ASSERT( __kmp_xproc > 0 );
6659  if ( __kmp_avail_proc == 0 ) {
6660  __kmp_avail_proc = __kmp_xproc;
6661  }
6662 
6663  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), correct them now
6664  j = 0;
6665  while ( ( j < __kmp_nested_nth.used ) && ! __kmp_nested_nth.nth[ j ] ) {
6666  __kmp_nested_nth.nth[ j ] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = __kmp_avail_proc;
6667  j++;
6668  }
6669 
6670  if ( __kmp_dflt_team_nth == 0 ) {
6671 #ifdef KMP_DFLT_NTH_CORES
6672  //
6673  // Default #threads = #cores
6674  //
6675  __kmp_dflt_team_nth = __kmp_ncores;
6676  KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_ncores (%d)\n",
6677  __kmp_dflt_team_nth ) );
6678 #else
6679  //
6680  // Default #threads = #available OS procs
6681  //
6682  __kmp_dflt_team_nth = __kmp_avail_proc;
6683  KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_avail_proc(%d)\n",
6684  __kmp_dflt_team_nth ) );
6685 #endif /* KMP_DFLT_NTH_CORES */
6686  }
6687 
6688  if ( __kmp_dflt_team_nth < KMP_MIN_NTH ) {
6689  __kmp_dflt_team_nth = KMP_MIN_NTH;
6690  }
6691  if( __kmp_dflt_team_nth > __kmp_sys_max_nth ) {
6692  __kmp_dflt_team_nth = __kmp_sys_max_nth;
6693  }
6694 
6695  //
6696  // There's no harm in continuing if the following check fails,
6697  // but it indicates an error in the previous logic.
6698  //
6699  KMP_DEBUG_ASSERT( __kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub );
6700 
6701  if ( __kmp_dflt_team_nth != prev_dflt_team_nth ) {
6702  //
6703  // Run through the __kmp_threads array and set the num threads icv
6704  // for each root thread that is currently registered with the RTL
6705  // (which has not already explicitly set its nthreads-var with a
6706  // call to omp_set_num_threads()).
6707  //
6708  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6709  kmp_info_t *thread = __kmp_threads[ i ];
6710  if ( thread == NULL ) continue;
6711  if ( thread->th.th_current_task->td_icvs.nproc != 0 ) continue;
6712 
6713  set__nproc( __kmp_threads[ i ], __kmp_dflt_team_nth );
6714  }
6715  }
6716  KA_TRACE( 20, ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6717  __kmp_dflt_team_nth) );
6718 
6719 #ifdef KMP_ADJUST_BLOCKTIME
6720  /* Adjust blocktime to zero if necessary */
6721  /* now that __kmp_avail_proc is set */
6722  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
6723  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
6724  if ( __kmp_nth > __kmp_avail_proc ) {
6725  __kmp_zero_bt = TRUE;
6726  }
6727  }
6728 #endif /* KMP_ADJUST_BLOCKTIME */
6729 
6730  /* we have finished middle initialization */
6731  TCW_SYNC_4(__kmp_init_middle, TRUE);
6732 
6733  KA_TRACE( 10, ("__kmp_do_middle_initialize: exit\n" ) );
6734 }
6735 
6736 void
6737 __kmp_middle_initialize( void )
6738 {
6739  if ( __kmp_init_middle ) {
6740  return;
6741  }
6742  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6743  if ( __kmp_init_middle ) {
6744  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6745  return;
6746  }
6747  __kmp_do_middle_initialize();
6748  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6749 #if OMPT_SUPPORT
6750  ompt_init();
6751 #endif
6752 }
6753 
6754 void
6755 __kmp_parallel_initialize( void )
6756 {
6757  int gtid = __kmp_entry_gtid(); // this might be a new root
6758 
6759  /* synchronize parallel initialization (for siblings) */
6760  if( TCR_4(__kmp_init_parallel) ) return;
6761  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6762  if( TCR_4(__kmp_init_parallel) ) { __kmp_release_bootstrap_lock( &__kmp_initz_lock ); return; }
6763 
6764  /* TODO reinitialization after we have already shut down */
6765  if( TCR_4(__kmp_global.g.g_done) ) {
6766  KA_TRACE( 10, ("__kmp_parallel_initialize: attempt to init while shutting down\n" ) );
6767  __kmp_infinite_loop();
6768  }
6769 
6770  /* jc: The lock __kmp_initz_lock is already held, so calling __kmp_serial_initialize
6771  would cause a deadlock. So we call __kmp_do_serial_initialize directly.
6772  */
6773  if( !__kmp_init_middle ) {
6774  __kmp_do_middle_initialize();
6775  }
6776 
6777  /* begin initialization */
6778  KA_TRACE( 10, ("__kmp_parallel_initialize: enter\n" ) );
6779  KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6780 
6781 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6782  //
6783  // Save the FP control regs.
6784  // Worker threads will set theirs to these values at thread startup.
6785  //
6786  __kmp_store_x87_fpu_control_word( &__kmp_init_x87_fpu_control_word );
6787  __kmp_store_mxcsr( &__kmp_init_mxcsr );
6788  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6789 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6790 
6791 #if KMP_OS_UNIX
6792 # if KMP_HANDLE_SIGNALS
6793  /* must be after __kmp_serial_initialize */
6794  __kmp_install_signals( TRUE );
6795 # endif
6796 #endif
6797 
6798  __kmp_suspend_initialize();
6799 
6800 # if defined(USE_LOAD_BALANCE)
6801  if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6802  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6803  }
6804 #else
6805  if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6806  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6807  }
6808 #endif
6809 
6810  if ( __kmp_version ) {
6811  __kmp_print_version_2();
6812  }
6813 
6814  /* we have finished parallel initialization */
6815  TCW_SYNC_4(__kmp_init_parallel, TRUE);
6816 
6817  KMP_MB();
6818  KA_TRACE( 10, ("__kmp_parallel_initialize: exit\n" ) );
6819 
6820  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6821 #if OMPT_SUPPORT
6822  ompt_init();
6823 #endif
6824 }
6825 
6826 
6827 /* ------------------------------------------------------------------------ */
6828 
6829 void
6830 __kmp_run_before_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6831  kmp_team_t *team )
6832 {
6833  kmp_disp_t *dispatch;
6834 
6835  KMP_MB();
6836 
6837  /* none of the threads have encountered any constructs, yet. */
6838  this_thr->th.th_local.this_construct = 0;
6839 #if KMP_CACHE_MANAGE
6840  KMP_CACHE_PREFETCH( &this_thr->th.th_bar[ bs_forkjoin_barrier ].bb.b_arrived );
6841 #endif /* KMP_CACHE_MANAGE */
6842  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6843  KMP_DEBUG_ASSERT( dispatch );
6844  KMP_DEBUG_ASSERT( team->t.t_dispatch );
6845  //KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ this_thr->th.th_info.ds.ds_tid ] );
6846 
6847  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6848 
6849  if( __kmp_env_consistency_check )
6850  __kmp_push_parallel( gtid, team->t.t_ident );
6851 
6852  KMP_MB(); /* Flush all pending memory write invalidates. */
6853 }
6854 
6855 void
6856 __kmp_run_after_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6857  kmp_team_t *team )
6858 {
6859  if( __kmp_env_consistency_check )
6860  __kmp_pop_parallel( gtid, team->t.t_ident );
6861 }
6862 
6863 int
6864 __kmp_invoke_task_func( int gtid )
6865 {
6866  int rc;
6867  int tid = __kmp_tid_from_gtid( gtid );
6868  kmp_info_t *this_thr = __kmp_threads[ gtid ];
6869  kmp_team_t *team = this_thr->th.th_team;
6870 
6871  __kmp_run_before_invoked_task( gtid, tid, this_thr, team );
6872 #if USE_ITT_BUILD
6873  if ( __itt_stack_caller_create_ptr ) {
6874  __kmp_itt_stack_callee_enter( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about entering user's code
6875  }
6876 #endif /* USE_ITT_BUILD */
6877 #if INCLUDE_SSC_MARKS
6878  SSC_MARK_INVOKING();
6879 #endif
6880 
6881 #if OMPT_SUPPORT
6882  void *dummy;
6883  void **exit_runtime_p;
6884  ompt_task_id_t my_task_id;
6885  ompt_parallel_id_t my_parallel_id;
6886 
6887  if (ompt_status & ompt_status_track) {
6888  exit_runtime_p = &(team->t.t_implicit_task_taskdata[tid].
6889  ompt_task_info.frame.exit_runtime_frame);
6890  } else {
6891  exit_runtime_p = &dummy;
6892  }
6893 
6894 #if OMPT_TRACE
6895  my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
6896  my_parallel_id = team->t.ompt_team_info.parallel_id;
6897  if ((ompt_status == ompt_status_track_callback) &&
6898  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
6899  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
6900  my_parallel_id, my_task_id);
6901  }
6902 #endif
6903 #endif
6904 
6905  rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
6906  gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv
6907 #if OMPT_SUPPORT
6908  , exit_runtime_p
6909 #endif
6910  );
6911 
6912 #if OMPT_SUPPORT && OMPT_TRACE
6913  if (ompt_status & ompt_status_track) {
6914  if ((ompt_status == ompt_status_track_callback) &&
6915  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
6916  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
6917  my_parallel_id, my_task_id);
6918  }
6919  // the implicit task is not dead yet, so we can't clear its task id here
6920  team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_runtime_frame = 0;
6921  }
6922 #endif
6923 
6924 #if USE_ITT_BUILD
6925  if ( __itt_stack_caller_create_ptr ) {
6926  __kmp_itt_stack_callee_leave( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about leaving user's code
6927  }
6928 #endif /* USE_ITT_BUILD */
6929  __kmp_run_after_invoked_task( gtid, tid, this_thr, team );
6930 
6931  return rc;
6932 }
6933 
6934 #if OMP_40_ENABLED
6935 void
6936 __kmp_teams_master( int gtid )
6937 {
6938  // This routine is called by all master threads in the teams construct
6939  kmp_info_t *thr = __kmp_threads[ gtid ];
6940  kmp_team_t *team = thr->th.th_team;
6941  ident_t *loc = team->t.t_ident;
6942  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
6943  KMP_DEBUG_ASSERT( thr->th.th_teams_microtask );
6944  KMP_DEBUG_ASSERT( thr->th.th_set_nproc );
6945  KA_TRACE( 20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n",
6946  gtid, __kmp_tid_from_gtid( gtid ), thr->th.th_teams_microtask ) );
6947  // Launch the league of teams now, but do not let the workers execute
6948  // (they hang on the fork barrier until the next parallel region)
6949 #if INCLUDE_SSC_MARKS
6950  SSC_MARK_FORKING();
6951 #endif
6952  __kmp_fork_call( loc, gtid, fork_context_intel,
6953  team->t.t_argc,
6954 #if OMPT_SUPPORT
6955  (void *)thr->th.th_teams_microtask, // "unwrapped" task
6956 #endif
6957  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
6958  VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
6959  NULL );
6960 #if INCLUDE_SSC_MARKS
6961  SSC_MARK_JOINING();
6962 #endif
6963  __kmp_join_call( loc, gtid, 1 ); // AC: the last parameter "1" eliminates the join barrier, which would not work because
6964  // the worker threads are in a fork barrier waiting for more parallel regions
6965 }
6966 
6967 int
6968 __kmp_invoke_teams_master( int gtid )
6969 {
6970  kmp_info_t *this_thr = __kmp_threads[ gtid ];
6971  kmp_team_t *team = this_thr->th.th_team;
6972  #if KMP_DEBUG
6973  if ( !__kmp_threads[gtid]-> th.th_team->t.t_serialized )
6974  KMP_DEBUG_ASSERT( (void*)__kmp_threads[gtid]-> th.th_team->t.t_pkfn == (void*)__kmp_teams_master );
6975  #endif
6976  __kmp_run_before_invoked_task( gtid, 0, this_thr, team );
6977  __kmp_teams_master( gtid );
6978  __kmp_run_after_invoked_task( gtid, 0, this_thr, team );
6979  return 1;
6980 }
6981 #endif /* OMP_40_ENABLED */
6982 
6983 /* this sets the requested number of threads for the next parallel region
6984  * encountered by this team */
6985 /* since this should be enclosed in the forkjoin critical section, it
6986  * should avoid race conditions with asymmetrical nested parallelism */
6987 
6988 void
6989 __kmp_push_num_threads( ident_t *id, int gtid, int num_threads )
6990 {
6991  kmp_info_t *thr = __kmp_threads[gtid];
6992 
6993  if( num_threads > 0 )
6994  thr->th.th_set_nproc = num_threads;
6995 }
6996 
6997 #if OMP_40_ENABLED
6998 
6999 /* this sets the requested number of teams for the teams region and/or
7000  * the number of threads for the next parallel region encountered */
7001 void
7002 __kmp_push_num_teams( ident_t *id, int gtid, int num_teams, int num_threads )
7003 {
7004  kmp_info_t *thr = __kmp_threads[gtid];
7005  KMP_DEBUG_ASSERT(num_teams >= 0);
7006  KMP_DEBUG_ASSERT(num_threads >= 0);
7007  if( num_teams == 0 ) {
7008  num_teams = 1; // default number of teams is 1.
7009  }
7010  // Set number of teams (number of threads in the outer "parallel" of the teams)
7011  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7012 
7013  // Remember the number of threads for inner parallel regions
7014  if( num_threads > 0 ) {
7015  thr->th.th_teams_size.nth = num_threads;
7016  } else {
7017  if( !TCR_4(__kmp_init_middle) )
7018  __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7019  thr->th.th_teams_size.nth = __kmp_avail_proc / num_teams;
7020  }
7021 }
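// For example (illustrative): a construct such as
//     #pragma omp teams num_teams(4) thread_limit(8)
// typically reaches this routine with num_teams = 4 and num_threads = 8, giving
// th_teams_size = { nteams = 4, nth = 8 }; without a thread_limit clause, nth defaults to
// __kmp_avail_proc / num_teams as computed above.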
7022 
7023 
7024 //
7025 // Set the proc_bind var to use in the following parallel region.
7026 //
7027 void
7028 __kmp_push_proc_bind( ident_t *id, int gtid, kmp_proc_bind_t proc_bind )
7029 {
7030  kmp_info_t *thr = __kmp_threads[gtid];
7031  thr->th.th_set_proc_bind = proc_bind;
7032 }
7033 
7034 #endif /* OMP_40_ENABLED */
7035 
7036 /* Launch the worker threads into the microtask. */
7037 
7038 void
7039 __kmp_internal_fork( ident_t *id, int gtid, kmp_team_t *team )
7040 {
7041  kmp_info_t *this_thr = __kmp_threads[gtid];
7042 
7043 #ifdef KMP_DEBUG
7044  int f;
7045 #endif /* KMP_DEBUG */
7046 
7047  KMP_DEBUG_ASSERT( team );
7048  KMP_DEBUG_ASSERT( this_thr->th.th_team == team );
7049  KMP_ASSERT( KMP_MASTER_GTID(gtid) );
7050  KMP_MB(); /* Flush all pending memory write invalidates. */
7051 
7052  team->t.t_construct = 0; /* no single directives seen yet */
7053  team->t.t_ordered.dt.t_value = 0; /* thread 0 enters the ordered section first */
7054 
7055  /* Reset the identifiers on the dispatch buffer */
7056  KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
7057  if ( team->t.t_max_nproc > 1 ) {
7058  int i;
7059  for (i = 0; i < KMP_MAX_DISP_BUF; ++i)
7060  team->t.t_disp_buffer[ i ].buffer_index = i;
7061  } else {
7062  team->t.t_disp_buffer[ 0 ].buffer_index = 0;
7063  }
7064 
7065  KMP_MB(); /* Flush all pending memory write invalidates. */
7066  KMP_ASSERT( this_thr->th.th_team == team );
7067 
7068 #ifdef KMP_DEBUG
7069  for( f=0 ; f<team->t.t_nproc ; f++ ) {
7070  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
7071  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
7072  }
7073 #endif /* KMP_DEBUG */
7074 
7075  /* release the worker threads so they may begin working */
7076  __kmp_fork_barrier( gtid, 0 );
7077 }
7078 
7079 
7080 void
7081 __kmp_internal_join( ident_t *id, int gtid, kmp_team_t *team )
7082 {
7083  kmp_info_t *this_thr = __kmp_threads[gtid];
7084 
7085  KMP_DEBUG_ASSERT( team );
7086  KMP_DEBUG_ASSERT( this_thr->th.th_team == team );
7087  KMP_ASSERT( KMP_MASTER_GTID(gtid) );
7088  KMP_MB(); /* Flush all pending memory write invalidates. */
7089 
7090  /* Join barrier after fork */
7091 
7092 #ifdef KMP_DEBUG
7093  if (__kmp_threads[gtid] && __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc ) {
7094  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n",gtid, gtid, __kmp_threads[gtid]);
7095  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, team->t.t_nproc=%d\n",
7096  gtid, __kmp_threads[gtid]->th.th_team_nproc, team, team->t.t_nproc);
7097  __kmp_print_structure();
7098  }
7099  KMP_DEBUG_ASSERT( __kmp_threads[gtid] &&
7100  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc );
7101 #endif /* KMP_DEBUG */
7102 
7103  __kmp_join_barrier( gtid ); /* wait for everyone */
7104 
7105  KMP_MB(); /* Flush all pending memory write invalidates. */
7106  KMP_ASSERT( this_thr->th.th_team == team );
7107 }
7108 
7109 
7110 /* ------------------------------------------------------------------------ */
7111 /* ------------------------------------------------------------------------ */
7112 
7113 #ifdef USE_LOAD_BALANCE
7114 
7115 //
7116 // Return the number of worker threads actively spinning in the hot team, if we
7117 // are at the outermost level of parallelism. Otherwise, return 0.
7118 //
7119 static int
7120 __kmp_active_hot_team_nproc( kmp_root_t *root )
7121 {
7122  int i;
7123  int retval;
7124  kmp_team_t *hot_team;
7125 
7126  if ( root->r.r_active ) {
7127  return 0;
7128  }
7129  hot_team = root->r.r_hot_team;
7130  if ( __kmp_dflt_blocktime == KMP_MAX_BLOCKTIME ) {
7131  return hot_team->t.t_nproc - 1; // Don't count master thread
7132  }
7133 
7134  //
7135  // Skip the master thread - it is accounted for elsewhere.
7136  //
7137  retval = 0;
7138  for ( i = 1; i < hot_team->t.t_nproc; i++ ) {
7139  if ( hot_team->t.t_threads[i]->th.th_active ) {
7140  retval++;
7141  }
7142  }
7143  return retval;
7144 }
7145 
7146 //
7147 // Perform an automatic adjustment to the number of
7148 // threads used by the next parallel region.
7149 //
7150 static int
7151 __kmp_load_balance_nproc( kmp_root_t *root, int set_nproc )
7152 {
7153  int retval;
7154  int pool_active;
7155  int hot_team_active;
7156  int team_curr_active;
7157  int system_active;
7158 
7159  KB_TRACE( 20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n",
7160  root, set_nproc ) );
7161  KMP_DEBUG_ASSERT( root );
7162  KMP_DEBUG_ASSERT( root->r.r_root_team->t.t_threads[0]->th.th_current_task->td_icvs.dynamic == TRUE );
7163  KMP_DEBUG_ASSERT( set_nproc > 1 );
7164 
7165  if ( set_nproc == 1) {
7166  KB_TRACE( 20, ("__kmp_load_balance_nproc: serial execution.\n" ) );
7167  return 1;
7168  }
7169 
7170  //
7171  // Threads that are active in the thread pool, active in the hot team
7172  // for this particular root (if we are at the outer par level), and
7173  // the currently executing thread (to become the master) are available
7174  // to add to the new team, but are currently contributing to the system
7175  // load, and must be accounted for.
7176  //
7177  pool_active = TCR_4(__kmp_thread_pool_active_nth);
7178  hot_team_active = __kmp_active_hot_team_nproc( root );
7179  team_curr_active = pool_active + hot_team_active + 1;
7180 
7181  //
7182  // Check the system load.
7183  //
7184  system_active = __kmp_get_load_balance( __kmp_avail_proc + team_curr_active );
7185  KB_TRACE( 30, ("__kmp_load_balance_nproc: system active = %d pool active = %d hot team active = %d\n",
7186  system_active, pool_active, hot_team_active ) );
7187 
7188  if ( system_active < 0 ) {
7189  //
7190  // There was an error reading the necessary info from /proc,
7191  // so use the thread limit algorithm instead. Once we set
7192  // __kmp_global.g.g_dynamic_mode = dynamic_thread_limit,
7193  // we shouldn't wind up getting back here.
7194  //
7195  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7196  KMP_WARNING( CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit" );
7197 
7198  //
7199  // Make this call behave like the thread limit algorithm.
7200  //
7201  retval = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
7202  : root->r.r_hot_team->t.t_nproc);
7203  if ( retval > set_nproc ) {
7204  retval = set_nproc;
7205  }
7206  if ( retval < KMP_MIN_NTH ) {
7207  retval = KMP_MIN_NTH;
7208  }
7209 
7210  KB_TRACE( 20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", retval ) );
7211  return retval;
7212  }
7213 
7214  //
7215  // There is a slight delay in the load balance algorithm in detecting
7216  // newly running procs. The real system load at this instant should be
7217  // at least as large as the number of active OpenMP threads that are available
7218  // to add to the team.
7219  //
7220  if ( system_active < team_curr_active ) {
7221  system_active = team_curr_active;
7222  }
7223  retval = __kmp_avail_proc - system_active + team_curr_active;
7224  if ( retval > set_nproc ) {
7225  retval = set_nproc;
7226  }
7227  if ( retval < KMP_MIN_NTH ) {
7228  retval = KMP_MIN_NTH;
7229  }
7230 
7231  KB_TRACE( 20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval ) );
7232  return retval;
7233 } // __kmp_load_balance_nproc()
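// Worked example (illustrative numbers): with __kmp_avail_proc = 16, a measured
// system_active of 10 and team_curr_active = 3 (pool + hot team + master), the code above
// yields retval = 16 - 10 + 3 = 9, which is then clamped to the range [KMP_MIN_NTH, set_nproc].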
7234 
7235 #endif /* USE_LOAD_BALANCE */
7236 
7237 
7238 /* ------------------------------------------------------------------------ */
7239 /* ------------------------------------------------------------------------ */
7240 
7241 /* NOTE: this is called with the __kmp_init_lock held */
7242 void
7243 __kmp_cleanup( void )
7244 {
7245  int f;
7246 
7247  KA_TRACE( 10, ("__kmp_cleanup: enter\n" ) );
7248 
7249  if (TCR_4(__kmp_init_parallel)) {
7250 #if KMP_HANDLE_SIGNALS
7251  __kmp_remove_signals();
7252 #endif
7253  TCW_4(__kmp_init_parallel, FALSE);
7254  }
7255 
7256  if (TCR_4(__kmp_init_middle)) {
7257 #if KMP_AFFINITY_SUPPORTED
7258  __kmp_affinity_uninitialize();
7259 #endif /* KMP_AFFINITY_SUPPORTED */
7260  TCW_4(__kmp_init_middle, FALSE);
7261  }
7262 
7263  KA_TRACE( 10, ("__kmp_cleanup: go serial cleanup\n" ) );
7264 
7265  if (__kmp_init_serial) {
7266 
7267  __kmp_runtime_destroy();
7268 
7269  __kmp_init_serial = FALSE;
7270  }
7271 
7272  for ( f = 0; f < __kmp_threads_capacity; f++ ) {
7273  if ( __kmp_root[ f ] != NULL ) {
7274  __kmp_free( __kmp_root[ f ] );
7275  __kmp_root[ f ] = NULL;
7276  }
7277  }
7278  __kmp_free( __kmp_threads );
7279  // __kmp_threads and __kmp_root were allocated at once, as a single block, so there is no need
7280  // to free __kmp_root separately.
7281  __kmp_threads = NULL;
7282  __kmp_root = NULL;
7283  __kmp_threads_capacity = 0;
7284 
7285 #if KMP_USE_DYNAMIC_LOCK
7286  __kmp_cleanup_indirect_user_locks();
7287 #else
7288  __kmp_cleanup_user_locks();
7289 #endif
7290 
7291  #if KMP_AFFINITY_SUPPORTED
7292  KMP_INTERNAL_FREE( (void *) __kmp_cpuinfo_file );
7293  __kmp_cpuinfo_file = NULL;
7294  #endif /* KMP_AFFINITY_SUPPORTED */
7295 
7296  #if KMP_USE_ADAPTIVE_LOCKS
7297  #if KMP_DEBUG_ADAPTIVE_LOCKS
7298  __kmp_print_speculative_stats();
7299  #endif
7300  #endif
7301  KMP_INTERNAL_FREE( __kmp_nested_nth.nth );
7302  __kmp_nested_nth.nth = NULL;
7303  __kmp_nested_nth.size = 0;
7304  __kmp_nested_nth.used = 0;
7305 
7306  __kmp_i18n_catclose();
7307 
7308 #if KMP_STATS_ENABLED
7309  __kmp_accumulate_stats_at_exit();
7310  __kmp_stats_list.deallocate();
7311 #endif
7312 
7313  KA_TRACE( 10, ("__kmp_cleanup: exit\n" ) );
7314 }
7315 
7316 /* ------------------------------------------------------------------------ */
7317 /* ------------------------------------------------------------------------ */
7318 
7319 int
7320 __kmp_ignore_mppbeg( void )
7321 {
7322  char *env;
7323 
7324  if ((env = getenv( "KMP_IGNORE_MPPBEG" )) != NULL) {
7325  if (__kmp_str_match_false( env ))
7326  return FALSE;
7327  }
7328  // By default __kmpc_begin() is no-op.
7329  return TRUE;
7330 }
7331 
7332 int
7333 __kmp_ignore_mppend( void )
7334 {
7335  char *env;
7336 
7337  if ((env = getenv( "KMP_IGNORE_MPPEND" )) != NULL) {
7338  if (__kmp_str_match_false( env ))
7339  return FALSE;
7340  }
7341  // By default __kmpc_end() is no-op.
7342  return TRUE;
7343 }
7344 
7345 void
7346 __kmp_internal_begin( void )
7347 {
7348  int gtid;
7349  kmp_root_t *root;
7350 
7351  /* this is a very important step as it will register new sibling threads
7352  * and assign these new uber threads a new gtid */
7353  gtid = __kmp_entry_gtid();
7354  root = __kmp_threads[ gtid ]->th.th_root;
7355  KMP_ASSERT( KMP_UBER_GTID( gtid ));
7356 
7357  if( root->r.r_begin ) return;
7358  __kmp_acquire_lock( &root->r.r_begin_lock, gtid );
7359  if( root->r.r_begin ) {
7360  __kmp_release_lock( & root->r.r_begin_lock, gtid );
7361  return;
7362  }
7363 
7364  root->r.r_begin = TRUE;
7365 
7366  __kmp_release_lock( & root->r.r_begin_lock, gtid );
7367 }
7368 
7369 
7370 /* ------------------------------------------------------------------------ */
7371 /* ------------------------------------------------------------------------ */
7372 
7373 void
7374 __kmp_user_set_library (enum library_type arg)
7375 {
7376  int gtid;
7377  kmp_root_t *root;
7378  kmp_info_t *thread;
7379 
7380  /* first, make sure we are initialized so we can get our gtid */
7381 
7382  gtid = __kmp_entry_gtid();
7383  thread = __kmp_threads[ gtid ];
7384 
7385  root = thread->th.th_root;
7386 
7387  KA_TRACE( 20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, library_serial ));
7388  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level thread */
7389  KMP_WARNING( SetLibraryIncorrectCall );
7390  return;
7391  }
7392 
7393  switch ( arg ) {
7394  case library_serial :
7395  thread->th.th_set_nproc = 0;
7396  set__nproc( thread, 1 );
7397  break;
7398  case library_turnaround :
7399  thread->th.th_set_nproc = 0;
7400  set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
7401  break;
7402  case library_throughput :
7403  thread->th.th_set_nproc = 0;
7404  set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
7405  break;
7406  default:
7407  KMP_FATAL( UnknownLibraryType, arg );
7408  }
7409 
7410  __kmp_aux_set_library ( arg );
7411 }
7412 
7413 void
7414 __kmp_aux_set_stacksize( size_t arg )
7415 {
7416  if (! __kmp_init_serial)
7417  __kmp_serial_initialize();
7418 
7419 #if KMP_OS_DARWIN
7420  if (arg & (0x1000 - 1)) {
7421  arg &= ~(0x1000 - 1);
7422  if(arg + 0x1000) /* check for overflow if we round up */
7423  arg += 0x1000;
7424  }
7425 #endif
7426  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
7427 
7428  /* only change the default stacksize before the first parallel region */
7429  if (! TCR_4(__kmp_init_parallel)) {
7430  size_t value = arg; /* argument is in bytes */
7431 
7432  if (value < __kmp_sys_min_stksize )
7433  value = __kmp_sys_min_stksize ;
7434  else if (value > KMP_MAX_STKSIZE)
7435  value = KMP_MAX_STKSIZE;
7436 
7437  __kmp_stksize = value;
7438 
7439  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7440  }
7441 
7442  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7443 }
7444 
7445 /* set the behaviour of the runtime library */
7446 /* TODO this can cause some odd behaviour with sibling parallelism... */
7447 void
7448 __kmp_aux_set_library (enum library_type arg)
7449 {
7450  __kmp_library = arg;
7451 
7452  switch ( __kmp_library ) {
7453  case library_serial :
7454  {
7455  KMP_INFORM( LibraryIsSerial );
7456  (void) __kmp_change_library( TRUE );
7457  }
7458  break;
7459  case library_turnaround :
7460  (void) __kmp_change_library( TRUE );
7461  break;
7462  case library_throughput :
7463  (void) __kmp_change_library( FALSE );
7464  break;
7465  default:
7466  KMP_FATAL( UnknownLibraryType, arg );
7467  }
7468 }
7469 
7470 /* ------------------------------------------------------------------------ */
7471 /* ------------------------------------------------------------------------ */
7472 
7473 void
7474 __kmp_aux_set_blocktime (int arg, kmp_info_t *thread, int tid)
7475 {
7476  int blocktime = arg; /* argument is in milliseconds */
7477  int bt_intervals;
7478  int bt_set;
7479 
7480  __kmp_save_internal_controls( thread );
7481 
7482  /* Normalize and set blocktime for the teams */
7483  if (blocktime < KMP_MIN_BLOCKTIME)
7484  blocktime = KMP_MIN_BLOCKTIME;
7485  else if (blocktime > KMP_MAX_BLOCKTIME)
7486  blocktime = KMP_MAX_BLOCKTIME;
7487 
7488  set__blocktime_team( thread->th.th_team, tid, blocktime );
7489  set__blocktime_team( thread->th.th_serial_team, 0, blocktime );
7490 
7491  /* Calculate and set blocktime intervals for the teams */
7492  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7493 
7494  set__bt_intervals_team( thread->th.th_team, tid, bt_intervals );
7495  set__bt_intervals_team( thread->th.th_serial_team, 0, bt_intervals );
7496 
7497  /* Record that blocktime has been explicitly set */
7498  bt_set = TRUE;
7499 
7500  set__bt_set_team( thread->th.th_team, tid, bt_set );
7501  set__bt_set_team( thread->th.th_serial_team, 0, bt_set );
7502  KF_TRACE(10, ( "kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, bt_intervals=%d, monitor_updates=%d\n",
7503  __kmp_gtid_from_tid(tid, thread->th.th_team),
7504  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, __kmp_monitor_wakeups ) );
7505 }
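// Usage note: this is the worker behind kmp_set_blocktime(); for example, a call of
// kmp_set_blocktime(0) should make the team's threads go to sleep almost immediately after
// finishing parallel work instead of spin-waiting at the barrier.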
7506 
7507 void
7508 __kmp_aux_set_defaults(
7509  char const * str,
7510  int len
7511 ) {
7512  if ( ! __kmp_init_serial ) {
7513  __kmp_serial_initialize();
7514  };
7515  __kmp_env_initialize( str );
7516 
7517  if (__kmp_settings
7518 #if OMP_40_ENABLED
7519  || __kmp_display_env || __kmp_display_env_verbose
7520 #endif // OMP_40_ENABLED
7521  ) {
7522  __kmp_env_print();
7523  }
7524 } // __kmp_aux_set_defaults
7525 
7526 /* ------------------------------------------------------------------------ */
7527 
7528 /*
7529  * internal fast reduction routines
7530  */
7531 
7532 PACKED_REDUCTION_METHOD_T
7533 __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid,
7534  kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7535  kmp_critical_name *lck )
7536 {
7537 
7538  // Default reduction method: critical construct ( lck != NULL, like in current PAROPT )
7539  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method can be selected by RTL
7540  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method can be selected by RTL
7541  // Finally, it is up to the OpenMP RTL to decide which method to select among those generated by PAROPT.
7542 
7543  PACKED_REDUCTION_METHOD_T retval;
7544 
7545  int team_size;
7546 
7547  KMP_DEBUG_ASSERT( loc ); // it would be nice to test ( loc != 0 )
7548  KMP_DEBUG_ASSERT( lck ); // it would be nice to test ( lck != 0 )
7549 
7550  #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED ( ( loc->flags & ( KMP_IDENT_ATOMIC_REDUCE ) ) == ( KMP_IDENT_ATOMIC_REDUCE ) )
7551  #define FAST_REDUCTION_TREE_METHOD_GENERATED ( ( reduce_data ) && ( reduce_func ) )
7552 
7553  retval = critical_reduce_block;
7554 
7555  team_size = __kmp_get_team_num_threads( global_tid ); // another way of getting the team size ( with 1 dynamic dereference ) is slower
7556 
7557  if( team_size == 1 ) {
7558 
7559  retval = empty_reduce_block;
7560 
7561  } else {
7562 
7563  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7564  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7565 
7566  #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7567 
7568  #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
7569 
7570  int teamsize_cutoff = 4;
7571 
7572 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
7573  if( __kmp_mic_type != non_mic ) {
7574  teamsize_cutoff = 8;
7575  }
7576 #endif
7577  if( tree_available ) {
7578  if( team_size <= teamsize_cutoff ) {
7579  if ( atomic_available ) {
7580  retval = atomic_reduce_block;
7581  }
7582  } else {
7583  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7584  }
7585  } else if ( atomic_available ) {
7586  retval = atomic_reduce_block;
7587  }
7588  #else
7589  #error "Unknown or unsupported OS"
7590  #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
7591 
7592  #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7593 
7594  #if KMP_OS_LINUX || KMP_OS_WINDOWS
7595 
7596  // basic tuning
7597 
7598  if( atomic_available ) {
7599  if( num_vars <= 2 ) { // && ( team_size <= 8 ) due to false-sharing ???
7600  retval = atomic_reduce_block;
7601  }
7602  } // otherwise: use critical section
7603 
7604  #elif KMP_OS_DARWIN
7605 
7606  if( atomic_available && ( num_vars <= 3 ) ) {
7607  retval = atomic_reduce_block;
7608  } else if( tree_available ) {
7609  if( ( reduce_size > ( 9 * sizeof( kmp_real64 ) ) ) && ( reduce_size < ( 2000 * sizeof( kmp_real64 ) ) ) ) {
7610  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7611  }
7612  } // otherwise: use critical section
7613 
7614  #else
7615  #error "Unknown or unsupported OS"
7616  #endif
7617 
7618  #else
7619  #error "Unknown or unsupported architecture"
7620  #endif
7621 
7622  }
7623 
7624  // KMP_FORCE_REDUCTION
7625 
7626  if( __kmp_force_reduction_method != reduction_method_not_defined ) {
7627 
7628  PACKED_REDUCTION_METHOD_T forced_retval;
7629 
7630  int atomic_available, tree_available;
7631 
7632  switch( ( forced_retval = __kmp_force_reduction_method ) )
7633  {
7634  case critical_reduce_block:
7635  KMP_ASSERT( lck ); // lck should be != 0
7636  if( team_size <= 1 ) {
7637  forced_retval = empty_reduce_block;
7638  }
7639  break;
7640 
7641  case atomic_reduce_block:
7642  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7643  KMP_ASSERT( atomic_available ); // atomic_available should be != 0
7644  break;
7645 
7646  case tree_reduce_block:
7647  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7648  KMP_ASSERT( tree_available ); // tree_available should be != 0
7649  #if KMP_FAST_REDUCTION_BARRIER
7650  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7651  #endif
7652  break;
7653 
7654  default:
7655  KMP_ASSERT( 0 ); // "unsupported method specified"
7656  }
7657 
7658  retval = forced_retval;
7659  }
7660 
7661  KA_TRACE(10, ( "reduction method selected=%08x\n", retval ) );
7662 
7663  #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7664  #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7665 
7666  return ( retval );
7667 }
7668 
7669 // this function is for testing set/get/determine reduce method
7670 kmp_int32
7671 __kmp_get_reduce_method( void ) {
7672  return ( ( __kmp_entry_thread()->th.th_local.packed_reduction_method ) >> 8 );
7673 }
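// Descriptive note: the packed reduction method keeps the barrier kind in its low byte, so
// the shift by 8 above exposes the bare reduction method enumerator (critical, atomic,
// tree, or empty block).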
7674 
7675 /* ------------------------------------------------------------------------ */