LLVM OpenMP* Runtime Library
kmp_barrier.cpp
1 /*
2  * kmp_barrier.cpp
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_stats.h"
19 #include "kmp_itt.h"
20 
21 #if KMP_MIC
22 #include <immintrin.h>
23 #define USE_NGO_STORES 1
24 #endif // KMP_MIC
25 
26 #if KMP_MIC && USE_NGO_STORES
27 // ICV copying
28 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
29 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
30 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31 #define ngo_sync() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
32 #else
33 #define ngo_load(src) ((void)0)
34 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
35 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
36 #define ngo_sync() ((void)0)
37 #endif /* KMP_MIC && USE_NGO_STORES */
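
// Note: the ngo_* macros above express a "load once, store to many cache lines,
// then synchronize" pattern: on KNC they map to non-globally-ordered 512-bit
// stores plus a lock-prefixed instruction as the ordering fence, otherwise they
// fall back to plain copies. A minimal portable sketch of the same shape is kept
// below for reference only (assumptions: a 64-byte line-size stand-in and a
// release fence standing in for the lock-prefixed fence; names are hypothetical).
#if 0 // Illustrative sketch -- not compiled into the runtime.
#include <atomic>
#include <cstring>

enum { EXAMPLE_CACHE_LINE = 64 }; // stand-in for CACHE_LINE

// Copy one cache line from src into each destination, then publish the stores:
// the same ngo_load / ngo_store_* / ngo_sync sequence used later in this file.
static void example_broadcast_line(void *dsts[], int ndsts, const void *src) {
    char line[EXAMPLE_CACHE_LINE];
    std::memcpy(line, src, sizeof(line));                 // ngo_load
    for (int i = 0; i < ndsts; ++i)
        std::memcpy(dsts[i], line, sizeof(line));         // ngo_store_icvs / ngo_store_go
    std::atomic_thread_fence(std::memory_order_release);  // ngo_sync
}
#endif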
38 
39 void __kmp_print_structure(void); // Forward declaration
40 
41 // ---------------------------- Barrier Algorithms ----------------------------
42 
43 // Linear Barrier
44 static void
45 __kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
46  void (*reduce)(void *, void *)
47  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
48 {
49  KMP_TIME_BLOCK(KMP_linear_gather);
50  register kmp_team_t *team = this_thr->th.th_team;
51  register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
52  register kmp_info_t **other_threads = team->t.t_threads;
53 
54  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
55  gtid, team->t.t_id, tid, bt));
56  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
57 
58 #if USE_ITT_BUILD && USE_ITT_NOTIFY
59  // Barrier imbalance - save arrive time to the thread
60  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
61  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
62  }
63 #endif
64  // We now perform a linear reduction to signal that all of the threads have arrived.
65  if (!KMP_MASTER_TID(tid)) {
66  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
67  "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
68  __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
69  thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
70  // Mark arrival to master thread
71  /* After performing this write, a worker thread may not assume that the team is valid
72  any more - it could be deallocated by the master thread at any time. */
73  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
74  flag.release();
75  } else {
76  register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
77  register int nproc = this_thr->th.th_team_nproc;
78  register int i;
79  // Don't have to worry about sleep bit here or atomic since team setting
80  register kmp_uint new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
81 
82  // Collect all the worker team member threads.
83  for (i=1; i<nproc; ++i) {
84 #if KMP_CACHE_MANAGE
85  // Prefetch next thread's arrived count
86  if (i+1 < nproc)
87  KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
88 #endif /* KMP_CACHE_MANAGE */
89  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
90  "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
91  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
92  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
93 
94  // Wait for worker thread to arrive
95  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
96  flag.wait(this_thr, FALSE
97  USE_ITT_BUILD_ARG(itt_sync_obj) );
98 #if USE_ITT_BUILD && USE_ITT_NOTIFY
99  // Barrier imbalance - write min of the thread time and the other thread time to the thread.
100  if (__kmp_forkjoin_frames_mode == 2) {
101  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
102  other_threads[i]->th.th_bar_min_time);
103  }
104 #endif
105  if (reduce) {
106  KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
107  team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
108  (*reduce)(this_thr->th.th_local.reduce_data,
109  other_threads[i]->th.th_local.reduce_data);
110  }
111  }
112  // Don't have to worry about sleep bit here or atomic since team setting
113  team_bar->b_arrived = new_state;
114  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
115  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
116  }
117  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
118  gtid, team->t.t_id, tid, bt));
119 }
120 
121 static void
122 __kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
123  int propagate_icvs
124  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
125 {
126  KMP_TIME_BLOCK(KMP_linear_release);
127  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
128  register kmp_team_t *team;
129 
130  if (KMP_MASTER_TID(tid)) {
131  register unsigned int i;
132  register kmp_uint32 nproc = this_thr->th.th_team_nproc;
133  register kmp_info_t **other_threads;
134 
135  team = __kmp_threads[gtid]->th.th_team;
136  KMP_DEBUG_ASSERT(team != NULL);
137  other_threads = team->t.t_threads;
138 
139  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
140  gtid, team->t.t_id, tid, bt));
141 
142  if (nproc > 1) {
143 #if KMP_BARRIER_ICV_PUSH
144  KMP_START_EXPLICIT_TIMER(USER_icv_copy);
145  if (propagate_icvs) {
146  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
147  for (i=1; i<nproc; ++i) {
148  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
149  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
150  &team->t.t_implicit_task_taskdata[0].td_icvs);
151  }
152  ngo_sync();
153  }
154  KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
155 #endif // KMP_BARRIER_ICV_PUSH
156 
157  // Now, release all of the worker threads
158  for (i=1; i<nproc; ++i) {
159 #if KMP_CACHE_MANAGE
160  // Prefetch next thread's go flag
161  if (i+1 < nproc)
162  KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
163 #endif /* KMP_CACHE_MANAGE */
164  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
165  "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
166  other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
167  &other_threads[i]->th.th_bar[bt].bb.b_go,
168  other_threads[i]->th.th_bar[bt].bb.b_go,
169  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
170  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
171  flag.release();
172  }
173  }
174  } else { // Wait for the MASTER thread to release us
175  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
176  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
177  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
178  flag.wait(this_thr, TRUE
179  USE_ITT_BUILD_ARG(itt_sync_obj) );
180 #if USE_ITT_BUILD && USE_ITT_NOTIFY
181  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
182  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
183  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
184  // Cancel wait on previous parallel region...
185  __kmp_itt_task_starting(itt_sync_obj);
186 
187  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
188  return;
189 
190  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
191  if (itt_sync_obj != NULL)
192  // Call prepare as early as possible for "new" barrier
193  __kmp_itt_task_finished(itt_sync_obj);
194  } else
195 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
196  // Early exit for reaping threads releasing forkjoin barrier
197  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
198  return;
199  // The worker thread may now assume that the team is valid.
200 #ifdef KMP_DEBUG
201  tid = __kmp_tid_from_gtid(gtid);
202  team = __kmp_threads[gtid]->th.th_team;
203 #endif
204  KMP_DEBUG_ASSERT(team != NULL);
205  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
206  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
207  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
208  KMP_MB(); // Flush all pending memory write invalidates.
209  }
210  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
211  gtid, team->t.t_id, tid, bt));
212 }
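
// The two routines above form the linear barrier: every worker bumps its own
// b_arrived flag, the master spins on each worker's flag in turn (folding in
// reduce_data if requested), then bumps every worker's b_go flag to release the
// team. A condensed, self-contained analogue is sketched below (assumptions:
// std::atomic counters stand in for kmp_flag_64, STATE_BUMP for
// KMP_BARRIER_STATE_BUMP, and yielding/sleeping in the wait loop is omitted).
#if 0 // Illustrative sketch -- not compiled into the runtime.
#include <atomic>
#include <vector>

struct example_bstate {                               // stand-in for kmp_bstate_t
    std::atomic<unsigned long long> b_arrived{0};
    std::atomic<unsigned long long> b_go{0};
};
enum : unsigned long long { STATE_BUMP = 4 };         // stand-in for KMP_BARRIER_STATE_BUMP

// Worker side of the gather: advance my arrived flag so the master can observe it.
static void example_linear_arrive(example_bstate &me) {
    me.b_arrived.fetch_add(STATE_BUMP, std::memory_order_release);
}

// Master side: wait until every worker has passed 'phase', then release them all.
static void example_linear_master(std::vector<example_bstate> &bar,
                                  unsigned long long phase) {
    for (size_t i = 1; i < bar.size(); ++i)           // gather
        while (bar[i].b_arrived.load(std::memory_order_acquire) < phase + STATE_BUMP)
            ; // spin (the real runtime may also yield or sleep here)
    for (size_t i = 1; i < bar.size(); ++i)           // release
        bar[i].b_go.fetch_add(STATE_BUMP, std::memory_order_release);
}
#endif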
213 
214 // Tree barrier
215 static void
216 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
217  void (*reduce)(void *, void *)
218  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
219 {
220  KMP_TIME_BLOCK(KMP_tree_gather);
221  register kmp_team_t *team = this_thr->th.th_team;
222  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
223  register kmp_info_t **other_threads = team->t.t_threads;
224  register kmp_uint32 nproc = this_thr->th.th_team_nproc;
225  register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
226  register kmp_uint32 branch_factor = 1 << branch_bits;
227  register kmp_uint32 child;
228  register kmp_uint32 child_tid;
229  register kmp_uint new_state;
230 
231  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
232  gtid, team->t.t_id, tid, bt));
233  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
234 
235 #if USE_ITT_BUILD && USE_ITT_NOTIFY
236  // Barrier imbalance - save arrive time to the thread
237  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
238  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
239  }
240 #endif
241  // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
242  child_tid = (tid << branch_bits) + 1;
243  if (child_tid < nproc) {
244  // Parent threads wait for all their children to arrive
245  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
246  child = 1;
247  do {
248  register kmp_info_t *child_thr = other_threads[child_tid];
249  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
250 #if KMP_CACHE_MANAGE
251  // Prefetch next thread's arrived count
252  if (child+1 <= branch_factor && child_tid+1 < nproc)
253  KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
254 #endif /* KMP_CACHE_MANAGE */
255  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
256  "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
257  __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
258  &child_bar->b_arrived, new_state));
259  // Wait for child to arrive
260  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
261  flag.wait(this_thr, FALSE
262  USE_ITT_BUILD_ARG(itt_sync_obj) );
263 #if USE_ITT_BUILD && USE_ITT_NOTIFY
264  // Barrier imbalance - write min of the thread time and a child time to the thread.
265  if (__kmp_forkjoin_frames_mode == 2) {
266  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
267  child_thr->th.th_bar_min_time);
268  }
269 #endif
270  if (reduce) {
271  KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
272  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
273  team->t.t_id, child_tid));
274  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
275  }
276  child++;
277  child_tid++;
278  }
279  while (child <= branch_factor && child_tid < nproc);
280  }
281 
282  if (!KMP_MASTER_TID(tid)) { // Worker threads
283  register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
284 
285  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
286  "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
287  __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
288  &thr_bar->b_arrived, thr_bar->b_arrived,
289  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
290 
291  // Mark arrival to parent thread
292  /* After performing this write, a worker thread may not assume that the team is valid
293  any more - it could be deallocated by the master thread at any time. */
294  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
295  flag.release();
296  } else {
297  // Need to update the team arrived pointer if we are the master thread
298  if (nproc > 1) // New value was already computed above
299  team->t.t_bar[bt].b_arrived = new_state;
300  else
301  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
302  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
303  gtid, team->t.t_id, tid, team->t.t_id,
304  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
305  }
306  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
307  gtid, team->t.t_id, tid, bt));
308 }
309 
310 static void
311 __kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
312  int propagate_icvs
313  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
314 {
315  KMP_TIME_BLOCK(KMP_tree_release);
316  register kmp_team_t *team;
317  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
318  register kmp_uint32 nproc;
319  register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
320  register kmp_uint32 branch_factor = 1 << branch_bits;
321  register kmp_uint32 child;
322  register kmp_uint32 child_tid;
323 
324  // Perform a tree release for all of the threads that have been gathered
325  if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
326  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
327  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
328  // Wait for parent thread to release us
329  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
330  flag.wait(this_thr, TRUE
331  USE_ITT_BUILD_ARG(itt_sync_obj) );
332 #if USE_ITT_BUILD && USE_ITT_NOTIFY
333  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
334  // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
335  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
336  // Cancel wait on previous parallel region...
337  __kmp_itt_task_starting(itt_sync_obj);
338 
339  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
340  return;
341 
342  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
343  if (itt_sync_obj != NULL)
344  // Call prepare as early as possible for "new" barrier
345  __kmp_itt_task_finished(itt_sync_obj);
346  } else
347 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
348  // Early exit for reaping threads releasing forkjoin barrier
349  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
350  return;
351 
352  // The worker thread may now assume that the team is valid.
353  team = __kmp_threads[gtid]->th.th_team;
354  KMP_DEBUG_ASSERT(team != NULL);
355  tid = __kmp_tid_from_gtid(gtid);
356 
357  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
358  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
359  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
360  KMP_MB(); // Flush all pending memory write invalidates.
361  } else {
362  team = __kmp_threads[gtid]->th.th_team;
363  KMP_DEBUG_ASSERT(team != NULL);
364  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
365  gtid, team->t.t_id, tid, bt));
366  }
367  nproc = this_thr->th.th_team_nproc;
368  child_tid = (tid << branch_bits) + 1;
369 
370  if (child_tid < nproc) {
371  register kmp_info_t **other_threads = team->t.t_threads;
372  child = 1;
373  // Parent threads release all their children
374  do {
375  register kmp_info_t *child_thr = other_threads[child_tid];
376  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
377 #if KMP_CACHE_MANAGE
378  // Prefetch next thread's go count
379  if (child+1 <= branch_factor && child_tid+1 < nproc)
380  KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
381 #endif /* KMP_CACHE_MANAGE */
382 
383 #if KMP_BARRIER_ICV_PUSH
384  KMP_START_EXPLICIT_TIMER(USER_icv_copy);
385  if (propagate_icvs) {
386  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
387  team, child_tid, FALSE);
388  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
389  &team->t.t_implicit_task_taskdata[0].td_icvs);
390  }
391  KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
392 #endif // KMP_BARRIER_ICV_PUSH
393  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
394  "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
395  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
396  child_tid, &child_bar->b_go, child_bar->b_go,
397  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
398  // Release child from barrier
399  kmp_flag_64 flag(&child_bar->b_go, child_thr);
400  flag.release();
401  child++;
402  child_tid++;
403  }
404  while (child <= branch_factor && child_tid < nproc);
405  }
406  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
407  gtid, team->t.t_id, tid, bt));
408 }
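
// The tree gather/release above derive the whole topology from thread ids:
// with branch_bits b (branch factor 1<<b), a thread's first child is
// (tid << b) + 1 and its parent is (tid - 1) >> b. The standalone sketch below
// only restates that index arithmetic (helper names are hypothetical).
#if 0 // Illustrative sketch -- not compiled into the runtime.
// Parent of tid in the implicit d-ary tree, where d = 1 << branch_bits.
static int example_tree_parent(int tid, unsigned branch_bits) {
    return tid == 0 ? -1 : (tid - 1) >> branch_bits;
}
// First child of tid; children are the consecutive tids
// first_child .. first_child + d - 1, clipped to nproc, which is exactly the
// do/while bound used in the gather and release loops above.
static int example_tree_first_child(int tid, unsigned branch_bits) {
    return (tid << branch_bits) + 1;
}
// Example with branch_bits == 2 (4-ary tree) and nproc == 10:
//   tid 0 -> children 1,2,3,4    tid 1 -> children 5,6,7,8
//   tid 2 -> child 9             tids 3..9 are leaves
#endif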
409 
410 
411 // Hyper Barrier
412 static void
413 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
414  void (*reduce)(void *, void *)
415  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
416 {
417  KMP_TIME_BLOCK(KMP_hyper_gather);
418  register kmp_team_t *team = this_thr->th.th_team;
419  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
420  register kmp_info_t **other_threads = team->t.t_threads;
421  register kmp_uint new_state = KMP_BARRIER_UNUSED_STATE;
422  register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
423  register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
424  register kmp_uint32 branch_factor = 1 << branch_bits;
425  register kmp_uint32 offset;
426  register kmp_uint32 level;
427 
428  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
429  gtid, team->t.t_id, tid, bt));
430 
431  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
432 
433 #if USE_ITT_BUILD && USE_ITT_NOTIFY
434  // Barrier imbalance - save arrive time to the thread
435  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
436  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
437  }
438 #endif
439  /* Perform a hypercube-embedded tree gather to wait until all of the threads have
440  arrived, and reduce any required data as we go. */
441  kmp_flag_64 p_flag(&thr_bar->b_arrived);
442  for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
443  {
444  register kmp_uint32 child;
445  register kmp_uint32 child_tid;
446 
447  if (((tid >> level) & (branch_factor - 1)) != 0) {
448  register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);
449 
450  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
451  "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
452  __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
453  &thr_bar->b_arrived, thr_bar->b_arrived,
454  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
455  // Mark arrival to parent thread
456  /* After performing this write (in the last iteration of the enclosing for loop),
457  a worker thread may not assume that the team is valid any more - it could be
458  deallocated by the master thread at any time. */
459  p_flag.set_waiter(other_threads[parent_tid]);
460  p_flag.release();
461  break;
462  }
463 
464  // Parent threads wait for children to arrive
465  if (new_state == KMP_BARRIER_UNUSED_STATE)
466  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
467  for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
468  child++, child_tid+=(1 << level))
469  {
470  register kmp_info_t *child_thr = other_threads[child_tid];
471  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
472 #if KMP_CACHE_MANAGE
473  register kmp_uint32 next_child_tid = child_tid + (1 << level);
474  // Prefetch next thread's arrived count
475  if (child+1 < branch_factor && next_child_tid < num_threads)
476  KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
477 #endif /* KMP_CACHE_MANAGE */
478  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
479  "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
480  __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
481  &child_bar->b_arrived, new_state));
482  // Wait for child to arrive
483  kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
484  c_flag.wait(this_thr, FALSE
485  USE_ITT_BUILD_ARG(itt_sync_obj) );
486 #if USE_ITT_BUILD && USE_ITT_NOTIFY
487  // Barrier imbalance - write min of the thread time and a child time to the thread.
488  if (__kmp_forkjoin_frames_mode == 2) {
489  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
490  child_thr->th.th_bar_min_time);
491  }
492 #endif
493  if (reduce) {
494  KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
495  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
496  team->t.t_id, child_tid));
497  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
498  }
499  }
500  }
501 
502  if (KMP_MASTER_TID(tid)) {
503  // Need to update the team arrived pointer if we are the master thread
504  if (new_state == KMP_BARRIER_UNUSED_STATE)
505  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
506  else
507  team->t.t_bar[bt].b_arrived = new_state;
508  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
509  gtid, team->t.t_id, tid, team->t.t_id,
510  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
511  }
512  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
513  gtid, team->t.t_id, tid, bt));
514 }
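
// In the hypercube-embedded tree above, a thread keeps gathering at a level
// only while ((tid >> level) & (branch_factor-1)) == 0; otherwise it reports to
// the parent obtained by clearing those low bits, and its children at a level
// sit at tid + k*(1 << level). The sketch below restates that arithmetic
// (hypothetical helper names; the bounds match the gather loop).
#if 0 // Illustrative sketch -- not compiled into the runtime.
// Parent of 'tid' once it drops out at 'level': clear the bits that select it
// within its group, i.e. tid & ~((1 << (level + branch_bits)) - 1).
static int example_hyper_parent(int tid, unsigned level, unsigned branch_bits) {
    return tid & ~((1 << (level + branch_bits)) - 1);
}
// Enumerate the children of 'tid' at 'level' into out[]; returns the count.
static int example_hyper_children(int tid, unsigned level, unsigned branch_bits,
                                  int num_threads, int out[]) {
    int branch_factor = 1 << branch_bits;
    int n = 0;
    for (int child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level))
        out[n++] = child_tid;
    return n;
}
#endif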
515 
516 // The reverse versions seem to beat the forward versions overall
517 #define KMP_REVERSE_HYPER_BAR
518 static void
519 __kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
520  int propagate_icvs
521  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
522 {
523  KMP_TIME_BLOCK(KMP_hyper_release);
524  register kmp_team_t *team;
525  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
526  register kmp_info_t **other_threads;
527  register kmp_uint32 num_threads;
528  register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
529  register kmp_uint32 branch_factor = 1 << branch_bits;
530  register kmp_uint32 child;
531  register kmp_uint32 child_tid;
532  register kmp_uint32 offset;
533  register kmp_uint32 level;
534 
535  /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
536  If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
537  order of the corresponding gather, otherwise threads are released in the same order. */
538  if (KMP_MASTER_TID(tid)) { // master
539  team = __kmp_threads[gtid]->th.th_team;
540  KMP_DEBUG_ASSERT(team != NULL);
541  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
542  gtid, team->t.t_id, tid, bt));
543 #if KMP_BARRIER_ICV_PUSH
544  if (propagate_icvs) { // master already has ICVs in final destination; copy
545  copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
546  }
547 #endif
548  }
549  else { // Handle fork barrier workers who aren't part of a team yet
550  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
551  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
552  // Wait for parent thread to release us
553  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
554  flag.wait(this_thr, TRUE
555  USE_ITT_BUILD_ARG(itt_sync_obj) );
556 #if USE_ITT_BUILD && USE_ITT_NOTIFY
557  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
558  // In fork barrier where we could not get the object reliably
559  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
560  // Cancel wait on previous parallel region...
561  __kmp_itt_task_starting(itt_sync_obj);
562 
563  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
564  return;
565 
566  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
567  if (itt_sync_obj != NULL)
568  // Call prepare as early as possible for "new" barrier
569  __kmp_itt_task_finished(itt_sync_obj);
570  } else
571 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
572  // Early exit for reaping threads releasing forkjoin barrier
573  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
574  return;
575 
576  // The worker thread may now assume that the team is valid.
577  team = __kmp_threads[gtid]->th.th_team;
578  KMP_DEBUG_ASSERT(team != NULL);
579  tid = __kmp_tid_from_gtid(gtid);
580 
581  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
582  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
583  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
584  KMP_MB(); // Flush all pending memory write invalidates.
585  }
586  num_threads = this_thr->th.th_team_nproc;
587  other_threads = team->t.t_threads;
588 
589 #ifdef KMP_REVERSE_HYPER_BAR
590  // Count up to correct level for parent
591  for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
592  level+=branch_bits, offset<<=branch_bits);
593 
594  // Now go down from there
595  for (level-=branch_bits, offset>>=branch_bits; offset != 0;
596  level-=branch_bits, offset>>=branch_bits)
597 #else
598  // Go down the tree, level by level
599  for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
600 #endif // KMP_REVERSE_HYPER_BAR
601  {
602 #ifdef KMP_REVERSE_HYPER_BAR
603  /* Now go in reverse order through the children, highest to lowest.
604  Initial setting of child is conservative here. */
605  child = num_threads >> ((level==0)?level:level-1);
606  for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
607  child>=1; child--, child_tid-=(1<<level))
608 #else
609  if (((tid >> level) & (branch_factor - 1)) != 0)
610  // No need to go lower than this, since this is the level at which the parent would be notified
611  break;
612  // Iterate through children on this level of the tree
613  for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
614  child++, child_tid+=(1<<level))
615 #endif // KMP_REVERSE_HYPER_BAR
616  {
617  if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
618  else {
619  register kmp_info_t *child_thr = other_threads[child_tid];
620  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
621 #if KMP_CACHE_MANAGE
622  register kmp_uint32 next_child_tid = child_tid - (1 << level);
623  // Prefetch next thread's go count
624 # ifdef KMP_REVERSE_HYPER_BAR
625  if (child-1 >= 1 && next_child_tid < num_threads)
626 # else
627  if (child+1 < branch_factor && next_child_tid < num_threads)
628 # endif // KMP_REVERSE_HYPER_BAR
629  KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
630 #endif /* KMP_CACHE_MANAGE */
631 
632 #if KMP_BARRIER_ICV_PUSH
633  if (propagate_icvs) // push my fixed ICVs to my child
634  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
635 #endif // KMP_BARRIER_ICV_PUSH
636 
637  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
638  "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
639  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
640  child_tid, &child_bar->b_go, child_bar->b_go,
641  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
642  // Release child from barrier
643  kmp_flag_64 flag(&child_bar->b_go, child_thr);
644  flag.release();
645  }
646  }
647  }
648 #if KMP_BARRIER_ICV_PUSH
649  if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
650  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
651  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
652  }
653 #endif
654  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
655  gtid, team->t.t_id, tid, bt));
656 }
657 
658 // Hierarchical Barrier
659 
660 // Initialize thread barrier data
661 /* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
662  minimum amount of initialization required based on how the team has changed. Returns true if
663  leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
664  team size increases, threads already in the team will respond to on-core wakeup on their parent
665  thread, but threads newly added to the team will only be listening on their local b_go. */
666 static bool
667 __kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
668  int gtid, int tid, kmp_team_t *team)
669 {
670  // Checks to determine if (re-)initialization is needed
671  bool uninitialized = thr_bar->team == NULL;
672  bool team_changed = team != thr_bar->team;
673  bool team_sz_changed = nproc != thr_bar->nproc;
674  bool tid_changed = tid != thr_bar->old_tid;
675  bool retval = false;
676 
677  if (uninitialized || team_sz_changed) {
678  __kmp_get_hierarchy(nproc, thr_bar);
679  }
680 
681  if (uninitialized || team_sz_changed || tid_changed) {
682  thr_bar->my_level = thr_bar->depth-1; // default for master
683  thr_bar->parent_tid = -1; // default for master
684  if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
685  kmp_uint32 d=0;
686  while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
687  kmp_uint32 rem;
688  if (d == thr_bar->depth-2) { // reached level right below the master
689  thr_bar->parent_tid = 0;
690  thr_bar->my_level = d;
691  break;
692  }
693  else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
694  // thread is not a subtree root at next level, so this is max
695  thr_bar->parent_tid = tid - rem;
696  thr_bar->my_level = d;
697  break;
698  }
699  ++d;
700  }
701  }
702  thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
703  thr_bar->old_tid = tid;
704  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
705  }
706  if (uninitialized || team_changed || tid_changed) {
707  thr_bar->team = team;
708  thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
709  retval = true;
710  }
711  if (uninitialized || team_sz_changed || tid_changed) {
712  thr_bar->nproc = nproc;
713  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
714  if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
715  if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
716  thr_bar->leaf_kids = nproc - tid - 1;
717  thr_bar->leaf_state = 0;
718  for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
719  }
720  return retval;
721 }
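
// The leaf_state computed above packs one flag byte per on-core leaf child into
// a single 64-bit word (highest byte first), so a parent can wait for all of
// its leaves with one 64-bit comparison against b_arrived | leaf_state, while
// each leaf checks in by touching only its own byte at offset
// 7 - (tid - parent_tid - 1). A minimal sketch of that packing is kept below
// (assumption: little-endian byte indexing, as in the loop above; the helper
// names are hypothetical and the kmp_flag_oncore machinery is simplified away).
#if 0 // Illustrative sketch -- not compiled into the runtime.
#include <stdint.h>

// Expected-arrival mask for 'leaf_kids' children: one 0x01 byte per child,
// mirroring the ((char *)&leaf_state)[7-i] = 1 loop above.
static uint64_t example_leaf_state(int leaf_kids) {
    uint64_t state = 0;
    for (int i = 0; i < leaf_kids; ++i)
        ((unsigned char *)&state)[7 - i] = 1;
    return state;
}
// A leaf with byte offset 'offset' marks its arrival by writing just its own
// byte of the parent's 64-bit flag.
static void example_leaf_checkin(volatile uint64_t *parent_flag, int offset) {
    ((volatile unsigned char *)parent_flag)[offset] = 1;
}
#endif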
722 
723 static void
724 __kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
725  int gtid, int tid, void (*reduce) (void *, void *)
726  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
727 {
728  KMP_TIME_BLOCK(KMP_hier_gather);
729  register kmp_team_t *team = this_thr->th.th_team;
730  register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
731  register kmp_uint32 nproc = this_thr->th.th_team_nproc;
732  register kmp_info_t **other_threads = team->t.t_threads;
733  register kmp_uint64 new_state;
734 
735  int level = team->t.t_level;
736  if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
737  if (this_thr->th.th_teams_size.nteams > 1)
738  ++level; // level was not increased in teams construct for team_of_masters
739  if (level == 1) thr_bar->use_oncore_barrier = 1;
740  else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
741 
742  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
743  gtid, team->t.t_id, tid, bt));
744  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
745 
746 #if USE_ITT_BUILD && USE_ITT_NOTIFY
747  // Barrier imbalance - save arrive time to the thread
748  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
749  this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
750  }
751 #endif
752 
753  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
754 
755  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
756  register kmp_int32 child_tid;
757  new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
758  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
759  if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
760  kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : (kmp_uint64)team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
761  kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
762  flag.wait(this_thr, FALSE
763  USE_ITT_BUILD_ARG(itt_sync_obj) );
764  if (reduce) {
765  for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
766  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
767  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
768  team->t.t_id, child_tid));
769  (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
770  }
771  }
772  (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
773  }
774  // Next, wait for higher level children on each child's b_arrived flag
775  for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
776  kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
777  if (last > nproc) last = nproc;
778  for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
779  register kmp_info_t *child_thr = other_threads[child_tid];
780  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
781  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
782  "arrived(%p) == %u\n",
783  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
784  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
785  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
786  flag.wait(this_thr, FALSE
787  USE_ITT_BUILD_ARG(itt_sync_obj) );
788  if (reduce) {
789  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
790  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
791  team->t.t_id, child_tid));
792  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
793  }
794  }
795  }
796  }
797  else { // Blocktime is not infinite
798  for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
799  kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
800  if (last > nproc) last = nproc;
801  for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
802  register kmp_info_t *child_thr = other_threads[child_tid];
803  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
804  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
805  "arrived(%p) == %u\n",
806  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
807  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
808  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
809  flag.wait(this_thr, FALSE
810  USE_ITT_BUILD_ARG(itt_sync_obj) );
811  if (reduce) {
812  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
813  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
814  team->t.t_id, child_tid));
815  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
816  }
817  }
818  }
819  }
820  }
821  // All subordinates are gathered; now release parent if not master thread
822 
823  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
824  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
825  "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
826  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
827  &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
828  /* Mark arrival to parent: After performing this write, a worker thread may not assume that
829  the team is valid any more - it could be deallocated by the master thread at any time. */
830  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
831  || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
832  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
833  flag.release();
834  }
835  else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
836  thr_bar->b_arrived = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
837  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
838  flag.set_waiter(other_threads[thr_bar->parent_tid]);
839  flag.release();
840  }
841  } else { // Master thread needs to update the team's b_arrived value
842  team->t.t_bar[bt].b_arrived = (kmp_uint32)new_state;
843  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
844  gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
845  }
846  // Is the team access below unsafe or just technically invalid?
847  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
848  gtid, team->t.t_id, tid, bt));
849 }
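
// The hierarchical gather above walks the machine hierarchy with skip_per_level:
// at depth d a node owns the tid range [tid, tid + skip_per_level[d+1]) and its
// direct children are tid + skip, tid + 2*skip, ... with skip = skip_per_level[d],
// clipped to nproc. The standalone sketch below builds such a table from
// per-level fan-outs and lists one node's children (hypothetical names; the
// real table comes from __kmp_get_hierarchy).
#if 0 // Illustrative sketch -- not compiled into the runtime.
#include <cstdio>
#include <vector>

// skip_per_level[d] = product of the fan-outs below depth d, so
// skip_per_level[0] == 1 and skip_per_level[depth] covers the whole team.
static std::vector<unsigned> example_skip_per_level(const std::vector<unsigned> &fanout) {
    std::vector<unsigned> skip(fanout.size() + 1, 1);
    for (size_t d = 0; d < fanout.size(); ++d)
        skip[d + 1] = skip[d] * fanout[d];
    return skip;
}
// Print the direct children of 'tid' at depth 'd', using the same bounds as
// the gather/release loops above.
static void example_children_at_depth(unsigned tid, unsigned d,
                                      const std::vector<unsigned> &skip,
                                      unsigned nproc) {
    unsigned last = tid + skip[d + 1];
    if (last > nproc) last = nproc;
    for (unsigned child = tid + skip[d]; child < last; child += skip[d])
        std::printf("depth %u: %u -> child %u\n", d, tid, child);
}
#endif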
850 
851 static void
852 __kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
853  int propagate_icvs
854  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
855 {
856  KMP_TIME_BLOCK(KMP_hier_release);
857  register kmp_team_t *team;
858  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
859  register kmp_uint32 nproc;
860  bool team_change = false; // indicates on-core barrier shouldn't be used
861 
862  if (KMP_MASTER_TID(tid)) {
863  team = __kmp_threads[gtid]->th.th_team;
864  KMP_DEBUG_ASSERT(team != NULL);
865  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
866  gtid, team->t.t_id, tid, bt));
867  }
868  else { // Worker threads
869  // Wait for parent thread to release me
870  if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
871  || thr_bar->my_level != 0 || thr_bar->team == NULL) {
872  // Use traditional method of waiting on my own b_go flag
873  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
874  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
875  flag.wait(this_thr, TRUE
876  USE_ITT_BUILD_ARG(itt_sync_obj) );
877  TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
878  }
879  else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
880  // Wait on my "offset" bits on parent's b_go flag
881  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
882  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
883  bt, this_thr
884  USE_ITT_BUILD_ARG(itt_sync_obj) );
885  flag.wait(this_thr, TRUE
886  USE_ITT_BUILD_ARG(itt_sync_obj) );
887  if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
888  TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
889  }
890  else { // Reset my bits on parent's b_go flag
891  ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
892  }
893  }
894  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
895  // Early exit for reaping threads releasing forkjoin barrier
896  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
897  return;
898  // The worker thread may now assume that the team is valid.
899  team = __kmp_threads[gtid]->th.th_team;
900  KMP_DEBUG_ASSERT(team != NULL);
901  tid = __kmp_tid_from_gtid(gtid);
902 
903  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
904  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
905  KMP_MB(); // Flush all pending memory write invalidates.
906  }
907 
908  int level = team->t.t_level;
909  if (team->t.t_threads[0]->th.th_teams_microtask) { // are we inside the teams construct?
910  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
911  ++level; // level was not increased in teams construct for team_of_workers
912  if (this_thr->th.th_teams_size.nteams > 1)
913  ++level; // level was not increased in teams construct for team_of_masters
914  }
915  if (level == 1) thr_bar->use_oncore_barrier = 1;
916  else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
917  nproc = this_thr->th.th_team_nproc;
918 
919  // If the team size has increased, we still communicate with old leaves via oncore barrier.
920  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
921  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
922  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
923  // But if the entire team changes, we won't use oncore barrier at all
924  if (team_change) old_leaf_kids = 0;
925 
926 #if KMP_BARRIER_ICV_PUSH
927  if (propagate_icvs) {
928  if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
929  copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
930  }
931  else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
932  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
933  // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
934  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
935  &thr_bar->parent_bar->th_fixed_icvs);
936  // non-leaves will get ICVs piggybacked with b_go via NGO store
937  }
938  else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
939  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
940  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
941  else // leaves copy parent's fixed ICVs directly to local ICV store
942  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
943  &thr_bar->parent_bar->th_fixed_icvs);
944  }
945  }
946 #endif // KMP_BARRIER_ICV_PUSH
947 
948  // Now, release my children
949  if (thr_bar->my_level) { // not a leaf
950  register kmp_int32 child_tid;
951  kmp_uint32 last;
952  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
953  if (KMP_MASTER_TID(tid)) { // do a flat release
954  // Set local b_go to bump children via NGO store of the cache line containing ICVs and b_go.
955  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
956  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
957  ngo_load(&thr_bar->th_fixed_icvs);
958  // This loops over all the threads skipping only the leaf nodes in the hierarchy
959  for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
960  register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
961  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
962  " go(%p): %u => %u\n",
963  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
964  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
965  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
966  // Use ngo store (if available) to both store ICVs and release child via child's b_go
967  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
968  }
969  ngo_sync();
970  }
971  TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
972  // Now, release leaf children
973  if (thr_bar->leaf_kids) { // if there are any
974  // We test team_change on the off-chance that the level 1 team changed.
975  if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
976  if (old_leaf_kids) { // release old leaf kids
977  thr_bar->b_go |= old_leaf_state;
978  }
979  // Release new leaf kids
980  last = tid+thr_bar->skip_per_level[1];
981  if (last > nproc) last = nproc;
982  for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
983  register kmp_info_t *child_thr = team->t.t_threads[child_tid];
984  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
985  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
986  " T#%d(%d:%d) go(%p): %u => %u\n",
987  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
988  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
989  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
990  // Release child using child's b_go flag
991  kmp_flag_64 flag(&child_bar->b_go, child_thr);
992  flag.release();
993  }
994  }
995  else { // Release all children at once with leaf_state bits on my own b_go flag
996  thr_bar->b_go |= thr_bar->leaf_state;
997  }
998  }
999  }
1000  else { // Blocktime is not infinite; do a simple hierarchical release
1001  for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
1002  last = tid+thr_bar->skip_per_level[d+1];
1003  kmp_uint32 skip = thr_bar->skip_per_level[d];
1004  if (last > nproc) last = nproc;
1005  for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
1006  register kmp_info_t *child_thr = team->t.t_threads[child_tid];
1007  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1008  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
1009  " go(%p): %u => %u\n",
1010  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1011  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1012  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1013  // Release child using child's b_go flag
1014  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1015  flag.release();
1016  }
1017  }
1018  }
1019 #if KMP_BARRIER_ICV_PUSH
1020  if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
1021  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
1022 #endif // KMP_BARRIER_ICV_PUSH
1023  }
1024  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1025  gtid, team->t.t_id, tid, bt));
1026 }
1027 
1028 // ---------------------------- End of Barrier Algorithms ----------------------------
1029 
1030 // Internal function to do a barrier.
1031 /* If is_split is true, do a split barrier, otherwise, do a plain barrier.
1032  If reduce is non-NULL, do a split reduction barrier, otherwise, do a barrier without reduction.
1033  Returns 0 if master thread, 1 if worker thread. */
1034 int
1035 __kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1036  void *reduce_data, void (*reduce)(void *, void *))
1037 {
1038  KMP_TIME_BLOCK(KMP_barrier);
1039  register int tid = __kmp_tid_from_gtid(gtid);
1040  register kmp_info_t *this_thr = __kmp_threads[gtid];
1041  register kmp_team_t *team = this_thr->th.th_team;
1042  register int status = 0;
1043  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1044 #if OMPT_SUPPORT
1045  ompt_task_id_t my_task_id;
1046  ompt_parallel_id_t my_parallel_id;
1047 #endif
1048 
1049  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1050  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1051 
1052 #if OMPT_SUPPORT
1053  if (ompt_status & ompt_status_track) {
1054 #if OMPT_BLAME
1055  if (ompt_status == ompt_status_track_callback) {
1056  my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
1057  my_parallel_id = team->t.ompt_team_info.parallel_id;
1058 
1059 #if OMPT_TRACE
1060  if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
1061  if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
1062  ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
1063  my_parallel_id, my_task_id);
1064  }
1065  }
1066 #endif
1067  if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1068  ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1069  my_parallel_id, my_task_id);
1070  }
1071  }
1072 #endif
1073  // It is OK to report the barrier state after the barrier begin callback.
1074  // According to the OMPT specification, a compliant implementation may
1075  // even delay reporting this state until the barrier begins to wait.
1076  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1077  }
1078 #endif
1079 
1080  if (! team->t.t_serialized) {
1081 #if USE_ITT_BUILD
1082  // This value will be used in itt notify events below.
1083  void *itt_sync_obj = NULL;
1084 # if USE_ITT_NOTIFY
1085  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1086  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1087 # endif
1088 #endif /* USE_ITT_BUILD */
1089  if (__kmp_tasking_mode == tskm_extra_barrier) {
1090  __kmp_tasking_barrier(team, this_thr, gtid);
1091  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1092  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1093  }
1094 
1095  /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1096  the team struct is not guaranteed to exist. */
1097  // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
1098  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1099  this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1100  this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1101  }
1102 
1103 #if USE_ITT_BUILD
1104  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1105  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1106 #endif /* USE_ITT_BUILD */
1107 #if USE_DEBUGGER
1108  // Let the debugger know: the thread arrived at the barrier and is waiting.
1109  if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1110  team->t.t_bar[bt].b_master_arrived += 1;
1111  } else {
1112  this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1113  } // if
1114 #endif /* USE_DEBUGGER */
1115  if (reduce != NULL) {
1116  //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1117  this_thr->th.th_local.reduce_data = reduce_data;
1118  }
1119  switch (__kmp_barrier_gather_pattern[bt]) {
1120  case bp_hyper_bar: {
1121  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1122  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1123  USE_ITT_BUILD_ARG(itt_sync_obj) );
1124  break;
1125  }
1126  case bp_hierarchical_bar: {
1127  __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1128  USE_ITT_BUILD_ARG(itt_sync_obj));
1129  break;
1130  }
1131  case bp_tree_bar: {
1132  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1133  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1134  USE_ITT_BUILD_ARG(itt_sync_obj) );
1135  break;
1136  }
1137  default: {
1138  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1139  USE_ITT_BUILD_ARG(itt_sync_obj) );
1140  }
1141  }
1142 
1143  KMP_MB();
1144 
1145  if (KMP_MASTER_TID(tid)) {
1146  status = 0;
1147  if (__kmp_tasking_mode != tskm_immediate_exec) {
1148  __kmp_task_team_wait(this_thr, team
1149  USE_ITT_BUILD_ARG(itt_sync_obj) );
1150  __kmp_task_team_setup(this_thr, team, 0, 0); // use 0,0 to only setup the current team if nthreads > 1
1151  }
1152 #if USE_DEBUGGER
1153  // Let the debugger know: all threads have arrived and are starting to leave the barrier.
1154  team->t.t_bar[bt].b_team_arrived += 1;
1155 #endif
1156 
1157 #if USE_ITT_BUILD
1158  /* TODO: In case of split reduction barrier, master thread may send acquired event early,
1159  before the final summation into the shared variable is done (final summation can be a
1160  long operation for array reductions). */
1161  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1162  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1163 #endif /* USE_ITT_BUILD */
1164 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1165  // Barrier - report frame end (only if active_level == 1)
1166  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1167 #if OMP_40_ENABLED
1168  this_thr->th.th_teams_microtask == NULL &&
1169 #endif
1170  team->t.t_active_level == 1)
1171  {
1172  kmp_uint64 cur_time = __itt_get_timestamp();
1173  kmp_info_t **other_threads = team->t.t_threads;
1174  int nproc = this_thr->th.th_team_nproc;
1175  int i;
1176  switch(__kmp_forkjoin_frames_mode) {
1177  case 1:
1178  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1179  this_thr->th.th_frame_time = cur_time;
1180  break;
1181  case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
1182  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1183  break;
1184  case 3:
1185  if( __itt_metadata_add_ptr ) {
1186  // Initialize with master's wait time
1187  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1188  for (i=1; i<nproc; ++i) {
1189  delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1190  }
1191  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1192  }
1193  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1194  this_thr->th.th_frame_time = cur_time;
1195  break;
1196  }
1197  }
1198 #endif /* USE_ITT_BUILD */
1199  } else {
1200  status = 1;
1201 #if USE_ITT_BUILD
1202  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1203  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1204 #endif /* USE_ITT_BUILD */
1205  }
1206  if (status == 1 || ! is_split) {
1207  switch (__kmp_barrier_release_pattern[bt]) {
1208  case bp_hyper_bar: {
1209  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1210  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1211  USE_ITT_BUILD_ARG(itt_sync_obj) );
1212  break;
1213  }
1214  case bp_hierarchical_bar: {
1215  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1216  USE_ITT_BUILD_ARG(itt_sync_obj) );
1217  break;
1218  }
1219  case bp_tree_bar: {
1220  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1221  __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1222  USE_ITT_BUILD_ARG(itt_sync_obj) );
1223  break;
1224  }
1225  default: {
1226  __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1227  USE_ITT_BUILD_ARG(itt_sync_obj) );
1228  }
1229  }
1230  if (__kmp_tasking_mode != tskm_immediate_exec) {
1231  __kmp_task_team_sync(this_thr, team);
1232  }
1233  }
1234 
1235 #if USE_ITT_BUILD
1236  /* GEH: TODO: Move this under if-condition above and also include in
1237  __kmp_end_split_barrier(). This will more accurately represent the actual release time
1238  of the threads for split barriers. */
1239  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1240  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1241 #endif /* USE_ITT_BUILD */
1242  } else { // Team is serialized.
1243  status = 0;
1244  if (__kmp_tasking_mode != tskm_immediate_exec) {
1245 #if OMP_41_ENABLED
1246  if ( this_thr->th.th_task_team != NULL ) {
1247  void *itt_sync_obj = NULL;
1248 #if USE_ITT_NOTIFY
1249  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1250  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1251  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1252  }
1253 #endif
1254 
1255  kmp_task_team_t * task_team;
1256  task_team = this_thr->th.th_task_team;
1257  KMP_DEBUG_ASSERT(task_team->tt.tt_found_proxy_tasks == TRUE);
1258  __kmp_task_team_wait(this_thr, team
1259  USE_ITT_BUILD_ARG(itt_sync_obj));
1260  __kmp_task_team_setup(this_thr, team, 0, 0);
1261 
1262 #if USE_ITT_BUILD
1263  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1264  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1265 #endif /* USE_ITT_BUILD */
1266  }
1267 #else
1268  // The task team should be NULL for serialized code (tasks will be executed immediately)
1269  KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
1270  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1271 #endif
1272  }
1273  }
1274  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1275  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
1276 
1277 #if OMPT_SUPPORT
1278  if (ompt_status & ompt_status_track) {
1279 #if OMPT_BLAME
1280  if ((ompt_status == ompt_status_track_callback) &&
1281  ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1282  ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1283  my_parallel_id, my_task_id);
1284  }
1285 #endif
1286  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1287  }
1288 #endif
1289 
1290  return status;
1291 }
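
/* The gather/release algorithms dispatched on above come from __kmp_barrier_{gather,release}_pattern[bt],
   which the runtime initializes elsewhere (commonly from environment variables such as
   KMP_PLAIN_BARRIER_PATTERN; the exact knob names are an assumption here, see kmp_settings).
   The disabled user-level sketch below shows code that is expected to reach __kmp_barrier():
   an explicit "#pragma omp barrier" is assumed to be lowered to a plain-barrier call via
   __kmpc_barrier. Compile the sketch separately, e.g. with "clang -fopenmp". */
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
    int sum = 0;
    #pragma omp parallel reduction(+ : sum)
    {
        sum += omp_get_thread_num();   /* per-thread partial, combined for the reduction */
        #pragma omp barrier            /* explicit barrier: assumed to hit the plain barrier path */
    }
    printf("sum = %d\n", sum);         /* the region's end runs the join barrier below */
    return 0;
}
#endif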
1292 
1293 
1294 void
1295 __kmp_end_split_barrier(enum barrier_type bt, int gtid)
1296 {
1297  KMP_TIME_BLOCK(KMP_end_split_barrier);
1298  int tid = __kmp_tid_from_gtid(gtid);
1299  kmp_info_t *this_thr = __kmp_threads[gtid];
1300  kmp_team_t *team = this_thr->th.th_team;
1301 
1302  if (!team->t.t_serialized) {
1303  if (KMP_MASTER_GTID(gtid)) {
1304  switch (__kmp_barrier_release_pattern[bt]) {
1305  case bp_hyper_bar: {
1306  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1307  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1308  USE_ITT_BUILD_ARG(NULL) );
1309  break;
1310  }
1311  case bp_hierarchical_bar: {
1312  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1313  USE_ITT_BUILD_ARG(NULL));
1314  break;
1315  }
1316  case bp_tree_bar: {
1317  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1318  __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1319  USE_ITT_BUILD_ARG(NULL) );
1320  break;
1321  }
1322  default: {
1323  __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1324  USE_ITT_BUILD_ARG(NULL) );
1325  }
1326  }
1327  if (__kmp_tasking_mode != tskm_immediate_exec) {
1328  __kmp_task_team_sync(this_thr, team);
1329  } // if
1330  }
1331  }
1332 }
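
/* A disabled sketch of code that typically exercises the split-barrier pair: a worksharing
   reduction without a nowait clause is commonly lowered to a blocking __kmpc_reduce()/
   __kmpc_end_reduce() pair, where the first runs the gather half of a split barrier
   (is_split == TRUE in __kmp_barrier above), the winning thread combines the partials, and the
   second releases the team through __kmp_end_split_barrier(). The exact compiler lowering is an
   assumption here (see kmp_csupport). */
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
    double total = 0.0;
    #pragma omp parallel
    {
        #pragma omp for reduction(+ : total)   /* worksharing reduction, no nowait clause */
        for (int i = 0; i < 1000; ++i) {
            total += 1.0 / (i + 1);            /* per-thread partials, combined between gather and release */
        }
    }
    printf("total = %f\n", total);
    return 0;
}
#endif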
1333 
1334 
1335 void
1336 __kmp_join_barrier(int gtid)
1337 {
1338  KMP_TIME_BLOCK(KMP_join_barrier);
1339  register kmp_info_t *this_thr = __kmp_threads[gtid];
1340  register kmp_team_t *team;
1341  register kmp_uint nproc;
1342  kmp_info_t *master_thread;
1343  int tid;
1344 #ifdef KMP_DEBUG
1345  int team_id;
1346 #endif /* KMP_DEBUG */
1347 #if USE_ITT_BUILD
1348  void *itt_sync_obj = NULL;
1349 # if USE_ITT_NOTIFY
1350  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1351  // Get object created at fork_barrier
1352  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1353 # endif
1354 #endif /* USE_ITT_BUILD */
1355  KMP_MB();
1356 
1357  // Get current info
1358  team = this_thr->th.th_team;
1359  nproc = this_thr->th.th_team_nproc;
1360  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1361  tid = __kmp_tid_from_gtid(gtid);
1362 #ifdef KMP_DEBUG
1363  team_id = team->t.t_id;
1364 #endif /* KMP_DEBUG */
1365  master_thread = this_thr->th.th_team_master;
1366 #ifdef KMP_DEBUG
1367  if (master_thread != team->t.t_threads[0]) {
1368  __kmp_print_structure();
1369  }
1370 #endif /* KMP_DEBUG */
1371  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1372  KMP_MB();
1373 
1374  // Verify state
1375  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1376  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1377  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1378  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1379  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1380 
1381 #if OMPT_SUPPORT
1382 #if OMPT_TRACE
1383  if ((ompt_status == ompt_status_track_callback) &&
1384  ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1385  ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1386  team->t.ompt_team_info.parallel_id,
1387  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1388  }
1389 #endif
1390  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1391 #endif
1392 
1393  if (__kmp_tasking_mode == tskm_extra_barrier) {
1394  __kmp_tasking_barrier(team, this_thr, gtid);
1395  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
1396  }
1397 # ifdef KMP_DEBUG
1398  if (__kmp_tasking_mode != tskm_immediate_exec) {
1399  KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
1400  __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
1401  this_thr->th.th_task_team));
1402  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
1403  }
1404 # endif /* KMP_DEBUG */
1405 
1406  /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
1407  team struct is not guaranteed to exist. Doing these loads causes a cache miss that slows
1408  down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
1409  since the values are not used by __kmp_wait_template() in that case. */
1410  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1411  this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1412  this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1413  }
1414 
1415 #if USE_ITT_BUILD
1416  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1417  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1418 #endif /* USE_ITT_BUILD */
1419 
1420  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1421  case bp_hyper_bar: {
1422  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1423  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1424  USE_ITT_BUILD_ARG(itt_sync_obj) );
1425  break;
1426  }
1427  case bp_hierarchical_bar: {
1428  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1429  USE_ITT_BUILD_ARG(itt_sync_obj) );
1430  break;
1431  }
1432  case bp_tree_bar: {
1433  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1434  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1435  USE_ITT_BUILD_ARG(itt_sync_obj) );
1436  break;
1437  }
1438  default: {
1439  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1440  USE_ITT_BUILD_ARG(itt_sync_obj) );
1441  }
1442  }
1443 
1444  /* From this point on, the team data structure may be deallocated at any time by the
1445  master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1446  data items that need to be referenced before the end of the barrier should be moved to
1447  the kmp_task_team_t structs. */
1448  if (KMP_MASTER_TID(tid)) {
1449  if (__kmp_tasking_mode != tskm_immediate_exec) {
1450  // Master shouldn't call decrease_load(). // TODO: enable master threads.
1451  // Master should have th_may_decrease_load == 0. // TODO: enable master threads.
1452  __kmp_task_team_wait(this_thr, team
1453  USE_ITT_BUILD_ARG(itt_sync_obj) );
1454  }
1455 #if USE_ITT_BUILD
1456  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1457  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1458 #endif /* USE_ITT_BUILD */
1459 
1460 # if USE_ITT_BUILD && USE_ITT_NOTIFY
1461  // Join barrier - report frame end
1462  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1463 #if OMP_40_ENABLED
1464  this_thr->th.th_teams_microtask == NULL &&
1465 #endif
1466  team->t.t_active_level == 1)
1467  {
1468  kmp_uint64 cur_time = __itt_get_timestamp();
1469  ident_t * loc = team->t.t_ident;
1470  kmp_info_t **other_threads = team->t.t_threads;
1471  int nproc = this_thr->th.th_team_nproc;
1472  int i;
1473  switch(__kmp_forkjoin_frames_mode) {
1474  case 1:
1475  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1476  break;
1477  case 2:
1478  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1479  break;
1480  case 3:
1481  if( __itt_metadata_add_ptr ) {
1482  // Initialize with master's wait time
1483  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1484  for (i=1; i<nproc; ++i) {
1485  delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1486  }
1487  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1488  }
1489  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1490  this_thr->th.th_frame_time = cur_time;
1491  break;
1492  }
1493  }
1494 # endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1495  }
1496 #if USE_ITT_BUILD
1497  else {
1498  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1499  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1500  }
1501 #endif /* USE_ITT_BUILD */
1502 
1503 #if KMP_DEBUG
1504  if (KMP_MASTER_TID(tid)) {
1505  KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1506  gtid, team_id, tid, nproc));
1507  }
1508 #endif /* KMP_DEBUG */
1509 
1510  // TODO now, mark worker threads as done so they may be disbanded
1511  KMP_MB(); // Flush all pending memory write invalidates.
1512  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1513 
1514 #if OMPT_SUPPORT
1515  if (ompt_status & ompt_status_track) {
1516 #if OMPT_TRACE
1517  if ((ompt_status == ompt_status_track_callback) &&
1518  ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1519  ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1520  team->t.ompt_team_info.parallel_id,
1521  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1522  }
1523 #endif
1524 
1525  // return to default state
1526  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1527  }
1528 #endif
1529 }
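
/* A disabled sketch relating the blocktime handling above to the user-visible knobs: with
   KMP_BLOCKTIME=infinite (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) the per-thread copy of
   bt_intervals/bt_set is skipped, and the kmp_set_blocktime() extension declared in this
   runtime's omp.h changes the value at run time (treating that as the intended API for this
   sketch is an assumption). */
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
    kmp_set_blocktime(0);              /* blocktime 0: workers stop spin-waiting right after the region */
    #pragma omp parallel
    {
        printf("hello from thread %d\n", omp_get_thread_num());
    }                                  /* the implicit join barrier is __kmp_join_barrier() in this runtime */
    return 0;
}
#endif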
1530 
1531 
1532 // TODO release worker threads' fork barriers as we are ready instead of all at once
1533 void
1534 __kmp_fork_barrier(int gtid, int tid)
1535 {
1536  KMP_TIME_BLOCK(KMP_fork_barrier);
1537  kmp_info_t *this_thr = __kmp_threads[gtid];
1538  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1539 #if USE_ITT_BUILD
1540  void * itt_sync_obj = NULL;
1541 #endif /* USE_ITT_BUILD */
1542 
1543  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
1544  gtid, (team != NULL) ? team->t.t_id : -1, tid));
1545 
1546  // th_team pointer only valid for master thread here
1547  if (KMP_MASTER_TID(tid)) {
1548 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1549  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1550  // Create itt barrier object
1551  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1552  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1553  }
1554 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1555 
1556 #ifdef KMP_DEBUG
1557  register kmp_info_t **other_threads = team->t.t_threads;
1558  register int i;
1559 
1560  // Verify state
1561  KMP_MB();
1562 
1563  for(i=1; i<team->t.t_nproc; ++i) {
1564  KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
1565  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1566  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1567  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1568  KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
1569  & ~(KMP_BARRIER_SLEEP_STATE))
1570  == KMP_INIT_BARRIER_STATE);
1571  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1572  }
1573 #endif
1574 
1575  if (__kmp_tasking_mode != tskm_immediate_exec) {
1576  __kmp_task_team_setup(this_thr, team, 1, 0); // 1,0 indicates setup both task teams if nthreads > 1
1577  }
1578 
1579  /* The master thread may have changed its blocktime between the join barrier and the
1580  fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
1581  access it when the team struct is not guaranteed to exist. */
1582  // See note about the corresponding code in __kmp_join_barrier() being performance-critical
1583  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1584  this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1585  this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1586  }
1587  } // master
1588 
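  /* The TRUE argument passed to the release routines below (FALSE at the other barriers in this
     file) requests ICV propagation to the workers as they are released, which matters for the
     KMP_BARRIER_ICV_PUSH configuration. */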
1589  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1590  case bp_hyper_bar: {
1591  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1592  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1593  USE_ITT_BUILD_ARG(itt_sync_obj) );
1594  break;
1595  }
1596  case bp_hierarchical_bar: {
1597  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1598  USE_ITT_BUILD_ARG(itt_sync_obj) );
1599  break;
1600  }
1601  case bp_tree_bar: {
1602  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1603  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1604  USE_ITT_BUILD_ARG(itt_sync_obj) );
1605  break;
1606  }
1607  default: {
1608  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1609  USE_ITT_BUILD_ARG(itt_sync_obj) );
1610  }
1611  }
1612 
1613  // Early exit for reaping threads releasing forkjoin barrier
1614  if (TCR_4(__kmp_global.g.g_done)) {
1615  if (this_thr->th.th_task_team != NULL) {
1616  if (KMP_MASTER_TID(tid)) {
1617  TCW_PTR(this_thr->th.th_task_team, NULL);
1618  }
1619  else {
1620  __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
1621  }
1622  }
1623 
1624 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1625  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1626  if (!KMP_MASTER_TID(tid)) {
1627  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1628  if (itt_sync_obj)
1629  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1630  }
1631  }
1632 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1633  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1634  return;
1635  }
1636 
1637  /* We can now assume that a valid team structure has been allocated by the master and
1638  propagated to all worker threads. The current thread, however, may not be part of the
1639  team, so we can't blindly assume that the team pointer is non-null. */
1640  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1641  KMP_DEBUG_ASSERT(team != NULL);
1642  tid = __kmp_tid_from_gtid(gtid);
1643 
1644 
1645 #if KMP_BARRIER_ICV_PULL
1646  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1647  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1648  this data before this function is called. We cannot modify __kmp_fork_call() to look at
1649  the fixed ICVs in the master's thread struct, because it is not always the case that the
1650  threads arrays have been allocated when __kmp_fork_call() is executed. */
1651  KMP_START_EXPLICIT_TIMER(USER_icv_copy);
1652  if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1653  // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
1654  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1655  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
1656  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1657  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
1658  }
1659  KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
1660 #endif // KMP_BARRIER_ICV_PULL
1661 
1662  if (__kmp_tasking_mode != tskm_immediate_exec) {
1663  __kmp_task_team_sync(this_thr, team);
1664  }
1665 
1666 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1667  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1668  if (proc_bind == proc_bind_intel) {
1669 #endif
1670 #if KMP_AFFINITY_SUPPORTED
1671  // Call dynamic affinity settings
1672  if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1673  __kmp_balanced_affinity(tid, team->t.t_nproc);
1674  }
1675 #endif // KMP_AFFINITY_SUPPORTED
1676 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1677  }
1678  else if (proc_bind != proc_bind_false) {
1679  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1680  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1681  __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
1682  }
1683  else {
1684  __kmp_affinity_set_place(gtid);
1685  }
1686  }
1687 #endif
1688 
1689 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1690  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1691  if (!KMP_MASTER_TID(tid)) {
1692  // Get correct barrier object
1693  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1694  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1695  } // (prepare called inside barrier_release)
1696  }
1697 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1698  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
1699 }
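
/* A disabled sketch of the affinity path taken above as workers leave the fork barrier: a
   proc_bind clause (or OMP_PROC_BIND/OMP_PLACES in the environment) makes team->t.t_proc_bind
   something other than proc_bind_intel/proc_bind_false, so __kmp_affinity_set_place() moves each
   thread to its place. The clause shown is standard OpenMP 4.0; the runtime behavior described is
   drawn from the code above. */
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
    #pragma omp parallel proc_bind(spread) num_threads(4)
    {
        printf("thread %d running under the spread binding policy\n", omp_get_thread_num());
    }
    return 0;
}
#endif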
1700 
1701 
1702 void
1703 __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1704 {
1705  KMP_TIME_BLOCK(KMP_setup_icv_copy);
1706 
1707  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1708  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1709 
1710  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1711  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1712  this data before this function is called. */
1713 #if KMP_BARRIER_ICV_PULL
1714  /* Copy the ICVs into th_fixed_icvs in the master thread's structure (which otherwise remains
1715  untouched), where all of the worker threads can access them and make their own copies after the barrier. */
1716  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1717  copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1718  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1719  0, team->t.t_threads[0], team));
1720 #elif KMP_BARRIER_ICV_PUSH
1721  // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1722  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1723  0, team->t.t_threads[0], team));
1724 #else
1725  // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
1726  ngo_load(new_icvs);
1727  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1728  for (int f=1; f<new_nproc; ++f) { // Skip the master thread
1729  // TODO: GEH - pass in better source location info since usually NULL here
1730  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1731  f, team->t.t_threads[f], team));
1732  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1733  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1734  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1735  f, team->t.t_threads[f], team));
1736  }
1737  ngo_sync();
1738 #endif // KMP_BARRIER_ICV_PULL
1739 }
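
/* A disabled sketch showing why the ICV copy above matters at the user level: the nthreads-var
   behind omp_set_num_threads() is one of the ICVs snapshotted into each implicit task when the
   team is set up, so the value seen inside the region is the one in force at the fork. */
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
    omp_set_num_threads(4);            /* updates the master's nthreads-var ICV */
    #pragma omp parallel               /* ICVs are propagated to the workers' implicit tasks */
    {
        #pragma omp single
        printf("team size = %d\n", omp_get_num_threads());
    }
    return 0;
}
#endif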