LLVM OpenMP* Runtime Library
kmp_csupport.cpp
1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "omp.h" /* extern "C" declarations of user-visible routines */
17 #include "kmp.h"
18 #include "kmp_error.h"
19 #include "kmp_i18n.h"
20 #include "kmp_itt.h"
21 #include "kmp_lock.h"
22 #include "kmp_stats.h"
23 
24 #if OMPT_SUPPORT
25 #include "ompt-internal.h"
26 #include "ompt-specific.h"
27 #endif
28 
29 #define MAX_MESSAGE 512
30 
31 // flags will be used in the future, e.g., to implement openmp_strict library
32 // restrictions
33 
42 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
43  // By default __kmpc_begin() is a no-op.
44  char *env;
45  if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
46  __kmp_str_match_true(env)) {
47  __kmp_middle_initialize();
48  KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
49  } else if (__kmp_ignore_mppbeg() == FALSE) {
50  // By default __kmp_ignore_mppbeg() returns TRUE.
51  __kmp_internal_begin();
52  KC_TRACE(10, ("__kmpc_begin: called\n"));
53  }
54 }
55 
64 void __kmpc_end(ident_t *loc) {
65  // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
66  // call a no-op. However, this can be overridden with the KMP_IGNORE_MPPEND
67  // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
68  // returns FALSE and __kmpc_end() will unregister this root (it can cause
69  // library shut down).
70  if (__kmp_ignore_mppend() == FALSE) {
71  KC_TRACE(10, ("__kmpc_end: called\n"));
72  KA_TRACE(30, ("__kmpc_end\n"));
73 
74  __kmp_internal_end_thread(-1);
75  }
76 }
77 
96 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
97  kmp_int32 gtid = __kmp_entry_gtid();
98 
99  KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
100 
101  return gtid;
102 }
103 
118 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
119  KC_TRACE(10,
120  ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
121 
122  return TCR_4(__kmp_all_nth);
123 }
124 
131 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
132  KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
133  return __kmp_tid_from_gtid(__kmp_entry_gtid());
134 }
135 
141 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
142  KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
143 
144  return __kmp_entry_thread()->th.th_team->t.t_nproc;
145 }
146 
153 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
154 #ifndef KMP_DEBUG
155 
156  return TRUE;
157 
158 #else
159 
160  const char *semi2;
161  const char *semi3;
162  int line_no;
163 
164  if (__kmp_par_range == 0) {
165  return TRUE;
166  }
167  semi2 = loc->psource;
168  if (semi2 == NULL) {
169  return TRUE;
170  }
171  semi2 = strchr(semi2, ';');
172  if (semi2 == NULL) {
173  return TRUE;
174  }
175  semi2 = strchr(semi2 + 1, ';');
176  if (semi2 == NULL) {
177  return TRUE;
178  }
179  if (__kmp_par_range_filename[0]) {
180  const char *name = semi2 - 1;
181  while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
182  name--;
183  }
184  if ((*name == '/') || (*name == ';')) {
185  name++;
186  }
187  if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
188  return __kmp_par_range < 0;
189  }
190  }
191  semi3 = strchr(semi2 + 1, ';');
192  if (__kmp_par_range_routine[0]) {
193  if ((semi3 != NULL) && (semi3 > semi2) &&
194  (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
195  return __kmp_par_range < 0;
196  }
197  }
198  if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
199  if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
200  return __kmp_par_range > 0;
201  }
202  return __kmp_par_range < 0;
203  }
204  return TRUE;
205 
206 #endif /* KMP_DEBUG */
207 }
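// Worked example (illustrative only) of the KMP_PAR_RANGE filtering above:
// loc->psource is a semicolon-separated string such as ";foo.c;bar;42;0;;".
// With that value, semi2 ends up at the ';' following "foo.c", so the name
// compared against __kmp_par_range_filename is "foo.c" (only the basename is
// compared, since the backward scan stops at '/' or ';'); semi3 is the ';'
// following "bar", so the routine compared against __kmp_par_range_routine is
// "bar"; and KMP_SSCANF reads line_no = 42, which is then tested against
// [__kmp_par_range_lb, __kmp_par_range_ub].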
208 
215 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
216  return __kmp_entry_thread()->th.th_root->r.r_active;
217 }
218 
228 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
229  kmp_int32 num_threads) {
230  KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
231  global_tid, num_threads));
232 
233  __kmp_push_num_threads(loc, global_tid, num_threads);
234 }
235 
236 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
237  KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
238 
239  /* the num_threads are automatically popped */
240 }
241 
242 #if OMP_40_ENABLED
243 
244 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
245  kmp_int32 proc_bind) {
246  KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
247  proc_bind));
248 
249  __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
250 }
251 
252 #endif /* OMP_40_ENABLED */
253 
264 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
265  int gtid = __kmp_entry_gtid();
266 
267 #if (KMP_STATS_ENABLED)
268  int inParallel = __kmpc_in_parallel(loc);
269  if (inParallel) {
270  KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
271  } else {
272  KMP_COUNT_BLOCK(OMP_PARALLEL);
273  }
274 #endif
275 
276  // maybe saving thr_state is enough here
277  {
278  va_list ap;
279  va_start(ap, microtask);
280 
281 #if OMPT_SUPPORT
282  ompt_frame_t *ompt_frame;
283  if (ompt_enabled) {
284  kmp_info_t *master_th = __kmp_threads[gtid];
285  kmp_team_t *parent_team = master_th->th.th_team;
286  ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
287  if (lwt)
288  ompt_frame = &(lwt->ompt_task_info.frame);
289  else {
290  int tid = __kmp_tid_from_gtid(gtid);
291  ompt_frame = &(
292  parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
293  }
294  ompt_frame->reenter_runtime_frame = __builtin_frame_address(1);
295  }
296 #endif
297 
298 #if INCLUDE_SSC_MARKS
299  SSC_MARK_FORKING();
300 #endif
301  __kmp_fork_call(loc, gtid, fork_context_intel, argc,
302 #if OMPT_SUPPORT
303  VOLATILE_CAST(void *) microtask, // "unwrapped" task
304 #endif
305  VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
306  VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
307 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
308 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
309  &ap
310 #else
311  ap
312 #endif
313  );
314 #if INCLUDE_SSC_MARKS
315  SSC_MARK_JOINING();
316 #endif
317  __kmp_join_call(loc, gtid
318 #if OMPT_SUPPORT
319  ,
320  fork_context_intel
321 #endif
322  );
323 
324  va_end(ap);
325  }
326 }
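// For reference, a minimal sketch (not part of the RTL) of how a compiler
// targeting this entry point might lower a parallel region; the function and
// variable names below, and the psource string, are hypothetical:
#if 0
// Source being lowered:
//   #pragma omp parallel num_threads(4) shared(x)
//     x += 1;
static ident_t loc_example = {0, KMP_IDENT_KMPC, 0, 0, ";file;func;1;1;;"};
// Outlined body with the kmpc_micro signature; the shared variable is passed
// through the varargs of __kmpc_fork_call and arrives here by address.
static void example_microtask(kmp_int32 *global_tid, kmp_int32 *bound_tid,
                              int *x) {
  *x += 1;
}
void example_parallel(int *x) {
  kmp_int32 gtid = __kmpc_global_thread_num(&loc_example);
  __kmpc_push_num_threads(&loc_example, gtid, 4); // from num_threads(4)
  __kmpc_fork_call(&loc_example, /*argc=*/1, (kmpc_micro)example_microtask, x);
}
#endif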
327 
328 #if OMP_40_ENABLED
329 
340 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
341  kmp_int32 num_teams, kmp_int32 num_threads) {
342  KA_TRACE(20,
343  ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
344  global_tid, num_teams, num_threads));
345 
346  __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
347 }
348 
359 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
360  ...) {
361  int gtid = __kmp_entry_gtid();
362  kmp_info_t *this_thr = __kmp_threads[gtid];
363  va_list ap;
364  va_start(ap, microtask);
365 
366  KMP_COUNT_BLOCK(OMP_TEAMS);
367 
368  // remember teams entry point and nesting level
369  this_thr->th.th_teams_microtask = microtask;
370  this_thr->th.th_teams_level =
371  this_thr->th.th_team->t.t_level; // AC: can be >0 on host
372 
373 #if OMPT_SUPPORT
374  kmp_team_t *parent_team = this_thr->th.th_team;
375  int tid = __kmp_tid_from_gtid(gtid);
376  if (ompt_enabled) {
377  parent_team->t.t_implicit_task_taskdata[tid]
378  .ompt_task_info.frame.reenter_runtime_frame =
379  __builtin_frame_address(1);
380  }
381 #endif
382 
383  // check if __kmpc_push_num_teams was called and set the default number of
384  // teams otherwise
385  if (this_thr->th.th_teams_size.nteams == 0) {
386  __kmp_push_num_teams(loc, gtid, 0, 0);
387  }
388  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
389  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
390  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
391 
392  __kmp_fork_call(loc, gtid, fork_context_intel, argc,
393 #if OMPT_SUPPORT
394  VOLATILE_CAST(void *) microtask, // "unwrapped" task
395 #endif
396  VOLATILE_CAST(microtask_t)
397  __kmp_teams_master, // "wrapped" task
398  VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
399 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
400  &ap
401 #else
402  ap
403 #endif
404  );
405  __kmp_join_call(loc, gtid
406 #if OMPT_SUPPORT
407  ,
408  fork_context_intel
409 #endif
410  );
411 
412  this_thr->th.th_teams_microtask = NULL;
413  this_thr->th.th_teams_level = 0;
414  *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
415  va_end(ap);
416 }
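// A similar illustrative sketch (not part of the RTL) for a teams construct;
// names are hypothetical and 'loc' stands for the compiler-emitted ident_t:
#if 0
// Source being lowered:
//   #pragma omp teams num_teams(2) thread_limit(8)
//     { /* ... teams body ... */ }
static void example_teams_microtask(kmp_int32 *global_tid,
                                    kmp_int32 *bound_tid) {
  /* ... teams body, executed by the master thread of each team ... */
}
void example_teams(ident_t *loc) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  // num_teams/thread_limit clauses arrive via __kmpc_push_num_teams
  __kmpc_push_num_teams(loc, gtid, 2, 8);
  __kmpc_fork_teams(loc, /*argc=*/0, (kmpc_micro)example_teams_microtask);
}
#endif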
417 #endif /* OMP_40_ENABLED */
418 
419 // I don't think this function should ever have been exported.
420 // The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
421 // openmp code ever called it, but it's been exported from the RTL for so
422 // long that I'm afraid to remove the definition.
423 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
424 
437 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
438  // The implementation is now in kmp_runtime.cpp so that it can share static
439  // functions with kmp_fork_call since the tasks to be done are similar in
440  // each case.
441  __kmp_serialized_parallel(loc, global_tid);
442 }
443 
451 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
452  kmp_internal_control_t *top;
453  kmp_info_t *this_thr;
454  kmp_team_t *serial_team;
455 
456  KC_TRACE(10,
457  ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
458 
459  /* skip all this code for autopar serialized loops since it results in
460  unacceptable overhead */
461  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
462  return;
463 
464  // Not autopar code
465  if (!TCR_4(__kmp_init_parallel))
466  __kmp_parallel_initialize();
467 
468  this_thr = __kmp_threads[global_tid];
469  serial_team = this_thr->th.th_serial_team;
470 
471 #if OMP_45_ENABLED
472  kmp_task_team_t *task_team = this_thr->th.th_task_team;
473 
474  // we need to wait for the proxy tasks before finishing the thread
475  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
476  __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
477 #endif
478 
479  KMP_MB();
480  KMP_DEBUG_ASSERT(serial_team);
481  KMP_ASSERT(serial_team->t.t_serialized);
482  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
483  KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
484  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
485  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
486 
487  /* If necessary, pop the internal control stack values and replace the team
488  * values */
489  top = serial_team->t.t_control_stack_top;
490  if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
491  copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
492  serial_team->t.t_control_stack_top = top->next;
493  __kmp_free(top);
494  }
495 
496  // if( serial_team -> t.t_serialized > 1 )
497  serial_team->t.t_level--;
498 
499  /* pop dispatch buffers stack */
500  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
501  {
502  dispatch_private_info_t *disp_buffer =
503  serial_team->t.t_dispatch->th_disp_buffer;
504  serial_team->t.t_dispatch->th_disp_buffer =
505  serial_team->t.t_dispatch->th_disp_buffer->next;
506  __kmp_free(disp_buffer);
507  }
508 
509  --serial_team->t.t_serialized;
510  if (serial_team->t.t_serialized == 0) {
511 
512 /* return to the parallel section */
513 
514 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
515  if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
516  __kmp_clear_x87_fpu_status_word();
517  __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
518  __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
519  }
520 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
521 
522  this_thr->th.th_team = serial_team->t.t_parent;
523  this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
524 
525  /* restore values cached in the thread */
526  this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /* JPH */
527  this_thr->th.th_team_master =
528  serial_team->t.t_parent->t.t_threads[0]; /* JPH */
529  this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
530 
531  /* TODO the below shouldn't need to be adjusted for serialized teams */
532  this_thr->th.th_dispatch =
533  &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
534 
535  __kmp_pop_current_task_from_thread(this_thr);
536 
537  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
538  this_thr->th.th_current_task->td_flags.executing = 1;
539 
540  if (__kmp_tasking_mode != tskm_immediate_exec) {
541  // Copy the task team from the new child / old parent team to the thread.
542  this_thr->th.th_task_team =
543  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
544  KA_TRACE(20,
545  ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
546  "team %p\n",
547  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
548  }
549  } else {
550  if (__kmp_tasking_mode != tskm_immediate_exec) {
551  KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
552  "depth of serial team %p to %d\n",
553  global_tid, serial_team, serial_team->t.t_serialized));
554  }
555  }
556 
557  if (__kmp_env_consistency_check)
558  __kmp_pop_parallel(global_tid, NULL);
559 }
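// Illustrative sketch (not part of the RTL): when the if() clause of a
// parallel region evaluates to false, a compiler typically runs the region
// body inline on the encountering thread, bracketed by these two calls;
// names are hypothetical and 'loc' stands for the compiler-emitted ident_t:
#if 0
void example_serialized_parallel(ident_t *loc, int *x) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  __kmpc_serialized_parallel(loc, gtid);
  *x += 1; // region body, run by the encountering thread only
  __kmpc_end_serialized_parallel(loc, gtid);
}
#endif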
560 
569 void __kmpc_flush(ident_t *loc) {
570  KC_TRACE(10, ("__kmpc_flush: called\n"));
571 
572  /* need an explicit __mf() here since the library uses volatile instead */
573  KMP_MB(); /* Flush all pending memory write invalidates. */
574 
575 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
576 #if KMP_MIC
577 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
578 // We shouldn't need it, though, since the ABI rules require that
579 // * If the compiler generates NGO stores it also generates the fence
580 // * If users hand-code NGO stores they should insert the fence
581 // therefore no incomplete unordered stores should be visible.
582 #else
583  // C74404
584  // This is to address non-temporal store instructions (sfence needed).
585  // The clflush instruction is also addressed here (mfence needed).
586  // The non-temporal load instruction movntdqa should probably be
587  // addressed as well.
588  // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
589  if (!__kmp_cpuinfo.initialized) {
590  __kmp_query_cpuid(&__kmp_cpuinfo);
591  }; // if
592  if (!__kmp_cpuinfo.sse2) {
593  // CPU cannot execute SSE2 instructions.
594  } else {
595 #if KMP_COMPILER_ICC
596  _mm_mfence();
597 #elif KMP_COMPILER_MSVC
598  MemoryBarrier();
599 #else
600  __sync_synchronize();
601 #endif // KMP_COMPILER_ICC
602  }; // if
603 #endif // KMP_MIC
604 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
605 // Nothing to see here; move along.
606 #elif KMP_ARCH_PPC64
607 // Nothing needed here (we have a real MB above).
608 #if KMP_OS_CNK
609  // The flushing thread needs to yield here; this prevents a
610  // busy-waiting thread from saturating the pipeline. flush is
611  // often used in loops like this:
612  // while (!flag) {
613  // #pragma omp flush(flag)
614  // }
615  // and adding the yield here is good for at least a 10x speedup
616  // when running >2 threads per core (on the NAS LU benchmark).
617  __kmp_yield(TRUE);
618 #endif
619 #else
620 #error Unknown or unsupported architecture
621 #endif
622 }
623 
624 /* -------------------------------------------------------------------------- */
632 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
633  KMP_COUNT_BLOCK(OMP_BARRIER);
634  KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
635 
636  if (!TCR_4(__kmp_init_parallel))
637  __kmp_parallel_initialize();
638 
639  if (__kmp_env_consistency_check) {
640  if (loc == 0) {
641  KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
642  }; // if
643 
644  __kmp_check_barrier(global_tid, ct_barrier, loc);
645  }
646 
647 #if OMPT_SUPPORT && OMPT_TRACE
648  ompt_frame_t *ompt_frame;
649  if (ompt_enabled) {
650  ompt_frame = __ompt_get_task_frame_internal(0);
651  if (ompt_frame->reenter_runtime_frame == NULL)
652  ompt_frame->reenter_runtime_frame = __builtin_frame_address(1);
653  }
654 #endif
655  __kmp_threads[global_tid]->th.th_ident = loc;
656  // TODO: explicit barrier_wait_id:
657  // this function is called when 'barrier' directive is present or
658  // implicit barrier at the end of a worksharing construct.
659  // 1) better to add a per-thread barrier counter to a thread data structure
660  // 2) set to 0 when a new team is created
661  // 3) no sync is required
662 
663  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
664 #if OMPT_SUPPORT && OMPT_TRACE
665  if (ompt_enabled) {
666  ompt_frame->reenter_runtime_frame = NULL;
667  }
668 #endif
669 }
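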
670 
671 /* The BARRIER for a MASTER section is always explicit */
678 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
679  int status = 0;
680 
681  KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
682 
683  if (!TCR_4(__kmp_init_parallel))
684  __kmp_parallel_initialize();
685 
686  if (KMP_MASTER_GTID(global_tid)) {
687  KMP_COUNT_BLOCK(OMP_MASTER);
688  KMP_PUSH_PARTITIONED_TIMER(OMP_master);
689  status = 1;
690  }
691 
692 #if OMPT_SUPPORT && OMPT_TRACE
693  if (status) {
694  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_master_begin)) {
695  kmp_info_t *this_thr = __kmp_threads[global_tid];
696  kmp_team_t *team = this_thr->th.th_team;
697 
698  int tid = __kmp_tid_from_gtid(global_tid);
699  ompt_callbacks.ompt_callback(ompt_event_master_begin)(
700  team->t.ompt_team_info.parallel_id,
701  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
702  }
703  }
704 #endif
705 
706  if (__kmp_env_consistency_check) {
707 #if KMP_USE_DYNAMIC_LOCK
708  if (status)
709  __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
710  else
711  __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
712 #else
713  if (status)
714  __kmp_push_sync(global_tid, ct_master, loc, NULL);
715  else
716  __kmp_check_sync(global_tid, ct_master, loc, NULL);
717 #endif
718  }
719 
720  return status;
721 }
722 
731 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
732  KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
733 
734  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
735  KMP_POP_PARTITIONED_TIMER();
736 
737 #if OMPT_SUPPORT && OMPT_TRACE
738  kmp_info_t *this_thr = __kmp_threads[global_tid];
739  kmp_team_t *team = this_thr->th.th_team;
740  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_master_end)) {
741  int tid = __kmp_tid_from_gtid(global_tid);
742  ompt_callbacks.ompt_callback(ompt_event_master_end)(
743  team->t.ompt_team_info.parallel_id,
744  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
745  }
746 #endif
747 
748  if (__kmp_env_consistency_check) {
749  if (global_tid < 0)
750  KMP_WARNING(ThreadIdentInvalid);
751 
752  if (KMP_MASTER_GTID(global_tid))
753  __kmp_pop_sync(global_tid, ct_master, loc);
754  }
755 }
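// Illustrative lowering sketch (not part of the RTL) for a master construct;
// names are hypothetical and 'loc' stands for the compiler-emitted ident_t:
#if 0
void example_master(ident_t *loc) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  if (__kmpc_master(loc, gtid)) {
    /* ... master-only body ... */
    __kmpc_end_master(loc, gtid);
  }
  // note: the master construct has no implied barrier at entry or exit
}
#endif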
756 
764 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
765  int cid = 0;
766  kmp_info_t *th;
767  KMP_DEBUG_ASSERT(__kmp_init_serial);
768 
769  KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
770 
771  if (!TCR_4(__kmp_init_parallel))
772  __kmp_parallel_initialize();
773 
774 #if USE_ITT_BUILD
775  __kmp_itt_ordered_prep(gtid);
776 // TODO: ordered_wait_id
777 #endif /* USE_ITT_BUILD */
778 
779  th = __kmp_threads[gtid];
780 
781 #if OMPT_SUPPORT && OMPT_TRACE
782  if (ompt_enabled) {
783  /* OMPT state update */
784  th->th.ompt_thread_info.wait_id = (uint64_t)loc;
785  th->th.ompt_thread_info.state = ompt_state_wait_ordered;
786 
787  /* OMPT event callback */
788  if (ompt_callbacks.ompt_callback(ompt_event_wait_ordered)) {
789  ompt_callbacks.ompt_callback(ompt_event_wait_ordered)(
790  th->th.ompt_thread_info.wait_id);
791  }
792  }
793 #endif
794 
795  if (th->th.th_dispatch->th_deo_fcn != 0)
796  (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
797  else
798  __kmp_parallel_deo(&gtid, &cid, loc);
799 
800 #if OMPT_SUPPORT && OMPT_TRACE
801  if (ompt_enabled) {
802  /* OMPT state update */
803  th->th.ompt_thread_info.state = ompt_state_work_parallel;
804  th->th.ompt_thread_info.wait_id = 0;
805 
806  /* OMPT event callback */
807  if (ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)) {
808  ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)(
809  th->th.ompt_thread_info.wait_id);
810  }
811  }
812 #endif
813 
814 #if USE_ITT_BUILD
815  __kmp_itt_ordered_start(gtid);
816 #endif /* USE_ITT_BUILD */
817 }
818 
826 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
827  int cid = 0;
828  kmp_info_t *th;
829 
830  KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
831 
832 #if USE_ITT_BUILD
833  __kmp_itt_ordered_end(gtid);
834 // TODO: ordered_wait_id
835 #endif /* USE_ITT_BUILD */
836 
837  th = __kmp_threads[gtid];
838 
839  if (th->th.th_dispatch->th_dxo_fcn != 0)
840  (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
841  else
842  __kmp_parallel_dxo(&gtid, &cid, loc);
843 
844 #if OMPT_SUPPORT && OMPT_BLAME
845  if (ompt_enabled &&
846  ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
847  ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
848  th->th.ompt_thread_info.wait_id);
849  }
850 #endif
851 }
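// Illustrative sketch (not part of the RTL): inside a loop dispatched with an
// ordered schedule (the dispatch entry points live in kmp_dispatch.cpp), each
// "#pragma omp ordered" block is bracketed by the two calls above; names are
// hypothetical and 'loc' stands for the compiler-emitted ident_t:
#if 0
void example_ordered_iteration(ident_t *loc, kmp_int32 gtid, int i) {
  __kmpc_ordered(loc, gtid);
  /* ... work for iteration i that must retire in loop order ... */
  __kmpc_end_ordered(loc, gtid);
}
#endif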
852 
853 #if KMP_USE_DYNAMIC_LOCK
854 
855 static __forceinline void
856 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
857  kmp_int32 gtid, kmp_indirect_locktag_t tag) {
858  // Pointer to the allocated indirect lock is written to crit, while indexing
859  // is ignored.
860  void *idx;
861  kmp_indirect_lock_t **lck;
862  lck = (kmp_indirect_lock_t **)crit;
863  kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
864  KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
865  KMP_SET_I_LOCK_LOCATION(ilk, loc);
866  KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
867  KA_TRACE(20,
868  ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
869 #if USE_ITT_BUILD
870  __kmp_itt_critical_creating(ilk->lock, loc);
871 #endif
872  int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
873  if (status == 0) {
874 #if USE_ITT_BUILD
875  __kmp_itt_critical_destroyed(ilk->lock);
876 #endif
877  // We don't really need to destroy the unclaimed lock here since it will be
878  // cleaned up at program exit.
879  // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
880  }
881  KMP_DEBUG_ASSERT(*lck != NULL);
882 }
883 
884 // Fast-path acquire tas lock
885 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid) \
886  { \
887  kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
888  if (l->lk.poll != KMP_LOCK_FREE(tas) || \
889  !KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), \
890  KMP_LOCK_BUSY(gtid + 1, tas))) { \
891  kmp_uint32 spins; \
892  KMP_FSYNC_PREPARE(l); \
893  KMP_INIT_YIELD(spins); \
894  if (TCR_4(__kmp_nth) > \
895  (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
896  KMP_YIELD(TRUE); \
897  } else { \
898  KMP_YIELD_SPIN(spins); \
899  } \
900  kmp_backoff_t backoff = __kmp_spin_backoff_params; \
901  while (l->lk.poll != KMP_LOCK_FREE(tas) || \
902  !KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), \
903  KMP_LOCK_BUSY(gtid + 1, tas))) { \
904  __kmp_spin_backoff(&backoff); \
905  if (TCR_4(__kmp_nth) > \
906  (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
907  KMP_YIELD(TRUE); \
908  } else { \
909  KMP_YIELD_SPIN(spins); \
910  } \
911  } \
912  } \
913  KMP_FSYNC_ACQUIRED(l); \
914  }
915 
916 // Fast-path test tas lock
917 #define KMP_TEST_TAS_LOCK(lock, gtid, rc) \
918  { \
919  kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
920  rc = l->lk.poll == KMP_LOCK_FREE(tas) && \
921  KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), \
922  KMP_LOCK_BUSY(gtid + 1, tas)); \
923  }
924 
925 // Fast-path release tas lock
926 #define KMP_RELEASE_TAS_LOCK(lock, gtid) \
927  { \
928  TCW_4(((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); \
929  KMP_MB(); \
930  }
931 
932 #if KMP_USE_FUTEX
933 
934 #include <sys/syscall.h>
935 #include <unistd.h>
936 #ifndef FUTEX_WAIT
937 #define FUTEX_WAIT 0
938 #endif
939 #ifndef FUTEX_WAKE
940 #define FUTEX_WAKE 1
941 #endif
942 
943 // Fast-path acquire futex lock
944 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) \
945  { \
946  kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
947  kmp_int32 gtid_code = (gtid + 1) << 1; \
948  KMP_MB(); \
949  KMP_FSYNC_PREPARE(ftx); \
950  kmp_int32 poll_val; \
951  while ((poll_val = KMP_COMPARE_AND_STORE_RET32( \
952  &(ftx->lk.poll), KMP_LOCK_FREE(futex), \
953  KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) { \
954  kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1; \
955  if (!cond) { \
956  if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, \
957  poll_val | \
958  KMP_LOCK_BUSY(1, futex))) { \
959  continue; \
960  } \
961  poll_val |= KMP_LOCK_BUSY(1, futex); \
962  } \
963  kmp_int32 rc; \
964  if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, \
965  NULL, NULL, 0)) != 0) { \
966  continue; \
967  } \
968  gtid_code |= 1; \
969  } \
970  KMP_FSYNC_ACQUIRED(ftx); \
971  }
972 
973 // Fast-path test futex lock
974 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) \
975  { \
976  kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
977  if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), \
978  KMP_LOCK_BUSY((gtid + 1) << 1, futex))) { \
979  KMP_FSYNC_ACQUIRED(ftx); \
980  rc = TRUE; \
981  } else { \
982  rc = FALSE; \
983  } \
984  }
985 
986 // Fast-path release futex lock
987 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid) \
988  { \
989  kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
990  KMP_MB(); \
991  KMP_FSYNC_RELEASING(ftx); \
992  kmp_int32 poll_val = \
993  KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex)); \
994  if (KMP_LOCK_STRIP(poll_val) & 1) { \
995  syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, \
996  KMP_LOCK_BUSY(1, futex), NULL, NULL, 0); \
997  } \
998  KMP_MB(); \
999  KMP_YIELD(TCR_4(__kmp_nth) > \
1000  (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)); \
1001  }
1002 
1003 #endif // KMP_USE_FUTEX
1004 
1005 #else // KMP_USE_DYNAMIC_LOCK
1006 
1007 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1008  ident_t const *loc,
1009  kmp_int32 gtid) {
1010  kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1011 
1012  // Because of the double-check, the following load doesn't need to be volatile
1013  kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1014 
1015  if (lck == NULL) {
1016  void *idx;
1017 
1018  // Allocate & initialize the lock.
1019  // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1020  lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1021  __kmp_init_user_lock_with_checks(lck);
1022  __kmp_set_user_lock_location(lck, loc);
1023 #if USE_ITT_BUILD
1024  __kmp_itt_critical_creating(lck);
1025 // __kmp_itt_critical_creating() should be called *before* the first usage
1026 // of the underlying lock. It is the only place where we can guarantee it.
1027 // There is a chance the lock will be destroyed without ever being used, but
1028 // that is not a problem, because this is not a real event seen by the user
1029 // but rather the setting of a name for the object (lock). See kmp_itt.h.
1030 #endif /* USE_ITT_BUILD */
1031 
1032  // Use a cmpxchg instruction to slam the start of the critical section with
1033  // the lock pointer. If another thread beat us to it, deallocate the lock,
1034  // and use the lock that the other thread allocated.
1035  int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1036 
1037  if (status == 0) {
1038 // Deallocate the lock and reload the value.
1039 #if USE_ITT_BUILD
1040  __kmp_itt_critical_destroyed(lck);
1041 // Let ITT know the lock is destroyed and the same memory location may be reused
1042 // for another purpose.
1043 #endif /* USE_ITT_BUILD */
1044  __kmp_destroy_user_lock_with_checks(lck);
1045  __kmp_user_lock_free(&idx, gtid, lck);
1046  lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1047  KMP_DEBUG_ASSERT(lck != NULL);
1048  }
1049  }
1050  return lck;
1051 }
1052 
1053 #endif // KMP_USE_DYNAMIC_LOCK
1054 
1065 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1066  kmp_critical_name *crit) {
1067 #if KMP_USE_DYNAMIC_LOCK
1068  __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1069 #else
1070  KMP_COUNT_BLOCK(OMP_CRITICAL);
1071  KMP_TIME_PARTITIONED_BLOCK(
1072  OMP_critical_wait); /* Time spent waiting to enter the critical section */
1073  kmp_user_lock_p lck;
1074 
1075  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1076 
1077  // TODO: add THR_OVHD_STATE
1078 
1079  KMP_CHECK_USER_LOCK_INIT();
1080 
1081  if ((__kmp_user_lock_kind == lk_tas) &&
1082  (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1083  lck = (kmp_user_lock_p)crit;
1084  }
1085 #if KMP_USE_FUTEX
1086  else if ((__kmp_user_lock_kind == lk_futex) &&
1087  (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1088  lck = (kmp_user_lock_p)crit;
1089  }
1090 #endif
1091  else { // ticket, queuing or drdpa
1092  lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1093  }
1094 
1095  if (__kmp_env_consistency_check)
1096  __kmp_push_sync(global_tid, ct_critical, loc, lck);
1097 
1098 // since the critical directive binds to all threads, not just the current
1099 // team, we have to check this even if we are in a serialized team.
1100 // also, even if we are the uber thread, we still have to acquire the lock,
1101 // as we have to contend with sibling threads.
1102 
1103 #if USE_ITT_BUILD
1104  __kmp_itt_critical_acquiring(lck);
1105 #endif /* USE_ITT_BUILD */
1106  // Value of 'crit' should be good for using as a critical_id of the critical
1107  // section directive.
1108  __kmp_acquire_user_lock_with_checks(lck, global_tid);
1109 
1110 #if USE_ITT_BUILD
1111  __kmp_itt_critical_acquired(lck);
1112 #endif /* USE_ITT_BUILD */
1113 
1114  KMP_START_EXPLICIT_TIMER(OMP_critical);
1115  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1116 #endif // KMP_USE_DYNAMIC_LOCK
1117 }
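// Illustrative lowering sketch (not part of the RTL) for a named critical
// construct: the compiler emits one zero-initialized kmp_critical_name per
// name and passes its address to every entry/exit; names are hypothetical and
// 'loc' stands for the compiler-emitted ident_t:
#if 0
static kmp_critical_name example_crit_mylock; // shared by all "mylock" uses
void example_critical(ident_t *loc, int *counter) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  __kmpc_critical(loc, gtid, &example_crit_mylock);
  ++*counter; // mutually exclusive across all threads for this name
  __kmpc_end_critical(loc, gtid, &example_crit_mylock);
}
#endif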
1118 
1119 #if KMP_USE_DYNAMIC_LOCK
1120 
1121 // Converts the given hint to an internal lock implementation
1122 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1123 #if KMP_USE_TSX
1124 #define KMP_TSX_LOCK(seq) lockseq_##seq
1125 #else
1126 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1127 #endif
1128 
1129 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1130 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1131 #else
1132 #define KMP_CPUINFO_RTM 0
1133 #endif
1134 
1135  // Hints that do not require further logic
1136  if (hint & kmp_lock_hint_hle)
1137  return KMP_TSX_LOCK(hle);
1138  if (hint & kmp_lock_hint_rtm)
1139  return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1140  if (hint & kmp_lock_hint_adaptive)
1141  return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1142 
1143  // Rule out conflicting hints first by returning the default lock
1144  if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1145  return __kmp_user_lock_seq;
1146  if ((hint & omp_lock_hint_speculative) &&
1147  (hint & omp_lock_hint_nonspeculative))
1148  return __kmp_user_lock_seq;
1149 
1150  // Do not even consider speculation when it appears to be contended
1151  if (hint & omp_lock_hint_contended)
1152  return lockseq_queuing;
1153 
1154  // Uncontended lock without speculation
1155  if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1156  return lockseq_tas;
1157 
1158  // HLE lock for speculation
1159  if (hint & omp_lock_hint_speculative)
1160  return KMP_TSX_LOCK(hle);
1161 
1162  return __kmp_user_lock_seq;
1163 }
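// Worked examples of the mapping above (illustrative): when the RTL is built
// with TSX support, omp_lock_hint_speculative resolves to the HLE sequence;
// kmp_lock_hint_rtm additionally requires RTM to be detected at runtime and
// otherwise falls back to __kmp_user_lock_seq. A conflicting combination such
// as (omp_lock_hint_contended | omp_lock_hint_uncontended) also falls back to
// the default; omp_lock_hint_contended alone selects the queuing lock, and
// omp_lock_hint_uncontended without speculation selects the test-and-set lock.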
1164 
1178 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1179  kmp_critical_name *crit, uintptr_t hint) {
1180  KMP_COUNT_BLOCK(OMP_CRITICAL);
1181  kmp_user_lock_p lck;
1182 
1183  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1184 
1185  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1186  // Check if it is initialized.
1187  if (*lk == 0) {
1188  kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1189  if (KMP_IS_D_LOCK(lckseq)) {
1190  KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1191  KMP_GET_D_TAG(lckseq));
1192  } else {
1193  __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1194  }
1195  }
1196  // Branch for accessing the actual lock object and set operation. This
1197  // branching is inevitable since this lock initialization does not follow the
1198  // normal dispatch path (lock table is not used).
1199  if (KMP_EXTRACT_D_TAG(lk) != 0) {
1200  lck = (kmp_user_lock_p)lk;
1201  if (__kmp_env_consistency_check) {
1202  __kmp_push_sync(global_tid, ct_critical, loc, lck,
1203  __kmp_map_hint_to_lock(hint));
1204  }
1205 #if USE_ITT_BUILD
1206  __kmp_itt_critical_acquiring(lck);
1207 #endif
1208 #if KMP_USE_INLINED_TAS
1209  if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1210  KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1211  } else
1212 #elif KMP_USE_INLINED_FUTEX
1213  if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1214  KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1215  } else
1216 #endif
1217  {
1218  KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1219  }
1220  } else {
1221  kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1222  lck = ilk->lock;
1223  if (__kmp_env_consistency_check) {
1224  __kmp_push_sync(global_tid, ct_critical, loc, lck,
1225  __kmp_map_hint_to_lock(hint));
1226  }
1227 #if USE_ITT_BUILD
1228  __kmp_itt_critical_acquiring(lck);
1229 #endif
1230  KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1231  }
1232 
1233 #if USE_ITT_BUILD
1234  __kmp_itt_critical_acquired(lck);
1235 #endif /* USE_ITT_BUILD */
1236 
1237  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1238  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1239 } // __kmpc_critical_with_hint
1240 
1241 #endif // KMP_USE_DYNAMIC_LOCK
1242 
1252 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1253  kmp_critical_name *crit) {
1254  kmp_user_lock_p lck;
1255 
1256  KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1257 
1258 #if KMP_USE_DYNAMIC_LOCK
1259  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1260  lck = (kmp_user_lock_p)crit;
1261  KMP_ASSERT(lck != NULL);
1262  if (__kmp_env_consistency_check) {
1263  __kmp_pop_sync(global_tid, ct_critical, loc);
1264  }
1265 #if USE_ITT_BUILD
1266  __kmp_itt_critical_releasing(lck);
1267 #endif
1268 #if KMP_USE_INLINED_TAS
1269  if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1270  KMP_RELEASE_TAS_LOCK(lck, global_tid);
1271  } else
1272 #elif KMP_USE_INLINED_FUTEX
1273  if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1274  KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1275  } else
1276 #endif
1277  {
1278  KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1279  }
1280  } else {
1281  kmp_indirect_lock_t *ilk =
1282  (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1283  KMP_ASSERT(ilk != NULL);
1284  lck = ilk->lock;
1285  if (__kmp_env_consistency_check) {
1286  __kmp_pop_sync(global_tid, ct_critical, loc);
1287  }
1288 #if USE_ITT_BUILD
1289  __kmp_itt_critical_releasing(lck);
1290 #endif
1291  KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1292  }
1293 
1294 #else // KMP_USE_DYNAMIC_LOCK
1295 
1296  if ((__kmp_user_lock_kind == lk_tas) &&
1297  (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1298  lck = (kmp_user_lock_p)crit;
1299  }
1300 #if KMP_USE_FUTEX
1301  else if ((__kmp_user_lock_kind == lk_futex) &&
1302  (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1303  lck = (kmp_user_lock_p)crit;
1304  }
1305 #endif
1306  else { // ticket, queuing or drdpa
1307  lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1308  }
1309 
1310  KMP_ASSERT(lck != NULL);
1311 
1312  if (__kmp_env_consistency_check)
1313  __kmp_pop_sync(global_tid, ct_critical, loc);
1314 
1315 #if USE_ITT_BUILD
1316  __kmp_itt_critical_releasing(lck);
1317 #endif /* USE_ITT_BUILD */
1318  // Value of 'crit' should be good for using as a critical_id of the critical
1319  // section directive.
1320  __kmp_release_user_lock_with_checks(lck, global_tid);
1321 
1322 #if OMPT_SUPPORT && OMPT_BLAME
1323  if (ompt_enabled &&
1324  ompt_callbacks.ompt_callback(ompt_event_release_critical)) {
1325  ompt_callbacks.ompt_callback(ompt_event_release_critical)((uint64_t)lck);
1326  }
1327 #endif
1328 
1329 #endif // KMP_USE_DYNAMIC_LOCK
1330  KMP_POP_PARTITIONED_TIMER();
1331  KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1332 }
1333 
1343 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1344  int status;
1345 
1346  KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1347 
1348  if (!TCR_4(__kmp_init_parallel))
1349  __kmp_parallel_initialize();
1350 
1351  if (__kmp_env_consistency_check)
1352  __kmp_check_barrier(global_tid, ct_barrier, loc);
1353 
1354 #if USE_ITT_NOTIFY
1355  __kmp_threads[global_tid]->th.th_ident = loc;
1356 #endif
1357  status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1358 
1359  return (status != 0) ? 0 : 1;
1360 }
1361 
1371 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1372  KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1373 
1374  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1375 }
1376 
1387 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1388  kmp_int32 ret;
1389 
1390  KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1391 
1392  if (!TCR_4(__kmp_init_parallel))
1393  __kmp_parallel_initialize();
1394 
1395  if (__kmp_env_consistency_check) {
1396  if (loc == 0) {
1397  KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1398  }
1399  __kmp_check_barrier(global_tid, ct_barrier, loc);
1400  }
1401 
1402 #if USE_ITT_NOTIFY
1403  __kmp_threads[global_tid]->th.th_ident = loc;
1404 #endif
1405  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1406 
1407  ret = __kmpc_master(loc, global_tid);
1408 
1409  if (__kmp_env_consistency_check) {
1410  /* there's no __kmpc_end_master called; so the (stats) */
1411  /* actions of __kmpc_end_master are done here */
1412 
1413  if (global_tid < 0) {
1414  KMP_WARNING(ThreadIdentInvalid);
1415  }
1416  if (ret) {
1417  /* only one thread should do the pop since only */
1418  /* one did the push (see __kmpc_master()) */
1419 
1420  __kmp_pop_sync(global_tid, ct_master, loc);
1421  }
1422  }
1423 
1424  return (ret);
1425 }
1426 
1427 /* The BARRIER for a SINGLE process section is always explicit */
1439 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1440  kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1441 
1442  if (rc) {
1443  // We are going to execute the single statement, so we should count it.
1444  KMP_COUNT_BLOCK(OMP_SINGLE);
1445  KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1446  }
1447 
1448 #if OMPT_SUPPORT && OMPT_TRACE
1449  kmp_info_t *this_thr = __kmp_threads[global_tid];
1450  kmp_team_t *team = this_thr->th.th_team;
1451  int tid = __kmp_tid_from_gtid(global_tid);
1452 
1453  if (ompt_enabled) {
1454  if (rc) {
1455  if (ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)) {
1456  ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)(
1457  team->t.ompt_team_info.parallel_id,
1458  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id,
1459  team->t.ompt_team_info.microtask);
1460  }
1461  } else {
1462  if (ompt_callbacks.ompt_callback(ompt_event_single_others_begin)) {
1463  ompt_callbacks.ompt_callback(ompt_event_single_others_begin)(
1464  team->t.ompt_team_info.parallel_id,
1465  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1466  }
1467  this_thr->th.ompt_thread_info.state = ompt_state_wait_single;
1468  }
1469  }
1470 #endif
1471 
1472  return rc;
1473 }
1474 
1484 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1485  __kmp_exit_single(global_tid);
1486  KMP_POP_PARTITIONED_TIMER();
1487 
1488 #if OMPT_SUPPORT && OMPT_TRACE
1489  kmp_info_t *this_thr = __kmp_threads[global_tid];
1490  kmp_team_t *team = this_thr->th.th_team;
1491  int tid = __kmp_tid_from_gtid(global_tid);
1492 
1493  if (ompt_enabled &&
1494  ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)) {
1495  ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)(
1496  team->t.ompt_team_info.parallel_id,
1497  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1498  }
1499 #endif
1500 }
1501 
1509 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1510  KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1511 
1512 #if OMPT_SUPPORT && OMPT_TRACE
1513  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_end)) {
1514  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1515  ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1516  ompt_callbacks.ompt_callback(ompt_event_loop_end)(team_info->parallel_id,
1517  task_info->task_id);
1518  }
1519 #endif
1520 
1521  if (__kmp_env_consistency_check)
1522  __kmp_pop_workshare(global_tid, ct_pdo, loc);
1523 }
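// Illustrative lowering sketch (not part of the RTL) for a statically
// scheduled worksharing loop: the matching __kmpc_for_static_init_4 entry
// point is defined in kmp_sched.cpp; names are hypothetical and 'loc' stands
// for the compiler-emitted ident_t:
#if 0
void example_static_for(ident_t *loc, int *a, int n) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  kmp_int32 lower = 0, upper = n - 1, stride = 1, last = 0;
  // narrows [lower, upper] to this thread's share of the iteration space
  __kmpc_for_static_init_4(loc, gtid, kmp_sch_static, &last, &lower, &upper,
                           &stride, /*incr=*/1, /*chunk=*/1);
  for (int i = lower; i <= upper; ++i)
    a[i] = i;
  __kmpc_for_static_fini(loc, gtid);
  __kmpc_barrier(loc, gtid); // implicit barrier (absent with nowait)
}
#endif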
1524 
1525 // User routines which take C-style arguments (call by value), as opposed to
1526 // the Fortran-style equivalent routines
1527 
1528 void ompc_set_num_threads(int arg) {
1529  // !!!!! TODO: check the per-task binding
1530  __kmp_set_num_threads(arg, __kmp_entry_gtid());
1531 }
1532 
1533 void ompc_set_dynamic(int flag) {
1534  kmp_info_t *thread;
1535 
1536  /* For the thread-private implementation of the internal controls */
1537  thread = __kmp_entry_thread();
1538 
1539  __kmp_save_internal_controls(thread);
1540 
1541  set__dynamic(thread, flag ? TRUE : FALSE);
1542 }
1543 
1544 void ompc_set_nested(int flag) {
1545  kmp_info_t *thread;
1546 
1547  /* For the thread-private internal controls implementation */
1548  thread = __kmp_entry_thread();
1549 
1550  __kmp_save_internal_controls(thread);
1551 
1552  set__nested(thread, flag ? TRUE : FALSE);
1553 }
1554 
1555 void ompc_set_max_active_levels(int max_active_levels) {
1556  /* TO DO */
1557  /* we want per-task implementation of this internal control */
1558 
1559  /* For the per-thread internal controls implementation */
1560  __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1561 }
1562 
1563 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1564  // !!!!! TODO: check the per-task binding
1565  __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1566 }
1567 
1568 int ompc_get_ancestor_thread_num(int level) {
1569  return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1570 }
1571 
1572 int ompc_get_team_size(int level) {
1573  return __kmp_get_team_size(__kmp_entry_gtid(), level);
1574 }
1575 
1576 void kmpc_set_stacksize(int arg) {
1577  // __kmp_aux_set_stacksize initializes the library if needed
1578  __kmp_aux_set_stacksize(arg);
1579 }
1580 
1581 void kmpc_set_stacksize_s(size_t arg) {
1582  // __kmp_aux_set_stacksize initializes the library if needed
1583  __kmp_aux_set_stacksize(arg);
1584 }
1585 
1586 void kmpc_set_blocktime(int arg) {
1587  int gtid, tid;
1588  kmp_info_t *thread;
1589 
1590  gtid = __kmp_entry_gtid();
1591  tid = __kmp_tid_from_gtid(gtid);
1592  thread = __kmp_thread_from_gtid(gtid);
1593 
1594  __kmp_aux_set_blocktime(arg, thread, tid);
1595 }
1596 
1597 void kmpc_set_library(int arg) {
1598  // __kmp_user_set_library initializes the library if needed
1599  __kmp_user_set_library((enum library_type)arg);
1600 }
1601 
1602 void kmpc_set_defaults(char const *str) {
1603  // __kmp_aux_set_defaults initializes the library if needed
1604  __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1605 }
1606 
1607 void kmpc_set_disp_num_buffers(int arg) {
1608  // ignore after initialization because some teams have already
1609  // allocated dispatch buffers
1610  if (__kmp_init_serial == 0 && arg > 0)
1611  __kmp_dispatch_num_buffers = arg;
1612 }
1613 
1614 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1615 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1616  return -1;
1617 #else
1618  if (!TCR_4(__kmp_init_middle)) {
1619  __kmp_middle_initialize();
1620  }
1621  return __kmp_aux_set_affinity_mask_proc(proc, mask);
1622 #endif
1623 }
1624 
1625 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
1626 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1627  return -1;
1628 #else
1629  if (!TCR_4(__kmp_init_middle)) {
1630  __kmp_middle_initialize();
1631  }
1632  return __kmp_aux_unset_affinity_mask_proc(proc, mask);
1633 #endif
1634 }
1635 
1636 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
1637 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1638  return -1;
1639 #else
1640  if (!TCR_4(__kmp_init_middle)) {
1641  __kmp_middle_initialize();
1642  }
1643  return __kmp_aux_get_affinity_mask_proc(proc, mask);
1644 #endif
1645 }
1646 
1647 /* -------------------------------------------------------------------------- */
1692 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
1693  void *cpy_data, void (*cpy_func)(void *, void *),
1694  kmp_int32 didit) {
1695  void **data_ptr;
1696 
1697  KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
1698 
1699  KMP_MB();
1700 
1701  data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
1702 
1703  if (__kmp_env_consistency_check) {
1704  if (loc == 0) {
1705  KMP_WARNING(ConstructIdentInvalid);
1706  }
1707  }
1708 
1709  // ToDo: Optimize the following two barriers into some kind of split barrier
1710 
1711  if (didit)
1712  *data_ptr = cpy_data;
1713 
1714 /* This barrier is not a barrier region boundary */
1715 #if USE_ITT_NOTIFY
1716  __kmp_threads[gtid]->th.th_ident = loc;
1717 #endif
1718  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1719 
1720  if (!didit)
1721  (*cpy_func)(cpy_data, *data_ptr);
1722 
1723 // Consider next barrier a user-visible barrier for barrier region boundaries
1724 // Nesting checks are already handled by the single construct checks
1725 
1726 #if USE_ITT_NOTIFY
1727  __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
1728 // tasks can overwrite the location)
1729 #endif
1730  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1731 }
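// Illustrative lowering sketch (not part of the RTL) for
// "#pragma omp single copyprivate(x)": the compiler builds a list of the
// copyprivate variables' addresses plus a copy routine, and passes the didit
// flag returned by __kmpc_single; names are hypothetical and 'loc' stands for
// the compiler-emitted ident_t:
#if 0
// Copies the broadcast data (src) into the private copies of a thread that
// did not execute the single block (dst).
static void example_copy_func(void *dst, void *src) {
  *(int *)((void **)dst)[0] = *(int *)((void **)src)[0];
}
void example_single_copyprivate(ident_t *loc, int *x) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  void *cpy_data[1] = {x}; // addresses of the copyprivate variables
  kmp_int32 didit = __kmpc_single(loc, gtid);
  if (didit) {
    *x = 42; /* single body computes the value to broadcast */
    __kmpc_end_single(loc, gtid);
  }
  __kmpc_copyprivate(loc, gtid, sizeof(cpy_data), cpy_data, example_copy_func,
                     didit);
}
#endif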
1732 
1733 /* -------------------------------------------------------------------------- */
1734 
1735 #define INIT_LOCK __kmp_init_user_lock_with_checks
1736 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
1737 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
1738 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
1739 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
1740 #define ACQUIRE_NESTED_LOCK_TIMED \
1741  __kmp_acquire_nested_user_lock_with_checks_timed
1742 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
1743 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
1744 #define TEST_LOCK __kmp_test_user_lock_with_checks
1745 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
1746 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
1747 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
1748 
1749 // TODO: Make check abort messages use location info & pass it into
1750 // with_checks routines
1751 
1752 #if KMP_USE_DYNAMIC_LOCK
1753 
1754 // internal lock initializer
1755 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
1756  kmp_dyna_lockseq_t seq) {
1757  if (KMP_IS_D_LOCK(seq)) {
1758  KMP_INIT_D_LOCK(lock, seq);
1759 #if USE_ITT_BUILD
1760  __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
1761 #endif
1762  } else {
1763  KMP_INIT_I_LOCK(lock, seq);
1764 #if USE_ITT_BUILD
1765  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
1766  __kmp_itt_lock_creating(ilk->lock, loc);
1767 #endif
1768  }
1769 }
1770 
1771 // internal nest lock initializer
1772 static __forceinline void
1773 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
1774  kmp_dyna_lockseq_t seq) {
1775 #if KMP_USE_TSX
1776  // Don't have nested lock implementation for speculative locks
1777  if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
1778  seq = __kmp_user_lock_seq;
1779 #endif
1780  switch (seq) {
1781  case lockseq_tas:
1782  seq = lockseq_nested_tas;
1783  break;
1784 #if KMP_USE_FUTEX
1785  case lockseq_futex:
1786  seq = lockseq_nested_futex;
1787  break;
1788 #endif
1789  case lockseq_ticket:
1790  seq = lockseq_nested_ticket;
1791  break;
1792  case lockseq_queuing:
1793  seq = lockseq_nested_queuing;
1794  break;
1795  case lockseq_drdpa:
1796  seq = lockseq_nested_drdpa;
1797  break;
1798  default:
1799  seq = lockseq_nested_queuing;
1800  }
1801  KMP_INIT_I_LOCK(lock, seq);
1802 #if USE_ITT_BUILD
1803  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
1804  __kmp_itt_lock_creating(ilk->lock, loc);
1805 #endif
1806 }
1807 
1808 /* initialize the lock with a hint */
1809 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
1810  uintptr_t hint) {
1811  KMP_DEBUG_ASSERT(__kmp_init_serial);
1812  if (__kmp_env_consistency_check && user_lock == NULL) {
1813  KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
1814  }
1815 
1816  __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
1817 }
1818 
1819 /* initialize the lock with a hint */
1820 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
1821  void **user_lock, uintptr_t hint) {
1822  KMP_DEBUG_ASSERT(__kmp_init_serial);
1823  if (__kmp_env_consistency_check && user_lock == NULL) {
1824  KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
1825  }
1826 
1827  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
1828 }
1829 
1830 #endif // KMP_USE_DYNAMIC_LOCK
1831 
1832 /* initialize the lock */
1833 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
1834 #if KMP_USE_DYNAMIC_LOCK
1835 
1836  KMP_DEBUG_ASSERT(__kmp_init_serial);
1837  if (__kmp_env_consistency_check && user_lock == NULL) {
1838  KMP_FATAL(LockIsUninitialized, "omp_init_lock");
1839  }
1840  __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
1841 
1842 #else // KMP_USE_DYNAMIC_LOCK
1843 
1844  static char const *const func = "omp_init_lock";
1845  kmp_user_lock_p lck;
1846  KMP_DEBUG_ASSERT(__kmp_init_serial);
1847 
1848  if (__kmp_env_consistency_check) {
1849  if (user_lock == NULL) {
1850  KMP_FATAL(LockIsUninitialized, func);
1851  }
1852  }
1853 
1854  KMP_CHECK_USER_LOCK_INIT();
1855 
1856  if ((__kmp_user_lock_kind == lk_tas) &&
1857  (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
1858  lck = (kmp_user_lock_p)user_lock;
1859  }
1860 #if KMP_USE_FUTEX
1861  else if ((__kmp_user_lock_kind == lk_futex) &&
1862  (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
1863  lck = (kmp_user_lock_p)user_lock;
1864  }
1865 #endif
1866  else {
1867  lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
1868  }
1869  INIT_LOCK(lck);
1870  __kmp_set_user_lock_location(lck, loc);
1871 
1872 #if OMPT_SUPPORT && OMPT_TRACE
1873  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_init_lock)) {
1874  ompt_callbacks.ompt_callback(ompt_event_init_lock)((uint64_t)lck);
1875  }
1876 #endif
1877 
1878 #if USE_ITT_BUILD
1879  __kmp_itt_lock_creating(lck);
1880 #endif /* USE_ITT_BUILD */
1881 
1882 #endif // KMP_USE_DYNAMIC_LOCK
1883 } // __kmpc_init_lock
1884 
1885 /* initialize the lock */
1886 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
1887 #if KMP_USE_DYNAMIC_LOCK
1888 
1889  KMP_DEBUG_ASSERT(__kmp_init_serial);
1890  if (__kmp_env_consistency_check && user_lock == NULL) {
1891  KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
1892  }
1893  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
1894 
1895 #else // KMP_USE_DYNAMIC_LOCK
1896 
1897  static char const *const func = "omp_init_nest_lock";
1898  kmp_user_lock_p lck;
1899  KMP_DEBUG_ASSERT(__kmp_init_serial);
1900 
1901  if (__kmp_env_consistency_check) {
1902  if (user_lock == NULL) {
1903  KMP_FATAL(LockIsUninitialized, func);
1904  }
1905  }
1906 
1907  KMP_CHECK_USER_LOCK_INIT();
1908 
1909  if ((__kmp_user_lock_kind == lk_tas) &&
1910  (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
1911  OMP_NEST_LOCK_T_SIZE)) {
1912  lck = (kmp_user_lock_p)user_lock;
1913  }
1914 #if KMP_USE_FUTEX
1915  else if ((__kmp_user_lock_kind == lk_futex) &&
1916  (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
1917  OMP_NEST_LOCK_T_SIZE)) {
1918  lck = (kmp_user_lock_p)user_lock;
1919  }
1920 #endif
1921  else {
1922  lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
1923  }
1924 
1925  INIT_NESTED_LOCK(lck);
1926  __kmp_set_user_lock_location(lck, loc);
1927 
1928 #if OMPT_SUPPORT && OMPT_TRACE
1929  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_init_nest_lock)) {
1930  ompt_callbacks.ompt_callback(ompt_event_init_nest_lock)((uint64_t)lck);
1931  }
1932 #endif
1933 
1934 #if USE_ITT_BUILD
1935  __kmp_itt_lock_creating(lck);
1936 #endif /* USE_ITT_BUILD */
1937 
1938 #endif // KMP_USE_DYNAMIC_LOCK
1939 } // __kmpc_init_nest_lock
1940 
1941 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
1942 #if KMP_USE_DYNAMIC_LOCK
1943 
1944 #if USE_ITT_BUILD
1945  kmp_user_lock_p lck;
1946  if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
1947  lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
1948  } else {
1949  lck = (kmp_user_lock_p)user_lock;
1950  }
1951  __kmp_itt_lock_destroyed(lck);
1952 #endif
1953  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
1954 #else
1955  kmp_user_lock_p lck;
1956 
1957  if ((__kmp_user_lock_kind == lk_tas) &&
1958  (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
1959  lck = (kmp_user_lock_p)user_lock;
1960  }
1961 #if KMP_USE_FUTEX
1962  else if ((__kmp_user_lock_kind == lk_futex) &&
1963  (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
1964  lck = (kmp_user_lock_p)user_lock;
1965  }
1966 #endif
1967  else {
1968  lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
1969  }
1970 
1971 #if OMPT_SUPPORT && OMPT_TRACE
1972  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_destroy_lock)) {
1973  ompt_callbacks.ompt_callback(ompt_event_destroy_lock)((uint64_t)lck);
1974  }
1975 #endif
1976 
1977 #if USE_ITT_BUILD
1978  __kmp_itt_lock_destroyed(lck);
1979 #endif /* USE_ITT_BUILD */
1980  DESTROY_LOCK(lck);
1981 
1982  if ((__kmp_user_lock_kind == lk_tas) &&
1983  (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
1984  ;
1985  }
1986 #if KMP_USE_FUTEX
1987  else if ((__kmp_user_lock_kind == lk_futex) &&
1988  (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
1989  ;
1990  }
1991 #endif
1992  else {
1993  __kmp_user_lock_free(user_lock, gtid, lck);
1994  }
1995 #endif // KMP_USE_DYNAMIC_LOCK
1996 } // __kmpc_destroy_lock
1997 
1998 /* destroy the lock */
1999 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2000 #if KMP_USE_DYNAMIC_LOCK
2001 
2002 #if USE_ITT_BUILD
2003  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2004  __kmp_itt_lock_destroyed(ilk->lock);
2005 #endif
2006  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2007 
2008 #else // KMP_USE_DYNAMIC_LOCK
2009 
2010  kmp_user_lock_p lck;
2011 
2012  if ((__kmp_user_lock_kind == lk_tas) &&
2013  (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2014  OMP_NEST_LOCK_T_SIZE)) {
2015  lck = (kmp_user_lock_p)user_lock;
2016  }
2017 #if KMP_USE_FUTEX
2018  else if ((__kmp_user_lock_kind == lk_futex) &&
2019  (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2020  OMP_NEST_LOCK_T_SIZE)) {
2021  lck = (kmp_user_lock_p)user_lock;
2022  }
2023 #endif
2024  else {
2025  lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2026  }
2027 
2028 #if OMPT_SUPPORT && OMPT_TRACE
2029  if (ompt_enabled &&
2030  ompt_callbacks.ompt_callback(ompt_event_destroy_nest_lock)) {
2031  ompt_callbacks.ompt_callback(ompt_event_destroy_nest_lock)((uint64_t)lck);
2032  }
2033 #endif
2034 
2035 #if USE_ITT_BUILD
2036  __kmp_itt_lock_destroyed(lck);
2037 #endif /* USE_ITT_BUILD */
2038 
2039  DESTROY_NESTED_LOCK(lck);
2040 
2041  if ((__kmp_user_lock_kind == lk_tas) &&
2042  (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2043  OMP_NEST_LOCK_T_SIZE)) {
2044  ;
2045  }
2046 #if KMP_USE_FUTEX
2047  else if ((__kmp_user_lock_kind == lk_futex) &&
2048  (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2049  OMP_NEST_LOCK_T_SIZE)) {
2050  ;
2051  }
2052 #endif
2053  else {
2054  __kmp_user_lock_free(user_lock, gtid, lck);
2055  }
2056 #endif // KMP_USE_DYNAMIC_LOCK
2057 } // __kmpc_destroy_nest_lock
2058 
2059 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2060  KMP_COUNT_BLOCK(OMP_set_lock);
2061 #if KMP_USE_DYNAMIC_LOCK
2062  int tag = KMP_EXTRACT_D_TAG(user_lock);
2063 #if USE_ITT_BUILD
2064  __kmp_itt_lock_acquiring(
2065  (kmp_user_lock_p)
2066  user_lock); // itt function will get to the right lock object.
2067 #endif
2068 #if KMP_USE_INLINED_TAS
2069  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2070  KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2071  } else
2072 #elif KMP_USE_INLINED_FUTEX
2073  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2074  KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2075  } else
2076 #endif
2077  {
2078  __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2079  }
2080 #if USE_ITT_BUILD
2081  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2082 #endif
2083 
2084 #else // KMP_USE_DYNAMIC_LOCK
2085 
2086  kmp_user_lock_p lck;
2087 
2088  if ((__kmp_user_lock_kind == lk_tas) &&
2089  (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2090  lck = (kmp_user_lock_p)user_lock;
2091  }
2092 #if KMP_USE_FUTEX
2093  else if ((__kmp_user_lock_kind == lk_futex) &&
2094  (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2095  lck = (kmp_user_lock_p)user_lock;
2096  }
2097 #endif
2098  else {
2099  lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2100  }
2101 
2102 #if USE_ITT_BUILD
2103  __kmp_itt_lock_acquiring(lck);
2104 #endif /* USE_ITT_BUILD */
2105 
2106  ACQUIRE_LOCK(lck, gtid);
2107 
2108 #if USE_ITT_BUILD
2109  __kmp_itt_lock_acquired(lck);
2110 #endif /* USE_ITT_BUILD */
2111 
2112 #if OMPT_SUPPORT && OMPT_TRACE
2113  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_acquired_lock)) {
2114  ompt_callbacks.ompt_callback(ompt_event_acquired_lock)((uint64_t)lck);
2115  }
2116 #endif
2117 
2118 #endif // KMP_USE_DYNAMIC_LOCK
2119 }
2120 
2121 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2122 #if KMP_USE_DYNAMIC_LOCK
2123 
2124 #if USE_ITT_BUILD
2125  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2126 #endif
2127  KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2128 #if USE_ITT_BUILD
2129  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2130 #endif
2131 
2132 #if OMPT_SUPPORT && OMPT_TRACE
2133  if (ompt_enabled) {
2134  // missing support here: need to know whether acquired first or not
2135  }
2136 #endif
2137 
2138 #else // KMP_USE_DYNAMIC_LOCK
2139  int acquire_status;
2140  kmp_user_lock_p lck;
2141 
2142  if ((__kmp_user_lock_kind == lk_tas) &&
2143  (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2144  OMP_NEST_LOCK_T_SIZE)) {
2145  lck = (kmp_user_lock_p)user_lock;
2146  }
2147 #if KMP_USE_FUTEX
2148  else if ((__kmp_user_lock_kind == lk_futex) &&
2149  (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2150  OMP_NEST_LOCK_T_SIZE)) {
2151  lck = (kmp_user_lock_p)user_lock;
2152  }
2153 #endif
2154  else {
2155  lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2156  }
2157 
2158 #if USE_ITT_BUILD
2159  __kmp_itt_lock_acquiring(lck);
2160 #endif /* USE_ITT_BUILD */
2161 
2162  ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2163 
2164 #if USE_ITT_BUILD
2165  __kmp_itt_lock_acquired(lck);
2166 #endif /* USE_ITT_BUILD */
2167 
2168 #if OMPT_SUPPORT && OMPT_TRACE
2169  if (ompt_enabled) {
2170  if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2171  if (ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_first))
2172  ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_first)(
2173  (uint64_t)lck);
2174  } else {
2175  if (ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_next))
2176  ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_next)(
2177  (uint64_t)lck);
2178  }
2179  }
2180 #endif
2181 
2182 #endif // KMP_USE_DYNAMIC_LOCK
2183 }
2184 
2185 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2186 #if KMP_USE_DYNAMIC_LOCK
2187 
2188  int tag = KMP_EXTRACT_D_TAG(user_lock);
2189 #if USE_ITT_BUILD
2190  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2191 #endif
2192 #if KMP_USE_INLINED_TAS
2193  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2194  KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2195  } else
2196 #elif KMP_USE_INLINED_FUTEX
2197  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2198  KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2199  } else
2200 #endif
2201  {
2202  __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2203  }
2204 
2205 #else // KMP_USE_DYNAMIC_LOCK
2206 
2207  kmp_user_lock_p lck;
2208 
2209  /* Can't use serial interval since not block structured */
2210  /* release the lock */
2211 
2212  if ((__kmp_user_lock_kind == lk_tas) &&
2213  (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2214 #if KMP_OS_LINUX && \
2215  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2216 // "fast" path implemented to fix customer performance issue
2217 #if USE_ITT_BUILD
2218  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2219 #endif /* USE_ITT_BUILD */
2220  TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2221  KMP_MB();
2222  return;
2223 #else
2224  lck = (kmp_user_lock_p)user_lock;
2225 #endif
2226  }
2227 #if KMP_USE_FUTEX
2228  else if ((__kmp_user_lock_kind == lk_futex) &&
2229  (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2230  lck = (kmp_user_lock_p)user_lock;
2231  }
2232 #endif
2233  else {
2234  lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2235  }
2236 
2237 #if USE_ITT_BUILD
2238  __kmp_itt_lock_releasing(lck);
2239 #endif /* USE_ITT_BUILD */
2240 
2241  RELEASE_LOCK(lck, gtid);
2242 
2243 #if OMPT_SUPPORT && OMPT_BLAME
2244  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_release_lock)) {
2245  ompt_callbacks.ompt_callback(ompt_event_release_lock)((uint64_t)lck);
2246  }
2247 #endif
2248 
2249 #endif // KMP_USE_DYNAMIC_LOCK
2250 }
2251 
2252 /* release the lock */
2253 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2254 #if KMP_USE_DYNAMIC_LOCK
2255 
2256 #if USE_ITT_BUILD
2257  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2258 #endif
2259  KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2260 
2261 #else // KMP_USE_DYNAMIC_LOCK
2262 
2263  kmp_user_lock_p lck;
2264 
2265  /* Can't use serial interval since not block structured */
2266 
2267  if ((__kmp_user_lock_kind == lk_tas) &&
2268  (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2269  OMP_NEST_LOCK_T_SIZE)) {
2270 #if KMP_OS_LINUX && \
2271  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2272  // "fast" path implemented to fix customer performance issue
2273  kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2274 #if USE_ITT_BUILD
2275  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2276 #endif /* USE_ITT_BUILD */
2277  if (--(tl->lk.depth_locked) == 0) {
2278  TCW_4(tl->lk.poll, 0);
2279  }
2280  KMP_MB();
2281  return;
2282 #else
2283  lck = (kmp_user_lock_p)user_lock;
2284 #endif
2285  }
2286 #if KMP_USE_FUTEX
2287  else if ((__kmp_user_lock_kind == lk_futex) &&
2288  (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2289  OMP_NEST_LOCK_T_SIZE)) {
2290  lck = (kmp_user_lock_p)user_lock;
2291  }
2292 #endif
2293  else {
2294  lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2295  }
2296 
2297 #if USE_ITT_BUILD
2298  __kmp_itt_lock_releasing(lck);
2299 #endif /* USE_ITT_BUILD */
2300 
2301  int release_status;
2302  release_status = RELEASE_NESTED_LOCK(lck, gtid);
2303 #if OMPT_SUPPORT && OMPT_BLAME
2304  if (ompt_enabled) {
2305  if (release_status == KMP_LOCK_RELEASED) {
2306  if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)) {
2307  ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)(
2308  (uint64_t)lck);
2309  }
2310  } else if (ompt_callbacks.ompt_callback(
2311  ompt_event_release_nest_lock_prev)) {
2312  ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)(
2313  (uint64_t)lck);
2314  }
2315  }
2316 #endif
2317 
2318 #endif // KMP_USE_DYNAMIC_LOCK
2319 }
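// The nestable-lock entry points above rely on the depth_locked counter: the
// owning thread may re-acquire the lock, and the release in
// __kmpc_unset_nest_lock only clears the poll word once the depth returns to
// zero. A minimal user-level sketch of that behavior follows; it assumes the
// compiler routes the omp_*_nest_lock() calls to these __kmpc_* entry points
// (some compilers call the library's omp_* exports directly instead), so it is
// an illustration of the semantics rather than of a particular lowering.
#if 0 /* usage sketch only -- not compiled as part of the runtime */
#include <omp.h>

static omp_nest_lock_t nl;

static void step(int depth) {
  omp_set_nest_lock(&nl); /* owner may re-acquire; nesting depth grows */
  if (depth > 0)
    step(depth - 1); /* recursive acquisition is legal for nestable locks */
  omp_unset_nest_lock(&nl); /* released only when depth drops back to zero */
}

int main(void) {
  omp_init_nest_lock(&nl);
#pragma omp parallel num_threads(4)
  step(3); /* each thread nests the lock four levels deep */
  omp_destroy_nest_lock(&nl);
  return 0;
}
#endif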
2320 
2321 /* try to acquire the lock */
2322 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2323  KMP_COUNT_BLOCK(OMP_test_lock);
2324 
2325 #if KMP_USE_DYNAMIC_LOCK
2326  int rc;
2327  int tag = KMP_EXTRACT_D_TAG(user_lock);
2328 #if USE_ITT_BUILD
2329  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2330 #endif
2331 #if KMP_USE_INLINED_TAS
2332  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2333  KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2334  } else
2335 #elif KMP_USE_INLINED_FUTEX
2336  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2337  KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2338  } else
2339 #endif
2340  {
2341  rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2342  }
2343  if (rc) {
2344 #if USE_ITT_BUILD
2345  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2346 #endif
2347  return FTN_TRUE;
2348  } else {
2349 #if USE_ITT_BUILD
2350  __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2351 #endif
2352  return FTN_FALSE;
2353  }
2354 
2355 #else // KMP_USE_DYNAMIC_LOCK
2356 
2357  kmp_user_lock_p lck;
2358  int rc;
2359 
2360  if ((__kmp_user_lock_kind == lk_tas) &&
2361  (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2362  lck = (kmp_user_lock_p)user_lock;
2363  }
2364 #if KMP_USE_FUTEX
2365  else if ((__kmp_user_lock_kind == lk_futex) &&
2366  (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2367  lck = (kmp_user_lock_p)user_lock;
2368  }
2369 #endif
2370  else {
2371  lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
2372  }
2373 
2374 #if USE_ITT_BUILD
2375  __kmp_itt_lock_acquiring(lck);
2376 #endif /* USE_ITT_BUILD */
2377 
2378  rc = TEST_LOCK(lck, gtid);
2379 #if USE_ITT_BUILD
2380  if (rc) {
2381  __kmp_itt_lock_acquired(lck);
2382  } else {
2383  __kmp_itt_lock_cancelled(lck);
2384  }
2385 #endif /* USE_ITT_BUILD */
2386  return (rc ? FTN_TRUE : FTN_FALSE);
2387 
2388 /* Can't use serial interval since not block structured */
2389 
2390 #endif // KMP_USE_DYNAMIC_LOCK
2391 }
2392 
2393 /* try to acquire the lock */
2394 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2395 #if KMP_USE_DYNAMIC_LOCK
2396  int rc;
2397 #if USE_ITT_BUILD
2398  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2399 #endif
2400  rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
2401 #if USE_ITT_BUILD
2402  if (rc) {
2403  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2404  } else {
2405  __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2406  }
2407 #endif
2408  return rc;
2409 
2410 #else // KMP_USE_DYNAMIC_LOCK
2411 
2412  kmp_user_lock_p lck;
2413  int rc;
2414 
2415  if ((__kmp_user_lock_kind == lk_tas) &&
2416  (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2417  OMP_NEST_LOCK_T_SIZE)) {
2418  lck = (kmp_user_lock_p)user_lock;
2419  }
2420 #if KMP_USE_FUTEX
2421  else if ((__kmp_user_lock_kind == lk_futex) &&
2422  (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2423  OMP_NEST_LOCK_T_SIZE)) {
2424  lck = (kmp_user_lock_p)user_lock;
2425  }
2426 #endif
2427  else {
2428  lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
2429  }
2430 
2431 #if USE_ITT_BUILD
2432  __kmp_itt_lock_acquiring(lck);
2433 #endif /* USE_ITT_BUILD */
2434 
2435  rc = TEST_NESTED_LOCK(lck, gtid);
2436 #if USE_ITT_BUILD
2437  if (rc) {
2438  __kmp_itt_lock_acquired(lck);
2439  } else {
2440  __kmp_itt_lock_cancelled(lck);
2441  }
2442 #endif /* USE_ITT_BUILD */
2443  return rc;
2444 
2445 /* Can't use serial interval since not block structured */
2446 
2447 #endif // KMP_USE_DYNAMIC_LOCK
2448 }
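// __kmpc_test_lock() returns FTN_TRUE only when the lock was actually
// acquired, which lets the caller keep doing useful work instead of blocking.
// A short user-level sketch of that pattern; the names (result_lock,
// shared_total, flush_or_defer) are illustrative, and whether omp_test_lock()
// reaches this entry point or the library's omp_* export depends on the
// compiler.
#if 0 /* usage sketch only -- not compiled as part of the runtime */
#include <omp.h>

static omp_lock_t result_lock;
static long shared_total;

static void flush_or_defer(long *pending) {
  if (omp_test_lock(&result_lock)) { /* nonzero: the lock was acquired */
    shared_total += *pending;        /* protected update */
    *pending = 0;
    omp_unset_lock(&result_lock);
  } /* zero: lock busy -- keep the value locally and try again later */
}

int main(void) {
  omp_init_lock(&result_lock);
#pragma omp parallel
  {
    long pending = 1;
    while (pending) /* retry until the deferred update has been flushed */
      flush_or_defer(&pending);
  }
  omp_destroy_lock(&result_lock);
  return 0;
}
#endif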
2449 
2450 // Interface to fast scalable reduce methods routines
2451 
2452 // keep the selected method in a thread-local structure for cross-function
2453 // use: it will be read again in the __kmpc_end_reduce* functions;
2454 // an alternative would be to re-determine the method once more in the
2455 // __kmpc_end_reduce* functions (a new prototype would be required then)
2456 // AT: which solution is better?
2457 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod) \
2458  ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
2459 
2460 #define __KMP_GET_REDUCTION_METHOD(gtid) \
2461  (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
2462 
2463 // description of the packed_reduction_method variable: look at the macros in
2464 // kmp.h
2465 
2466 // used in a critical section reduce block
2467 static __forceinline void
2468 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
2469  kmp_critical_name *crit) {
2470 
2471  // this lock was visible to a customer and to the threading profile tool as a
2472  // serial overhead span (although it is used for an internal purpose only)
2473  // why was it visible in the previous implementation?
2474  // should we keep it visible in the new reduce block?
2475  kmp_user_lock_p lck;
2476 
2477 #if KMP_USE_DYNAMIC_LOCK
2478 
2479  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
2480  // Check if it is initialized.
2481  if (*lk == 0) {
2482  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
2483  KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
2484  KMP_GET_D_TAG(__kmp_user_lock_seq));
2485  } else {
2486  __kmp_init_indirect_csptr(crit, loc, global_tid,
2487  KMP_GET_I_TAG(__kmp_user_lock_seq));
2488  }
2489  }
2490  // Branch for accessing the actual lock object and set operation. This
2491  // branching is inevitable since this lock initialization does not follow the
2492  // normal dispatch path (lock table is not used).
2493  if (KMP_EXTRACT_D_TAG(lk) != 0) {
2494  lck = (kmp_user_lock_p)lk;
2495  KMP_DEBUG_ASSERT(lck != NULL);
2496  if (__kmp_env_consistency_check) {
2497  __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
2498  }
2499  KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
2500  } else {
2501  kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
2502  lck = ilk->lock;
2503  KMP_DEBUG_ASSERT(lck != NULL);
2504  if (__kmp_env_consistency_check) {
2505  __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
2506  }
2507  KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
2508  }
2509 
2510 #else // KMP_USE_DYNAMIC_LOCK
2511 
2512  // We know that the fast reduction code is only emitted by Intel compilers
2513  // with 32 byte critical sections. If there isn't enough space, then we
2514  // have to use a pointer.
2515  if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
2516  lck = (kmp_user_lock_p)crit;
2517  } else {
2518  lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
2519  }
2520  KMP_DEBUG_ASSERT(lck != NULL);
2521 
2522  if (__kmp_env_consistency_check)
2523  __kmp_push_sync(global_tid, ct_critical, loc, lck);
2524 
2525  __kmp_acquire_user_lock_with_checks(lck, global_tid);
2526 
2527 #endif // KMP_USE_DYNAMIC_LOCK
2528 }
2529 
2530 // used in a critical section reduce block
2531 static __forceinline void
2532 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
2533  kmp_critical_name *crit) {
2534 
2535  kmp_user_lock_p lck;
2536 
2537 #if KMP_USE_DYNAMIC_LOCK
2538 
2539  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
2540  lck = (kmp_user_lock_p)crit;
2541  if (__kmp_env_consistency_check)
2542  __kmp_pop_sync(global_tid, ct_critical, loc);
2543  KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
2544  } else {
2545  kmp_indirect_lock_t *ilk =
2546  (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
2547  if (__kmp_env_consistency_check)
2548  __kmp_pop_sync(global_tid, ct_critical, loc);
2549  KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
2550  }
2551 
2552 #else // KMP_USE_DYNAMIC_LOCK
2553 
2554  // We know that the fast reduction code is only emitted by Intel compilers
2555  // with 32 byte critical sections. If there isn't enough space, then we have
2556  // to use a pointer.
2557  if (__kmp_base_user_lock_size > 32) {
2558  lck = *((kmp_user_lock_p *)crit);
2559  KMP_ASSERT(lck != NULL);
2560  } else {
2561  lck = (kmp_user_lock_p)crit;
2562  }
2563 
2564  if (__kmp_env_consistency_check)
2565  __kmp_pop_sync(global_tid, ct_critical, loc);
2566 
2567  __kmp_release_user_lock_with_checks(lck, global_tid);
2568 
2569 #endif // KMP_USE_DYNAMIC_LOCK
2570 } // __kmp_end_critical_section_reduce_block
2571 
2572 /* 2.a.i. Reduce Block without a terminating barrier */
2588 kmp_int32
2589 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
2590  size_t reduce_size, void *reduce_data,
2591  void (*reduce_func)(void *lhs_data, void *rhs_data),
2592  kmp_critical_name *lck) {
2593 
2594  KMP_COUNT_BLOCK(REDUCE_nowait);
2595  int retval = 0;
2596  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2597 #if OMP_40_ENABLED
2598  kmp_team_t *team;
2599  kmp_info_t *th;
2600  int teams_swapped = 0, task_state;
2601 #endif
2602  KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
2603 
2604  // why do we need this initialization here at all?
2605  // Reduction clause cannot be used as a stand-alone directive.
2606 
2607  // do not call __kmp_serial_initialize(), it will be called by
2608  // __kmp_parallel_initialize() if needed
2609  // possible detection of false-positive race by the threadchecker ???
2610  if (!TCR_4(__kmp_init_parallel))
2611  __kmp_parallel_initialize();
2612 
2613 // check correctness of reduce block nesting
2614 #if KMP_USE_DYNAMIC_LOCK
2615  if (__kmp_env_consistency_check)
2616  __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
2617 #else
2618  if (__kmp_env_consistency_check)
2619  __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
2620 #endif
2621 
2622 #if OMP_40_ENABLED
2623  th = __kmp_thread_from_gtid(global_tid);
2624  if (th->th.th_teams_microtask) { // AC: check if we are inside the teams
2625  // construct?
2626  team = th->th.th_team;
2627  if (team->t.t_level == th->th.th_teams_level) {
2628  // this is reduction at teams construct
2629  KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
2630  // Let's swap teams temporarily for the reduction barrier
2631  teams_swapped = 1;
2632  th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2633  th->th.th_team = team->t.t_parent;
2634  th->th.th_team_nproc = th->th.th_team->t.t_nproc;
2635  th->th.th_task_team = th->th.th_team->t.t_task_team[0];
2636  task_state = th->th.th_task_state;
2637  th->th.th_task_state = 0;
2638  }
2639  }
2640 #endif // OMP_40_ENABLED
2641 
2642  // The packed_reduction_method value will be reused by the __kmpc_end_reduce*
2643  // functions, so it must be kept in a variable.
2644  // That variable should be a construct-specific or thread-specific property,
2645  // not a team-specific one
2646  // (a thread can reach the next reduce block on the next construct, and the
2647  // reduce method may differ on that construct).
2648  // An ident_t "loc" parameter could serve as a construct-specific property
2649  // (but what if loc == 0?)
2650  // (and if both construct-specific and team-specific variables were shared,
2651  // unnecessary extra synchronization would be needed).
2652  // A thread-specific variable is better with regard to both issues above (next
2653  // construct and extra syncs);
2654  // a thread-specific "th_local.reduction_method" variable is used currently.
2655  // Each thread executes the 'determine' and 'set' lines (there is no need to
2656  // restrict this to one thread; that would only add unnecessary extra syncs).
2657 
2658  packed_reduction_method = __kmp_determine_reduction_method(
2659  loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
2660  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
2661 
2662  if (packed_reduction_method == critical_reduce_block) {
2663 
2664  __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
2665  retval = 1;
2666 
2667  } else if (packed_reduction_method == empty_reduce_block) {
2668 
2669  // usage: if team size == 1, no synchronization is required ( Intel
2670  // platforms only )
2671  retval = 1;
2672 
2673  } else if (packed_reduction_method == atomic_reduce_block) {
2674 
2675  retval = 2;
2676 
2677  // all threads should do this pop here (because __kmpc_end_reduce_nowait()
2678  // won't be called by the code gen)
2679  // (it's not quite good, because the checking block has been closed by
2680  // this 'pop',
2681  // but atomic operation has not been executed yet, will be executed
2682  // slightly later, literally on next instruction)
2683  if (__kmp_env_consistency_check)
2684  __kmp_pop_sync(global_tid, ct_reduce, loc);
2685 
2686  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
2687  tree_reduce_block)) {
2688 
2689 // AT: performance issue: a real barrier is executed here
2690 // AT: (if the master goes slow, other threads are blocked here waiting for
2691 // the master to come and release them)
2692 // AT: (this is not what a customer might expect when specifying the NOWAIT
2693 // clause: NOWAIT brings no performance improvement here, which will be
2694 // confusing to a customer)
2695 // AT: another implementation of *barrier_gather*nowait() (or some other
2696 // design) might go faster and be more in line with the intent of NOWAIT
2697 // AT: TO DO: run the EPCC benchmark and compare times
2698 
2699 // this barrier should be invisible to a customer and to the threading profile
2700 // tool (it's neither a terminating barrier nor customer's code, it's
2701 // used for an internal purpose)
2702 #if USE_ITT_NOTIFY
2703  __kmp_threads[global_tid]->th.th_ident = loc;
2704 #endif
2705  retval =
2706  __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
2707  global_tid, FALSE, reduce_size, reduce_data, reduce_func);
2708  retval = (retval != 0) ? (0) : (1);
2709 
2710  // all other workers except master should do this pop here
2711  // ( none of other workers will get to __kmpc_end_reduce_nowait() )
2712  if (__kmp_env_consistency_check) {
2713  if (retval == 0) {
2714  __kmp_pop_sync(global_tid, ct_reduce, loc);
2715  }
2716  }
2717 
2718  } else {
2719 
2720  // should never reach this block
2721  KMP_ASSERT(0); // "unexpected method"
2722  }
2723 #if OMP_40_ENABLED
2724  if (teams_swapped) {
2725  // Restore thread structure
2726  th->th.th_info.ds.ds_tid = 0;
2727  th->th.th_team = team;
2728  th->th.th_team_nproc = team->t.t_nproc;
2729  th->th.th_task_team = team->t.t_task_team[task_state];
2730  th->th.th_task_state = task_state;
2731  }
2732 #endif
2733  KA_TRACE(
2734  10,
2735  ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
2736  global_tid, packed_reduction_method, retval));
2737 
2738  return retval;
2739 }
2740 
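// How generated code consumes the return value above: 1 means this thread
// should combine its private copy into the shared variable and then call
// __kmpc_end_reduce_nowait(); 2 means combine with an atomic update and skip
// the end call; 0 means there is nothing left to do (the tree barrier already
// consumed this thread's reduce_data). A rough schematic of what a compiler
// might emit for `reduction(+ : sum)` with nowait; the names sum, sum_priv,
// crit_name and reduce_combiner are hypothetical, loc/gtid come from the
// enclosing outlined function, and real codegen differs in detail.
#if 0 /* schematic of compiler-generated code -- not compiled as part of the runtime */
static void reduce_combiner(void *lhs, void *rhs) { /* used by the tree method */
  *(double *)lhs += *(double *)rhs;
}
/* ... at the end of the worksharing loop, sum_priv holds the thread's part: */
int how = __kmpc_reduce_nowait(loc, gtid, /*num_vars=*/1, sizeof(double),
                               (void *)&sum_priv, reduce_combiner, &crit_name);
if (how == 1) { /* critical / serialized / tree-master path */
  sum += sum_priv;
  __kmpc_end_reduce_nowait(loc, gtid, &crit_name);
} else if (how == 2) { /* atomic path: no end call is generated */
#pragma omp atomic
  sum += sum_priv;
} /* how == 0: this thread's contribution was already combined */
#endif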
2749 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
2750  kmp_critical_name *lck) {
2751 
2752  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2753 
2754  KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
2755 
2756  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
2757 
2758  if (packed_reduction_method == critical_reduce_block) {
2759 
2760  __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
2761 
2762  } else if (packed_reduction_method == empty_reduce_block) {
2763 
2764  // usage: if team size == 1, no synchronization is required ( on Intel
2765  // platforms only )
2766 
2767  } else if (packed_reduction_method == atomic_reduce_block) {
2768 
2769  // neither the master nor the other workers should get here
2770  // (code gen does not generate this call in case 2: atomic reduce block)
2771  // actually it would be better to remove this else-if entirely;
2772  // after removal this value would be caught by the 'else' and would assert
2773 
2774  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
2775  tree_reduce_block)) {
2776 
2777  // only master gets here
2778 
2779  } else {
2780 
2781  // should never reach this block
2782  KMP_ASSERT(0); // "unexpected method"
2783  }
2784 
2785  if (__kmp_env_consistency_check)
2786  __kmp_pop_sync(global_tid, ct_reduce, loc);
2787 
2788  KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
2789  global_tid, packed_reduction_method));
2790 
2791  return;
2792 }
2793 
2794 /* 2.a.ii. Reduce Block with a terminating barrier */
2795 
2811 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
2812  size_t reduce_size, void *reduce_data,
2813  void (*reduce_func)(void *lhs_data, void *rhs_data),
2814  kmp_critical_name *lck) {
2815  KMP_COUNT_BLOCK(REDUCE_wait);
2816  int retval = 0;
2817  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2818 
2819  KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
2820 
2821  // why do we need this initialization here at all?
2822  // Reduction clause cannot be used as a stand-alone directive.
2823 
2824  // do not call __kmp_serial_initialize(), it will be called by
2825  // __kmp_parallel_initialize() if needed
2826  // possible detection of false-positive race by the threadchecker ???
2827  if (!TCR_4(__kmp_init_parallel))
2828  __kmp_parallel_initialize();
2829 
2830 // check correctness of reduce block nesting
2831 #if KMP_USE_DYNAMIC_LOCK
2832  if (__kmp_env_consistency_check)
2833  __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
2834 #else
2835  if (__kmp_env_consistency_check)
2836  __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
2837 #endif
2838 
2839  packed_reduction_method = __kmp_determine_reduction_method(
2840  loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
2841  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
2842 
2843  if (packed_reduction_method == critical_reduce_block) {
2844 
2845  __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
2846  retval = 1;
2847 
2848  } else if (packed_reduction_method == empty_reduce_block) {
2849 
2850  // usage: if team size == 1, no synchronization is required ( Intel
2851  // platforms only )
2852  retval = 1;
2853 
2854  } else if (packed_reduction_method == atomic_reduce_block) {
2855 
2856  retval = 2;
2857 
2858  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
2859  tree_reduce_block)) {
2860 
2861 // case tree_reduce_block:
2862 // this barrier should be visible to a customer and to the threading profile
2863 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
2864 #if USE_ITT_NOTIFY
2865  __kmp_threads[global_tid]->th.th_ident =
2866  loc; // needed for correct notification of frames
2867 #endif
2868  retval =
2869  __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
2870  global_tid, TRUE, reduce_size, reduce_data, reduce_func);
2871  retval = (retval != 0) ? (0) : (1);
2872 
2873  // all other workers except master should do this pop here
2874  // ( none of other workers except master will enter __kmpc_end_reduce() )
2875  if (__kmp_env_consistency_check) {
2876  if (retval == 0) { // 0: all other workers; 1: master
2877  __kmp_pop_sync(global_tid, ct_reduce, loc);
2878  }
2879  }
2880 
2881  } else {
2882 
2883  // should never reach this block
2884  KMP_ASSERT(0); // "unexpected method"
2885  }
2886 
2887  KA_TRACE(10,
2888  ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
2889  global_tid, packed_reduction_method, retval));
2890 
2891  return retval;
2892 }
2893 
2904 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
2905  kmp_critical_name *lck) {
2906 
2907  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2908 
2909  KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
2910 
2911  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
2912 
2913  // this barrier should be visible to a customer and to the threading profile
2914  // tool (it's a terminating barrier on constructs if NOWAIT not specified)
2915 
2916  if (packed_reduction_method == critical_reduce_block) {
2917 
2918  __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
2919 
2920 // TODO: implicit barrier: should be exposed
2921 #if USE_ITT_NOTIFY
2922  __kmp_threads[global_tid]->th.th_ident = loc;
2923 #endif
2924  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
2925 
2926  } else if (packed_reduction_method == empty_reduce_block) {
2927 
2928 // usage: if team size==1, no synchronization is required (Intel platforms only)
2929 
2930 // TODO: implicit barrier: should be exposed
2931 #if USE_ITT_NOTIFY
2932  __kmp_threads[global_tid]->th.th_ident = loc;
2933 #endif
2934  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
2935 
2936  } else if (packed_reduction_method == atomic_reduce_block) {
2937 
2938 // TODO: implicit barrier: should be exposed
2939 #if USE_ITT_NOTIFY
2940  __kmp_threads[global_tid]->th.th_ident = loc;
2941 #endif
2942  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
2943 
2944  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
2945  tree_reduce_block)) {
2946 
2947  // only master executes here (master releases all other workers)
2948  __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
2949  global_tid);
2950 
2951  } else {
2952 
2953  // should never reach this block
2954  KMP_ASSERT(0); // "unexpected method"
2955  }
2956 
2957  if (__kmp_env_consistency_check)
2958  __kmp_pop_sync(global_tid, ct_reduce, loc);
2959 
2960  KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
2961  global_tid, packed_reduction_method));
2962 
2963  return;
2964 }
2965 
2966 #undef __KMP_GET_REDUCTION_METHOD
2967 #undef __KMP_SET_REDUCTION_METHOD
2968 
2969 /* end of interface to fast scalable reduce routines */
2970 
2971 kmp_uint64 __kmpc_get_taskid() {
2972 
2973  kmp_int32 gtid;
2974  kmp_info_t *thread;
2975 
2976  gtid = __kmp_get_gtid();
2977  if (gtid < 0) {
2978  return 0;
2979  } // if
2980  thread = __kmp_thread_from_gtid(gtid);
2981  return thread->th.th_current_task->td_task_id;
2982 
2983 } // __kmpc_get_taskid
2984 
2985 kmp_uint64 __kmpc_get_parent_taskid() {
2986 
2987  kmp_int32 gtid;
2988  kmp_info_t *thread;
2989  kmp_taskdata_t *parent_task;
2990 
2991  gtid = __kmp_get_gtid();
2992  if (gtid < 0) {
2993  return 0;
2994  } // if
2995  thread = __kmp_thread_from_gtid(gtid);
2996  parent_task = thread->th.th_current_task->td_parent;
2997  return (parent_task == NULL ? 0 : parent_task->td_task_id);
2998 
2999 } // __kmpc_get_parent_taskid
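// __kmpc_get_taskid() and __kmpc_get_parent_taskid() expose the runtime's
// internal task IDs (both return 0 when the calling thread is not known to the
// runtime); they can be handy for tracing but are not part of the public omp.h
// API. A minimal sketch, assuming the caller declares the entry points by hand
// and links against this runtime.
#if 0 /* usage sketch only -- not compiled as part of the runtime */
#include <stdint.h>
#include <stdio.h>

extern uint64_t __kmpc_get_taskid(void);        /* declared by hand here */
extern uint64_t __kmpc_get_parent_taskid(void); /* declared by hand here */

static void trace_task(const char *what) {
  printf("%s: task %llu (parent %llu)\n", what,
         (unsigned long long)__kmpc_get_taskid(),
         (unsigned long long)__kmpc_get_parent_taskid());
}
#endif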
3000 
3001 #if OMP_45_ENABLED
3002 
3013 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3014  struct kmp_dim *dims) {
3015  int j, idx;
3016  kmp_int64 last, trace_count;
3017  kmp_info_t *th = __kmp_threads[gtid];
3018  kmp_team_t *team = th->th.th_team;
3019  kmp_uint32 *flags;
3020  kmp_disp_t *pr_buf = th->th.th_dispatch;
3021  dispatch_shared_info_t *sh_buf;
3022 
3023  KA_TRACE(
3024  20,
3025  ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3026  gtid, num_dims, !team->t.t_serialized));
3027  KMP_DEBUG_ASSERT(dims != NULL);
3028  KMP_DEBUG_ASSERT(num_dims > 0);
3029 
3030  if (team->t.t_serialized) {
3031  KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3032  return; // no dependencies if team is serialized
3033  }
3034  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3035  idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3036  // the next loop
3037  sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3038 
3039  // Save bounds info into allocated private buffer
3040  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3041  pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3042  th, sizeof(kmp_int64) * (4 * num_dims + 1));
3043  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3044  pr_buf->th_doacross_info[0] =
3045  (kmp_int64)num_dims; // first element is number of dimensions
3046  // Save also address of num_done in order to access it later without knowing
3047  // the buffer index
3048  pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3049  pr_buf->th_doacross_info[2] = dims[0].lo;
3050  pr_buf->th_doacross_info[3] = dims[0].up;
3051  pr_buf->th_doacross_info[4] = dims[0].st;
3052  last = 5;
3053  for (j = 1; j < num_dims; ++j) {
3054  kmp_int64
3055  range_length; // To keep ranges of all dimensions but the first dims[0]
3056  if (dims[j].st == 1) { // most common case
3057  // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3058  range_length = dims[j].up - dims[j].lo + 1;
3059  } else {
3060  if (dims[j].st > 0) {
3061  KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3062  range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3063  } else { // negative increment
3064  KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3065  range_length =
3066  (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3067  }
3068  }
3069  pr_buf->th_doacross_info[last++] = range_length;
3070  pr_buf->th_doacross_info[last++] = dims[j].lo;
3071  pr_buf->th_doacross_info[last++] = dims[j].up;
3072  pr_buf->th_doacross_info[last++] = dims[j].st;
3073  }
3074 
3075  // Compute total trip count.
3076  // Start with range of dims[0] which we don't need to keep in the buffer.
3077  if (dims[0].st == 1) { // most common case
3078  trace_count = dims[0].up - dims[0].lo + 1;
3079  } else if (dims[0].st > 0) {
3080  KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3081  trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3082  } else { // negative increment
3083  KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3084  trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3085  }
3086  for (j = 1; j < num_dims; ++j) {
3087  trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3088  }
3089  KMP_DEBUG_ASSERT(trace_count > 0);
3090 
3091  // Check if shared buffer is not occupied by other loop (idx -
3092  // __kmp_dispatch_num_buffers)
3093  if (idx != sh_buf->doacross_buf_idx) {
3094  // Shared buffer is occupied, wait for it to be free
3095  __kmp_wait_yield_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3096  __kmp_eq_4, NULL);
3097  }
3098  // Check if we are the first thread: after the CAS the first thread gets NULL
3099  // back; others get 1 (initialization in progress) or the allocated pointer.
3100  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3101  (kmp_int64 *)&sh_buf->doacross_flags, NULL, (kmp_int64)1);
3102  if (flags == NULL) {
3103  // we are the first thread, allocate the array of flags
3104  kmp_int64 size =
3105  trace_count / 8 + 8; // in bytes, use single bit per iteration
3106  sh_buf->doacross_flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3107  } else if ((kmp_int64)flags == 1) {
3108  // initialization is still in progress, need to wait
3109  while ((volatile kmp_int64)sh_buf->doacross_flags == 1) {
3110  KMP_YIELD(TRUE);
3111  }
3112  }
3113  KMP_DEBUG_ASSERT((kmp_int64)sh_buf->doacross_flags >
3114  1); // check value of pointer
3115  pr_buf->th_doacross_flags =
3116  sh_buf->doacross_flags; // save private copy in order to not
3117  // touch shared buffer on each iteration
3118  KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
3119 }
3120 
3121 void __kmpc_doacross_wait(ident_t *loc, int gtid, long long *vec) {
3122  kmp_int32 shft, num_dims, i;
3123  kmp_uint32 flag;
3124  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3125  kmp_info_t *th = __kmp_threads[gtid];
3126  kmp_team_t *team = th->th.th_team;
3127  kmp_disp_t *pr_buf;
3128  kmp_int64 lo, up, st;
3129 
3130  KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
3131  if (team->t.t_serialized) {
3132  KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
3133  return; // no dependencies if team is serialized
3134  }
3135 
3136  // calculate sequential iteration number and check out-of-bounds condition
3137  pr_buf = th->th.th_dispatch;
3138  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3139  num_dims = pr_buf->th_doacross_info[0];
3140  lo = pr_buf->th_doacross_info[2];
3141  up = pr_buf->th_doacross_info[3];
3142  st = pr_buf->th_doacross_info[4];
3143  if (st == 1) { // most common case
3144  if (vec[0] < lo || vec[0] > up) {
3145  KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3146  "bounds [%lld,%lld]\n",
3147  gtid, vec[0], lo, up));
3148  return;
3149  }
3150  iter_number = vec[0] - lo;
3151  } else if (st > 0) {
3152  if (vec[0] < lo || vec[0] > up) {
3153  KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3154  "bounds [%lld,%lld]\n",
3155  gtid, vec[0], lo, up));
3156  return;
3157  }
3158  iter_number = (kmp_uint64)(vec[0] - lo) / st;
3159  } else { // negative increment
3160  if (vec[0] > lo || vec[0] < up) {
3161  KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3162  "bounds [%lld,%lld]\n",
3163  gtid, vec[0], lo, up));
3164  return;
3165  }
3166  iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
3167  }
3168  for (i = 1; i < num_dims; ++i) {
3169  kmp_int64 iter, ln;
3170  kmp_int32 j = i * 4;
3171  ln = pr_buf->th_doacross_info[j + 1];
3172  lo = pr_buf->th_doacross_info[j + 2];
3173  up = pr_buf->th_doacross_info[j + 3];
3174  st = pr_buf->th_doacross_info[j + 4];
3175  if (st == 1) {
3176  if (vec[i] < lo || vec[i] > up) {
3177  KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3178  "bounds [%lld,%lld]\n",
3179  gtid, vec[i], lo, up));
3180  return;
3181  }
3182  iter = vec[i] - lo;
3183  } else if (st > 0) {
3184  if (vec[i] < lo || vec[i] > up) {
3185  KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3186  "bounds [%lld,%lld]\n",
3187  gtid, vec[i], lo, up));
3188  return;
3189  }
3190  iter = (kmp_uint64)(vec[i] - lo) / st;
3191  } else { // st < 0
3192  if (vec[i] > lo || vec[i] < up) {
3193  KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3194  "bounds [%lld,%lld]\n",
3195  gtid, vec[i], lo, up));
3196  return;
3197  }
3198  iter = (kmp_uint64)(lo - vec[i]) / (-st);
3199  }
3200  iter_number = iter + ln * iter_number;
3201  }
3202  shft = iter_number % 32; // use 32-bit granularity
3203  iter_number >>= 5; // divided by 32
3204  flag = 1 << shft;
3205  while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
3206  KMP_YIELD(TRUE);
3207  }
3208  KA_TRACE(20,
3209  ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
3210  gtid, (iter_number << 5) + shft));
3211 }
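// A small worked example of the linearization above, with hypothetical bounds:
// a two-dimensional doacross nest where dim 0 has 10 iterations (lo=0, up=9,
// st=1) and dim 1 has 5 iterations (ln=5, lo=0, up=4, st=1). For the iteration
// vector (i=3, j=2) the loop computes iter_number = 3 for dim 0 and then folds
// in dim 1 as 2 + 5 * 3 = 17, so the iteration is tracked as bit 17 % 32 = 17
// of flag word 17 >> 5 = 0.
#if 0 /* worked example only -- not compiled as part of the runtime */
static void linearization_example(void) {
  long long vec[2] = {3, 2};
  long long ln1 = 5;                                  /* range length of dim 1 */
  long long iter_number = (vec[0] - 0) / 1;           /* dim 0 index: 3 */
  iter_number = (vec[1] - 0) / 1 + ln1 * iter_number; /* fold dim 1: 17 */
  int shft = iter_number % 32;                        /* bit 17 ...         */
  long long word = iter_number >> 5;                  /* ... of flag word 0 */
  (void)shft;
  (void)word;
}
#endif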
3212 
3213 void __kmpc_doacross_post(ident_t *loc, int gtid, long long *vec) {
3214  kmp_int32 shft, num_dims, i;
3215  kmp_uint32 flag;
3216  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3217  kmp_info_t *th = __kmp_threads[gtid];
3218  kmp_team_t *team = th->th.th_team;
3219  kmp_disp_t *pr_buf;
3220  kmp_int64 lo, st;
3221 
3222  KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
3223  if (team->t.t_serialized) {
3224  KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
3225  return; // no dependencies if team is serialized
3226  }
3227 
3228  // calculate sequential iteration number (same as in "wait" but no
3229  // out-of-bounds checks)
3230  pr_buf = th->th.th_dispatch;
3231  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3232  num_dims = pr_buf->th_doacross_info[0];
3233  lo = pr_buf->th_doacross_info[2];
3234  st = pr_buf->th_doacross_info[4];
3235  if (st == 1) { // most common case
3236  iter_number = vec[0] - lo;
3237  } else if (st > 0) {
3238  iter_number = (kmp_uint64)(vec[0] - lo) / st;
3239  } else { // negative increment
3240  iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
3241  }
3242  for (i = 1; i < num_dims; ++i) {
3243  kmp_int64 iter, ln;
3244  kmp_int32 j = i * 4;
3245  ln = pr_buf->th_doacross_info[j + 1];
3246  lo = pr_buf->th_doacross_info[j + 2];
3247  st = pr_buf->th_doacross_info[j + 4];
3248  if (st == 1) {
3249  iter = vec[i] - lo;
3250  } else if (st > 0) {
3251  iter = (kmp_uint64)(vec[i] - lo) / st;
3252  } else { // st < 0
3253  iter = (kmp_uint64)(lo - vec[i]) / (-st);
3254  }
3255  iter_number = iter + ln * iter_number;
3256  }
3257  shft = iter_number % 32; // use 32-bit granularity
3258  iter_number >>= 5; // divided by 32
3259  flag = 1 << shft;
3260  if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
3261  KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
3262  KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
3263  (iter_number << 5) + shft));
3264 }
3265 
3266 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
3267  kmp_int64 num_done;
3268  kmp_info_t *th = __kmp_threads[gtid];
3269  kmp_team_t *team = th->th.th_team;
3270  kmp_disp_t *pr_buf = th->th.th_dispatch;
3271 
3272  KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
3273  if (team->t.t_serialized) {
3274  KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
3275  return; // nothing to do
3276  }
3277  num_done = KMP_TEST_THEN_INC64((kmp_int64 *)pr_buf->th_doacross_info[1]) + 1;
3278  if (num_done == th->th.th_team_nproc) {
3279  // we are the last thread, need to free shared resources
3280  int idx = pr_buf->th_doacross_buf_idx - 1;
3281  dispatch_shared_info_t *sh_buf =
3282  &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3283  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
3284  (kmp_int64)&sh_buf->doacross_num_done);
3285  KMP_DEBUG_ASSERT(num_done == (kmp_int64)sh_buf->doacross_num_done);
3286  KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
3287  __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
3288  sh_buf->doacross_flags = NULL;
3289  sh_buf->doacross_num_done = 0;
3290  sh_buf->doacross_buf_idx +=
3291  __kmp_dispatch_num_buffers; // free buffer for future re-use
3292  }
3293  // free private resources (need to keep buffer index forever)
3294  __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
3295  pr_buf->th_doacross_info = NULL;
3296  KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
3297 }
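// The four entry points above implement OpenMP 4.5 doacross (cross-iteration)
// dependences. A sketch of the source construct they serve: roughly, the
// compiler emits __kmpc_doacross_init() with the loop bounds on entry,
// __kmpc_doacross_wait() for each depend(sink:...) vector,
// __kmpc_doacross_post() at depend(source), and __kmpc_doacross_fini() when
// the loop ends. The array, bounds, and lowering details below are
// illustrative (exact codegen is compiler specific); note that out-of-range
// sink vectors, such as (0, j) on the first row here, are simply ignored by
// the wait routine.
#if 0 /* usage sketch only -- not compiled as part of the runtime */
#include <omp.h>
#define N 128
#define M 128
static double a[N][M];

static void wavefront(void) {
  /* ordered(2) declares a two-dimensional doacross nest */
#pragma omp parallel for ordered(2)
  for (int i = 1; i < N; i++) {
    for (int j = 1; j < M; j++) {
      /* wait until iterations (i-1, j) and (i, j-1) have been posted */
#pragma omp ordered depend(sink : i - 1, j) depend(sink : i, j - 1)
      a[i][j] = 0.5 * (a[i - 1][j] + a[i][j - 1]);
      /* mark (i, j) complete so dependent iterations may proceed */
#pragma omp ordered depend(source)
    }
  }
}
#endif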
3298 #endif
3299 
3300 // end of file //