LLVM OpenMP* Runtime Library
kmp_taskq.cpp
1 /*
2  * kmp_taskq.cpp -- TASKQ support for OpenMP.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_io.h"
19 #include "kmp_error.h"
20 
21 #define MAX_MESSAGE 512
22 
23 /* ------------------------------------------------------------------------ */
24 /* ------------------------------------------------------------------------ */
25 
26 /*
27  * Taskq routines and global variables
28  */
29 
30 #define KMP_DEBUG_REF_CTS(x) KF_TRACE(1, x);
31 
32 #define THREAD_ALLOC_FOR_TASKQ
33 
34 static int
35 in_parallel_context( kmp_team_t *team )
36 {
37  return ! team -> t.t_serialized;
38 }
39 
40 static void
41 __kmp_taskq_eo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
42 {
43  int gtid = *gtid_ref;
44  int tid = __kmp_tid_from_gtid( gtid );
45  kmp_uint32 my_token;
46  kmpc_task_queue_t *taskq;
47  kmp_taskq_t *tq = & __kmp_threads[gtid] -> th.th_team -> t.t_taskq;
48 
49  if ( __kmp_env_consistency_check )
50 #if KMP_USE_DYNAMIC_LOCK
51  __kmp_push_sync( gtid, ct_ordered_in_taskq, loc_ref, NULL, 0 );
52 #else
53  __kmp_push_sync( gtid, ct_ordered_in_taskq, loc_ref, NULL );
54 #endif
55 
56  if ( ! __kmp_threads[ gtid ]-> th.th_team -> t.t_serialized ) {
57  KMP_MB(); /* Flush all pending memory write invalidates. */
58 
59  /* GEH - need check here under stats to make sure */
60  /* inside task (curr_thunk[tid] != NULL) */
61 
62  my_token = tq->tq_curr_thunk[ tid ]-> th_tasknum;
63 
64  taskq = tq->tq_curr_thunk[ tid ]-> th.th_shareds -> sv_queue;
65 
66  KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_EQ, NULL);
67  KMP_MB();
68  }
69 }
70 
71 static void
72 __kmp_taskq_xo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
73 {
74  int gtid = *gtid_ref;
75  int tid = __kmp_tid_from_gtid( gtid );
76  kmp_uint32 my_token;
77  kmp_taskq_t *tq = & __kmp_threads[gtid] -> th.th_team -> t.t_taskq;
78 
79  if ( __kmp_env_consistency_check )
80  __kmp_pop_sync( gtid, ct_ordered_in_taskq, loc_ref );
81 
82  if ( ! __kmp_threads[ gtid ]-> th.th_team -> t.t_serialized ) {
83  KMP_MB(); /* Flush all pending memory write invalidates. */
84 
85  /* GEH - need check here under stats to make sure */
86  /* inside task (curr_thunk[tid] != NULL) */
87 
88  my_token = tq->tq_curr_thunk[ tid ]->th_tasknum;
89 
90  KMP_MB(); /* Flush all pending memory write invalidates. */
91 
92  tq->tq_curr_thunk[ tid ]-> th.th_shareds -> sv_queue -> tq_tasknum_serving = my_token + 1;
93 
94  KMP_MB(); /* Flush all pending memory write invalidates. */
95  }
96 }
97 
98 static void
99 __kmp_taskq_check_ordered( kmp_int32 gtid, kmpc_thunk_t *thunk )
100 {
101  kmp_uint32 my_token;
102  kmpc_task_queue_t *taskq;
103 
104  /* assume we are always called from an active parallel context */
105 
106  KMP_MB(); /* Flush all pending memory write invalidates. */
107 
108  my_token = thunk -> th_tasknum;
109 
110  taskq = thunk -> th.th_shareds -> sv_queue;
111 
112  if(taskq->tq_tasknum_serving <= my_token) {
113  KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_GE, NULL);
114  KMP_MB();
115  taskq->tq_tasknum_serving = my_token + 1;
116  KMP_MB();
117  }
118 }
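/*
 * Illustrative sketch (not part of the original runtime): the three routines
 * above implement a ticket scheme for ORDERED sections inside a taskq. Each
 * thunk carries a ticket (th_tasknum, drawn from tq_tasknum_queuing when it
 * is queued), and the queue's tq_tasknum_serving counter says whose turn it
 * is. The hypothetical example_* helpers below only restate that protocol
 * with the kmp_uint32 type already used in this file.
 */

static void
example_ordered_enter( volatile kmp_uint32 *now_serving, kmp_uint32 my_ticket )
{
    /* spin until it is this ticket's turn (cf. KMP_WAIT_YIELD above) */
    while ( *now_serving != my_ticket )
        ;  /* a real implementation yields/pauses while spinning */
}

static void
example_ordered_exit( volatile kmp_uint32 *now_serving, kmp_uint32 my_ticket )
{
    /* pass the turn to the holder of the next ticket (cf. __kmp_taskq_xo) */
    *now_serving = my_ticket + 1;
}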
119 
120 #ifdef KMP_DEBUG
121 
122 static void
123 __kmp_dump_TQF(kmp_int32 flags)
124 {
125  if (flags & TQF_IS_ORDERED)
126  __kmp_printf("ORDERED ");
127  if (flags & TQF_IS_LASTPRIVATE)
128  __kmp_printf("LAST_PRIV ");
129  if (flags & TQF_IS_NOWAIT)
130  __kmp_printf("NOWAIT ");
131  if (flags & TQF_HEURISTICS)
132  __kmp_printf("HEURIST ");
133  if (flags & TQF_INTERFACE_RESERVED1)
134  __kmp_printf("RESERV1 ");
135  if (flags & TQF_INTERFACE_RESERVED2)
136  __kmp_printf("RESERV2 ");
137  if (flags & TQF_INTERFACE_RESERVED3)
138  __kmp_printf("RESERV3 ");
139  if (flags & TQF_INTERFACE_RESERVED4)
140  __kmp_printf("RESERV4 ");
141  if (flags & TQF_IS_LAST_TASK)
142  __kmp_printf("LAST_TASK ");
143  if (flags & TQF_TASKQ_TASK)
144  __kmp_printf("TASKQ_TASK ");
145  if (flags & TQF_RELEASE_WORKERS)
146  __kmp_printf("RELEASE ");
147  if (flags & TQF_ALL_TASKS_QUEUED)
148  __kmp_printf("ALL_QUEUED ");
149  if (flags & TQF_PARALLEL_CONTEXT)
150  __kmp_printf("PARALLEL ");
151  if (flags & TQF_DEALLOCATED)
152  __kmp_printf("DEALLOC ");
153  if (!(flags & (TQF_INTERNAL_FLAGS|TQF_INTERFACE_FLAGS)))
154  __kmp_printf("(NONE)");
155 }
156 
157 static void
158 __kmp_dump_thunk( kmp_taskq_t *tq, kmpc_thunk_t *thunk, kmp_int32 global_tid )
159 {
160  int i;
161  int nproc = __kmp_threads[global_tid] -> th.th_team -> t.t_nproc;
162 
163  __kmp_printf("\tThunk at %p on (%d): ", thunk, global_tid);
164 
165  if (thunk != NULL) {
166  for (i = 0; i < nproc; i++) {
167  if( tq->tq_curr_thunk[i] == thunk ) {
168  __kmp_printf("[%i] ", i);
169  }
170  }
171  __kmp_printf("th_shareds=%p, ", thunk->th.th_shareds);
172  __kmp_printf("th_task=%p, ", thunk->th_task);
173  __kmp_printf("th_encl_thunk=%p, ", thunk->th_encl_thunk);
174  __kmp_printf("th_status=%d, ", thunk->th_status);
175  __kmp_printf("th_tasknum=%u, ", thunk->th_tasknum);
176  __kmp_printf("th_flags="); __kmp_dump_TQF(thunk->th_flags);
177  }
178 
179  __kmp_printf("\n");
180 }
181 
182 static void
183 __kmp_dump_thunk_stack(kmpc_thunk_t *thunk, kmp_int32 thread_num)
184 {
185  kmpc_thunk_t *th;
186 
187  __kmp_printf(" Thunk stack for T#%d: ", thread_num);
188 
189  for (th = thunk; th != NULL; th = th->th_encl_thunk )
190  __kmp_printf("%p ", th);
191 
192  __kmp_printf("\n");
193 }
194 
195 static void
196 __kmp_dump_task_queue( kmp_taskq_t *tq, kmpc_task_queue_t *queue, kmp_int32 global_tid )
197 {
198  int qs, count, i;
199  kmpc_thunk_t *thunk;
200  kmpc_task_queue_t *taskq;
201 
202  __kmp_printf("Task Queue at %p on (%d):\n", queue, global_tid);
203 
204  if (queue != NULL) {
205  int in_parallel = queue->tq_flags & TQF_PARALLEL_CONTEXT;
206 
207  if ( __kmp_env_consistency_check ) {
208  __kmp_printf(" tq_loc : ");
209  }
210  if (in_parallel) {
211 
212  //if (queue->tq.tq_parent != 0)
213  //__kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
214 
215  //__kmp_acquire_lock(& queue->tq_link_lck, global_tid);
216 
217  KMP_MB(); /* make sure data structures are in consistent state before querying them */
218  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
219 
220  __kmp_printf(" tq_parent : %p\n", queue->tq.tq_parent);
221  __kmp_printf(" tq_first_child : %p\n", queue->tq_first_child);
222  __kmp_printf(" tq_next_child : %p\n", queue->tq_next_child);
223  __kmp_printf(" tq_prev_child : %p\n", queue->tq_prev_child);
224  __kmp_printf(" tq_ref_count : %d\n", queue->tq_ref_count);
225 
226  //__kmp_release_lock(& queue->tq_link_lck, global_tid);
227 
228  //if (queue->tq.tq_parent != 0)
229  //__kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
230 
231  //__kmp_acquire_lock(& queue->tq_free_thunks_lck, global_tid);
232  //__kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
233 
234  KMP_MB(); /* make sure data structures are in consistent state before querying them */
235  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
236  }
237 
238  __kmp_printf(" tq_shareds : ");
239  for (i=0; i<((queue == tq->tq_root) ? queue->tq_nproc : 1); i++)
240  __kmp_printf("%p ", queue->tq_shareds[i].ai_data);
241  __kmp_printf("\n");
242 
243  if (in_parallel) {
244  __kmp_printf(" tq_tasknum_queuing : %u\n", queue->tq_tasknum_queuing);
245  __kmp_printf(" tq_tasknum_serving : %u\n", queue->tq_tasknum_serving);
246  }
247 
248  __kmp_printf(" tq_queue : %p\n", queue->tq_queue);
249  __kmp_printf(" tq_thunk_space : %p\n", queue->tq_thunk_space);
250  __kmp_printf(" tq_taskq_slot : %p\n", queue->tq_taskq_slot);
251 
252  __kmp_printf(" tq_free_thunks : ");
253  for (thunk = queue->tq_free_thunks; thunk != NULL; thunk = thunk->th.th_next_free )
254  __kmp_printf("%p ", thunk);
255  __kmp_printf("\n");
256 
257  __kmp_printf(" tq_nslots : %d\n", queue->tq_nslots);
258  __kmp_printf(" tq_head : %d\n", queue->tq_head);
259  __kmp_printf(" tq_tail : %d\n", queue->tq_tail);
260  __kmp_printf(" tq_nfull : %d\n", queue->tq_nfull);
261  __kmp_printf(" tq_hiwat : %d\n", queue->tq_hiwat);
262  __kmp_printf(" tq_flags : "); __kmp_dump_TQF(queue->tq_flags);
263  __kmp_printf("\n");
264 
265  if (in_parallel) {
266  __kmp_printf(" tq_th_thunks : ");
267  for (i = 0; i < queue->tq_nproc; i++) {
268  __kmp_printf("%d ", queue->tq_th_thunks[i].ai_data);
269  }
270  __kmp_printf("\n");
271  }
272 
273  __kmp_printf("\n");
274  __kmp_printf(" Queue slots:\n");
275 
276 
277  qs = queue->tq_tail;
278  for ( count = 0; count < queue->tq_nfull; ++count ) {
279  __kmp_printf("(%d)", qs);
280  __kmp_dump_thunk( tq, queue->tq_queue[qs].qs_thunk, global_tid );
281  qs = (qs+1) % queue->tq_nslots;
282  }
283 
284  __kmp_printf("\n");
285 
286  if (in_parallel) {
287  if (queue->tq_taskq_slot != NULL) {
288  __kmp_printf(" TaskQ slot:\n");
289  __kmp_dump_thunk( tq, (kmpc_thunk_t *) queue->tq_taskq_slot, global_tid );
290  __kmp_printf("\n");
291  }
292  //__kmp_release_lock(& queue->tq_queue_lck, global_tid);
293  //__kmp_release_lock(& queue->tq_free_thunks_lck, global_tid);
294  }
295  }
296 
297  __kmp_printf(" Taskq freelist: ");
298 
299  //__kmp_acquire_lock( & tq->tq_freelist_lck, global_tid );
300 
301  KMP_MB(); /* make sure data structures are in consistent state before querying them */
302  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
303 
304  for( taskq = tq->tq_freelist; taskq != NULL; taskq = taskq->tq.tq_next_free )
305  __kmp_printf("%p ", taskq);
306 
307  //__kmp_release_lock( & tq->tq_freelist_lck, global_tid );
308 
309  __kmp_printf("\n\n");
310 }
311 
312 static void
313 __kmp_aux_dump_task_queue_tree( kmp_taskq_t *tq, kmpc_task_queue_t *curr_queue, kmp_int32 level, kmp_int32 global_tid )
314 {
315  int i, count, qs;
316  int nproc = __kmp_threads[global_tid] -> th.th_team -> t.t_nproc;
317  kmpc_task_queue_t *queue = curr_queue;
318 
319  if (curr_queue == NULL)
320  return;
321 
322  __kmp_printf(" ");
323 
324  for (i=0; i<level; i++)
325  __kmp_printf(" ");
326 
327  __kmp_printf("%p", curr_queue);
328 
329  for (i = 0; i < nproc; i++) {
330  if( tq->tq_curr_thunk[i] && tq->tq_curr_thunk[i]->th.th_shareds->sv_queue == curr_queue ) {
331  __kmp_printf(" [%i]", i);
332  }
333  }
334 
335  __kmp_printf(":");
336 
337  //__kmp_acquire_lock(& curr_queue->tq_queue_lck, global_tid);
338 
339  KMP_MB(); /* make sure data structures are in consistent state before querying them */
340  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
341 
342  qs = curr_queue->tq_tail;
343 
344  for ( count = 0; count < curr_queue->tq_nfull; ++count ) {
345  __kmp_printf("%p ", curr_queue->tq_queue[qs].qs_thunk);
346  qs = (qs+1) % curr_queue->tq_nslots;
347  }
348 
349  //__kmp_release_lock(& curr_queue->tq_queue_lck, global_tid);
350 
351  __kmp_printf("\n");
352 
353  if (curr_queue->tq_first_child) {
354  //__kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
355 
356  KMP_MB(); /* make sure data structures are in consistent state before querying them */
357  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
358 
359  if (curr_queue->tq_first_child) {
360  for(queue = (kmpc_task_queue_t *)curr_queue->tq_first_child;
361  queue != NULL;
362  queue = queue->tq_next_child) {
363  __kmp_aux_dump_task_queue_tree( tq, queue, level+1, global_tid );
364  }
365  }
366 
367  //__kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
368  }
369 }
370 
371 static void
372 __kmp_dump_task_queue_tree( kmp_taskq_t *tq, kmpc_task_queue_t *tqroot, kmp_int32 global_tid)
373 {
374  __kmp_printf("TaskQ Tree at root %p on (%d):\n", tqroot, global_tid);
375 
376  __kmp_aux_dump_task_queue_tree( tq, tqroot, 0, global_tid );
377 
378  __kmp_printf("\n");
379 }
380 #endif
381 
382 /* --------------------------------------------------------------------------- */
383 
384 /*
385  New taskq storage routines that try to minimize overhead of mallocs but
386  still provide cache line alignment.
387 */
388 
389 
390 static void *
391 __kmp_taskq_allocate(size_t size, kmp_int32 global_tid)
392 {
393  void *addr, *orig_addr;
394  size_t bytes;
395 
396  KB_TRACE( 5, ("__kmp_taskq_allocate: called size=%d, gtid=%d\n", (int) size, global_tid ) );
397 
398  bytes = sizeof(void *) + CACHE_LINE + size;
399 
400 #ifdef THREAD_ALLOC_FOR_TASKQ
401  orig_addr = (void *) __kmp_thread_malloc( __kmp_thread_from_gtid(global_tid), bytes );
402 #else
403  KE_TRACE( 10, ("%%%%%% MALLOC( %d )\n", bytes ) );
404  orig_addr = (void *) KMP_INTERNAL_MALLOC( bytes );
405 #endif /* THREAD_ALLOC_FOR_TASKQ */
406 
407  if (orig_addr == 0)
408  KMP_FATAL( OutOfHeapMemory );
409 
410  addr = orig_addr;
411 
412  if (((kmp_uintptr_t) addr & ( CACHE_LINE - 1 )) != 0) {
413  KB_TRACE( 50, ("__kmp_taskq_allocate: adjust for cache alignment\n" ) );
414  addr = (void *) (((kmp_uintptr_t) addr + CACHE_LINE) & ~( CACHE_LINE - 1 ));
415  }
416 
417  (* (void **) addr) = orig_addr;
418 
419  KB_TRACE( 10, ("__kmp_taskq_allocate: allocate: %p, use: %p - %p, size: %d, gtid: %d\n",
420  orig_addr, ((void **) addr) + 1, ((char *)(((void **) addr) + 1)) + size-1,
421  (int) size, global_tid ));
422 
423  return ( ((void **) addr) + 1 );
424 }
425 
426 static void
427 __kmpc_taskq_free(void *p, kmp_int32 global_tid)
428 {
429  KB_TRACE( 5, ("__kmpc_taskq_free: called addr=%p, gtid=%d\n", p, global_tid ) );
430 
431  KB_TRACE(10, ("__kmpc_taskq_free: freeing: %p, gtid: %d\n", (*( ((void **) p)-1)), global_tid ));
432 
433 #ifdef THREAD_ALLOC_FOR_TASKQ
434  __kmp_thread_free( __kmp_thread_from_gtid(global_tid), *( ((void **) p)-1) );
435 #else
436  KMP_INTERNAL_FREE( *( ((void **) p)-1) );
437 #endif /* THREAD_ALLOC_FOR_TASKQ */
438 }
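/*
 * Illustrative sketch (not part of the original runtime): __kmp_taskq_allocate
 * over-allocates by one pointer plus one cache line, rounds the block address
 * up to a CACHE_LINE boundary, stashes the original heap pointer in the word
 * at that boundary, and hands out the area just past it; __kmpc_taskq_free
 * reads the stashed pointer back at p[-1]. The hypothetical example_* pair
 * below shows the same layout using the KMP_INTERNAL_MALLOC/FREE wrappers
 * already used in this file.
 */

static void *
example_aligned_alloc( size_t size )
{
    /* over-allocate: room for the stashed pointer plus one cache line */
    char *orig = (char *) KMP_INTERNAL_MALLOC( sizeof(void *) + CACHE_LINE + size );
    char *addr = orig;

    if ( orig == NULL )
        return NULL;

    /* round up to the next CACHE_LINE boundary, as above */
    if ( ((kmp_uintptr_t) addr & ( CACHE_LINE - 1 )) != 0 )
        addr = (char *) (((kmp_uintptr_t) addr + CACHE_LINE) & ~( CACHE_LINE - 1 ));

    ((void **) addr)[0] = orig;               /* stash the original pointer   */
    return (void *) (((void **) addr) + 1);   /* hand out the area behind it  */
}

static void
example_aligned_free( void *p )
{
    KMP_INTERNAL_FREE( ((void **) p)[-1] );   /* recover the stashed pointer  */
}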
439 
440 /* --------------------------------------------------------------------------- */
441 
442 /*
443  * Keep freed kmpc_task_queue_t on an internal freelist and recycle since
444  * they're of constant size.
445  */
446 
447 static kmpc_task_queue_t *
448 __kmp_alloc_taskq ( kmp_taskq_t *tq, int in_parallel, kmp_int32 nslots, kmp_int32 nthunks,
449  kmp_int32 nshareds, kmp_int32 nproc, size_t sizeof_thunk,
450  size_t sizeof_shareds, kmpc_thunk_t **new_taskq_thunk, kmp_int32 global_tid )
451 {
452  kmp_int32 i;
453  size_t bytes;
454  kmpc_task_queue_t *new_queue;
455  kmpc_aligned_shared_vars_t *shared_var_array;
456  char *shared_var_storage;
457  char *pt; /* for doing byte-adjusted address computations */
458 
459  __kmp_acquire_lock( & tq->tq_freelist_lck, global_tid );
460 
461  KMP_MB(); /* make sure data structures are in consistent state before querying them */
462  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
463 
464  if( tq->tq_freelist ) {
465  new_queue = tq -> tq_freelist;
466  tq -> tq_freelist = tq -> tq_freelist -> tq.tq_next_free;
467 
468  KMP_DEBUG_ASSERT(new_queue->tq_flags & TQF_DEALLOCATED);
469 
470  new_queue->tq_flags = 0;
471 
472  __kmp_release_lock( & tq->tq_freelist_lck, global_tid );
473  }
474  else {
475  __kmp_release_lock( & tq->tq_freelist_lck, global_tid );
476 
477  new_queue = (kmpc_task_queue_t *) __kmp_taskq_allocate (sizeof (kmpc_task_queue_t), global_tid);
478  new_queue->tq_flags = 0;
479  }
480 
481  /* space in the task queue for queue slots (allocate as one big chunk */
482  /* of storage including new_taskq_task space) */
483 
484  sizeof_thunk += (CACHE_LINE - (sizeof_thunk % CACHE_LINE)); /* pad to cache line size */
485  pt = (char *) __kmp_taskq_allocate (nthunks * sizeof_thunk, global_tid);
486  new_queue->tq_thunk_space = (kmpc_thunk_t *)pt;
487  *new_taskq_thunk = (kmpc_thunk_t *)(pt + (nthunks - 1) * sizeof_thunk);
488 
489  /* chain the allocated thunks into a freelist for this queue */
490 
491  new_queue->tq_free_thunks = (kmpc_thunk_t *)pt;
492 
493  for (i = 0; i < (nthunks - 2); i++) {
494  ((kmpc_thunk_t *)(pt+i*sizeof_thunk))->th.th_next_free = (kmpc_thunk_t *)(pt + (i+1)*sizeof_thunk);
495 #ifdef KMP_DEBUG
496  ((kmpc_thunk_t *)(pt+i*sizeof_thunk))->th_flags = TQF_DEALLOCATED;
497 #endif
498  }
499 
500  ((kmpc_thunk_t *)(pt+(nthunks-2)*sizeof_thunk))->th.th_next_free = NULL;
501 #ifdef KMP_DEBUG
502  ((kmpc_thunk_t *)(pt+(nthunks-2)*sizeof_thunk))->th_flags = TQF_DEALLOCATED;
503 #endif
504 
505  /* initialize the locks */
506 
507  if (in_parallel) {
508  __kmp_init_lock( & new_queue->tq_link_lck );
509  __kmp_init_lock( & new_queue->tq_free_thunks_lck );
510  __kmp_init_lock( & new_queue->tq_queue_lck );
511  }
512 
513  /* now allocate the slots */
514 
515  bytes = nslots * sizeof (kmpc_aligned_queue_slot_t);
516  new_queue->tq_queue = (kmpc_aligned_queue_slot_t *) __kmp_taskq_allocate( bytes, global_tid );
517 
518  /* space for array of pointers to shared variable structures */
519  sizeof_shareds += sizeof(kmpc_task_queue_t *);
520  sizeof_shareds += (CACHE_LINE - (sizeof_shareds % CACHE_LINE)); /* pad to cache line size */
521 
522  bytes = nshareds * sizeof (kmpc_aligned_shared_vars_t);
523  shared_var_array = (kmpc_aligned_shared_vars_t *) __kmp_taskq_allocate ( bytes, global_tid);
524 
525  bytes = nshareds * sizeof_shareds;
526  shared_var_storage = (char *) __kmp_taskq_allocate ( bytes, global_tid);
527 
528  for (i=0; i<nshareds; i++) {
529  shared_var_array[i].ai_data = (kmpc_shared_vars_t *) (shared_var_storage + i*sizeof_shareds);
530  shared_var_array[i].ai_data->sv_queue = new_queue;
531  }
532  new_queue->tq_shareds = shared_var_array;
533 
534 
535  /* array for number of outstanding thunks per thread */
536 
537  if (in_parallel) {
538  bytes = nproc * sizeof(kmpc_aligned_int32_t);
539  new_queue->tq_th_thunks = (kmpc_aligned_int32_t *) __kmp_taskq_allocate ( bytes, global_tid);
540  new_queue->tq_nproc = nproc;
541 
542  for (i=0; i<nproc; i++)
543  new_queue->tq_th_thunks[i].ai_data = 0;
544  }
545 
546  return new_queue;
547 }
548 
549 static void
550 __kmp_free_taskq (kmp_taskq_t *tq, kmpc_task_queue_t *p, int in_parallel, kmp_int32 global_tid)
551 {
552  __kmpc_taskq_free(p->tq_thunk_space, global_tid);
553  __kmpc_taskq_free(p->tq_queue, global_tid);
554 
555  /* free shared var structure storage */
556  __kmpc_taskq_free((void *) p->tq_shareds[0].ai_data, global_tid);
557 
558  /* free array of pointers to shared vars storage */
559  __kmpc_taskq_free(p->tq_shareds, global_tid);
560 
561 #ifdef KMP_DEBUG
562  p->tq_first_child = NULL;
563  p->tq_next_child = NULL;
564  p->tq_prev_child = NULL;
565  p->tq_ref_count = -10;
566  p->tq_shareds = NULL;
567  p->tq_tasknum_queuing = 0;
568  p->tq_tasknum_serving = 0;
569  p->tq_queue = NULL;
570  p->tq_thunk_space = NULL;
571  p->tq_taskq_slot = NULL;
572  p->tq_free_thunks = NULL;
573  p->tq_nslots = 0;
574  p->tq_head = 0;
575  p->tq_tail = 0;
576  p->tq_nfull = 0;
577  p->tq_hiwat = 0;
578 
579  if (in_parallel) {
580  int i;
581 
582  for (i=0; i<p->tq_nproc; i++)
583  p->tq_th_thunks[i].ai_data = 0;
584  }
585  if ( __kmp_env_consistency_check )
586  p->tq_loc = NULL;
587  KMP_DEBUG_ASSERT( p->tq_flags & TQF_DEALLOCATED );
588  p->tq_flags = TQF_DEALLOCATED;
589 #endif /* KMP_DEBUG */
590 
591  if (in_parallel) {
592  __kmpc_taskq_free(p->tq_th_thunks, global_tid);
593  __kmp_destroy_lock(& p->tq_link_lck);
594  __kmp_destroy_lock(& p->tq_queue_lck);
595  __kmp_destroy_lock(& p->tq_free_thunks_lck);
596  }
597 #ifdef KMP_DEBUG
598  p->tq_th_thunks = NULL;
599 #endif /* KMP_DEBUG */
600 
601  KMP_MB(); /* make sure data structures are in consistent state before querying them */
602  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
603 
604  __kmp_acquire_lock( & tq->tq_freelist_lck, global_tid );
605  p->tq.tq_next_free = tq->tq_freelist;
606 
607  tq->tq_freelist = p;
608  __kmp_release_lock( & tq->tq_freelist_lck, global_tid );
609 }
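/*
 * Illustrative sketch (not part of the original runtime): because every
 * kmpc_task_queue_t has the same size, freed descriptors are pushed onto a
 * lock-protected singly linked freelist (tq_freelist / tq_freelist_lck) and
 * popped again on the next allocation instead of going back to the heap; the
 * per-queue thunk lists below work the same way. Reduced to its core, with a
 * hypothetical generic node type and example_* names:
 */

typedef struct example_node {
    struct example_node *next_free;
} example_node_t;

static example_node_t *
example_freelist_pop( example_node_t **freelist )   /* caller holds the freelist lock */
{
    example_node_t *n = *freelist;
    if ( n != NULL )
        *freelist = n->next_free;                   /* unlink the head node            */
    return n;                                       /* NULL means allocate a fresh one */
}

static void
example_freelist_push( example_node_t **freelist, example_node_t *n )
{
    n->next_free = *freelist;                       /* link in front of the head       */
    *freelist = n;
}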
610 
611 /*
612  * Once a group of thunks has been allocated for use in a particular queue,
613  * these are managed via a per-queue freelist.
614  * We force a check that there's always a thunk free if we need one.
615  */
616 
617 static kmpc_thunk_t *
618 __kmp_alloc_thunk (kmpc_task_queue_t *queue, int in_parallel, kmp_int32 global_tid)
619 {
620  kmpc_thunk_t *fl;
621 
622  if (in_parallel) {
623  __kmp_acquire_lock(& queue->tq_free_thunks_lck, global_tid);
624 
625  KMP_MB(); /* make sure data structures are in consistent state before querying them */
626  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
627  }
628 
629  fl = queue->tq_free_thunks;
630 
631  KMP_DEBUG_ASSERT (fl != NULL);
632 
633  queue->tq_free_thunks = fl->th.th_next_free;
634  fl->th_flags = 0;
635 
636  if (in_parallel)
637  __kmp_release_lock(& queue->tq_free_thunks_lck, global_tid);
638 
639  return fl;
640 }
641 
642 static void
643 __kmp_free_thunk (kmpc_task_queue_t *queue, kmpc_thunk_t *p, int in_parallel, kmp_int32 global_tid)
644 {
645 #ifdef KMP_DEBUG
646  p->th_task = 0;
647  p->th_encl_thunk = 0;
648  p->th_status = 0;
649  p->th_tasknum = 0;
650  /* Also could zero pointers to private vars */
651 #endif
652 
653  if (in_parallel) {
654  __kmp_acquire_lock(& queue->tq_free_thunks_lck, global_tid);
655 
656  KMP_MB(); /* make sure data structures are in consistent state before querying them */
657  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
658  }
659 
660  p->th.th_next_free = queue->tq_free_thunks;
661  queue->tq_free_thunks = p;
662 
663 #ifdef KMP_DEBUG
664  p->th_flags = TQF_DEALLOCATED;
665 #endif
666 
667  if (in_parallel)
668  __kmp_release_lock(& queue->tq_free_thunks_lck, global_tid);
669 }
670 
671 /* --------------------------------------------------------------------------- */
672 
673 /* returns nonzero if the queue just became full after the enqueue */
674 
675 static kmp_int32
676 __kmp_enqueue_task ( kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *queue, kmpc_thunk_t *thunk, int in_parallel )
677 {
678  kmp_int32 ret;
679 
680  /* dkp: can we get around the lock in the TQF_RELEASE_WORKERS case (only the master is executing then) */
681  if (in_parallel) {
682  __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
683 
684  KMP_MB(); /* make sure data structures are in consistent state before querying them */
685  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
686  }
687 
688  KMP_DEBUG_ASSERT (queue->tq_nfull < queue->tq_nslots); /* check queue not full */
689 
690  queue->tq_queue[(queue->tq_head)++].qs_thunk = thunk;
691 
692  if (queue->tq_head >= queue->tq_nslots)
693  queue->tq_head = 0;
694 
695  (queue->tq_nfull)++;
696 
697  KMP_MB(); /* to assure that nfull is seen to increase before TQF_ALL_TASKS_QUEUED is set */
698 
699  ret = (in_parallel) ? (queue->tq_nfull == queue->tq_nslots) : FALSE;
700 
701  if (in_parallel) {
702  /* don't need to wait until workers are released before unlocking */
703  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
704 
705  if( tq->tq_global_flags & TQF_RELEASE_WORKERS ) {
706  /* If just creating the root queue, the worker threads are waiting at */
707  /* a join barrier until now, when there's something in the queue for */
708  /* them to do; release them now to do work. */
709  /* This should only be done when this is the first task enqueued, */
710  /* so reset the flag here also. */
711 
712  tq->tq_global_flags &= ~TQF_RELEASE_WORKERS; /* no lock needed, workers are still in spin mode */
713 
714  KMP_MB(); /* avoid releasing barrier twice if taskq_task switches threads */
715 
716  __kmpc_end_barrier_master( NULL, global_tid);
717  }
718  }
719 
720  return ret;
721 }
722 
723 static kmpc_thunk_t *
724 __kmp_dequeue_task (kmp_int32 global_tid, kmpc_task_queue_t *queue, int in_parallel)
725 {
726  kmpc_thunk_t *pt;
727  int tid = __kmp_tid_from_gtid( global_tid );
728 
729  KMP_DEBUG_ASSERT (queue->tq_nfull > 0); /* check queue not empty */
730 
731  if (queue->tq.tq_parent != NULL && in_parallel) {
732  int ct;
733  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
734  ct = ++(queue->tq_ref_count);
735  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
736  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n",
737  __LINE__, global_tid, queue, ct));
738  }
739 
740  pt = queue->tq_queue[(queue->tq_tail)++].qs_thunk;
741 
742  if (queue->tq_tail >= queue->tq_nslots)
743  queue->tq_tail = 0;
744 
745  if (in_parallel) {
746  queue->tq_th_thunks[tid].ai_data++;
747 
748  KMP_MB(); /* necessary so ai_data increment is propagated to other threads immediately (digital) */
749 
750  KF_TRACE(200, ("__kmp_dequeue_task: T#%d(:%d) now has %d outstanding thunks from queue %p\n",
751  global_tid, tid, queue->tq_th_thunks[tid].ai_data, queue));
752  }
753 
754  (queue->tq_nfull)--;
755 
756 #ifdef KMP_DEBUG
757  KMP_MB();
758 
759  /* necessary so (queue->tq_nfull > 0) above succeeds after tq_nfull is decremented */
760 
761  KMP_DEBUG_ASSERT(queue->tq_nfull >= 0);
762 
763  if (in_parallel) {
764  KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data <= __KMP_TASKQ_THUNKS_PER_TH);
765  }
766 #endif
767 
768  return pt;
769 }
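/*
 * Illustrative sketch (not part of the original runtime): tq_queue is a
 * fixed-size ring buffer. tq_head is the next slot to fill, tq_tail the next
 * slot to drain, and tq_nfull counts occupied slots, so the full and empty
 * tests above never compare head against tail. The hypothetical example_*
 * helpers below show just the index arithmetic; locking and the per-thread
 * thunk accounting are omitted.
 */

typedef struct example_ring {
    void      **slots;                   /* nslots entries                    */
    kmp_int32   nslots;
    kmp_int32   head;                    /* next slot to fill  (cf. tq_head)  */
    kmp_int32   tail;                    /* next slot to drain (cf. tq_tail)  */
    kmp_int32   nfull;                   /* occupied slots     (cf. tq_nfull) */
} example_ring_t;

/* caller holds the queue lock and has already checked nfull < nslots */
static void
example_ring_put( example_ring_t *r, void *item )
{
    r->slots[r->head++] = item;
    if ( r->head >= r->nslots )          /* wrap, as the enqueue path does */
        r->head = 0;
    r->nfull++;
}

/* caller holds the queue lock and has already checked nfull > 0 */
static void *
example_ring_get( example_ring_t *r )
{
    void *item = r->slots[r->tail++];
    if ( r->tail >= r->nslots )          /* wrap, as the dequeue path does */
        r->tail = 0;
    r->nfull--;
    return item;
}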
770 
771 /*
772  * Find the next (non-null) task to dequeue and return it.
773  * This is never called unless in_parallel=TRUE
774  *
775  * Here are the rules for deciding which queue to take the task from:
776  * 1. Walk up the task queue tree from the current queue's parent and look
777  * for a task in each queue on the way up (for loop, below).
778  * 2. Do a depth-first search back down the tree from the root and
779  * look for a task in each queue (find_task_in_descendant_queue()).
780  *
781  * Here are the rules for deciding which task to take from a queue
782  * (__kmp_find_task_in_queue ()):
783  * 1. Never take the last task from a queue if TQF_IS_LASTPRIVATE; this task
784  * must be staged to make sure we execute the last one with
785  * TQF_IS_LAST_TASK at the end of task queue execution.
786  * 2. If the queue length is below some high water mark and the taskq task
787  * is enqueued, prefer running the taskq task.
788  * 3. Otherwise, take a (normal) task from the queue.
789  *
790  * If we do all this and return pt == NULL at the bottom of this routine,
791  * this means there are no more tasks to execute (except possibly for
792  * TQF_IS_LASTPRIVATE).
793  */
794 
795 static kmpc_thunk_t *
796 __kmp_find_task_in_queue (kmp_int32 global_tid, kmpc_task_queue_t *queue)
797 {
798  kmpc_thunk_t *pt = NULL;
799  int tid = __kmp_tid_from_gtid( global_tid );
800 
801  /* To prevent deadlock from tq_queue_lck if queue already deallocated */
802  if ( !(queue->tq_flags & TQF_DEALLOCATED) ) {
803 
804  __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
805 
806  /* Check again to avoid race in __kmpc_end_taskq() */
807  if ( !(queue->tq_flags & TQF_DEALLOCATED) ) {
808 
809  KMP_MB(); /* make sure data structures are in consistent state before querying them */
810  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
811 
812  if ((queue->tq_taskq_slot != NULL) && (queue->tq_nfull <= queue->tq_hiwat)) {
813  /* if there's enough room in the queue and the dispatcher */
814  /* (taskq task) is available, schedule more tasks */
815  pt = (kmpc_thunk_t *) queue->tq_taskq_slot;
816  queue->tq_taskq_slot = NULL;
817  }
818  else if (queue->tq_nfull == 0 ||
819  queue->tq_th_thunks[tid].ai_data >= __KMP_TASKQ_THUNKS_PER_TH) {
820  /* do nothing if no thunks available or this thread can't */
821  /* run any because it already is executing too many */
822 
823  pt = NULL;
824  }
825  else if (queue->tq_nfull > 1) {
826  /* always safe to schedule a task even if TQF_IS_LASTPRIVATE */
827 
828  pt = __kmp_dequeue_task (global_tid, queue, TRUE);
829  }
830  else if (!(queue->tq_flags & TQF_IS_LASTPRIVATE)) {
831  /* one thing in queue, always safe to schedule if !TQF_IS_LASTPRIVATE */
832 
833  pt = __kmp_dequeue_task (global_tid, queue, TRUE);
834  }
835  else if (queue->tq_flags & TQF_IS_LAST_TASK) {
836  /* TQF_IS_LASTPRIVATE, one thing in queue, kmpc_end_taskq_task() */
837  /* has been run so this is last task, run with TQF_IS_LAST_TASK so */
838  /* instrumentation does copy-out. */
839 
840  pt = __kmp_dequeue_task (global_tid, queue, TRUE);
841  pt->th_flags |= TQF_IS_LAST_TASK; /* don't need test_then_or since already locked */
842  }
843  }
844 
845  /* GEH - What happens here if it is lastprivate, but not the last task? */
846  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
847  }
848 
849  return pt;
850 }
851 
852 /*
853  * Walk a tree of queues starting at queue's first child
854  * and return a non-NULL thunk if one can be scheduled.
855  * Must only be called when in_parallel=TRUE
856  */
857 
858 static kmpc_thunk_t *
859 __kmp_find_task_in_descendant_queue (kmp_int32 global_tid, kmpc_task_queue_t *curr_queue)
860 {
861  kmpc_thunk_t *pt = NULL;
862  kmpc_task_queue_t *queue = curr_queue;
863 
864  if (curr_queue->tq_first_child != NULL) {
865  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
866 
867  KMP_MB(); /* make sure data structures are in consistent state before querying them */
868  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
869 
870  queue = (kmpc_task_queue_t *) curr_queue->tq_first_child;
871  if (queue == NULL) {
872  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
873  return NULL;
874  }
875 
876  while (queue != NULL) {
877  int ct;
878  kmpc_task_queue_t *next;
879 
880  ct = ++(queue->tq_ref_count);
881  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
882  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n",
883  __LINE__, global_tid, queue, ct));
884 
885  pt = __kmp_find_task_in_queue (global_tid, queue);
886 
887  if (pt != NULL) {
888  int ct;
889 
890  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
891 
892  KMP_MB(); /* make sure data structures are in consistent state before querying them */
893  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
894 
895  ct = --(queue->tq_ref_count);
896  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
897  __LINE__, global_tid, queue, ct));
898  KMP_DEBUG_ASSERT( queue->tq_ref_count >= 0 );
899 
900  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
901 
902  return pt;
903  }
904 
905  /* although reference count stays active during descendant walk, shouldn't matter */
906  /* since if children still exist, reference counts aren't being monitored anyway */
907 
908  pt = __kmp_find_task_in_descendant_queue (global_tid, queue);
909 
910  if (pt != NULL) {
911  int ct;
912 
913  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
914 
915  KMP_MB(); /* make sure data structures are in consistent state before querying them */
916  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
917 
918  ct = --(queue->tq_ref_count);
919  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
920  __LINE__, global_tid, queue, ct));
921  KMP_DEBUG_ASSERT( ct >= 0 );
922 
923  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
924 
925  return pt;
926  }
927 
928  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
929 
930  KMP_MB(); /* make sure data structures are in consistent state before querying them */
931  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
932 
933  next = queue->tq_next_child;
934 
935  ct = --(queue->tq_ref_count);
936  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
937  __LINE__, global_tid, queue, ct));
938  KMP_DEBUG_ASSERT( ct >= 0 );
939 
940  queue = next;
941  }
942 
943  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
944  }
945 
946  return pt;
947 }
948 
949 /*
950  * Walk up the taskq tree looking for a task to execute.
951  * If we get to the root, search the tree for a descendent queue task.
952  * Must only be called when in_parallel=TRUE
953  */
954 
955 static kmpc_thunk_t *
956 __kmp_find_task_in_ancestor_queue (kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *curr_queue)
957 {
958  kmpc_task_queue_t *queue;
959  kmpc_thunk_t *pt;
960 
961  pt = NULL;
962 
963  if (curr_queue->tq.tq_parent != NULL) {
964  queue = curr_queue->tq.tq_parent;
965 
966  while (queue != NULL) {
967  if (queue->tq.tq_parent != NULL) {
968  int ct;
969  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
970 
971  KMP_MB(); /* make sure data structures are in consistent state before querying them */
972  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
973 
974  ct = ++(queue->tq_ref_count);
975  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
976  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n",
977  __LINE__, global_tid, queue, ct));
978  }
979 
980  pt = __kmp_find_task_in_queue (global_tid, queue);
981  if (pt != NULL) {
982  if (queue->tq.tq_parent != NULL) {
983  int ct;
984  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
985 
986  KMP_MB(); /* make sure data structures are in consistent state before querying them */
987  /* Seems to work without this call for digital/alpha, needed for IBM/RS6000 */
988 
989  ct = --(queue->tq_ref_count);
990  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
991  __LINE__, global_tid, queue, ct));
992  KMP_DEBUG_ASSERT( ct >= 0 );
993 
994  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
995  }
996 
997  return pt;
998  }
999 
1000  if (queue->tq.tq_parent != NULL) {
1001  int ct;
1002  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1003 
1004  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1005  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1006 
1007  ct = --(queue->tq_ref_count);
1008  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
1009  __LINE__, global_tid, queue, ct));
1010  KMP_DEBUG_ASSERT( ct >= 0 );
1011  }
1012  queue = queue->tq.tq_parent;
1013 
1014  if (queue != NULL)
1015  __kmp_release_lock(& queue->tq_link_lck, global_tid);
1016  }
1017 
1018  }
1019 
1020  pt = __kmp_find_task_in_descendant_queue( global_tid, tq->tq_root );
1021 
1022  return pt;
1023 }
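/*
 * Illustrative sketch (not part of the original runtime): together, the two
 * helpers above give the stealing order a thread follows once its own queue
 * is dry: try each ancestor while walking up the tq.tq_parent links, then do
 * a depth-first walk of the tq_first_child / tq_next_child tree from the
 * root. The descendant walk has the shape of a first-child / next-sibling
 * traversal; the hypothetical example_* sketch below omits the locking and
 * reference counting that the real code needs.
 */

typedef struct example_tree_node {
    struct example_tree_node *first_child;
    struct example_tree_node *next_sibling;
} example_tree_node_t;

static void *
example_find_work_in_descendants( example_tree_node_t *q,
                                  void *(*try_queue)( example_tree_node_t * ) )
{
    example_tree_node_t *child;
    void *work;

    for ( child = q->first_child; child != NULL; child = child->next_sibling ) {
        if ( (work = try_queue( child )) != NULL )   /* try the child itself    */
            return work;
        if ( (work = example_find_work_in_descendants( child, try_queue )) != NULL )
            return work;                             /* or anything beneath it  */
    }

    return NULL;                                     /* no runnable work found  */
}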
1024 
1025 static int
1026 __kmp_taskq_tasks_finished (kmpc_task_queue_t *queue)
1027 {
1028  int i;
1029 
1030  /* KMP_MB(); *//* is this really necessary? */
1031 
1032  for (i=0; i<queue->tq_nproc; i++) {
1033  if (queue->tq_th_thunks[i].ai_data != 0)
1034  return FALSE;
1035  }
1036 
1037  return TRUE;
1038 }
1039 
1040 static int
1041 __kmp_taskq_has_any_children (kmpc_task_queue_t *queue)
1042 {
1043  return (queue->tq_first_child != NULL);
1044 }
1045 
1046 static void
1047 __kmp_remove_queue_from_tree( kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *queue, int in_parallel )
1048 {
1049 #ifdef KMP_DEBUG
1050  kmp_int32 i;
1051  kmpc_thunk_t *thunk;
1052 #endif
1053 
1054  KF_TRACE(50, ("Before Deletion of TaskQ at %p on (%d):\n", queue, global_tid));
1055  KF_DUMP(50, __kmp_dump_task_queue( tq, queue, global_tid ));
1056 
1057  /* sub-queue in a recursion, not the root task queue */
1058  KMP_DEBUG_ASSERT (queue->tq.tq_parent != NULL);
1059 
1060  if (in_parallel) {
1061  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1062 
1063  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1064  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1065  }
1066 
1067  KMP_DEBUG_ASSERT (queue->tq_first_child == NULL);
1068 
1069  /* unlink queue from its siblings if any at this level */
1070  if (queue->tq_prev_child != NULL)
1071  queue->tq_prev_child->tq_next_child = queue->tq_next_child;
1072  if (queue->tq_next_child != NULL)
1073  queue->tq_next_child->tq_prev_child = queue->tq_prev_child;
1074  if (queue->tq.tq_parent->tq_first_child == queue)
1075  queue->tq.tq_parent->tq_first_child = queue->tq_next_child;
1076 
1077  queue->tq_prev_child = NULL;
1078  queue->tq_next_child = NULL;
1079 
1080  if (in_parallel) {
1081  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p waiting for ref_count of %d to reach 1\n",
1082  __LINE__, global_tid, queue, queue->tq_ref_count));
1083 
1084  /* wait until all other threads have stopped accessing this queue */
1085  while (queue->tq_ref_count > 1) {
1086  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1087 
1088  KMP_WAIT_YIELD((volatile kmp_uint32*)&queue->tq_ref_count, 1, KMP_LE, NULL);
1089 
1090  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1091 
1092  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1093  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1094  }
1095 
1096  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1097  }
1098 
1099  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p freeing queue\n",
1100  __LINE__, global_tid, queue));
1101 
1102 #ifdef KMP_DEBUG
1103  KMP_DEBUG_ASSERT(queue->tq_flags & TQF_ALL_TASKS_QUEUED);
1104  KMP_DEBUG_ASSERT(queue->tq_nfull == 0);
1105 
1106  for (i=0; i<queue->tq_nproc; i++) {
1107  KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0);
1108  }
1109 
1110  i = 0;
1111  for (thunk=queue->tq_free_thunks; thunk != NULL; thunk=thunk->th.th_next_free)
1112  ++i;
1113 
1114  KMP_ASSERT (i == queue->tq_nslots + (queue->tq_nproc * __KMP_TASKQ_THUNKS_PER_TH));
1115 #endif
1116 
1117  /* release storage for queue entry */
1118  __kmp_free_taskq ( tq, queue, TRUE, global_tid );
1119 
1120  KF_TRACE(50, ("After Deletion of TaskQ at %p on (%d):\n", queue, global_tid));
1121  KF_DUMP(50, __kmp_dump_task_queue_tree( tq, tq->tq_root, global_tid ));
1122 }
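/*
 * Illustrative sketch (not part of the original runtime): the heart of
 * __kmp_remove_queue_from_tree is a doubly linked sibling-list unlink done
 * while holding the parent's tq_link_lck, followed by waiting for
 * tq_ref_count to drop to 1. The hypothetical example_* sketch below shows
 * only the unlink step.
 */

typedef struct example_qnode {
    struct example_qnode *first_child;
    struct example_qnode *prev_child;
    struct example_qnode *next_child;
} example_qnode_t;

static void
example_unlink_child( example_qnode_t *parent, example_qnode_t *q )
{
    /* splice q out of its parent's child list, mirroring the tq_prev_child / */
    /* tq_next_child / tq_first_child updates above                           */
    if ( q->prev_child != NULL )
        q->prev_child->next_child = q->next_child;
    if ( q->next_child != NULL )
        q->next_child->prev_child = q->prev_child;
    if ( parent->first_child == q )
        parent->first_child = q->next_child;

    q->prev_child = NULL;
    q->next_child = NULL;
}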
1123 
1124 /*
1125  * Starting from indicated queue, proceed downward through tree and
1126  * remove all taskqs which are finished, but only go down to taskqs
1127  * which have the "nowait" clause present. Assume this is only called
1128  * when in_parallel=TRUE.
1129  */
1130 
1131 static void
1132 __kmp_find_and_remove_finished_child_taskq( kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *curr_queue )
1133 {
1134  kmpc_task_queue_t *queue = curr_queue;
1135 
1136  if (curr_queue->tq_first_child != NULL) {
1137  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
1138 
1139  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1140  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1141 
1142  queue = (kmpc_task_queue_t *) curr_queue->tq_first_child;
1143  if (queue == NULL) {
1144  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
1145  return;
1146  }
1147 
1148  while (queue != NULL) {
1149  kmpc_task_queue_t *next;
1150  int ct = ++(queue->tq_ref_count);
1151  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n",
1152  __LINE__, global_tid, queue, ct));
1153 
1154 
1155  /* although reference count stays active during descendant walk, */
1156  /* shouldn't matter since if children still exist, reference */
1157  /* counts aren't being monitored anyway */
1158 
1159  if (queue->tq_flags & TQF_IS_NOWAIT) {
1160  __kmp_find_and_remove_finished_child_taskq ( tq, global_tid, queue );
1161 
1162  if ((queue->tq_flags & TQF_ALL_TASKS_QUEUED) && (queue->tq_nfull == 0) &&
1163  __kmp_taskq_tasks_finished(queue) && ! __kmp_taskq_has_any_children(queue)) {
1164 
1165  /*
1166  Only remove this if we have not already marked it for deallocation.
1167  This should prevent multiple threads from trying to free this.
1168  */
1169 
1170  if ( __kmp_test_lock(& queue->tq_queue_lck, global_tid) ) {
1171  if ( !(queue->tq_flags & TQF_DEALLOCATED) ) {
1172  queue->tq_flags |= TQF_DEALLOCATED;
1173  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
1174 
1175  __kmp_remove_queue_from_tree( tq, global_tid, queue, TRUE );
1176 
1177  /* Can't do any more here since can't be sure where sibling queue is so just exit this level */
1178  return;
1179  }
1180  else {
1181  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
1182  }
1183  }
1184  /* otherwise, just fall through and decrement reference count */
1185  }
1186  }
1187 
1188  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
1189 
1190  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1191  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1192 
1193  next = queue->tq_next_child;
1194 
1195  ct = --(queue->tq_ref_count);
1196  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
1197  __LINE__, global_tid, queue, ct));
1198  KMP_DEBUG_ASSERT( ct >= 0 );
1199 
1200  queue = next;
1201  }
1202 
1203  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
1204  }
1205 }
1206 
1207 /*
1208  * Starting from indicated queue, proceed downward through tree and
1209  * remove all taskqs, assuming all are finished and
1210  * assuming NO other threads are executing at this point.
1211  */
1212 
1213 static void
1214 __kmp_remove_all_child_taskq( kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *queue )
1215 {
1216  kmpc_task_queue_t *next_child;
1217 
1218  queue = (kmpc_task_queue_t *) queue->tq_first_child;
1219 
1220  while (queue != NULL) {
1221  __kmp_remove_all_child_taskq ( tq, global_tid, queue );
1222 
1223  next_child = queue->tq_next_child;
1224  queue->tq_flags |= TQF_DEALLOCATED;
1225  __kmp_remove_queue_from_tree ( tq, global_tid, queue, FALSE );
1226  queue = next_child;
1227  }
1228 }
1229 
1230 static void
1231 __kmp_execute_task_from_queue( kmp_taskq_t *tq, ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk, int in_parallel )
1232 {
1233  kmpc_task_queue_t *queue = thunk->th.th_shareds->sv_queue;
1234  kmp_int32 tid = __kmp_tid_from_gtid( global_tid );
1235 
1236  KF_TRACE(100, ("After dequeueing this Task on (%d):\n", global_tid));
1237  KF_DUMP(100, __kmp_dump_thunk( tq, thunk, global_tid ));
1238  KF_TRACE(100, ("Task Queue: %p looks like this (%d):\n", queue, global_tid));
1239  KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
1240 
1241  /*
1242  * For the taskq task, the curr_thunk pushes and pop pairs are set up as follows:
1243  *
1244  * happens exactly once:
1245  * 1) __kmpc_taskq : push (if returning thunk only)
1246  * 4) __kmpc_end_taskq_task : pop
1247  *
1248  * optionally happens *each* time taskq task is dequeued/enqueued:
1249  * 2) __kmpc_taskq_task : pop
1250  * 3) __kmp_execute_task_from_queue : push
1251  *
1252  * execution ordering: 1,(2,3)*,4
1253  */
1254 
1255  if (!(thunk->th_flags & TQF_TASKQ_TASK)) {
1256  kmp_int32 index = (queue == tq->tq_root) ? tid : 0;
1257  thunk->th.th_shareds = (kmpc_shared_vars_t *) queue->tq_shareds[index].ai_data;
1258 
1259  if ( __kmp_env_consistency_check ) {
1260  __kmp_push_workshare( global_tid,
1261  (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered : ct_task,
1262  queue->tq_loc );
1263  }
1264  }
1265  else {
1266  if ( __kmp_env_consistency_check )
1267  __kmp_push_workshare( global_tid, ct_taskq, queue->tq_loc );
1268  }
1269 
1270  if (in_parallel) {
1271  thunk->th_encl_thunk = tq->tq_curr_thunk[tid];
1272  tq->tq_curr_thunk[tid] = thunk;
1273 
1274  KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
1275  }
1276 
1277  KF_TRACE( 50, ("Begin Executing Thunk %p from queue %p on (%d)\n", thunk, queue, global_tid));
1278  thunk->th_task (global_tid, thunk);
1279  KF_TRACE( 50, ("End Executing Thunk %p from queue %p on (%d)\n", thunk, queue, global_tid));
1280 
1281  if (!(thunk->th_flags & TQF_TASKQ_TASK)) {
1282  if ( __kmp_env_consistency_check )
1283  __kmp_pop_workshare( global_tid, (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered : ct_task,
1284  queue->tq_loc );
1285 
1286  if (in_parallel) {
1287  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
1288  thunk->th_encl_thunk = NULL;
1289  KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
1290  }
1291 
1292  if ((thunk->th_flags & TQF_IS_ORDERED) && in_parallel) {
1293  __kmp_taskq_check_ordered(global_tid, thunk);
1294  }
1295 
1296  __kmp_free_thunk (queue, thunk, in_parallel, global_tid);
1297 
1298  KF_TRACE(100, ("T#%d After freeing thunk: %p, TaskQ looks like this:\n", global_tid, thunk));
1299  KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
1300 
1301  if (in_parallel) {
1302  KMP_MB(); /* needed so thunk put on free list before outstanding thunk count is decremented */
1303 
1304  KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data >= 1);
1305 
1306  KF_TRACE( 200, ("__kmp_execute_task_from_queue: T#%d has %d thunks in queue %p\n",
1307  global_tid, queue->tq_th_thunks[tid].ai_data-1, queue));
1308 
1309  queue->tq_th_thunks[tid].ai_data--;
1310 
1311  /* KMP_MB(); */ /* is MB really necessary ? */
1312  }
1313 
1314  if (queue->tq.tq_parent != NULL && in_parallel) {
1315  int ct;
1316  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1317  ct = --(queue->tq_ref_count);
1318  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1319  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
1320  __LINE__, global_tid, queue, ct));
1321  KMP_DEBUG_ASSERT( ct >= 0 );
1322  }
1323  }
1324 }
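/*
 * Illustrative sketch (not part of the original runtime): tq_curr_thunk[tid]
 * and th_encl_thunk form a per-thread stack of thunks. Executing a task
 * pushes it on top of whatever the thread was already running, and finishing
 * it pops back to the enclosing thunk; that is the push/pop pairing the
 * comment at the top of this function describes. A minimal sketch with
 * hypothetical example_* names:
 */

typedef struct example_frame {
    struct example_frame *enclosing;     /* cf. th_encl_thunk */
} example_frame_t;

static void
example_push_frame( example_frame_t **top, example_frame_t *f )
{
    f->enclosing = *top;                 /* remember what the thread was running */
    *top = f;                            /* cf. tq_curr_thunk[tid] = thunk       */
}

static void
example_pop_frame( example_frame_t **top )
{
    example_frame_t *f = *top;
    *top = f->enclosing;                 /* restore the enclosing thunk          */
    f->enclosing = NULL;
}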
1325 
1326 /* --------------------------------------------------------------------------- */
1327 
1328 /* starts a taskq; creates and returns a thunk for the taskq_task */
1329 /* also, returns pointer to shared vars for this thread in "shareds" arg */
1330 
1331 kmpc_thunk_t *
1332 __kmpc_taskq( ident_t *loc, kmp_int32 global_tid, kmpc_task_t taskq_task,
1333  size_t sizeof_thunk, size_t sizeof_shareds,
1334  kmp_int32 flags, kmpc_shared_vars_t **shareds )
1335 {
1336  int in_parallel;
1337  kmp_int32 nslots, nthunks, nshareds, nproc;
1338  kmpc_task_queue_t *new_queue, *curr_queue;
1339  kmpc_thunk_t *new_taskq_thunk;
1340  kmp_info_t *th;
1341  kmp_team_t *team;
1342  kmp_taskq_t *tq;
1343  kmp_int32 tid;
1344 
1345  KE_TRACE( 10, ("__kmpc_taskq called (%d)\n", global_tid));
1346 
1347  th = __kmp_threads[ global_tid ];
1348  team = th -> th.th_team;
1349  tq = & team -> t.t_taskq;
1350  nproc = team -> t.t_nproc;
1351  tid = __kmp_tid_from_gtid( global_tid );
1352 
1353  /* find out whether this is a parallel taskq or serialized one. */
1354  in_parallel = in_parallel_context( team );
1355 
1356  if( ! tq->tq_root ) {
1357  if (in_parallel) {
1358  /* Vector ORDERED SECTION to taskq version */
1359  th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo;
1360 
1361  /* Vector ORDERED SECTION to taskq version */
1362  th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo;
1363  }
1364 
1365  if (in_parallel) {
1366  /* This shouldn't be a barrier region boundary, it will confuse the user. */
1367  /* Need the boundary to be at the end of the taskq instead. */
1368  if ( __kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL )) {
1369  /* Creating the active root queue, and we are not the master thread. */
1370  /* The master thread below created the queue and tasks have been */
1371  /* enqueued, and the master thread released this barrier. This */
1372  /* worker thread can now proceed and execute tasks. See also the */
1373  /* TQF_RELEASE_WORKERS which is used to handle this case. */
1374 
1375  *shareds = (kmpc_shared_vars_t *) tq->tq_root->tq_shareds[tid].ai_data;
1376 
1377  KE_TRACE( 10, ("__kmpc_taskq return (%d)\n", global_tid));
1378 
1379  return NULL;
1380  }
1381  }
1382 
1383  /* master thread only executes this code */
1384 
1385  if( tq->tq_curr_thunk_capacity < nproc ) {
1386  if(tq->tq_curr_thunk)
1387  __kmp_free(tq->tq_curr_thunk);
1388  else {
1389  /* only need to do this once at outer level, i.e. when tq_curr_thunk is still NULL */
1390  __kmp_init_lock( & tq->tq_freelist_lck );
1391  }
1392 
1393  tq->tq_curr_thunk = (kmpc_thunk_t **) __kmp_allocate( nproc * sizeof(kmpc_thunk_t *) );
1394  tq -> tq_curr_thunk_capacity = nproc;
1395  }
1396 
1397  if (in_parallel)
1398  tq->tq_global_flags = TQF_RELEASE_WORKERS;
1399  }
1400 
1401  /* dkp: in future, if flags & TQF_HEURISTICS, will choose nslots based */
1402  /* on some heuristics (e.g., depth of queue nesting?). */
1403 
1404  nslots = (in_parallel) ? (2 * nproc) : 1;
1405 
1406  /* There must be nproc * __KMP_TASKQ_THUNKS_PER_TH extra slots for pending */
1407  /* jobs being executed by other threads, and one extra for taskq slot */
1408 
1409  nthunks = (in_parallel) ? (nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH) + 1) : nslots + 2;
1410 
1411  /* Only the root taskq gets a per-thread array of shareds. */
1412  /* The rest of the taskqs get only one copy of the shared vars. */
1413 
1414  nshareds = ( !tq->tq_root && in_parallel) ? nproc : 1;
1415 
1416  /* create overall queue data structure and its components that require allocation */
1417 
1418  new_queue = __kmp_alloc_taskq ( tq, in_parallel, nslots, nthunks, nshareds, nproc,
1419  sizeof_thunk, sizeof_shareds, &new_taskq_thunk, global_tid );
1420 
1421  /* rest of new_queue initializations */
1422 
1423  new_queue->tq_flags = flags & TQF_INTERFACE_FLAGS;
1424 
1425  if (in_parallel) {
1426  new_queue->tq_tasknum_queuing = 0;
1427  new_queue->tq_tasknum_serving = 0;
1428  new_queue->tq_flags |= TQF_PARALLEL_CONTEXT;
1429  }
1430 
1431  new_queue->tq_taskq_slot = NULL;
1432  new_queue->tq_nslots = nslots;
1433  new_queue->tq_hiwat = HIGH_WATER_MARK (nslots);
1434  new_queue->tq_nfull = 0;
1435  new_queue->tq_head = 0;
1436  new_queue->tq_tail = 0;
1437  new_queue->tq_loc = loc;
1438 
1439  if ((new_queue->tq_flags & TQF_IS_ORDERED) && in_parallel) {
1440  /* prepare to serve the first-queued task's ORDERED directive */
1441  new_queue->tq_tasknum_serving = 1;
1442 
1443  /* Vector ORDERED SECTION to taskq version */
1444  th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo;
1445 
1446  /* Vector ORDERED SECTION to taskq version */
1447  th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo;
1448  }
1449 
1450  /* create a new thunk for the taskq_task in the new_queue */
1451  *shareds = (kmpc_shared_vars_t *) new_queue->tq_shareds[0].ai_data;
1452 
1453  new_taskq_thunk->th.th_shareds = *shareds;
1454  new_taskq_thunk->th_task = taskq_task;
1455  new_taskq_thunk->th_flags = new_queue->tq_flags | TQF_TASKQ_TASK;
1456  new_taskq_thunk->th_status = 0;
1457 
1458  KMP_DEBUG_ASSERT (new_taskq_thunk->th_flags & TQF_TASKQ_TASK);
1459 
1460  /* KMP_MB(); */ /* make sure these inits complete before threads start using this queue (necessary?) */
1461 
1462  /* insert the new task queue into the tree, but only after all fields initialized */
1463 
1464  if (in_parallel) {
1465  if( ! tq->tq_root ) {
1466  new_queue->tq.tq_parent = NULL;
1467  new_queue->tq_first_child = NULL;
1468  new_queue->tq_next_child = NULL;
1469  new_queue->tq_prev_child = NULL;
1470  new_queue->tq_ref_count = 1;
1471  tq->tq_root = new_queue;
1472  }
1473  else {
1474  curr_queue = tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue;
1475  new_queue->tq.tq_parent = curr_queue;
1476  new_queue->tq_first_child = NULL;
1477  new_queue->tq_prev_child = NULL;
1478  new_queue->tq_ref_count = 1; /* for the thread that built the queue */
1479 
1480  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p alloc %d\n",
1481  __LINE__, global_tid, new_queue, new_queue->tq_ref_count));
1482 
1483  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
1484 
1485  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1486  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1487 
1488  new_queue->tq_next_child = (struct kmpc_task_queue_t *) curr_queue->tq_first_child;
1489 
1490  if (curr_queue->tq_first_child != NULL)
1491  curr_queue->tq_first_child->tq_prev_child = new_queue;
1492 
1493  curr_queue->tq_first_child = new_queue;
1494 
1495  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
1496  }
1497 
1498  /* set up thunk stack only after code that determines curr_queue above */
1499  new_taskq_thunk->th_encl_thunk = tq->tq_curr_thunk[tid];
1500  tq->tq_curr_thunk[tid] = new_taskq_thunk;
1501 
1502  KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
1503  }
1504  else {
1505  new_taskq_thunk->th_encl_thunk = 0;
1506  new_queue->tq.tq_parent = NULL;
1507  new_queue->tq_first_child = NULL;
1508  new_queue->tq_next_child = NULL;
1509  new_queue->tq_prev_child = NULL;
1510  new_queue->tq_ref_count = 1;
1511  }
1512 
1513 #ifdef KMP_DEBUG
1514  KF_TRACE(150, ("Creating TaskQ Task on (%d):\n", global_tid));
1515  KF_DUMP(150, __kmp_dump_thunk( tq, new_taskq_thunk, global_tid ));
1516 
1517  if (in_parallel) {
1518  KF_TRACE(25, ("After TaskQ at %p Creation on (%d):\n", new_queue, global_tid));
1519  } else {
1520  KF_TRACE(25, ("After Serial TaskQ at %p Creation on (%d):\n", new_queue, global_tid));
1521  }
1522 
1523  KF_DUMP(25, __kmp_dump_task_queue( tq, new_queue, global_tid ));
1524 
1525  if (in_parallel) {
1526  KF_DUMP(50, __kmp_dump_task_queue_tree( tq, tq->tq_root, global_tid ));
1527  }
1528 #endif /* KMP_DEBUG */
1529 
1530  if ( __kmp_env_consistency_check )
1531  __kmp_push_workshare( global_tid, ct_taskq, new_queue->tq_loc );
1532 
1533  KE_TRACE( 10, ("__kmpc_taskq return (%d)\n", global_tid));
1534 
1535  return new_taskq_thunk;
1536 }
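/*
 * Illustrative sketch (not part of the original runtime): the parallel-case
 * sizing used above, restated for a hypothetical root queue. With nproc = 4
 * this gives an 8-slot ring, 8 + 4 * __KMP_TASKQ_THUNKS_PER_TH + 1 thunks
 * (the extra one is the taskq task's own thunk), and a per-thread copy of the
 * shared variables; nested or serialized queues get a single copy instead.
 */

static void
example_taskq_sizing( kmp_int32 nproc, int is_root_queue,
                      kmp_int32 *nslots, kmp_int32 *nthunks, kmp_int32 *nshareds )
{
    *nslots   = 2 * nproc;                                        /* ring capacity                   */
    *nthunks  = *nslots + nproc * __KMP_TASKQ_THUNKS_PER_TH + 1;  /* + in-flight thunks + taskq task */
    *nshareds = is_root_queue ? nproc : 1;                        /* per-thread shareds at the root  */
}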
1537 
1538 
1539 /* ends a taskq; last thread out destroys the queue */
1540 
1541 void
1542 __kmpc_end_taskq(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *taskq_thunk)
1543 {
1544 #ifdef KMP_DEBUG
1545  kmp_int32 i;
1546 #endif
1547  kmp_taskq_t *tq;
1548  int in_parallel;
1549  kmp_info_t *th;
1550  kmp_int32 is_outermost;
1551  kmpc_task_queue_t *queue;
1552  kmpc_thunk_t *thunk;
1553  int nproc;
1554 
1555  KE_TRACE( 10, ("__kmpc_end_taskq called (%d)\n", global_tid));
1556 
1557  tq = & __kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
1558  nproc = __kmp_threads[global_tid] -> th.th_team -> t.t_nproc;
1559 
1560  /* For the outermost taskq only, all but one thread will have taskq_thunk == NULL */
1561  queue = (taskq_thunk == NULL) ? tq->tq_root : taskq_thunk->th.th_shareds->sv_queue;
1562 
1563  KE_TRACE( 50, ("__kmpc_end_taskq queue=%p (%d) \n", queue, global_tid));
1564  is_outermost = (queue == tq->tq_root);
1565  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
1566 
1567  if (in_parallel) {
1568  kmp_uint32 spins;
1569 
1570  /* this is just a safeguard to release the waiting threads if */
1571  /* the outermost taskq never queues a task */
1572 
1573  if (is_outermost && (KMP_MASTER_GTID( global_tid ))) {
1574  if( tq->tq_global_flags & TQF_RELEASE_WORKERS ) {
1575  /* no lock needed, workers are still in spin mode */
1576  tq->tq_global_flags &= ~TQF_RELEASE_WORKERS;
1577 
1578  __kmp_end_split_barrier( bs_plain_barrier, global_tid );
1579  }
1580  }
1581 
1582  /* keep dequeueing work until all tasks are queued and dequeued */
1583 
1584  do {
1585  /* wait until something is available to dequeue */
1586  KMP_INIT_YIELD(spins);
1587 
1588  while ( (queue->tq_nfull == 0)
1589  && (queue->tq_taskq_slot == NULL)
1590  && (! __kmp_taskq_has_any_children(queue) )
1591  && (! (queue->tq_flags & TQF_ALL_TASKS_QUEUED) )
1592  ) {
1593  KMP_YIELD_WHEN( TRUE, spins );
1594  }
1595 
1596  /* check to see if we can execute tasks in the queue */
1597  while ( ( (queue->tq_nfull != 0) || (queue->tq_taskq_slot != NULL) )
1598  && (thunk = __kmp_find_task_in_queue(global_tid, queue)) != NULL
1599  ) {
1600  KF_TRACE(50, ("Found thunk: %p in primary queue %p (%d)\n", thunk, queue, global_tid));
1601  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1602  }
1603 
1604  /* see if work can be found in a descendant queue */
1605  if ( (__kmp_taskq_has_any_children(queue))
1606  && (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) != NULL
1607  ) {
1608 
1609  KF_TRACE(50, ("Stole thunk: %p in descendant queue: %p while waiting in queue: %p (%d)\n",
1610  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid ));
1611 
1612  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1613  }
1614 
1615  } while ( (! (queue->tq_flags & TQF_ALL_TASKS_QUEUED))
1616  || (queue->tq_nfull != 0)
1617  );
1618 
1619  KF_TRACE(50, ("All tasks queued and dequeued in queue: %p (%d)\n", queue, global_tid));
1620 
1621  /* keep executing while not all tasks are finished and more work can be
1622  found in descendant queues */
1623 
1624  while ( (!__kmp_taskq_tasks_finished(queue))
1625  && (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) != NULL
1626  ) {
1627 
1628  KF_TRACE(50, ("Stole thunk: %p in descendant queue: %p while waiting in queue: %p (%d)\n",
1629  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1630 
1631  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1632  }
1633 
1634  KF_TRACE(50, ("No work found in descendant queues or all work finished in queue: %p (%d)\n", queue, global_tid));
1635 
1636  if (!is_outermost) {
1637  /* need to return if NOWAIT present and not outermost taskq */
1638 
1639  if (queue->tq_flags & TQF_IS_NOWAIT) {
1640  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1641  queue->tq_ref_count--;
1642  KMP_DEBUG_ASSERT( queue->tq_ref_count >= 0 );
1643  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1644 
1645  KE_TRACE( 10, ("__kmpc_end_taskq return for nowait case (%d)\n", global_tid));
1646 
1647  return;
1648  }
1649 
1650  __kmp_find_and_remove_finished_child_taskq( tq, global_tid, queue );
1651 
1652  /* WAIT until all tasks are finished and no child queues exist before proceeding */
1653  KMP_INIT_YIELD(spins);
1654 
1655  while (!__kmp_taskq_tasks_finished(queue) || __kmp_taskq_has_any_children(queue)) {
1656  thunk = __kmp_find_task_in_ancestor_queue( tq, global_tid, queue );
1657 
1658  if (thunk != NULL) {
1659  KF_TRACE(50, ("Stole thunk: %p in ancestor queue: %p while waiting in queue: %p (%d)\n",
1660  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1661  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1662  }
1663 
1664  KMP_YIELD_WHEN( thunk == NULL, spins );
1665 
1666  __kmp_find_and_remove_finished_child_taskq( tq, global_tid, queue );
1667  }
1668 
1669  __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
1670  if ( !(queue->tq_flags & TQF_DEALLOCATED) ) {
1671  queue->tq_flags |= TQF_DEALLOCATED;
1672  }
1673  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
1674 
1675  /* only the allocating thread can deallocate the queue */
1676  if (taskq_thunk != NULL) {
1677  __kmp_remove_queue_from_tree( tq, global_tid, queue, TRUE );
1678  }
1679 
1680  KE_TRACE( 10, ("__kmpc_end_taskq return for non_outermost queue, wait case (%d)\n", global_tid));
1681 
1682  return;
1683  }
1684 
1685  /* Outermost Queue: steal work from descendants until all tasks are finished */
1686 
1687  KMP_INIT_YIELD(spins);
1688 
1689  while (!__kmp_taskq_tasks_finished(queue)) {
1690  thunk = __kmp_find_task_in_descendant_queue(global_tid, queue);
1691 
1692  if (thunk != NULL) {
1693  KF_TRACE(50, ("Stole thunk: %p in descendant queue: %p while waiting in queue: %p (%d)\n",
1694  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1695 
1696  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1697  }
1698 
1699  KMP_YIELD_WHEN( thunk == NULL, spins );
1700  }
1701 
1702  /* Need this barrier to prevent destruction of queue before threads have all executed above code */
1703  /* This may need to be done earlier when NOWAIT is implemented for the outermost level */
1704 
1705  if ( !__kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL )) {
1706  /* the queue->tq_flags & TQF_IS_NOWAIT case is not yet handled here; */
1707  /* for right now, everybody waits, and the master thread destroys the */
1708  /* remaining queues. */
1709 
1710  __kmp_remove_all_child_taskq( tq, global_tid, queue );
1711 
1712  /* Now destroy the root queue */
1713  KF_TRACE(100, ("T#%d Before Deletion of top-level TaskQ at %p:\n", global_tid, queue ));
1714  KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
1715 
1716 #ifdef KMP_DEBUG
1717  /* the root queue entry */
1718  KMP_DEBUG_ASSERT ((queue->tq.tq_parent == NULL) && (queue->tq_next_child == NULL));
1719 
1720  /* children must all be gone by now because of barrier above */
1721  KMP_DEBUG_ASSERT (queue->tq_first_child == NULL);
1722 
1723  for (i=0; i<nproc; i++) {
1724  KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0);
1725  }
1726 
1727  for (i=0, thunk=queue->tq_free_thunks; thunk != NULL; i++, thunk=thunk->th.th_next_free); /* count the free thunks */
1728 
1729  KMP_DEBUG_ASSERT (i == queue->tq_nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH));
1730 
1731  for (i = 0; i < nproc; i++) {
1732  KMP_DEBUG_ASSERT( ! tq->tq_curr_thunk[i] );
1733  }
1734 #endif
1735  /* unlink the root queue entry */
1736  tq -> tq_root = NULL;
1737 
1738  /* release storage for root queue entry */
1739  KF_TRACE(50, ("After Deletion of top-level TaskQ at %p on (%d):\n", queue, global_tid));
1740 
1741  queue->tq_flags |= TQF_DEALLOCATED;
1742  __kmp_free_taskq ( tq, queue, in_parallel, global_tid );
1743 
1744  KF_DUMP(50, __kmp_dump_task_queue_tree( tq, tq->tq_root, global_tid ));
1745 
1746  /* release the workers now that the data structures are up to date */
1747  __kmp_end_split_barrier( bs_plain_barrier, global_tid );
1748  }
1749 
1750  th = __kmp_threads[ global_tid ];
1751 
1752  /* Reset ORDERED SECTION to parallel version */
1753  th->th.th_dispatch->th_deo_fcn = 0;
1754 
1755  /* Reset ORDERED SECTION to parallel version */
1756  th->th.th_dispatch->th_dxo_fcn = 0;
1757  }
1758  else {
1759  /* in serial execution context, dequeue the last task */
1760  /* and execute it, if there were any tasks encountered */
1761 
1762  if (queue->tq_nfull > 0) {
1763  KMP_DEBUG_ASSERT(queue->tq_nfull == 1);
1764 
1765  thunk = __kmp_dequeue_task(global_tid, queue, in_parallel);
1766 
1767  if (queue->tq_flags & TQF_IS_LAST_TASK) {
1768  /* TQF_IS_LASTPRIVATE: only one task is left in the queue and */
1769  /* __kmpc_end_taskq_task() has already run, so this is the last task; run it */
1770  /* with TQF_IS_LAST_TASK set so the instrumentation does the copy-out. */
1771 
1772  /* no need for test_then_or call since already locked */
1773  thunk->th_flags |= TQF_IS_LAST_TASK;
1774  }
1775 
1776  KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid, thunk, queue));
1777 
1778  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1779  }
1780 
1781  /* destroy the unattached serial queue now that there is no more work to do */
1782  KF_TRACE(100, ("Before Deletion of Serialized TaskQ at %p on (%d):\n", queue, global_tid));
1783  KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
1784 
1785 #ifdef KMP_DEBUG
1786  i = 0;
1787  for (thunk=queue->tq_free_thunks; thunk != NULL; thunk=thunk->th.th_next_free)
1788  ++i;
1789  KMP_DEBUG_ASSERT (i == queue->tq_nslots + 1);
1790 #endif
1791  /* release storage for unattached serial queue */
1792  KF_TRACE(50, ("Serialized TaskQ at %p deleted on (%d).\n", queue, global_tid));
1793 
1794  queue->tq_flags |= TQF_DEALLOCATED;
1795  __kmp_free_taskq ( tq, queue, in_parallel, global_tid );
1796  }
1797 
1798  KE_TRACE( 10, ("__kmpc_end_taskq return (%d)\n", global_tid));
1799 }
1800 
1801 /* Enqueues a task for the thunk previously created by __kmpc_task_buffer. */
1802 /* Returns nonzero if it just filled up the queue. */
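/* Note (summarizing the body below): in a parallel context an ordered task is */
/* assigned its sequence number here; in a serialized context the queue holds at */
/* most one buffered task, so any previously buffered task is dequeued and */
/* executed before the new thunk is enqueued. */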
1803 
1804 kmp_int32
1805 __kmpc_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk)
1806 {
1807  kmp_int32 ret;
1808  kmpc_task_queue_t *queue;
1809  int in_parallel;
1810  kmp_taskq_t *tq;
1811 
1812  KE_TRACE( 10, ("__kmpc_task called (%d)\n", global_tid));
1813 
1814  KMP_DEBUG_ASSERT (!(thunk->th_flags & TQF_TASKQ_TASK)); /* thunk->th_task is a regular task */
1815 
1816  tq = &__kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
1817  queue = thunk->th.th_shareds->sv_queue;
1818  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
1819 
1820  if (in_parallel && (thunk->th_flags & TQF_IS_ORDERED))
1821  thunk->th_tasknum = ++queue->tq_tasknum_queuing;
1822 
1823  /* For serial execution dequeue the preceding task and execute it, if one exists */
1824  /* This cannot be the last task. That one is handled in __kmpc_end_taskq */
1825 
1826  if (!in_parallel && queue->tq_nfull > 0) {
1827  kmpc_thunk_t *prev_thunk;
1828 
1829  KMP_DEBUG_ASSERT(queue->tq_nfull == 1);
1830 
1831  prev_thunk = __kmp_dequeue_task(global_tid, queue, in_parallel);
1832 
1833  KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid, prev_thunk, queue));
1834 
1835  __kmp_execute_task_from_queue( tq, loc, global_tid, prev_thunk, in_parallel );
1836  }
1837 
1838  /* The instrumentation sequence is: __kmpc_task_buffer(), initialize private */
1839  /* variables, __kmpc_task(). The __kmpc_task_buffer routine checks that the */
1840  /* task queue is not full and allocates a thunk (which is then passed to */
1841  /* __kmpc_task()). So, the enqueue below should never fail due to a full queue. */
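    /* As a rough illustration of that sequence (a sketch only; identifier names  */
    /* below are illustrative, not actual compiler-generated names):              */
    /*                                                                            */
    /*     kmpc_thunk_t *t = __kmpc_task_buffer(loc, gtid, taskq_thunk, task_fn); */
    /*     ... initialize the private copies stored in *t ...                     */
    /*     (void) __kmpc_task(loc, gtid, t);                                      */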
1842 
1843  KF_TRACE(100, ("Enqueueing this Task on (%d):\n", global_tid));
1844  KF_DUMP(100, __kmp_dump_thunk( tq, thunk, global_tid ));
1845 
1846  ret = __kmp_enqueue_task ( tq, global_tid, queue, thunk, in_parallel );
1847 
1848  KF_TRACE(100, ("Task Queue looks like this on (%d):\n", global_tid));
1849  KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
1850 
1851  KE_TRACE( 10, ("__kmpc_task return (%d)\n", global_tid));
1852 
1853  return ret;
1854 }
1855 
1856 /* enqueues a taskq_task for thunk previously created by __kmpc_taskq */
1857 /* this should never be called unless in a parallel context */
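/* In outline (summarizing the body below): the taskq thunk is popped from this */
/* thread's curr_thunk stack, its th_status field is set from the status argument, */
/* and the thunk is published in the queue's single tq_taskq_slot, where a thread */
/* scanning the queue can find it and generate more tasks from it. */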
1858 
1859 void
1860 __kmpc_taskq_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk, kmp_int32 status)
1861 {
1862  kmpc_task_queue_t *queue;
1863  kmp_taskq_t *tq = &__kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
1864  int tid = __kmp_tid_from_gtid( global_tid );
1865 
1866  KE_TRACE( 10, ("__kmpc_taskq_task called (%d)\n", global_tid));
1867  KF_TRACE(100, ("TaskQ Task argument thunk on (%d):\n", global_tid));
1868  KF_DUMP(100, __kmp_dump_thunk( tq, thunk, global_tid ));
1869 
1870  queue = thunk->th.th_shareds->sv_queue;
1871 
1872  if ( __kmp_env_consistency_check )
1873  __kmp_pop_workshare( global_tid, ct_taskq, loc );
1874 
1875  /* thunk->th_task is the taskq_task */
1876  KMP_DEBUG_ASSERT (thunk->th_flags & TQF_TASKQ_TASK);
1877 
1878  /* not supposed to call __kmpc_taskq_task if it's already enqueued */
1879  KMP_DEBUG_ASSERT (queue->tq_taskq_slot == NULL);
1880 
1881  /* pop the taskq thunk from the curr_thunk stack */
1882  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
1883  thunk->th_encl_thunk = NULL;
1884 
1885  KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
1886 
1887  thunk->th_status = status;
1888 
1889  KMP_MB(); /* flush thunk->th_status before taskq_task enqueued to avoid race condition */
1890 
1891  /* enqueue taskq_task in thunk into special slot in queue */
1892  /* GEH - probably don't need to lock taskq slot since only one */
1893  /* thread enqueues & already a lock set at dequeue point */
1894 
1895  queue->tq_taskq_slot = thunk;
1896 
1897  KE_TRACE( 10, ("__kmpc_taskq_task return (%d)\n", global_tid));
1898 }
1899 
1900 /* ends a taskq_task; done generating tasks */
1901 
1902 void
1903 __kmpc_end_taskq_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk)
1904 {
1905  kmp_taskq_t *tq;
1906  kmpc_task_queue_t *queue;
1907  int in_parallel;
1908  int tid;
1909 
1910  KE_TRACE( 10, ("__kmpc_end_taskq_task called (%d)\n", global_tid));
1911 
1912  tq = &__kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
1913  queue = thunk->th.th_shareds->sv_queue;
1914  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
1915  tid = __kmp_tid_from_gtid( global_tid );
1916 
1917  if ( __kmp_env_consistency_check )
1918  __kmp_pop_workshare( global_tid, ct_taskq, loc );
1919 
1920  if (in_parallel) {
1921 #if KMP_ARCH_X86 || \
1922  KMP_ARCH_X86_64
1923 
1924  KMP_TEST_THEN_OR32( &queue->tq_flags, (kmp_int32) TQF_ALL_TASKS_QUEUED );
1925 #else
1926  {
1927  __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
1928 
1929  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1930  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1931 
1932  queue->tq_flags |= TQF_ALL_TASKS_QUEUED;
1933 
1934  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
1935  }
1936 #endif
1937  }
1938 
1939  if (thunk->th_flags & TQF_IS_LASTPRIVATE) {
1940  /* Normally, __kmp_find_task_in_queue() refuses to schedule the last task in the */
1941  /* queue if TQF_IS_LASTPRIVATE so we can positively identify that last task */
1942  /* and run it with its TQF_IS_LAST_TASK bit turned on in th_flags. When */
1943  /* __kmpc_end_taskq_task() is called we are done generating all the tasks, so */
1944  /* we know the last one in the queue is the lastprivate task. Record that state */
1945  /* by setting TQF_IS_LAST_TASK in the queue's tq_flags; when that task actually */
1946  /* executes, set TQF_IS_LAST_TASK in its th_flags (this th_flags bit signals the */
1947  /* instrumented code to do copy-outs after execution). */
1948 
1949  if (! in_parallel) {
1950  /* No synchronization needed for serial context */
1951  queue->tq_flags |= TQF_IS_LAST_TASK;
1952  }
1953  else {
1954 #if KMP_ARCH_X86 || \
1955  KMP_ARCH_X86_64
1956 
1957  KMP_TEST_THEN_OR32( &queue->tq_flags, (kmp_int32) TQF_IS_LAST_TASK );
1958 #else
1959  {
1960  __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
1961 
1962  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1963  /* Seems to work without this call for digital/alpha, needed for IBM/RS6000 */
1964 
1965  queue->tq_flags |= TQF_IS_LAST_TASK;
1966 
1967  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
1968  }
1969 #endif
1970  /* to prevent race condition where last task is dequeued but */
1971  /* flag isn't visible yet (not sure about this) */
1972  KMP_MB();
1973  }
1974  }
1975 
1976  /* pop the taskq thunk from the curr_thunk stack */
1977  if (in_parallel) {
1978  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
1979  thunk->th_encl_thunk = NULL;
1980 
1981  KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
1982  }
1983 
1984  KE_TRACE( 10, ("__kmpc_end_taskq_task return (%d)\n", global_tid));
1985 }
1986 
1987 /* returns thunk for a regular task based on taskq_thunk */
1988 /* (__kmpc_taskq_task does the analogous thing for a TQF_TASKQ_TASK) */
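/* The returned thunk is taken from the queue's free thunk list, points at the */
/* queue's shared-variables block, and inherits the queue's TQF_INTERFACE_FLAGS; */
/* the caller initializes its private data and then passes it to __kmpc_task(). */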
1989 
1990 kmpc_thunk_t *
1991 __kmpc_task_buffer(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *taskq_thunk, kmpc_task_t task)
1992 {
1993  kmp_taskq_t *tq;
1994  kmpc_task_queue_t *queue;
1995  kmpc_thunk_t *new_thunk;
1996  int in_parallel;
1997 
1998  KE_TRACE( 10, ("__kmpc_task_buffer called (%d)\n", global_tid));
1999 
2000  KMP_DEBUG_ASSERT (taskq_thunk->th_flags & TQF_TASKQ_TASK); /* taskq_thunk->th_task is the taskq_task */
2001 
2002  tq = &__kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
2003  queue = taskq_thunk->th.th_shareds->sv_queue;
2004  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
2005 
2006  /* The instrumentation sequence is: __kmpc_task_buffer(), initialize private */
2007  /* variables, __kmpc_task(). The __kmpc_task_buffer routine checks that the */
2008  /* task queue is not full and allocates a thunk (which is then passed to */
2009  /* __kmpc_task()). So, we can pre-allocate a thunk here assuming it will be */
2010  /* the next to be enqueued in __kmpc_task(). */
2011 
2012  new_thunk = __kmp_alloc_thunk (queue, in_parallel, global_tid);
2013  new_thunk->th.th_shareds = (kmpc_shared_vars_t *) queue->tq_shareds[0].ai_data;
2014  new_thunk->th_encl_thunk = NULL;
2015  new_thunk->th_task = task;
2016 
2017  /* GEH - shouldn't need to lock the read of tq_flags here */
2018  new_thunk->th_flags = queue->tq_flags & TQF_INTERFACE_FLAGS;
2019 
2020  new_thunk->th_status = 0;
2021 
2022  KMP_DEBUG_ASSERT (!(new_thunk->th_flags & TQF_TASKQ_TASK));
2023 
2024  KF_TRACE(100, ("Creating Regular Task on (%d):\n", global_tid));
2025  KF_DUMP(100, __kmp_dump_thunk( tq, new_thunk, global_tid ));
2026 
2027  KE_TRACE( 10, ("__kmpc_task_buffer return (%d)\n", global_tid));
2028 
2029  return new_thunk;
2030 }
2031 
2032 /* --------------------------------------------------------------------------- */