Actual source code: gs.c

  1: #define PETSCKSP_DLL

  3: /***********************************gs.c***************************************

  5: Author: Henry M. Tufo III

  7: e-mail: hmt@cs.brown.edu

  9: snail-mail:
 10: Division of Applied Mathematics
 11: Brown University
 12: Providence, RI 02912

 14: Last Modification: 
 15: 6.21.97
 16: ************************************gs.c**************************************/

 18: /***********************************gs.c***************************************
 19: File Description:
 20: -----------------

 22: ************************************gs.c**************************************/

 24:  #include ../src/ksp/pc/impls/tfs/tfs.h

 26: /* default length of number of items via tree - doubles if exceeded */
 27: #define TREE_BUF_SZ 2048;
 28: #define GS_VEC_SZ   1



 32: /***********************************gs.c***************************************
 33: Type: struct gather_scatter_id 
 34: ------------------------------

 36: ************************************gs.c**************************************/
 37: typedef struct gather_scatter_id {
 38:   PetscInt id;
 39:   PetscInt nel_min;
 40:   PetscInt nel_max;
 41:   PetscInt nel_sum;
 42:   PetscInt negl;
 43:   PetscInt gl_max;
 44:   PetscInt gl_min;
 45:   PetscInt repeats;
 46:   PetscInt ordered;
 47:   PetscInt positive;
 48:   PetscScalar *vals;

 50:   /* bit mask info */
 51:   PetscInt *my_proc_mask;
 52:   PetscInt mask_sz;
 53:   PetscInt *ngh_buf;
 54:   PetscInt ngh_buf_sz;
 55:   PetscInt *nghs;
 56:   PetscInt num_nghs;
 57:   PetscInt max_nghs;
 58:   PetscInt *pw_nghs;
 59:   PetscInt num_pw_nghs;
 60:   PetscInt *tree_nghs;
 61:   PetscInt num_tree_nghs;

 63:   PetscInt num_loads;

 65:   /* repeats == true -> local info */
 66:   PetscInt nel;         /* number of unique elememts */
 67:   PetscInt *elms;       /* of size nel */
 68:   PetscInt nel_total;
 69:   PetscInt *local_elms; /* of size nel_total */
 70:   PetscInt *companion;  /* of size nel_total */

 72:   /* local info */
 73:   PetscInt num_local_total;
 74:   PetscInt local_strength;
 75:   PetscInt num_local;
 76:   PetscInt *num_local_reduce;
 77:   PetscInt **local_reduce;
 78:   PetscInt num_local_gop;
 79:   PetscInt *num_gop_local_reduce;
 80:   PetscInt **gop_local_reduce;

 82:   /* pairwise info */
 83:   PetscInt level;
 84:   PetscInt num_pairs;
 85:   PetscInt max_pairs;
 86:   PetscInt loc_node_pairs;
 87:   PetscInt max_node_pairs;
 88:   PetscInt min_node_pairs;
 89:   PetscInt avg_node_pairs;
 90:   PetscInt *pair_list;
 91:   PetscInt *msg_sizes;
 92:   PetscInt **node_list;
 93:   PetscInt len_pw_list;
 94:   PetscInt *pw_elm_list;
 95:   PetscScalar *pw_vals;

 97:   MPI_Request *msg_ids_in;
 98:   MPI_Request *msg_ids_out;

100:   PetscScalar *out;
101:   PetscScalar *in;
102:   PetscInt msg_total;

104:   /* tree - crystal accumulator info */
105:   PetscInt max_left_over;
106:   PetscInt *pre;
107:   PetscInt *in_num;
108:   PetscInt *out_num;
109:   PetscInt **in_list;
110:   PetscInt **out_list;

112:   /* new tree work*/
113:   PetscInt  tree_nel;
114:   PetscInt *tree_elms;
115:   PetscScalar *tree_buf;
116:   PetscScalar *tree_work;

118:   PetscInt  tree_map_sz;
119:   PetscInt *tree_map_in;
120:   PetscInt *tree_map_out;

122:   /* current memory status */
123:   PetscInt gl_bss_min;
124:   PetscInt gl_perm_min;

126:   /* max segment size for gs_gop_vec() */
127:   PetscInt vec_sz;

129:   /* hack to make paul happy */
130:   MPI_Comm gs_comm;

132: } gs_id;

134: static gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
135: static PetscErrorCode gsi_via_bit_mask(gs_id *gs);
136: static PetscErrorCode get_ngh_buf(gs_id *gs);
137: static PetscErrorCode set_pairwise(gs_id *gs);
138: static gs_id * gsi_new(void);
139: static PetscErrorCode set_tree(gs_id *gs);

141: /* same for all but vector flavor */
142: static PetscErrorCode gs_gop_local_out(gs_id *gs, PetscScalar *vals);
143: /* vector flavor */
144: static PetscErrorCode gs_gop_vec_local_out(gs_id *gs, PetscScalar *vals, PetscInt step);

146: static PetscErrorCode gs_gop_vec_plus(gs_id *gs, PetscScalar *in_vals, PetscInt step);
147: static PetscErrorCode gs_gop_vec_pairwise_plus(gs_id *gs, PetscScalar *in_vals, PetscInt step);
148: static PetscErrorCode gs_gop_vec_local_plus(gs_id *gs, PetscScalar *vals, PetscInt step);
149: static PetscErrorCode gs_gop_vec_local_in_plus(gs_id *gs, PetscScalar *vals, PetscInt step);
150: static PetscErrorCode gs_gop_vec_tree_plus(gs_id *gs, PetscScalar *vals, PetscInt step);


153: static PetscErrorCode gs_gop_local_plus(gs_id *gs, PetscScalar *vals);
154: static PetscErrorCode gs_gop_local_in_plus(gs_id *gs, PetscScalar *vals);

156: static PetscErrorCode gs_gop_plus_hc(gs_id *gs, PetscScalar *in_vals, PetscInt dim);
157: static PetscErrorCode gs_gop_pairwise_plus_hc(gs_id *gs, PetscScalar *in_vals, PetscInt dim);
158: static PetscErrorCode gs_gop_tree_plus_hc(gs_id *gs, PetscScalar *vals, PetscInt dim);

160: /* global vars */
161: /* from comm.c module */

163: static PetscInt num_gs_ids = 0;

165: /* should make this dynamic ... later */
166: static PetscInt msg_buf=MAX_MSG_BUF;
167: static PetscInt vec_sz=GS_VEC_SZ;
168: static PetscInt *tree_buf=NULL;
169: static PetscInt tree_buf_sz=0;
170: static PetscInt ntree=0;

172: /***************************************************************************/
173: PetscErrorCode gs_init_vec_sz(PetscInt size)
174: {
176:   vec_sz = size;
177:   return(0);
178: }

180: /******************************************************************************/
181: PetscErrorCode gs_init_msg_buf_sz(PetscInt buf_size)
182: {
184:   msg_buf = buf_size;
185:   return(0);
186: }

188: /******************************************************************************/
189: gs_id *gs_init( PetscInt *elms, PetscInt nel, PetscInt level)
190: {
191:    gs_id *gs;
192:   MPI_Group gs_group;
193:   MPI_Comm  gs_comm;

197:   /* ensure that communication package has been initialized */
198:   comm_init();


201:   /* determines if we have enough dynamic/semi-static memory */
202:   /* checks input, allocs and sets gd_id template            */
203:   gs = gsi_check_args(elms,nel,level);

205:   /* only bit mask version up and working for the moment    */
206:   /* LATER :: get int list version working for sparse pblms */
207:   gsi_via_bit_mask(gs);CHKERRABORT(PETSC_COMM_WORLD,ierr);


210:   MPI_Comm_group(MPI_COMM_WORLD,&gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr);
211:   MPI_Comm_create(MPI_COMM_WORLD,gs_group,&gs_comm);CHKERRABORT(PETSC_COMM_WORLD,ierr);
212:   gs->gs_comm=gs_comm;

214:   return(gs);
215: }

217: /******************************************************************************/
218: static gs_id *gsi_new(void)
219: {
221:   gs_id *gs;
222:   gs = (gs_id *) malloc(sizeof(gs_id));
223:   PetscMemzero(gs,sizeof(gs_id));CHKERRABORT(PETSC_COMM_WORLD,ierr);
224:   return(gs);
225: }

227: /******************************************************************************/
228: static gs_id * gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
229: {
230:    PetscInt i, j, k, t2;
231:   PetscInt *companion, *elms, *unique, *iptr;
232:   PetscInt num_local=0, *num_to_reduce, **local_reduce;
233:   PetscInt oprs[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_MIN,GL_B_AND};
234:   PetscInt vals[sizeof(oprs)/sizeof(oprs[0])-1];
235:   PetscInt work[sizeof(oprs)/sizeof(oprs[0])-1];
236:   gs_id *gs;


240:   if (!in_elms)
241:     {SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"elms point to nothing!!!\n");}

243:   if (nel<0)
244:     {SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"can't have fewer than 0 elms!!!\n");}

246:   if (nel==0)
247:     {PetscInfo(0,"I don't have any elements!!!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);}

249:   /* get space for gs template */
250:   gs = gsi_new();
251:   gs->id = ++num_gs_ids;

253:   /* hmt 6.4.99                                            */
254:   /* caller can set global ids that don't participate to 0 */
255:   /* gs_init ignores all zeros in elm list                 */
256:   /* negative global ids are still invalid                 */
257:   for (i=j=0;i<nel;i++)
258:     {if (in_elms[i]!=0) {j++;}}

260:   k=nel; nel=j;

262:   /* copy over in_elms list and create inverse map */
263:   elms = (PetscInt*) malloc((nel+1)*sizeof(PetscInt));
264:   companion = (PetscInt*) malloc(nel*sizeof(PetscInt));

266:   for (i=j=0;i<k;i++)
267:     {
268:       if (in_elms[i]!=0)
269:         {elms[j] = in_elms[i]; companion[j++] = i;}
270:     }

272:   if (j!=nel)
273:     {SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"nel j mismatch!\n");}

275:   /* pre-pass ... check to see if sorted */
276:   elms[nel] = INT_MAX;
277:   iptr = elms;
278:   unique = elms+1;
279:   j=0;
280:   while (*iptr!=INT_MAX)
281:     {
282:       if (*iptr++>*unique++)
283:         {j=1; break;}
284:     }

286:   /* set up inverse map */
287:   if (j)
288:     {
289:       PetscInfo(0,"gsi_check_args() :: elm list *not* sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);
290:       SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER);CHKERRABORT(PETSC_COMM_WORLD,ierr);
291:     }
292:   else
293:     {PetscInfo(0,"gsi_check_args() :: elm list sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);}
294:   elms[nel] = INT_MIN;

296:   /* first pass */
297:   /* determine number of unique elements, check pd */
298:   for (i=k=0;i<nel;i+=j)
299:     {
300:       t2 = elms[i];
301:       j=++i;
302: 
303:       /* clump 'em for now */
304:       while (elms[j]==t2) {j++;}
305: 
306:       /* how many together and num local */
307:       if (j-=i)
308:         {num_local++; k+=j;}
309:     }

311:   /* how many unique elements? */
312:   gs->repeats=k;
313:   gs->nel = nel-k;


316:   /* number of repeats? */
317:   gs->num_local = num_local;
318:   num_local+=2;
319:   gs->local_reduce=local_reduce=(PetscInt **)malloc(num_local*sizeof(PetscInt*));
320:   gs->num_local_reduce=num_to_reduce=(PetscInt*) malloc(num_local*sizeof(PetscInt));

322:   unique = (PetscInt*) malloc((gs->nel+1)*sizeof(PetscInt));
323:   gs->elms = unique;
324:   gs->nel_total = nel;
325:   gs->local_elms = elms;
326:   gs->companion = companion;

328:   /* compess map as well as keep track of local ops */
329:   for (num_local=i=j=0;i<gs->nel;i++)
330:     {
331:       k=j;
332:       t2 = unique[i] = elms[j];
333:       companion[i] = companion[j];
334: 
335:       while (elms[j]==t2) {j++;}

337:       if ((t2=(j-k))>1)
338:         {
339:           /* number together */
340:           num_to_reduce[num_local] = t2++;
341:           iptr = local_reduce[num_local++] = (PetscInt*)malloc(t2*sizeof(PetscInt));

343:           /* to use binary searching don't remap until we check intersection */
344:           *iptr++ = i;
345: 
346:           /* note that we're skipping the first one */
347:           while (++k<j)
348:             {*(iptr++) = companion[k];}
349:           *iptr = -1;
350:         }
351:     }

353:   /* sentinel for ngh_buf */
354:   unique[gs->nel]=INT_MAX;

356:   /* for two partition sort hack */
357:   num_to_reduce[num_local] = 0;
358:   local_reduce[num_local] = NULL;
359:   num_to_reduce[++num_local] = 0;
360:   local_reduce[num_local] = NULL;

362:   /* load 'em up */
363:   /* note one extra to hold NON_UNIFORM flag!!! */
364:   vals[2] = vals[1] = vals[0] = nel;
365:   if (gs->nel>0)
366:     {
367:        vals[3] = unique[0];
368:        vals[4] = unique[gs->nel-1];
369:     }
370:   else
371:     {
372:        vals[3] = INT_MAX;
373:        vals[4] = INT_MIN;
374:     }
375:   vals[5] = level;
376:   vals[6] = num_gs_ids;

378:   /* GLOBAL: send 'em out */
379:   giop(vals,work,sizeof(oprs)/sizeof(oprs[0])-1,oprs);CHKERRABORT(PETSC_COMM_WORLD,ierr);

381:   /* must be semi-pos def - only pairwise depends on this */
382:   /* LATER - remove this restriction */
383:   if (vals[3]<0)
384:     {SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system not semi-pos def \n");}

386:   if (vals[4]==INT_MAX)
387:     {SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system ub too large !\n");}

389:   gs->nel_min = vals[0];
390:   gs->nel_max = vals[1];
391:   gs->nel_sum = vals[2];
392:   gs->gl_min  = vals[3];
393:   gs->gl_max  = vals[4];
394:   gs->negl    = vals[4]-vals[3]+1;

396:   if (gs->negl<=0)
397:     {SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system empty or neg :: %d\n");}
398: 
399:   /* LATER :: add level == -1 -> program selects level */
400:   if (vals[5]<0)
401:     {vals[5]=0;}
402:   else if (vals[5]>num_nodes)
403:     {vals[5]=num_nodes;}
404:   gs->level = vals[5];

406:   return(gs);
407: }

409: /******************************************************************************/
410: static PetscErrorCode gsi_via_bit_mask(gs_id *gs)
411: {
412:    PetscInt i, nel, *elms;
413:   PetscInt t1;
414:   PetscInt **reduce;
415:   PetscInt *map;

419:   /* totally local removes ... ct_bits == 0 */
420:   get_ngh_buf(gs);

422:   if (gs->level)
423:     {set_pairwise(gs);}

425:   if (gs->max_left_over)
426:     {set_tree(gs);}

428:   /* intersection local and pairwise/tree? */
429:   gs->num_local_total = gs->num_local;
430:   gs->gop_local_reduce = gs->local_reduce;
431:   gs->num_gop_local_reduce = gs->num_local_reduce;

433:   map = gs->companion;

435:   /* is there any local compression */
436:   if (!gs->num_local) {
437:     gs->local_strength = NONE;
438:     gs->num_local_gop = 0;
439:   } else {
440:       /* ok find intersection */
441:       map = gs->companion;
442:       reduce = gs->local_reduce;
443:       for (i=0, t1=0; i<gs->num_local; i++, reduce++)
444:         {
445:           if ((ivec_binary_search(**reduce,gs->pw_elm_list,gs->len_pw_list)>=0)
446:               ||
447:               ivec_binary_search(**reduce,gs->tree_map_in,gs->tree_map_sz)>=0)
448:             {
449:               t1++;
450:               if (gs->num_local_reduce[i]<=0) SETERRQ(PETSC_ERR_PLIB,"nobody in list?");
451:               gs->num_local_reduce[i] *= -1;
452:             }
453:            **reduce=map[**reduce];
454:         }

456:       /* intersection is empty */
457:       if (!t1)
458:         {
459:           gs->local_strength = FULL;
460:           gs->num_local_gop = 0;
461:         }
462:       /* intersection not empty */
463:       else
464:         {
465:           gs->local_strength = PARTIAL;
466:           SMI_sort((void*)gs->num_local_reduce, (void*)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR);

468:           gs->num_local_gop = t1;
469:           gs->num_local_total =  gs->num_local;
470:           gs->num_local    -= t1;
471:           gs->gop_local_reduce = gs->local_reduce;
472:           gs->num_gop_local_reduce = gs->num_local_reduce;

474:           for (i=0; i<t1; i++)
475:             {
476:               if (gs->num_gop_local_reduce[i]>=0) SETERRQ(PETSC_ERR_PLIB,"they aren't negative?");
477:               gs->num_gop_local_reduce[i] *= -1;
478:               gs->local_reduce++;
479:               gs->num_local_reduce++;
480:             }
481:           gs->local_reduce++;
482:           gs->num_local_reduce++;
483:         }
484:     }

486:   elms = gs->pw_elm_list;
487:   nel  = gs->len_pw_list;
488:   for (i=0; i<nel; i++)
489:     {elms[i] = map[elms[i]];}

491:   elms = gs->tree_map_in;
492:   nel  = gs->tree_map_sz;
493:   for (i=0; i<nel; i++)
494:     {elms[i] = map[elms[i]];}

496:   /* clean up */
497:   free((void*) gs->local_elms);
498:   free((void*) gs->companion);
499:   free((void*) gs->elms);
500:   free((void*) gs->ngh_buf);
501:   gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
502:   return(0);
503: }

505: /******************************************************************************/
506: static PetscErrorCode place_in_tree( PetscInt elm)
507: {
508:    PetscInt *tp, n;

511:   if (ntree==tree_buf_sz)
512:     {
513:       if (tree_buf_sz)
514:         {
515:           tp = tree_buf;
516:           n = tree_buf_sz;
517:           tree_buf_sz<<=1;
518:           tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
519:           ivec_copy(tree_buf,tp,n);
520:           free(tp);
521:         }
522:       else
523:         {
524:           tree_buf_sz = TREE_BUF_SZ;
525:           tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
526:         }
527:     }

529:   tree_buf[ntree++] = elm;
530:   return(0);
531: }

533: /******************************************************************************/
534: static PetscErrorCode get_ngh_buf(gs_id *gs)
535: {
536:    PetscInt i, j, npw=0, ntree_map=0;
537:   PetscInt p_mask_size, ngh_buf_size, buf_size;
538:   PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
539:   PetscInt *ngh_buf, *buf1, *buf2;
540:   PetscInt offset, per_load, num_loads, or_ct, start, end;
541:   PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
542:   PetscInt oper=GL_B_OR;
543:   PetscInt *ptr3, *t_mask, level, ct1, ct2;

547:   /* to make life easier */
548:   nel   = gs->nel;
549:   elms  = gs->elms;
550:   level = gs->level;
551: 
552:   /* det #bytes needed for processor bit masks and init w/mask cor. to my_id */
553:   p_mask = (PetscInt*) malloc(p_mask_size=len_bit_mask(num_nodes));
554:   set_bit_mask(p_mask,p_mask_size,my_id);

556:   /* allocate space for masks and info bufs */
557:   gs->nghs = sh_proc_mask = (PetscInt*) malloc(p_mask_size);
558:   gs->pw_nghs = pw_sh_proc_mask = (PetscInt*) malloc(p_mask_size);
559:   gs->ngh_buf_sz = ngh_buf_size = p_mask_size*nel;
560:   t_mask = (PetscInt*) malloc(p_mask_size);
561:   gs->ngh_buf = ngh_buf = (PetscInt*) malloc(ngh_buf_size);

563:   /* comm buffer size ... memory usage bounded by ~2*msg_buf */
564:   /* had thought I could exploit rendezvous threshold */

566:   /* default is one pass */
567:   per_load = negl  = gs->negl;
568:   gs->num_loads = num_loads = 1;
569:   i=p_mask_size*negl;

571:   /* possible overflow on buffer size */
572:   /* overflow hack                    */
573:   if (i<0) {i=INT_MAX;}

575:   buf_size = PetscMin(msg_buf,i);

577:   /* can we do it? */
578:   if (p_mask_size>buf_size) SETERRQ2(PETSC_ERR_PLIB,"get_ngh_buf() :: buf<pms :: %d>%d\n",p_mask_size,buf_size);

580:   /* get giop buf space ... make *only* one malloc */
581:   buf1 = (PetscInt*) malloc(buf_size<<1);

583:   /* more than one gior exchange needed? */
584:   if (buf_size!=i)
585:     {
586:       per_load = buf_size/p_mask_size;
587:       buf_size = per_load*p_mask_size;
588:       gs->num_loads = num_loads = negl/per_load + (negl%per_load>0);
589:     }


592:   /* convert buf sizes from #bytes to #ints - 32 bit only! */
593:   p_mask_size/=sizeof(PetscInt); ngh_buf_size/=sizeof(PetscInt); buf_size/=sizeof(PetscInt);
594: 
595:   /* find giop work space */
596:   buf2 = buf1+buf_size;

598:   /* hold #ints needed for processor masks */
599:   gs->mask_sz=p_mask_size;

601:   /* init buffers */
602:   ivec_zero(sh_proc_mask,p_mask_size);
603:   ivec_zero(pw_sh_proc_mask,p_mask_size);
604:   ivec_zero(ngh_buf,ngh_buf_size);

606:   /* HACK reset tree info */
607:   tree_buf=NULL;
608:   tree_buf_sz=ntree=0;

610:   /* ok do it */
611:   for (ptr1=ngh_buf,ptr2=elms,end=gs->gl_min,or_ct=i=0; or_ct<num_loads; or_ct++)
612:     {
613:       /* identity for bitwise or is 000...000 */
614:       ivec_zero(buf1,buf_size);

616:       /* load msg buffer */
617:       for (start=end,end+=per_load,i_start=i; (offset=*ptr2)<end; i++, ptr2++)
618:         {
619:           offset = (offset-start)*p_mask_size;
620:           ivec_copy(buf1+offset,p_mask,p_mask_size);
621:         }

623:       /* GLOBAL: pass buffer */
624:       giop(buf1,buf2,buf_size,&oper);


627:       /* unload buffer into ngh_buf */
628:       ptr2=(elms+i_start);
629:       for(ptr3=buf1,j=start; j<end; ptr3+=p_mask_size,j++)
630:         {
631:           /* I own it ... may have to pairwise it */
632:           if (j==*ptr2)
633:             {
634:               /* do i share it w/anyone? */
635:               ct1 = ct_bits((char *)ptr3,p_mask_size*sizeof(PetscInt));
636:               /* guess not */
637:               if (ct1<2)
638:                 {ptr2++; ptr1+=p_mask_size; continue;}

640:               /* i do ... so keep info and turn off my bit */
641:               ivec_copy(ptr1,ptr3,p_mask_size);
642:               ivec_xor(ptr1,p_mask,p_mask_size);
643:               ivec_or(sh_proc_mask,ptr1,p_mask_size);
644: 
645:               /* is it to be done pairwise? */
646:               if (--ct1<=level)
647:                 {
648:                   npw++;
649: 
650:                   /* turn on high bit to indicate pw need to process */
651:                   *ptr2++ |= TOP_BIT;
652:                   ivec_or(pw_sh_proc_mask,ptr1,p_mask_size);
653:                   ptr1+=p_mask_size;
654:                   continue;
655:                 }

657:               /* get set for next and note that I have a tree contribution */
658:               /* could save exact elm index for tree here -> save a search */
659:               ptr2++; ptr1+=p_mask_size; ntree_map++;
660:             }
661:           /* i don't but still might be involved in tree */
662:           else
663:             {

665:               /* shared by how many? */
666:               ct1 = ct_bits((char *)ptr3,p_mask_size*sizeof(PetscInt));

668:               /* none! */
669:               if (ct1<2) continue;

671:               /* is it going to be done pairwise? but not by me of course!*/
672:               if (--ct1<=level) continue;
673:             }
674:           /* LATER we're going to have to process it NOW */
675:           /* nope ... tree it */
676:           place_in_tree(j);
677:         }
678:     }

680:   free((void*)t_mask);
681:   free((void*)buf1);

683:   gs->len_pw_list=npw;
684:   gs->num_nghs = ct_bits((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt));

686:   /* expand from bit mask list to int list and save ngh list */
687:   gs->nghs = (PetscInt*) malloc(gs->num_nghs * sizeof(PetscInt));
688:   bm_to_proc((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt),gs->nghs);

690:   gs->num_pw_nghs = ct_bits((char *)pw_sh_proc_mask,p_mask_size*sizeof(PetscInt));

692:   oper = GL_MAX;
693:   ct1 = gs->num_nghs;
694:   giop(&ct1,&ct2,1,&oper);
695:   gs->max_nghs = ct1;

697:   gs->tree_map_sz  = ntree_map;
698:   gs->max_left_over=ntree;

700:   free((void*)p_mask);
701:   free((void*)sh_proc_mask);
702:   return(0);
703: }

705: /******************************************************************************/
706: static PetscErrorCode set_pairwise(gs_id *gs)
707: {
708:    PetscInt i, j;
709:   PetscInt p_mask_size;
710:   PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
711:   PetscInt *ngh_buf, *buf2;
712:   PetscInt offset;
713:   PetscInt *msg_list, *msg_size, **msg_nodes, nprs;
714:   PetscInt *pairwise_elm_list, len_pair_list=0;
715:   PetscInt *iptr, t1, i_start, nel, *elms;
716:   PetscInt ct;

720:   /* to make life easier */
721:   nel  = gs->nel;
722:   elms = gs->elms;
723:   ngh_buf = gs->ngh_buf;
724:   sh_proc_mask  = gs->pw_nghs;

726:   /* need a few temp masks */
727:   p_mask_size   = len_bit_mask(num_nodes);
728:   p_mask        = (PetscInt*) malloc(p_mask_size);
729:   tmp_proc_mask = (PetscInt*) malloc(p_mask_size);

731:   /* set mask to my my_id's bit mask */
732:   set_bit_mask(p_mask,p_mask_size,my_id);

734:   p_mask_size /= sizeof(PetscInt);
735: 
736:   len_pair_list=gs->len_pw_list;
737:   gs->pw_elm_list=pairwise_elm_list=(PetscInt*)malloc((len_pair_list+1)*sizeof(PetscInt));

739:   /* how many processors (nghs) do we have to exchange with? */
740:   nprs=gs->num_pairs=ct_bits((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt));


743:   /* allocate space for gs_gop() info */
744:   gs->pair_list = msg_list = (PetscInt *)  malloc(sizeof(PetscInt)*nprs);
745:   gs->msg_sizes = msg_size  = (PetscInt *)  malloc(sizeof(PetscInt)*nprs);
746:   gs->node_list = msg_nodes = (PetscInt **) malloc(sizeof(PetscInt*)*(nprs+1));

748:   /* init msg_size list */
749:   ivec_zero(msg_size,nprs);

751:   /* expand from bit mask list to int list */
752:   bm_to_proc((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt),msg_list);
753: 
754:   /* keep list of elements being handled pairwise */
755:   for (i=j=0;i<nel;i++)
756:     {
757:       if (elms[i] & TOP_BIT)
758:         {elms[i] ^= TOP_BIT; pairwise_elm_list[j++] = i;}
759:     }
760:   pairwise_elm_list[j] = -1;

762:   gs->msg_ids_out = (MPI_Request *)  malloc(sizeof(MPI_Request)*(nprs+1));
763:   gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
764:   gs->msg_ids_in = (MPI_Request *)  malloc(sizeof(MPI_Request)*(nprs+1));
765:   gs->msg_ids_in[nprs] = MPI_REQUEST_NULL;
766:   gs->pw_vals = (PetscScalar *) malloc(sizeof(PetscScalar)*len_pair_list*vec_sz);

768:   /* find who goes to each processor */
769:   for (i_start=i=0;i<nprs;i++)
770:     {
771:       /* processor i's mask */
772:       set_bit_mask(p_mask,p_mask_size*sizeof(PetscInt),msg_list[i]);

774:       /* det # going to processor i */
775:       for (ct=j=0;j<len_pair_list;j++)
776:         {
777:           buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
778:           ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
779:           if (ct_bits((char *)tmp_proc_mask,p_mask_size*sizeof(PetscInt)))
780:             {ct++;}
781:         }
782:       msg_size[i] = ct;
783:       i_start = PetscMax(i_start,ct);

785:       /*space to hold nodes in message to first neighbor */
786:       msg_nodes[i] = iptr = (PetscInt*) malloc(sizeof(PetscInt)*(ct+1));

788:       for (j=0;j<len_pair_list;j++)
789:         {
790:           buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
791:           ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
792:           if (ct_bits((char *)tmp_proc_mask,p_mask_size*sizeof(PetscInt)))
793:             {*iptr++ = j;}
794:         }
795:       *iptr = -1;
796:     }
797:   msg_nodes[nprs] = NULL;

799:   j=gs->loc_node_pairs=i_start;
800:   t1 = GL_MAX;
801:   giop(&i_start,&offset,1,&t1);
802:   gs->max_node_pairs = i_start;

804:   i_start=j;
805:   t1 = GL_MIN;
806:   giop(&i_start,&offset,1,&t1);
807:   gs->min_node_pairs = i_start;

809:   i_start=j;
810:   t1 = GL_ADD;
811:   giop(&i_start,&offset,1,&t1);
812:   gs->avg_node_pairs = i_start/num_nodes + 1;

814:   i_start=nprs;
815:   t1 = GL_MAX;
816:   giop(&i_start,&offset,1,&t1);
817:   gs->max_pairs = i_start;


820:   /* remap pairwise in tail of gsi_via_bit_mask() */
821:   gs->msg_total = ivec_sum(gs->msg_sizes,nprs);
822:   gs->out = (PetscScalar *) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
823:   gs->in  = (PetscScalar *) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);

825:   /* reset malloc pool */
826:   free((void*)p_mask);
827:   free((void*)tmp_proc_mask);
828:   return(0);
829: }

831: /* to do pruned tree just save ngh buf copy for each one and decode here!
832: ******************************************************************************/
833: static PetscErrorCode set_tree(gs_id *gs)
834: {
835:   PetscInt i, j, n, nel;
836:   PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;

839:   /* local work ptrs */
840:   elms = gs->elms;
841:   nel     = gs->nel;

843:   /* how many via tree */
844:   gs->tree_nel  = n = ntree;
845:   gs->tree_elms = tree_elms = iptr_in = tree_buf;
846:   gs->tree_buf  = (PetscScalar *) malloc(sizeof(PetscScalar)*n*vec_sz);
847:   gs->tree_work = (PetscScalar *) malloc(sizeof(PetscScalar)*n*vec_sz);
848:   j=gs->tree_map_sz;
849:   gs->tree_map_in = iptr_in  = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
850:   gs->tree_map_out = iptr_out = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));

852:   /* search the longer of the two lists */
853:   /* note ... could save this info in get_ngh_buf and save searches */
854:   if (n<=nel)
855:     {
856:       /* bijective fct w/remap - search elm list */
857:       for (i=0; i<n; i++)
858:         {
859:           if ((j=ivec_binary_search(*tree_elms++,elms,nel))>=0)
860:             {*iptr_in++ = j; *iptr_out++ = i;}
861:         }
862:     }
863:   else
864:     {
865:       for (i=0; i<nel; i++)
866:         {
867:           if ((j=ivec_binary_search(*elms++,tree_elms,n))>=0)
868:             {*iptr_in++ = i; *iptr_out++ = j;}
869:         }
870:     }

872:   /* sentinel */
873:   *iptr_in = *iptr_out = -1;
874:   return(0);
875: }

877: /******************************************************************************/
878: static PetscErrorCode gs_gop_local_out( gs_id *gs,  PetscScalar *vals)
879: {
880:   PetscInt *num, *map, **reduce;
881:   PetscScalar tmp;

884:   num    = gs->num_gop_local_reduce;
885:   reduce = gs->gop_local_reduce;
886:   while ((map = *reduce++))
887:     {
888:       /* wall */
889:       if (*num == 2)
890:         {
891:           num ++;
892:           vals[map[1]] = vals[map[0]];
893:         }
894:       /* corner shared by three elements */
895:       else if (*num == 3)
896:         {
897:           num ++;
898:           vals[map[2]] = vals[map[1]] = vals[map[0]];
899:         }
900:       /* corner shared by four elements */
901:       else if (*num == 4)
902:         {
903:           num ++;
904:           vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
905:         }
906:       /* general case ... odd geoms ... 3D*/
907:       else
908:         {
909:           num++;
910:           tmp = *(vals + *map++);
911:           while (*map >= 0)
912:             {*(vals + *map++) = tmp;}
913:         }
914:     }
915:   return(0);
916: }

918: /******************************************************************************/
919: static PetscErrorCode gs_gop_local_plus( gs_id *gs,  PetscScalar *vals)
920: {
921:    PetscInt *num, *map, **reduce;
922:    PetscScalar tmp;

925:   num    = gs->num_local_reduce;
926:   reduce = gs->local_reduce;
927:   while ((map = *reduce))
928:     {
929:       /* wall */
930:       if (*num == 2)
931:         {
932:           num ++; reduce++;
933:           vals[map[1]] = vals[map[0]] += vals[map[1]];
934:         }
935:       /* corner shared by three elements */
936:       else if (*num == 3)
937:         {
938:           num ++; reduce++;
939:           vals[map[2]]=vals[map[1]]=vals[map[0]]+=(vals[map[1]]+vals[map[2]]);
940:         }
941:       /* corner shared by four elements */
942:       else if (*num == 4)
943:         {
944:           num ++; reduce++;
945:           vals[map[1]]=vals[map[2]]=vals[map[3]]=vals[map[0]] +=
946:                                  (vals[map[1]] + vals[map[2]] + vals[map[3]]);
947:         }
948:       /* general case ... odd geoms ... 3D*/
949:       else
950:         {
951:           num ++;
952:           tmp = 0.0;
953:           while (*map >= 0)
954:             {tmp += *(vals + *map++);}

956:           map = *reduce++;
957:           while (*map >= 0)
958:             {*(vals + *map++) = tmp;}
959:         }
960:     }
961:   return(0);
962: }

964: /******************************************************************************/
965: static PetscErrorCode gs_gop_local_in_plus( gs_id *gs,  PetscScalar *vals)
966: {
967:    PetscInt *num, *map, **reduce;
968:    PetscScalar *base;

971:   num    = gs->num_gop_local_reduce;
972:   reduce = gs->gop_local_reduce;
973:   while ((map = *reduce++))
974:     {
975:       /* wall */
976:       if (*num == 2)
977:         {
978:           num ++;
979:           vals[map[0]] += vals[map[1]];
980:         }
981:       /* corner shared by three elements */
982:       else if (*num == 3)
983:         {
984:           num ++;
985:           vals[map[0]] += (vals[map[1]] + vals[map[2]]);
986:         }
987:       /* corner shared by four elements */
988:       else if (*num == 4)
989:         {
990:           num ++;
991:           vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
992:         }
993:       /* general case ... odd geoms ... 3D*/
994:       else
995:         {
996:           num++;
997:           base = vals + *map++;
998:           while (*map >= 0)
999:             {*base += *(vals + *map++);}
1000:         }
1001:     }
1002:   return(0);
1003: }

1005: /******************************************************************************/
1006: PetscErrorCode gs_free( gs_id *gs)
1007: {
1008:    PetscInt i;

1011:   if (gs->nghs) {free((void*) gs->nghs);}
1012:   if (gs->pw_nghs) {free((void*) gs->pw_nghs);}

1014:   /* tree */
1015:   if (gs->max_left_over)
1016:     {
1017:       if (gs->tree_elms) {free((void*) gs->tree_elms);}
1018:       if (gs->tree_buf) {free((void*) gs->tree_buf);}
1019:       if (gs->tree_work) {free((void*) gs->tree_work);}
1020:       if (gs->tree_map_in) {free((void*) gs->tree_map_in);}
1021:       if (gs->tree_map_out) {free((void*) gs->tree_map_out);}
1022:     }

1024:   /* pairwise info */
1025:   if (gs->num_pairs)
1026:     {
1027:       /* should be NULL already */
1028:       if (gs->ngh_buf) {free((void*) gs->ngh_buf);}
1029:       if (gs->elms) {free((void*) gs->elms);}
1030:       if (gs->local_elms) {free((void*) gs->local_elms);}
1031:       if (gs->companion) {free((void*) gs->companion);}
1032: 
1033:       /* only set if pairwise */
1034:       if (gs->vals) {free((void*) gs->vals);}
1035:       if (gs->in) {free((void*) gs->in);}
1036:       if (gs->out) {free((void*) gs->out);}
1037:       if (gs->msg_ids_in) {free((void*) gs->msg_ids_in);}
1038:       if (gs->msg_ids_out) {free((void*) gs->msg_ids_out);}
1039:       if (gs->pw_vals) {free((void*) gs->pw_vals);}
1040:       if (gs->pw_elm_list) {free((void*) gs->pw_elm_list);}
1041:       if (gs->node_list)
1042:         {
1043:           for (i=0;i<gs->num_pairs;i++)
1044:             {if (gs->node_list[i]) {free((void*) gs->node_list[i]);}}
1045:           free((void*) gs->node_list);
1046:         }
1047:       if (gs->msg_sizes) {free((void*) gs->msg_sizes);}
1048:       if (gs->pair_list) {free((void*) gs->pair_list);}
1049:     }

1051:   /* local info */
1052:   if (gs->num_local_total>=0)
1053:     {
1054:       for (i=0;i<gs->num_local_total+1;i++)
1055:         /*      for (i=0;i<gs->num_local_total;i++) */
1056:         {
1057:           if (gs->num_gop_local_reduce[i])
1058:             {free((void*) gs->gop_local_reduce[i]);}
1059:         }
1060:     }

1062:   /* if intersection tree/pairwise and local isn't empty */
1063:   if (gs->gop_local_reduce) {free((void*) gs->gop_local_reduce);}
1064:   if (gs->num_gop_local_reduce) {free((void*) gs->num_gop_local_reduce);}

1066:   free((void*) gs);
1067:   return(0);
1068: }

1070: /******************************************************************************/
1071: PetscErrorCode gs_gop_vec( gs_id *gs,  PetscScalar *vals,  const char *op,  PetscInt step)
1072: {

1076:   switch (*op) {
1077:   case '+':
1078:     gs_gop_vec_plus(gs,vals,step);
1079:     break;
1080:   default:
1081:     PetscInfo1(0,"gs_gop_vec() :: %c is not a valid op",op[0]);
1082:     PetscInfo(0,"gs_gop_vec() :: default :: plus");
1083:     gs_gop_vec_plus(gs,vals,step);
1084:     break;
1085:   }
1086:   return(0);
1087: }

1089: /******************************************************************************/
1090: static PetscErrorCode gs_gop_vec_plus( gs_id *gs,  PetscScalar *vals,  PetscInt step)
1091: {
1093:   if (!gs) {SETERRQ(PETSC_ERR_PLIB,"gs_gop_vec() passed NULL gs handle!!!");}

1095:   /* local only operations!!! */
1096:   if (gs->num_local)
1097:     {gs_gop_vec_local_plus(gs,vals,step);}

1099:   /* if intersection tree/pairwise and local isn't empty */
1100:   if (gs->num_local_gop)
1101:     {
1102:       gs_gop_vec_local_in_plus(gs,vals,step);

1104:       /* pairwise */
1105:       if (gs->num_pairs)
1106:         {gs_gop_vec_pairwise_plus(gs,vals,step);}

1108:       /* tree */
1109:       else if (gs->max_left_over)
1110:         {gs_gop_vec_tree_plus(gs,vals,step);}

1112:       gs_gop_vec_local_out(gs,vals,step);
1113:     }
1114:   /* if intersection tree/pairwise and local is empty */
1115:   else
1116:     {
1117:       /* pairwise */
1118:       if (gs->num_pairs)
1119:         {gs_gop_vec_pairwise_plus(gs,vals,step);}

1121:       /* tree */
1122:       else if (gs->max_left_over)
1123:         {gs_gop_vec_tree_plus(gs,vals,step);}
1124:     }
1125:   return(0);
1126: }

1128: /******************************************************************************/
1129: static PetscErrorCode gs_gop_vec_local_plus( gs_id *gs,  PetscScalar *vals, PetscInt step)
1130: {
1131:    PetscInt *num, *map, **reduce;
1132:    PetscScalar *base;

1135:   num    = gs->num_local_reduce;
1136:   reduce = gs->local_reduce;
1137:   while ((map = *reduce))
1138:     {
1139:       base = vals + map[0] * step;

1141:       /* wall */
1142:       if (*num == 2)
1143:         {
1144:           num++; reduce++;
1145:           rvec_add (base,vals+map[1]*step,step);
1146:           rvec_copy(vals+map[1]*step,base,step);
1147:         }
1148:       /* corner shared by three elements */
1149:       else if (*num == 3)
1150:         {
1151:           num++; reduce++;
1152:           rvec_add (base,vals+map[1]*step,step);
1153:           rvec_add (base,vals+map[2]*step,step);
1154:           rvec_copy(vals+map[2]*step,base,step);
1155:           rvec_copy(vals+map[1]*step,base,step);
1156:         }
1157:       /* corner shared by four elements */
1158:       else if (*num == 4)
1159:         {
1160:           num++; reduce++;
1161:           rvec_add (base,vals+map[1]*step,step);
1162:           rvec_add (base,vals+map[2]*step,step);
1163:           rvec_add (base,vals+map[3]*step,step);
1164:           rvec_copy(vals+map[3]*step,base,step);
1165:           rvec_copy(vals+map[2]*step,base,step);
1166:           rvec_copy(vals+map[1]*step,base,step);
1167:         }
1168:       /* general case ... odd geoms ... 3D */
1169:       else
1170:         {
1171:           num++;
1172:           while (*++map >= 0)
1173:             {rvec_add (base,vals+*map*step,step);}
1174: 
1175:           map = *reduce;
1176:           while (*++map >= 0)
1177:             {rvec_copy(vals+*map*step,base,step);}
1178: 
1179:           reduce++;
1180:         }
1181:     }
1182:   return(0);
1183: }

1185: /******************************************************************************/
1186: static PetscErrorCode gs_gop_vec_local_in_plus( gs_id *gs,  PetscScalar *vals, PetscInt step)
1187: {
1188:    PetscInt  *num, *map, **reduce;
1189:    PetscScalar *base;
1191:   num    = gs->num_gop_local_reduce;
1192:   reduce = gs->gop_local_reduce;
1193:   while ((map = *reduce++))
1194:     {
1195:       base = vals + map[0] * step;

1197:       /* wall */
1198:       if (*num == 2)
1199:         {
1200:           num ++;
1201:           rvec_add(base,vals+map[1]*step,step);
1202:         }
1203:       /* corner shared by three elements */
1204:       else if (*num == 3)
1205:         {
1206:           num ++;
1207:           rvec_add(base,vals+map[1]*step,step);
1208:           rvec_add(base,vals+map[2]*step,step);
1209:         }
1210:       /* corner shared by four elements */
1211:       else if (*num == 4)
1212:         {
1213:           num ++;
1214:           rvec_add(base,vals+map[1]*step,step);
1215:           rvec_add(base,vals+map[2]*step,step);
1216:           rvec_add(base,vals+map[3]*step,step);
1217:         }
1218:       /* general case ... odd geoms ... 3D*/
1219:       else
1220:         {
1221:           num++;
1222:           while (*++map >= 0)
1223:             {rvec_add(base,vals+*map*step,step);}
1224:         }
1225:     }
1226:   return(0);
1227: }

1229: /******************************************************************************/
1230: static PetscErrorCode gs_gop_vec_local_out( gs_id *gs,  PetscScalar *vals, PetscInt step)
1231: {
1232:    PetscInt *num, *map, **reduce;
1233:    PetscScalar *base;

1236:   num    = gs->num_gop_local_reduce;
1237:   reduce = gs->gop_local_reduce;
1238:   while ((map = *reduce++))
1239:     {
1240:       base = vals + map[0] * step;

1242:       /* wall */
1243:       if (*num == 2)
1244:         {
1245:           num ++;
1246:           rvec_copy(vals+map[1]*step,base,step);
1247:         }
1248:       /* corner shared by three elements */
1249:       else if (*num == 3)
1250:         {
1251:           num ++;
1252:           rvec_copy(vals+map[1]*step,base,step);
1253:           rvec_copy(vals+map[2]*step,base,step);
1254:         }
1255:       /* corner shared by four elements */
1256:       else if (*num == 4)
1257:         {
1258:           num ++;
1259:           rvec_copy(vals+map[1]*step,base,step);
1260:           rvec_copy(vals+map[2]*step,base,step);
1261:           rvec_copy(vals+map[3]*step,base,step);
1262:         }
1263:       /* general case ... odd geoms ... 3D*/
1264:       else
1265:         {
1266:           num++;
1267:           while (*++map >= 0)
1268:             {rvec_copy(vals+*map*step,base,step);}
1269:         }
1270:     }
1271:   return(0);
1272: }

1274: /******************************************************************************/
1275: static PetscErrorCode gs_gop_vec_pairwise_plus( gs_id *gs,  PetscScalar *in_vals, PetscInt step)
1276: {
1277:   PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1278:   PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1279:   PetscInt *pw, *list, *size, **nodes;
1280:   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1281:   MPI_Status status;
1282:   PetscBLASInt i1 = 1,dstep;

1286:   /* strip and load s */
1287:   msg_list =list         = gs->pair_list;
1288:   msg_size =size         = gs->msg_sizes;
1289:   msg_nodes=nodes        = gs->node_list;
1290:   iptr=pw                = gs->pw_elm_list;
1291:   dptr1=dptr3            = gs->pw_vals;
1292:   msg_ids_in  = ids_in   = gs->msg_ids_in;
1293:   msg_ids_out = ids_out  = gs->msg_ids_out;
1294:   dptr2                  = gs->out;
1295:   in1=in2                = gs->in;

1297:   /* post the receives */
1298:   /*  msg_nodes=nodes; */
1299:   do
1300:     {
1301:       /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1302:          second one *list and do list++ afterwards */
1303:       MPI_Irecv(in1, *size *step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->gs_comm, msg_ids_in);
1304:       list++;msg_ids_in++;
1305:       in1 += *size++ *step;
1306:     }
1307:   while (*++msg_nodes);
1308:   msg_nodes=nodes;

1310:   /* load gs values into in out gs buffers */
1311:   while (*iptr >= 0)
1312:     {
1313:       rvec_copy(dptr3,in_vals + *iptr*step,step);
1314:       dptr3+=step;
1315:       iptr++;
1316:     }

1318:   /* load out buffers and post the sends */
1319:   while ((iptr = *msg_nodes++))
1320:     {
1321:       dptr3 = dptr2;
1322:       while (*iptr >= 0)
1323:         {
1324:           rvec_copy(dptr2,dptr1 + *iptr*step,step);
1325:           dptr2+=step;
1326:           iptr++;
1327:         }
1328:       MPI_Isend(dptr3, *msg_size *step, MPIU_SCALAR, *msg_list, MSGTAG1+my_id, gs->gs_comm, msg_ids_out);
1329:       msg_size++; msg_list++;msg_ids_out++;
1330:     }

1332:   /* tree */
1333:   if (gs->max_left_over)
1334:     {gs_gop_vec_tree_plus(gs,in_vals,step);}

1336:   /* process the received data */
1337:   msg_nodes=nodes;
1338:   while ((iptr = *nodes++)){
1339:     PetscScalar d1 = 1.0;
1340:       /* Should I check the return value of MPI_Wait() or status? */
1341:       /* Can this loop be replaced by a call to MPI_Waitall()? */
1342:       MPI_Wait(ids_in, &status);
1343:       ids_in++;
1344:       while (*iptr >= 0) {
1345:         dstep = PetscBLASIntCast(step);
1346:         BLASaxpy_(&dstep,&d1,in2,&i1,dptr1 + *iptr*step,&i1);
1347:         in2+=step;
1348:         iptr++;
1349:       }
1350:   }

1352:   /* replace vals */
1353:   while (*pw >= 0)
1354:     {
1355:       rvec_copy(in_vals + *pw*step,dptr1,step);
1356:       dptr1+=step;
1357:       pw++;
1358:     }

1360:   /* clear isend message handles */
1361:   /* This changed for clarity though it could be the same */
1362:   while (*msg_nodes++)
1363:     /* Should I check the return value of MPI_Wait() or status? */
1364:     /* Can this loop be replaced by a call to MPI_Waitall()? */
1365:     {MPI_Wait(ids_out, &status);ids_out++;}
1366: 

1368:   return(0);
1369: }

1371: /******************************************************************************/
1372: static PetscErrorCode gs_gop_vec_tree_plus( gs_id *gs,  PetscScalar *vals,  PetscInt step)
1373: {
1374:   PetscInt size, *in, *out;
1375:   PetscScalar *buf, *work;
1376:   PetscInt op[] = {GL_ADD,0};
1377:   PetscBLASInt i1 = 1;

1380:   /* copy over to local variables */
1381:   in   = gs->tree_map_in;
1382:   out  = gs->tree_map_out;
1383:   buf  = gs->tree_buf;
1384:   work = gs->tree_work;
1385:   size = gs->tree_nel*step;

1387:   /* zero out collection buffer */
1388:   rvec_zero(buf,size);


1391:   /* copy over my contributions */
1392:   while (*in >= 0)
1393:     {
1394:       PetscBLASInt dstep = PetscBLASIntCast(step);
1395:       BLAScopy_(&dstep,vals + *in++*step,&i1,buf + *out++*step,&i1);
1396:     }

1398:   /* perform fan in/out on full buffer */
1399:   /* must change grop to handle the blas */
1400:   grop(buf,work,size,op);

1402:   /* reset */
1403:   in   = gs->tree_map_in;
1404:   out  = gs->tree_map_out;

1406:   /* get the portion of the results I need */
1407:   while (*in >= 0)
1408:     {
1409:       PetscBLASInt dstep = PetscBLASIntCast(step);
1410:       BLAScopy_(&dstep,buf + *out++*step,&i1,vals + *in++*step,&i1);
1411:     }
1412:   return(0);
1413: }

1415: /******************************************************************************/
1416: PetscErrorCode gs_gop_hc( gs_id *gs,  PetscScalar *vals,  const char *op,  PetscInt dim)
1417: {

1421:   switch (*op) {
1422:   case '+':
1423:     gs_gop_plus_hc(gs,vals,dim);
1424:     break;
1425:   default:
1426:     PetscInfo1(0,"gs_gop_hc() :: %c is not a valid op",op[0]);
1427:     PetscInfo(0,"gs_gop_hc() :: default :: plus\n");
1428:     gs_gop_plus_hc(gs,vals,dim);
1429:     break;
1430:   }
1431:   return(0);
1432: }

1434: /******************************************************************************/
1435: static PetscErrorCode gs_gop_plus_hc( gs_id *gs,  PetscScalar *vals, PetscInt dim)
1436: {
1438:   /* if there's nothing to do return */
1439:   if (dim<=0)
1440:     {  return(0);}

1442:   /* can't do more dimensions then exist */
1443:   dim = PetscMin(dim,i_log2_num_nodes);

1445:   /* local only operations!!! */
1446:   if (gs->num_local)
1447:     {gs_gop_local_plus(gs,vals);}

1449:   /* if intersection tree/pairwise and local isn't empty */
1450:   if (gs->num_local_gop)
1451:     {
1452:       gs_gop_local_in_plus(gs,vals);

1454:       /* pairwise will do tree inside ... */
1455:       if (gs->num_pairs)
1456:         {gs_gop_pairwise_plus_hc(gs,vals,dim);}

1458:       /* tree only */
1459:       else if (gs->max_left_over)
1460:         {gs_gop_tree_plus_hc(gs,vals,dim);}
1461: 
1462:       gs_gop_local_out(gs,vals);
1463:     }
1464:   /* if intersection tree/pairwise and local is empty */
1465:   else
1466:     {
1467:       /* pairwise will do tree inside */
1468:       if (gs->num_pairs)
1469:         {gs_gop_pairwise_plus_hc(gs,vals,dim);}
1470: 
1471:       /* tree */
1472:       else if (gs->max_left_over)
1473:         {gs_gop_tree_plus_hc(gs,vals,dim);}
1474:     }
1475:   return(0);
1476: }

1478: /******************************************************************************/
1479: static PetscErrorCode gs_gop_pairwise_plus_hc( gs_id *gs,  PetscScalar *in_vals, PetscInt dim)
1480: {
1481:    PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1482:    PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1483:    PetscInt *pw, *list, *size, **nodes;
1484:   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1485:   MPI_Status status;
1486:   PetscInt i, mask=1;

1490:   for (i=1; i<dim; i++)
1491:     {mask<<=1; mask++;}


1494:   /* strip and load s */
1495:   msg_list =list         = gs->pair_list;
1496:   msg_size =size         = gs->msg_sizes;
1497:   msg_nodes=nodes        = gs->node_list;
1498:   iptr=pw                = gs->pw_elm_list;
1499:   dptr1=dptr3            = gs->pw_vals;
1500:   msg_ids_in  = ids_in   = gs->msg_ids_in;
1501:   msg_ids_out = ids_out  = gs->msg_ids_out;
1502:   dptr2                  = gs->out;
1503:   in1=in2                = gs->in;

1505:   /* post the receives */
1506:   /*  msg_nodes=nodes; */
1507:   do
1508:     {
1509:       /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1510:          second one *list and do list++ afterwards */
1511:       if ((my_id|mask)==(*list|mask))
1512:         {
1513:           MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->gs_comm, msg_ids_in);
1514:           list++; msg_ids_in++;in1 += *size++;
1515:         }
1516:       else
1517:         {list++; size++;}
1518:     }
1519:   while (*++msg_nodes);

1521:   /* load gs values into in out gs buffers */
1522:   while (*iptr >= 0)
1523:     {*dptr3++ = *(in_vals + *iptr++);}

1525:   /* load out buffers and post the sends */
1526:   msg_nodes=nodes;
1527:   list = msg_list;
1528:   while ((iptr = *msg_nodes++))
1529:     {
1530:       if ((my_id|mask)==(*list|mask))
1531:         {
1532:           dptr3 = dptr2;
1533:           while (*iptr >= 0)
1534:             {*dptr2++ = *(dptr1 + *iptr++);}
1535:           /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1536:           /* is msg_ids_out++ correct? */
1537:           MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1+my_id, gs->gs_comm, msg_ids_out);
1538:           msg_size++;list++;msg_ids_out++;
1539:         }
1540:       else
1541:         {list++; msg_size++;}
1542:     }

1544:   /* do the tree while we're waiting */
1545:   if (gs->max_left_over)
1546:     {gs_gop_tree_plus_hc(gs,in_vals,dim);}

1548:   /* process the received data */
1549:   msg_nodes=nodes;
1550:   list = msg_list;
1551:   while ((iptr = *nodes++))
1552:     {
1553:       if ((my_id|mask)==(*list|mask))
1554:         {
1555:           /* Should I check the return value of MPI_Wait() or status? */
1556:           /* Can this loop be replaced by a call to MPI_Waitall()? */
1557:           MPI_Wait(ids_in, &status);
1558:           ids_in++;
1559:           while (*iptr >= 0)
1560:             {*(dptr1 + *iptr++) += *in2++;}
1561:         }
1562:       list++;
1563:     }

1565:   /* replace vals */
1566:   while (*pw >= 0)
1567:     {*(in_vals + *pw++) = *dptr1++;}

1569:   /* clear isend message handles */
1570:   /* This changed for clarity though it could be the same */
1571:   while (*msg_nodes++)
1572:     {
1573:       if ((my_id|mask)==(*msg_list|mask))
1574:         {
1575:           /* Should I check the return value of MPI_Wait() or status? */
1576:           /* Can this loop be replaced by a call to MPI_Waitall()? */
1577:           MPI_Wait(ids_out, &status);
1578:           ids_out++;
1579:         }
1580:       msg_list++;
1581:     }

1583:   return(0);
1584: }

1586: /******************************************************************************/
1587: static PetscErrorCode gs_gop_tree_plus_hc(gs_id *gs, PetscScalar *vals, PetscInt dim)
1588: {
1589:   PetscInt size;
1590:   PetscInt *in, *out;
1591:   PetscScalar *buf, *work;
1592:   PetscInt op[] = {GL_ADD,0};

1595:   in   = gs->tree_map_in;
1596:   out  = gs->tree_map_out;
1597:   buf  = gs->tree_buf;
1598:   work = gs->tree_work;
1599:   size = gs->tree_nel;

1601:   rvec_zero(buf,size);

1603:   while (*in >= 0)
1604:     {*(buf + *out++) = *(vals + *in++);}

1606:   in   = gs->tree_map_in;
1607:   out  = gs->tree_map_out;

1609:   grop_hc(buf,work,size,op,dim);

1611:   while (*in >= 0)
1612:     {*(vals + *in++) = *(buf + *out++);}
1613:   return(0);
1614: }