Actual source code: gs.c
1: #define PETSCKSP_DLL
3: /***********************************gs.c***************************************
5: Author: Henry M. Tufo III
7: e-mail: hmt@cs.brown.edu
9: snail-mail:
10: Division of Applied Mathematics
11: Brown University
12: Providence, RI 02912
14: Last Modification:
15: 6.21.97
16: ************************************gs.c**************************************/
18: /***********************************gs.c***************************************
19: File Description:
20: -----------------
22: ************************************gs.c**************************************/
24: #include ../src/ksp/pc/impls/tfs/tfs.h
26: /* default length of number of items via tree - doubles if exceeded */
27: #define TREE_BUF_SZ 2048;
28: #define GS_VEC_SZ 1
32: /***********************************gs.c***************************************
33: Type: struct gather_scatter_id
34: ------------------------------
36: ************************************gs.c**************************************/
37: typedef struct gather_scatter_id {
38: PetscInt id;
39: PetscInt nel_min;
40: PetscInt nel_max;
41: PetscInt nel_sum;
42: PetscInt negl;
43: PetscInt gl_max;
44: PetscInt gl_min;
45: PetscInt repeats;
46: PetscInt ordered;
47: PetscInt positive;
48: PetscScalar *vals;
50: /* bit mask info */
51: PetscInt *my_proc_mask;
52: PetscInt mask_sz;
53: PetscInt *ngh_buf;
54: PetscInt ngh_buf_sz;
55: PetscInt *nghs;
56: PetscInt num_nghs;
57: PetscInt max_nghs;
58: PetscInt *pw_nghs;
59: PetscInt num_pw_nghs;
60: PetscInt *tree_nghs;
61: PetscInt num_tree_nghs;
63: PetscInt num_loads;
65: /* repeats == true -> local info */
66: PetscInt nel; /* number of unique elememts */
67: PetscInt *elms; /* of size nel */
68: PetscInt nel_total;
69: PetscInt *local_elms; /* of size nel_total */
70: PetscInt *companion; /* of size nel_total */
72: /* local info */
73: PetscInt num_local_total;
74: PetscInt local_strength;
75: PetscInt num_local;
76: PetscInt *num_local_reduce;
77: PetscInt **local_reduce;
78: PetscInt num_local_gop;
79: PetscInt *num_gop_local_reduce;
80: PetscInt **gop_local_reduce;
82: /* pairwise info */
83: PetscInt level;
84: PetscInt num_pairs;
85: PetscInt max_pairs;
86: PetscInt loc_node_pairs;
87: PetscInt max_node_pairs;
88: PetscInt min_node_pairs;
89: PetscInt avg_node_pairs;
90: PetscInt *pair_list;
91: PetscInt *msg_sizes;
92: PetscInt **node_list;
93: PetscInt len_pw_list;
94: PetscInt *pw_elm_list;
95: PetscScalar *pw_vals;
97: MPI_Request *msg_ids_in;
98: MPI_Request *msg_ids_out;
100: PetscScalar *out;
101: PetscScalar *in;
102: PetscInt msg_total;
104: /* tree - crystal accumulator info */
105: PetscInt max_left_over;
106: PetscInt *pre;
107: PetscInt *in_num;
108: PetscInt *out_num;
109: PetscInt **in_list;
110: PetscInt **out_list;
112: /* new tree work*/
113: PetscInt tree_nel;
114: PetscInt *tree_elms;
115: PetscScalar *tree_buf;
116: PetscScalar *tree_work;
118: PetscInt tree_map_sz;
119: PetscInt *tree_map_in;
120: PetscInt *tree_map_out;
122: /* current memory status */
123: PetscInt gl_bss_min;
124: PetscInt gl_perm_min;
126: /* max segment size for gs_gop_vec() */
127: PetscInt vec_sz;
129: /* hack to make paul happy */
130: MPI_Comm gs_comm;
132: } gs_id;
134: static gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
135: static PetscErrorCode gsi_via_bit_mask(gs_id *gs);
136: static PetscErrorCode get_ngh_buf(gs_id *gs);
137: static PetscErrorCode set_pairwise(gs_id *gs);
138: static gs_id * gsi_new(void);
139: static PetscErrorCode set_tree(gs_id *gs);
141: /* same for all but vector flavor */
142: static PetscErrorCode gs_gop_local_out(gs_id *gs, PetscScalar *vals);
143: /* vector flavor */
144: static PetscErrorCode gs_gop_vec_local_out(gs_id *gs, PetscScalar *vals, PetscInt step);
146: static PetscErrorCode gs_gop_vec_plus(gs_id *gs, PetscScalar *in_vals, PetscInt step);
147: static PetscErrorCode gs_gop_vec_pairwise_plus(gs_id *gs, PetscScalar *in_vals, PetscInt step);
148: static PetscErrorCode gs_gop_vec_local_plus(gs_id *gs, PetscScalar *vals, PetscInt step);
149: static PetscErrorCode gs_gop_vec_local_in_plus(gs_id *gs, PetscScalar *vals, PetscInt step);
150: static PetscErrorCode gs_gop_vec_tree_plus(gs_id *gs, PetscScalar *vals, PetscInt step);
153: static PetscErrorCode gs_gop_local_plus(gs_id *gs, PetscScalar *vals);
154: static PetscErrorCode gs_gop_local_in_plus(gs_id *gs, PetscScalar *vals);
156: static PetscErrorCode gs_gop_plus_hc(gs_id *gs, PetscScalar *in_vals, PetscInt dim);
157: static PetscErrorCode gs_gop_pairwise_plus_hc(gs_id *gs, PetscScalar *in_vals, PetscInt dim);
158: static PetscErrorCode gs_gop_tree_plus_hc(gs_id *gs, PetscScalar *vals, PetscInt dim);
160: /* global vars */
161: /* from comm.c module */
163: static PetscInt num_gs_ids = 0;
165: /* should make this dynamic ... later */
166: static PetscInt msg_buf=MAX_MSG_BUF;
167: static PetscInt vec_sz=GS_VEC_SZ;
168: static PetscInt *tree_buf=NULL;
169: static PetscInt tree_buf_sz=0;
170: static PetscInt ntree=0;
172: /***************************************************************************/
173: PetscErrorCode gs_init_vec_sz(PetscInt size)
174: {
176: vec_sz = size;
177: return(0);
178: }
180: /******************************************************************************/
181: PetscErrorCode gs_init_msg_buf_sz(PetscInt buf_size)
182: {
184: msg_buf = buf_size;
185: return(0);
186: }
188: /******************************************************************************/
189: gs_id *gs_init( PetscInt *elms, PetscInt nel, PetscInt level)
190: {
191: gs_id *gs;
192: MPI_Group gs_group;
193: MPI_Comm gs_comm;
197: /* ensure that communication package has been initialized */
198: comm_init();
201: /* determines if we have enough dynamic/semi-static memory */
202: /* checks input, allocs and sets gd_id template */
203: gs = gsi_check_args(elms,nel,level);
205: /* only bit mask version up and working for the moment */
206: /* LATER :: get int list version working for sparse pblms */
207: gsi_via_bit_mask(gs);CHKERRABORT(PETSC_COMM_WORLD,ierr);
210: MPI_Comm_group(MPI_COMM_WORLD,&gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr);
211: MPI_Comm_create(MPI_COMM_WORLD,gs_group,&gs_comm);CHKERRABORT(PETSC_COMM_WORLD,ierr);
212: gs->gs_comm=gs_comm;
214: return(gs);
215: }
217: /******************************************************************************/
218: static gs_id *gsi_new(void)
219: {
221: gs_id *gs;
222: gs = (gs_id *) malloc(sizeof(gs_id));
223: PetscMemzero(gs,sizeof(gs_id));CHKERRABORT(PETSC_COMM_WORLD,ierr);
224: return(gs);
225: }
227: /******************************************************************************/
228: static gs_id * gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
229: {
230: PetscInt i, j, k, t2;
231: PetscInt *companion, *elms, *unique, *iptr;
232: PetscInt num_local=0, *num_to_reduce, **local_reduce;
233: PetscInt oprs[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_MIN,GL_B_AND};
234: PetscInt vals[sizeof(oprs)/sizeof(oprs[0])-1];
235: PetscInt work[sizeof(oprs)/sizeof(oprs[0])-1];
236: gs_id *gs;
240: if (!in_elms)
241: {SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"elms point to nothing!!!\n");}
243: if (nel<0)
244: {SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"can't have fewer than 0 elms!!!\n");}
246: if (nel==0)
247: {PetscInfo(0,"I don't have any elements!!!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);}
249: /* get space for gs template */
250: gs = gsi_new();
251: gs->id = ++num_gs_ids;
253: /* hmt 6.4.99 */
254: /* caller can set global ids that don't participate to 0 */
255: /* gs_init ignores all zeros in elm list */
256: /* negative global ids are still invalid */
257: for (i=j=0;i<nel;i++)
258: {if (in_elms[i]!=0) {j++;}}
260: k=nel; nel=j;
262: /* copy over in_elms list and create inverse map */
263: elms = (PetscInt*) malloc((nel+1)*sizeof(PetscInt));
264: companion = (PetscInt*) malloc(nel*sizeof(PetscInt));
266: for (i=j=0;i<k;i++)
267: {
268: if (in_elms[i]!=0)
269: {elms[j] = in_elms[i]; companion[j++] = i;}
270: }
272: if (j!=nel)
273: {SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"nel j mismatch!\n");}
275: /* pre-pass ... check to see if sorted */
276: elms[nel] = INT_MAX;
277: iptr = elms;
278: unique = elms+1;
279: j=0;
280: while (*iptr!=INT_MAX)
281: {
282: if (*iptr++>*unique++)
283: {j=1; break;}
284: }
286: /* set up inverse map */
287: if (j)
288: {
289: PetscInfo(0,"gsi_check_args() :: elm list *not* sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);
290: SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER);CHKERRABORT(PETSC_COMM_WORLD,ierr);
291: }
292: else
293: {PetscInfo(0,"gsi_check_args() :: elm list sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);}
294: elms[nel] = INT_MIN;
296: /* first pass */
297: /* determine number of unique elements, check pd */
298: for (i=k=0;i<nel;i+=j)
299: {
300: t2 = elms[i];
301: j=++i;
302:
303: /* clump 'em for now */
304: while (elms[j]==t2) {j++;}
305:
306: /* how many together and num local */
307: if (j-=i)
308: {num_local++; k+=j;}
309: }
311: /* how many unique elements? */
312: gs->repeats=k;
313: gs->nel = nel-k;
316: /* number of repeats? */
317: gs->num_local = num_local;
318: num_local+=2;
319: gs->local_reduce=local_reduce=(PetscInt **)malloc(num_local*sizeof(PetscInt*));
320: gs->num_local_reduce=num_to_reduce=(PetscInt*) malloc(num_local*sizeof(PetscInt));
322: unique = (PetscInt*) malloc((gs->nel+1)*sizeof(PetscInt));
323: gs->elms = unique;
324: gs->nel_total = nel;
325: gs->local_elms = elms;
326: gs->companion = companion;
328: /* compess map as well as keep track of local ops */
329: for (num_local=i=j=0;i<gs->nel;i++)
330: {
331: k=j;
332: t2 = unique[i] = elms[j];
333: companion[i] = companion[j];
334:
335: while (elms[j]==t2) {j++;}
337: if ((t2=(j-k))>1)
338: {
339: /* number together */
340: num_to_reduce[num_local] = t2++;
341: iptr = local_reduce[num_local++] = (PetscInt*)malloc(t2*sizeof(PetscInt));
343: /* to use binary searching don't remap until we check intersection */
344: *iptr++ = i;
345:
346: /* note that we're skipping the first one */
347: while (++k<j)
348: {*(iptr++) = companion[k];}
349: *iptr = -1;
350: }
351: }
353: /* sentinel for ngh_buf */
354: unique[gs->nel]=INT_MAX;
356: /* for two partition sort hack */
357: num_to_reduce[num_local] = 0;
358: local_reduce[num_local] = NULL;
359: num_to_reduce[++num_local] = 0;
360: local_reduce[num_local] = NULL;
362: /* load 'em up */
363: /* note one extra to hold NON_UNIFORM flag!!! */
364: vals[2] = vals[1] = vals[0] = nel;
365: if (gs->nel>0)
366: {
367: vals[3] = unique[0];
368: vals[4] = unique[gs->nel-1];
369: }
370: else
371: {
372: vals[3] = INT_MAX;
373: vals[4] = INT_MIN;
374: }
375: vals[5] = level;
376: vals[6] = num_gs_ids;
378: /* GLOBAL: send 'em out */
379: giop(vals,work,sizeof(oprs)/sizeof(oprs[0])-1,oprs);CHKERRABORT(PETSC_COMM_WORLD,ierr);
381: /* must be semi-pos def - only pairwise depends on this */
382: /* LATER - remove this restriction */
383: if (vals[3]<0)
384: {SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system not semi-pos def \n");}
386: if (vals[4]==INT_MAX)
387: {SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system ub too large !\n");}
389: gs->nel_min = vals[0];
390: gs->nel_max = vals[1];
391: gs->nel_sum = vals[2];
392: gs->gl_min = vals[3];
393: gs->gl_max = vals[4];
394: gs->negl = vals[4]-vals[3]+1;
396: if (gs->negl<=0)
397: {SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system empty or neg :: %d\n");}
398:
399: /* LATER :: add level == -1 -> program selects level */
400: if (vals[5]<0)
401: {vals[5]=0;}
402: else if (vals[5]>num_nodes)
403: {vals[5]=num_nodes;}
404: gs->level = vals[5];
406: return(gs);
407: }
409: /******************************************************************************/
410: static PetscErrorCode gsi_via_bit_mask(gs_id *gs)
411: {
412: PetscInt i, nel, *elms;
413: PetscInt t1;
414: PetscInt **reduce;
415: PetscInt *map;
419: /* totally local removes ... ct_bits == 0 */
420: get_ngh_buf(gs);
422: if (gs->level)
423: {set_pairwise(gs);}
425: if (gs->max_left_over)
426: {set_tree(gs);}
428: /* intersection local and pairwise/tree? */
429: gs->num_local_total = gs->num_local;
430: gs->gop_local_reduce = gs->local_reduce;
431: gs->num_gop_local_reduce = gs->num_local_reduce;
433: map = gs->companion;
435: /* is there any local compression */
436: if (!gs->num_local) {
437: gs->local_strength = NONE;
438: gs->num_local_gop = 0;
439: } else {
440: /* ok find intersection */
441: map = gs->companion;
442: reduce = gs->local_reduce;
443: for (i=0, t1=0; i<gs->num_local; i++, reduce++)
444: {
445: if ((ivec_binary_search(**reduce,gs->pw_elm_list,gs->len_pw_list)>=0)
446: ||
447: ivec_binary_search(**reduce,gs->tree_map_in,gs->tree_map_sz)>=0)
448: {
449: t1++;
450: if (gs->num_local_reduce[i]<=0) SETERRQ(PETSC_ERR_PLIB,"nobody in list?");
451: gs->num_local_reduce[i] *= -1;
452: }
453: **reduce=map[**reduce];
454: }
456: /* intersection is empty */
457: if (!t1)
458: {
459: gs->local_strength = FULL;
460: gs->num_local_gop = 0;
461: }
462: /* intersection not empty */
463: else
464: {
465: gs->local_strength = PARTIAL;
466: SMI_sort((void*)gs->num_local_reduce, (void*)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR);
468: gs->num_local_gop = t1;
469: gs->num_local_total = gs->num_local;
470: gs->num_local -= t1;
471: gs->gop_local_reduce = gs->local_reduce;
472: gs->num_gop_local_reduce = gs->num_local_reduce;
474: for (i=0; i<t1; i++)
475: {
476: if (gs->num_gop_local_reduce[i]>=0) SETERRQ(PETSC_ERR_PLIB,"they aren't negative?");
477: gs->num_gop_local_reduce[i] *= -1;
478: gs->local_reduce++;
479: gs->num_local_reduce++;
480: }
481: gs->local_reduce++;
482: gs->num_local_reduce++;
483: }
484: }
486: elms = gs->pw_elm_list;
487: nel = gs->len_pw_list;
488: for (i=0; i<nel; i++)
489: {elms[i] = map[elms[i]];}
491: elms = gs->tree_map_in;
492: nel = gs->tree_map_sz;
493: for (i=0; i<nel; i++)
494: {elms[i] = map[elms[i]];}
496: /* clean up */
497: free((void*) gs->local_elms);
498: free((void*) gs->companion);
499: free((void*) gs->elms);
500: free((void*) gs->ngh_buf);
501: gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
502: return(0);
503: }
505: /******************************************************************************/
506: static PetscErrorCode place_in_tree( PetscInt elm)
507: {
508: PetscInt *tp, n;
511: if (ntree==tree_buf_sz)
512: {
513: if (tree_buf_sz)
514: {
515: tp = tree_buf;
516: n = tree_buf_sz;
517: tree_buf_sz<<=1;
518: tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
519: ivec_copy(tree_buf,tp,n);
520: free(tp);
521: }
522: else
523: {
524: tree_buf_sz = TREE_BUF_SZ;
525: tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
526: }
527: }
529: tree_buf[ntree++] = elm;
530: return(0);
531: }
533: /******************************************************************************/
534: static PetscErrorCode get_ngh_buf(gs_id *gs)
535: {
536: PetscInt i, j, npw=0, ntree_map=0;
537: PetscInt p_mask_size, ngh_buf_size, buf_size;
538: PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
539: PetscInt *ngh_buf, *buf1, *buf2;
540: PetscInt offset, per_load, num_loads, or_ct, start, end;
541: PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
542: PetscInt oper=GL_B_OR;
543: PetscInt *ptr3, *t_mask, level, ct1, ct2;
547: /* to make life easier */
548: nel = gs->nel;
549: elms = gs->elms;
550: level = gs->level;
551:
552: /* det #bytes needed for processor bit masks and init w/mask cor. to my_id */
553: p_mask = (PetscInt*) malloc(p_mask_size=len_bit_mask(num_nodes));
554: set_bit_mask(p_mask,p_mask_size,my_id);
556: /* allocate space for masks and info bufs */
557: gs->nghs = sh_proc_mask = (PetscInt*) malloc(p_mask_size);
558: gs->pw_nghs = pw_sh_proc_mask = (PetscInt*) malloc(p_mask_size);
559: gs->ngh_buf_sz = ngh_buf_size = p_mask_size*nel;
560: t_mask = (PetscInt*) malloc(p_mask_size);
561: gs->ngh_buf = ngh_buf = (PetscInt*) malloc(ngh_buf_size);
563: /* comm buffer size ... memory usage bounded by ~2*msg_buf */
564: /* had thought I could exploit rendezvous threshold */
566: /* default is one pass */
567: per_load = negl = gs->negl;
568: gs->num_loads = num_loads = 1;
569: i=p_mask_size*negl;
571: /* possible overflow on buffer size */
572: /* overflow hack */
573: if (i<0) {i=INT_MAX;}
575: buf_size = PetscMin(msg_buf,i);
577: /* can we do it? */
578: if (p_mask_size>buf_size) SETERRQ2(PETSC_ERR_PLIB,"get_ngh_buf() :: buf<pms :: %d>%d\n",p_mask_size,buf_size);
580: /* get giop buf space ... make *only* one malloc */
581: buf1 = (PetscInt*) malloc(buf_size<<1);
583: /* more than one gior exchange needed? */
584: if (buf_size!=i)
585: {
586: per_load = buf_size/p_mask_size;
587: buf_size = per_load*p_mask_size;
588: gs->num_loads = num_loads = negl/per_load + (negl%per_load>0);
589: }
592: /* convert buf sizes from #bytes to #ints - 32 bit only! */
593: p_mask_size/=sizeof(PetscInt); ngh_buf_size/=sizeof(PetscInt); buf_size/=sizeof(PetscInt);
594:
595: /* find giop work space */
596: buf2 = buf1+buf_size;
598: /* hold #ints needed for processor masks */
599: gs->mask_sz=p_mask_size;
601: /* init buffers */
602: ivec_zero(sh_proc_mask,p_mask_size);
603: ivec_zero(pw_sh_proc_mask,p_mask_size);
604: ivec_zero(ngh_buf,ngh_buf_size);
606: /* HACK reset tree info */
607: tree_buf=NULL;
608: tree_buf_sz=ntree=0;
610: /* ok do it */
611: for (ptr1=ngh_buf,ptr2=elms,end=gs->gl_min,or_ct=i=0; or_ct<num_loads; or_ct++)
612: {
613: /* identity for bitwise or is 000...000 */
614: ivec_zero(buf1,buf_size);
616: /* load msg buffer */
617: for (start=end,end+=per_load,i_start=i; (offset=*ptr2)<end; i++, ptr2++)
618: {
619: offset = (offset-start)*p_mask_size;
620: ivec_copy(buf1+offset,p_mask,p_mask_size);
621: }
623: /* GLOBAL: pass buffer */
624: giop(buf1,buf2,buf_size,&oper);
627: /* unload buffer into ngh_buf */
628: ptr2=(elms+i_start);
629: for(ptr3=buf1,j=start; j<end; ptr3+=p_mask_size,j++)
630: {
631: /* I own it ... may have to pairwise it */
632: if (j==*ptr2)
633: {
634: /* do i share it w/anyone? */
635: ct1 = ct_bits((char *)ptr3,p_mask_size*sizeof(PetscInt));
636: /* guess not */
637: if (ct1<2)
638: {ptr2++; ptr1+=p_mask_size; continue;}
640: /* i do ... so keep info and turn off my bit */
641: ivec_copy(ptr1,ptr3,p_mask_size);
642: ivec_xor(ptr1,p_mask,p_mask_size);
643: ivec_or(sh_proc_mask,ptr1,p_mask_size);
644:
645: /* is it to be done pairwise? */
646: if (--ct1<=level)
647: {
648: npw++;
649:
650: /* turn on high bit to indicate pw need to process */
651: *ptr2++ |= TOP_BIT;
652: ivec_or(pw_sh_proc_mask,ptr1,p_mask_size);
653: ptr1+=p_mask_size;
654: continue;
655: }
657: /* get set for next and note that I have a tree contribution */
658: /* could save exact elm index for tree here -> save a search */
659: ptr2++; ptr1+=p_mask_size; ntree_map++;
660: }
661: /* i don't but still might be involved in tree */
662: else
663: {
665: /* shared by how many? */
666: ct1 = ct_bits((char *)ptr3,p_mask_size*sizeof(PetscInt));
668: /* none! */
669: if (ct1<2) continue;
671: /* is it going to be done pairwise? but not by me of course!*/
672: if (--ct1<=level) continue;
673: }
674: /* LATER we're going to have to process it NOW */
675: /* nope ... tree it */
676: place_in_tree(j);
677: }
678: }
680: free((void*)t_mask);
681: free((void*)buf1);
683: gs->len_pw_list=npw;
684: gs->num_nghs = ct_bits((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt));
686: /* expand from bit mask list to int list and save ngh list */
687: gs->nghs = (PetscInt*) malloc(gs->num_nghs * sizeof(PetscInt));
688: bm_to_proc((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt),gs->nghs);
690: gs->num_pw_nghs = ct_bits((char *)pw_sh_proc_mask,p_mask_size*sizeof(PetscInt));
692: oper = GL_MAX;
693: ct1 = gs->num_nghs;
694: giop(&ct1,&ct2,1,&oper);
695: gs->max_nghs = ct1;
697: gs->tree_map_sz = ntree_map;
698: gs->max_left_over=ntree;
700: free((void*)p_mask);
701: free((void*)sh_proc_mask);
702: return(0);
703: }
705: /******************************************************************************/
706: static PetscErrorCode set_pairwise(gs_id *gs)
707: {
708: PetscInt i, j;
709: PetscInt p_mask_size;
710: PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
711: PetscInt *ngh_buf, *buf2;
712: PetscInt offset;
713: PetscInt *msg_list, *msg_size, **msg_nodes, nprs;
714: PetscInt *pairwise_elm_list, len_pair_list=0;
715: PetscInt *iptr, t1, i_start, nel, *elms;
716: PetscInt ct;
720: /* to make life easier */
721: nel = gs->nel;
722: elms = gs->elms;
723: ngh_buf = gs->ngh_buf;
724: sh_proc_mask = gs->pw_nghs;
726: /* need a few temp masks */
727: p_mask_size = len_bit_mask(num_nodes);
728: p_mask = (PetscInt*) malloc(p_mask_size);
729: tmp_proc_mask = (PetscInt*) malloc(p_mask_size);
731: /* set mask to my my_id's bit mask */
732: set_bit_mask(p_mask,p_mask_size,my_id);
734: p_mask_size /= sizeof(PetscInt);
735:
736: len_pair_list=gs->len_pw_list;
737: gs->pw_elm_list=pairwise_elm_list=(PetscInt*)malloc((len_pair_list+1)*sizeof(PetscInt));
739: /* how many processors (nghs) do we have to exchange with? */
740: nprs=gs->num_pairs=ct_bits((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt));
743: /* allocate space for gs_gop() info */
744: gs->pair_list = msg_list = (PetscInt *) malloc(sizeof(PetscInt)*nprs);
745: gs->msg_sizes = msg_size = (PetscInt *) malloc(sizeof(PetscInt)*nprs);
746: gs->node_list = msg_nodes = (PetscInt **) malloc(sizeof(PetscInt*)*(nprs+1));
748: /* init msg_size list */
749: ivec_zero(msg_size,nprs);
751: /* expand from bit mask list to int list */
752: bm_to_proc((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt),msg_list);
753:
754: /* keep list of elements being handled pairwise */
755: for (i=j=0;i<nel;i++)
756: {
757: if (elms[i] & TOP_BIT)
758: {elms[i] ^= TOP_BIT; pairwise_elm_list[j++] = i;}
759: }
760: pairwise_elm_list[j] = -1;
762: gs->msg_ids_out = (MPI_Request *) malloc(sizeof(MPI_Request)*(nprs+1));
763: gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
764: gs->msg_ids_in = (MPI_Request *) malloc(sizeof(MPI_Request)*(nprs+1));
765: gs->msg_ids_in[nprs] = MPI_REQUEST_NULL;
766: gs->pw_vals = (PetscScalar *) malloc(sizeof(PetscScalar)*len_pair_list*vec_sz);
768: /* find who goes to each processor */
769: for (i_start=i=0;i<nprs;i++)
770: {
771: /* processor i's mask */
772: set_bit_mask(p_mask,p_mask_size*sizeof(PetscInt),msg_list[i]);
774: /* det # going to processor i */
775: for (ct=j=0;j<len_pair_list;j++)
776: {
777: buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
778: ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
779: if (ct_bits((char *)tmp_proc_mask,p_mask_size*sizeof(PetscInt)))
780: {ct++;}
781: }
782: msg_size[i] = ct;
783: i_start = PetscMax(i_start,ct);
785: /*space to hold nodes in message to first neighbor */
786: msg_nodes[i] = iptr = (PetscInt*) malloc(sizeof(PetscInt)*(ct+1));
788: for (j=0;j<len_pair_list;j++)
789: {
790: buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
791: ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
792: if (ct_bits((char *)tmp_proc_mask,p_mask_size*sizeof(PetscInt)))
793: {*iptr++ = j;}
794: }
795: *iptr = -1;
796: }
797: msg_nodes[nprs] = NULL;
799: j=gs->loc_node_pairs=i_start;
800: t1 = GL_MAX;
801: giop(&i_start,&offset,1,&t1);
802: gs->max_node_pairs = i_start;
804: i_start=j;
805: t1 = GL_MIN;
806: giop(&i_start,&offset,1,&t1);
807: gs->min_node_pairs = i_start;
809: i_start=j;
810: t1 = GL_ADD;
811: giop(&i_start,&offset,1,&t1);
812: gs->avg_node_pairs = i_start/num_nodes + 1;
814: i_start=nprs;
815: t1 = GL_MAX;
816: giop(&i_start,&offset,1,&t1);
817: gs->max_pairs = i_start;
820: /* remap pairwise in tail of gsi_via_bit_mask() */
821: gs->msg_total = ivec_sum(gs->msg_sizes,nprs);
822: gs->out = (PetscScalar *) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
823: gs->in = (PetscScalar *) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
825: /* reset malloc pool */
826: free((void*)p_mask);
827: free((void*)tmp_proc_mask);
828: return(0);
829: }
831: /* to do pruned tree just save ngh buf copy for each one and decode here!
832: ******************************************************************************/
833: static PetscErrorCode set_tree(gs_id *gs)
834: {
835: PetscInt i, j, n, nel;
836: PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;
839: /* local work ptrs */
840: elms = gs->elms;
841: nel = gs->nel;
843: /* how many via tree */
844: gs->tree_nel = n = ntree;
845: gs->tree_elms = tree_elms = iptr_in = tree_buf;
846: gs->tree_buf = (PetscScalar *) malloc(sizeof(PetscScalar)*n*vec_sz);
847: gs->tree_work = (PetscScalar *) malloc(sizeof(PetscScalar)*n*vec_sz);
848: j=gs->tree_map_sz;
849: gs->tree_map_in = iptr_in = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
850: gs->tree_map_out = iptr_out = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
852: /* search the longer of the two lists */
853: /* note ... could save this info in get_ngh_buf and save searches */
854: if (n<=nel)
855: {
856: /* bijective fct w/remap - search elm list */
857: for (i=0; i<n; i++)
858: {
859: if ((j=ivec_binary_search(*tree_elms++,elms,nel))>=0)
860: {*iptr_in++ = j; *iptr_out++ = i;}
861: }
862: }
863: else
864: {
865: for (i=0; i<nel; i++)
866: {
867: if ((j=ivec_binary_search(*elms++,tree_elms,n))>=0)
868: {*iptr_in++ = i; *iptr_out++ = j;}
869: }
870: }
872: /* sentinel */
873: *iptr_in = *iptr_out = -1;
874: return(0);
875: }
877: /******************************************************************************/
878: static PetscErrorCode gs_gop_local_out( gs_id *gs, PetscScalar *vals)
879: {
880: PetscInt *num, *map, **reduce;
881: PetscScalar tmp;
884: num = gs->num_gop_local_reduce;
885: reduce = gs->gop_local_reduce;
886: while ((map = *reduce++))
887: {
888: /* wall */
889: if (*num == 2)
890: {
891: num ++;
892: vals[map[1]] = vals[map[0]];
893: }
894: /* corner shared by three elements */
895: else if (*num == 3)
896: {
897: num ++;
898: vals[map[2]] = vals[map[1]] = vals[map[0]];
899: }
900: /* corner shared by four elements */
901: else if (*num == 4)
902: {
903: num ++;
904: vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
905: }
906: /* general case ... odd geoms ... 3D*/
907: else
908: {
909: num++;
910: tmp = *(vals + *map++);
911: while (*map >= 0)
912: {*(vals + *map++) = tmp;}
913: }
914: }
915: return(0);
916: }
918: /******************************************************************************/
919: static PetscErrorCode gs_gop_local_plus( gs_id *gs, PetscScalar *vals)
920: {
921: PetscInt *num, *map, **reduce;
922: PetscScalar tmp;
925: num = gs->num_local_reduce;
926: reduce = gs->local_reduce;
927: while ((map = *reduce))
928: {
929: /* wall */
930: if (*num == 2)
931: {
932: num ++; reduce++;
933: vals[map[1]] = vals[map[0]] += vals[map[1]];
934: }
935: /* corner shared by three elements */
936: else if (*num == 3)
937: {
938: num ++; reduce++;
939: vals[map[2]]=vals[map[1]]=vals[map[0]]+=(vals[map[1]]+vals[map[2]]);
940: }
941: /* corner shared by four elements */
942: else if (*num == 4)
943: {
944: num ++; reduce++;
945: vals[map[1]]=vals[map[2]]=vals[map[3]]=vals[map[0]] +=
946: (vals[map[1]] + vals[map[2]] + vals[map[3]]);
947: }
948: /* general case ... odd geoms ... 3D*/
949: else
950: {
951: num ++;
952: tmp = 0.0;
953: while (*map >= 0)
954: {tmp += *(vals + *map++);}
956: map = *reduce++;
957: while (*map >= 0)
958: {*(vals + *map++) = tmp;}
959: }
960: }
961: return(0);
962: }
964: /******************************************************************************/
965: static PetscErrorCode gs_gop_local_in_plus( gs_id *gs, PetscScalar *vals)
966: {
967: PetscInt *num, *map, **reduce;
968: PetscScalar *base;
971: num = gs->num_gop_local_reduce;
972: reduce = gs->gop_local_reduce;
973: while ((map = *reduce++))
974: {
975: /* wall */
976: if (*num == 2)
977: {
978: num ++;
979: vals[map[0]] += vals[map[1]];
980: }
981: /* corner shared by three elements */
982: else if (*num == 3)
983: {
984: num ++;
985: vals[map[0]] += (vals[map[1]] + vals[map[2]]);
986: }
987: /* corner shared by four elements */
988: else if (*num == 4)
989: {
990: num ++;
991: vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
992: }
993: /* general case ... odd geoms ... 3D*/
994: else
995: {
996: num++;
997: base = vals + *map++;
998: while (*map >= 0)
999: {*base += *(vals + *map++);}
1000: }
1001: }
1002: return(0);
1003: }
1005: /******************************************************************************/
1006: PetscErrorCode gs_free( gs_id *gs)
1007: {
1008: PetscInt i;
1011: if (gs->nghs) {free((void*) gs->nghs);}
1012: if (gs->pw_nghs) {free((void*) gs->pw_nghs);}
1014: /* tree */
1015: if (gs->max_left_over)
1016: {
1017: if (gs->tree_elms) {free((void*) gs->tree_elms);}
1018: if (gs->tree_buf) {free((void*) gs->tree_buf);}
1019: if (gs->tree_work) {free((void*) gs->tree_work);}
1020: if (gs->tree_map_in) {free((void*) gs->tree_map_in);}
1021: if (gs->tree_map_out) {free((void*) gs->tree_map_out);}
1022: }
1024: /* pairwise info */
1025: if (gs->num_pairs)
1026: {
1027: /* should be NULL already */
1028: if (gs->ngh_buf) {free((void*) gs->ngh_buf);}
1029: if (gs->elms) {free((void*) gs->elms);}
1030: if (gs->local_elms) {free((void*) gs->local_elms);}
1031: if (gs->companion) {free((void*) gs->companion);}
1032:
1033: /* only set if pairwise */
1034: if (gs->vals) {free((void*) gs->vals);}
1035: if (gs->in) {free((void*) gs->in);}
1036: if (gs->out) {free((void*) gs->out);}
1037: if (gs->msg_ids_in) {free((void*) gs->msg_ids_in);}
1038: if (gs->msg_ids_out) {free((void*) gs->msg_ids_out);}
1039: if (gs->pw_vals) {free((void*) gs->pw_vals);}
1040: if (gs->pw_elm_list) {free((void*) gs->pw_elm_list);}
1041: if (gs->node_list)
1042: {
1043: for (i=0;i<gs->num_pairs;i++)
1044: {if (gs->node_list[i]) {free((void*) gs->node_list[i]);}}
1045: free((void*) gs->node_list);
1046: }
1047: if (gs->msg_sizes) {free((void*) gs->msg_sizes);}
1048: if (gs->pair_list) {free((void*) gs->pair_list);}
1049: }
1051: /* local info */
1052: if (gs->num_local_total>=0)
1053: {
1054: for (i=0;i<gs->num_local_total+1;i++)
1055: /* for (i=0;i<gs->num_local_total;i++) */
1056: {
1057: if (gs->num_gop_local_reduce[i])
1058: {free((void*) gs->gop_local_reduce[i]);}
1059: }
1060: }
1062: /* if intersection tree/pairwise and local isn't empty */
1063: if (gs->gop_local_reduce) {free((void*) gs->gop_local_reduce);}
1064: if (gs->num_gop_local_reduce) {free((void*) gs->num_gop_local_reduce);}
1066: free((void*) gs);
1067: return(0);
1068: }
1070: /******************************************************************************/
1071: PetscErrorCode gs_gop_vec( gs_id *gs, PetscScalar *vals, const char *op, PetscInt step)
1072: {
1076: switch (*op) {
1077: case '+':
1078: gs_gop_vec_plus(gs,vals,step);
1079: break;
1080: default:
1081: PetscInfo1(0,"gs_gop_vec() :: %c is not a valid op",op[0]);
1082: PetscInfo(0,"gs_gop_vec() :: default :: plus");
1083: gs_gop_vec_plus(gs,vals,step);
1084: break;
1085: }
1086: return(0);
1087: }
1089: /******************************************************************************/
1090: static PetscErrorCode gs_gop_vec_plus( gs_id *gs, PetscScalar *vals, PetscInt step)
1091: {
1093: if (!gs) {SETERRQ(PETSC_ERR_PLIB,"gs_gop_vec() passed NULL gs handle!!!");}
1095: /* local only operations!!! */
1096: if (gs->num_local)
1097: {gs_gop_vec_local_plus(gs,vals,step);}
1099: /* if intersection tree/pairwise and local isn't empty */
1100: if (gs->num_local_gop)
1101: {
1102: gs_gop_vec_local_in_plus(gs,vals,step);
1104: /* pairwise */
1105: if (gs->num_pairs)
1106: {gs_gop_vec_pairwise_plus(gs,vals,step);}
1108: /* tree */
1109: else if (gs->max_left_over)
1110: {gs_gop_vec_tree_plus(gs,vals,step);}
1112: gs_gop_vec_local_out(gs,vals,step);
1113: }
1114: /* if intersection tree/pairwise and local is empty */
1115: else
1116: {
1117: /* pairwise */
1118: if (gs->num_pairs)
1119: {gs_gop_vec_pairwise_plus(gs,vals,step);}
1121: /* tree */
1122: else if (gs->max_left_over)
1123: {gs_gop_vec_tree_plus(gs,vals,step);}
1124: }
1125: return(0);
1126: }
1128: /******************************************************************************/
1129: static PetscErrorCode gs_gop_vec_local_plus( gs_id *gs, PetscScalar *vals, PetscInt step)
1130: {
1131: PetscInt *num, *map, **reduce;
1132: PetscScalar *base;
1135: num = gs->num_local_reduce;
1136: reduce = gs->local_reduce;
1137: while ((map = *reduce))
1138: {
1139: base = vals + map[0] * step;
1141: /* wall */
1142: if (*num == 2)
1143: {
1144: num++; reduce++;
1145: rvec_add (base,vals+map[1]*step,step);
1146: rvec_copy(vals+map[1]*step,base,step);
1147: }
1148: /* corner shared by three elements */
1149: else if (*num == 3)
1150: {
1151: num++; reduce++;
1152: rvec_add (base,vals+map[1]*step,step);
1153: rvec_add (base,vals+map[2]*step,step);
1154: rvec_copy(vals+map[2]*step,base,step);
1155: rvec_copy(vals+map[1]*step,base,step);
1156: }
1157: /* corner shared by four elements */
1158: else if (*num == 4)
1159: {
1160: num++; reduce++;
1161: rvec_add (base,vals+map[1]*step,step);
1162: rvec_add (base,vals+map[2]*step,step);
1163: rvec_add (base,vals+map[3]*step,step);
1164: rvec_copy(vals+map[3]*step,base,step);
1165: rvec_copy(vals+map[2]*step,base,step);
1166: rvec_copy(vals+map[1]*step,base,step);
1167: }
1168: /* general case ... odd geoms ... 3D */
1169: else
1170: {
1171: num++;
1172: while (*++map >= 0)
1173: {rvec_add (base,vals+*map*step,step);}
1174:
1175: map = *reduce;
1176: while (*++map >= 0)
1177: {rvec_copy(vals+*map*step,base,step);}
1178:
1179: reduce++;
1180: }
1181: }
1182: return(0);
1183: }
1185: /******************************************************************************/
1186: static PetscErrorCode gs_gop_vec_local_in_plus( gs_id *gs, PetscScalar *vals, PetscInt step)
1187: {
1188: PetscInt *num, *map, **reduce;
1189: PetscScalar *base;
1191: num = gs->num_gop_local_reduce;
1192: reduce = gs->gop_local_reduce;
1193: while ((map = *reduce++))
1194: {
1195: base = vals + map[0] * step;
1197: /* wall */
1198: if (*num == 2)
1199: {
1200: num ++;
1201: rvec_add(base,vals+map[1]*step,step);
1202: }
1203: /* corner shared by three elements */
1204: else if (*num == 3)
1205: {
1206: num ++;
1207: rvec_add(base,vals+map[1]*step,step);
1208: rvec_add(base,vals+map[2]*step,step);
1209: }
1210: /* corner shared by four elements */
1211: else if (*num == 4)
1212: {
1213: num ++;
1214: rvec_add(base,vals+map[1]*step,step);
1215: rvec_add(base,vals+map[2]*step,step);
1216: rvec_add(base,vals+map[3]*step,step);
1217: }
1218: /* general case ... odd geoms ... 3D*/
1219: else
1220: {
1221: num++;
1222: while (*++map >= 0)
1223: {rvec_add(base,vals+*map*step,step);}
1224: }
1225: }
1226: return(0);
1227: }
1229: /******************************************************************************/
1230: static PetscErrorCode gs_gop_vec_local_out( gs_id *gs, PetscScalar *vals, PetscInt step)
1231: {
1232: PetscInt *num, *map, **reduce;
1233: PetscScalar *base;
1236: num = gs->num_gop_local_reduce;
1237: reduce = gs->gop_local_reduce;
1238: while ((map = *reduce++))
1239: {
1240: base = vals + map[0] * step;
1242: /* wall */
1243: if (*num == 2)
1244: {
1245: num ++;
1246: rvec_copy(vals+map[1]*step,base,step);
1247: }
1248: /* corner shared by three elements */
1249: else if (*num == 3)
1250: {
1251: num ++;
1252: rvec_copy(vals+map[1]*step,base,step);
1253: rvec_copy(vals+map[2]*step,base,step);
1254: }
1255: /* corner shared by four elements */
1256: else if (*num == 4)
1257: {
1258: num ++;
1259: rvec_copy(vals+map[1]*step,base,step);
1260: rvec_copy(vals+map[2]*step,base,step);
1261: rvec_copy(vals+map[3]*step,base,step);
1262: }
1263: /* general case ... odd geoms ... 3D*/
1264: else
1265: {
1266: num++;
1267: while (*++map >= 0)
1268: {rvec_copy(vals+*map*step,base,step);}
1269: }
1270: }
1271: return(0);
1272: }
1274: /******************************************************************************/
1275: static PetscErrorCode gs_gop_vec_pairwise_plus( gs_id *gs, PetscScalar *in_vals, PetscInt step)
1276: {
1277: PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1278: PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1279: PetscInt *pw, *list, *size, **nodes;
1280: MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1281: MPI_Status status;
1282: PetscBLASInt i1 = 1,dstep;
1286: /* strip and load s */
1287: msg_list =list = gs->pair_list;
1288: msg_size =size = gs->msg_sizes;
1289: msg_nodes=nodes = gs->node_list;
1290: iptr=pw = gs->pw_elm_list;
1291: dptr1=dptr3 = gs->pw_vals;
1292: msg_ids_in = ids_in = gs->msg_ids_in;
1293: msg_ids_out = ids_out = gs->msg_ids_out;
1294: dptr2 = gs->out;
1295: in1=in2 = gs->in;
1297: /* post the receives */
1298: /* msg_nodes=nodes; */
1299: do
1300: {
1301: /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1302: second one *list and do list++ afterwards */
1303: MPI_Irecv(in1, *size *step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->gs_comm, msg_ids_in);
1304: list++;msg_ids_in++;
1305: in1 += *size++ *step;
1306: }
1307: while (*++msg_nodes);
1308: msg_nodes=nodes;
1310: /* load gs values into in out gs buffers */
1311: while (*iptr >= 0)
1312: {
1313: rvec_copy(dptr3,in_vals + *iptr*step,step);
1314: dptr3+=step;
1315: iptr++;
1316: }
1318: /* load out buffers and post the sends */
1319: while ((iptr = *msg_nodes++))
1320: {
1321: dptr3 = dptr2;
1322: while (*iptr >= 0)
1323: {
1324: rvec_copy(dptr2,dptr1 + *iptr*step,step);
1325: dptr2+=step;
1326: iptr++;
1327: }
1328: MPI_Isend(dptr3, *msg_size *step, MPIU_SCALAR, *msg_list, MSGTAG1+my_id, gs->gs_comm, msg_ids_out);
1329: msg_size++; msg_list++;msg_ids_out++;
1330: }
1332: /* tree */
1333: if (gs->max_left_over)
1334: {gs_gop_vec_tree_plus(gs,in_vals,step);}
1336: /* process the received data */
1337: msg_nodes=nodes;
1338: while ((iptr = *nodes++)){
1339: PetscScalar d1 = 1.0;
1340: /* Should I check the return value of MPI_Wait() or status? */
1341: /* Can this loop be replaced by a call to MPI_Waitall()? */
1342: MPI_Wait(ids_in, &status);
1343: ids_in++;
1344: while (*iptr >= 0) {
1345: dstep = PetscBLASIntCast(step);
1346: BLASaxpy_(&dstep,&d1,in2,&i1,dptr1 + *iptr*step,&i1);
1347: in2+=step;
1348: iptr++;
1349: }
1350: }
1352: /* replace vals */
1353: while (*pw >= 0)
1354: {
1355: rvec_copy(in_vals + *pw*step,dptr1,step);
1356: dptr1+=step;
1357: pw++;
1358: }
1360: /* clear isend message handles */
1361: /* This changed for clarity though it could be the same */
1362: while (*msg_nodes++)
1363: /* Should I check the return value of MPI_Wait() or status? */
1364: /* Can this loop be replaced by a call to MPI_Waitall()? */
1365: {MPI_Wait(ids_out, &status);ids_out++;}
1366:
1368: return(0);
1369: }
1371: /******************************************************************************/
1372: static PetscErrorCode gs_gop_vec_tree_plus( gs_id *gs, PetscScalar *vals, PetscInt step)
1373: {
1374: PetscInt size, *in, *out;
1375: PetscScalar *buf, *work;
1376: PetscInt op[] = {GL_ADD,0};
1377: PetscBLASInt i1 = 1;
1380: /* copy over to local variables */
1381: in = gs->tree_map_in;
1382: out = gs->tree_map_out;
1383: buf = gs->tree_buf;
1384: work = gs->tree_work;
1385: size = gs->tree_nel*step;
1387: /* zero out collection buffer */
1388: rvec_zero(buf,size);
1391: /* copy over my contributions */
1392: while (*in >= 0)
1393: {
1394: PetscBLASInt dstep = PetscBLASIntCast(step);
1395: BLAScopy_(&dstep,vals + *in++*step,&i1,buf + *out++*step,&i1);
1396: }
1398: /* perform fan in/out on full buffer */
1399: /* must change grop to handle the blas */
1400: grop(buf,work,size,op);
1402: /* reset */
1403: in = gs->tree_map_in;
1404: out = gs->tree_map_out;
1406: /* get the portion of the results I need */
1407: while (*in >= 0)
1408: {
1409: PetscBLASInt dstep = PetscBLASIntCast(step);
1410: BLAScopy_(&dstep,buf + *out++*step,&i1,vals + *in++*step,&i1);
1411: }
1412: return(0);
1413: }
1415: /******************************************************************************/
1416: PetscErrorCode gs_gop_hc( gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim)
1417: {
1421: switch (*op) {
1422: case '+':
1423: gs_gop_plus_hc(gs,vals,dim);
1424: break;
1425: default:
1426: PetscInfo1(0,"gs_gop_hc() :: %c is not a valid op",op[0]);
1427: PetscInfo(0,"gs_gop_hc() :: default :: plus\n");
1428: gs_gop_plus_hc(gs,vals,dim);
1429: break;
1430: }
1431: return(0);
1432: }
1434: /******************************************************************************/
1435: static PetscErrorCode gs_gop_plus_hc( gs_id *gs, PetscScalar *vals, PetscInt dim)
1436: {
1438: /* if there's nothing to do return */
1439: if (dim<=0)
1440: { return(0);}
1442: /* can't do more dimensions then exist */
1443: dim = PetscMin(dim,i_log2_num_nodes);
1445: /* local only operations!!! */
1446: if (gs->num_local)
1447: {gs_gop_local_plus(gs,vals);}
1449: /* if intersection tree/pairwise and local isn't empty */
1450: if (gs->num_local_gop)
1451: {
1452: gs_gop_local_in_plus(gs,vals);
1454: /* pairwise will do tree inside ... */
1455: if (gs->num_pairs)
1456: {gs_gop_pairwise_plus_hc(gs,vals,dim);}
1458: /* tree only */
1459: else if (gs->max_left_over)
1460: {gs_gop_tree_plus_hc(gs,vals,dim);}
1461:
1462: gs_gop_local_out(gs,vals);
1463: }
1464: /* if intersection tree/pairwise and local is empty */
1465: else
1466: {
1467: /* pairwise will do tree inside */
1468: if (gs->num_pairs)
1469: {gs_gop_pairwise_plus_hc(gs,vals,dim);}
1470:
1471: /* tree */
1472: else if (gs->max_left_over)
1473: {gs_gop_tree_plus_hc(gs,vals,dim);}
1474: }
1475: return(0);
1476: }
1478: /******************************************************************************/
1479: static PetscErrorCode gs_gop_pairwise_plus_hc( gs_id *gs, PetscScalar *in_vals, PetscInt dim)
1480: {
1481: PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1482: PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1483: PetscInt *pw, *list, *size, **nodes;
1484: MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1485: MPI_Status status;
1486: PetscInt i, mask=1;
1490: for (i=1; i<dim; i++)
1491: {mask<<=1; mask++;}
1494: /* strip and load s */
1495: msg_list =list = gs->pair_list;
1496: msg_size =size = gs->msg_sizes;
1497: msg_nodes=nodes = gs->node_list;
1498: iptr=pw = gs->pw_elm_list;
1499: dptr1=dptr3 = gs->pw_vals;
1500: msg_ids_in = ids_in = gs->msg_ids_in;
1501: msg_ids_out = ids_out = gs->msg_ids_out;
1502: dptr2 = gs->out;
1503: in1=in2 = gs->in;
1505: /* post the receives */
1506: /* msg_nodes=nodes; */
1507: do
1508: {
1509: /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1510: second one *list and do list++ afterwards */
1511: if ((my_id|mask)==(*list|mask))
1512: {
1513: MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->gs_comm, msg_ids_in);
1514: list++; msg_ids_in++;in1 += *size++;
1515: }
1516: else
1517: {list++; size++;}
1518: }
1519: while (*++msg_nodes);
1521: /* load gs values into in out gs buffers */
1522: while (*iptr >= 0)
1523: {*dptr3++ = *(in_vals + *iptr++);}
1525: /* load out buffers and post the sends */
1526: msg_nodes=nodes;
1527: list = msg_list;
1528: while ((iptr = *msg_nodes++))
1529: {
1530: if ((my_id|mask)==(*list|mask))
1531: {
1532: dptr3 = dptr2;
1533: while (*iptr >= 0)
1534: {*dptr2++ = *(dptr1 + *iptr++);}
1535: /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1536: /* is msg_ids_out++ correct? */
1537: MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1+my_id, gs->gs_comm, msg_ids_out);
1538: msg_size++;list++;msg_ids_out++;
1539: }
1540: else
1541: {list++; msg_size++;}
1542: }
1544: /* do the tree while we're waiting */
1545: if (gs->max_left_over)
1546: {gs_gop_tree_plus_hc(gs,in_vals,dim);}
1548: /* process the received data */
1549: msg_nodes=nodes;
1550: list = msg_list;
1551: while ((iptr = *nodes++))
1552: {
1553: if ((my_id|mask)==(*list|mask))
1554: {
1555: /* Should I check the return value of MPI_Wait() or status? */
1556: /* Can this loop be replaced by a call to MPI_Waitall()? */
1557: MPI_Wait(ids_in, &status);
1558: ids_in++;
1559: while (*iptr >= 0)
1560: {*(dptr1 + *iptr++) += *in2++;}
1561: }
1562: list++;
1563: }
1565: /* replace vals */
1566: while (*pw >= 0)
1567: {*(in_vals + *pw++) = *dptr1++;}
1569: /* clear isend message handles */
1570: /* This changed for clarity though it could be the same */
1571: while (*msg_nodes++)
1572: {
1573: if ((my_id|mask)==(*msg_list|mask))
1574: {
1575: /* Should I check the return value of MPI_Wait() or status? */
1576: /* Can this loop be replaced by a call to MPI_Waitall()? */
1577: MPI_Wait(ids_out, &status);
1578: ids_out++;
1579: }
1580: msg_list++;
1581: }
1583: return(0);
1584: }
1586: /******************************************************************************/
1587: static PetscErrorCode gs_gop_tree_plus_hc(gs_id *gs, PetscScalar *vals, PetscInt dim)
1588: {
1589: PetscInt size;
1590: PetscInt *in, *out;
1591: PetscScalar *buf, *work;
1592: PetscInt op[] = {GL_ADD,0};
1595: in = gs->tree_map_in;
1596: out = gs->tree_map_out;
1597: buf = gs->tree_buf;
1598: work = gs->tree_work;
1599: size = gs->tree_nel;
1601: rvec_zero(buf,size);
1603: while (*in >= 0)
1604: {*(buf + *out++) = *(vals + *in++);}
1606: in = gs->tree_map_in;
1607: out = gs->tree_map_out;
1609: grop_hc(buf,work,size,op,dim);
1611: while (*in >= 0)
1612: {*(vals + *in++) = *(buf + *out++);}
1613: return(0);
1614: }