Actual source code: baijov.c
1: #define PETSCMAT_DLL
4: /*
5: Routines to compute overlapping regions of a parallel MPI matrix
6:   and to find submatrices that are shared across processors.
7: */
8: #include "../src/mat/impls/baij/mpi/mpibaij.h"
9: #include "petscbt.h"
11: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat,PetscInt,IS *);
12: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
13: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
14: EXTERN PetscErrorCode MatGetRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15: EXTERN PetscErrorCode MatRestoreRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
19: PetscErrorCode MatIncreaseOverlap_MPIBAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
20: {
22: PetscInt i,N=C->cmap->N, bs=C->rmap->bs;
23: IS *is_new;
26: PetscMalloc(imax*sizeof(IS),&is_new);
27: /* Convert the indices into block format */
28: ISCompressIndicesGeneral(N,bs,imax,is,is_new);
29: if (ov < 0){ SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");}
30: for (i=0; i<ov; ++i) {
31: MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);
32: }
33: for (i=0; i<imax; i++) {ISDestroy(is[i]);}
34: ISExpandIndicesGeneral(N,bs,imax,is_new,is);
35: for (i=0; i<imax; i++) {ISDestroy(is_new[i]);}
36: PetscFree(is_new);
37: return(0);
38: }
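/*
   Illustrative note on the block compression/expansion used above (a sketch with
   hypothetical values, kept inside #if 0 so it is never compiled): with block size
   bs = 2, ISCompressIndicesGeneral() maps the point indices {4,5,9} to the block
   indices {2,4}; after the overlap has been increased in block terms,
   ISExpandIndicesGeneral() maps each block b back to the points b*bs,...,b*bs+bs-1.
*/
#if 0
{
  PetscInt pt[] = {4,5,9};
  IS       is_pt,is_blk;
  ISCreateGeneral(PETSC_COMM_SELF,3,pt,&is_pt);
  ISCompressIndicesGeneral(10,2,1,&is_pt,&is_blk);  /* is_blk contains the block indices {2,4} */
  ISDestroy(is_pt);
  ISDestroy(is_blk);
}
#endif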
40: /*
41: Sample message format:
42: If a processor A wants processor B to process some elements corresponding
43: to index sets is[1], is[5]
44: mesg [0] = 2   (no of index sets in the message)
45: -----------
46: mesg [1] = 1 => is[1]
47: mesg [2] = size of is[1] (no of indices)
48: -----------
49: mesg [3] = 5 => is[5]
50: mesg [4] = size of is[5] (no of indices)
51: -----------
52: mesg [5]
53: mesg [n] data of is[1]
54: -----------
55: mesg[n+1]
56: mesg[m] data of is[5]
57: -----------
58:
59: Notes:
60: nrqs - no of requests sent (or to be sent out)
61: nrqr - no of requests received (which have to be, or have been, processed)
62: */
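/*
   A minimal packing sketch for the message format described above (hypothetical
   helper, not used elsewhere in this file; kept inside #if 0 so it is never
   compiled).  For the sample case is[1] = {30,31,36} and is[5] = {74} the packed
   message is
       mesg = { 2,  1,3,  5,1,  30,31,36,  74 }
*/
#if 0
static void PackOverlapRequest_Sketch(PetscInt nis,const PetscInt isid[],const PetscInt islen[],
                                      const PetscInt *const isdata[],PetscInt *mesg)
{
  PetscInt i,j,k = 2*nis + 1;            /* data starts right after the header */
  mesg[0] = nis;                         /* number of index sets in this message */
  for (i=0; i<nis; i++) {
    mesg[2*i+1] = isid[i];               /* which IS this block belongs to */
    mesg[2*i+2] = islen[i];              /* number of indices sent for that IS */
    for (j=0; j<islen[i]; j++) mesg[k++] = isdata[i][j];
  }
}
#endif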
65: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat C,PetscInt imax,IS is[])
66: {
67: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
68: const PetscInt **idx,*idx_i;
69: PetscInt *n,*w3,*w4,*rtable,**data,len;
71: PetscMPIInt size,rank,tag1,tag2,*w2,*w1,nrqr;
72: PetscInt Mbs,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
73: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
74: PetscMPIInt *onodes1,*olengths1,*onodes2,*olengths2;
75: PetscBT *table;
76: MPI_Comm comm;
77: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
78: MPI_Status *s_status,*recv_status;
81: comm = ((PetscObject)C)->comm;
82: size = c->size;
83: rank = c->rank;
84: Mbs = c->Mbs;
86: PetscObjectGetNewTag((PetscObject)C,&tag1);
87: PetscObjectGetNewTag((PetscObject)C,&tag2);
89: PetscMalloc3(imax+1,const PetscInt*,&idx,imax,PetscInt,&n,Mbs,PetscInt,&rtable);
90: for (i=0; i<imax; i++) {
91: ISGetIndices(is[i],&idx[i]);
92: ISGetLocalSize(is[i],&n[i]);
93: }
94:
95: /* Create hash table for the mapping: row -> proc */
96: for (i=0,j=0; i<size; i++) {
97: len = c->rangebs[i+1];
98: for (; j<len; j++) {
99: rtable[j] = i;
100: }
101: }
103: /* evaluate communication - mesg to whom, length of mesg, and buffer space
104: required. Based on this, buffers are allocated, and data copied into them */
105: PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscInt,&w3,size,PetscInt,&w4);
106: PetscMemzero(w1,size*sizeof(PetscMPIInt));
107: PetscMemzero(w2,size*sizeof(PetscMPIInt));
108: PetscMemzero(w3,size*sizeof(PetscInt));
109: for (i=0; i<imax; i++) {
110: PetscMemzero(w4,size*sizeof(PetscInt)); /* initialise work vector*/
111: idx_i = idx[i];
112: len = n[i];
113: for (j=0; j<len; j++) {
114: row = idx_i[j];
115: if (row < 0) {
116: SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
117: }
118: proc = rtable[row];
119: w4[proc]++;
120: }
121: for (j=0; j<size; j++){
122: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
123: }
124: }
126: nrqs = 0; /* no of outgoing messages */
127: msz = 0; /* total mesg length (for all procs) */
128: w1[rank] = 0; /* no mesg sent to itself */
129: w3[rank] = 0;
130: for (i=0; i<size; i++) {
131: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
132: }
133: /* pa - is list of processors to communicate with */
134: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);
135: for (i=0,j=0; i<size; i++) {
136: if (w1[i]) {pa[j] = i; j++;}
137: }
139: /* Each message would have a header = 1 + 2*(no of IS) + data */
140: for (i=0; i<nrqs; i++) {
141: j = pa[i];
142: w1[j] += w2[j] + 2*w3[j];
143: msz += w1[j];
144: }
145:
146: /* Determine the number of messages to expect, their lengths, and the from-ids */
147: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
148: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
150: /* Now post the Irecvs corresponding to these messages */
151: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
152:
153: /* Allocate Memory for outgoing messages */
154: PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);
155: PetscMemzero(outdat,size*sizeof(PetscInt*));
156: PetscMemzero(ptr,size*sizeof(PetscInt*));
157: {
158: PetscInt *iptr = tmp,ict = 0;
159: for (i=0; i<nrqs; i++) {
160: j = pa[i];
161: iptr += ict;
162: outdat[j] = iptr;
163: ict = w1[j];
164: }
165: }
167: /* Form the outgoing messages */
168: /*plug in the headers*/
169: for (i=0; i<nrqs; i++) {
170: j = pa[i];
171: outdat[j][0] = 0;
172: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
173: ptr[j] = outdat[j] + 2*w3[j] + 1;
174: }
175:
176: /* Memory for doing local proc's work*/
177: {
178: PetscInt *d_p;
179: char *t_p;
181: /* should change to use PetscMallocN() */
182: PetscMalloc((imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
183: (Mbs)*imax*sizeof(PetscInt) + (Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char),&table);
184: PetscMemzero(table,(imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
185: (Mbs)*imax*sizeof(PetscInt) + (Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));
186: data = (PetscInt **)(table + imax);
187: isz = (PetscInt *)(data + imax);
188: d_p = (PetscInt *)(isz + imax);
189: t_p = (char *)(d_p + Mbs*imax);
190: for (i=0; i<imax; i++) {
191: table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
192: data[i] = d_p + (Mbs)*i;
193: }
194: }
196: /* Parse the IS and update local tables and the outgoing buf with the data*/
197: {
198: PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
199: PetscBT table_i;
201: for (i=0; i<imax; i++) {
202: PetscMemzero(ctr,size*sizeof(PetscInt));
203: n_i = n[i];
204: table_i = table[i];
205: idx_i = idx[i];
206: data_i = data[i];
207: isz_i = isz[i];
208: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
209: row = idx_i[j];
210: proc = rtable[row];
211: if (proc != rank) { /* copy to the outgoing buffer */
212: ctr[proc]++;
213: *ptr[proc] = row;
214: ptr[proc]++;
215: } else { /* Update the local table */
216: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
217: }
218: }
219: /* Update the headers for the current IS */
220: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
221: if ((ctr_j = ctr[j])) {
222: outdat_j = outdat[j];
223: k = ++outdat_j[0];
224: outdat_j[2*k] = ctr_j;
225: outdat_j[2*k-1] = i;
226: }
227: }
228: isz[i] = isz_i;
229: }
230: }
231:
232: /* Now post the sends */
233: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
234: for (i=0; i<nrqs; ++i) {
235: j = pa[i];
236: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
237: }
238:
239: /* No longer need the original indices*/
240: for (i=0; i<imax; ++i) {
241: ISRestoreIndices(is[i],idx+i);
242: }
243: PetscFree3(idx,n,rtable);
245: for (i=0; i<imax; ++i) {
246: ISDestroy(is[i]);
247: }
248:
249: /* Do Local work*/
250: MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);
252: /* Receive messages*/
253: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);
254: if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}
255:
256: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);
257: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}
259: /* Phase 1 sends are complete - deallocate buffers */
260: PetscFree4(outdat,ptr,tmp,ctr);
261: PetscFree4(w1,w2,w3,w4);
263: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);
264: PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);
265: MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
266: PetscFree(rbuf[0]);
267: PetscFree(rbuf);
269: /* Send the data back*/
270: /* Do a global reduction to know the buffer space req for incoming messages*/
271: {
272: PetscMPIInt *rw1;
273:
274: PetscMalloc(size*sizeof(PetscMPIInt),&rw1);
275: PetscMemzero(rw1,size*sizeof(PetscMPIInt));
277: for (i=0; i<nrqr; ++i) {
278: proc = recv_status[i].MPI_SOURCE;
279: if (proc != onodes1[i]) SETERRQ(PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
280: rw1[proc] = isz1[i];
281: }
282:
283: PetscFree(onodes1);
284: PetscFree(olengths1);
286: /* Determine the number of messages to expect, their lengths, and the from-ids */
287: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
288: PetscFree(rw1);
289: }
290: /* Now post the Irecvs corresponding to these messages */
291: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
292:
293: /* Now post the sends */
294: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
295: for (i=0; i<nrqr; ++i) {
296: j = recv_status[i].MPI_SOURCE;
297: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
298: }
300: /* receive work done on other processors*/
301: {
302: PetscMPIInt idex;
303: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
304: PetscBT table_i;
305: MPI_Status *status2;
306:
307: PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);
308: for (i=0; i<nrqs; ++i) {
309: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
310: /* Process the message*/
311: rbuf2_i = rbuf2[idex];
312: ct1 = 2*rbuf2_i[0]+1;
313: jmax = rbuf2[idex][0];
314: for (j=1; j<=jmax; j++) {
315: max = rbuf2_i[2*j];
316: is_no = rbuf2_i[2*j-1];
317: isz_i = isz[is_no];
318: data_i = data[is_no];
319: table_i = table[is_no];
320: for (k=0; k<max; k++,ct1++) {
321: row = rbuf2_i[ct1];
322: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
323: }
324: isz[is_no] = isz_i;
325: }
326: }
327: if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
328: PetscFree(status2);
329: }
330:
331: for (i=0; i<imax; ++i) {
332: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);
333: }
334:
335:
336: PetscFree(onodes2);
337: PetscFree(olengths2);
339: PetscFree(pa);
340: PetscFree(rbuf2[0]);
341: PetscFree(rbuf2);
342: PetscFree(s_waits1);
343: PetscFree(r_waits1);
344: PetscFree(s_waits2);
345: PetscFree(r_waits2);
346: PetscFree(table);
347: PetscFree(s_status);
348: PetscFree(recv_status);
349: PetscFree(xdata[0]);
350: PetscFree(xdata);
351: PetscFree(isz1);
352: return(0);
353: }
357: /*
358: MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap_MPIBAIJ_Once() to do
359: the work on the local processor.
361: Inputs:
362: C - MAT_MPIBAIJ;
363: imax - total no of index sets processed at a time;
364: table - an array of PetscBT bit tables, each of size Mbs bits.
365:
366: Output:
367: isz - array containing the count of the solution elements corresponding
368: to each index set;
369: data - pointer to the solutions
370: */
371: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
372: {
373: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
374: Mat A = c->A,B = c->B;
375: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
376: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
377: PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
378: PetscBT table_i;
381: rstart = c->rstartbs;
382: cstart = c->cstartbs;
383: ai = a->i;
384: aj = a->j;
385: bi = b->i;
386: bj = b->j;
387: garray = c->garray;
389:
390: for (i=0; i<imax; i++) {
391: data_i = data[i];
392: table_i = table[i];
393: isz_i = isz[i];
394: for (j=0,max=isz[i]; j<max; j++) {
395: row = data_i[j] - rstart;
396: start = ai[row];
397: end = ai[row+1];
398: for (k=start; k<end; k++) { /* Amat */
399: val = aj[k] + cstart;
400: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
401: }
402: start = bi[row];
403: end = bi[row+1];
404: for (k=start; k<end; k++) { /* Bmat */
405: val = garray[bj[k]];
406: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
407: }
408: }
409: isz[i] = isz_i;
410: }
411: return(0);
412: }
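/*
   The loops above rely on the PetscBTLookupSet() idiom: the call returns the
   previous value of the bit for val and then sets it, so a zero return means the
   (block) index is seen for the first time and is appended to the list.  A
   stand-alone sketch with hypothetical names (kept inside #if 0):
*/
#if 0
static void AppendIfUnseen_Sketch(PetscBT seen,PetscInt val,PetscInt *list,PetscInt *len)
{
  if (!PetscBTLookupSet(seen,val)) list[(*len)++] = val;  /* bit was clear: first occurrence */
}
#endif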
415: /*
416: MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
417: and return the output
419: Input:
420: C - the matrix
421: nrqr - no of messages being processed.
422: rbuf - an array of pointers to the received requests
423:
424: Output:
425: xdata - array of messages to be sent back
426: isz1 - size of each message
428: For better efficiency perhaps we should malloc each xdata[i] separately;
429: then if a remalloc is required we need only copy the data for that one row,
430: rather than for all previous rows as is done now, where a single large chunk
431: of memory is used.
433: */
434: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
435: {
436: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
437: Mat A = c->A,B = c->B;
438: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
440: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
441: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
442: PetscInt val,max1,max2,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
443: PetscInt *rbuf_i,kmax,rbuf_0;
444: PetscBT xtable;
445: PetscMPIInt rank;
448: rank = c->rank;
449: Mbs = c->Mbs;
450: rstart = c->rstartbs;
451: cstart = c->cstartbs;
452: ai = a->i;
453: aj = a->j;
454: bi = b->i;
455: bj = b->j;
456: garray = c->garray;
457:
458:
459: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
460: rbuf_i = rbuf[i];
461: rbuf_0 = rbuf_i[0];
462: ct += rbuf_0;
463: for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
464: }
465:
466: if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
467: else max1 = 1;
468: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
469: PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);
470: ++no_malloc;
471: PetscBTCreate(Mbs,xtable);
472: PetscMemzero(isz1,nrqr*sizeof(PetscInt));
473:
474: ct3 = 0;
475: for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
476: rbuf_i = rbuf[i];
477: rbuf_0 = rbuf_i[0];
478: ct1 = 2*rbuf_0+1;
479: ct2 = ct1;
480: ct3 += ct1;
481: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
482: PetscBTMemzero(Mbs,xtable);
483: oct2 = ct2;
484: kmax = rbuf_i[2*j];
485: for (k=0; k<kmax; k++,ct1++) {
486: row = rbuf_i[ct1];
487: if (!PetscBTLookupSet(xtable,row)) {
488: if (!(ct3 < mem_estimate)) {
489: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
490: PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);
491: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
492: PetscFree(xdata[0]);
493: xdata[0] = tmp;
494: mem_estimate = new_estimate; ++no_malloc;
495: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
496: }
497: xdata[i][ct2++] = row;
498: ct3++;
499: }
500: }
501: for (k=oct2,max2=ct2; k<max2; k++) {
502: row = xdata[i][k] - rstart;
503: start = ai[row];
504: end = ai[row+1];
505: for (l=start; l<end; l++) {
506: val = aj[l] + cstart;
507: if (!PetscBTLookupSet(xtable,val)) {
508: if (!(ct3 < mem_estimate)) {
509: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
510: PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);
511: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
512: PetscFree(xdata[0]);
513: xdata[0] = tmp;
514: mem_estimate = new_estimate; ++no_malloc;
515: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
516: }
517: xdata[i][ct2++] = val;
518: ct3++;
519: }
520: }
521: start = bi[row];
522: end = bi[row+1];
523: for (l=start; l<end; l++) {
524: val = garray[bj[l]];
525: if (!PetscBTLookupSet(xtable,val)) {
526: if (!(ct3 < mem_estimate)) {
527: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
528: PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);
529: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
530: PetscFree(xdata[0]);
531: xdata[0] = tmp;
532: mem_estimate = new_estimate; ++no_malloc;
533: for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
534: }
535: xdata[i][ct2++] = val;
536: ct3++;
537: }
538: }
539: }
540: /* Update the header*/
541: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
542: xdata[i][2*j-1] = rbuf_i[2*j-1];
543: }
544: xdata[i][0] = rbuf_0;
545: xdata[i+1] = xdata[i] + ct2;
546: isz1[i] = ct2; /* size of each message */
547: }
548: PetscBTDestroy(xtable);
549: PetscInfo3(C,"Allocated %D bytes, required %D, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);
550: return(0);
551: }
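/*
   Sketch of the grow-and-rebase step used above whenever the single xdata[0]
   buffer runs out of space: allocate roughly 1.5x the old size, copy, free the
   old chunk, and recompute the per-message pointers from the sizes gathered so
   far.  The function name and signature are hypothetical (kept inside #if 0).
*/
#if 0
static PetscErrorCode GrowSharedBuffer_Sketch(PetscInt **xdata,PetscInt *isz1,PetscInt upto,PetscInt *mem_estimate)
{
  PetscInt *tmp,ctr,new_estimate = (PetscInt)(1.5*(*mem_estimate))+1;
  PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);
  PetscMemcpy(tmp,xdata[0],(*mem_estimate)*sizeof(PetscInt));
  PetscFree(xdata[0]);
  xdata[0]      = tmp;
  *mem_estimate = new_estimate;
  for (ctr=1; ctr<=upto; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];  /* rebase the message pointers */
  return(0);
}
#endif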
553: static PetscErrorCode MatGetSubMatrices_MPIBAIJ_local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat *);
557: PetscErrorCode MatGetSubMatrices_MPIBAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
558: {
559: IS *isrow_new,*iscol_new;
560: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
562: PetscInt nmax,nstages_local,nstages,i,pos,max_no,N=C->cmap->N,bs=C->rmap->bs;
565: /* The compression and expansion should be avoided. It doesn't point
566: out errors, and it might change the indices, hence buggy */
568: PetscMalloc2(ismax+1,IS,&isrow_new,ismax+1,IS,&iscol_new);
569: ISCompressIndicesGeneral(N,bs,ismax,isrow,isrow_new);
570: ISCompressIndicesGeneral(N,bs,ismax,iscol,iscol_new);
572: /* Allocate memory to hold all the submatrices */
573: if (scall != MAT_REUSE_MATRIX) {
574: PetscMalloc((ismax+1)*sizeof(Mat),submat);
575: }
576: /* Determine the number of stages through which submatrices are done */
577: nmax = 20*1000000 / (c->Nbs * sizeof(PetscInt));
578: if (!nmax) nmax = 1;
579: nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
580:
581: /* Make sure every processor loops through the nstages */
582: MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);
583: for (i=0,pos=0; i<nstages; i++) {
584: if (pos+nmax <= ismax) max_no = nmax;
585: else if (pos == ismax) max_no = 0;
586: else max_no = ismax-pos;
587: MatGetSubMatrices_MPIBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,*submat+pos);
588: pos += max_no;
589: }
590:
591: for (i=0; i<ismax; i++) {
592: ISDestroy(isrow_new[i]);
593: ISDestroy(iscol_new[i]);
594: }
595: PetscFree2(isrow_new,iscol_new);
596: return(0);
597: }
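/*
   Worked example of the staging above (illustrative numbers): with ismax = 7
   requested submatrices and nmax = 3 per stage, nstages_local = 3 and the three
   stages handle max_no = 3, 3 and 1.  A process with ismax = 0 still executes
   nstages iterations with max_no = 0, so that the communication inside
   MatGetSubMatrices_MPIBAIJ_local() matches up on every process.
*/
#if 0
  /* same staging arithmetic in isolation (hypothetical values) */
  ismax = 7; nmax = 3;
  nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);   /* = 3 */
#endif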
599: #if defined (PETSC_USE_CTABLE)
602: PetscErrorCode PetscGetProc(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank)
603: {
604: PetscInt nGlobalNd = proc_gnode[size];
605: PetscMPIInt fproc = PetscMPIIntCast( (PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5)));
606:
608: if (fproc > size) fproc = size;
609: while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) {
610: if (row < proc_gnode[fproc]) fproc--;
611: else fproc++;
612: }
613: *rank = fproc;
614: return(0);
615: }
616: #endif
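/*
   Usage sketch for PetscGetProc() with hypothetical ownership ranges (kept
   inside #if 0): with 4 processes and proc_gnode[] = {0,10,20,30,40}, row 25
   gives the initial guess fproc = (PetscInt)(25*4/40.0 + 0.5) = 3, and the
   search then steps down once because 25 < proc_gnode[3], returning rank 2.
*/
#if 0
{
  PetscInt    proc_gnode[] = {0,10,20,30,40};
  PetscMPIInt owner;
  PetscGetProc(25,4,proc_gnode,&owner);   /* owner = 2 */
}
#endif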
618: /* -------------------------------------------------------------------------*/
619: /* This code is used for BAIJ and SBAIJ matrices (unfortunate dependency) */
622: static PetscErrorCode MatGetSubMatrices_MPIBAIJ_local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
623: {
624: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
625: Mat A = c->A;
626: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
627: const PetscInt **irow,**icol,*irow_i;
628: PetscInt *nrow,*ncol,*w3,*w4,start;
630: PetscMPIInt size,tag0,tag1,tag2,tag3,*w1,*w2,nrqr,idex,end,proc;
631: PetscInt **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,**rbuf1,row;
632: PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
633: PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
634: PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
635: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
636: PetscInt bs=C->rmap->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA,*cworkB;
637: PetscInt cstart = c->cstartbs,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
638: PetscInt *bmap = c->garray,ctmp,rstart=c->rstartbs;
639: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
640: MPI_Request *r_waits4,*s_waits3,*s_waits4;
641: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
642: MPI_Status *r_status3,*r_status4,*s_status4;
643: MPI_Comm comm;
644: MatScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB;
645: MatScalar *a_a=a->a,*b_a=b->a;
646: PetscTruth flag;
647: PetscMPIInt *onodes1,*olengths1;
649: #if defined (PETSC_USE_CTABLE)
650: PetscInt tt;
651: PetscTable *rowmaps,*colmaps,lrow1_grow1,lcol1_gcol1;
652: #else
653: PetscInt **cmap,*cmap_i,*rtable,*rmap_i,**rmap, Mbs = c->Mbs;
654: #endif
657: comm = ((PetscObject)C)->comm;
658: tag0 = ((PetscObject)C)->tag;
659: size = c->size;
660: rank = c->rank;
661:
662: /* Get some new tags to keep the communication clean */
663: PetscObjectGetNewTag((PetscObject)C,&tag1);
664: PetscObjectGetNewTag((PetscObject)C,&tag2);
665: PetscObjectGetNewTag((PetscObject)C,&tag3);
667: #if defined(PETSC_USE_CTABLE)
668: PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);
669: #else
670: PetscMalloc5(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol,Mbs+1,PetscInt,&rtable);
671: /* Create hash table for the mapping: row -> proc */
672: for (i=0,j=0; i<size; i++) {
673: jmax = c->rowners[i+1];
674: for (; j<jmax; j++) {
675: rtable[j] = i;
676: }
677: }
678: #endif
679:
680: for (i=0; i<ismax; i++) {
681: ISGetIndices(isrow[i],&irow[i]);
682: ISGetIndices(iscol[i],&icol[i]);
683: ISGetLocalSize(isrow[i],&nrow[i]);
684: ISGetLocalSize(iscol[i],&ncol[i]);
685: }
687: /* evaluate communication - mesg to whom, length of mesg, and buffer space
688: required. Based on this, buffers are allocated, and data copied into them */
689: PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscInt,&w3,size,PetscInt,&w4);
690: PetscMemzero(w1,size*sizeof(PetscMPIInt));
691: PetscMemzero(w2,size*sizeof(PetscMPIInt));
692: PetscMemzero(w3,size*sizeof(PetscInt));
693: for (i=0; i<ismax; i++) {
694: PetscMemzero(w4,size*sizeof(PetscInt)); /* initialise work vector*/
695: jmax = nrow[i];
696: irow_i = irow[i];
697: for (j=0; j<jmax; j++) {
698: row = irow_i[j];
699: #if defined (PETSC_USE_CTABLE)
700: PetscGetProc(row,size,c->rangebs,&proc);
701: #else
702: proc = rtable[row];
703: #endif
704: w4[proc]++;
705: }
706: for (j=0; j<size; j++) {
707: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
708: }
709: }
711: nrqs = 0; /* no of outgoing messages */
712: msz = 0; /* total mesg length for all procs */
713: w1[rank] = 0; /* no mesg sent to itself */
714: w3[rank] = 0;
715: for (i=0; i<size; i++) {
716: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
717: }
718: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa); /*(proc -array)*/
719: for (i=0,j=0; i<size; i++) {
720: if (w1[i]) { pa[j] = i; j++; }
721: }
723: /* Each message would have a header = 1 + 2*(no of IS) + data */
724: for (i=0; i<nrqs; i++) {
725: j = pa[i];
726: w1[j] += w2[j] + 2* w3[j];
727: msz += w1[j];
728: }
730: /* Determine the number of messages to expect, their lengths, and the from-ids */
731: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
732: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
734: /* Now post the Irecvs corresponding to these messages */
735: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
736:
737: PetscFree(onodes1);
738: PetscFree(olengths1);
740: /* Allocate Memory for outgoing messages */
741: PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);
742: PetscMemzero(sbuf1,size*sizeof(PetscInt*));
743: PetscMemzero(ptr,size*sizeof(PetscInt*));
744: {
745: PetscInt *iptr = tmp,ict = 0;
746: for (i=0; i<nrqs; i++) {
747: j = pa[i];
748: iptr += ict;
749: sbuf1[j] = iptr;
750: ict = w1[j];
751: }
752: }
754: /* Form the outgoing messages */
755: /* Initialise the header space */
756: for (i=0; i<nrqs; i++) {
757: j = pa[i];
758: sbuf1[j][0] = 0;
759: PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
760: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
761: }
762:
763: /* Parse the isrow and copy data into outbuf */
764: for (i=0; i<ismax; i++) {
765: PetscMemzero(ctr,size*sizeof(PetscInt));
766: irow_i = irow[i];
767: jmax = nrow[i];
768: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
769: row = irow_i[j];
770: #if defined (PETSC_USE_CTABLE)
771: PetscGetProc(row,size,c->rangebs,&proc);
772: #else
773: proc = rtable[row];
774: #endif
775: if (proc != rank) { /* copy to the outgoing buf*/
776: ctr[proc]++;
777: *ptr[proc] = row;
778: ptr[proc]++;
779: }
780: }
781: /* Update the headers for the current IS */
782: for (j=0; j<size; j++) { /* Can Optimise this loop too */
783: if ((ctr_j = ctr[j])) {
784: sbuf1_j = sbuf1[j];
785: k = ++sbuf1_j[0];
786: sbuf1_j[2*k] = ctr_j;
787: sbuf1_j[2*k-1] = i;
788: }
789: }
790: }
792: /* Now post the sends */
793: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
794: for (i=0; i<nrqs; ++i) {
795: j = pa[i];
796: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
797: }
799: /* Post Receives to capture the buffer size */
800: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);
801: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);
802: rbuf2[0] = tmp + msz;
803: for (i=1; i<nrqs; ++i) {
804: j = pa[i];
805: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
806: }
807: for (i=0; i<nrqs; ++i) {
808: j = pa[i];
809: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);
810: }
812: /* Send to other procs the buf size they should allocate */
814: /* Receive messages*/
815: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
816: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);
817: PetscMalloc3(nrqr+1,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);
818: {
819: Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
820: PetscInt *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;
822: for (i=0; i<nrqr; ++i) {
823: MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);
824: req_size[idex] = 0;
825: rbuf1_i = rbuf1[idex];
826: start = 2*rbuf1_i[0] + 1;
827: MPI_Get_count(r_status1+i,MPIU_INT,&end);
828: PetscMalloc(end*sizeof(PetscInt),&sbuf2[idex]);
829: sbuf2_i = sbuf2[idex];
830: for (j=start; j<end; j++) {
831: id = rbuf1_i[j] - rstart;
832: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
833: sbuf2_i[j] = ncols;
834: req_size[idex] += ncols;
835: }
836: req_source[idex] = r_status1[i].MPI_SOURCE;
837: /* form the header */
838: sbuf2_i[0] = req_size[idex];
839: for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
840: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);
841: }
842: }
843: PetscFree(r_status1);
844: PetscFree(r_waits1);
846: /* recv buffer sizes */
847: /* Receive messages*/
849: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);
850: PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);
851: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);
852: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);
853: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);
855: for (i=0; i<nrqs; ++i) {
856: MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);
857: PetscMalloc(rbuf2[idex][0]*sizeof(PetscInt),&rbuf3[idex]);
858: PetscMalloc(rbuf2[idex][0]*bs2*sizeof(MatScalar),&rbuf4[idex]);
859: MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);
860: MPI_Irecv(rbuf4[idex],rbuf2[idex][0]*bs2,MPIU_MATSCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);
861: }
862: PetscFree(r_status2);
863: PetscFree(r_waits2);
864:
865: /* Wait on sends1 and sends2 */
866: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);
867: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);
869: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
870: if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
871: PetscFree(s_status1);
872: PetscFree(s_status2);
873: PetscFree(s_waits1);
874: PetscFree(s_waits2);
876: /* Now allocate buffers for a->j, and send them off */
877: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);
878: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
879: PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);
880: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
881:
882: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);
883: {
884: for (i=0; i<nrqr; i++) {
885: rbuf1_i = rbuf1[i];
886: sbuf_aj_i = sbuf_aj[i];
887: ct1 = 2*rbuf1_i[0] + 1;
888: ct2 = 0;
889: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
890: kmax = rbuf1[i][2*j];
891: for (k=0; k<kmax; k++,ct1++) {
892: row = rbuf1_i[ct1] - rstart;
893: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
894: ncols = nzA + nzB;
895: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
897: /* load the column indices for this row into cols*/
898: cols = sbuf_aj_i + ct2;
899: for (l=0; l<nzB; l++) {
900: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
901: else break;
902: }
903: imark = l;
904: for (l=0; l<nzA; l++) cols[imark+l] = cstart + cworkA[l];
905: for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
906: ct2 += ncols;
907: }
908: }
909: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);
910: }
911: }
912: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);
913: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);
915: /* Allocate buffers for a->a, and send them off */
916: PetscMalloc((nrqr+1)*sizeof(MatScalar *),&sbuf_aa);
917: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
918: PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);
919: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
920:
921: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);
922: {
923: for (i=0; i<nrqr; i++) {
924: rbuf1_i = rbuf1[i];
925: sbuf_aa_i = sbuf_aa[i];
926: ct1 = 2*rbuf1_i[0]+1;
927: ct2 = 0;
928: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
929: kmax = rbuf1_i[2*j];
930: for (k=0; k<kmax; k++,ct1++) {
931: row = rbuf1_i[ct1] - rstart;
932: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
933: ncols = nzA + nzB;
934: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
935: vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;
937: /* load the column values for this row into vals*/
938: vals = sbuf_aa_i+ct2*bs2;
939: for (l=0; l<nzB; l++) {
940: if ((bmap[cworkB[l]]) < cstart) {
941: PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
942: }
943: else break;
944: }
945: imark = l;
946: for (l=0; l<nzA; l++) {
947: PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));
948: }
949: for (l=imark; l<nzB; l++) {
950: PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
951: }
952: ct2 += ncols;
953: }
954: }
955: MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);
956: }
957: }
958: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);
959: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);
960: PetscFree(rbuf1[0]);
961: PetscFree(rbuf1);
963: /* Form the matrix */
964: /* create col map */
965: {
966: const PetscInt *icol_i;
967: #if defined (PETSC_USE_CTABLE)
968: /* Create column map */
969: PetscMalloc((1+ismax)*sizeof(PetscTable),&colmaps);
970: for (i=0; i<ismax; i++) {
971: PetscTableCreate(ncol[i]+1,&colmaps[i]);
972: }
973: #else
974: PetscMalloc(ismax*sizeof(PetscInt*),&cmap);
975: PetscMalloc(ismax*c->Nbs*sizeof(PetscInt),&cmap[0]);
976: for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; }
977: #endif
978: for (i=0; i<ismax; i++) {
979: jmax = ncol[i];
980: icol_i = icol[i];
981: #if defined (PETSC_USE_CTABLE)
982: lcol1_gcol1 = colmaps[i];
983: for (j=0; j<jmax; j++) {
984: PetscTableAdd(lcol1_gcol1,icol_i[j]+1,j+1);
985: }
986: #else
987: cmap_i = cmap[i];
988: for (j=0; j<jmax; j++) {
989: cmap_i[icol_i[j]] = j+1;
990: }
991: #endif
992: }
993: }
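/*
   Note on the column maps just built: for submatrix i the entry stored for a
   global column gcol is lcol+1, the 1-based position of gcol in iscol[i]; a
   value (or table lookup result) of 0 therefore means "column not in this
   submatrix", which is why the assembly code below stores tcol - 1 whenever
   the lookup is nonzero.
*/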
995: /* Create lens which is required for MatCreate... */
996: for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
997: PetscMalloc((1+ismax)*sizeof(PetscInt*)+ j*sizeof(PetscInt),&lens);
998: lens[0] = (PetscInt*)(lens + ismax);
999: PetscMemzero(lens[0],j*sizeof(PetscInt));
1000: for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1001:
1002: /* Update lens from local data */
1003: for (i=0; i<ismax; i++) {
1004: jmax = nrow[i];
1005: #if defined (PETSC_USE_CTABLE)
1006: lcol1_gcol1 = colmaps[i];
1007: #else
1008: cmap_i = cmap[i];
1009: #endif
1010: irow_i = irow[i];
1011: lens_i = lens[i];
1012: for (j=0; j<jmax; j++) {
1013: row = irow_i[j];
1014: #if defined (PETSC_USE_CTABLE)
1015: PetscGetProc(row,size,c->rangebs,&proc);
1016: #else
1017: proc = rtable[row];
1018: #endif
1019: if (proc == rank) {
1020: /* Get indices from matA and then from matB */
1021: row = row - rstart;
1022: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1023: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1024: #if defined (PETSC_USE_CTABLE)
1025: for (k=0; k<nzA; k++) {
1026: PetscTableFind(lcol1_gcol1,cstart+cworkA[k]+1,&tt);
1027: if (tt) { lens_i[j]++; }
1028: }
1029: for (k=0; k<nzB; k++) {
1030: PetscTableFind(lcol1_gcol1,bmap[cworkB[k]]+1,&tt);
1031: if (tt) { lens_i[j]++; }
1032: }
1033: #else
1034: for (k=0; k<nzA; k++) {
1035: if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++; }
1036: }
1037: for (k=0; k<nzB; k++) {
1038: if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++; }
1039: }
1040: #endif
1041: }
1042: }
1043: }
1044: #if defined (PETSC_USE_CTABLE)
1045: /* Create row map*/
1046: PetscMalloc((1+ismax)*sizeof(PetscTable),&rowmaps);
1047: for (i=0; i<ismax; i++){
1048: PetscTableCreate(nrow[i]+1,&rowmaps[i]);
1049: }
1050: #else
1051: /* Create row map*/
1052: PetscMalloc((1+ismax)*sizeof(PetscInt*)+ ismax*Mbs*sizeof(PetscInt),&rmap);
1053: rmap[0] = (PetscInt*)(rmap + ismax);
1054: PetscMemzero(rmap[0],ismax*Mbs*sizeof(PetscInt));
1055: for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + Mbs;}
1056: #endif
1057: for (i=0; i<ismax; i++) {
1058: irow_i = irow[i];
1059: jmax = nrow[i];
1060: #if defined (PETSC_USE_CTABLE)
1061: lrow1_grow1 = rowmaps[i];
1062: for (j=0; j<jmax; j++) {
1063: PetscTableAdd(lrow1_grow1,irow_i[j]+1,j+1);
1064: }
1065: #else
1066: rmap_i = rmap[i];
1067: for (j=0; j<jmax; j++) {
1068: rmap_i[irow_i[j]] = j;
1069: }
1070: #endif
1071: }
1073: /* Update lens from offproc data */
1074: {
1075: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1076: PetscMPIInt ii;
1078: for (tmp2=0; tmp2<nrqs; tmp2++) {
1079: MPI_Waitany(nrqs,r_waits3,&ii,r_status3+tmp2);
1080: idex = pa[ii];
1081: sbuf1_i = sbuf1[idex];
1082: jmax = sbuf1_i[0];
1083: ct1 = 2*jmax+1;
1084: ct2 = 0;
1085: rbuf2_i = rbuf2[ii];
1086: rbuf3_i = rbuf3[ii];
1087: for (j=1; j<=jmax; j++) {
1088: is_no = sbuf1_i[2*j-1];
1089: max1 = sbuf1_i[2*j];
1090: lens_i = lens[is_no];
1091: #if defined (PETSC_USE_CTABLE)
1092: lcol1_gcol1 = colmaps[is_no];
1093: lrow1_grow1 = rowmaps[is_no];
1094: #else
1095: cmap_i = cmap[is_no];
1096: rmap_i = rmap[is_no];
1097: #endif
1098: for (k=0; k<max1; k++,ct1++) {
1099: #if defined (PETSC_USE_CTABLE)
1100: PetscTableFind(lrow1_grow1,sbuf1_i[ct1]+1,&row);
1101: row--;
1102: if (row < 0) { SETERRQ(PETSC_ERR_PLIB,"row not found in table"); }
1103: #else
1104: row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1105: #endif
1106: max2 = rbuf2_i[ct1];
1107: for (l=0; l<max2; l++,ct2++) {
1108: #if defined (PETSC_USE_CTABLE)
1109: PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tt);
1110: if (tt) {
1111: lens_i[row]++;
1112: }
1113: #else
1114: if (cmap_i[rbuf3_i[ct2]]) {
1115: lens_i[row]++;
1116: }
1117: #endif
1118: }
1119: }
1120: }
1121: }
1122: }
1123: PetscFree(r_status3);
1124: PetscFree(r_waits3);
1125: if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
1126: PetscFree(s_status3);
1127: PetscFree(s_waits3);
1129: /* Create the submatrices */
1130: if (scall == MAT_REUSE_MATRIX) {
1131: /*
1132: Assumes new rows are same length as the old rows, hence bug!
1133: */
1134: for (i=0; i<ismax; i++) {
1135: mat = (Mat_SeqBAIJ *)(submats[i]->data);
1136: if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i]) || (submats[i]->rmap->bs != bs)) {
1137: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1138: }
1139: PetscMemcmp(mat->ilen,lens[i],mat->mbs *sizeof(PetscInt),&flag);
1140: if (!flag) {
1141: SETERRQ(PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros");
1142: }
1143: /* Initialize the matrix as if empty */
1144: PetscMemzero(mat->ilen,mat->mbs*sizeof(PetscInt));
1145: submats[i]->factor = C->factor;
1146: }
1147: } else {
1148: for (i=0; i<ismax; i++) {
1149: MatCreate(PETSC_COMM_SELF,submats+i);
1150: MatSetSizes(submats[i],nrow[i]*bs,ncol[i]*bs,nrow[i]*bs,ncol[i]*bs);
1151: MatSetType(submats[i],((PetscObject)A)->type_name);
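/* note: both preallocation routines below are called; only the one matching the
   actual matrix type takes effect, since this routine also serves SBAIJ (see the
   BAIJ/SBAIJ dependency note above) */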
1152: MatSeqBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);
1153: MatSeqSBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);
1154: }
1155: }
1157: /* Assemble the matrices */
1158: /* First assemble the local rows */
1159: {
1160: PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i;
1161: MatScalar *imat_a;
1162:
1163: for (i=0; i<ismax; i++) {
1164: mat = (Mat_SeqBAIJ*)submats[i]->data;
1165: imat_ilen = mat->ilen;
1166: imat_j = mat->j;
1167: imat_i = mat->i;
1168: imat_a = mat->a;
1170: #if defined (PETSC_USE_CTABLE)
1171: lcol1_gcol1 = colmaps[i];
1172: lrow1_grow1 = rowmaps[i];
1173: #else
1174: cmap_i = cmap[i];
1175: rmap_i = rmap[i];
1176: #endif
1177: irow_i = irow[i];
1178: jmax = nrow[i];
1179: for (j=0; j<jmax; j++) {
1180: row = irow_i[j];
1181: #if defined (PETSC_USE_CTABLE)
1182: PetscGetProc(row,size,c->rangebs,&proc);
1183: #else
1184: proc = rtable[row];
1185: #endif
1186: if (proc == rank) {
1187: row = row - rstart;
1188: nzA = a_i[row+1] - a_i[row];
1189: nzB = b_i[row+1] - b_i[row];
1190: cworkA = a_j + a_i[row];
1191: cworkB = b_j + b_i[row];
1192: vworkA = a_a + a_i[row]*bs2;
1193: vworkB = b_a + b_i[row]*bs2;
1194: #if defined (PETSC_USE_CTABLE)
1195: PetscTableFind(lrow1_grow1,row+rstart+1,&row);
1196: row--;
1197: if (row < 0) { SETERRQ(PETSC_ERR_PLIB,"row not found in table"); }
1198: #else
1199: row = rmap_i[row + rstart];
1200: #endif
1201: mat_i = imat_i[row];
1202: mat_a = imat_a + mat_i*bs2;
1203: mat_j = imat_j + mat_i;
1204: ilen_row = imat_ilen[row];
1206: /* load the column indices for this row into cols*/
1207: for (l=0; l<nzB; l++) {
1208: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1209: #if defined (PETSC_USE_CTABLE)
1210: PetscTableFind(lcol1_gcol1,ctmp+1,&tcol);
1211: if (tcol) {
1212: #else
1213: if ((tcol = cmap_i[ctmp])) {
1214: #endif
1215: *mat_j++ = tcol - 1;
1216: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1217: mat_a += bs2;
1218: ilen_row++;
1219: }
1220: } else break;
1221: }
1222: imark = l;
1223: for (l=0; l<nzA; l++) {
1224: #if defined (PETSC_USE_CTABLE)
1225: PetscTableFind(lcol1_gcol1,cstart+cworkA[l]+1,&tcol);
1226: if (tcol) {
1227: #else
1228: if ((tcol = cmap_i[cstart + cworkA[l]])) {
1229: #endif
1230: *mat_j++ = tcol - 1;
1231: PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));
1232: mat_a += bs2;
1233: ilen_row++;
1234: }
1235: }
1236: for (l=imark; l<nzB; l++) {
1237: #if defined (PETSC_USE_CTABLE)
1238: PetscTableFind(lcol1_gcol1,bmap[cworkB[l]]+1,&tcol);
1239: if (tcol) {
1240: #else
1241: if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1242: #endif
1243: *mat_j++ = tcol - 1;
1244: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1245: mat_a += bs2;
1246: ilen_row++;
1247: }
1248: }
1249: imat_ilen[row] = ilen_row;
1250: }
1251: }
1252:
1253: }
1254: }
1256: /* Now assemble the off proc rows*/
1257: {
1258: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1259: PetscInt *imat_j,*imat_i;
1260: MatScalar *imat_a,*rbuf4_i;
1261: PetscMPIInt ii;
1263: for (tmp2=0; tmp2<nrqs; tmp2++) {
1264: MPI_Waitany(nrqs,r_waits4,&ii,r_status4+tmp2);
1265: idex = pa[ii];
1266: sbuf1_i = sbuf1[idex];
1267: jmax = sbuf1_i[0];
1268: ct1 = 2*jmax + 1;
1269: ct2 = 0;
1270: rbuf2_i = rbuf2[ii];
1271: rbuf3_i = rbuf3[ii];
1272: rbuf4_i = rbuf4[ii];
1273: for (j=1; j<=jmax; j++) {
1274: is_no = sbuf1_i[2*j-1];
1275: #if defined (PETSC_USE_CTABLE)
1276: lrow1_grow1 = rowmaps[is_no];
1277: lcol1_gcol1 = colmaps[is_no];
1278: #else
1279: rmap_i = rmap[is_no];
1280: cmap_i = cmap[is_no];
1281: #endif
1282: mat = (Mat_SeqBAIJ*)submats[is_no]->data;
1283: imat_ilen = mat->ilen;
1284: imat_j = mat->j;
1285: imat_i = mat->i;
1286: imat_a = mat->a;
1287: max1 = sbuf1_i[2*j];
1288: for (k=0; k<max1; k++,ct1++) {
1289: row = sbuf1_i[ct1];
1290: #if defined (PETSC_USE_CTABLE)
1291: PetscTableFind(lrow1_grow1,row+1,&row);
1292: row--;
1293: if(row < 0) { SETERRQ(PETSC_ERR_PLIB,"row not found in table"); }
1294: #else
1295: row = rmap_i[row];
1296: #endif
1297: ilen = imat_ilen[row];
1298: mat_i = imat_i[row];
1299: mat_a = imat_a + mat_i*bs2;
1300: mat_j = imat_j + mat_i;
1301: max2 = rbuf2_i[ct1];
1302: for (l=0; l<max2; l++,ct2++) {
1303: #if defined (PETSC_USE_CTABLE)
1304: PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tcol);
1305: if (tcol) {
1306: #else
1307: if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1308: #endif
1309: *mat_j++ = tcol - 1;
1310: /* *mat_a++= rbuf4_i[ct2]; */
1311: PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));
1312: mat_a += bs2;
1313: ilen++;
1314: }
1315: }
1316: imat_ilen[row] = ilen;
1317: }
1318: }
1319: }
1320: }
1321: PetscFree(r_status4);
1322: PetscFree(r_waits4);
1323: if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
1324: PetscFree(s_waits4);
1325: PetscFree(s_status4);
1327: /* Restore the indices */
1328: for (i=0; i<ismax; i++) {
1329: ISRestoreIndices(isrow[i],irow+i);
1330: ISRestoreIndices(iscol[i],icol+i);
1331: }
1333: /* Destroy allocated memory */
1334: #if defined(PETSC_USE_CTABLE)
1335: PetscFree4(irow,icol,nrow,ncol);
1336: #else
1337: PetscFree5(irow,icol,nrow,ncol,rtable);
1338: #endif
1339: PetscFree4(w1,w2,w3,w4);
1340: PetscFree(pa);
1342: PetscFree4(sbuf1,ptr,tmp,ctr);
1344: PetscFree(rbuf2);
1345: for (i=0; i<nrqr; ++i) {
1346: PetscFree(sbuf2[i]);
1347: }
1348: for (i=0; i<nrqs; ++i) {
1349: PetscFree(rbuf3[i]);
1350: PetscFree(rbuf4[i]);
1351: }
1352: PetscFree3(sbuf2,req_size,req_source);
1353: PetscFree(rbuf3);
1354: PetscFree(rbuf4);
1355: PetscFree(sbuf_aj[0]);
1356: PetscFree(sbuf_aj);
1357: PetscFree(sbuf_aa[0]);
1358: PetscFree(sbuf_aa);
1360: #if defined (PETSC_USE_CTABLE)
1361: for (i=0; i<ismax; i++){
1362: PetscTableDestroy(rowmaps[i]);
1363: PetscTableDestroy(colmaps[i]);
1364: }
1365: PetscFree(colmaps);
1366: PetscFree(rowmaps);
1367: #else
1368: PetscFree(rmap);
1369: PetscFree(cmap[0]);
1370: PetscFree(cmap);
1371: #endif
1372: PetscFree(lens);
1374: for (i=0; i<ismax; i++) {
1375: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1376: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1377: }
1378: return(0);
1379: }