Actual source code: mpiov.c
2: #define PETSCMAT_DLL
4: /*
5: Routines to compute overlapping regions of a parallel MPI matrix
6: and to find submatrices that are shared across processors.
7: */
8: #include "../src/mat/impls/aij/mpi/mpiaij.h"
9: #include "petscbt.h"
11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
13: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
14: EXTERN PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15: EXTERN PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
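/*
   Illustrative usage sketch (not part of this file): the routines below are
   normally reached through the public MatIncreaseOverlap() and
   MatGetSubMatrices() interfaces.  Assuming an assembled MPIAIJ matrix A and
   an index set of nlocal locally owned rows stored in the array rows (both
   hypothetical), one level of overlap could be added and the corresponding
   sequential submatrix extracted roughly as follows:

     IS  is[1];
     Mat *submat;

     ISCreateGeneral(PETSC_COMM_SELF,nlocal,rows,&is[0]);
     MatIncreaseOverlap(A,1,is,1);
     MatGetSubMatrices(A,1,is,is,MAT_INITIAL_MATRIX,&submat);
*/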
19: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
20: {
22: PetscInt i;
25: if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26: for (i=0; i<ov; ++i) {
27: MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);
28: }
29: return(0);
30: }
32: /*
33: Sample message format:
34: If a processor A wants processor B to process some elements corresponding
35: to index sets is[1],is[5]
36: mesg [0] = 2 (no of index sets in the mesg)
37: -----------
38: mesg [1] = 1 => is[1]
39: mesg [2] = sizeof(is[1]);
40: -----------
41: mesg [3] = 5 => is[5]
42: mesg [4] = sizeof(is[5]);
43: -----------
44: mesg [5]
45: mesg [n]   data(is[1])
46: -----------
47: mesg [n+1]
48: mesg [m]   data(is[5])
49: -----------
50:
51: Notes:
52: nrqs - no of requests sent (or to be sent out)
53: nrqr - no of requests received (which have to be, or have been, processed)
54: */
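/*
   Worked example of the format above (values made up for illustration): if
   processor A requests 3 rows for is[1] and 2 rows for is[5], the packed
   message could be

     PetscInt mesg[10] = {2, 1,3, 5,2, 10,11,12, 40,41};

   i.e. the number of index sets, one (index-set number, length) pair per set,
   then the requested row indices of is[1] followed by those of is[5].
*/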
57: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
58: {
59: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
60: PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
61: const PetscInt **idx,*idx_i;
62: PetscInt *n,*rtable,**data,len;
64: PetscMPIInt size,rank,tag1,tag2;
65: PetscInt m,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
66: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
67: PetscBT *table;
68: MPI_Comm comm;
69: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
70: MPI_Status *s_status,*recv_status;
73: comm = ((PetscObject)C)->comm;
74: size = c->size;
75: rank = c->rank;
76: m = C->rmap->N;
78: PetscObjectGetNewTag((PetscObject)C,&tag1);
79: PetscObjectGetNewTag((PetscObject)C,&tag2);
80:
81: PetscMalloc3(imax,PetscInt*,&idx,imax,PetscInt,&n,m,PetscInt,&rtable);
82:
83: for (i=0; i<imax; i++) {
84: ISGetIndices(is[i],&idx[i]);
85: ISGetLocalSize(is[i],&n[i]);
86: }
87:
88: /* Create hash table for the mapping: row -> proc */
89: for (i=0,j=0; i<size; i++) {
90: len = C->rmap->range[i+1];
91: for (; j<len; j++) {
92: rtable[j] = i;
93: }
94: }
96: /* Evaluate communication - message to whom, length of message, and buffer space
97: required. Based on this, buffers are allocated, and data copied into them */
98: PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);
99: PetscMemzero(w1,size*sizeof(PetscMPIInt)); /* initialise work vector*/
100: PetscMemzero(w2,size*sizeof(PetscMPIInt)); /* initialise work vector*/
101: PetscMemzero(w3,size*sizeof(PetscMPIInt)); /* initialise work vector*/
102: for (i=0; i<imax; i++) {
103: PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialise work vector*/
104: idx_i = idx[i];
105: len = n[i];
106: for (j=0; j<len; j++) {
107: row = idx_i[j];
108: if (row < 0) {
109: SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
110: }
111: proc = rtable[row];
112: w4[proc]++;
113: }
114: for (j=0; j<size; j++){
115: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
116: }
117: }
119: nrqs = 0; /* no of outgoing messages */
120: msz = 0; /* total mesg length (for all procs) */
121: w1[rank] = 0; /* no mesg sent to itself */
122: w3[rank] = 0;
123: for (i=0; i<size; i++) {
124: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
125: }
126: /* pa - is list of processors to communicate with */
127: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);
128: for (i=0,j=0; i<size; i++) {
129: if (w1[i]) {pa[j] = i; j++;}
130: }
132: /* Each message would have a header = 1 + 2*(no of IS) + data */
133: for (i=0; i<nrqs; i++) {
134: j = pa[i];
135: w1[j] += w2[j] + 2*w3[j];
136: msz += w1[j];
137: }
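  /* Illustrative arithmetic (made-up numbers): if two index sets send 4 and 3 rows
     to proc j, then w1[j] = 7 data entries + 1 + 2*2 header entries = 12 */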
139: /* Determine the number of messages to expect, their lengths, and their from-ids */
140: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
141: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
143: /* Now post the Irecvs corresponding to these messages */
144: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
146: /* Allocate Memory for outgoing messages */
147: PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);
148: PetscMemzero(outdat,size*sizeof(PetscInt*));
149: PetscMemzero(ptr,size*sizeof(PetscInt*));
151: {
152: PetscInt *iptr = tmp,ict = 0;
153: for (i=0; i<nrqs; i++) {
154: j = pa[i];
155: iptr += ict;
156: outdat[j] = iptr;
157: ict = w1[j];
158: }
159: }
161: /* Form the outgoing messages */
162: /*plug in the headers*/
163: for (i=0; i<nrqs; i++) {
164: j = pa[i];
165: outdat[j][0] = 0;
166: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
167: ptr[j] = outdat[j] + 2*w3[j] + 1;
168: }
169:
170: /* Memory for doing local proc's work*/
171: {
172: PetscInt *d_p;
173: char *t_p;
175: /* should replace with PetscMallocN() */
176: PetscMalloc((imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
177: (m)*imax*sizeof(PetscInt) + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1,&table);
178: PetscMemzero(table,(imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
179: (m)*imax*sizeof(PetscInt) + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1);
180: data = (PetscInt **)(table + imax);
181: isz = (PetscInt *)(data + imax);
182: d_p = (PetscInt *)(isz + imax);
183: t_p = (char *)(d_p + m*imax);
184: for (i=0; i<imax; i++) {
185: table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;
186: data[i] = d_p + (m)*i;
187: }
188: }
190: /* Parse the IS and update local tables and the outgoing buf with the data*/
191: {
192: PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
193: PetscBT table_i;
195: for (i=0; i<imax; i++) {
196: PetscMemzero(ctr,size*sizeof(PetscInt));
197: n_i = n[i];
198: table_i = table[i];
199: idx_i = idx[i];
200: data_i = data[i];
201: isz_i = isz[i];
202: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
203: row = idx_i[j];
204: proc = rtable[row];
205: if (proc != rank) { /* copy to the outgoing buffer */
206: ctr[proc]++;
207: *ptr[proc] = row;
208: ptr[proc]++;
209: } else { /* Update the local table */
210: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
211: }
212: }
213: /* Update the headers for the current IS */
214: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
215: if ((ctr_j = ctr[j])) {
216: outdat_j = outdat[j];
217: k = ++outdat_j[0];
218: outdat_j[2*k] = ctr_j;
219: outdat_j[2*k-1] = i;
220: }
221: }
222: isz[i] = isz_i;
223: }
224: }
226: /* Now post the sends */
227: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
228: for (i=0; i<nrqs; ++i) {
229: j = pa[i];
230: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
231: }
232:
233: /* No longer need the original indices*/
234: for (i=0; i<imax; ++i) {
235: ISRestoreIndices(is[i],idx+i);
236: }
237: PetscFree3(idx,n,rtable);
239: for (i=0; i<imax; ++i) {
240: ISDestroy(is[i]);
241: }
242:
243: /* Do Local work*/
244: MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);
246: /* Receive messages*/
247: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);
248: if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}
249:
250: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);
251: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}
253: /* Phase 1 sends are complete - deallocate buffers */
254: PetscFree4(outdat,ptr,tmp,ctr);
255: PetscFree4(w1,w2,w3,w4);
257: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);
258: PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);
259: MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
260: PetscFree(rbuf[0]);
261: PetscFree(rbuf);
263:
264: /* Send the data back*/
265: /* Do a global reduction to know the buffer space req for incoming messages*/
266: {
267: PetscMPIInt *rw1;
268:
269: PetscMalloc(size*sizeof(PetscMPIInt),&rw1);
270: PetscMemzero(rw1,size*sizeof(PetscMPIInt));
272: for (i=0; i<nrqr; ++i) {
273: proc = recv_status[i].MPI_SOURCE;
274: if (proc != onodes1[i]) SETERRQ(PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
275: rw1[proc] = isz1[i];
276: }
277: PetscFree(onodes1);
278: PetscFree(olengths1);
280: /* Determine the number of messages to expect, their lengths, and their from-ids */
281: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
282: PetscFree(rw1);
283: }
284: /* Now post the Irecvs corresponding to these messages */
285: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
287: /* Now post the sends */
288: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
289: for (i=0; i<nrqr; ++i) {
290: j = recv_status[i].MPI_SOURCE;
291: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
292: }
294: /* receive work done on other processors*/
295: {
296: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
297: PetscMPIInt idex;
298: PetscBT table_i;
299: MPI_Status *status2;
300:
301: PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);
302: for (i=0; i<nrqs; ++i) {
303: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
304: /* Process the message*/
305: rbuf2_i = rbuf2[idex];
306: ct1 = 2*rbuf2_i[0]+1;
307: jmax = rbuf2[idex][0];
308: for (j=1; j<=jmax; j++) {
309: max = rbuf2_i[2*j];
310: is_no = rbuf2_i[2*j-1];
311: isz_i = isz[is_no];
312: data_i = data[is_no];
313: table_i = table[is_no];
314: for (k=0; k<max; k++,ct1++) {
315: row = rbuf2_i[ct1];
316: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
317: }
318: isz[is_no] = isz_i;
319: }
320: }
322: if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
323: PetscFree(status2);
324: }
325:
326: for (i=0; i<imax; ++i) {
327: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);
328: }
329:
330: PetscFree(onodes2);
331: PetscFree(olengths2);
333: PetscFree(pa);
334: PetscFree(rbuf2[0]);
335: PetscFree(rbuf2);
336: PetscFree(s_waits1);
337: PetscFree(r_waits1);
338: PetscFree(s_waits2);
339: PetscFree(r_waits2);
340: PetscFree(table);
341: PetscFree(s_status);
342: PetscFree(recv_status);
343: PetscFree(xdata[0]);
344: PetscFree(xdata);
345: PetscFree(isz1);
346: return(0);
347: }
351: /*
352: MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
353: the work on the local processor.
355: Inputs:
356: C - MAT_MPIAIJ;
357: imax - total no of index sets processed at a time;
358: table - an array of char - size = m bits.
359:
360: Output:
361: isz - array containing the count of the solution elements corresponding
362: to each index set;
363: data - pointer to the solutions
364: */
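/*
   A minimal sketch (hypothetical names, not part of this file) of the PetscBT
   accumulation pattern this routine relies on: the bit table records which
   rows have already been seen, so each row enters the output list at most once.

     PetscBT  tbl;
     PetscInt out[16],nout = 0,rows[4] = {3,7,3,5},i;

     PetscBTCreate(16,tbl);
     for (i=0; i<4; i++) {
       if (!PetscBTLookupSet(tbl,rows[i])) out[nout++] = rows[i];
     }
     PetscBTDestroy(tbl);
*/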
365: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
366: {
367: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
368: Mat A = c->A,B = c->B;
369: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
370: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
371: PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
372: PetscBT table_i;
375: rstart = C->rmap->rstart;
376: cstart = C->cmap->rstart;
377: ai = a->i;
378: aj = a->j;
379: bi = b->i;
380: bj = b->j;
381: garray = c->garray;
383:
384: for (i=0; i<imax; i++) {
385: data_i = data[i];
386: table_i = table[i];
387: isz_i = isz[i];
388: for (j=0,max=isz[i]; j<max; j++) {
389: row = data_i[j] - rstart;
390: start = ai[row];
391: end = ai[row+1];
392: for (k=start; k<end; k++) { /* Amat */
393: val = aj[k] + cstart;
394: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
395: }
396: start = bi[row];
397: end = bi[row+1];
398: for (k=start; k<end; k++) { /* Bmat */
399: val = garray[bj[k]];
400: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
401: }
402: }
403: isz[i] = isz_i;
404: }
405: return(0);
406: }
410: /*
411: MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
412: and return the output
414: Input:
415: C - the matrix
416: nrqr - no of messages being processed.
417: rbuf - an array of pointers to the received requests
418:
419: Output:
420: xdata - array of messages to be sent back
421: isz1 - size of each message
423: For better efficiency perhaps we should malloc each xdata[i] separately;
424: then, if a remalloc is required, we need only copy the data for that one row
425: rather than all previous rows, as is done now with a single large chunk of
426: memory.
428: */
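/*
   Illustrative example of the reply layout built below (row values and counts
   made up): the reply stored in xdata[i] mirrors the request header, e.g. for
   a request that named is[1] and is[5]

     xdata[i] = { 2,  1,n1,  5,n5,  <n1 rows for is[1]>,  <n5 rows for is[5]> }

   where n1 and n5 are the counts written into positions 2*j of the header
   after each candidate row has been expanded with its neighbouring columns.
*/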
429: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
430: {
431: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
432: Mat A = c->A,B = c->B;
433: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
435: PetscMPIInt rank;
436: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
437: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
438: PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
439: PetscInt *rbuf_i,kmax,rbuf_0;
440: PetscBT xtable;
443: rank = c->rank;
444: m = C->rmap->N;
445: rstart = C->rmap->rstart;
446: cstart = C->cmap->rstart;
447: ai = a->i;
448: aj = a->j;
449: bi = b->i;
450: bj = b->j;
451: garray = c->garray;
452:
453:
454: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
455: rbuf_i = rbuf[i];
456: rbuf_0 = rbuf_i[0];
457: ct += rbuf_0;
458: for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
459: }
460:
461: if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
462: else max1 = 1;
463: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
464: PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);
465: ++no_malloc;
466: PetscBTCreate(m,xtable);
467: PetscMemzero(isz1,nrqr*sizeof(PetscInt));
468:
469: ct3 = 0;
470: for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
471: rbuf_i = rbuf[i];
472: rbuf_0 = rbuf_i[0];
473: ct1 = 2*rbuf_0+1;
474: ct2 = ct1;
475: ct3 += ct1;
476: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
477: PetscBTMemzero(m,xtable);
478: oct2 = ct2;
479: kmax = rbuf_i[2*j];
480: for (k=0; k<kmax; k++,ct1++) {
481: row = rbuf_i[ct1];
482: if (!PetscBTLookupSet(xtable,row)) {
483: if (!(ct3 < mem_estimate)) {
484: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
485: PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);
486: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
487: PetscFree(xdata[0]);
488: xdata[0] = tmp;
489: mem_estimate = new_estimate; ++no_malloc;
490: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
491: }
492: xdata[i][ct2++] = row;
493: ct3++;
494: }
495: }
496: for (k=oct2,max2=ct2; k<max2; k++) {
497: row = xdata[i][k] - rstart;
498: start = ai[row];
499: end = ai[row+1];
500: for (l=start; l<end; l++) {
501: val = aj[l] + cstart;
502: if (!PetscBTLookupSet(xtable,val)) {
503: if (!(ct3 < mem_estimate)) {
504: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
505: PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);
506: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
507: PetscFree(xdata[0]);
508: xdata[0] = tmp;
509: mem_estimate = new_estimate; ++no_malloc;
510: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
511: }
512: xdata[i][ct2++] = val;
513: ct3++;
514: }
515: }
516: start = bi[row];
517: end = bi[row+1];
518: for (l=start; l<end; l++) {
519: val = garray[bj[l]];
520: if (!PetscBTLookupSet(xtable,val)) {
521: if (!(ct3 < mem_estimate)) {
522: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
523: PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);
524: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
525: PetscFree(xdata[0]);
526: xdata[0] = tmp;
527: mem_estimate = new_estimate; ++no_malloc;
528: for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
529: }
530: xdata[i][ct2++] = val;
531: ct3++;
532: }
533: }
534: }
535: /* Update the header*/
536: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
537: xdata[i][2*j-1] = rbuf_i[2*j-1];
538: }
539: xdata[i][0] = rbuf_0;
540: xdata[i+1] = xdata[i] + ct2;
541: isz1[i] = ct2; /* size of each message */
542: }
543: PetscBTDestroy(xtable);
544: PetscInfo4(C,"[%d] Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);
545: return(0);
546: }
547: /* -------------------------------------------------------------------------*/
548: EXTERN PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
549: EXTERN PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
550: /*
551: Every processor gets the entire matrix
552: */
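/*
   A minimal sketch (hypothetical names local, global, range, comm) of the
   MPI_Allgatherv pattern used below: each process contributes its locally
   owned slice of an array of global length N, with counts and displacements
   taken from the ownership ranges.

     PetscMPIInt *recvcounts,*displs;
     PetscInt    i;

     PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);
     for (i=0; i<size; i++) {
       recvcounts[i] = range[i+1] - range[i];
       displs[i]     = range[i];
     }
     MPI_Allgatherv(local,recvcounts[rank],MPIU_INT,global,recvcounts,displs,MPIU_INT,comm);
*/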
555: PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
556: {
557: Mat B;
558: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
559: Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
561: PetscMPIInt size,rank,*recvcounts = 0,*displs = 0;
562: PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
563: PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
564: MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
567: MPI_Comm_size(((PetscObject)A)->comm,&size);
568: MPI_Comm_rank(((PetscObject)A)->comm,&rank);
570: if (scall == MAT_INITIAL_MATRIX) {
571: /* ----------------------------------------------------------------
572: Tell every processor the number of nonzeros per row
573: */
574: PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);
575: for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
576: lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
577: }
578: sendcount = A->rmap->rend - A->rmap->rstart;
579: PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);
580: for (i=0; i<size; i++) {
581: recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
582: displs[i] = A->rmap->range[i];
583: }
584: #if defined(PETSC_HAVE_MPI_IN_PLACE)
585: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);
586: #else
587: MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);
588: #endif
589: /* ---------------------------------------------------------------
590: Create the sequential matrix of the same type as the local block diagonal
591: */
592: MatCreate(PETSC_COMM_SELF,&B);
593: MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);
594: MatSetType(B,((PetscObject)a->A)->type_name);
595: MatSeqAIJSetPreallocation(B,0,lens);
596: PetscMalloc(sizeof(Mat),Bin);
597: **Bin = B;
598: b = (Mat_SeqAIJ *)B->data;
600: /*--------------------------------------------------------------------
601: Copy my part of matrix column indices over
602: */
603: sendcount = ad->nz + bd->nz;
604: jsendbuf = b->j + b->i[rstarts[rank]];
605: a_jsendbuf = ad->j;
606: b_jsendbuf = bd->j;
607: n = A->rmap->rend - A->rmap->rstart;
608: cnt = 0;
609: for (i=0; i<n; i++) {
611: /* put in lower diagonal portion */
612: m = bd->i[i+1] - bd->i[i];
613: while (m > 0) {
614: /* is it above diagonal (in bd (compressed) numbering) */
615: if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
616: jsendbuf[cnt++] = garray[*b_jsendbuf++];
617: m--;
618: }
620: /* put in diagonal portion */
621: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
622: jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
623: }
625: /* put in upper diagonal portion */
626: while (m-- > 0) {
627: jsendbuf[cnt++] = garray[*b_jsendbuf++];
628: }
629: }
630: if (cnt != sendcount) SETERRQ2(PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
632: /*--------------------------------------------------------------------
633: Gather all column indices to all processors
634: */
635: for (i=0; i<size; i++) {
636: recvcounts[i] = 0;
637: for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
638: recvcounts[i] += lens[j];
639: }
640: }
641: displs[0] = 0;
642: for (i=1; i<size; i++) {
643: displs[i] = displs[i-1] + recvcounts[i-1];
644: }
645: #if defined(PETSC_HAVE_MPI_IN_PLACE)
646: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);
647: #else
648: MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);
649: #endif
650: /*--------------------------------------------------------------------
651: Assemble the matrix into usable form (note numerical values not yet set)
652: */
653: /* set the b->ilen (length of each row) values */
654: PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));
655: /* set the b->i indices */
656: b->i[0] = 0;
657: for (i=1; i<=A->rmap->N; i++) {
658: b->i[i] = b->i[i-1] + lens[i-1];
659: }
660: PetscFree(lens);
661: MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
662: MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);
664: } else {
665: B = **Bin;
666: b = (Mat_SeqAIJ *)B->data;
667: }
669: /*--------------------------------------------------------------------
670: Copy my part of matrix numerical values into the values location
671: */
672: if (flag == MAT_GET_VALUES){
673: sendcount = ad->nz + bd->nz;
674: sendbuf = b->a + b->i[rstarts[rank]];
675: a_sendbuf = ad->a;
676: b_sendbuf = bd->a;
677: b_sendj = bd->j;
678: n = A->rmap->rend - A->rmap->rstart;
679: cnt = 0;
680: for (i=0; i<n; i++) {
682: /* put in lower diagonal portion */
683: m = bd->i[i+1] - bd->i[i];
684: while (m > 0) {
685: /* is it above diagonal (in bd (compressed) numbering) */
686: if (garray[*b_sendj] > A->rmap->rstart + i) break;
687: sendbuf[cnt++] = *b_sendbuf++;
688: m--;
689: b_sendj++;
690: }
692: /* put in diagonal portion */
693: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
694: sendbuf[cnt++] = *a_sendbuf++;
695: }
697: /* put in upper diagonal portion */
698: while (m-- > 0) {
699: sendbuf[cnt++] = *b_sendbuf++;
700: b_sendj++;
701: }
702: }
703: if (cnt != sendcount) SETERRQ2(PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
704:
705: /* -----------------------------------------------------------------
706: Gather all numerical values to all processors
707: */
708: if (!recvcounts) {
709: PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);
710: }
711: for (i=0; i<size; i++) {
712: recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
713: }
714: displs[0] = 0;
715: for (i=1; i<size; i++) {
716: displs[i] = displs[i-1] + recvcounts[i-1];
717: }
718: recvbuf = b->a;
719: #if defined(PETSC_HAVE_MPI_IN_PLACE)
720: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);
721: #else
722: MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);
723: #endif
724: } /* end of (flag == MAT_GET_VALUES) */
725: PetscFree2(recvcounts,displs);
727: if (A->symmetric){
728: MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);
729: } else if (A->hermitian) {
730: MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);
731: } else if (A->structurally_symmetric) {
732: MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);
733: }
734: return(0);
735: }
739: PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
740: {
742: PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
743: PetscTruth rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
746: /*
747: Check for the special case where each processor gets the entire matrix
748: */
749: if (ismax == 1 && C->rmap->N == C->cmap->N) {
750: ISIdentity(*isrow,&rowflag);
751: ISIdentity(*iscol,&colflag);
752: ISGetLocalSize(*isrow,&nrow);
753: ISGetLocalSize(*iscol,&ncol);
754: if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
755: wantallmatrix = PETSC_TRUE;
756: PetscOptionsGetTruth(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);
757: }
758: }
759: MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);
760: if (twantallmatrix) {
761: MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);
762: return(0);
763: }
765: /* Allocate memory to hold all the submatrices */
766: if (scall != MAT_REUSE_MATRIX) {
767: PetscMalloc((ismax+1)*sizeof(Mat),submat);
768: }
769: /* Determine the number of stages through which submatrices are done */
770: nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
771: if (!nmax) nmax = 1;
772: nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
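  /* Illustrative arithmetic (made-up sizes, assuming 4-byte PetscInt): with
     C->cmap->N = 1,000,000 columns, nmax = 20,000,000/4,000,000 = 5, so
     ismax = 12 requested submatrices are processed in 3 stages */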
774: /* Make sure every processor loops through the nstages */
775: MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);
777: for (i=0,pos=0; i<nstages; i++) {
778: if (pos+nmax <= ismax) max_no = nmax;
779: else if (pos == ismax) max_no = 0;
780: else max_no = ismax-pos;
781: MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);
782: pos += max_no;
783: }
784: return(0);
785: }
787: /* -------------------------------------------------------------------------*/
790: PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
791: {
792: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
793: Mat A = c->A;
794: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
795: const PetscInt **icol,**irow;
796: PetscInt *nrow,*ncol,start;
798: PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
799: PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
800: PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
801: PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
802: PetscInt **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
803: const PetscInt *irow_i;
804: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
805: PetscInt *rmap_i;
806: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
807: MPI_Request *r_waits4,*s_waits3,*s_waits4;
808: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
809: MPI_Status *r_status3,*r_status4,*s_status4;
810: MPI_Comm comm;
811: PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
812: PetscTruth sorted;
813: PetscMPIInt *onodes1,*olengths1;
814: PetscMPIInt idex,idex2,end;
817: comm = ((PetscObject)C)->comm;
818: tag0 = ((PetscObject)C)->tag;
819: size = c->size;
820: rank = c->rank;
821:
822: /* Get some new tags to keep the communication clean */
823: PetscObjectGetNewTag((PetscObject)C,&tag1);
824: PetscObjectGetNewTag((PetscObject)C,&tag2);
825: PetscObjectGetNewTag((PetscObject)C,&tag3);
827: /* Check if the col indices are sorted */
828: for (i=0; i<ismax; i++) {
829: ISSorted(isrow[i],&sorted);
830: /*if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"isrow is not sorted");*/
831: ISSorted(iscol[i],&sorted);
832: /* if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"iscol is not sorted"); */
833: }
834: PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);
836: for (i=0; i<ismax; i++) {
837: ISGetIndices(isrow[i],&irow[i]);
838: ISGetIndices(iscol[i],&icol[i]);
839: ISGetLocalSize(isrow[i],&nrow[i]);
840: ISGetLocalSize(iscol[i],&ncol[i]);
841: }
843: /* Evaluate communication - message to whom, length of message, and buffer space
844: required. Based on this, buffers are allocated, and data copied into them*/
845: PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4); /* mesg size */
846: PetscMemzero(w1,size*sizeof(PetscMPIInt)); /* initialize work vector*/
847: PetscMemzero(w2,size*sizeof(PetscMPIInt)); /* initialize work vector*/
848: PetscMemzero(w3,size*sizeof(PetscMPIInt)); /* initialize work vector*/
849: for (i=0; i<ismax; i++) {
850: PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialize work vector*/
851: jmax = nrow[i];
852: irow_i = irow[i];
853: for (j=0; j<jmax; j++) {
854: l = 0;
855: row = irow_i[j];
856: while (row >= C->rmap->range[l+1]) l++;
857: proc = l;
858: w4[proc]++;
859: }
860: for (j=0; j<size; j++) {
861: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
862: }
863: }
864:
865: nrqs = 0; /* no of outgoing messages */
866: msz = 0; /* total mesg length (for all procs) */
867: w1[rank] = 0; /* no mesg sent to self */
868: w3[rank] = 0;
869: for (i=0; i<size; i++) {
870: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
871: }
872: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa); /*(proc -array)*/
873: for (i=0,j=0; i<size; i++) {
874: if (w1[i]) { pa[j] = i; j++; }
875: }
877: /* Each message would have a header = 1 + 2*(no of IS) + data */
878: for (i=0; i<nrqs; i++) {
879: j = pa[i];
880: w1[j] += w2[j] + 2* w3[j];
881: msz += w1[j];
882: }
884: /* Determine the number of messages to expect, their lengths, and their from-ids */
885: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
886: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
888: /* Now post the Irecvs corresponding to these messages */
889: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
890:
891: PetscFree(onodes1);
892: PetscFree(olengths1);
893:
894: /* Allocate Memory for outgoing messages */
895: PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);
896: PetscMemzero(sbuf1,size*sizeof(PetscInt*));
897: PetscMemzero(ptr,size*sizeof(PetscInt*));
899: {
900: PetscInt *iptr = tmp,ict = 0;
901: for (i=0; i<nrqs; i++) {
902: j = pa[i];
903: iptr += ict;
904: sbuf1[j] = iptr;
905: ict = w1[j];
906: }
907: }
909: /* Form the outgoing messages */
910: /* Initialize the header space */
911: for (i=0; i<nrqs; i++) {
912: j = pa[i];
913: sbuf1[j][0] = 0;
914: PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
915: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
916: }
917:
918: /* Parse the isrow and copy data into outbuf */
919: for (i=0; i<ismax; i++) {
920: PetscMemzero(ctr,size*sizeof(PetscInt));
921: irow_i = irow[i];
922: jmax = nrow[i];
923: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
924: l = 0;
925: row = irow_i[j];
926: while (row >= C->rmap->range[l+1]) l++;
927: proc = l;
928: if (proc != rank) { /* copy to the outgoing buf*/
929: ctr[proc]++;
930: *ptr[proc] = row;
931: ptr[proc]++;
932: }
933: }
934: /* Update the headers for the current IS */
935: for (j=0; j<size; j++) { /* Can Optimise this loop too */
936: if ((ctr_j = ctr[j])) {
937: sbuf1_j = sbuf1[j];
938: k = ++sbuf1_j[0];
939: sbuf1_j[2*k] = ctr_j;
940: sbuf1_j[2*k-1] = i;
941: }
942: }
943: }
945: /* Now post the sends */
946: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
947: for (i=0; i<nrqs; ++i) {
948: j = pa[i];
949: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
950: }
952: /* Post Receives to capture the buffer size */
953: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);
954: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);
955: rbuf2[0] = tmp + msz;
956: for (i=1; i<nrqs; ++i) {
957: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
958: }
959: for (i=0; i<nrqs; ++i) {
960: j = pa[i];
961: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);
962: }
964: /* Send to other procs the buf size they should allocate */
965:
967: /* Receive messages*/
968: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
969: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);
970: PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);
971: {
972: Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
973: PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
974: PetscInt *sbuf2_i;
976: for (i=0; i<nrqr; ++i) {
977: MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);
978: req_size[idex] = 0;
979: rbuf1_i = rbuf1[idex];
980: start = 2*rbuf1_i[0] + 1;
981: MPI_Get_count(r_status1+i,MPIU_INT,&end);
982: PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);
983: sbuf2_i = sbuf2[idex];
984: for (j=start; j<end; j++) {
985: id = rbuf1_i[j] - rstart;
986: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
987: sbuf2_i[j] = ncols;
988: req_size[idex] += ncols;
989: }
990: req_source[idex] = r_status1[i].MPI_SOURCE;
991: /* form the header */
992: sbuf2_i[0] = req_size[idex];
993: for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
994: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);
995: }
996: }
997: PetscFree(r_status1);
998: PetscFree(r_waits1);
1000: /* recv buffer sizes */
1001: /* Receive messages*/
1002:
1003: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);
1004: PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);
1005: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);
1006: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);
1007: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);
1009: for (i=0; i<nrqs; ++i) {
1010: MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);
1011: PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);
1012: PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);
1013: MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);
1014: MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);
1015: }
1016: PetscFree(r_status2);
1017: PetscFree(r_waits2);
1018:
1019: /* Wait on sends1 and sends2 */
1020: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);
1021: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);
1023: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
1024: if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
1025: PetscFree(s_status1);
1026: PetscFree(s_status2);
1027: PetscFree(s_waits1);
1028: PetscFree(s_waits2);
1030: /* Now allocate buffers for a->j, and send them off */
1031: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);
1032: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1033: PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);
1034: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1035:
1036: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);
1037: {
1038: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
1039: PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1040: PetscInt cend = C->cmap->rend;
1041: PetscInt *a_j = a->j,*b_j = b->j,ctmp;
1043: for (i=0; i<nrqr; i++) {
1044: rbuf1_i = rbuf1[i];
1045: sbuf_aj_i = sbuf_aj[i];
1046: ct1 = 2*rbuf1_i[0] + 1;
1047: ct2 = 0;
1048: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1049: kmax = rbuf1[i][2*j];
1050: for (k=0; k<kmax; k++,ct1++) {
1051: row = rbuf1_i[ct1] - rstart;
1052: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1053: ncols = nzA + nzB;
1054: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1056: /* load the column indices for this row into cols*/
1057: cols = sbuf_aj_i + ct2;
1058:
1059: lwrite = 0;
1060: for (l=0; l<nzB; l++) {
1061: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1062: }
1063: for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1064: for (l=0; l<nzB; l++) {
1065: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1066: }
1068: ct2 += ncols;
1069: }
1070: }
1071: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);
1072: }
1073: }
1074: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);
1075: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);
1077: /* Allocate buffers for a->a, and send them off */
1078: PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);
1079: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1080: PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);
1081: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1082:
1083: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);
1084: {
1085: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
1086: PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1087: PetscInt cend = C->cmap->rend;
1088: PetscInt *b_j = b->j;
1089: PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1090:
1091: for (i=0; i<nrqr; i++) {
1092: rbuf1_i = rbuf1[i];
1093: sbuf_aa_i = sbuf_aa[i];
1094: ct1 = 2*rbuf1_i[0]+1;
1095: ct2 = 0;
1096: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1097: kmax = rbuf1_i[2*j];
1098: for (k=0; k<kmax; k++,ct1++) {
1099: row = rbuf1_i[ct1] - rstart;
1100: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1101: ncols = nzA + nzB;
1102: cworkB = b_j + b_i[row];
1103: vworkA = a_a + a_i[row];
1104: vworkB = b_a + b_i[row];
1106: /* load the column values for this row into vals*/
1107: vals = sbuf_aa_i+ct2;
1108:
1109: lwrite = 0;
1110: for (l=0; l<nzB; l++) {
1111: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1112: }
1113: for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1114: for (l=0; l<nzB; l++) {
1115: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1116: }
1117:
1118: ct2 += ncols;
1119: }
1120: }
1121: MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);
1122: }
1123: }
1124: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);
1125: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);
1126: PetscFree(rbuf1[0]);
1127: PetscFree(rbuf1);
1129: /* Form the matrix */
1130: /* create col map */
1131: {
1132: const PetscInt *icol_i;
1133:
1134: PetscMalloc(ismax*sizeof(PetscInt*),&cmap);
1135: PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);
1136: PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));
1137: for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; }
1138: for (i=0; i<ismax; i++) {
1139: jmax = ncol[i];
1140: icol_i = icol[i];
1141: cmap_i = cmap[i];
1142: for (j=0; j<jmax; j++) {
1143: cmap_i[icol_i[j]] = j+1;
1144: }
1145: }
1146: }
1148: /* Create lens which is required for MatCreate... */
1149: for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1150: PetscMalloc(ismax*sizeof(PetscInt*),&lens);
1151: PetscMalloc(j*sizeof(PetscInt),&lens[0]);
1152: PetscMemzero(lens[0],j*sizeof(PetscInt));
1153: for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1154:
1155: /* Update lens from local data */
1156: for (i=0; i<ismax; i++) {
1157: jmax = nrow[i];
1158: cmap_i = cmap[i];
1159: irow_i = irow[i];
1160: lens_i = lens[i];
1161: for (j=0; j<jmax; j++) {
1162: l = 0;
1163: row = irow_i[j];
1164: while (row >= C->rmap->range[l+1]) l++;
1165: proc = l;
1166: if (proc == rank) {
1167: MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);
1168: for (k=0; k<ncols; k++) {
1169: if (cmap_i[cols[k]]) { lens_i[j]++;}
1170: }
1171: MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);
1172: }
1173: }
1174: }
1175:
1176: /* Create row map*/
1177: PetscMalloc(ismax*sizeof(PetscInt*),&rmap);
1178: PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);
1179: PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));
1180: for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
1181: for (i=0; i<ismax; i++) {
1182: rmap_i = rmap[i];
1183: irow_i = irow[i];
1184: jmax = nrow[i];
1185: for (j=0; j<jmax; j++) {
1186: rmap_i[irow_i[j]] = j;
1187: }
1188: }
1189:
1190: /* Update lens from offproc data */
1191: {
1192: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1194: for (tmp2=0; tmp2<nrqs; tmp2++) {
1195: MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);
1196: idex = pa[idex2];
1197: sbuf1_i = sbuf1[idex];
1198: jmax = sbuf1_i[0];
1199: ct1 = 2*jmax+1;
1200: ct2 = 0;
1201: rbuf2_i = rbuf2[idex2];
1202: rbuf3_i = rbuf3[idex2];
1203: for (j=1; j<=jmax; j++) {
1204: is_no = sbuf1_i[2*j-1];
1205: max1 = sbuf1_i[2*j];
1206: lens_i = lens[is_no];
1207: cmap_i = cmap[is_no];
1208: rmap_i = rmap[is_no];
1209: for (k=0; k<max1; k++,ct1++) {
1210: row = rmap_i[sbuf1_i[ct1]]; /* the row in the new submatrix */
1211: max2 = rbuf2_i[ct1];
1212: for (l=0; l<max2; l++,ct2++) {
1213: if (cmap_i[rbuf3_i[ct2]]) {
1214: lens_i[row]++;
1215: }
1216: }
1217: }
1218: }
1219: }
1220: }
1221: PetscFree(r_status3);
1222: PetscFree(r_waits3);
1223: if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
1224: PetscFree(s_status3);
1225: PetscFree(s_waits3);
1227: /* Create the submatrices */
1228: if (scall == MAT_REUSE_MATRIX) {
1229: PetscTruth flag;
1231: /*
1232: Assumes new rows are the same length as the old rows, hence bug!
1233: */
1234: for (i=0; i<ismax; i++) {
1235: mat = (Mat_SeqAIJ *)(submats[i]->data);
1236: if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) {
1237: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1238: }
1239: PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);
1240: if (!flag) {
1241: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1242: }
1243: /* Initialize matrix as if empty */
1244: PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));
1245: submats[i]->factor = C->factor;
1246: }
1247: } else {
1248: for (i=0; i<ismax; i++) {
1249: MatCreate(PETSC_COMM_SELF,submats+i);
1250: MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);
1251: MatSetType(submats[i],((PetscObject)A)->type_name);
1252: MatSeqAIJSetPreallocation(submats[i],0,lens[i]);
1253: }
1254: }
1256: /* Assemble the matrices */
1257: /* First assemble the local rows */
1258: {
1259: PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1260: PetscScalar *imat_a;
1261:
1262: for (i=0; i<ismax; i++) {
1263: mat = (Mat_SeqAIJ*)submats[i]->data;
1264: imat_ilen = mat->ilen;
1265: imat_j = mat->j;
1266: imat_i = mat->i;
1267: imat_a = mat->a;
1268: cmap_i = cmap[i];
1269: rmap_i = rmap[i];
1270: irow_i = irow[i];
1271: jmax = nrow[i];
1272: for (j=0; j<jmax; j++) {
1273: l = 0;
1274: row = irow_i[j];
1275: while (row >= C->rmap->range[l+1]) l++;
1276: proc = l;
1277: if (proc == rank) {
1278: old_row = row;
1279: row = rmap_i[row];
1280: ilen_row = imat_ilen[row];
1281: MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1282: mat_i = imat_i[row] ;
1283: mat_a = imat_a + mat_i;
1284: mat_j = imat_j + mat_i;
1285: for (k=0; k<ncols; k++) {
1286: if ((tcol = cmap_i[cols[k]])) {
1287: *mat_j++ = tcol - 1;
1288: *mat_a++ = vals[k];
1289: ilen_row++;
1290: }
1291: }
1292: MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1293: imat_ilen[row] = ilen_row;
1294: }
1295: }
1296: }
1297: }
1299: /* Now assemble the off proc rows*/
1300: {
1301: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1302: PetscInt *imat_j,*imat_i;
1303: PetscScalar *imat_a,*rbuf4_i;
1305: for (tmp2=0; tmp2<nrqs; tmp2++) {
1306: MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);
1307: idex = pa[idex2];
1308: sbuf1_i = sbuf1[idex];
1309: jmax = sbuf1_i[0];
1310: ct1 = 2*jmax + 1;
1311: ct2 = 0;
1312: rbuf2_i = rbuf2[idex2];
1313: rbuf3_i = rbuf3[idex2];
1314: rbuf4_i = rbuf4[idex2];
1315: for (j=1; j<=jmax; j++) {
1316: is_no = sbuf1_i[2*j-1];
1317: rmap_i = rmap[is_no];
1318: cmap_i = cmap[is_no];
1319: mat = (Mat_SeqAIJ*)submats[is_no]->data;
1320: imat_ilen = mat->ilen;
1321: imat_j = mat->j;
1322: imat_i = mat->i;
1323: imat_a = mat->a;
1324: max1 = sbuf1_i[2*j];
1325: for (k=0; k<max1; k++,ct1++) {
1326: row = sbuf1_i[ct1];
1327: row = rmap_i[row];
1328: ilen = imat_ilen[row];
1329: mat_i = imat_i[row] ;
1330: mat_a = imat_a + mat_i;
1331: mat_j = imat_j + mat_i;
1332: max2 = rbuf2_i[ct1];
1333: for (l=0; l<max2; l++,ct2++) {
1334: if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1335: *mat_j++ = tcol - 1;
1336: *mat_a++ = rbuf4_i[ct2];
1337: ilen++;
1338: }
1339: }
1340: imat_ilen[row] = ilen;
1341: }
1342: }
1343: }
1344: }
1345: PetscFree(r_status4);
1346: PetscFree(r_waits4);
1347: if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
1348: PetscFree(s_waits4);
1349: PetscFree(s_status4);
1351: /* Restore the indices */
1352: for (i=0; i<ismax; i++) {
1353: ISRestoreIndices(isrow[i],irow+i);
1354: ISRestoreIndices(iscol[i],icol+i);
1355: }
1357: /* Destroy allocated memory */
1358: PetscFree4(irow,icol,nrow,ncol);
1359: PetscFree4(w1,w2,w3,w4);
1360: PetscFree(pa);
1362: PetscFree4(sbuf1,ptr,tmp,ctr);
1363: PetscFree(rbuf2);
1364: for (i=0; i<nrqr; ++i) {
1365: PetscFree(sbuf2[i]);
1366: }
1367: for (i=0; i<nrqs; ++i) {
1368: PetscFree(rbuf3[i]);
1369: PetscFree(rbuf4[i]);
1370: }
1372: PetscFree3(sbuf2,req_size,req_source);
1373: PetscFree(rbuf3);
1374: PetscFree(rbuf4);
1375: PetscFree(sbuf_aj[0]);
1376: PetscFree(sbuf_aj);
1377: PetscFree(sbuf_aa[0]);
1378: PetscFree(sbuf_aa);
1379:
1380: PetscFree(cmap[0]);
1381: PetscFree(cmap);
1382: PetscFree(rmap[0]);
1383: PetscFree(rmap);
1384: PetscFree(lens[0]);
1385: PetscFree(lens);
1387: for (i=0; i<ismax; i++) {
1388: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1389: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1390: }
1391: return(0);
1392: }