Actual source code: mpiov.c

  2: #define PETSCMAT_DLL

  4: /*
  5:    Routines to compute overlapping regions of a parallel MPI matrix
  6:  and to find submatrices that are shared across processors.
  7: */
 8:  #include "../src/mat/impls/aij/mpi/mpiaij.h"
 9:  #include "petscbt.h"

 11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
 12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
 13: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
 14: EXTERN PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
 15: EXTERN PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

 19: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
 20: {
 22:   PetscInt       i;

 25:   if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
 26:   for (i=0; i<ov; ++i) {
 27:     MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);
 28:   }
 29:   return(0);
 30: }
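/*
   A minimal usage sketch (illustration only: the matrix A, the subdomain size
   nlocal, the first local row, and the overlap value 2 are hypothetical, and
   CHKERRQ error checking is omitted).  Callers such as the ASM preconditioner
   reach the routine above through the generic interface MatIncreaseOverlap():

     IS is[1];
     ISCreateStride(PETSC_COMM_SELF,nlocal,first,1,&is[0]);
     MatIncreaseOverlap(A,1,is,2);
     ISDestroy(is[0]);

   Each of the ov passes calls MatIncreaseOverlap_MPIAIJ_Once(), which replaces
   every is[i] with a new IS enlarged by one level of overlap.
*/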

 32: /*
 33:   Sample message format:
 34:   If a processor A wants processor B to process some elements corresponding
 35:   to index sets is[1],is[5]
 36:   mesg [0] = 2   (no of index sets in the mesg)
 37:   -----------  
 38:   mesg [1] = 1 => is[1]
 39:   mesg [2] = sizeof(is[1]);
 40:   -----------  
 41:   mesg [3] = 5  => is[5]
 42:   mesg [4] = sizeof(is[5]);
 43:   -----------
 44:   mesg [5]
 45:   mesg [n]   data(is[1])
 46:   -----------
 47:   mesg [n+1]
 48:   mesg [m]   data(is[5])
 49:   -----------
 50:   
 51:   Notes:
 52:   nrqs - no of requests sent (or to be sent out)
 53:   nrqr - no of requests received (which have to be, or which have been, processed)
 54: */
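/*
   A concrete instance of the layout above (hypothetical sizes, for illustration
   only): if is[1] contributes rows {10,11,12} and is[5] contributes rows {40,41},
   the message sent to processor B is the integer array

      { 2,  1,3,  5,2,  10,11,12,  40,41 }

   i.e. 1 + 2*(no of IS) + data = 1 + 4 + 5 = 10 entries, matching the length
   w1[j] = (no of rows) + w2[j] + 2*w3[j] computed below.
*/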
 57: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
 58: {
 59:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
 60:   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
 61:   const PetscInt **idx,*idx_i;
 62:   PetscInt       *n,*rtable,**data,len;
 64:   PetscMPIInt    size,rank,tag1,tag2;
 65:   PetscInt       m,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
 66:   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
 67:   PetscBT        *table;
 68:   MPI_Comm       comm;
 69:   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
 70:   MPI_Status     *s_status,*recv_status;

 73:   comm   = ((PetscObject)C)->comm;
 74:   size   = c->size;
 75:   rank   = c->rank;
 76:   m      = C->rmap->N;

 78:   PetscObjectGetNewTag((PetscObject)C,&tag1);
 79:   PetscObjectGetNewTag((PetscObject)C,&tag2);
 80: 
 81:   PetscMalloc3(imax,PetscInt*,&idx,imax,PetscInt,&n,m,PetscInt,&rtable);
 82: 
 83:   for (i=0; i<imax; i++) {
 84:     ISGetIndices(is[i],&idx[i]);
 85:     ISGetLocalSize(is[i],&n[i]);
 86:   }
 87: 
 88:   /* Create hash table for the mapping :row -> proc*/
 89:   for (i=0,j=0; i<size; i++) {
 90:     len = C->rmap->range[i+1];
 91:     for (; j<len; j++) {
 92:       rtable[j] = i;
 93:     }
 94:   }

 96:   /* evaluate communication - mesg to whom, length of mesg, and buffer space
 97:      required. Based on this, buffers are allocated, and data copied into them*/
 98:   PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);
 99:   PetscMemzero(w1,size*sizeof(PetscMPIInt)); /* initialise work vector*/
100:   PetscMemzero(w2,size*sizeof(PetscMPIInt)); /* initialise work vector*/
101:   PetscMemzero(w3,size*sizeof(PetscMPIInt)); /* initialise work vector*/
102:   for (i=0; i<imax; i++) {
103:     PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialise work vector*/
104:     idx_i = idx[i];
105:     len   = n[i];
106:     for (j=0; j<len; j++) {
107:       row  = idx_i[j];
108:       if (row < 0) {
109:         SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
110:       }
111:       proc = rtable[row];
112:       w4[proc]++;
113:     }
114:     for (j=0; j<size; j++){
115:       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
116:     }
117:   }

119:   nrqs     = 0;              /* no of outgoing messages */
120:   msz      = 0;              /* total mesg length (for all procs) */
121:   w1[rank] = 0;              /* no mesg sent to itself */
122:   w3[rank] = 0;
123:   for (i=0; i<size; i++) {
124:     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
125:   }
126:   /* pa - is list of processors to communicate with */
127:   PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);
128:   for (i=0,j=0; i<size; i++) {
129:     if (w1[i]) {pa[j] = i; j++;}
130:   }

132:   /* Each message would have a header = 1 + 2*(no of IS) + data */
133:   for (i=0; i<nrqs; i++) {
134:     j      = pa[i];
135:     w1[j] += w2[j] + 2*w3[j];
136:     msz   += w1[j];
137:   }

139:   /* Determine the number of messages to expect, their lengths, and their from-ids */
140:   PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
141:   PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);

143:   /* Now post the Irecvs corresponding to these messages */
144:   PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);

146:   /* Allocate Memory for outgoing messages */
147:   PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);
148:   PetscMemzero(outdat,size*sizeof(PetscInt*));
149:   PetscMemzero(ptr,size*sizeof(PetscInt*));

151:   {
152:     PetscInt *iptr = tmp,ict  = 0;
153:     for (i=0; i<nrqs; i++) {
154:       j         = pa[i];
155:       iptr     +=  ict;
156:       outdat[j] = iptr;
157:       ict       = w1[j];
158:     }
159:   }

161:   /* Form the outgoing messages */
162:   /*plug in the headers*/
163:   for (i=0; i<nrqs; i++) {
164:     j            = pa[i];
165:     outdat[j][0] = 0;
166:     PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
167:     ptr[j]       = outdat[j] + 2*w3[j] + 1;
168:   }
169: 
170:   /* Memory for doing local proc's work*/
171:   {
172:     PetscInt  *d_p;
173:     char      *t_p;

175:     /* should replace with PetscMallocN() */
176:     PetscMalloc((imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
177:       (m)*imax*sizeof(PetscInt)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1,&table);
178:     PetscMemzero(table,(imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
179:       (m)*imax*sizeof(PetscInt)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1);
180:     data  = (PetscInt **)(table + imax);
181:     isz   = (PetscInt  *)(data  + imax);
182:     d_p   = (PetscInt  *)(isz   + imax);
183:     t_p   = (char *)(d_p   + m*imax);
184:     for (i=0; i<imax; i++) {
185:       table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;
186:       data[i]  = d_p + (m)*i;
187:     }
188:   }
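  /* The single allocation above is carved up as follows (descriptive only):

       [ table: imax PetscBT | data: imax PetscInt* | isz: imax PetscInt |
         d_p: imax*m PetscInt | t_p: imax*(m/PETSC_BITS_PER_BYTE+1) char ]

     so each table[i] is a bit array over the m global rows stored inside t_p,
     and each data[i] is a length-m slice of d_p holding the rows collected so
     far for is[i], with isz[i] counting how many of those slots are in use. */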

190:   /* Parse the IS and update local tables and the outgoing buf with the data*/
191:   {
192:     PetscInt     n_i,*data_i,isz_i,*outdat_j,ctr_j;
193:     PetscBT table_i;

195:     for (i=0; i<imax; i++) {
196:       PetscMemzero(ctr,size*sizeof(PetscInt));
197:       n_i     = n[i];
198:       table_i = table[i];
199:       idx_i   = idx[i];
200:       data_i  = data[i];
201:       isz_i   = isz[i];
202:       for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
203:         row  = idx_i[j];
204:         proc = rtable[row];
205:         if (proc != rank) { /* copy to the outgoing buffer */
206:           ctr[proc]++;
207:           *ptr[proc] = row;
208:           ptr[proc]++;
209:         } else { /* Update the local table */
210:           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
211:         }
212:       }
213:       /* Update the headers for the current IS */
214:       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
215:         if ((ctr_j = ctr[j])) {
216:           outdat_j        = outdat[j];
217:           k               = ++outdat_j[0];
218:           outdat_j[2*k]   = ctr_j;
219:           outdat_j[2*k-1] = i;
220:         }
221:       }
222:       isz[i] = isz_i;
223:     }
224:   }

226:   /*  Now  post the sends */
227:   PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
228:   for (i=0; i<nrqs; ++i) {
229:     j    = pa[i];
230:     MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
231:   }
232: 
233:   /* No longer need the original indices*/
234:   for (i=0; i<imax; ++i) {
235:     ISRestoreIndices(is[i],idx+i);
236:   }
237:   PetscFree3(idx,n,rtable);

239:   for (i=0; i<imax; ++i) {
240:     ISDestroy(is[i]);
241:   }
242: 
243:   /* Do Local work*/
244:   MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);

246:   /* Receive messages*/
247:   PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);
248:   if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}
249: 
250:   PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);
251:   if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}

253:   /* Phase 1 sends are complete - deallocate buffers */
254:   PetscFree4(outdat,ptr,tmp,ctr);
255:   PetscFree4(w1,w2,w3,w4);

257:   PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);
258:   PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);
259:   MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
260:   PetscFree(rbuf[0]);
261:   PetscFree(rbuf);

263: 
 264:   /* Send the data back */
265:   /* Do a global reduction to know the buffer space req for incoming messages*/
266:   {
267:     PetscMPIInt *rw1;
268: 
269:     PetscMalloc(size*sizeof(PetscMPIInt),&rw1);
270:     PetscMemzero(rw1,size*sizeof(PetscMPIInt));

272:     for (i=0; i<nrqr; ++i) {
273:       proc      = recv_status[i].MPI_SOURCE;
274:       if (proc != onodes1[i]) SETERRQ(PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
275:       rw1[proc] = isz1[i];
276:     }
277:     PetscFree(onodes1);
278:     PetscFree(olengths1);

 280:     /* Determine the number of messages to expect, their lengths, and their from-ids */
281:     PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
282:     PetscFree(rw1);
283:   }
284:   /* Now post the Irecvs corresponding to these messages */
285:   PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);

287:   /*  Now  post the sends */
288:   PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
289:   for (i=0; i<nrqr; ++i) {
290:     j    = recv_status[i].MPI_SOURCE;
291:     MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
292:   }

294:   /* receive work done on other processors*/
295:   {
296:     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
297:     PetscMPIInt idex;
298:     PetscBT     table_i;
299:     MPI_Status  *status2;
300: 
301:     PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);
302:     for (i=0; i<nrqs; ++i) {
303:       MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
304:       /* Process the message*/
305:       rbuf2_i = rbuf2[idex];
306:       ct1     = 2*rbuf2_i[0]+1;
307:       jmax    = rbuf2[idex][0];
308:       for (j=1; j<=jmax; j++) {
309:         max     = rbuf2_i[2*j];
310:         is_no   = rbuf2_i[2*j-1];
311:         isz_i   = isz[is_no];
312:         data_i  = data[is_no];
313:         table_i = table[is_no];
314:         for (k=0; k<max; k++,ct1++) {
315:           row = rbuf2_i[ct1];
316:           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
317:         }
318:         isz[is_no] = isz_i;
319:       }
320:     }

322:     if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
323:     PetscFree(status2);
324:   }
325: 
326:   for (i=0; i<imax; ++i) {
327:     ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);
328:   }
329: 
330:   PetscFree(onodes2);
331:   PetscFree(olengths2);

333:   PetscFree(pa);
334:   PetscFree(rbuf2[0]);
335:   PetscFree(rbuf2);
336:   PetscFree(s_waits1);
337:   PetscFree(r_waits1);
338:   PetscFree(s_waits2);
339:   PetscFree(r_waits2);
340:   PetscFree(table);
341:   PetscFree(s_status);
342:   PetscFree(recv_status);
343:   PetscFree(xdata[0]);
344:   PetscFree(xdata);
345:   PetscFree(isz1);
346:   return(0);
347: }

351: /*  
 352:    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do 
353:        the work on the local processor.

355:      Inputs:
356:       C      - MAT_MPIAIJ;
357:       imax - total no of index sets processed at a time;
358:       table  - an array of char - size = m bits.
359:       
360:      Output:
 361:       isz    - array containing the count of the solution elements corresponding
362:                to each index set;
363:       data   - pointer to the solutions
364: */
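/*
   Worked example (hypothetical numbers): suppose rstart = cstart = 5 and data[i]
   currently holds global row 7.  Local row 2 of the diagonal block A has columns
   {0,3}, and local row 2 of the off-diagonal block B has columns {1,4}, which
   garray maps to global columns {102,250}.  The loop below then appends the
   global columns {5,8,102,250} to data[i] (skipping any already marked in
   table[i]) and increases isz[i] accordingly.
*/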
365: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
366: {
367:   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
368:   Mat        A = c->A,B = c->B;
369:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
370:   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
371:   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
372:   PetscBT    table_i;

375:   rstart = C->rmap->rstart;
376:   cstart = C->cmap->rstart;
377:   ai     = a->i;
378:   aj     = a->j;
379:   bi     = b->i;
380:   bj     = b->j;
381:   garray = c->garray;

383: 
384:   for (i=0; i<imax; i++) {
385:     data_i  = data[i];
386:     table_i = table[i];
387:     isz_i   = isz[i];
388:     for (j=0,max=isz[i]; j<max; j++) {
389:       row   = data_i[j] - rstart;
390:       start = ai[row];
391:       end   = ai[row+1];
392:       for (k=start; k<end; k++) { /* Amat */
393:         val = aj[k] + cstart;
394:         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
395:       }
396:       start = bi[row];
397:       end   = bi[row+1];
398:       for (k=start; k<end; k++) { /* Bmat */
399:         val = garray[bj[k]];
400:         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
401:       }
402:     }
403:     isz[i] = isz_i;
404:   }
405:   return(0);
406: }

410: /*     
 411:       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
412:          and return the output

414:          Input:
415:            C    - the matrix
416:            nrqr - no of messages being processed.
 417:            rbuf - an array of pointers to the received requests
418:            
419:          Output:
420:            xdata - array of messages to be sent back
421:            isz1  - size of each message

 423:   For better efficiency perhaps we should malloc each xdata[i] separately;
 424: then if a realloc is required we need only copy the data for that one row
 425: rather than all previous rows, as happens now, where a single large chunk of
 426: memory is used.

428: */
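/*
   Shape of one reply (hypothetical sizes, for illustration): for a received
   request rbuf[i] = { 1,  5,2,  40,41 } (one index set, is[5], rows 40 and 41),
   the routine builds

      xdata[i] = { 1,  5,c,  <c row/column indices> }

   where c counts the requested rows themselves plus the distinct column indices
   found in those rows (duplicates removed through the xtable bit array), and
   isz1[i] = 1 + 2 + c is the total length sent back.
*/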
429: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
430: {
431:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
432:   Mat            A = c->A,B = c->B;
433:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
435:   PetscMPIInt    rank;
436:   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
437:   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
438:   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
439:   PetscInt       *rbuf_i,kmax,rbuf_0;
440:   PetscBT        xtable;

443:   rank   = c->rank;
444:   m      = C->rmap->N;
445:   rstart = C->rmap->rstart;
446:   cstart = C->cmap->rstart;
447:   ai     = a->i;
448:   aj     = a->j;
449:   bi     = b->i;
450:   bj     = b->j;
451:   garray = c->garray;
452: 
453: 
454:   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
455:     rbuf_i  =  rbuf[i];
456:     rbuf_0  =  rbuf_i[0];
457:     ct     += rbuf_0;
458:     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
459:   }
460: 
461:   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
462:   else      max1 = 1;
463:   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
464:   PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);
465:   ++no_malloc;
466:   PetscBTCreate(m,xtable);
467:   PetscMemzero(isz1,nrqr*sizeof(PetscInt));
468: 
469:   ct3 = 0;
 470:   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
471:     rbuf_i =  rbuf[i];
472:     rbuf_0 =  rbuf_i[0];
473:     ct1    =  2*rbuf_0+1;
474:     ct2    =  ct1;
475:     ct3    += ct1;
476:     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
477:       PetscBTMemzero(m,xtable);
478:       oct2 = ct2;
479:       kmax = rbuf_i[2*j];
480:       for (k=0; k<kmax; k++,ct1++) {
481:         row = rbuf_i[ct1];
482:         if (!PetscBTLookupSet(xtable,row)) {
483:           if (!(ct3 < mem_estimate)) {
484:             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
485:             PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);
486:             PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
487:             PetscFree(xdata[0]);
488:             xdata[0]     = tmp;
489:             mem_estimate = new_estimate; ++no_malloc;
490:             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
491:           }
492:           xdata[i][ct2++] = row;
493:           ct3++;
494:         }
495:       }
496:       for (k=oct2,max2=ct2; k<max2; k++) {
497:         row   = xdata[i][k] - rstart;
498:         start = ai[row];
499:         end   = ai[row+1];
500:         for (l=start; l<end; l++) {
501:           val = aj[l] + cstart;
502:           if (!PetscBTLookupSet(xtable,val)) {
503:             if (!(ct3 < mem_estimate)) {
504:               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
505:               PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);
506:               PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
507:               PetscFree(xdata[0]);
508:               xdata[0]     = tmp;
509:               mem_estimate = new_estimate; ++no_malloc;
510:               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
511:             }
512:             xdata[i][ct2++] = val;
513:             ct3++;
514:           }
515:         }
516:         start = bi[row];
517:         end   = bi[row+1];
518:         for (l=start; l<end; l++) {
519:           val = garray[bj[l]];
520:           if (!PetscBTLookupSet(xtable,val)) {
521:             if (!(ct3 < mem_estimate)) {
522:               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
523:               PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);
524:               PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
525:               PetscFree(xdata[0]);
526:               xdata[0]     = tmp;
527:               mem_estimate = new_estimate; ++no_malloc;
528:               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
529:             }
530:             xdata[i][ct2++] = val;
531:             ct3++;
532:           }
533:         }
534:       }
535:       /* Update the header*/
536:       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
537:       xdata[i][2*j-1] = rbuf_i[2*j-1];
538:     }
539:     xdata[i][0] = rbuf_0;
540:     xdata[i+1]  = xdata[i] + ct2;
541:     isz1[i]     = ct2; /* size of each message */
542:   }
543:   PetscBTDestroy(xtable);
 544:   PetscInfo4(C,"[%d] Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);
545:   return(0);
546: }
547: /* -------------------------------------------------------------------------*/
548: EXTERN PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
549: EXTERN PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
550: /*
551:     Every processor gets the entire matrix
552: */
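/*
   This fast path is taken from MatGetSubMatrices_MPIAIJ() below when every
   process asks for a single identity index set covering all rows and columns
   (the default choice there, controllable with the runtime option
   -use_fast_submatrix).  The full matrix is then reproduced sequentially on each
   process from three MPI_Allgatherv rounds: row lengths and column indices on
   first creation, and numerical values whenever flag == MAT_GET_VALUES.
*/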
555: PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
556: {
557:   Mat            B;
558:   Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
559:   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
561:   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
562:   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
563:   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
564:   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

567:   MPI_Comm_size(((PetscObject)A)->comm,&size);
568:   MPI_Comm_rank(((PetscObject)A)->comm,&rank);

570:   if (scall == MAT_INITIAL_MATRIX) {
571:     /* ----------------------------------------------------------------
572:          Tell every processor the number of nonzeros per row
573:     */
574:     PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);
575:     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
576:       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
577:     }
578:     sendcount = A->rmap->rend - A->rmap->rstart;
579:     PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);
580:     for (i=0; i<size; i++) {
581:       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
582:       displs[i]     = A->rmap->range[i];
583:     }
584: #if defined(PETSC_HAVE_MPI_IN_PLACE)
585:     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);
586: #else
587:     MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);
588: #endif
589:     /* ---------------------------------------------------------------
590:          Create the sequential matrix of the same type as the local block diagonal
591:     */
592:     MatCreate(PETSC_COMM_SELF,&B);
593:     MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);
594:     MatSetType(B,((PetscObject)a->A)->type_name);
595:     MatSeqAIJSetPreallocation(B,0,lens);
596:     PetscMalloc(sizeof(Mat),Bin);
597:     **Bin = B;
598:     b = (Mat_SeqAIJ *)B->data;

600:     /*--------------------------------------------------------------------
601:        Copy my part of matrix column indices over
602:     */
603:     sendcount  = ad->nz + bd->nz;
604:     jsendbuf   = b->j + b->i[rstarts[rank]];
605:     a_jsendbuf = ad->j;
606:     b_jsendbuf = bd->j;
607:     n          = A->rmap->rend - A->rmap->rstart;
608:     cnt        = 0;
609:     for (i=0; i<n; i++) {

611:       /* put in lower diagonal portion */
612:       m = bd->i[i+1] - bd->i[i];
613:       while (m > 0) {
614:         /* is it above diagonal (in bd (compressed) numbering) */
615:         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
616:         jsendbuf[cnt++] = garray[*b_jsendbuf++];
617:         m--;
618:       }

620:       /* put in diagonal portion */
621:       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
622:         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
623:       }

625:       /* put in upper diagonal portion */
626:       while (m-- > 0) {
627:         jsendbuf[cnt++] = garray[*b_jsendbuf++];
628:       }
629:     }
630:     if (cnt != sendcount) SETERRQ2(PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

632:     /*--------------------------------------------------------------------
633:        Gather all column indices to all processors
634:     */
635:     for (i=0; i<size; i++) {
636:       recvcounts[i] = 0;
637:       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
638:         recvcounts[i] += lens[j];
639:       }
640:     }
641:     displs[0]  = 0;
642:     for (i=1; i<size; i++) {
643:       displs[i] = displs[i-1] + recvcounts[i-1];
644:     }
645: #if defined(PETSC_HAVE_MPI_IN_PLACE)
646:     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);
647: #else
648:     MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);
649: #endif
650:     /*--------------------------------------------------------------------
 651:         Assemble the matrix into usable form (note numerical values not yet set)
652:     */
653:     /* set the b->ilen (length of each row) values */
654:     PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));
655:     /* set the b->i indices */
656:     b->i[0] = 0;
657:     for (i=1; i<=A->rmap->N; i++) {
658:       b->i[i] = b->i[i-1] + lens[i-1];
659:     }
660:     PetscFree(lens);
661:     MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
662:     MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);

664:   } else {
665:     B  = **Bin;
666:     b = (Mat_SeqAIJ *)B->data;
667:   }

669:   /*--------------------------------------------------------------------
670:        Copy my part of matrix numerical values into the values location 
671:   */
672:   if (flag == MAT_GET_VALUES){
673:     sendcount = ad->nz + bd->nz;
674:     sendbuf   = b->a + b->i[rstarts[rank]];
675:     a_sendbuf = ad->a;
676:     b_sendbuf = bd->a;
677:     b_sendj   = bd->j;
678:     n         = A->rmap->rend - A->rmap->rstart;
679:     cnt       = 0;
680:     for (i=0; i<n; i++) {

682:       /* put in lower diagonal portion */
683:       m = bd->i[i+1] - bd->i[i];
684:       while (m > 0) {
685:         /* is it above diagonal (in bd (compressed) numbering) */
686:         if (garray[*b_sendj] > A->rmap->rstart + i) break;
687:         sendbuf[cnt++] = *b_sendbuf++;
688:         m--;
689:         b_sendj++;
690:       }

692:       /* put in diagonal portion */
693:       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
694:         sendbuf[cnt++] = *a_sendbuf++;
695:       }

697:       /* put in upper diagonal portion */
698:       while (m-- > 0) {
699:         sendbuf[cnt++] = *b_sendbuf++;
700:         b_sendj++;
701:       }
702:     }
703:     if (cnt != sendcount) SETERRQ2(PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
704: 
705:     /* ----------------------------------------------------------------- 
706:        Gather all numerical values to all processors 
707:     */
708:     if (!recvcounts) {
709:       PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);
710:     }
711:     for (i=0; i<size; i++) {
712:       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
713:     }
714:     displs[0]  = 0;
715:     for (i=1; i<size; i++) {
716:       displs[i] = displs[i-1] + recvcounts[i-1];
717:     }
718:     recvbuf   = b->a;
719: #if defined(PETSC_HAVE_MPI_IN_PLACE)
720:     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);
721: #else
722:     MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);
723: #endif
 724:   }  /* end of (flag == MAT_GET_VALUES) */
725:   PetscFree2(recvcounts,displs);

727:   if (A->symmetric){
728:     MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);
729:   } else if (A->hermitian) {
730:     MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);
731:   } else if (A->structurally_symmetric) {
732:     MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);
733:   }
734:   return(0);
735: }

739: PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
740: {
742:   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
743:   PetscTruth     rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;

746:   /*
747:        Check for special case each processor gets entire matrix
748:   */
749:   if (ismax == 1 && C->rmap->N == C->cmap->N) {
750:     ISIdentity(*isrow,&rowflag);
751:     ISIdentity(*iscol,&colflag);
752:     ISGetLocalSize(*isrow,&nrow);
753:     ISGetLocalSize(*iscol,&ncol);
754:     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
755:       wantallmatrix = PETSC_TRUE;
756:       PetscOptionsGetTruth(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);
757:     }
758:   }
759:   MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);
760:   if (twantallmatrix) {
761:     MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);
762:     return(0);
763:   }

765:   /* Allocate memory to hold all the submatrices */
766:   if (scall != MAT_REUSE_MATRIX) {
767:     PetscMalloc((ismax+1)*sizeof(Mat),submat);
768:   }
769:   /* Determine the number of stages through which submatrices are done */
770:   nmax          = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
771:   if (!nmax) nmax = 1;
772:   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
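  /* Worked example (hypothetical sizes): with C->cmap->N = 1,000,000 columns and
     a 4-byte PetscInt, nmax = 20,000,000/(1,000,000*4) = 5 index sets per stage,
     so ismax = 12 requested submatrices are handled in nstages_local = 3 stages
     of sizes 5, 5 and 2. */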

774:   /* Make sure every processor loops through the nstages */
775:   MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);

777:   for (i=0,pos=0; i<nstages; i++) {
778:     if (pos+nmax <= ismax) max_no = nmax;
779:     else if (pos == ismax) max_no = 0;
780:     else                   max_no = ismax-pos;
781:     MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);
782:     pos += max_no;
783:   }
784:   return(0);
785: }
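/*
   A minimal calling sketch (illustration only: the matrix A and the 10-row/column
   index sets are hypothetical, and CHKERRQ error checking is omitted):

     IS  isrow,iscol;
     Mat *submat;
     ISCreateStride(PETSC_COMM_SELF,10,0,1,&isrow);
     ISCreateStride(PETSC_COMM_SELF,10,0,1,&iscol);
     MatGetSubMatrices(A,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);
     ...
     MatDestroyMatrices(1,&submat);

   Each process requests its own list of submatrices independently; the driver
   above batches them into stages so that the per-index-set cmap/rmap work arrays
   of size C->cmap->N and C->rmap->N stay within roughly 20 MB per stage.
*/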

787: /* -------------------------------------------------------------------------*/
790: PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
791: {
792:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
793:   Mat            A = c->A;
794:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
795:   const PetscInt **icol,**irow;
796:   PetscInt       *nrow,*ncol,start;
798:   PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
799:   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
800:   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
801:   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
802:   PetscInt       **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
803:   const PetscInt *irow_i;
804:   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
805:   PetscInt       *rmap_i;
806:   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
807:   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
808:   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
809:   MPI_Status     *r_status3,*r_status4,*s_status4;
810:   MPI_Comm       comm;
811:   PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
812:   PetscTruth     sorted;
813:   PetscMPIInt    *onodes1,*olengths1;
814:   PetscMPIInt    idex,idex2,end;

817:   comm   = ((PetscObject)C)->comm;
818:   tag0   = ((PetscObject)C)->tag;
819:   size   = c->size;
820:   rank   = c->rank;
821: 
822:   /* Get some new tags to keep the communication clean */
823:   PetscObjectGetNewTag((PetscObject)C,&tag1);
824:   PetscObjectGetNewTag((PetscObject)C,&tag2);
825:   PetscObjectGetNewTag((PetscObject)C,&tag3);

 827:   /* Check whether the row and col indices are sorted */
828:   for (i=0; i<ismax; i++) {
829:     ISSorted(isrow[i],&sorted);
830:     /*if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"isrow is not sorted");*/
831:     ISSorted(iscol[i],&sorted);
832:     /*    if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"iscol is not sorted"); */
833:   }
834:   PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);

836:   for (i=0; i<ismax; i++) {
837:     ISGetIndices(isrow[i],&irow[i]);
838:     ISGetIndices(iscol[i],&icol[i]);
839:     ISGetLocalSize(isrow[i],&nrow[i]);
840:     ISGetLocalSize(iscol[i],&ncol[i]);
841:   }

 843:   /* evaluate communication - mesg to whom, length of mesg, and buffer space
844:      required. Based on this, buffers are allocated, and data copied into them*/
845:   PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4); /* mesg size */
846:   PetscMemzero(w1,size*sizeof(PetscMPIInt)); /* initialize work vector*/
847:   PetscMemzero(w2,size*sizeof(PetscMPIInt)); /* initialize work vector*/
848:   PetscMemzero(w3,size*sizeof(PetscMPIInt)); /* initialize work vector*/
849:   for (i=0; i<ismax; i++) {
850:     PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialize work vector*/
851:     jmax   = nrow[i];
852:     irow_i = irow[i];
853:     for (j=0; j<jmax; j++) {
854:       l = 0;
855:       row  = irow_i[j];
856:       while (row >= C->rmap->range[l+1]) l++;
857:       proc = l;
858:       w4[proc]++;
859:     }
860:     for (j=0; j<size; j++) {
861:       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
862:     }
863:   }
864: 
865:   nrqs     = 0;              /* no of outgoing messages */
866:   msz      = 0;              /* total mesg length (for all procs) */
867:   w1[rank] = 0;              /* no mesg sent to self */
868:   w3[rank] = 0;
869:   for (i=0; i<size; i++) {
870:     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
871:   }
872:   PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa); /*(proc -array)*/
873:   for (i=0,j=0; i<size; i++) {
874:     if (w1[i]) { pa[j] = i; j++; }
875:   }

877:   /* Each message would have a header = 1 + 2*(no of IS) + data */
878:   for (i=0; i<nrqs; i++) {
879:     j     = pa[i];
880:     w1[j] += w2[j] + 2* w3[j];
881:     msz   += w1[j];
882:   }

 884:   /* Determine the number of messages to expect, their lengths, and their from-ids */
885:   PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
886:   PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);

888:   /* Now post the Irecvs corresponding to these messages */
889:   PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
890: 
891:   PetscFree(onodes1);
892:   PetscFree(olengths1);
893: 
894:   /* Allocate Memory for outgoing messages */
895:   PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);
896:   PetscMemzero(sbuf1,size*sizeof(PetscInt*));
897:   PetscMemzero(ptr,size*sizeof(PetscInt*));

899:   {
900:     PetscInt *iptr = tmp,ict = 0;
901:     for (i=0; i<nrqs; i++) {
902:       j         = pa[i];
903:       iptr     += ict;
904:       sbuf1[j]  = iptr;
905:       ict       = w1[j];
906:     }
907:   }

909:   /* Form the outgoing messages */
910:   /* Initialize the header space */
911:   for (i=0; i<nrqs; i++) {
912:     j           = pa[i];
913:     sbuf1[j][0] = 0;
914:     PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
915:     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
916:   }
917: 
918:   /* Parse the isrow and copy data into outbuf */
919:   for (i=0; i<ismax; i++) {
920:     PetscMemzero(ctr,size*sizeof(PetscInt));
921:     irow_i = irow[i];
922:     jmax   = nrow[i];
923:     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
924:       l = 0;
925:       row  = irow_i[j];
926:       while (row >= C->rmap->range[l+1]) l++;
927:       proc = l;
928:       if (proc != rank) { /* copy to the outgoing buf*/
929:         ctr[proc]++;
930:         *ptr[proc] = row;
931:         ptr[proc]++;
932:       }
933:     }
934:     /* Update the headers for the current IS */
935:     for (j=0; j<size; j++) { /* Can Optimise this loop too */
936:       if ((ctr_j = ctr[j])) {
937:         sbuf1_j        = sbuf1[j];
938:         k              = ++sbuf1_j[0];
939:         sbuf1_j[2*k]   = ctr_j;
940:         sbuf1_j[2*k-1] = i;
941:       }
942:     }
943:   }

945:   /*  Now  post the sends */
946:   PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
947:   for (i=0; i<nrqs; ++i) {
948:     j    = pa[i];
949:     MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
950:   }

952:   /* Post Receives to capture the buffer size */
953:   PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);
954:   PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);
955:   rbuf2[0] = tmp + msz;
956:   for (i=1; i<nrqs; ++i) {
957:     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
958:   }
959:   for (i=0; i<nrqs; ++i) {
960:     j    = pa[i];
961:     MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);
962:   }

964:   /* Send to other procs the buf size they should allocate */
965: 

967:   /* Receive messages*/
968:   PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
969:   PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);
970:   PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);
971:   {
972:     Mat_SeqAIJ  *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
973:     PetscInt    *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
974:     PetscInt    *sbuf2_i;

976:     for (i=0; i<nrqr; ++i) {
977:       MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);
978:       req_size[idex] = 0;
979:       rbuf1_i         = rbuf1[idex];
980:       start           = 2*rbuf1_i[0] + 1;
981:       MPI_Get_count(r_status1+i,MPIU_INT,&end);
982:       PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);
983:       sbuf2_i         = sbuf2[idex];
984:       for (j=start; j<end; j++) {
985:         id               = rbuf1_i[j] - rstart;
986:         ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
987:         sbuf2_i[j]       = ncols;
988:         req_size[idex] += ncols;
989:       }
990:       req_source[idex] = r_status1[i].MPI_SOURCE;
991:       /* form the header */
992:       sbuf2_i[0]   = req_size[idex];
993:       for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
994:       MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);
995:     }
996:   }
997:   PetscFree(r_status1);
998:   PetscFree(r_waits1);

1000:   /*  recv buffer sizes */
1001:   /* Receive messages*/
1002: 
1003:   PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);
1004:   PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);
1005:   PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);
1006:   PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);
1007:   PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);

1009:   for (i=0; i<nrqs; ++i) {
1010:     MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);
1011:     PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);
1012:     PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);
1013:     MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);
1014:     MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);
1015:   }
1016:   PetscFree(r_status2);
1017:   PetscFree(r_waits2);
1018: 
1019:   /* Wait on sends1 and sends2 */
1020:   PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);
1021:   PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);

1023:   if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
1024:   if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
1025:   PetscFree(s_status1);
1026:   PetscFree(s_status2);
1027:   PetscFree(s_waits1);
1028:   PetscFree(s_waits2);

1030:   /* Now allocate buffers for a->j, and send them off */
1031:   PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);
1032:   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1033:   PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);
1034:   for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1035: 
1036:   PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);
1037:   {
1038:     PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
1039:     PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1040:     PetscInt cend = C->cmap->rend;
1041:     PetscInt *a_j = a->j,*b_j = b->j,ctmp;

1043:     for (i=0; i<nrqr; i++) {
1044:       rbuf1_i   = rbuf1[i];
1045:       sbuf_aj_i = sbuf_aj[i];
1046:       ct1       = 2*rbuf1_i[0] + 1;
1047:       ct2       = 0;
1048:       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1049:         kmax = rbuf1[i][2*j];
1050:         for (k=0; k<kmax; k++,ct1++) {
1051:           row    = rbuf1_i[ct1] - rstart;
1052:           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1053:           ncols  = nzA + nzB;
1054:           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

1056:           /* load the column indices for this row into cols*/
1057:           cols  = sbuf_aj_i + ct2;
1058: 
1059:           lwrite = 0;
1060:           for (l=0; l<nzB; l++) {
1061:             if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[lwrite++] = ctmp;
1062:           }
1063:           for (l=0; l<nzA; l++)   cols[lwrite++] = cstart + cworkA[l];
1064:           for (l=0; l<nzB; l++) {
1065:             if ((ctmp = bmap[cworkB[l]]) >= cend)  cols[lwrite++] = ctmp;
1066:           }

1068:           ct2 += ncols;
1069:         }
1070:       }
1071:       MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);
1072:     }
1073:   }
1074:   PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);
1075:   PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);

1077:   /* Allocate buffers for a->a, and send them off */
1078:   PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);
1079:   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1080:   PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);
1081:   for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1082: 
1083:   PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);
1084:   {
1085:     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
1086:     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1087:     PetscInt    cend = C->cmap->rend;
1088:     PetscInt    *b_j = b->j;
1089:     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1090: 
1091:     for (i=0; i<nrqr; i++) {
1092:       rbuf1_i   = rbuf1[i];
1093:       sbuf_aa_i = sbuf_aa[i];
1094:       ct1       = 2*rbuf1_i[0]+1;
1095:       ct2       = 0;
1096:       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1097:         kmax = rbuf1_i[2*j];
1098:         for (k=0; k<kmax; k++,ct1++) {
1099:           row    = rbuf1_i[ct1] - rstart;
1100:           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1101:           ncols  = nzA + nzB;
1102:           cworkB = b_j + b_i[row];
1103:           vworkA = a_a + a_i[row];
1104:           vworkB = b_a + b_i[row];

1106:           /* load the column values for this row into vals*/
1107:           vals  = sbuf_aa_i+ct2;
1108: 
1109:           lwrite = 0;
1110:           for (l=0; l<nzB; l++) {
1111:             if ((bmap[cworkB[l]]) < cstart)  vals[lwrite++] = vworkB[l];
1112:           }
1113:           for (l=0; l<nzA; l++)   vals[lwrite++] = vworkA[l];
1114:           for (l=0; l<nzB; l++) {
1115:             if ((bmap[cworkB[l]]) >= cend)  vals[lwrite++] = vworkB[l];
1116:           }
1117: 
1118:           ct2 += ncols;
1119:         }
1120:       }
1121:       MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);
1122:     }
1123:   }
1124:   PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);
1125:   PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);
1126:   PetscFree(rbuf1[0]);
1127:   PetscFree(rbuf1);

1129:   /* Form the matrix */
1130:   /* create col map */
1131:   {
1132:     const PetscInt *icol_i;
1133: 
1134:     PetscMalloc(ismax*sizeof(PetscInt*),&cmap);
1135:     PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);
1136:     PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));
1137:     for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; }
1138:     for (i=0; i<ismax; i++) {
1139:       jmax   = ncol[i];
1140:       icol_i = icol[i];
1141:       cmap_i = cmap[i];
1142:       for (j=0; j<jmax; j++) {
1143:         cmap_i[icol_i[j]] = j+1;
1144:       }
1145:     }
1146:   }

1148:   /* Create lens which is required for MatCreate... */
1149:   for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1150:   PetscMalloc(ismax*sizeof(PetscInt*),&lens);
1151:   PetscMalloc(j*sizeof(PetscInt),&lens[0]);
1152:   PetscMemzero(lens[0],j*sizeof(PetscInt));
1153:   for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1154: 
1155:   /* Update lens from local data */
1156:   for (i=0; i<ismax; i++) {
1157:     jmax   = nrow[i];
1158:     cmap_i = cmap[i];
1159:     irow_i = irow[i];
1160:     lens_i = lens[i];
1161:     for (j=0; j<jmax; j++) {
1162:       l = 0;
1163:       row  = irow_i[j];
1164:       while (row >= C->rmap->range[l+1]) l++;
1165:       proc = l;
1166:       if (proc == rank) {
1167:         MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);
1168:         for (k=0; k<ncols; k++) {
1169:           if (cmap_i[cols[k]]) { lens_i[j]++;}
1170:         }
1171:         MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);
1172:       }
1173:     }
1174:   }
1175: 
1176:   /* Create row map*/
1177:   PetscMalloc(ismax*sizeof(PetscInt*),&rmap);
1178:   PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);
1179:   PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));
1180:   for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
1181:   for (i=0; i<ismax; i++) {
1182:     rmap_i = rmap[i];
1183:     irow_i = irow[i];
1184:     jmax   = nrow[i];
1185:     for (j=0; j<jmax; j++) {
1186:       rmap_i[irow_i[j]] = j;
1187:     }
1188:   }
1189: 
1190:   /* Update lens from offproc data */
1191:   {
1192:     PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

1194:     for (tmp2=0; tmp2<nrqs; tmp2++) {
1195:       MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);
1196:       idex   = pa[idex2];
1197:       sbuf1_i = sbuf1[idex];
1198:       jmax    = sbuf1_i[0];
1199:       ct1     = 2*jmax+1;
1200:       ct2     = 0;
1201:       rbuf2_i = rbuf2[idex2];
1202:       rbuf3_i = rbuf3[idex2];
1203:       for (j=1; j<=jmax; j++) {
1204:         is_no   = sbuf1_i[2*j-1];
1205:         max1    = sbuf1_i[2*j];
1206:         lens_i  = lens[is_no];
1207:         cmap_i  = cmap[is_no];
1208:         rmap_i  = rmap[is_no];
1209:         for (k=0; k<max1; k++,ct1++) {
1210:           row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1211:           max2 = rbuf2_i[ct1];
1212:           for (l=0; l<max2; l++,ct2++) {
1213:             if (cmap_i[rbuf3_i[ct2]]) {
1214:               lens_i[row]++;
1215:             }
1216:           }
1217:         }
1218:       }
1219:     }
1220:   }
1221:   PetscFree(r_status3);
1222:   PetscFree(r_waits3);
1223:   if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
1224:   PetscFree(s_status3);
1225:   PetscFree(s_waits3);

1227:   /* Create the submatrices */
1228:   if (scall == MAT_REUSE_MATRIX) {
1229:     PetscTruth flag;

1231:     /*
 1232:         Assumes new rows are the same length as the old rows, hence bug!
1233:     */
1234:     for (i=0; i<ismax; i++) {
1235:       mat = (Mat_SeqAIJ *)(submats[i]->data);
1236:       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) {
1237:         SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1238:       }
1239:       PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);
1240:       if (!flag) {
1241:         SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1242:       }
1243:       /* Initial matrix as if empty */
1244:       PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));
1245:       submats[i]->factor = C->factor;
1246:     }
1247:   } else {
1248:     for (i=0; i<ismax; i++) {
1249:       MatCreate(PETSC_COMM_SELF,submats+i);
1250:       MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);
1251:       MatSetType(submats[i],((PetscObject)A)->type_name);
1252:       MatSeqAIJSetPreallocation(submats[i],0,lens[i]);
1253:     }
1254:   }

1256:   /* Assemble the matrices */
1257:   /* First assemble the local rows */
1258:   {
1259:     PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1260:     PetscScalar *imat_a;
1261: 
1262:     for (i=0; i<ismax; i++) {
1263:       mat       = (Mat_SeqAIJ*)submats[i]->data;
1264:       imat_ilen = mat->ilen;
1265:       imat_j    = mat->j;
1266:       imat_i    = mat->i;
1267:       imat_a    = mat->a;
1268:       cmap_i    = cmap[i];
1269:       rmap_i    = rmap[i];
1270:       irow_i    = irow[i];
1271:       jmax      = nrow[i];
1272:       for (j=0; j<jmax; j++) {
1273:         l = 0;
1274:         row      = irow_i[j];
1275:         while (row >= C->rmap->range[l+1]) l++;
1276:         proc = l;
1277:         if (proc == rank) {
1278:           old_row  = row;
1279:           row      = rmap_i[row];
1280:           ilen_row = imat_ilen[row];
1281:           MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1282:           mat_i    = imat_i[row] ;
1283:           mat_a    = imat_a + mat_i;
1284:           mat_j    = imat_j + mat_i;
1285:           for (k=0; k<ncols; k++) {
1286:             if ((tcol = cmap_i[cols[k]])) {
1287:               *mat_j++ = tcol - 1;
1288:               *mat_a++ = vals[k];
1289:               ilen_row++;
1290:             }
1291:           }
1292:           MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1293:           imat_ilen[row] = ilen_row;
1294:         }
1295:       }
1296:     }
1297:   }

1299:   /*   Now assemble the off proc rows*/
1300:   {
1301:     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1302:     PetscInt    *imat_j,*imat_i;
1303:     PetscScalar *imat_a,*rbuf4_i;

1305:     for (tmp2=0; tmp2<nrqs; tmp2++) {
1306:       MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);
1307:       idex   = pa[idex2];
1308:       sbuf1_i = sbuf1[idex];
1309:       jmax    = sbuf1_i[0];
1310:       ct1     = 2*jmax + 1;
1311:       ct2     = 0;
1312:       rbuf2_i = rbuf2[idex2];
1313:       rbuf3_i = rbuf3[idex2];
1314:       rbuf4_i = rbuf4[idex2];
1315:       for (j=1; j<=jmax; j++) {
1316:         is_no     = sbuf1_i[2*j-1];
1317:         rmap_i    = rmap[is_no];
1318:         cmap_i    = cmap[is_no];
1319:         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
1320:         imat_ilen = mat->ilen;
1321:         imat_j    = mat->j;
1322:         imat_i    = mat->i;
1323:         imat_a    = mat->a;
1324:         max1      = sbuf1_i[2*j];
1325:         for (k=0; k<max1; k++,ct1++) {
1326:           row   = sbuf1_i[ct1];
1327:           row   = rmap_i[row];
1328:           ilen  = imat_ilen[row];
1329:           mat_i = imat_i[row] ;
1330:           mat_a = imat_a + mat_i;
1331:           mat_j = imat_j + mat_i;
1332:           max2 = rbuf2_i[ct1];
1333:           for (l=0; l<max2; l++,ct2++) {
1334:             if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1335:               *mat_j++ = tcol - 1;
1336:               *mat_a++ = rbuf4_i[ct2];
1337:               ilen++;
1338:             }
1339:           }
1340:           imat_ilen[row] = ilen;
1341:         }
1342:       }
1343:     }
1344:   }
1345:   PetscFree(r_status4);
1346:   PetscFree(r_waits4);
1347:   if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
1348:   PetscFree(s_waits4);
1349:   PetscFree(s_status4);

1351:   /* Restore the indices */
1352:   for (i=0; i<ismax; i++) {
1353:     ISRestoreIndices(isrow[i],irow+i);
1354:     ISRestoreIndices(iscol[i],icol+i);
1355:   }

1357:   /* Destroy allocated memory */
1358:   PetscFree4(irow,icol,nrow,ncol);
1359:   PetscFree4(w1,w2,w3,w4);
1360:   PetscFree(pa);

1362:   PetscFree4(sbuf1,ptr,tmp,ctr);
1363:   PetscFree(rbuf2);
1364:   for (i=0; i<nrqr; ++i) {
1365:     PetscFree(sbuf2[i]);
1366:   }
1367:   for (i=0; i<nrqs; ++i) {
1368:     PetscFree(rbuf3[i]);
1369:     PetscFree(rbuf4[i]);
1370:   }

1372:   PetscFree3(sbuf2,req_size,req_source);
1373:   PetscFree(rbuf3);
1374:   PetscFree(rbuf4);
1375:   PetscFree(sbuf_aj[0]);
1376:   PetscFree(sbuf_aj);
1377:   PetscFree(sbuf_aa[0]);
1378:   PetscFree(sbuf_aa);
1379: 
1380:   PetscFree(cmap[0]);
1381:   PetscFree(cmap);
1382:   PetscFree(rmap[0]);
1383:   PetscFree(rmap);
1384:   PetscFree(lens[0]);
1385:   PetscFree(lens);

1387:   for (i=0; i<ismax; i++) {
1388:     MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1389:     MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1390:   }
1391:   return(0);
1392: }