Actual source code: vpscat.c

  1: #define PETSCVEC_DLL

  3: /*
  4:     Defines parallel vector scatters.
  5: */

 7:  #include "private/isimpl.h"
 8:  #include "private/vecimpl.h"
 9:  #include "../src/vec/vec/impls/dvecimpl.h"
 10:  #include "../src/vec/vec/impls/mpi/pvecimpl.h"

 14: PetscErrorCode VecScatterView_MPI(VecScatter ctx,PetscViewer viewer)
 15: {
 16:   VecScatter_MPI_General *to=(VecScatter_MPI_General*)ctx->todata;
 17:   VecScatter_MPI_General *from=(VecScatter_MPI_General*)ctx->fromdata;
 18:   PetscErrorCode         ierr;
 19:   PetscInt               i;
 20:   PetscMPIInt            rank;
 21:   PetscViewerFormat      format;
 22:   PetscTruth             iascii;

 25:   PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&iascii);
 26:   if (iascii) {
 27:     MPI_Comm_rank(((PetscObject)ctx)->comm,&rank);
 28:     PetscViewerGetFormat(viewer,&format);
 29:     if (format ==  PETSC_VIEWER_ASCII_INFO) {
 30:       PetscInt nsend_max,nrecv_max,lensend_max,lenrecv_max,alldata,itmp;

 32:       MPI_Reduce(&to->n,&nsend_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
 33:       MPI_Reduce(&from->n,&nrecv_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
 34:       itmp = to->starts[to->n];
 35:       MPI_Reduce(&itmp,&lensend_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
 36:       itmp = from->starts[from->n];
 37:       MPI_Reduce(&itmp,&lenrecv_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
 38:       MPI_Reduce(&itmp,&alldata,1,MPIU_INT,MPI_SUM,0,((PetscObject)ctx)->comm);

 40:       PetscViewerASCIIPrintf(viewer,"VecScatter statistics\n");
 41:       PetscViewerASCIIPrintf(viewer,"  Maximum number sends %D\n",nsend_max);
 42:       PetscViewerASCIIPrintf(viewer,"  Maximum number receives %D\n",nrecv_max);
 43:       PetscViewerASCIIPrintf(viewer,"  Maximum data sent %D\n",(PetscInt)(lensend_max*to->bs*sizeof(PetscScalar)));
 44:       PetscViewerASCIIPrintf(viewer,"  Maximum data received %D\n",(PetscInt)(lenrecv_max*to->bs*sizeof(PetscScalar)));
 45:       PetscViewerASCIIPrintf(viewer,"  Total data sent %D\n",(PetscInt)(alldata*to->bs*sizeof(PetscScalar)));

 47:     } else {
 48:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number sends = %D; Number to self = %D\n",rank,to->n,to->local.n);
 49:       if (to->n) {
 50:         for (i=0; i<to->n; i++){
 51:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]   %D length = %D to whom %D\n",rank,i,to->starts[i+1]-to->starts[i],to->procs[i]);
 52:         }
 53:         PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote sends (in order by process sent to)\n");
 54:         for (i=0; i<to->starts[to->n]; i++){
 55:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%D \n",rank,to->indices[i]);
 56:         }
 57:       }

 59:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Number receives = %D; Number from self = %D\n",rank,from->n,from->local.n);
 60:       if (from->n) {
 61:         for (i=0; i<from->n; i++){
 62:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D length %D from whom %D\n",rank,i,from->starts[i+1]-from->starts[i],from->procs[i]);
 63:         }

 65:         PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote receives (in order by process received from)\n");
 66:         for (i=0; i<from->starts[from->n]; i++){
 67:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%D \n",rank,from->indices[i]);
 68:         }
 69:       }
 70:       if (to->local.n) {
 71:         PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Indices for local part of scatter\n",rank);
 72:         for (i=0; i<to->local.n; i++){
 73:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]From %D to %D \n",rank,from->local.vslots[i],to->local.vslots[i]);
 74:         }
 75:       }

 77:       PetscViewerFlush(viewer);
 78:     }
 79:   } else {
 80:     SETERRQ1(PETSC_ERR_SUP,"Viewer type %s not supported for this scatter",((PetscObject)viewer)->type_name);
 81:   }
 82:   return(0);
 83: }

 85: /* -----------------------------------------------------------------------------------*/
 86: /*
 87:       The next routine determines what part of the local portion of the scatter is an
 88:   exact copy of values into their current location. We check this here and
 89:   then know that we need not perform that portion of the scatter when the vector is
 90:   scattering to itself with INSERT_VALUES.

 92:      This is currently not used, but it would speed up, for example, DALocalToLocalBegin/End().

 94: */
 97: PetscErrorCode VecScatterLocalOptimize_Private(VecScatter scatter,VecScatter_Seq_General *to,VecScatter_Seq_General *from)
 98: {
 99:   PetscInt       n = to->n,n_nonmatching = 0,i,*to_slots = to->vslots,*from_slots = from->vslots;
101:   PetscInt       *nto_slots,*nfrom_slots,j = 0;
102: 
104:   for (i=0; i<n; i++) {
105:     if (to_slots[i] != from_slots[i]) n_nonmatching++;
106:   }

108:   if (!n_nonmatching) {
109:     to->nonmatching_computed = PETSC_TRUE;
110:     to->n_nonmatching        = from->n_nonmatching = 0;
111:     PetscInfo1(scatter,"Reduced %D to 0\n", n);
112:   } else if (n_nonmatching == n) {
113:     to->nonmatching_computed = PETSC_FALSE;
114:     PetscInfo(scatter,"All values non-matching\n");
115:   } else {
116:     to->nonmatching_computed= PETSC_TRUE;
117:     to->n_nonmatching       = from->n_nonmatching = n_nonmatching;
118:     PetscMalloc(n_nonmatching*sizeof(PetscInt),&nto_slots);
119:     PetscMalloc(n_nonmatching*sizeof(PetscInt),&nfrom_slots);
120:     to->slots_nonmatching   = nto_slots;
121:     from->slots_nonmatching = nfrom_slots;
122:     for (i=0; i<n; i++) {
123:       if (to_slots[i] != from_slots[i]) {
124:         nto_slots[j]   = to_slots[i];
125:         nfrom_slots[j] = from_slots[i];
126:         j++;
127:       }
128:     }
129:     PetscInfo2(scatter,"Reduced %D to %D\n",n,n_nonmatching);
130:   }
131:   return(0);
132: }
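
/*
   A small worked example of the optimization above (slot values are made up, bs = 1): with

      to_slots   = {0, 1, 5, 7}    from_slots = {0, 1, 3, 7}

   entries 0, 1 and 3 already sit in their final locations, so only the single pair
   3 -> 5 is kept: n_nonmatching = 1, slots_nonmatching = {5} (to) and {3} (from),
   and an INSERT_VALUES self-scatter can skip the three matching entries.
*/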

134: /* --------------------------------------------------------------------------------------*/

136: /* -------------------------------------------------------------------------------------*/
139: PetscErrorCode VecScatterDestroy_PtoP(VecScatter ctx)
140: {
141:   VecScatter_MPI_General *to   = (VecScatter_MPI_General*)ctx->todata;
142:   VecScatter_MPI_General *from = (VecScatter_MPI_General*)ctx->fromdata;
143:   PetscErrorCode         ierr;
144:   PetscInt               i;

147:   if (to->use_readyreceiver) {
148:     /*
 149:        Since the receives have already been posted (started) we must cancel them before freeing
 150:        the requests
151:     */
152:     for (i=0; i<from->n; i++) {
153:       MPI_Cancel(from->requests+i);
154:     }
155:     for (i=0; i<to->n; i++) {
156:       MPI_Cancel(to->rev_requests+i);
157:     }
158:   }

160: #if defined(PETSC_HAVE_MPI_ALLTOALLW) && !defined(PETSC_USE_64BIT_INDICES)
161:   if (to->use_alltoallw) {
162:     PetscFree3(to->wcounts,to->wdispls,to->types);
163:     PetscFree3(from->wcounts,from->wdispls,from->types);
164:   }
165: #endif

167: #if defined(PETSC_HAVE_MPI_WIN_CREATE)
168:   if (to->use_window) {
169:     MPI_Win_free(&from->window);
170:     MPI_Win_free(&to->window);
171:     PetscFree(from->winstarts);
172:     PetscFree(to->winstarts);
173:   }
174: #endif

176:   if (to->use_alltoallv) {
177:     PetscFree2(to->counts,to->displs);
178:     PetscFree2(from->counts,from->displs);
179:   }

181:   /* release MPI resources obtained with MPI_Send_init() and MPI_Recv_init() */
182:   /* 
 183:      IBM's PE version of MPI has a bug where freeing these requests corrupts
 184:      later message passing.
185:   */
186: #if !defined(PETSC_HAVE_BROKEN_REQUEST_FREE)
187:   if (!to->use_alltoallv && !to->use_window) {   /* currently the to->requests etc are ALWAYS allocated even if not used */
188:     if (to->requests) {
189:       for (i=0; i<to->n; i++) {
190:         MPI_Request_free(to->requests + i);
191:       }
192:     }
193:     if (to->rev_requests) {
194:       for (i=0; i<to->n; i++) {
195:         MPI_Request_free(to->rev_requests + i);
196:       }
197:     }
198:   }
199:   /*
 200:       MPICH could not properly cancel requests, thus with ready-receiver mode we
 201:     cannot free the requests. It may be fixed now; if not, put the following
 202:     code inside an if (!to->use_readyreceiver) { } block.
203:   */
204:   if (!to->use_alltoallv && !to->use_window) {    /* currently the from->requests etc are ALWAYS allocated even if not used */
205:     if (from->requests) {
206:       for (i=0; i<from->n; i++) {
207:         MPI_Request_free(from->requests + i);
208:       }
209:     }

211:     if (from->rev_requests) {
212:       for (i=0; i<from->n; i++) {
213:         MPI_Request_free(from->rev_requests + i);
214:       }
215:     }
216:   }
217: #endif

219:   PetscFree(to->local.vslots);
220:   PetscFree(from->local.vslots);
221:   PetscFree2(to->counts,to->displs);
222:   PetscFree2(from->counts,from->displs);
223:   PetscFree(to->local.slots_nonmatching);
224:   PetscFree(from->local.slots_nonmatching);
225:   PetscFree(to->rev_requests);
226:   PetscFree(from->rev_requests);
227:   PetscFree(to->requests);
228:   PetscFree(from->requests);
229:   PetscFree4(to->values,to->indices,to->starts,to->procs);
230:   PetscFree2(to->sstatus,to->rstatus);
231:   PetscFree4(from->values,from->indices,from->starts,from->procs);
232:   PetscFree(from);
233:   PetscFree(to);
234:   PetscHeaderDestroy(ctx);
235:   return(0);
236: }



240: /* --------------------------------------------------------------------------------------*/
241: /*
242:     Special optimization to see if the local part of the scatter is actually 
243:     a copy. The scatter routines call PetscMemcpy() instead.
244:  
245: */
248: PetscErrorCode VecScatterLocalOptimizeCopy_Private(VecScatter scatter,VecScatter_Seq_General *to,VecScatter_Seq_General *from,PetscInt bs)
249: {
250:   PetscInt       n = to->n,i,*to_slots = to->vslots,*from_slots = from->vslots;
251:   PetscInt       to_start,from_start;

255:   to_start   = to_slots[0];
256:   from_start = from_slots[0];

258:   for (i=1; i<n; i++) {
259:     to_start   += bs;
260:     from_start += bs;
261:     if (to_slots[i]   != to_start)   return(0);
262:     if (from_slots[i] != from_start) return(0);
263:   }
264:   to->is_copy       = PETSC_TRUE;
265:   to->copy_start    = to_slots[0];
266:   to->copy_length   = bs*sizeof(PetscScalar)*n;
267:   from->is_copy     = PETSC_TRUE;
268:   from->copy_start  = from_slots[0];
269:   from->copy_length = bs*sizeof(PetscScalar)*n;
270:   PetscInfo(scatter,"Local scatter is a copy, optimizing for it\n");
271:   return(0);
272: }
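
/*
   For example (made-up slots), with bs = 2 and

      to_slots   = {10, 12, 14}    from_slots = {0, 2, 4}

   both slot arrays advance by exactly bs at every step, so the local scatter is one
   contiguous copy of 3*2 scalars: copy_start = 10 (to) and 0 (from), and
   copy_length = 6*sizeof(PetscScalar).
*/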

274: /* --------------------------------------------------------------------------------------*/

278: PetscErrorCode VecScatterCopy_PtoP_X(VecScatter in,VecScatter out)
279: {
280:   VecScatter_MPI_General *in_to   = (VecScatter_MPI_General*)in->todata;
281:   VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
282:   PetscErrorCode         ierr;
283:   PetscInt               ny,bs = in_from->bs;

286:   out->begin     = in->begin;
287:   out->end       = in->end;
288:   out->copy      = in->copy;
289:   out->destroy   = in->destroy;
290:   out->view      = in->view;

292:   /* allocate entire send scatter context */
293:   PetscNewLog(out,VecScatter_MPI_General,&out_to);
294:   PetscNewLog(out,VecScatter_MPI_General,&out_from);

296:   ny                = in_to->starts[in_to->n];
297:   out_to->n         = in_to->n;
298:   out_to->type      = in_to->type;
299:   out_to->sendfirst = in_to->sendfirst;

301:   PetscMalloc(out_to->n*sizeof(MPI_Request),&out_to->requests);
302:   PetscMalloc4(bs*ny,PetscScalar,&out_to->values,ny,PetscInt,&out_to->indices,out_to->n+1,PetscInt,&out_to->starts,out_to->n,PetscMPIInt,&out_to->procs);
303:   PetscMalloc2(PetscMax(in_to->n,in_from->n),MPI_Status,&out_to->sstatus,PetscMax(in_to->n,in_from->n),MPI_Status,&out_to->rstatus);
304:   PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(PetscInt));
305:   PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(PetscInt));
306:   PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(PetscMPIInt));
307: 
308:   out->todata       = (void*)out_to;
309:   out_to->local.n   = in_to->local.n;
310:   out_to->local.nonmatching_computed = PETSC_FALSE;
311:   out_to->local.n_nonmatching        = 0;
312:   out_to->local.slots_nonmatching    = 0;
313:   if (in_to->local.n) {
314:     PetscMalloc(in_to->local.n*sizeof(PetscInt),&out_to->local.vslots);
315:     PetscMalloc(in_from->local.n*sizeof(PetscInt),&out_from->local.vslots);
316:     PetscMemcpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n*sizeof(PetscInt));
317:     PetscMemcpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n*sizeof(PetscInt));
318:   } else {
319:     out_to->local.vslots   = 0;
320:     out_from->local.vslots = 0;
321:   }

323:   /* allocate entire receive context */
324:   out_from->type      = in_from->type;
325:   ny                  = in_from->starts[in_from->n];
326:   out_from->n         = in_from->n;
327:   out_from->sendfirst = in_from->sendfirst;

329:   PetscMalloc(out_from->n*sizeof(MPI_Request),&out_from->requests);
330:   PetscMalloc4(ny*bs,PetscScalar,&out_from->values,ny,PetscInt,&out_from->indices,out_from->n+1,PetscInt,&out_from->starts,out_from->n,PetscMPIInt,&out_from->procs);
331:   PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(PetscInt));
332:   PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(PetscInt));
333:   PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(PetscMPIInt));
334:   out->fromdata       = (void*)out_from;
335:   out_from->local.n   = in_from->local.n;
336:   out_from->local.nonmatching_computed = PETSC_FALSE;
337:   out_from->local.n_nonmatching        = 0;
338:   out_from->local.slots_nonmatching    = 0;

340:   /* 
 341:       set up the request arrays for use with MPI_Send_init() and MPI_Recv_init()
342:   */
343:   {
344:     PetscMPIInt tag;
345:     MPI_Comm    comm;
346:     PetscInt    *sstarts = out_to->starts,  *rstarts = out_from->starts;
347:     PetscMPIInt *sprocs  = out_to->procs,   *rprocs  = out_from->procs;
348:     PetscInt    i;
349:     PetscTruth  flg;
350:     MPI_Request *swaits  = out_to->requests,*rwaits  = out_from->requests;
351:     MPI_Request *rev_swaits,*rev_rwaits;
352:     PetscScalar *Ssvalues = out_to->values, *Srvalues = out_from->values;

354:     PetscMalloc(in_to->n*sizeof(MPI_Request),&out_to->rev_requests);
355:     PetscMalloc(in_from->n*sizeof(MPI_Request),&out_from->rev_requests);

357:     rev_rwaits = out_to->rev_requests;
358:     rev_swaits = out_from->rev_requests;

360:     out_from->bs = out_to->bs = bs;
361:     tag     = ((PetscObject)out)->tag;
362:     comm    = ((PetscObject)out)->comm;

364:     /* Register the receives that you will use later (sends for scatter reverse) */
365:     for (i=0; i<out_from->n; i++) {
366:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
367:       MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rev_swaits+i);
368:     }

370:     flg  = PETSC_FALSE;
371:     PetscOptionsGetTruth(PETSC_NULL,"-vecscatter_rsend",&flg,PETSC_NULL);
372:     if (flg) {
373:       out_to->use_readyreceiver    = PETSC_TRUE;
374:       out_from->use_readyreceiver  = PETSC_TRUE;
375:       for (i=0; i<out_to->n; i++) {
376:         MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
377:       }
378:       if (out_from->n) {MPI_Startall_irecv(out_from->starts[out_from->n]*out_from->bs,out_from->n,out_from->requests);}
379:       MPI_Barrier(comm);
380:       PetscInfo(in,"Using VecScatter ready receiver mode\n");
381:     } else {
382:       out_to->use_readyreceiver    = PETSC_FALSE;
383:       out_from->use_readyreceiver  = PETSC_FALSE;
384:       flg                          = PETSC_FALSE;
385:       PetscOptionsGetTruth(PETSC_NULL,"-vecscatter_ssend",&flg,PETSC_NULL);
386:       if (flg) {
387:         PetscInfo(in,"Using VecScatter Ssend mode\n");
388:       }
389:       for (i=0; i<out_to->n; i++) {
390:         if (!flg) {
391:           MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
392:         } else {
393:           MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
394:         }
395:       }
396:     }
397:     /* Register receives for scatter reverse */
398:     for (i=0; i<out_to->n; i++) {
399:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,rev_rwaits+i);
400:     }
401:   }

403:   return(0);
404: }
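
/*
   The persistent requests registered above with MPI_Recv_init()/MPI_Send_init() are
   started and completed later (by the VecScatterBegin/End routines generated from
   vpscat.h). The general MPI pattern for persistent requests is roughly

      MPI_Startall(n,requests);           start the posted sends/receives
      ...                                 overlap other work here
      MPI_Waitall(n,requests,statuses);   complete them; the requests stay reusable
                                          until MPI_Request_free()
*/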

408: PetscErrorCode VecScatterCopy_PtoP_AllToAll(VecScatter in,VecScatter out)
409: {
410:   VecScatter_MPI_General *in_to   = (VecScatter_MPI_General*)in->todata;
411:   VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
412:   PetscErrorCode         ierr;
413:   PetscInt               ny,bs = in_from->bs;
414:   PetscMPIInt            size;

417:   MPI_Comm_size(((PetscObject)in)->comm,&size);
418:   out->begin     = in->begin;
419:   out->end       = in->end;
420:   out->copy      = in->copy;
421:   out->destroy   = in->destroy;
422:   out->view      = in->view;

424:   /* allocate entire send scatter context */
425:   PetscNewLog(out,VecScatter_MPI_General,&out_to);
426:   PetscNewLog(out,VecScatter_MPI_General,&out_from);

428:   ny                = in_to->starts[in_to->n];
429:   out_to->n         = in_to->n;
430:   out_to->type      = in_to->type;
431:   out_to->sendfirst = in_to->sendfirst;

433:   PetscMalloc(out_to->n*sizeof(MPI_Request),&out_to->requests);
434:   PetscMalloc4(bs*ny,PetscScalar,&out_to->values,ny,PetscInt,&out_to->indices,out_to->n+1,PetscInt,&out_to->starts,out_to->n,PetscMPIInt,&out_to->procs);
435:   PetscMalloc2(PetscMax(in_to->n,in_from->n),MPI_Status,&out_to->sstatus,PetscMax(in_to->n,in_from->n),MPI_Status,&out_to->rstatus);
436:   PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(PetscInt));
437:   PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(PetscInt));
438:   PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(PetscMPIInt));
439: 
440:   out->todata       = (void*)out_to;
441:   out_to->local.n   = in_to->local.n;
442:   out_to->local.nonmatching_computed = PETSC_FALSE;
443:   out_to->local.n_nonmatching        = 0;
444:   out_to->local.slots_nonmatching    = 0;
445:   if (in_to->local.n) {
446:     PetscMalloc(in_to->local.n*sizeof(PetscInt),&out_to->local.vslots);
447:     PetscMalloc(in_from->local.n*sizeof(PetscInt),&out_from->local.vslots);
448:     PetscMemcpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n*sizeof(PetscInt));
449:     PetscMemcpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n*sizeof(PetscInt));
450:   } else {
451:     out_to->local.vslots   = 0;
452:     out_from->local.vslots = 0;
453:   }

455:   /* allocate entire receive context */
456:   out_from->type      = in_from->type;
457:   ny                  = in_from->starts[in_from->n];
458:   out_from->n         = in_from->n;
459:   out_from->sendfirst = in_from->sendfirst;

461:   PetscMalloc(out_from->n*sizeof(MPI_Request),&out_from->requests);
462:   PetscMalloc4(ny*bs,PetscScalar,&out_from->values,ny,PetscInt,&out_from->indices,out_from->n+1,PetscInt,&out_from->starts,out_from->n,PetscMPIInt,&out_from->procs);
463:   PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(PetscInt));
464:   PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(PetscInt));
465:   PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(PetscMPIInt));
466:   out->fromdata       = (void*)out_from;
467:   out_from->local.n   = in_from->local.n;
468:   out_from->local.nonmatching_computed = PETSC_FALSE;
469:   out_from->local.n_nonmatching        = 0;
470:   out_from->local.slots_nonmatching    = 0;

472:   out_to->use_alltoallv = out_from->use_alltoallv = PETSC_TRUE;

474:   PetscMalloc2(size,PetscMPIInt,&out_to->counts,size,PetscMPIInt,&out_to->displs);
475:   PetscMemcpy(out_to->counts,in_to->counts,size*sizeof(PetscMPIInt));
476:   PetscMemcpy(out_to->displs,in_to->displs,size*sizeof(PetscMPIInt));

478:   PetscMalloc2(size,PetscMPIInt,&out_from->counts,size,PetscMPIInt,&out_from->displs);
479:   PetscMemcpy(out_from->counts,in_from->counts,size*sizeof(PetscMPIInt));
480:   PetscMemcpy(out_from->displs,in_from->displs,size*sizeof(PetscMPIInt));
481:   return(0);
482: }
483: /* --------------------------------------------------------------------------------------------------
 484:     Packs the message data into send buffers and unpacks it from receive buffers.

 486:     These routines could be generated automatically.

 488:     Fortran kernels etc. could be used.
489: */
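
/*
    A sketch of the generic form from which the fixed-block-size kernels below could be
    generated. The name Pack_bs is illustrative only; the hand-unrolled Pack_1() ... Pack_12()
    that follow are what the scatters actually use, since the compiler can fully unroll them.
*/
PETSC_STATIC_INLINE void Pack_bs(PetscInt n,PetscInt bs,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
{
  PetscInt i,k,idx;
  for (i=0; i<n; i++) {
    idx = *indicesx++;                     /* start of this block in x */
    for (k=0; k<bs; k++) y[k] = x[idx+k];  /* copy one block */
    y += bs;                               /* packed buffer advances by one block */
  }
}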
490: PETSC_STATIC_INLINE void Pack_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
491: {
492:   PetscInt i;
493:   for (i=0; i<n; i++) {
494:     y[i]  = x[indicesx[i]];
495:   }
496: }
497: PETSC_STATIC_INLINE void UnPack_1(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
498: {
499:   PetscInt i;
500:   switch (addv) {
501:   case INSERT_VALUES:
502:     for (i=0; i<n; i++) {
503:       y[indicesy[i]] = x[i];
504:     }
505:     break;
506:   case ADD_VALUES:
507:     for (i=0; i<n; i++) {
508:       y[indicesy[i]] += x[i];
509:     }
510:     break;
511: #if !defined(PETSC_USE_COMPLEX)
512:   case MAX_VALUES:
513:     for (i=0; i<n; i++) {
514:       y[indicesy[i]] = PetscMax(y[indicesy[i]],x[i]);
515:     }
516: #else
517:   case MAX_VALUES:
518: #endif
519:   case NOT_SET_VALUES:
520:     break;
521:   }
522: }

524: PETSC_STATIC_INLINE void Scatter_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
525: {
526:   PetscInt i;
527:   switch (addv) {
528:   case INSERT_VALUES:
529:     for (i=0; i<n; i++) {
530:       y[indicesy[i]] = x[indicesx[i]];
531:     }
532:     break;
533:   case ADD_VALUES:
534:     for (i=0; i<n; i++) {
535:       y[indicesy[i]] += x[indicesx[i]];
536:     }
537:     break;
538: #if !defined(PETSC_USE_COMPLEX)
539:   case MAX_VALUES:
540:     for (i=0; i<n; i++) {
541:       y[indicesy[i]] = PetscMax(y[indicesy[i]],x[indicesx[i]]);
542:     }
543: #else
544:   case MAX_VALUES:
545: #endif
546:   case NOT_SET_VALUES:
547:     break;
548:   }
549: }

551:   /* ----------------------------------------------------------------------------------------------- */
552: PETSC_STATIC_INLINE void Pack_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
553: {
554:   PetscInt i,idx;

556:   for (i=0; i<n; i++) {
557:     idx   = *indicesx++;
558:     y[0]  = x[idx];
559:     y[1]  = x[idx+1];
560:     y    += 2;
561:   }
562: }
563: PETSC_STATIC_INLINE void UnPack_2(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
564: {
565:   PetscInt i,idy;

567:   switch (addv) {
568:   case INSERT_VALUES:
569:     for (i=0; i<n; i++) {
570:       idy       = *indicesy++;
571:       y[idy]    = x[0];
572:       y[idy+1]  = x[1];
573:       x    += 2;
574:     }
575:     break;
576:   case ADD_VALUES:
577:     for (i=0; i<n; i++) {
578:       idy       = *indicesy++;
579:       y[idy]    += x[0];
580:       y[idy+1]  += x[1];
581:       x    += 2;
582:     }
583:     break;
584: #if !defined(PETSC_USE_COMPLEX)
585:   case MAX_VALUES:
586:     for (i=0; i<n; i++) {
587:       idy       = *indicesy++;
588:       y[idy]    = PetscMax(y[idy],x[0]);
589:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
590:       x    += 2;
591:     }
592: #else
593:   case MAX_VALUES:
594: #endif
595:   case NOT_SET_VALUES:
596:     break;
597:   }
598: }

600: PETSC_STATIC_INLINE void Scatter_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
601: {
602:   PetscInt i,idx,idy;

604:   switch (addv) {
605:   case INSERT_VALUES:
606:     for (i=0; i<n; i++) {
607:       idx       = *indicesx++;
608:       idy       = *indicesy++;
609:       y[idy]    = x[idx];
610:       y[idy+1]  = x[idx+1];
611:     }
612:     break;
613:   case ADD_VALUES:
614:     for (i=0; i<n; i++) {
615:       idx       = *indicesx++;
616:       idy       = *indicesy++;
617:       y[idy]    += x[idx];
618:       y[idy+1]  += x[idx+1];
619:     }
620:     break;
621: #if !defined(PETSC_USE_COMPLEX)
622:   case MAX_VALUES:
623:     for (i=0; i<n; i++) {
624:       idx       = *indicesx++;
625:       idy       = *indicesy++;
626:       y[idy]    = PetscMax(y[idy],x[idx]);
627:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
628:     }
629: #else
630:   case MAX_VALUES:
631: #endif
632:   case NOT_SET_VALUES:
633:     break;
634:   }
635: }
636:   /* ----------------------------------------------------------------------------------------------- */
637: PETSC_STATIC_INLINE void Pack_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
638: {
639:   PetscInt i,idx;

641:   for (i=0; i<n; i++) {
642:     idx   = *indicesx++;
643:     y[0]  = x[idx];
644:     y[1]  = x[idx+1];
645:     y[2]  = x[idx+2];
646:     y    += 3;
647:   }
648: }
649: PETSC_STATIC_INLINE void UnPack_3(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
650: {
651:   PetscInt i,idy;

653:   switch (addv) {
654:   case INSERT_VALUES:
655:     for (i=0; i<n; i++) {
656:       idy       = *indicesy++;
657:       y[idy]    = x[0];
658:       y[idy+1]  = x[1];
659:       y[idy+2]  = x[2];
660:       x    += 3;
661:     }
662:     break;
663:   case ADD_VALUES:
664:     for (i=0; i<n; i++) {
665:       idy       = *indicesy++;
666:       y[idy]    += x[0];
667:       y[idy+1]  += x[1];
668:       y[idy+2]  += x[2];
669:       x    += 3;
670:     }
671:     break;
672: #if !defined(PETSC_USE_COMPLEX)
673:   case MAX_VALUES:
674:     for (i=0; i<n; i++) {
675:       idy       = *indicesy++;
676:       y[idy]    = PetscMax(y[idy],x[0]);
677:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
678:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
679:       x    += 3;
680:     }
681: #else
682:   case MAX_VALUES:
683: #endif
684:   case NOT_SET_VALUES:
685:     break;
686:   }
687: }

689: PETSC_STATIC_INLINE void Scatter_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
690: {
691:   PetscInt i,idx,idy;

693:   switch (addv) {
694:   case INSERT_VALUES:
695:     for (i=0; i<n; i++) {
696:       idx       = *indicesx++;
697:       idy       = *indicesy++;
698:       y[idy]    = x[idx];
699:       y[idy+1]  = x[idx+1];
700:       y[idy+2]  = x[idx+2];
701:     }
702:     break;
703:   case ADD_VALUES:
704:     for (i=0; i<n; i++) {
705:       idx       = *indicesx++;
706:       idy       = *indicesy++;
707:       y[idy]    += x[idx];
708:       y[idy+1]  += x[idx+1];
709:       y[idy+2]  += x[idx+2];
710:     }
711:     break;
712: #if !defined(PETSC_USE_COMPLEX)
713:   case MAX_VALUES:
714:     for (i=0; i<n; i++) {
715:       idx       = *indicesx++;
716:       idy       = *indicesy++;
717:       y[idy]    = PetscMax(y[idy],x[idx]);
718:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
719:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
720:     }
721: #else
722:   case MAX_VALUES:
723: #endif
724:   case NOT_SET_VALUES:
725:     break;
726:   }
727: }
728:   /* ----------------------------------------------------------------------------------------------- */
729: PETSC_STATIC_INLINE void Pack_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
730: {
731:   PetscInt i,idx;

733:   for (i=0; i<n; i++) {
734:     idx   = *indicesx++;
735:     y[0]  = x[idx];
736:     y[1]  = x[idx+1];
737:     y[2]  = x[idx+2];
738:     y[3]  = x[idx+3];
739:     y    += 4;
740:   }
741: }
742: PETSC_STATIC_INLINE void UnPack_4(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
743: {
744:   PetscInt i,idy;

746:   switch (addv) {
747:   case INSERT_VALUES:
748:     for (i=0; i<n; i++) {
749:       idy       = *indicesy++;
750:       y[idy]    = x[0];
751:       y[idy+1]  = x[1];
752:       y[idy+2]  = x[2];
753:       y[idy+3]  = x[3];
754:       x    += 4;
755:     }
756:     break;
757:   case ADD_VALUES:
758:     for (i=0; i<n; i++) {
759:       idy       = *indicesy++;
760:       y[idy]    += x[0];
761:       y[idy+1]  += x[1];
762:       y[idy+2]  += x[2];
763:       y[idy+3]  += x[3];
764:       x    += 4;
765:     }
766:     break;
767: #if !defined(PETSC_USE_COMPLEX)
768:   case MAX_VALUES:
769:     for (i=0; i<n; i++) {
770:       idy       = *indicesy++;
771:       y[idy]    = PetscMax(y[idy],x[0]);
772:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
773:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
774:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
775:       x    += 4;
776:     }
777: #else
778:   case MAX_VALUES:
779: #endif
780:   case NOT_SET_VALUES:
781:     break;
782:   }
783: }

785: PETSC_STATIC_INLINE void Scatter_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
786: {
787:   PetscInt i,idx,idy;

789:   switch (addv) {
790:   case INSERT_VALUES:
791:     for (i=0; i<n; i++) {
792:       idx       = *indicesx++;
793:       idy       = *indicesy++;
794:       y[idy]    = x[idx];
795:       y[idy+1]  = x[idx+1];
796:       y[idy+2]  = x[idx+2];
797:       y[idy+3]  = x[idx+3];
798:     }
799:     break;
800:   case ADD_VALUES:
801:     for (i=0; i<n; i++) {
802:       idx       = *indicesx++;
803:       idy       = *indicesy++;
804:       y[idy]    += x[idx];
805:       y[idy+1]  += x[idx+1];
806:       y[idy+2]  += x[idx+2];
807:       y[idy+3]  += x[idx+3];
808:     }
809:     break;
810: #if !defined(PETSC_USE_COMPLEX)
811:   case MAX_VALUES:
812:     for (i=0; i<n; i++) {
813:       idx       = *indicesx++;
814:       idy       = *indicesy++;
815:       y[idy]    = PetscMax(y[idy],x[idx]);
816:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
817:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
818:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
819:     }
820: #else
821:   case MAX_VALUES:
822: #endif
823:   case NOT_SET_VALUES:
824:     break;
825:   }
826: }
827:   /* ----------------------------------------------------------------------------------------------- */
828: PETSC_STATIC_INLINE void Pack_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
829: {
830:   PetscInt i,idx;

832:   for (i=0; i<n; i++) {
833:     idx   = *indicesx++;
834:     y[0]  = x[idx];
835:     y[1]  = x[idx+1];
836:     y[2]  = x[idx+2];
837:     y[3]  = x[idx+3];
838:     y[4]  = x[idx+4];
839:     y    += 5;
840:   }
841: }
842: PETSC_STATIC_INLINE void UnPack_5(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
843: {
844:   PetscInt i,idy;

846:   switch (addv) {
847:   case INSERT_VALUES:
848:     for (i=0; i<n; i++) {
849:       idy       = *indicesy++;
850:       y[idy]    = x[0];
851:       y[idy+1]  = x[1];
852:       y[idy+2]  = x[2];
853:       y[idy+3]  = x[3];
854:       y[idy+4]  = x[4];
855:       x    += 5;
856:     }
857:     break;
858:   case ADD_VALUES:
859:     for (i=0; i<n; i++) {
860:       idy       = *indicesy++;
861:       y[idy]    += x[0];
862:       y[idy+1]  += x[1];
863:       y[idy+2]  += x[2];
864:       y[idy+3]  += x[3];
865:       y[idy+4]  += x[4];
866:       x    += 5;
867:     }
868:     break;
869: #if !defined(PETSC_USE_COMPLEX)
870:   case MAX_VALUES:
871:     for (i=0; i<n; i++) {
872:       idy       = *indicesy++;
873:       y[idy]    = PetscMax(y[idy],x[0]);
874:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
875:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
876:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
877:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
878:       x    += 5;
879:     }
880: #else
881:   case MAX_VALUES:
882: #endif
883:   case NOT_SET_VALUES:
884:     break;
885:   }
886: }

888: PETSC_STATIC_INLINE void Scatter_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
889: {
890:   PetscInt i,idx,idy;

892:   switch (addv) {
893:   case INSERT_VALUES:
894:     for (i=0; i<n; i++) {
895:       idx       = *indicesx++;
896:       idy       = *indicesy++;
897:       y[idy]    = x[idx];
898:       y[idy+1]  = x[idx+1];
899:       y[idy+2]  = x[idx+2];
900:       y[idy+3]  = x[idx+3];
901:       y[idy+4]  = x[idx+4];
902:     }
903:     break;
904:   case ADD_VALUES:
905:     for (i=0; i<n; i++) {
906:       idx       = *indicesx++;
907:       idy       = *indicesy++;
908:       y[idy]    += x[idx];
909:       y[idy+1]  += x[idx+1];
910:       y[idy+2]  += x[idx+2];
911:       y[idy+3]  += x[idx+3];
912:       y[idy+4]  += x[idx+4];
913:     }
914:     break;
915: #if !defined(PETSC_USE_COMPLEX)
916:   case MAX_VALUES:
917:     for (i=0; i<n; i++) {
918:       idx       = *indicesx++;
919:       idy       = *indicesy++;
920:       y[idy]    = PetscMax(y[idy],x[idx]);
921:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
922:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
923:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
924:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
925:     }
926: #else
927:   case MAX_VALUES:
928: #endif
929:   case NOT_SET_VALUES:
930:     break;
931:   }
932: }
933:   /* ----------------------------------------------------------------------------------------------- */
934: PETSC_STATIC_INLINE void Pack_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
935: {
936:   PetscInt i,idx;

938:   for (i=0; i<n; i++) {
939:     idx   = *indicesx++;
940:     y[0]  = x[idx];
941:     y[1]  = x[idx+1];
942:     y[2]  = x[idx+2];
943:     y[3]  = x[idx+3];
944:     y[4]  = x[idx+4];
945:     y[5]  = x[idx+5];
946:     y    += 6;
947:   }
948: }
949: PETSC_STATIC_INLINE void UnPack_6(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
950: {
951:   PetscInt i,idy;

953:   switch (addv) {
954:   case INSERT_VALUES:
955:     for (i=0; i<n; i++) {
956:       idy       = *indicesy++;
957:       y[idy]    = x[0];
958:       y[idy+1]  = x[1];
959:       y[idy+2]  = x[2];
960:       y[idy+3]  = x[3];
961:       y[idy+4]  = x[4];
962:       y[idy+5]  = x[5];
963:       x    += 6;
964:     }
965:     break;
966:   case ADD_VALUES:
967:     for (i=0; i<n; i++) {
968:       idy       = *indicesy++;
969:       y[idy]    += x[0];
970:       y[idy+1]  += x[1];
971:       y[idy+2]  += x[2];
972:       y[idy+3]  += x[3];
973:       y[idy+4]  += x[4];
974:       y[idy+5]  += x[5];
975:       x    += 6;
976:     }
977:     break;
978: #if !defined(PETSC_USE_COMPLEX)
979:   case MAX_VALUES:
980:     for (i=0; i<n; i++) {
981:       idy       = *indicesy++;
982:       y[idy]    = PetscMax(y[idy],x[0]);
983:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
984:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
985:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
986:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
987:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
988:       x    += 6;
989:     }
990: #else
991:   case MAX_VALUES:
992: #endif
993:   case NOT_SET_VALUES:
994:     break;
995:   }
996: }

998: PETSC_STATIC_INLINE void Scatter_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
999: {
1000:   PetscInt i,idx,idy;

1002:   switch (addv) {
1003:   case INSERT_VALUES:
1004:     for (i=0; i<n; i++) {
1005:       idx       = *indicesx++;
1006:       idy       = *indicesy++;
1007:       y[idy]    = x[idx];
1008:       y[idy+1]  = x[idx+1];
1009:       y[idy+2]  = x[idx+2];
1010:       y[idy+3]  = x[idx+3];
1011:       y[idy+4]  = x[idx+4];
1012:       y[idy+5]  = x[idx+5];
1013:     }
1014:     break;
1015:   case ADD_VALUES:
1016:     for (i=0; i<n; i++) {
1017:       idx       = *indicesx++;
1018:       idy       = *indicesy++;
1019:       y[idy]    += x[idx];
1020:       y[idy+1]  += x[idx+1];
1021:       y[idy+2]  += x[idx+2];
1022:       y[idy+3]  += x[idx+3];
1023:       y[idy+4]  += x[idx+4];
1024:       y[idy+5]  += x[idx+5];
1025:     }
1026:     break;
1027: #if !defined(PETSC_USE_COMPLEX)
1028:   case MAX_VALUES:
1029:     for (i=0; i<n; i++) {
1030:       idx       = *indicesx++;
1031:       idy       = *indicesy++;
1032:       y[idy]    = PetscMax(y[idy],x[idx]);
1033:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1034:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1035:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1036:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1037:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1038:     }
1039: #else
1040:   case MAX_VALUES:
1041: #endif
1042:   case NOT_SET_VALUES:
1043:     break;
1044:   }
1045: }
1046:   /* ----------------------------------------------------------------------------------------------- */
1047: PETSC_STATIC_INLINE void Pack_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1048: {
1049:   PetscInt i,idx;

1051:   for (i=0; i<n; i++) {
1052:     idx   = *indicesx++;
1053:     y[0]  = x[idx];
1054:     y[1]  = x[idx+1];
1055:     y[2]  = x[idx+2];
1056:     y[3]  = x[idx+3];
1057:     y[4]  = x[idx+4];
1058:     y[5]  = x[idx+5];
1059:     y[6]  = x[idx+6];
1060:     y    += 7;
1061:   }
1062: }
1063: PETSC_STATIC_INLINE void UnPack_7(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1064: {
1065:   PetscInt i,idy;

1067:   switch (addv) {
1068:   case INSERT_VALUES:
1069:     for (i=0; i<n; i++) {
1070:       idy       = *indicesy++;
1071:       y[idy]    = x[0];
1072:       y[idy+1]  = x[1];
1073:       y[idy+2]  = x[2];
1074:       y[idy+3]  = x[3];
1075:       y[idy+4]  = x[4];
1076:       y[idy+5]  = x[5];
1077:       y[idy+6]  = x[6];
1078:       x    += 7;
1079:     }
1080:     break;
1081:   case ADD_VALUES:
1082:     for (i=0; i<n; i++) {
1083:       idy       = *indicesy++;
1084:       y[idy]    += x[0];
1085:       y[idy+1]  += x[1];
1086:       y[idy+2]  += x[2];
1087:       y[idy+3]  += x[3];
1088:       y[idy+4]  += x[4];
1089:       y[idy+5]  += x[5];
1090:       y[idy+6]  += x[6];
1091:       x    += 7;
1092:     }
1093:     break;
1094: #if !defined(PETSC_USE_COMPLEX)
1095:   case MAX_VALUES:
1096:     for (i=0; i<n; i++) {
1097:       idy       = *indicesy++;
1098:       y[idy]    = PetscMax(y[idy],x[0]);
1099:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1100:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1101:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1102:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1103:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1104:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1105:       x    += 7;
1106:     }
1107: #else
1108:   case MAX_VALUES:
1109: #endif
1110:   case NOT_SET_VALUES:
1111:     break;
1112:   }
1113: }

1115: PETSC_STATIC_INLINE void Scatter_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1116: {
1117:   PetscInt i,idx,idy;

1119:   switch (addv) {
1120:   case INSERT_VALUES:
1121:     for (i=0; i<n; i++) {
1122:       idx       = *indicesx++;
1123:       idy       = *indicesy++;
1124:       y[idy]    = x[idx];
1125:       y[idy+1]  = x[idx+1];
1126:       y[idy+2]  = x[idx+2];
1127:       y[idy+3]  = x[idx+3];
1128:       y[idy+4]  = x[idx+4];
1129:       y[idy+5]  = x[idx+5];
1130:       y[idy+6]  = x[idx+6];
1131:     }
1132:     break;
1133:   case ADD_VALUES:
1134:     for (i=0; i<n; i++) {
1135:       idx       = *indicesx++;
1136:       idy       = *indicesy++;
1137:       y[idy]    += x[idx];
1138:       y[idy+1]  += x[idx+1];
1139:       y[idy+2]  += x[idx+2];
1140:       y[idy+3]  += x[idx+3];
1141:       y[idy+4]  += x[idx+4];
1142:       y[idy+5]  += x[idx+5];
1143:       y[idy+6]  += x[idx+6];
1144:     }
1145:     break;
1146: #if !defined(PETSC_USE_COMPLEX)
1147:   case MAX_VALUES:
1148:     for (i=0; i<n; i++) {
1149:       idx       = *indicesx++;
1150:       idy       = *indicesy++;
1151:       y[idy]    = PetscMax(y[idy],x[idx]);
1152:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1153:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1154:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1155:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1156:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1157:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1158:     }
1159: #else
1160:   case MAX_VALUES:
1161: #endif
1162:   case NOT_SET_VALUES:
1163:     break;
1164:   }
1165: }
1166:   /* ----------------------------------------------------------------------------------------------- */
1167: PETSC_STATIC_INLINE void Pack_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1168: {
1169:   PetscInt i,idx;

1171:   for (i=0; i<n; i++) {
1172:     idx   = *indicesx++;
1173:     y[0]  = x[idx];
1174:     y[1]  = x[idx+1];
1175:     y[2]  = x[idx+2];
1176:     y[3]  = x[idx+3];
1177:     y[4]  = x[idx+4];
1178:     y[5]  = x[idx+5];
1179:     y[6]  = x[idx+6];
1180:     y[7]  = x[idx+7];
1181:     y    += 8;
1182:   }
1183: }
1184: PETSC_STATIC_INLINE void UnPack_8(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1185: {
1186:   PetscInt i,idy;

1188:   switch (addv) {
1189:   case INSERT_VALUES:
1190:     for (i=0; i<n; i++) {
1191:       idy       = *indicesy++;
1192:       y[idy]    = x[0];
1193:       y[idy+1]  = x[1];
1194:       y[idy+2]  = x[2];
1195:       y[idy+3]  = x[3];
1196:       y[idy+4]  = x[4];
1197:       y[idy+5]  = x[5];
1198:       y[idy+6]  = x[6];
1199:       y[idy+7]  = x[7];
1200:       x    += 8;
1201:     }
1202:     break;
1203:   case ADD_VALUES:
1204:     for (i=0; i<n; i++) {
1205:       idy       = *indicesy++;
1206:       y[idy]    += x[0];
1207:       y[idy+1]  += x[1];
1208:       y[idy+2]  += x[2];
1209:       y[idy+3]  += x[3];
1210:       y[idy+4]  += x[4];
1211:       y[idy+5]  += x[5];
1212:       y[idy+6]  += x[6];
1213:       y[idy+7]  += x[7];
1214:       x    += 8;
1215:     }
1216:     break;
1217: #if !defined(PETSC_USE_COMPLEX)
1218:   case MAX_VALUES:
1219:     for (i=0; i<n; i++) {
1220:       idy       = *indicesy++;
1221:       y[idy]    = PetscMax(y[idy],x[0]);
1222:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1223:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1224:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1225:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1226:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1227:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1228:       y[idy+7]  = PetscMax(y[idy+7],x[7]);
1229:       x    += 8;
1230:     }
1231: #else
1232:   case MAX_VALUES:
1233: #endif
1234:   case NOT_SET_VALUES:
1235:     break;
1236:   }
1237: }

1239: PETSC_STATIC_INLINE void Scatter_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1240: {
1241:   PetscInt i,idx,idy;

1243:   switch (addv) {
1244:   case INSERT_VALUES:
1245:     for (i=0; i<n; i++) {
1246:       idx       = *indicesx++;
1247:       idy       = *indicesy++;
1248:       y[idy]    = x[idx];
1249:       y[idy+1]  = x[idx+1];
1250:       y[idy+2]  = x[idx+2];
1251:       y[idy+3]  = x[idx+3];
1252:       y[idy+4]  = x[idx+4];
1253:       y[idy+5]  = x[idx+5];
1254:       y[idy+6]  = x[idx+6];
1255:       y[idy+7]  = x[idx+7];
1256:     }
1257:     break;
1258:   case ADD_VALUES:
1259:     for (i=0; i<n; i++) {
1260:       idx       = *indicesx++;
1261:       idy       = *indicesy++;
1262:       y[idy]    += x[idx];
1263:       y[idy+1]  += x[idx+1];
1264:       y[idy+2]  += x[idx+2];
1265:       y[idy+3]  += x[idx+3];
1266:       y[idy+4]  += x[idx+4];
1267:       y[idy+5]  += x[idx+5];
1268:       y[idy+6]  += x[idx+6];
1269:       y[idy+7]  += x[idx+7];
1270:     }
1271:     break;
1272: #if !defined(PETSC_USE_COMPLEX)
1273:   case MAX_VALUES:
1274:     for (i=0; i<n; i++) {
1275:       idx       = *indicesx++;
1276:       idy       = *indicesy++;
1277:       y[idy]    = PetscMax(y[idy],x[idx]);
1278:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1279:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1280:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1281:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1282:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1283:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1284:       y[idy+7]  = PetscMax(y[idy+7],x[idx+7]);
1285:     }
1286: #else
1287:   case MAX_VALUES:
1288: #endif
1289:   case NOT_SET_VALUES:
1290:     break;
1291:   }
1292: }

1294:   /* ----------------------------------------------------------------------------------------------- */
1295: PETSC_STATIC_INLINE void Pack_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1296: {
1297:   PetscInt i,idx;

1299:   for (i=0; i<n; i++) {
1300:     idx   = *indicesx++;
1301:     y[0]  = x[idx];
1302:     y[1]  = x[idx+1];
1303:     y[2]  = x[idx+2];
1304:     y[3]  = x[idx+3];
1305:     y[4]  = x[idx+4];
1306:     y[5]  = x[idx+5];
1307:     y[6]  = x[idx+6];
1308:     y[7]  = x[idx+7];
1309:     y[8]  = x[idx+8];
1310:     y[9]  = x[idx+9];
1311:     y[10] = x[idx+10];
1312:     y[11] = x[idx+11];
1313:     y    += 12;
1314:   }
1315: }
1316: PETSC_STATIC_INLINE void UnPack_12(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1317: {
1318:   PetscInt i,idy;

1320:   switch (addv) {
1321:   case INSERT_VALUES:
1322:     for (i=0; i<n; i++) {
1323:       idy       = *indicesy++;
1324:       y[idy]    = x[0];
1325:       y[idy+1]  = x[1];
1326:       y[idy+2]  = x[2];
1327:       y[idy+3]  = x[3];
1328:       y[idy+4]  = x[4];
1329:       y[idy+5]  = x[5];
1330:       y[idy+6]  = x[6];
1331:       y[idy+7]  = x[7];
1332:       y[idy+8]  = x[8];
1333:       y[idy+9]  = x[9];
1334:       y[idy+10] = x[10];
1335:       y[idy+11] = x[11];
1336:       x    += 12;
1337:     }
1338:     break;
1339:   case ADD_VALUES:
1340:     for (i=0; i<n; i++) {
1341:       idy       = *indicesy++;
1342:       y[idy]    += x[0];
1343:       y[idy+1]  += x[1];
1344:       y[idy+2]  += x[2];
1345:       y[idy+3]  += x[3];
1346:       y[idy+4]  += x[4];
1347:       y[idy+5]  += x[5];
1348:       y[idy+6]  += x[6];
1349:       y[idy+7]  += x[7];
1350:       y[idy+8]  += x[8];
1351:       y[idy+9]  += x[9];
1352:       y[idy+10] += x[10];
1353:       y[idy+11] += x[11];
1354:       x    += 12;
1355:     }
1356:     break;
1357: #if !defined(PETSC_USE_COMPLEX)
1358:   case MAX_VALUES:
1359:     for (i=0; i<n; i++) {
1360:       idy       = *indicesy++;
1361:       y[idy]    = PetscMax(y[idy],x[0]);
1362:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1363:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1364:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1365:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1366:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1367:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1368:       y[idy+7]  = PetscMax(y[idy+7],x[7]);
1369:       y[idy+8]  = PetscMax(y[idy+8],x[8]);
1370:       y[idy+9]  = PetscMax(y[idy+9],x[9]);
1371:       y[idy+10] = PetscMax(y[idy+10],x[10]);
1372:       y[idy+11] = PetscMax(y[idy+11],x[11]);
1373:       x    += 12;
1374:     }
1375: #else
1376:   case MAX_VALUES:
1377: #endif
1378:   case NOT_SET_VALUES:
1379:     break;
1380:   }
1381: }

1383: PETSC_STATIC_INLINE void Scatter_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1384: {
1385:   PetscInt i,idx,idy;

1387:   switch (addv) {
1388:   case INSERT_VALUES:
1389:     for (i=0; i<n; i++) {
1390:       idx       = *indicesx++;
1391:       idy       = *indicesy++;
1392:       y[idy]    = x[idx];
1393:       y[idy+1]  = x[idx+1];
1394:       y[idy+2]  = x[idx+2];
1395:       y[idy+3]  = x[idx+3];
1396:       y[idy+4]  = x[idx+4];
1397:       y[idy+5]  = x[idx+5];
1398:       y[idy+6]  = x[idx+6];
1399:       y[idy+7]  = x[idx+7];
1400:       y[idy+8]  = x[idx+8];
1401:       y[idy+9]  = x[idx+9];
1402:       y[idy+10] = x[idx+10];
1403:       y[idy+11] = x[idx+11];
1404:     }
1405:     break;
1406:   case ADD_VALUES:
1407:     for (i=0; i<n; i++) {
1408:       idx       = *indicesx++;
1409:       idy       = *indicesy++;
1410:       y[idy]    += x[idx];
1411:       y[idy+1]  += x[idx+1];
1412:       y[idy+2]  += x[idx+2];
1413:       y[idy+3]  += x[idx+3];
1414:       y[idy+4]  += x[idx+4];
1415:       y[idy+5]  += x[idx+5];
1416:       y[idy+6]  += x[idx+6];
1417:       y[idy+7]  += x[idx+7];
1418:       y[idy+8]  += x[idx+8];
1419:       y[idy+9]  += x[idx+9];
1420:       y[idy+10] += x[idx+10];
1421:       y[idy+11] += x[idx+11];
1422:     }
1423:     break;
1424: #if !defined(PETSC_USE_COMPLEX)
1425:   case MAX_VALUES:
1426:     for (i=0; i<n; i++) {
1427:       idx       = *indicesx++;
1428:       idy       = *indicesy++;
1429:       y[idy]    = PetscMax(y[idy],x[idx]);
1430:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1431:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1432:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1433:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1434:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1435:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1436:       y[idy+7]  = PetscMax(y[idy+7],x[idx+7]);
1437:       y[idy+8]  = PetscMax(y[idy+8],x[idx+8]);
1438:       y[idy+9]  = PetscMax(y[idy+9],x[idx+9]);
1439:       y[idy+10] = PetscMax(y[idy+10],x[idx+10]);
1440:       y[idy+11] = PetscMax(y[idy+11],x[idx+11]);
1441:     }
1442: #else
1443:   case MAX_VALUES:
1444: #endif
1445:   case NOT_SET_VALUES:
1446:     break;
1447:   }
1448: }

1450: /* Create the VecScatterBegin/End routines for our chosen block sizes */
1451: #define BS 1
1452:  #include "../src/vec/vec/utils/vpscat.h"
1453: #define BS 2
1454:  #include "../src/vec/vec/utils/vpscat.h"
1455: #define BS 3
1456:  #include "../src/vec/vec/utils/vpscat.h"
1457: #define BS 4
1458:  #include "../src/vec/vec/utils/vpscat.h"
1459: #define BS 5
1460:  #include "../src/vec/vec/utils/vpscat.h"
1461: #define BS 6
1462:  #include "../src/vec/vec/utils/vpscat.h"
1463: #define BS 7
1464:  #include "../src/vec/vec/utils/vpscat.h"
1465: #define BS 8
1466:  #include "../src/vec/vec/utils/vpscat.h"
1467: #define BS 12
1468:  #include "../src/vec/vec/utils/vpscat.h"
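
/*
   Each inclusion above instantiates VecScatterBegin/End routines specialized to the current
   value of BS, the usual C token-pasting substitute for templates. Schematically (a sketch
   only, with illustrative macro names, not the actual contents of vpscat.h):

      #define CONCAT_(a,b)  a ## _ ## b
      #define CONCAT(a,b)   CONCAT_(a,b)
      PetscErrorCode CONCAT(VecScatterBegin,BS)(...)   generates VecScatterBegin_1, _2, ...
      ... body calls CONCAT(Pack,BS), CONCAT(UnPack,BS), CONCAT(Scatter,BS) ...
      #undef BS
*/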

1470: /* ==========================================================================================*/

1472: /*              create parallel to sequential scatter context                           */

1474: PetscErrorCode VecScatterCreateCommon_PtoS(VecScatter_MPI_General *,VecScatter_MPI_General *,VecScatter);

1478: /*@
1479:      VecScatterCreateLocal - Creates a VecScatter from a list of messages it must send and receive.

1481:      Collective on VecScatter

1483:    Input Parameters:
1484: +     ctx - the VecScatter context, obtained with VecScatterCreateEmpty()
1485: .     nsends - number of messages to send
1486: .     sendSizes - number of entries in each message to be sent
1487: .     sendProcs - processes to which the messages are sent
1488: .     sendIdx - indices from which the sent entries are obtained (in local, on-process numbering); this is one long array of size \sum_{i=0,i<nsends} sendSizes[i]
1489: .     nrecvs - number of receives to expect
1490: .     recvSizes - number of entries in each message to be received
1491: .     recvProcs - processes that are sending to me
1492: .     recvIdx - indices where received entries are to be put (in local, on-process numbering); this is one long array of size \sum_{i=0,i<nrecvs} recvSizes[i]
1493: -     bs - block size

1495:      Notes:  sendSizes[] and recvSizes[] cannot have any 0 entries. If you want to support having 0 entries you need
1496:       to change the code below to "compress out" the sendProcs[] and recvProcs[] entries that have 0 entries.

1498:        This probably does not handle sends to self properly; those should be removed from the counts
1499:       used in allocating space inside the from struct.

1501:   Level: intermediate

1503: @*/
1504: PetscErrorCode VecScatterCreateLocal(VecScatter ctx,PetscInt nsends,const PetscInt sendSizes[],const PetscInt sendProcs[],const PetscInt sendIdx[],PetscInt nrecvs,const PetscInt recvSizes[],const PetscInt recvProcs[],const PetscInt recvIdx[],PetscInt bs)
1505: {
1506:   VecScatter_MPI_General *from, *to;
1507:   PetscInt               sendSize, recvSize;
1508:   PetscInt               n, i;
1509:   PetscErrorCode         ierr;

1511:   /* allocate entire send scatter context */
1512:   PetscNewLog(ctx,VecScatter_MPI_General,&to);
1513:   to->n = nsends;
1514:   for(n = 0, sendSize = 0; n < to->n; n++) {sendSize += sendSizes[n];}
1515:   PetscMalloc(to->n*sizeof(MPI_Request),&to->requests);
1516:   PetscMalloc4(bs*sendSize,PetscScalar,&to->values,sendSize,PetscInt,&to->indices,to->n+1,PetscInt,&to->starts,to->n,PetscMPIInt,&to->procs);
1517:   PetscMalloc2(PetscMax(to->n,nrecvs),MPI_Status,&to->sstatus,PetscMax(to->n,nrecvs),MPI_Status,&to->rstatus);
1518:   to->starts[0] = 0;
1519:   for(n = 0; n < to->n; n++) {
1520:     if (sendSizes[n] <=0 ) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"sendSizes[n=%D] = %D cannot be less than 1",n,sendSizes[n]);
1521:     to->starts[n+1] = to->starts[n] + sendSizes[n];
1522:     to->procs[n] = sendProcs[n];
1523:     for(i = to->starts[n]; i < to->starts[n]+sendSizes[n]; i++) {
1524:       to->indices[i] = sendIdx[i];
1525:     }
1526:   }
1527:   ctx->todata = (void *) to;

1529:   /* allocate entire receive scatter context */
1530:   PetscNewLog(ctx,VecScatter_MPI_General,&from);
1531:   from->n = nrecvs;
1532:   for(n = 0, recvSize = 0; n < from->n; n++) {recvSize += recvSizes[n];}
1533:   PetscMalloc(from->n*sizeof(MPI_Request),&from->requests);
1534:   PetscMalloc4(bs*recvSize,PetscScalar,&from->values,recvSize,PetscInt,&from->indices,from->n+1,PetscInt,&from->starts,from->n,PetscMPIInt,&from->procs);
1535:   from->starts[0] = 0;
1536:   for(n = 0; n < from->n; n++) {
1537:     if (recvSizes[n] <=0 ) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"recvSizes[n=%D] = %D cannot be less than 1",n,recvSizes[n]);
1538:     from->starts[n+1] = from->starts[n] + recvSizes[n];
1539:     from->procs[n] = recvProcs[n];
1540:     for(i = from->starts[n]; i < from->starts[n]+recvSizes[n]; i++) {
1541:       from->indices[i] = recvIdx[i];
1542:     }
1543:   }
1544:   ctx->fromdata = (void *)from;

1546:   /* No local scatter optimization */
1547:   from->local.n      = 0;
1548:   from->local.vslots = 0;
1549:   to->local.n        = 0;
1550:   to->local.vslots   = 0;
1551:   from->local.nonmatching_computed = PETSC_FALSE;
1552:   from->local.n_nonmatching        = 0;
1553:   from->local.slots_nonmatching    = 0;
1554:   to->local.nonmatching_computed   = PETSC_FALSE;
1555:   to->local.n_nonmatching          = 0;
1556:   to->local.slots_nonmatching      = 0;

1558:   from->type = VEC_SCATTER_MPI_GENERAL;
1559:   to->type   = VEC_SCATTER_MPI_GENERAL;
1560:   from->bs = bs;
1561:   to->bs   = bs;
1562:   VecScatterCreateCommon_PtoS(from, to, ctx);
1563:   return(0);
1564: }
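
/*
   A minimal usage sketch (the sizes, processes and indices below are made up; ctx is assumed
   to come from VecScatterCreateEmpty() and this is rank 0 of a two-process run):

      PetscInt sendSizes[1] = {2}, sendProcs[1] = {1}, sendIdx[2] = {3,4};
      PetscInt recvSizes[1] = {2}, recvProcs[1] = {1}, recvIdx[2] = {0,1};
      ierr = VecScatterCreateLocal(ctx,1,sendSizes,sendProcs,sendIdx,
                                   1,recvSizes,recvProcs,recvIdx,1);CHKERRQ(ierr);
*/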

1566: /*
1567:    bs indicates how many elements there are in each block. Normally this would be 1.

1569:    Contains a check that PetscMPIInt can handle the sizes needed.
1570: */
1573: PetscErrorCode VecScatterCreate_PtoS(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
1574: {
1575:   VecScatter_MPI_General *from,*to;
1576:   PetscMPIInt            size,rank,imdex,tag,n;
1577:   PetscInt               *source = PETSC_NULL,*owners = PETSC_NULL;
1578:   PetscInt               *lowner = PETSC_NULL,*start = PETSC_NULL,lengthy,lengthx;
1579:   PetscMPIInt            *nprocs = PETSC_NULL,nrecvs;
1580:   PetscInt               i,j,idx,nsends;
1581:   PetscInt               *owner = PETSC_NULL,*starts = PETSC_NULL,count,slen;
1582:   PetscInt               *rvalues,*svalues,base,*values,nprocslocal,recvtotal,*rsvalues;
1583:   PetscMPIInt            *onodes1,*olengths1;
1584:   MPI_Comm               comm;
1585:   MPI_Request            *send_waits = PETSC_NULL,*recv_waits = PETSC_NULL;
1586:   MPI_Status             recv_status,*send_status;
1587:   PetscErrorCode         ierr;

1590:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
1591:   PetscObjectGetComm((PetscObject)xin,&comm);
1592:   MPI_Comm_rank(comm,&rank);
1593:   MPI_Comm_size(comm,&size);
1594:   owners = xin->map->range;
1595:   VecGetSize(yin,&lengthy);
1596:   VecGetSize(xin,&lengthx);

1598:   /*  first count number of contributors to each processor */
1599:   PetscMalloc2(size,PetscMPIInt,&nprocs,nx,PetscInt,&owner);
1600:   PetscMemzero(nprocs,size*sizeof(PetscMPIInt));
1601:   j      = 0;
1602:   nsends = 0;
1603:   for (i=0; i<nx; i++) {
1604:     idx = inidx[i];
1605:     if (idx < owners[j]) j = 0;
1606:     for (; j<size; j++) {
1607:       if (idx < owners[j+1]) {
1608:         if (!nprocs[j]++) nsends++;
1609:         owner[i] = j;
1610:         break;
1611:       }
1612:     }
1613:   }
1614:   nprocslocal  = nprocs[rank];
1615:   nprocs[rank] = 0;
1616:   if (nprocslocal) nsends--;
1617:   /* inform other processors of number of messages and max length */
1618:   PetscGatherNumberOfMessages(comm,PETSC_NULL,nprocs,&nrecvs);
1619:   PetscGatherMessageLengths(comm,nsends,nrecvs,nprocs,&onodes1,&olengths1);
1620:   PetscSortMPIIntWithArray(nrecvs,onodes1,olengths1);
1621:   recvtotal = 0; for (i=0; i<nrecvs; i++) recvtotal += olengths1[i];

1623:   /* post receives:   */
1624:   PetscMalloc3(recvtotal,PetscInt,&rvalues,nrecvs,PetscInt,&source,nrecvs,MPI_Request,&recv_waits);
1625:   count  = 0;
1626:   for (i=0; i<nrecvs; i++) {
1627:     MPI_Irecv((rvalues+count),olengths1[i],MPIU_INT,onodes1[i],tag,comm,recv_waits+i);
1628:     count += olengths1[i];
1629:   }

1631:   /* do sends:
1632:      1) starts[i] gives the starting index in svalues for stuff going to 
1633:      the ith processor
1634:   */
1635:   PetscMalloc3(nx,PetscInt,&svalues,nsends,MPI_Request,&send_waits,size+1,PetscInt,&starts);
1636:   starts[0]  = 0;
1637:   for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
1638:   for (i=0; i<nx; i++) {
1639:     if (owner[i] != rank) {
1640:       svalues[starts[owner[i]]++] = inidx[i];
1641:     }
1642:   }

1644:   starts[0] = 0;
1645:   for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
1646:   count = 0;
1647:   for (i=0; i<size; i++) {
1648:     if (nprocs[i]) {
1649:       MPI_Isend(svalues+starts[i],nprocs[i],MPIU_INT,i,tag,comm,send_waits+count++);
1650:     }
1651:   }

1653:   /*  wait on receives */
1654:   count  = nrecvs;
1655:   slen   = 0;
1656:   while (count) {
1657:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
1658:     /* unpack receives into our local space */
1659:     MPI_Get_count(&recv_status,MPIU_INT,&n);
1660:     slen += n;
1661:     count--;
1662:   }

1664:   if (slen != recvtotal) SETERRQ2(PETSC_ERR_PLIB,"Total message lengths %D not as expected %D",slen,recvtotal);
1665: 
1666:   /* allocate entire send scatter context */
1667:   PetscNewLog(ctx,VecScatter_MPI_General,&to);
1668:   to->n = nrecvs;
1669:   PetscMalloc(nrecvs*sizeof(MPI_Request),&to->requests);
1670:   PetscMalloc4(bs*slen,PetscScalar,&to->values,slen,PetscInt,&to->indices,nrecvs+1,PetscInt,&to->starts,nrecvs,PetscMPIInt,&to->procs);
1671:   PetscMalloc2(PetscMax(to->n,nsends),MPI_Status,&to->sstatus,PetscMax(to->n,nsends),MPI_Status,&to->rstatus);
1672:   ctx->todata   = (void*)to;
1673:   to->starts[0] = 0;

1675:   if (nrecvs) {

1677:     /* move the data into the send scatter */
1678:     base     = owners[rank];
1679:     rsvalues = rvalues;
1680:     for (i=0; i<nrecvs; i++) {
1681:       to->starts[i+1] = to->starts[i] + olengths1[i];
1682:       to->procs[i]    = onodes1[i];
1683:       values = rsvalues;
1684:       rsvalues += olengths1[i];
1685:       for (j=0; j<olengths1[i]; j++) {
1686:         to->indices[to->starts[i] + j] = values[j] - base;
1687:       }
1688:     }
1689:   }
1690:   PetscFree(olengths1);
1691:   PetscFree(onodes1);
1692:   PetscFree3(rvalues,source,recv_waits);

1694:   /* allocate entire receive scatter context */
1695:   PetscNewLog(ctx,VecScatter_MPI_General,&from);
1696:   from->n = nsends;

1698:   PetscMalloc(nsends*sizeof(MPI_Request),&from->requests);
1699:   PetscMalloc4((ny-nprocslocal)*bs,PetscScalar,&from->values,ny-nprocslocal,PetscInt,&from->indices,nsends+1,PetscInt,&from->starts,from->n,PetscMPIInt,&from->procs);
1700:   ctx->fromdata  = (void*)from;

1702:   /* move data into receive scatter */
1703:   PetscMalloc2(size,PetscInt,&lowner,nsends+1,PetscInt,&start);
1704:   count = 0; from->starts[0] = start[0] = 0;
1705:   for (i=0; i<size; i++) {
1706:     if (nprocs[i]) {
1707:       lowner[i]            = count;
1708:       from->procs[count++] = i;
1709:       from->starts[count]  = start[count] = start[count-1] + nprocs[i];
1710:     }
1711:   }

1713:   for (i=0; i<nx; i++) {
1714:     if (owner[i] != rank) {
1715:       from->indices[start[lowner[owner[i]]]++] = inidy[i];
1716:       if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
1717:     }
1718:   }
1719:   PetscFree2(lowner,start);
1720:   PetscFree2(nprocs,owner);
1721: 
1722:   /* wait on sends */
1723:   if (nsends) {
1724:     PetscMalloc(nsends*sizeof(MPI_Status),&send_status);
1725:     MPI_Waitall(nsends,send_waits,send_status);
1726:     PetscFree(send_status);
1727:   }
1728:   PetscFree3(svalues,send_waits,starts);

1730:   if (nprocslocal) {
1731:     PetscInt nt = from->local.n = to->local.n = nprocslocal;
1732:     /* we have a scatter to ourselves */
1733:     PetscMalloc(nt*sizeof(PetscInt),&to->local.vslots);
1734:     PetscMalloc(nt*sizeof(PetscInt),&from->local.vslots);
1735:     nt   = 0;
1736:     for (i=0; i<nx; i++) {
1737:       idx = inidx[i];
1738:       if (idx >= owners[rank] && idx < owners[rank+1]) {
1739:         to->local.vslots[nt]     = idx - owners[rank];
1740:         from->local.vslots[nt++] = inidy[i];
1741:         if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
1742:       }
1743:     }
1744:   } else {
1745:     from->local.n      = 0;
1746:     from->local.vslots = 0;
1747:     to->local.n        = 0;
1748:     to->local.vslots   = 0;
1749:   }

1751:   from->local.nonmatching_computed = PETSC_FALSE;
1752:   from->local.n_nonmatching        = 0;
1753:   from->local.slots_nonmatching    = 0;
1754:   to->local.nonmatching_computed   = PETSC_FALSE;
1755:   to->local.n_nonmatching          = 0;
1756:   to->local.slots_nonmatching      = 0;

1758:   from->type = VEC_SCATTER_MPI_GENERAL;
1759:   to->type   = VEC_SCATTER_MPI_GENERAL;
1760:   from->bs = bs;
1761:   to->bs   = bs;
1762:   VecScatterCreateCommon_PtoS(from,to,ctx);
1763:   return(0);
1764: }
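
/*
   Illustrative sketch, not part of vpscat.c: in VecScatterCreate_PtoS() each process at
   first knows only how many indices it will send to every other process (nprocs[]); the
   calls to PetscGatherNumberOfMessages() and PetscGatherMessageLengths() then tell it how
   many messages it will receive and how long each one is.  The standalone program below
   exchanges the same information with a plain MPI_Alltoall of the per-destination counts.
   PETSc's helpers avoid a full all-to-all, so this only shows what information the ranks
   agree on, not how PETSc computes it; the toy send pattern is invented for the example.
*/
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc,char **argv)
{
  int rank,size,j,nrecvs = 0,recvtotal = 0,*sendcounts,*recvcounts;

  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);

  sendcounts = (int*)calloc(size,sizeof(int));
  recvcounts = (int*)malloc(size*sizeof(int));

  /* toy pattern: every rank sends rank+1 entries to its right neighbour */
  sendcounts[(rank+1)%size] = rank+1;

  /* after the all-to-all, recvcounts[j] is the number of entries rank j will send to me */
  MPI_Alltoall(sendcounts,1,MPI_INT,recvcounts,1,MPI_INT,MPI_COMM_WORLD);

  for (j=0; j<size; j++) {
    if (recvcounts[j]) { nrecvs++; recvtotal += recvcounts[j]; }
  }
  printf("[%d] expecting %d message(s), %d entries in total\n",rank,nrecvs,recvtotal);

  free(sendcounts);
  free(recvcounts);
  MPI_Finalize();
  return 0;
}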

1766: /*
1767:    bs indicates how many elements there are in each block. Normally this would be 1.
1768: */
1771: PetscErrorCode VecScatterCreateCommon_PtoS(VecScatter_MPI_General *from,VecScatter_MPI_General *to,VecScatter ctx)
1772: {
1773:   MPI_Comm       comm = ((PetscObject)ctx)->comm;
1774:   PetscMPIInt    tag  = ((PetscObject)ctx)->tag, tagr;
1775:   PetscInt       bs   = to->bs;
1776:   PetscMPIInt    size;
1777:   PetscInt       i, n;
1779: 
1781:   PetscObjectGetNewTag((PetscObject)ctx,&tagr);
1782:   ctx->destroy = VecScatterDestroy_PtoP;

1784:   ctx->reproduce = PETSC_FALSE;
1785:   to->sendfirst  = PETSC_FALSE;
1786:   PetscOptionsGetTruth(PETSC_NULL,"-vecscatter_reproduce",&ctx->reproduce,PETSC_NULL);
1787:   PetscOptionsGetTruth(PETSC_NULL,"-vecscatter_sendfirst",&to->sendfirst,PETSC_NULL);
1788:   from->sendfirst = to->sendfirst;

1790:   MPI_Comm_size(comm,&size);
1791:   /* check if the receives are ALL going into contiguous locations; if so can skip indexing */
1792:   to->contiq = PETSC_FALSE;
1793:   n = from->starts[from->n];
1794:   from->contiq = PETSC_TRUE;
1795:   for (i=1; i<n; i++) {
1796:     if (from->indices[i] != from->indices[i-1] + bs) {
1797:       from->contiq = PETSC_FALSE;
1798:       break;
1799:     }
1800:   }

1802:   to->use_alltoallv = PETSC_FALSE;
1803:   PetscOptionsGetTruth(PETSC_NULL,"-vecscatter_alltoall",&to->use_alltoallv,PETSC_NULL);
1804:   from->use_alltoallv = to->use_alltoallv;
1805:   if (from->use_alltoallv) PetscInfo(ctx,"Using MPI_Alltoallv() for scatter\n");
1806: #if defined(PETSC_HAVE_MPI_ALLTOALLW)  && !defined(PETSC_USE_64BIT_INDICES)
1807:   if (to->use_alltoallv) {
1808:     to->use_alltoallw = PETSC_FALSE;
1809:     PetscOptionsGetTruth(PETSC_NULL,"-vecscatter_nopack",&to->use_alltoallw,PETSC_NULL);
1810:   }
1811:   from->use_alltoallw = to->use_alltoallw;
1812:   if (from->use_alltoallw) PetscInfo(ctx,"Using MPI_Alltoallw() for scatter\n");
1813: #endif

1815: #if defined(PETSC_HAVE_MPI_WIN_CREATE)
1816:   to->use_window = PETSC_FALSE;
1817:   PetscOptionsGetTruth(PETSC_NULL,"-vecscatter_window",&to->use_window,PETSC_NULL);
1818:   from->use_window = to->use_window;
1819: #endif

1821:   if (to->use_alltoallv) {

1823:     PetscMalloc2(size,PetscMPIInt,&to->counts,size,PetscMPIInt,&to->displs);
1824:     PetscMemzero(to->counts,size*sizeof(PetscMPIInt));
1825:     for (i=0; i<to->n; i++) {
1826:       to->counts[to->procs[i]] = bs*(to->starts[i+1] - to->starts[i]);
1827:     }
1828:     to->displs[0] = 0;
1829:     for (i=1; i<size; i++) {
1830:       to->displs[i] = to->displs[i-1] + to->counts[i-1];
1831:     }

1833:     PetscMalloc2(size,PetscMPIInt,&from->counts,size,PetscMPIInt,&from->displs);
1834:     PetscMemzero(from->counts,size*sizeof(PetscMPIInt));
1835:     for (i=0; i<from->n; i++) {
1836:       from->counts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);
1837:     }
1838:     from->displs[0] = 0;
1839:     for (i=1; i<size; i++) {
1840:       from->displs[i] = from->displs[i-1] + from->counts[i-1];
1841:     }
1842: #if defined(PETSC_HAVE_MPI_ALLTOALLW) && !defined(PETSC_USE_64BIT_INDICES)
1843:     if (to->use_alltoallw) {
1844:       PetscMPIInt mpibs = PetscMPIIntCast(bs), mpilen;
1845:       ctx->packtogether = PETSC_FALSE;
1846:       PetscMalloc3(size,PetscMPIInt,&to->wcounts,size,PetscMPIInt,&to->wdispls,size,MPI_Datatype,&to->types);
1847:       PetscMemzero(to->wcounts,size*sizeof(PetscMPIInt));
1848:       PetscMemzero(to->wdispls,size*sizeof(PetscMPIInt));
1849:       for (i=0; i<size; i++) {
1850:         to->types[i] = MPIU_SCALAR;
1851:       }

1853:       for (i=0; i<to->n; i++) {
1854:         to->wcounts[to->procs[i]] = 1;
1855:         mpilen = PetscMPIIntCast(to->starts[i+1]-to->starts[i]);
1856:         MPI_Type_create_indexed_block(mpilen,mpibs,to->indices+to->starts[i],MPIU_SCALAR,to->types+to->procs[i]);
1857:         MPI_Type_commit(to->types+to->procs[i]);
1858:       }
1859:       PetscMalloc3(size,PetscMPIInt,&from->wcounts,size,PetscMPIInt,&from->wdispls,size,MPI_Datatype,&from->types);
1860:       PetscMemzero(from->wcounts,size*sizeof(PetscMPIInt));
1861:       PetscMemzero(from->wdispls,size*sizeof(PetscMPIInt));
1862:       for (i=0; i<size; i++) {
1863:         from->types[i] = MPIU_SCALAR;
1864:       }
1865:       if (from->contiq) {
1866:         PetscInfo(ctx,"Scattered vector entries are stored contiguously, taking advantage of this with -vecscatter_alltoall\n");
1867:         for (i=0; i<from->n; i++) {
1868:           from->wcounts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);
1869:         }
1870:         if (from->n) from->wdispls[from->procs[0]] = sizeof(PetscScalar)*from->indices[0];
1871:         for (i=1; i<from->n; i++) {
1872:           from->wdispls[from->procs[i]] = from->wdispls[from->procs[i-1]] + sizeof(PetscScalar)*from->wcounts[from->procs[i-1]];
1873:         }
1874:       } else {
1875:         for (i=0; i<from->n; i++) {
1876:           from->wcounts[from->procs[i]] = 1;
1877:           mpilen = PetscMPIIntCast(from->starts[i+1]-from->starts[i]);
1878:           MPI_Type_create_indexed_block(mpilen,mpibs,from->indices+from->starts[i],MPIU_SCALAR,from->types+from->procs[i]);
1879:           MPI_Type_commit(from->types+from->procs[i]);
1880:         }
1881:       }
1882:     } else {
1883:       ctx->copy = VecScatterCopy_PtoP_AllToAll;
1884:     }
1885: #else 
1886:     to->use_alltoallw   = PETSC_FALSE;
1887:     from->use_alltoallw = PETSC_FALSE;
1888:     ctx->copy           = VecScatterCopy_PtoP_AllToAll;
1889: #endif
1890: #if defined(PETSC_HAVE_MPI_WIN_CREATE)
1891:   } else if (to->use_window) {
1892:     PetscMPIInt temptag,winsize;
1893:     MPI_Request *request;
1894:     MPI_Status  *status;
1895: 
1896:     PetscObjectGetNewTag((PetscObject)ctx,&temptag);
1897:     winsize = (to->n ? to->starts[to->n] : 0)*bs*sizeof(PetscScalar);
1898:     MPI_Win_create(to->values ? to->values : MPI_BOTTOM,winsize,sizeof(PetscScalar),MPI_INFO_NULL,comm,&to->window);
1899:     PetscMalloc(to->n*sizeof(PetscInt),&to->winstarts);
1900:     PetscMalloc2(to->n,MPI_Request,&request,to->n,MPI_Status,&status);
1901:     for (i=0; i<to->n; i++) {
1902:       MPI_Irecv(to->winstarts+i,1,MPIU_INT,to->procs[i],temptag,comm,request+i);
1903:     }
1904:     for (i=0; i<from->n; i++) {
1905:       MPI_Send(from->starts+i,1,MPIU_INT,from->procs[i],temptag,comm);
1906:     }
1907:     MPI_Waitall(to->n,request,status);
1908:     PetscFree2(request,status);

1910:     winsize = (from->n ? from->starts[from->n] : 0)*bs*sizeof(PetscScalar);
1911:     MPI_Win_create(from->values ? from->values : MPI_BOTTOM,winsize,sizeof(PetscScalar),MPI_INFO_NULL,comm,&from->window);
1912:     PetscMalloc(from->n*sizeof(PetscInt),&from->winstarts);
1913:     PetscMalloc2(from->n,MPI_Request,&request,from->n,MPI_Status,&status);
1914:     for (i=0; i<from->n; i++) {
1915:       MPI_Irecv(from->winstarts+i,1,MPIU_INT,from->procs[i],temptag,comm,request+i);
1916:     }
1917:     for (i=0; i<to->n; i++) {
1918:       MPI_Send(to->starts+i,1,MPIU_INT,to->procs[i],temptag,comm);
1919:     }
1920:     MPI_Waitall(from->n,request,status);
1921:     PetscFree2(request,status);
1922: #endif
1923:   } else {
1924:     PetscTruth  use_rsend = PETSC_FALSE, use_ssend = PETSC_FALSE;
1925:     PetscInt    *sstarts = to->starts,  *rstarts = from->starts;
1926:     PetscMPIInt *sprocs  = to->procs,   *rprocs  = from->procs;
1927:     MPI_Request *swaits  = to->requests,*rwaits  = from->requests;
1928:     MPI_Request *rev_swaits,*rev_rwaits;
1929:     PetscScalar *Ssvalues = to->values, *Srvalues = from->values;

1931:     /* allocate additional wait variables for the "reverse" scatter */
1932:     PetscMalloc(to->n*sizeof(MPI_Request),&rev_rwaits);
1933:     PetscMalloc(from->n*sizeof(MPI_Request),&rev_swaits);
1934:     to->rev_requests   = rev_rwaits;
1935:     from->rev_requests = rev_swaits;

1937:     /* Register the receives that you will use later (sends for scatter reverse) */
1938:     PetscOptionsGetTruth(PETSC_NULL,"-vecscatter_rsend",&use_rsend,PETSC_NULL);
1939:     PetscOptionsGetTruth(PETSC_NULL,"-vecscatter_ssend",&use_ssend,PETSC_NULL);
1940:     if (use_rsend) {
1941:       PetscInfo(ctx,"Using VecScatter ready receiver mode\n");
1942:       to->use_readyreceiver    = PETSC_TRUE;
1943:       from->use_readyreceiver  = PETSC_TRUE;
1944:     } else {
1945:       to->use_readyreceiver    = PETSC_FALSE;
1946:       from->use_readyreceiver  = PETSC_FALSE;
1947:     }
1948:     if (use_ssend) {
1949:       PetscInfo(ctx,"Using VecScatter Ssend mode\n");
1950:     }

1952:     for (i=0; i<from->n; i++) {
1953:       if (use_rsend) {
1954:         MPI_Rsend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
1955:       } else if (use_ssend) {
1956:         MPI_Ssend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
1957:       } else {
1958:         MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
1959:       }
1960:     }

1962:     for (i=0; i<to->n; i++) {
1963:       if (use_rsend) {
1964:         MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1965:       } else if (use_ssend) {
1966:         MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1967:       } else {
1968:         MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1969:       }
1970:     }
1971:     /* Register receives for scatter and reverse */
1972:     for (i=0; i<from->n; i++) {
1973:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
1974:     }
1975:     for (i=0; i<to->n; i++) {
1976:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tagr,comm,rev_rwaits+i);
1977:     }
1978:     if (use_rsend) {
1979:       if (to->n)   {MPI_Startall_irecv(to->starts[to->n]*to->bs,to->n,to->rev_requests);}
1980:       if (from->n) {MPI_Startall_irecv(from->starts[from->n]*from->bs,from->n,from->requests);}
1981:       MPI_Barrier(comm);
1982:     }

1984:     ctx->copy      = VecScatterCopy_PtoP_X;
1985:   }
1986:   PetscInfo1(ctx,"Using blocksize %D scatter\n",bs);
1987:   switch (bs) {
1988:   case 12:
1989:     ctx->begin     = VecScatterBegin_12;
1990:     ctx->end       = VecScatterEnd_12;
1991:     break;
1992:   case 8:
1993:     ctx->begin     = VecScatterBegin_8;
1994:     ctx->end       = VecScatterEnd_8;
1995:     break;
1996:   case 7:
1997:     ctx->begin     = VecScatterBegin_7;
1998:     ctx->end       = VecScatterEnd_7;
1999:     break;
2000:   case 6:
2001:     ctx->begin     = VecScatterBegin_6;
2002:     ctx->end       = VecScatterEnd_6;
2003:     break;
2004:   case 5:
2005:     ctx->begin     = VecScatterBegin_5;
2006:     ctx->end       = VecScatterEnd_5;
2007:     break;
2008:   case 4:
2009:     ctx->begin     = VecScatterBegin_4;
2010:     ctx->end       = VecScatterEnd_4;
2011:     break;
2012:   case 3:
2013:     ctx->begin     = VecScatterBegin_3;
2014:     ctx->end       = VecScatterEnd_3;
2015:     break;
2016:   case 2:
2017:     ctx->begin     = VecScatterBegin_2;
2018:     ctx->end       = VecScatterEnd_2;
2019:     break;
2020:   case 1:
2021:     ctx->begin     = VecScatterBegin_1;
2022:     ctx->end       = VecScatterEnd_1;
2023:     break;
2024:   default:
2025:     SETERRQ(PETSC_ERR_SUP,"Blocksize not supported");
2026:   }
2027:   ctx->view      = VecScatterView_MPI;
2028:   /* Check if the local scatter is actually a copy; important special case */
2029:   if (to->local.n) {
2030:     VecScatterLocalOptimizeCopy_Private(ctx,&to->local,&from->local,bs);
2031:   }
2032:   return(0);
2033: }
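
/*
   Illustrative sketch, not part of vpscat.c: the default branch of
   VecScatterCreateCommon_PtoS() above builds persistent MPI requests with
   MPI_Send_init()/MPI_Recv_init() so that each scatter only has to start and wait on
   prebuilt requests instead of re-describing the communication every time.  The standalone
   ring program below shows that lifecycle in isolation (build once, start and wait
   repeatedly, free at the end); the neighbour pattern and the data are invented for the
   example.
*/
#include <mpi.h>
#include <stdio.h>

int main(int argc,char **argv)
{
  int         rank,size,right,left,i,sendbuf,recvbuf;
  MPI_Request req[2];

  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);
  right = (rank+1)%size;
  left  = (rank+size-1)%size;

  /* build the requests once ... */
  MPI_Send_init(&sendbuf,1,MPI_INT,right,0,MPI_COMM_WORLD,&req[0]);
  MPI_Recv_init(&recvbuf,1,MPI_INT,left,0,MPI_COMM_WORLD,&req[1]);

  /* ... and reuse them for every "scatter" */
  for (i=0; i<3; i++) {
    sendbuf = 100*rank + i;
    MPI_Startall(2,req);
    MPI_Waitall(2,req,MPI_STATUSES_IGNORE);
    printf("[%d] iteration %d received %d from rank %d\n",rank,i,recvbuf,left);
  }

  MPI_Request_free(&req[0]);
  MPI_Request_free(&req[1]);
  MPI_Finalize();
  return 0;
}

In vpscat.c the analogous requests live in to->requests, from->requests and the rev_* arrays, and are presumably started and completed by the VecScatterBegin_*/VecScatterEnd_* routines selected in the switch above.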



2037: /* ------------------------------------------------------------------------------------*/
2038: /*
2039:          Scatter from local Seq vectors to a parallel vector. 
2040:          Reverses the order of the arguments, calls VecScatterCreate_PtoS(), and then
2041:          reverses the result.
2042: */
2045: PetscErrorCode VecScatterCreate_StoP(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
2046: {
2047:   PetscErrorCode         ierr;
2048:   MPI_Request            *waits;
2049:   VecScatter_MPI_General *to,*from;

2052:   VecScatterCreate_PtoS(ny,inidy,nx,inidx,yin,xin,bs,ctx);
2053:   to            = (VecScatter_MPI_General*)ctx->fromdata;
2054:   from          = (VecScatter_MPI_General*)ctx->todata;
2055:   ctx->todata   = (void*)to;
2056:   ctx->fromdata = (void*)from;
2057:   /* these two are special; they are ALWAYS stored in the to struct */
2058:   to->sstatus   = from->sstatus;
2059:   to->rstatus   = from->rstatus;

2061:   from->sstatus = 0;
2062:   from->rstatus = 0;

2064:   waits              = from->rev_requests;
2065:   from->rev_requests = from->requests;
2066:   from->requests     = waits;
2067:   waits              = to->rev_requests;
2068:   to->rev_requests   = to->requests;
2069:   to->requests       = waits;
2070:   return(0);
2071: }
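
/*
   Illustrative sketch, not part of vpscat.c: the -vecscatter_nopack (MPI_Alltoallw) branch
   of VecScatterCreateCommon_PtoS() above describes each message's scattered vector entries
   with MPI_Type_create_indexed_block() so that no pack/unpack buffer is needed.  The
   standalone program below builds one such indexed-block datatype over a small local array
   and sends those entries to the calling rank itself; the array contents and the chosen
   displacements are invented for the example.
*/
#include <mpi.h>
#include <stdio.h>

int main(int argc,char **argv)
{
  int          rank,i,blockidx[3] = {0,3,5};   /* pick entries 0, 3 and 5 of src */
  double       src[8],dst[3];
  MPI_Datatype picked;
  MPI_Status   status;

  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  for (i=0; i<8; i++) src[i] = 10.0*rank + i;

  /* one datatype describes the three scattered entries; no packing loop is required */
  MPI_Type_create_indexed_block(3,1,blockidx,MPI_DOUBLE,&picked);
  MPI_Type_commit(&picked);

  /* send the scattered entries to ourselves and receive them contiguously */
  MPI_Sendrecv(src,1,picked,rank,0,dst,3,MPI_DOUBLE,rank,0,MPI_COMM_WORLD,&status);
  printf("[%d] received %g %g %g\n",rank,dst[0],dst[1],dst[2]);

  MPI_Type_free(&picked);
  MPI_Finalize();
  return 0;
}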

2073: /* ---------------------------------------------------------------------------------*/
2076: PetscErrorCode VecScatterCreate_PtoP(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,VecScatter ctx)
2077: {
2079:   PetscMPIInt    size,rank,tag,imdex,n;
2080:   PetscInt       *owners = xin->map->range;
2081:   PetscMPIInt    *nprocs = PETSC_NULL;
2082:   PetscInt       i,j,idx,nsends,*local_inidx = PETSC_NULL,*local_inidy = PETSC_NULL;
2083:   PetscInt       *owner = PETSC_NULL,*starts = PETSC_NULL,count,slen;
2084:   PetscInt       *rvalues = PETSC_NULL,*svalues = PETSC_NULL,base,*values = PETSC_NULL,*rsvalues,recvtotal,lastidx;
2085:   PetscMPIInt    *onodes1,*olengths1,nrecvs;
2086:   MPI_Comm       comm;
2087:   MPI_Request    *send_waits = PETSC_NULL,*recv_waits = PETSC_NULL;
2088:   MPI_Status     recv_status,*send_status = PETSC_NULL;
2089:   PetscTruth     duplicate = PETSC_FALSE;
2090: #if defined(PETSC_USE_DEBUG)
2091:   PetscTruth     found = PETSC_FALSE;
2092: #endif

2095:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
2096:   PetscObjectGetComm((PetscObject)xin,&comm);
2097:   MPI_Comm_size(comm,&size);
2098:   MPI_Comm_rank(comm,&rank);
2099:   if (size == 1) {
2100:     VecScatterCreate_StoP(nx,inidx,ny,inidy,xin,yin,1,ctx);
2101:     return(0);
2102:   }

2104:   /*
2105:      Each processor ships its inidx[j] and inidy[j] pairs to the processor that owns inidx[j].
2106:      The processors then call VecScatterCreate_StoP() with the pairs they received.
2107:   */
2108:   /*  first count number of contributors to each processor */
2109:   PetscMalloc3(size,PetscMPIInt,&nprocs,nx,PetscInt,&owner,(size+1),PetscInt,&starts);
2110:   PetscMemzero(nprocs,size*sizeof(PetscMPIInt));
2111:   lastidx = -1;
2112:   j       = 0;
2113:   for (i=0; i<nx; i++) {
2114:     /* if indices are NOT locally sorted, need to start search at the beginning */
2115:     if (lastidx > (idx = inidx[i])) j = 0;
2116:     lastidx = idx;
2117:     for (; j<size; j++) {
2118:       if (idx >= owners[j] && idx < owners[j+1]) {
2119:         nprocs[j]++;
2120:         owner[i] = j;
2121: #if defined(PETSC_USE_DEBUG)
2122:         found = PETSC_TRUE;
2123: #endif
2124:         break;
2125:       }
2126:     }
2127: #if defined(PETSC_USE_DEBUG)
2128:     if (!found) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"Index %D out of range",idx);
2129:     found = PETSC_FALSE;
2130: #endif
2131:   }
2132:   nsends = 0;  for (i=0; i<size; i++) { nsends += (nprocs[i] > 0);}

2134:   /* inform other processors of number of messages and max length */
2135:   PetscGatherNumberOfMessages(comm,PETSC_NULL,nprocs,&nrecvs);
2136:   PetscGatherMessageLengths(comm,nsends,nrecvs,nprocs,&onodes1,&olengths1);
2137:   PetscSortMPIIntWithArray(nrecvs,onodes1,olengths1);
2138:   recvtotal = 0; for (i=0; i<nrecvs; i++) recvtotal += olengths1[i];

2140:   /* post receives:   */
2141:   PetscMalloc5(2*recvtotal,PetscInt,&rvalues,2*nx,PetscInt,&svalues,nrecvs,MPI_Request,&recv_waits,nsends,MPI_Request,&send_waits,nsends,MPI_Status,&send_status);

2143:   count = 0;
2144:   for (i=0; i<nrecvs; i++) {
2145:     MPI_Irecv((rvalues+2*count),2*olengths1[i],MPIU_INT,onodes1[i],tag,comm,recv_waits+i);
2146:     count += olengths1[i];
2147:   }
2148:   PetscFree(onodes1);

2150:   /* do sends:
2151:       1) starts[i] gives the starting index in svalues for stuff going to 
2152:          the ith processor
2153:   */
2154:   starts[0] = 0;
2155:   for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
2156:   for (i=0; i<nx; i++) {
2157:     svalues[2*starts[owner[i]]]       = inidx[i];
2158:     svalues[1 + 2*starts[owner[i]]++] = inidy[i];
2159:   }

2161:   starts[0] = 0;
2162:   for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
2163:   count = 0;
2164:   for (i=0; i<size; i++) {
2165:     if (nprocs[i]) {
2166:       MPI_Isend(svalues+2*starts[i],2*nprocs[i],MPIU_INT,i,tag,comm,send_waits+count);
2167:       count++;
2168:     }
2169:   }
2170:   PetscFree3(nprocs,owner,starts);

2172:   /*  wait on receives */
2173:   count = nrecvs;
2174:   slen  = 0;
2175:   while (count) {
2176:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2177:     /* unpack receives into our local space */
2178:     MPI_Get_count(&recv_status,MPIU_INT,&n);
2179:     slen += n/2;
2180:     count--;
2181:   }
2182:   if (slen != recvtotal) SETERRQ2(PETSC_ERR_PLIB,"Total message lengths %D not as expected %D",slen,recvtotal);
2183: 
2184:   PetscMalloc2(slen,PetscInt,&local_inidx,slen,PetscInt,&local_inidy);
2185:   base  = owners[rank];
2186:   count = 0;
2187:   rsvalues = rvalues;
2188:   for (i=0; i<nrecvs; i++) {
2189:     values = rsvalues;
2190:     rsvalues += 2*olengths1[i];
2191:     for (j=0; j<olengths1[i]; j++) {
2192:       local_inidx[count]   = values[2*j] - base;
2193:       local_inidy[count++] = values[2*j+1];
2194:     }
2195:   }
2196:   PetscFree(olengths1);

2198:   /* wait on sends */
2199:   if (nsends) {MPI_Waitall(nsends,send_waits,send_status);}
2200:   PetscFree5(rvalues,svalues,recv_waits,send_waits,send_status);

2202:   /*
2203:      should sort and remove duplicates from local_inidx,local_inidy 
2204:   */

2206: #if defined(do_it_slow)
2207:   /* sort on the from index */
2208:   PetscSortIntWithArray(slen,local_inidx,local_inidy);
2209:   start = 0;
2210:   while (start < slen) {
2211:     count = start+1;
2212:     last  = local_inidx[start];
2213:     while (count < slen && last == local_inidx[count]) count++;
2214:     if (count > start + 1) { /* found 2 or more same local_inidx[] in a row */
2215:       /* sort on to index */
2216:       PetscSortInt(count-start,local_inidy+start);
2217:     }
2218:     /* remove duplicates; not most efficient way, but probably good enough */
2219:     i = start;
2220:     while (i < count-1) {
2221:       if (local_inidy[i] != local_inidy[i+1]) {
2222:         i++;
2223:       } else { /* found a duplicate */
2224:         duplicate = PETSC_TRUE;
2225:         for (j=i; j<slen-1; j++) {
2226:           local_inidx[j] = local_inidx[j+1];
2227:           local_inidy[j] = local_inidy[j+1];
2228:         }
2229:         slen--;
2230:         count--;
2231:       }
2232:     }
2233:     start = count;
2234:   }
2235: #endif
2236:   if (duplicate) {
2237:     PetscInfo(ctx,"Duplicate from to indices passed in VecScatterCreate(), they are ignored\n");
2238:   }
2239:   VecScatterCreate_StoP(slen,local_inidx,slen,local_inidy,xin,yin,1,ctx);
2240:   PetscFree2(local_inidx,local_inidy);
2241:   return(0);
2242: }
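
/*
   Illustrative sketch, not part of vpscat.c: the disabled do_it_slow block above outlines
   how duplicate (from,to) index pairs could be removed before the final
   VecScatterCreate_StoP() call: sort the pairs, then keep only the first copy of each.
   The standalone program below does the same with qsort(); the Pair struct and the sample
   data are hypothetical.
*/
#include <stdio.h>
#include <stdlib.h>

typedef struct { int from,to; } Pair;   /* hypothetical (local_inidx,local_inidy) pair */

static int ComparePairs(const void *a,const void *b)
{
  const Pair *x = (const Pair*)a,*y = (const Pair*)b;
  if (x->from != y->from) return (x->from < y->from) ? -1 : 1;
  if (x->to   != y->to)   return (x->to   < y->to)   ? -1 : 1;
  return 0;
}

int main(void)
{
  Pair p[] = {{3,7},{1,2},{3,7},{1,5},{3,6}};
  int  n   = 5,i,m;

  qsort(p,(size_t)n,sizeof(Pair),ComparePairs);
  /* keep the first copy of each sorted (from,to) pair, drop the rest */
  for (i=1, m=1; i<n; i++) {
    if (p[i].from != p[m-1].from || p[i].to != p[m-1].to) p[m++] = p[i];
  }
  for (i=0; i<m; i++) printf("(%d,%d)\n",p[i].from,p[i].to);
  return 0;
}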