Actual source code: vpscat.c

  1: /*$Id: vpscat.c,v 1.165 2001/09/08 13:16:48 bsmith Exp $*/
  2: /*
  3:     Defines parallel vector scatters.
  4: */

 6:  #include "src/vec/is/isimpl.h"
 7:  #include "vecimpl.h"
 8:  #include "src/vec/impls/dvecimpl.h"
 9:  #include "src/vec/impls/mpi/pvecimpl.h"
 10:  #include "petscsys.h"
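/*
     Illustrative usage sketch (not part of vpscat.c; the helper name, vector sizes,
  and index sets below are assumptions): the routines in this file are the MPI back
  end behind the generic VecScatter calls, which a caller typically drives as follows.
*/
#if 0
static int VecScatterUsageSketch(MPI_Comm comm,int N,int n,int first)
{
  Vec        x,y;
  IS         ix,iy;
  VecScatter scat;
  int        ierr;

  ierr = VecCreateMPI(comm,PETSC_DECIDE,N,&x);CHKERRQ(ierr);
  ierr = VecCreateSeq(PETSC_COMM_SELF,n,&y);CHKERRQ(ierr);
  ierr = ISCreateStride(comm,n,first,1,&ix);CHKERRQ(ierr);        /* global entries to take from x */
  ierr = ISCreateStride(PETSC_COMM_SELF,n,0,1,&iy);CHKERRQ(ierr); /* local slots to fill in y      */
  ierr = VecScatterCreate(x,ix,y,iy,&scat);CHKERRQ(ierr);
  /* Begin packs and posts the messages (VecScatterBegin_PtoP below);
     End waits for them and unpacks (VecScatterEnd_PtoP below)       */
  ierr = VecScatterBegin(x,y,INSERT_VALUES,SCATTER_FORWARD,scat);CHKERRQ(ierr);
  ierr = VecScatterEnd(x,y,INSERT_VALUES,SCATTER_FORWARD,scat);CHKERRQ(ierr);
  ierr = VecScatterDestroy(scat);CHKERRQ(ierr);
  ierr = ISDestroy(ix);CHKERRQ(ierr);
  ierr = ISDestroy(iy);CHKERRQ(ierr);
  ierr = VecDestroy(x);CHKERRQ(ierr);
  ierr = VecDestroy(y);CHKERRQ(ierr);
  return 0;
}
#endif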

 14: int VecScatterView_MPI(VecScatter ctx,PetscViewer viewer)
 15: {
 16:   VecScatter_MPI_General *to=(VecScatter_MPI_General*)ctx->todata;
 17:   VecScatter_MPI_General *from=(VecScatter_MPI_General*)ctx->fromdata;
 18:   int                    i,rank,ierr;
 19:   PetscViewerFormat      format;
 20:   PetscTruth             isascii;

 23:   PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&isascii);
 24:   if (isascii) {
 25:     MPI_Comm_rank(ctx->comm,&rank);
 26:     PetscViewerGetFormat(viewer,&format);
 27:     if (format ==  PETSC_VIEWER_ASCII_INFO) {
 28:       int nsend_max,nrecv_max,lensend_max,lenrecv_max,alldata,itmp;

 30:       MPI_Reduce(&to->n,&nsend_max,1,MPI_INT,MPI_MAX,0,ctx->comm);
 31:       MPI_Reduce(&from->n,&nrecv_max,1,MPI_INT,MPI_MAX,0,ctx->comm);
 32:       itmp = to->starts[to->n+1];
 33:       MPI_Reduce(&itmp,&lensend_max,1,MPI_INT,MPI_MAX,0,ctx->comm);
 34:       itmp = from->starts[from->n+1];
 35:       MPI_Reduce(&itmp,&lenrecv_max,1,MPI_INT,MPI_MAX,0,ctx->comm);
 36:       MPI_Reduce(&itmp,&alldata,1,MPI_INT,MPI_SUM,0,ctx->comm);

 38:       PetscViewerASCIIPrintf(viewer,"VecScatter statistics\n");
 39:       PetscViewerASCIIPrintf(viewer,"  Maximum number sends %d\n",nsend_max);
 40:       PetscViewerASCIIPrintf(viewer,"  Maximum number receives %d\n",nrecv_max);
 41:       PetscViewerASCIIPrintf(viewer,"  Maximum data sent %d\n",(int)(lensend_max*to->bs*sizeof(PetscScalar)));
 42:       PetscViewerASCIIPrintf(viewer,"  Maximum data received %d\n",(int)(lenrecv_max*to->bs*sizeof(PetscScalar)));
 43:       PetscViewerASCIIPrintf(viewer,"  Total data sent %d\n",(int)(alldata*to->bs*sizeof(PetscScalar)));

 45:     } else {
 46:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number sends = %d; Number to self = %d\n",rank,to->n,to->local.n);
 47:       if (to->n) {
 48:         for (i=0; i<to->n; i++){
 49:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]   %d length = %d to whom %d\n",rank,i,to->starts[i+1]-to->starts[i],to->procs[i]);
 50:         }
 51:         PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote sends (in order by process sent to)\n");
 52:         for (i=0; i<to->starts[to->n]; i++){
 53:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%d \n",rank,to->indices[i]);
 54:         }
 55:       }

 57:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Number receives = %d; Number from self = %d\n",rank,from->n,from->local.n);
 58:       if (from->n) {
 59:         for (i=0; i<from->n; i++){
 60:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d length %d from whom %d\n",rank,i,from->starts[i+1]-from->starts[i],from->procs[i]);
 61:         }

 63:         PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote receives (in order by process received from)\n");
 64:         for (i=0; i<from->starts[from->n]; i++){
 65:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%d \n",rank,from->indices[i]);
 66:         }
 67:       }
 68:       if (to->local.n) {
 69:         PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Indices for local part of scatter\n",rank);
 70:         for (i=0; i<to->local.n; i++){
 71:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]From %d to %d \n",rank,from->local.slots[i],to->local.slots[i]);
 72:         }
 73:       }

 75:       PetscViewerFlush(viewer);
 76:     }
 77:   } else {
 78:     SETERRQ1(1,"Viewer type %s not supported for this scatter",((PetscObject)viewer)->type_name);
 79:   }
 80:   return(0);
 81: }

 83: /* -----------------------------------------------------------------------------------*/
 84: /*
 85:       The next routine determines which entries of the local part of the scatter are
 86:   exact copies of values into their current locations. We check this here and
 87:   then know that we need not perform that portion of the scatter.
 88: */
 91: int VecScatterLocalOptimize_Private(VecScatter_Seq_General *gen_to,VecScatter_Seq_General *gen_from)
 92: {
 93:   int n = gen_to->n,n_nonmatching = 0,i,*to_slots = gen_to->slots,*from_slots = gen_from->slots;
 94:   int *nto_slots,*nfrom_slots,j = 0,ierr;
 95: 
 97:   for (i=0; i<n; i++) {
 98:     if (to_slots[i] != from_slots[i]) n_nonmatching++;
 99:   }

101:   if (!n_nonmatching) {
102:     gen_to->nonmatching_computed = PETSC_TRUE;
103:     gen_to->n_nonmatching        = gen_from->n_nonmatching = 0;
104:     PetscLogInfo(0,"VecScatterLocalOptimize_Private:Reduced %d to 0\n", n);
105:   } else if (n_nonmatching == n) {
106:     gen_to->nonmatching_computed = PETSC_FALSE;
107:     PetscLogInfo(0,"VecScatterLocalOptimize_Private:All values non-matching\n");
108:   } else {
109:     gen_to->nonmatching_computed= PETSC_TRUE;
110:     gen_to->n_nonmatching       = gen_from->n_nonmatching = n_nonmatching;
111:     PetscMalloc(n_nonmatching*sizeof(int),&nto_slots);
112:     gen_to->slots_nonmatching   = nto_slots;
113:     PetscMalloc(n_nonmatching*sizeof(int),&nfrom_slots);
114:     gen_from->slots_nonmatching = nfrom_slots;
115:     for (i=0; i<n; i++) {
116:       if (to_slots[i] != from_slots[i]) {
117:         nto_slots[j]   = to_slots[i];
118:         nfrom_slots[j] = from_slots[i];
119:         j++;
120:       }
121:     }
122:     PetscLogInfo(0,"VecScatterLocalOptimize_Private:Reduced %d to %d\n",n,n_nonmatching);
123:   }
124:   return(0);
125: }
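/*
     Worked example for the routine above (illustrative; the slot values are made up):
  this optimization is only used when xin and yin are the same vector, so a matching
  entry (to_slots[i] == from_slots[i]) would copy a value onto itself.  With

       to_slots   = {2, 5, 9, 12}
       from_slots = {2, 6, 9, 12}

  entries 0, 2 and 3 already match, so the routine records n_nonmatching = 1 with
  slots_nonmatching = {5} and {6}; the local part of the scatter then performs the
  single copy yv[6] = xv[5] instead of four copies.
*/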

127: /* --------------------------------------------------------------------------------------*/
130: int VecScatterCopy_PtoP(VecScatter in,VecScatter out)
131: {
132:   VecScatter_MPI_General *in_to   = (VecScatter_MPI_General*)in->todata;
133:   VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
134:   int                    len,ny,ierr;

137:   out->postrecvs = in->postrecvs;
138:   out->begin     = in->begin;
139:   out->end       = in->end;
140:   out->copy      = in->copy;
141:   out->destroy   = in->destroy;
142:   out->view      = in->view;

144:   /* allocate entire send scatter context */
145:   PetscNew(VecScatter_MPI_General,&out_to);
146:   PetscMemzero(out_to,sizeof(VecScatter_MPI_General));
147:   PetscLogObjectMemory(out,sizeof(VecScatter_MPI_General));
148:   ny                = in_to->starts[in_to->n];
149:   len               = ny*(sizeof(int) + sizeof(PetscScalar)) + (in_to->n+1)*sizeof(int) +
150:                      (in_to->n)*(sizeof(int) + sizeof(MPI_Request));
151:   out_to->n         = in_to->n;
152:   out_to->type      = in_to->type;
153:   out_to->sendfirst = in_to->sendfirst;
154:   PetscMalloc(len,&out_to->values);
155:   PetscLogObjectMemory(out,len);
156:   out_to->requests  = (MPI_Request*)(out_to->values + ny);
157:   out_to->indices   = (int*)(out_to->requests + out_to->n);
158:   out_to->starts    = (int*)(out_to->indices + ny);
159:   out_to->procs     = (int*)(out_to->starts + out_to->n + 1);
160:   PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(int));
161:   PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(int));
162:   PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(int));
163:   PetscMalloc(2*(PetscMax(in_to->n,in_from->n)+1)*sizeof(MPI_Status),&out_to->sstatus);
 164:   out_to->rstatus = out_to->sstatus + PetscMax(in_to->n,in_from->n) + 1;
165:   PetscLogObjectMemory(out,2*(PetscMax(in_to->n,in_from->n) + 1)*sizeof(MPI_Status));
166:   out->todata      = (void*)out_to;
167:   out_to->local.n  = in_to->local.n;
168:   out_to->local.nonmatching_computed = PETSC_FALSE;
169:   out_to->local.n_nonmatching        = 0;
170:   out_to->local.slots_nonmatching    = 0;
171:   if (in_to->local.n) {
172:     PetscMalloc(in_to->local.n*sizeof(int),&out_to->local.slots);
173:     PetscMemcpy(out_to->local.slots,in_to->local.slots,in_to->local.n*sizeof(int));
174:     PetscLogObjectMemory(out,in_to->local.n*sizeof(int));
175:   } else {
176:     out_to->local.slots = 0;
177:   }

179:   /* allocate entire receive context */
180:   PetscNew(VecScatter_MPI_General,&out_from);
181:   PetscMemzero(out_from,sizeof(VecScatter_MPI_General));
182:   out_from->type      = in_from->type;
183:   PetscLogObjectMemory(out,sizeof(VecScatter_MPI_General));
184:   ny                  = in_from->starts[in_from->n];
185:   len                 = ny*(sizeof(int) + sizeof(PetscScalar)) + (in_from->n+1)*sizeof(int) +
186:                        (in_from->n)*(sizeof(int) + sizeof(MPI_Request));
187:   out_from->n         = in_from->n;
188:   out_from->sendfirst = in_from->sendfirst;
189:   PetscMalloc(len,&out_from->values);
190:   PetscLogObjectMemory(out,len);
191:   out_from->requests  = (MPI_Request*)(out_from->values + ny);
192:   out_from->indices   = (int*)(out_from->requests + out_from->n);
193:   out_from->starts    = (int*)(out_from->indices + ny);
194:   out_from->procs     = (int*)(out_from->starts + out_from->n + 1);
195:   PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(int));
196:   PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(int));
197:   PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(int));
198:   out->fromdata       = (void*)out_from;
199:   out_from->local.n   = in_from->local.n;
200:   out_from->local.nonmatching_computed = PETSC_FALSE;
201:   out_from->local.n_nonmatching        = 0;
202:   out_from->local.slots_nonmatching    = 0;
203:   if (in_from->local.n) {
204:     PetscMalloc(in_from->local.n*sizeof(int),&out_from->local.slots);
205:     PetscLogObjectMemory(out,in_from->local.n*sizeof(int));
206:     PetscMemcpy(out_from->local.slots,in_from->local.slots,in_from->local.n*sizeof(int));
207:   } else {
208:     out_from->local.slots = 0;
209:   }
210:   return(0);
211: }

213: /* -------------------------------------------------------------------------------------*/
216: int VecScatterDestroy_PtoP(VecScatter ctx)
217: {
218:   VecScatter_MPI_General *gen_to   = (VecScatter_MPI_General*)ctx->todata;
219:   VecScatter_MPI_General *gen_from = (VecScatter_MPI_General*)ctx->fromdata;

223:   if (gen_to->local.slots) {PetscFree(gen_to->local.slots);}
224:   if (gen_from->local.slots) {PetscFree(gen_from->local.slots);}
225:   if (gen_to->local.slots_nonmatching) {PetscFree(gen_to->local.slots_nonmatching);}
226:   if (gen_from->local.slots_nonmatching) {PetscFree(gen_from->local.slots_nonmatching);}
227:   PetscFree(gen_to->sstatus);
228:   PetscFree(gen_to->values);
229:   PetscFree(gen_to);
230:   PetscFree(gen_from->values);
231:   PetscFree(gen_from);
232:   PetscLogObjectDestroy(ctx);
233:   PetscHeaderDestroy(ctx);
234:   return(0);
235: }

237: /* --------------------------------------------------------------------------------------*/
238: /*
 239:      Even though the next routines are written for parallel 
 240:   vectors, either xin or yin (but not both) may be Seq
 241:   vectors, one for each processor.
 242:   
 243:      gen_from indices indicate where arriving data is stashed;
 244:      gen_to   indices indicate where departing data comes from.
 245:      The naming can be VERY confusing.

247: */
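/*
     Concrete example (illustrative; the index values are made up): on one process of
  a forward scatter,

       gen_to->indices   = {4, 7}   means xv[4] and xv[7] are packed and sent away,
       gen_from->indices = {0, 3}   means the two values received are unpacked
                                    into yv[0] and yv[3].

  So "to" always describes the outgoing (packing) side of the forward direction and
  "from" the incoming (unpacking) side; SCATTER_REVERSE simply swaps the two
  structures, as the first few lines of the routines below show.
*/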
250: int VecScatterBegin_PtoP(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
251: {
252:   VecScatter_MPI_General *gen_to,*gen_from;
253:   MPI_Comm               comm = ctx->comm;
254:   PetscScalar            *xv,*yv,*val,*rvalues,*svalues;
255:   MPI_Request            *rwaits,*swaits;
256:   int                    tag = ctx->tag,i,j,*indices,*rstarts,*sstarts,*rprocs,*sprocs;
257:   int                    nrecvs,nsends,iend,ierr;

260:   VecGetArray(xin,&xv);
261:   if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}
262:   if (mode & SCATTER_REVERSE){
263:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
264:     gen_from = (VecScatter_MPI_General*)ctx->todata;
265:   } else {
266:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
267:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
268:   }
269:   rvalues  = gen_from->values;
270:   svalues  = gen_to->values;
271:   nrecvs   = gen_from->n;
272:   nsends   = gen_to->n;
273:   rwaits   = gen_from->requests;
274:   swaits   = gen_to->requests;
275:   indices  = gen_to->indices;
276:   rstarts  = gen_from->starts;
277:   sstarts  = gen_to->starts;
278:   rprocs   = gen_from->procs;
279:   sprocs   = gen_to->procs;

281:   if (!(mode & SCATTER_LOCAL)) {

283:     if (gen_to->sendfirst) {
284:       /* do sends:  */
285:       for (i=0; i<nsends; i++) {
286:         val  = svalues + sstarts[i];
287:         iend = sstarts[i+1]-sstarts[i];
288:         /* pack the message */
289:         for (j=0; j<iend; j++) {
290:           val[j] = xv[*indices++];
291:         }
292:         MPI_Isend(val,iend,MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
293:       }
294:     }
295: 
296:     /* post receives:   */
297:     for (i=0; i<nrecvs; i++) {
298:       MPI_Irecv(rvalues+rstarts[i],rstarts[i+1]-rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
299:     }

301:     if (!gen_to->sendfirst) {
302:       /* do sends:  */
303:       for (i=0; i<nsends; i++) {
304:         val  = svalues + sstarts[i];
305:         iend = sstarts[i+1]-sstarts[i];
306:         /* pack the message */
307:         for (j=0; j<iend; j++) {
308:           val[j] = xv[*indices++];
309:         }
310:         MPI_Isend(val,iend,MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
311:       }
312:     }
313:   }

315:   /* take care of local scatters */
316:   if (gen_to->local.n && addv == INSERT_VALUES) {
317:     if (yv == xv && !gen_to->local.nonmatching_computed) {
318:       VecScatterLocalOptimize_Private(&gen_to->local,&gen_from->local);
319:     }
320:     if (gen_to->local.is_copy) {
321:       PetscMemcpy(yv + gen_from->local.copy_start,xv + gen_to->local.copy_start,gen_to->local.copy_length);
322:     } else if (yv != xv || !gen_to->local.nonmatching_computed) {
323:       int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
324:       int n       = gen_to->local.n;
325:       for (i=0; i<n; i++) {yv[fslots[i]] = xv[tslots[i]];}
326:     } else {
327:       /* 
328:         In this case, it is copying the values into their old locations, thus we can skip those  
329:       */
330:       int *tslots = gen_to->local.slots_nonmatching,*fslots = gen_from->local.slots_nonmatching;
331:       int n       = gen_to->local.n_nonmatching;
332:       for (i=0; i<n; i++) {yv[fslots[i]] = xv[tslots[i]];}
333:     }
334:   } else if (gen_to->local.n) {
335:     int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
336:     int n = gen_to->local.n;
337:     if (addv == ADD_VALUES) {
338:       for (i=0; i<n; i++) {yv[fslots[i]] += xv[tslots[i]];}
339: #if !defined(PETSC_USE_COMPLEX)
340:     } else if (addv == MAX_VALUES) {
341:       for (i=0; i<n; i++) {yv[fslots[i]] = PetscMax(yv[fslots[i]],xv[tslots[i]]);}
342: #endif
343:     } else {SETERRQ(1,"Wrong insert option");}
344:   }

346:   VecRestoreArray(xin,&xv);
347:   if (xin != yin) {VecRestoreArray(yin,&yv);}
348:   return(0);
349: }

351: /* --------------------------------------------------------------------------------------*/
354: int VecScatterEnd_PtoP(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
355: {
356:   VecScatter_MPI_General *gen_to,*gen_from;
357:   PetscScalar            *rvalues,*yv,*val;
358:   int                    ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices;
359:   MPI_Request            *rwaits,*swaits;
360:   MPI_Status             rstatus,*sstatus;

363:   if (mode & SCATTER_LOCAL) return(0);
364:   VecGetArray(yin,&yv);

366:   if (mode & SCATTER_REVERSE){
367:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
368:     gen_from = (VecScatter_MPI_General*)ctx->todata;
369:     sstatus  = gen_from->sstatus;
370:   } else {
371:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
372:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
373:     sstatus  = gen_to->sstatus;
374:   }
375:   rvalues  = gen_from->values;
376:   nrecvs   = gen_from->n;
377:   nsends   = gen_to->n;
378:   rwaits   = gen_from->requests;
379:   swaits   = gen_to->requests;
380:   indices  = gen_from->indices;
381:   rstarts  = gen_from->starts;

383:   /*  wait on receives */
384:   count = nrecvs;
385:   while (count) {
386:     MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus);
387:     /* unpack receives into our local space */
388:     val      = rvalues + rstarts[imdex];
389:     n        = rstarts[imdex+1]-rstarts[imdex];
390:     lindices = indices + rstarts[imdex];
391:     if (addv == INSERT_VALUES) {
392:       for (i=0; i<n; i++) {
393:         yv[lindices[i]] = *val++;
394:       }
395:     } else if (addv == ADD_VALUES) {
396:       for (i=0; i<n; i++) {
397:         yv[lindices[i]] += *val++;
398:       }
399: #if !defined(PETSC_USE_COMPLEX)
400:     } else if (addv == MAX_VALUES) {
401:       for (i=0; i<n; i++) {
402:         yv[lindices[i]] = PetscMax(yv[lindices[i]],*val); val++;
403:       }
404: #endif
405:     }  else {SETERRQ(1,"Wrong insert option");}
406:     count--;
407:   }

409:   /* wait on sends */
410:   if (nsends) {
411:     MPI_Waitall(nsends,swaits,sstatus);
412:   }
413:   VecRestoreArray(yin,&yv);
414:   return(0);
415: }
416: /* ==========================================================================================*/
417: /*
 418:     Special scatters for fixed block sizes. These provide better performance
 419:     because the local copying and the message packing and unpacking are done
 420:     with loops unrolled to the size of the block.

 422:     These scatters also use MPI persistent sends and receives, which (at least
 423:     in theory) allow MPI to optimize repeated sends and receives of the same type.
424: */
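/*
     Minimal sketch of the persistent-request pattern referred to above (illustrative
  only; the buffers, count, peer, and loop bound are assumptions).  The requests are
  created once and then restarted for every scatter, which is what lets an MPI
  implementation reuse its internal setup.
*/
#if 0
static int PersistentExchangeSketch(MPI_Comm comm,int peer,PetscScalar *sendbuf,PetscScalar *recvbuf,int count,int tag)
{
  MPI_Request req[2];
  MPI_Status  stat[2];
  int         k;

  MPI_Recv_init(recvbuf,count,MPIU_SCALAR,peer,tag,comm,&req[0]);  /* set up once ...        */
  MPI_Send_init(sendbuf,count,MPIU_SCALAR,peer,tag,comm,&req[1]);
  for (k=0; k<10; k++) {                                           /* ... restart repeatedly */
    MPI_Startall(2,req);
    MPI_Waitall(2,req,stat);
  }
  MPI_Request_free(&req[0]);
  MPI_Request_free(&req[1]);
  return 0;
}
#endif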

426: /*
 427:     This is for use with the "ready-receiver" mode. In theory, on some
 428:     machines it could lead to better performance; in practice we have never
 429:     seen it do so. It is selected with the -vecscatter_rr flag.
430: */
433: int VecScatterPostRecvs_PtoP_X(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
434: {
435:   VecScatter_MPI_General *gen_from = (VecScatter_MPI_General*)ctx->fromdata;

438:   MPI_Startall_irecv(gen_from->starts[gen_from->n]*gen_from->bs,gen_from->n,gen_from->requests);
439:   return(0);
440: }
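/*
     Ordering requirement for the ready-receiver mode (illustrative; x, y and ctx
  stand for any vectors and scatter context): an MPI ready send is only correct if
  the matching receive has already been posted, so the caller must start the
  persistent receives above before the begin routine issues its MPI_Rsend_init()ed
  sends, roughly

       (*ctx->postrecvs)(x,y,INSERT_VALUES,SCATTER_FORWARD,ctx);   start persistent receives
       (*ctx->begin)(x,y,INSERT_VALUES,SCATTER_FORWARD,ctx);       ready sends may now be started
       (*ctx->end)(x,y,INSERT_VALUES,SCATTER_FORWARD,ctx);         wait and unpack
*/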

442: /* --------------------------------------------------------------------------------------*/
443: /*
 444:     Special optimization to see if the local part of the scatter is actually 
 445:     a contiguous copy; if so, the scatter routines call PetscMemcpy() instead.
446:  
447: */
450: int VecScatterLocalOptimizeCopy_Private(VecScatter_Seq_General *gen_to,VecScatter_Seq_General *gen_from,int bs)
451: {
452:   int n = gen_to->n,i,*to_slots = gen_to->slots,*from_slots = gen_from->slots;
453:   int to_start,from_start;
454: 
456:   to_start   = to_slots[0];
457:   from_start = from_slots[0];

459:   for (i=1; i<n; i++) {
460:     to_start   += bs;
461:     from_start += bs;
462:     if (to_slots[i]   != to_start)   return(0);
463:     if (from_slots[i] != from_start) return(0);
464:   }
465:   gen_to->is_copy       = PETSC_TRUE;
466:   gen_to->copy_start    = to_slots[0];
467:   gen_to->copy_length   = bs*sizeof(PetscScalar)*n;
468:   gen_from->is_copy     = PETSC_TRUE;
469:   gen_from->copy_start  = from_slots[0];
470:   gen_from->copy_length = bs*sizeof(PetscScalar)*n;

472:   PetscLogInfo(0,"VecScatterLocalOptimizeCopy_Private:Local scatter is a copy, optimizing for it\n");

474:   return(0);
475: }
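/*
     Worked example for the routine above (illustrative; the slot values are made up):
  with bs = 4 and

       to_slots   = {20, 24, 28}
       from_slots = {80, 84, 88}

  both lists advance by exactly bs at every step, so the local part of the scatter is
  one contiguous block: copy_start = 20 on the "to" (xv) side and 80 on the "from"
  (yv) side, with copy_length = 3*4*sizeof(PetscScalar) bytes handed to PetscMemcpy().
*/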

477: /* --------------------------------------------------------------------------------------*/

481: int VecScatterCopy_PtoP_X(VecScatter in,VecScatter out)
482: {
483:   VecScatter_MPI_General *in_to   = (VecScatter_MPI_General*)in->todata;
484:   VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
485:   int                    len,ny,bs = in_from->bs,ierr;

488:   out->postrecvs = in->postrecvs;
489:   out->begin     = in->begin;
490:   out->end       = in->end;
491:   out->copy      = in->copy;
492:   out->destroy   = in->destroy;
493:   out->view      = in->view;

495:   /* allocate entire send scatter context */
496:   PetscNew(VecScatter_MPI_General,&out_to);
497:   PetscMemzero(out_to,sizeof(VecScatter_MPI_General));
498:   PetscLogObjectMemory(out,sizeof(VecScatter_MPI_General));
499:   ny                = in_to->starts[in_to->n];
500:   len               = ny*(sizeof(int) + bs*sizeof(PetscScalar)) + (in_to->n+1)*sizeof(int) +
501:                      (in_to->n)*(sizeof(int) + sizeof(MPI_Request));
502:   out_to->n         = in_to->n;
503:   out_to->type      = in_to->type;
504:   out_to->sendfirst = in_to->sendfirst;

506:   PetscMalloc(len,&out_to->values);
507:   PetscLogObjectMemory(out,len);
508:   out_to->requests  = (MPI_Request*)(out_to->values + bs*ny);
509:   out_to->indices   = (int*)(out_to->requests + out_to->n);
510:   out_to->starts    = (int*)(out_to->indices + ny);
511:   out_to->procs     = (int*)(out_to->starts + out_to->n + 1);
512:   PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(int));
513:   PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(int));
514:   PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(int));
515:   PetscMalloc(2*(PetscMax(in_to->n,in_from->n)+1)*sizeof(MPI_Status),&out_to->sstatus);
516:   out_to->rstatus   =  out_to->sstatus + PetscMax(in_to->n,in_from->n) + 1;
517: 
518:   PetscLogObjectMemory(out,2*(PetscMax(in_to->n,in_from->n)+1)*sizeof(MPI_Status));
519:   out->todata       = (void*)out_to;
520:   out_to->local.n   = in_to->local.n;
521:   out_to->local.nonmatching_computed = PETSC_FALSE;
522:   out_to->local.n_nonmatching        = 0;
523:   out_to->local.slots_nonmatching    = 0;
524:   if (in_to->local.n) {
 525:     PetscMalloc(in_to->local.n*sizeof(int),&out_to->local.slots);
526:     PetscMemcpy(out_to->local.slots,in_to->local.slots,in_to->local.n*sizeof(int));
527:     PetscLogObjectMemory(out,in_to->local.n*sizeof(int));
528:   } else {
529:     out_to->local.slots = 0;
530:   }

532:   /* allocate entire receive context */
533:   PetscNew(VecScatter_MPI_General,&out_from);
534:   PetscMemzero(out_from,sizeof(VecScatter_MPI_General));
535:   out_from->type      = in_from->type;
536:   PetscLogObjectMemory(out,sizeof(VecScatter_MPI_General));
537:   ny                  = in_from->starts[in_from->n];
538:   len                 = ny*(sizeof(int) + bs*sizeof(PetscScalar)) + (in_from->n+1)*sizeof(int) +
539:                        (in_from->n)*(sizeof(int) + sizeof(MPI_Request));
540:   out_from->n         = in_from->n;
541:   out_from->sendfirst = in_from->sendfirst;
542:   PetscMalloc(len,&out_from->values);
543:   PetscLogObjectMemory(out,len);
544:   out_from->requests  = (MPI_Request*)(out_from->values + bs*ny);
545:   out_from->indices   = (int*)(out_from->requests + out_from->n);
546:   out_from->starts    = (int*)(out_from->indices + ny);
547:   out_from->procs     = (int*)(out_from->starts + out_from->n + 1);
548:   PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(int));
549:   PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(int));
550:   PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(int));
551:   out->fromdata       = (void*)out_from;
552:   out_from->local.n   = in_from->local.n;
553:   out_from->local.nonmatching_computed = PETSC_FALSE;
554:   out_from->local.n_nonmatching        = 0;
555:   out_from->local.slots_nonmatching    = 0;
556:   if (in_from->local.n) {
557:     PetscMalloc(in_from->local.n*sizeof(int),&out_from->local.slots);
558:     PetscLogObjectMemory(out,in_from->local.n*sizeof(int));
559:     PetscMemcpy(out_from->local.slots,in_from->local.slots,in_from->local.n*sizeof(int));
560:   } else {
561:     out_from->local.slots = 0;
562:   }

564:   /* 
565:       set up the request arrays for use with isend_init() and irecv_init()
566:   */
567:   {
568:     MPI_Comm    comm;
569:     int         *sstarts = out_to->starts,  *rstarts = out_from->starts;
570:     int         *sprocs  = out_to->procs,   *rprocs  = out_from->procs;
571:     int         tag,i;
572:     PetscTruth  flg;
573:     MPI_Request *swaits  = out_to->requests,*rwaits  = out_from->requests;
574:     MPI_Request *rev_swaits,*rev_rwaits;
575:     PetscScalar *Ssvalues = out_to->values, *Srvalues = out_from->values;

577:     PetscMalloc((in_to->n+1)*sizeof(MPI_Request),&out_to->rev_requests);
578:     PetscMalloc((in_from->n+1)*sizeof(MPI_Request),&out_from->rev_requests);
579:     PetscLogObjectMemory(out,(in_to->n+in_from->n+2)*sizeof(MPI_Request));

581:     rev_rwaits = out_to->rev_requests;
582:     rev_swaits = out_from->rev_requests;

584:     out_from->bs = out_to->bs = bs;
585:     tag     = out->tag;
586:     comm    = out->comm;

588:     /* Register the receives that you will use later (sends for scatter reverse) */
589:     for (i=0; i<out_from->n; i++) {
590:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,
591:                            comm,rwaits+i);
592:       MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,
593:                            comm,rev_swaits+i);
594:     }

596:     PetscOptionsHasName(PETSC_NULL,"-vecscatter_rr",&flg);
597:     if (flg) {
598:       out->postrecvs               = VecScatterPostRecvs_PtoP_X;
599:       out_to->use_readyreceiver    = PETSC_TRUE;
600:       out_from->use_readyreceiver  = PETSC_TRUE;
601:       for (i=0; i<out_to->n; i++) {
602:         MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
603:                               comm,swaits+i);
604:       }
605:       PetscLogInfo(0,"VecScatterCopy_PtoP_X:Using VecScatter ready receiver mode\n");
606:     } else {
607:       out->postrecvs               = 0;
608:       out_to->use_readyreceiver    = PETSC_FALSE;
609:       out_from->use_readyreceiver  = PETSC_FALSE;
610:       flg                          = PETSC_FALSE;
611:       PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&flg);
612:       if (flg) {
613:         PetscLogInfo(0,"VecScatterCopy_PtoP_X:Using VecScatter Ssend mode\n");
614:       }
615:       for (i=0; i<out_to->n; i++) {
616:         if (!flg) {
617:           MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
618:                                comm,swaits+i);
619:         } else {
620:           MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
621:                                comm,swaits+i);
622:         }
623:       }
624:     }
625:     /* Register receives for scatter reverse */
626:     for (i=0; i<out_to->n; i++) {
627:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
628:                            comm,rev_rwaits+i);
629:     }
630:   }

632:   return(0);
633: }

635: /* --------------------------------------------------------------------------------------*/

639: int VecScatterBegin_PtoP_12(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
640: {
641:   VecScatter_MPI_General *gen_to,*gen_from;
642:   PetscScalar            *xv,*yv,*val,*svalues;
643:   MPI_Request            *rwaits,*swaits;
644:   int                    *indices,*sstarts,iend,i,j,nrecvs,nsends,idx,ierr,len;

647:   VecGetArray(xin,&xv);
648:   if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}

650:   if (mode & SCATTER_REVERSE) {
651:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
652:     gen_from = (VecScatter_MPI_General*)ctx->todata;
653:     rwaits   = gen_from->rev_requests;
654:     swaits   = gen_to->rev_requests;
655:   } else {
656:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
657:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
658:     rwaits   = gen_from->requests;
659:     swaits   = gen_to->requests;
660:   }
661:   svalues  = gen_to->values;
662:   nrecvs   = gen_from->n;
663:   nsends   = gen_to->n;
664:   indices  = gen_to->indices;
665:   sstarts  = gen_to->starts;

667:   if (!(mode & SCATTER_LOCAL)) {

669:     if (!gen_from->use_readyreceiver && !gen_to->sendfirst) {
670:       /* post receives since they were not posted in VecScatterPostRecvs()   */
671:       MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
672:     }
673:     if (ctx->packtogether) {
674:       /* this version packs all the messages together and sends, when -vecscatter_packtogether used */
675:       len  = 12*sstarts[nsends];
676:       val  = svalues;
677:       for (i=0; i<len; i += 12) {
678:         idx     = *indices++;
679:         val[0]  = xv[idx];
680:         val[1]  = xv[idx+1];
681:         val[2]  = xv[idx+2];
682:         val[3]  = xv[idx+3];
683:         val[4]  = xv[idx+4];
684:         val[5]  = xv[idx+5];
685:         val[6]  = xv[idx+6];
686:         val[7]  = xv[idx+7];
687:         val[8]  = xv[idx+8];
688:         val[9]  = xv[idx+9];
689:         val[10] = xv[idx+10];
690:         val[11] = xv[idx+11];
691:         val    += 12;
692:       }
693:       MPI_Startall_isend(len,nsends,swaits);
694:     } else {
695:       /* this version packs and sends one at a time */
696:       val  = svalues;
697:       for (i=0; i<nsends; i++) {
698:         iend = sstarts[i+1]-sstarts[i];

700:         for (j=0; j<iend; j++) {
701:           idx     = *indices++;
702:           val[0]  = xv[idx];
703:           val[1]  = xv[idx+1];
704:           val[2]  = xv[idx+2];
705:           val[3]  = xv[idx+3];
706:           val[4]  = xv[idx+4];
707:           val[5]  = xv[idx+5];
708:           val[6]  = xv[idx+6];
709:           val[7]  = xv[idx+7];
710:           val[8]  = xv[idx+8];
711:           val[9]  = xv[idx+9];
712:           val[10] = xv[idx+10];
713:           val[11] = xv[idx+11];
714:           val    += 12;
715:         }
716:         MPI_Start_isend(12*iend,swaits+i);
717:       }
718:     }

720:     if (!gen_from->use_readyreceiver && gen_to->sendfirst) {
721:       /* post receives since they were not posted in VecScatterPostRecvs()   */
722:       MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
723:     }
724:   }

726:   /* take care of local scatters */
727:   if (gen_to->local.n) {
728:     int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
729:     int n       = gen_to->local.n,il,ir;
730:     if (addv == INSERT_VALUES) {
731:       if (gen_to->local.is_copy) {
732:         PetscMemcpy(yv+gen_from->local.copy_start,xv+gen_to->local.copy_start,gen_to->local.copy_length);
733:       } else {
734:         for (i=0; i<n; i++) {
735:           il = fslots[i]; ir = tslots[i];
736:           yv[il]    = xv[ir];
737:           yv[il+1]  = xv[ir+1];
738:           yv[il+2]  = xv[ir+2];
739:           yv[il+3]  = xv[ir+3];
740:           yv[il+4]  = xv[ir+4];
741:           yv[il+5]  = xv[ir+5];
742:           yv[il+6]  = xv[ir+6];
743:           yv[il+7]  = xv[ir+7];
744:           yv[il+8]  = xv[ir+8];
745:           yv[il+9]  = xv[ir+9];
746:           yv[il+10] = xv[ir+10];
747:           yv[il+11] = xv[ir+11];
748:         }
749:       }
750:     }  else if (addv == ADD_VALUES) {
751:       for (i=0; i<n; i++) {
752:         il = fslots[i]; ir = tslots[i];
753:         yv[il]    += xv[ir];
754:         yv[il+1]  += xv[ir+1];
755:         yv[il+2]  += xv[ir+2];
756:         yv[il+3]  += xv[ir+3];
757:         yv[il+4]  += xv[ir+4];
758:         yv[il+5]  += xv[ir+5];
759:         yv[il+6]  += xv[ir+6];
760:         yv[il+7]  += xv[ir+7];
761:         yv[il+8]  += xv[ir+8];
762:         yv[il+9]  += xv[ir+9];
763:         yv[il+10] += xv[ir+10];
764:         yv[il+11] += xv[ir+11];
765:       }
766: #if !defined(PETSC_USE_COMPLEX)
767:     }  else if (addv == MAX_VALUES) {
768:       for (i=0; i<n; i++) {
769:         il = fslots[i]; ir = tslots[i];
770:         yv[il]    = PetscMax(yv[il],xv[ir]);
771:         yv[il+1]  = PetscMax(yv[il+1],xv[ir+1]);
772:         yv[il+2]  = PetscMax(yv[il+2],xv[ir+2]);
773:         yv[il+3]  = PetscMax(yv[il+3],xv[ir+3]);
774:         yv[il+4]  = PetscMax(yv[il+4],xv[ir+4]);
775:         yv[il+5]  = PetscMax(yv[il+5],xv[ir+5]);
776:         yv[il+6]  = PetscMax(yv[il+6],xv[ir+6]);
777:         yv[il+7]  = PetscMax(yv[il+7],xv[ir+7]);
778:         yv[il+8]  = PetscMax(yv[il+8],xv[ir+8]);
779:         yv[il+9]  = PetscMax(yv[il+9],xv[ir+9]);
780:         yv[il+10] = PetscMax(yv[il+10],xv[ir+10]);
781:         yv[il+11] = PetscMax(yv[il+11],xv[ir+11]);
782:       }
783: #endif
784:     } else {SETERRQ(1,"Wrong insert option");}
785:   }
786:   VecRestoreArray(xin,&xv);
787:   if (xin != yin) {VecRestoreArray(yin,&yv);}
788:   return(0);
789: }

791: /* --------------------------------------------------------------------------------------*/

795: int VecScatterEnd_PtoP_12(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
796: {
797:   VecScatter_MPI_General *gen_to,*gen_from;
798:   PetscScalar            *rvalues,*yv,*val;
799:   int                    ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices,idx;
800:   MPI_Request            *rwaits,*swaits;
801:   MPI_Status             *rstatus,*sstatus;

804:   if (mode & SCATTER_LOCAL) return(0);
805:   VecGetArray(yin,&yv);

807:   if (mode & SCATTER_REVERSE) {
808:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
809:     gen_from = (VecScatter_MPI_General*)ctx->todata;
810:     rwaits   = gen_from->rev_requests;
811:     swaits   = gen_to->rev_requests;
812:     sstatus  = gen_from->sstatus;
813:     rstatus  = gen_from->rstatus;
814:   } else {
815:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
816:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
817:     rwaits   = gen_from->requests;
818:     swaits   = gen_to->requests;
819:     sstatus  = gen_to->sstatus;
820:     rstatus  = gen_to->rstatus;
821:   }
822:   rvalues  = gen_from->values;
823:   nrecvs   = gen_from->n;
824:   nsends   = gen_to->n;
825:   indices  = gen_from->indices;
826:   rstarts  = gen_from->starts;

828:   /*  wait on receives */
829:   count = nrecvs;
830:   if (ctx->packtogether) { /* receive all messages, then unpack all, when -vecscatter_packtogether used */
831:     MPI_Waitall(nrecvs,rwaits,rstatus);
832:     n        = rstarts[count];
833:     val      = rvalues;
834:     lindices = indices;
835:     if (addv == INSERT_VALUES) {
836:       for (i=0; i<n; i++) {
837:         idx        = lindices[i];
838:         yv[idx]    = val[0];
839:         yv[idx+1]  = val[1];
840:         yv[idx+2]  = val[2];
841:         yv[idx+3]  = val[3];
842:         yv[idx+4]  = val[4];
843:         yv[idx+5]  = val[5];
844:         yv[idx+6]  = val[6];
845:         yv[idx+7]  = val[7];
846:         yv[idx+8]  = val[8];
847:         yv[idx+9]  = val[9];
848:         yv[idx+10] = val[10];
849:         yv[idx+11] = val[11];
850:         val       += 12;
851:       }
852:     } else if (addv == ADD_VALUES) {
853:       for (i=0; i<n; i++) {
854:         idx         = lindices[i];
855:         yv[idx]    += val[0];
856:         yv[idx+1]  += val[1];
857:         yv[idx+2]  += val[2];
858:         yv[idx+3]  += val[3];
859:         yv[idx+4]  += val[4];
860:         yv[idx+5]  += val[5];
861:         yv[idx+6]  += val[6];
862:         yv[idx+7]  += val[7];
863:         yv[idx+8]  += val[8];
864:         yv[idx+9]  += val[9];
865:         yv[idx+10] += val[10];
866:         yv[idx+11] += val[11];
867:         val        += 12;
868:       }
869: #if !defined(PETSC_USE_COMPLEX)
870:     } else if (addv == MAX_VALUES) {
871:       for (i=0; i<n; i++) {
872:         idx        = lindices[i];
873:         yv[idx]    = PetscMax(yv[idx],val[0]);
874:         yv[idx+1]  = PetscMax(yv[idx+1],val[1]);
875:         yv[idx+2]  = PetscMax(yv[idx+2],val[2]);
876:         yv[idx+3]  = PetscMax(yv[idx+3],val[3]);
877:         yv[idx+4]  = PetscMax(yv[idx+4],val[4]);
878:         yv[idx+5]  = PetscMax(yv[idx+5],val[5]);
879:         yv[idx+6]  = PetscMax(yv[idx+6],val[6]);
880:         yv[idx+7]  = PetscMax(yv[idx+7],val[7]);
881:         yv[idx+8]  = PetscMax(yv[idx+8],val[8]);
882:         yv[idx+9]  = PetscMax(yv[idx+9],val[9]);
883:         yv[idx+10] = PetscMax(yv[idx+10],val[10]);
884:         yv[idx+11] = PetscMax(yv[idx+11],val[11]);
885:         val       += 12;
886:       }
887: #endif
888:     }  else {SETERRQ(1,"Wrong insert option");}
889:   } else { /* unpack each message as it arrives, default version */
890:     while (count) {
891:       MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus[0]);
892:       /* unpack receives into our local space */
893:       val      = rvalues + 12*rstarts[imdex];
894:       lindices = indices + rstarts[imdex];
895:       n        = rstarts[imdex+1] - rstarts[imdex];
896:       if (addv == INSERT_VALUES) {
897:         for (i=0; i<n; i++) {
898:           idx        = lindices[i];
899:           yv[idx]    = val[0];
900:           yv[idx+1]  = val[1];
901:           yv[idx+2]  = val[2];
902:           yv[idx+3]  = val[3];
903:           yv[idx+4]  = val[4];
904:           yv[idx+5]  = val[5];
905:           yv[idx+6]  = val[6];
906:           yv[idx+7]  = val[7];
907:           yv[idx+8]  = val[8];
908:           yv[idx+9]  = val[9];
909:           yv[idx+10] = val[10];
910:           yv[idx+11] = val[11];
911:           val       += 12;
 912:         }
 913:       } else if (addv == ADD_VALUES) {
 914:         for (i=0; i<n; i++) {
 915:           idx        = lindices[i];
 916:           yv[idx]    += val[0];
 917:           yv[idx+1]  += val[1];
 918:           yv[idx+2]  += val[2];
 919:           yv[idx+3]  += val[3];
 920:           yv[idx+4]  += val[4];
 921:           yv[idx+5]  += val[5];
 922:           yv[idx+6]  += val[6];
 923:           yv[idx+7]  += val[7];
 924:           yv[idx+8]  += val[8];
 925:           yv[idx+9]  += val[9];
 926:           yv[idx+10] += val[10];
 927:           yv[idx+11] += val[11];
 928:           val        += 12;
 929:         }
 930: #if !defined(PETSC_USE_COMPLEX)
 931:       } else if (addv == MAX_VALUES) {
 932:         for (i=0; i<n; i++) {
 933:           idx        = lindices[i];
 934:           yv[idx]    = PetscMax(yv[idx],val[0]);
 935:           yv[idx+1]  = PetscMax(yv[idx+1],val[1]);
 936:           yv[idx+2]  = PetscMax(yv[idx+2],val[2]);
 937:           yv[idx+3]  = PetscMax(yv[idx+3],val[3]);
 938:           yv[idx+4]  = PetscMax(yv[idx+4],val[4]);
 939:           yv[idx+5]  = PetscMax(yv[idx+5],val[5]);
 940:           yv[idx+6]  = PetscMax(yv[idx+6],val[6]);
 941:           yv[idx+7]  = PetscMax(yv[idx+7],val[7]);
 942:           yv[idx+8]  = PetscMax(yv[idx+8],val[8]);
 943:           yv[idx+9]  = PetscMax(yv[idx+9],val[9]);
 944:           yv[idx+10] = PetscMax(yv[idx+10],val[10]);
 945:           yv[idx+11] = PetscMax(yv[idx+11],val[11]);
 946:           val        += 12;
 947:         }
 948: #endif
 949:       }  else {SETERRQ(1,"Wrong insert option");}
 950:       count--;
951:     }
952:   }
953:   /* wait on sends */
954:   if (nsends) {
955:     MPI_Waitall(nsends,swaits,sstatus);
956:   }
957:   VecRestoreArray(yin,&yv);
958:   return(0);
959: }

961: /* --------------------------------------------------------------------------------------*/

965: int VecScatterBegin_PtoP_5(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
966: {
967:   VecScatter_MPI_General *gen_to,*gen_from;
968:   PetscScalar            *xv,*yv,*val,*svalues;
969:   MPI_Request            *rwaits,*swaits;
970:   int                    ierr,i,*indices,*sstarts,iend,j,nrecvs,nsends,idx;

973:   VecGetArray(xin,&xv);
974:   if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}
975:   if (mode & SCATTER_REVERSE) {
976:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
977:     gen_from = (VecScatter_MPI_General*)ctx->todata;
978:     rwaits   = gen_from->rev_requests;
979:     swaits   = gen_to->rev_requests;
980:   } else {
981:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
982:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
983:     rwaits   = gen_from->requests;
984:     swaits   = gen_to->requests;
985:   }
986:   svalues  = gen_to->values;
987:   nrecvs   = gen_from->n;
988:   nsends   = gen_to->n;
989:   indices  = gen_to->indices;
990:   sstarts  = gen_to->starts;

992:   if (!(mode & SCATTER_LOCAL)) {

994:     if (gen_to->sendfirst) {
995:       /* this version packs and sends one at a time */
996:       val  = svalues;
997:       for (i=0; i<nsends; i++) {
998:         iend = sstarts[i+1]-sstarts[i];

1000:         for (j=0; j<iend; j++) {
1001:           idx     = *indices++;
1002:           val[0] = xv[idx];
1003:           val[1] = xv[idx+1];
1004:           val[2] = xv[idx+2];
1005:           val[3] = xv[idx+3];
1006:           val[4] = xv[idx+4];
1007:           val    += 5;
1008:         }
1009:         MPI_Start_isend(5*iend,swaits+i);
1010:       }
1011:     }

1013:     if (!gen_from->use_readyreceiver) {
1014:       /* post receives since they were not posted in VecScatterPostRecvs()   */
1015:       MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
1016:     }

1018:     if (!gen_to->sendfirst) {
1019:       /* this version packs all the messages together and sends */
1020:       /*
1021:       len  = 5*sstarts[nsends];
1022:       val  = svalues;
1023:       for (i=0; i<len; i += 5) {
1024:         idx     = *indices++;
1025:         val[0] = xv[idx];
1026:         val[1] = xv[idx+1];
1027:         val[2] = xv[idx+2];
1028:         val[3] = xv[idx+3];
1029:         val[4] = xv[idx+4];
1030:         val      += 5;
1031:       }
1032:       MPI_Startall_isend(len,nsends,swaits);
1033:       */

1035:       /* this version packs and sends one at a time */
1036:       val  = svalues;
1037:       for (i=0; i<nsends; i++) {
1038:         iend = sstarts[i+1]-sstarts[i];

1040:         for (j=0; j<iend; j++) {
1041:           idx     = *indices++;
1042:           val[0] = xv[idx];
1043:           val[1] = xv[idx+1];
1044:           val[2] = xv[idx+2];
1045:           val[3] = xv[idx+3];
1046:           val[4] = xv[idx+4];
1047:           val    += 5;
1048:         }
1049:         MPI_Start_isend(5*iend,swaits+i);
1050:       }
1051:     }
1052:   }

1054:   /* take care of local scatters */
1055:   if (gen_to->local.n) {
1056:     int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
1057:     int n       = gen_to->local.n,il,ir;
1058:     if (addv == INSERT_VALUES) {
1059:       if (gen_to->local.is_copy) {
1060:         PetscMemcpy(yv + gen_from->local.copy_start,xv + gen_to->local.copy_start,gen_to->local.copy_length);
1061:       } else {
1062:         for (i=0; i<n; i++) {
1063:           il = fslots[i]; ir = tslots[i];
1064:           yv[il]   = xv[ir];
1065:           yv[il+1] = xv[ir+1];
1066:           yv[il+2] = xv[ir+2];
1067:           yv[il+3] = xv[ir+3];
1068:           yv[il+4] = xv[ir+4];
1069:         }
1070:       }
1071:     }  else if (addv == ADD_VALUES) {
1072:       for (i=0; i<n; i++) {
1073:         il = fslots[i]; ir = tslots[i];
1074:         yv[il]   += xv[ir];
1075:         yv[il+1] += xv[ir+1];
1076:         yv[il+2] += xv[ir+2];
1077:         yv[il+3] += xv[ir+3];
1078:         yv[il+4] += xv[ir+4];
1079:       }
1080: #if !defined(PETSC_USE_COMPLEX)
1081:     }  else if (addv == MAX_VALUES) {
1082:       for (i=0; i<n; i++) {
1083:         il = fslots[i]; ir = tslots[i];
1084:         yv[il]   = PetscMax(yv[il],xv[ir]);
1085:         yv[il+1] = PetscMax(yv[il+1],xv[ir+1]);
1086:         yv[il+2] = PetscMax(yv[il+2],xv[ir+2]);
1087:         yv[il+3] = PetscMax(yv[il+3],xv[ir+3]);
1088:         yv[il+4] = PetscMax(yv[il+4],xv[ir+4]);
1089:       }
1090: #endif
1091:     }  else {SETERRQ(1,"Wrong insert option");}
1092:   }
1093:   VecRestoreArray(xin,&xv);
1094:   if (xin != yin) {VecRestoreArray(yin,&yv);}
1095:   return(0);
1096: }

1098: /* --------------------------------------------------------------------------------------*/

1102: int VecScatterEnd_PtoP_5(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1103: {
1104:   VecScatter_MPI_General *gen_to,*gen_from;
1105:   PetscScalar            *rvalues,*yv,*val;
1106:   int                    ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices,idx;
1107:   MPI_Request            *rwaits,*swaits;
1108:   MPI_Status             rstatus,*sstatus;

1111:   if (mode & SCATTER_LOCAL) return(0);
1112:   VecGetArray(yin,&yv);

1114:   if (mode & SCATTER_REVERSE) {
1115:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
1116:     gen_from = (VecScatter_MPI_General*)ctx->todata;
1117:     rwaits   = gen_from->rev_requests;
1118:     swaits   = gen_to->rev_requests;
1119:     sstatus  = gen_from->sstatus;
1120:   } else {
1121:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
1122:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1123:     rwaits   = gen_from->requests;
1124:     swaits   = gen_to->requests;
1125:     sstatus  = gen_to->sstatus;
1126:   }
1127:   rvalues  = gen_from->values;
1128:   nrecvs   = gen_from->n;
1129:   nsends   = gen_to->n;
1130:   indices  = gen_from->indices;
1131:   rstarts  = gen_from->starts;

1133:   /*  wait on receives */
1134:   count = nrecvs;
1135:   while (count) {
1136:     MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus);
1137:     /* unpack receives into our local space */
1138:     val      = rvalues + 5*rstarts[imdex];
1139:     lindices = indices + rstarts[imdex];
1140:     n        = rstarts[imdex+1] - rstarts[imdex];
1141:     if (addv == INSERT_VALUES) {
1142:       for (i=0; i<n; i++) {
1143:         idx       = lindices[i];
1144:         yv[idx]   = val[0];
1145:         yv[idx+1] = val[1];
1146:         yv[idx+2] = val[2];
1147:         yv[idx+3] = val[3];
1148:         yv[idx+4] = val[4];
1149:         val      += 5;
1150:       }
1151:     } else if (addv == ADD_VALUES) {
1152:       for (i=0; i<n; i++) {
1153:         idx       = lindices[i];
1154:         yv[idx]   += val[0];
1155:         yv[idx+1] += val[1];
1156:         yv[idx+2] += val[2];
1157:         yv[idx+3] += val[3];
1158:         yv[idx+4] += val[4];
1159:         val       += 5;
1160:       }
1161: #if !defined(PETSC_USE_COMPLEX)
1162:     } else if (addv == MAX_VALUES) {
1163:       for (i=0; i<n; i++) {
1164:         idx       = lindices[i];
1165:         yv[idx]   = PetscMax(yv[idx],val[0]);
1166:         yv[idx+1] = PetscMax(yv[idx+1],val[1]);
1167:         yv[idx+2] = PetscMax(yv[idx+2],val[2]);
1168:         yv[idx+3] = PetscMax(yv[idx+3],val[3]);
1169:         yv[idx+4] = PetscMax(yv[idx+4],val[4]);
1170:         val       += 5;
1171:       }
1172: #endif
1173:     }  else {SETERRQ(1,"Wrong insert option");}
1174:     count--;
1175:   }
1176:   /* wait on sends */
1177:   if (nsends) {
1178:     MPI_Waitall(nsends,swaits,sstatus);
1179:   }
1180:   VecRestoreArray(yin,&yv);
1181:   return(0);
1182: }

1184: /* --------------------------------------------------------------------------------------*/

1188: int VecScatterBegin_PtoP_4(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1189: {
1190:   VecScatter_MPI_General *gen_to,*gen_from;
1191:   PetscScalar            *xv,*yv,*val,*svalues;
1192:   MPI_Request            *rwaits,*swaits;
1193:   int                    *indices,*sstarts,iend,i,j,nrecvs,nsends,idx,ierr,len;

1196:   VecGetArray(xin,&xv);
1197:   if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}

1199:   if (mode & SCATTER_REVERSE) {
1200:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
1201:     gen_from = (VecScatter_MPI_General*)ctx->todata;
1202:     rwaits   = gen_from->rev_requests;
1203:     swaits   = gen_to->rev_requests;
1204:   } else {
1205:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
1206:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1207:     rwaits   = gen_from->requests;
1208:     swaits   = gen_to->requests;
1209:   }
1210:   svalues  = gen_to->values;
1211:   nrecvs   = gen_from->n;
1212:   nsends   = gen_to->n;
1213:   indices  = gen_to->indices;
1214:   sstarts  = gen_to->starts;

1216:   if (!(mode & SCATTER_LOCAL)) {

1218:     if (!gen_from->use_readyreceiver && !gen_to->sendfirst) {
1219:       /* post receives since they were not posted in VecScatterPostRecvs()   */
1220:       MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
1221:     }

1223:     if (ctx->packtogether) {
1224:       /* this version packs all the messages together and sends, when -vecscatter_packtogether used */
1225:       len  = 4*sstarts[nsends];
1226:       val  = svalues;
1227:       for (i=0; i<len; i += 4) {
1228:         idx    = *indices++;
1229:         val[0] = xv[idx];
1230:         val[1] = xv[idx+1];
1231:         val[2] = xv[idx+2];
1232:         val[3] = xv[idx+3];
1233:         val    += 4;
1234:       }
1235:       MPI_Startall_isend(len,nsends,swaits);
1236:     } else {
1237:       /* this version packs and sends one at a time, default */
1238:       val  = svalues;
1239:       for (i=0; i<nsends; i++) {
1240:         iend = sstarts[i+1]-sstarts[i];

1242:         for (j=0; j<iend; j++) {
1243:           idx     = *indices++;
1244:           val[0] = xv[idx];
1245:           val[1] = xv[idx+1];
1246:           val[2] = xv[idx+2];
1247:           val[3] = xv[idx+3];
1248:           val    += 4;
1249:         }
1250:         MPI_Start_isend(4*iend,swaits+i);
1251:       }
1252:     }

1254:     if (!gen_from->use_readyreceiver && gen_to->sendfirst) {
1255:       /* post receives since they were not posted in VecScatterPostRecvs()   */
1256:       MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
1257:     }
1258:   }

1260:   /* take care of local scatters */
1261:   if (gen_to->local.n) {
1262:     int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
1263:     int n       = gen_to->local.n,il,ir;
1264:     if (addv == INSERT_VALUES) {
1265:       if (gen_to->local.is_copy) {
1266:         PetscMemcpy(yv+gen_from->local.copy_start,xv+gen_to->local.copy_start,gen_to->local.copy_length);
1267:       } else {
1268:         for (i=0; i<n; i++) {
1269:           il = fslots[i]; ir = tslots[i];
1270:           yv[il]   = xv[ir];
1271:           yv[il+1] = xv[ir+1];
1272:           yv[il+2] = xv[ir+2];
1273:           yv[il+3] = xv[ir+3];
1274:         }
1275:       }
1276:     }  else if (addv == ADD_VALUES) {
1277:       for (i=0; i<n; i++) {
1278:         il = fslots[i]; ir = tslots[i];
1279:         yv[il]   += xv[ir];
1280:         yv[il+1] += xv[ir+1];
1281:         yv[il+2] += xv[ir+2];
1282:         yv[il+3] += xv[ir+3];
1283:       }
1284: #if !defined(PETSC_USE_COMPLEX)
1285:     }  else if (addv == MAX_VALUES) {
1286:       for (i=0; i<n; i++) {
1287:         il = fslots[i]; ir = tslots[i];
1288:         yv[il]   = PetscMax(yv[il],xv[ir]);
1289:         yv[il+1] = PetscMax(yv[il+1],xv[ir+1]);
1290:         yv[il+2] = PetscMax(yv[il+2],xv[ir+2]);
1291:         yv[il+3] = PetscMax(yv[il+3],xv[ir+3]);
1292:       }
1293: #endif
1294:     }  else {SETERRQ(1,"Wrong insert option");}
1295:   }
1296:   VecRestoreArray(xin,&xv);
1297:   if (xin != yin) {VecRestoreArray(yin,&yv);}
1298:   return(0);
1299: }

1301: /* --------------------------------------------------------------------------------------*/

1305: int VecScatterEnd_PtoP_4(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1306: {
1307:   VecScatter_MPI_General *gen_to,*gen_from;
1308:   PetscScalar            *rvalues,*yv,*val;
1309:   int                    ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices,idx;
1310:   MPI_Request            *rwaits,*swaits;
1311:   MPI_Status             *rstatus,*sstatus;

1314:   if (mode & SCATTER_LOCAL) return(0);
1315:   VecGetArray(yin,&yv);

1317:   if (mode & SCATTER_REVERSE) {
1318:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
1319:     gen_from = (VecScatter_MPI_General*)ctx->todata;
1320:     rwaits   = gen_from->rev_requests;
1321:     swaits   = gen_to->rev_requests;
1322:     sstatus  = gen_from->sstatus;
1323:     rstatus  = gen_from->rstatus;
1324:   } else {
1325:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
1326:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1327:     rwaits   = gen_from->requests;
1328:     swaits   = gen_to->requests;
1329:     sstatus  = gen_to->sstatus;
1330:     rstatus  = gen_to->rstatus;
1331:   }
1332:   rvalues  = gen_from->values;
1333:   nrecvs   = gen_from->n;
1334:   nsends   = gen_to->n;
1335:   indices  = gen_from->indices;
1336:   rstarts  = gen_from->starts;

1338:   /*  wait on receives */
1339:   count = nrecvs;
1340:   if (ctx->packtogether) { /* receive all messages, then unpack all, when -vecscatter_packtogether used */
1341:     MPI_Waitall(nrecvs,rwaits,rstatus);
1342:     n        = rstarts[count];
1343:     val      = rvalues;
1344:     lindices = indices;
1345:     if (addv == INSERT_VALUES) {
1346:       for (i=0; i<n; i++) {
1347:         idx       = lindices[i];
1348:         yv[idx]   = val[0];
1349:         yv[idx+1] = val[1];
1350:         yv[idx+2] = val[2];
1351:         yv[idx+3] = val[3];
1352:         val      += 4;
1353:       }
1354:     } else if (addv == ADD_VALUES) {
1355:       for (i=0; i<n; i++) {
1356:         idx       = lindices[i];
1357:         yv[idx]   += val[0];
1358:         yv[idx+1] += val[1];
1359:         yv[idx+2] += val[2];
1360:         yv[idx+3] += val[3];
1361:         val       += 4;
1362:       }
1363: #if !defined(PETSC_USE_COMPLEX)
1364:     } else if (addv == MAX_VALUES) {
1365:       for (i=0; i<n; i++) {
1366:         idx       = lindices[i];
1367:         yv[idx]   = PetscMax(yv[idx],val[0]);
1368:         yv[idx+1] = PetscMax(yv[idx+1],val[1]);
1369:         yv[idx+2] = PetscMax(yv[idx+2],val[2]);
1370:         yv[idx+3] = PetscMax(yv[idx+3],val[3]);
1371:         val       += 4;
1372:       }
1373: #endif
1374:     }  else {SETERRQ(1,"Wrong insert option");}
1375:   } else { /* unpack each message as it arrives, default version */
1376:     while (count) {
1377:       MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus[0]);
1378:       /* unpack receives into our local space */
1379:       val      = rvalues + 4*rstarts[imdex];
1380:       lindices = indices + rstarts[imdex];
1381:       n        = rstarts[imdex+1] - rstarts[imdex];
1382:       if (addv == INSERT_VALUES) {
1383:         for (i=0; i<n; i++) {
1384:           idx       = lindices[i];
1385:           yv[idx]   = val[0];
1386:           yv[idx+1] = val[1];
1387:           yv[idx+2] = val[2];
1388:           yv[idx+3] = val[3];
1389:           val      += 4;
1390:         }
1391:       } else if (addv == ADD_VALUES) {
1392:         for (i=0; i<n; i++) {
1393:           idx       = lindices[i];
1394:           yv[idx]   += val[0];
1395:           yv[idx+1] += val[1];
1396:           yv[idx+2] += val[2];
1397:           yv[idx+3] += val[3];
1398:           val       += 4;
1399:         }
1400: #if !defined(PETSC_USE_COMPLEX)
1401:       } else if (addv == MAX_VALUES) {
1402:         for (i=0; i<n; i++) {
1403:           idx       = lindices[i];
1404:           yv[idx]   = PetscMax(yv[idx],val[0]);
1405:           yv[idx+1] = PetscMax(yv[idx+1],val[1]);
1406:           yv[idx+2] = PetscMax(yv[idx+2],val[2]);
1407:           yv[idx+3] = PetscMax(yv[idx+3],val[3]);
1408:           val       += 4;
1409:         }
1410: #endif
1411:       }  else {SETERRQ(1,"Wrong insert option");}
1412:       count--;
1413:     }
1414:   }

1416:   /* wait on sends */
1417:   if (nsends) {
1418:     MPI_Waitall(nsends,swaits,sstatus);
1419:   }
1420:   VecRestoreArray(yin,&yv);
1421:   return(0);
1422: }

1424: /* --------------------------------------------------------------------------------------*/

1428: int VecScatterBegin_PtoP_3(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1429: {
1430:   VecScatter_MPI_General *gen_to,*gen_from;
1431:   PetscScalar            *xv,*yv,*val,*svalues;
1432:   MPI_Request            *rwaits,*swaits;
1433:   int                    ierr,i,*indices,*sstarts,iend,j,nrecvs,nsends,idx;

1436:   VecGetArray(xin,&xv);
1437:   if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}

1439:   if (mode & SCATTER_REVERSE) {
1440:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
1441:     gen_from = (VecScatter_MPI_General*)ctx->todata;
1442:     rwaits   = gen_from->rev_requests;
1443:     swaits   = gen_to->rev_requests;
1444:   } else {
1445:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
1446:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1447:     rwaits   = gen_from->requests;
1448:     swaits   = gen_to->requests;
1449:   }
1450:   svalues  = gen_to->values;
1451:   nrecvs   = gen_from->n;
1452:   nsends   = gen_to->n;
1453:   indices  = gen_to->indices;
1454:   sstarts  = gen_to->starts;

1456:   if (!(mode & SCATTER_LOCAL)) {

1458:     if (gen_to->sendfirst) {
1459:       /* this version packs and sends one at a time */
1460:       val  = svalues;
1461:       for (i=0; i<nsends; i++) {
1462:         iend = sstarts[i+1]-sstarts[i];

1464:         for (j=0; j<iend; j++) {
1465:           idx     = *indices++;
1466:           val[0] = xv[idx];
1467:           val[1] = xv[idx+1];
1468:           val[2] = xv[idx+2];
1469:           val    += 3;
1470:         }
1471:         MPI_Start_isend(3*iend,swaits+i);
1472:       }
1473:     }

1475:     if (!gen_from->use_readyreceiver) {
1476:       /* post receives since they were not posted in VecScatterPostRecvs()   */
1477:       MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
1478:     }

1480:     if (!gen_to->sendfirst) {
1481:       /* this version packs all the messages together and sends */
1482:       /*
1483:       len  = 3*sstarts[nsends];
1484:       val  = svalues;
1485:       for (i=0; i<len; i += 3) {
1486:         idx     = *indices++;
1487:         val[0] = xv[idx];
1488:         val[1] = xv[idx+1];
1489:         val[2] = xv[idx+2];
1490:         val      += 3;
1491:       }
1492:       MPI_Startall_isend(len,nsends,swaits);
1493:       */

1495:       /* this version packs and sends one at a time */
1496:       val  = svalues;
1497:       for (i=0; i<nsends; i++) {
1498:         iend = sstarts[i+1]-sstarts[i];

1500:         for (j=0; j<iend; j++) {
1501:           idx     = *indices++;
1502:           val[0] = xv[idx];
1503:           val[1] = xv[idx+1];
1504:           val[2] = xv[idx+2];
1505:           val    += 3;
1506:         }
1507:         MPI_Start_isend(3*iend,swaits+i);
1508:       }
1509:     }
1510:   }

1512:   /* take care of local scatters */
1513:   if (gen_to->local.n) {
1514:     int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
1515:     int n       = gen_to->local.n,il,ir;
1516:     if (addv == INSERT_VALUES) {
1517:       if (gen_to->local.is_copy) {
1518:         PetscMemcpy(yv + gen_from->local.copy_start,xv + gen_to->local.copy_start,gen_to->local.copy_length);
1519:       } else {
1520:         for (i=0; i<n; i++) {
1521:           il = fslots[i]; ir = tslots[i];
1522:           yv[il]   = xv[ir];
1523:           yv[il+1] = xv[ir+1];
1524:           yv[il+2] = xv[ir+2];
1525:           yv[il+3] = xv[ir+3];
1526:         }
1527:       }
1528:     }  else if (addv == ADD_VALUES) {
1529:       for (i=0; i<n; i++) {
1530:         il = fslots[i]; ir = tslots[i];
1531:         yv[il]   += xv[ir];
1532:         yv[il+1] += xv[ir+1];
1533:         yv[il+2] += xv[ir+2];
1534:         yv[il+3] += xv[ir+3];
1535:       }
1536: #if !defined(PETSC_USE_COMPLEX)
1537:     }  else if (addv == MAX_VALUES) {
1538:       for (i=0; i<n; i++) {
1539:         il = fslots[i]; ir = tslots[i];
1540:         yv[il]   = PetscMax(yv[il],xv[ir]);
1541:         yv[il+1] = PetscMax(yv[il+1],xv[ir+1]);
1542:         yv[il+2] = PetscMax(yv[il+2],xv[ir+2]);
1543:         yv[il+3] = PetscMax(yv[il+3],xv[ir+3]);
1544:       }
1545: #endif
1546:     }  else {SETERRQ(1,"Wrong insert option");}
1547:   }
1548:   VecRestoreArray(xin,&xv);
1549:   if (xin != yin) {VecRestoreArray(yin,&yv);}
1550:   return(0);
1551: }
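/*
   A generic sketch of the pack-and-send pattern that the blocked Begin routines
   unroll (assuming xv, svalues, indices, sstarts, nsends, swaits and bs have the
   meanings used above):

      val = svalues;
      for (i=0; i<nsends; i++) {
        iend = sstarts[i+1]-sstarts[i];
        for (j=0; j<iend; j++) {
          idx = *indices++;
          for (k=0; k<bs; k++) val[k] = xv[idx+k];
          val += bs;
        }
        MPI_Start_isend(bs*iend,swaits+i);
      }
*/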

1553: /* --------------------------------------------------------------------------------------*/

1557: int VecScatterEnd_PtoP_3(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1558: {
1559:   VecScatter_MPI_General *gen_to,*gen_from;
1560:   PetscScalar            *rvalues,*yv,*val;
1561:   int                    ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices,idx;
1562:   MPI_Request            *rwaits,*swaits;
1563:   MPI_Status             rstatus,*sstatus;

1566:   if (mode & SCATTER_LOCAL) return(0);
1567:   VecGetArray(yin,&yv);

1569:   if (mode & SCATTER_REVERSE) {
1570:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
1571:     gen_from = (VecScatter_MPI_General*)ctx->todata;
1572:     rwaits   = gen_from->rev_requests;
1573:     swaits   = gen_to->rev_requests;
1574:     sstatus  = gen_from->sstatus;
1575:   } else {
1576:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
1577:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1578:     rwaits   = gen_from->requests;
1579:     swaits   = gen_to->requests;
1580:     sstatus  = gen_to->sstatus;
1581:   }
1582:   rvalues  = gen_from->values;
1583:   nrecvs   = gen_from->n;
1584:   nsends   = gen_to->n;
1585:   indices  = gen_from->indices;
1586:   rstarts  = gen_from->starts;

1588:   /*  wait on receives */
1589:   count = nrecvs;
1590:   while (count) {
1591:     MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus);
1592:     /* unpack receives into our local space */
1593:     val      = rvalues + 3*rstarts[imdex];
1594:     lindices = indices + rstarts[imdex];
1595:     n        = rstarts[imdex+1] - rstarts[imdex];
1596:     if (addv == INSERT_VALUES) {
1597:       for (i=0; i<n; i++) {
1598:         idx       = lindices[i];
1599:         yv[idx]   = val[0];
1600:         yv[idx+1] = val[1];
1601:         yv[idx+2] = val[2];
1602:         val      += 3;
1603:       }
1604:     } else if (addv == ADD_VALUES) {
1605:       for (i=0; i<n; i++) {
1606:         idx       = lindices[i];
1607:         yv[idx]   += val[0];
1608:         yv[idx+1] += val[1];
1609:         yv[idx+2] += val[2];
1610:         val       += 3;
1611:       }
1612: #if !defined(PETSC_USE_COMPLEX)
1613:     } else if (addv == MAX_VALUES) {
1614:       for (i=0; i<n; i++) {
1615:         idx       = lindices[i];
1616:         yv[idx]   = PetscMax(yv[idx],val[0]);
1617:         yv[idx+1] = PetscMax(yv[idx+1],val[1]);
1618:         yv[idx+2] = PetscMax(yv[idx+2],val[2]);
1619:         val       += 3;
1620:       }
1621: #endif
1622:     }  else {SETERRQ(1,"Wrong insert option");}
1623:     count--;
1624:   }
1625:   /* wait on sends */
1626:   if (nsends) {
1627:     MPI_Waitall(nsends,swaits,sstatus);
1628:   }
1629:   VecRestoreArray(yin,&yv);
1630:   return(0);
1631: }

1633: /* --------------------------------------------------------------------------------------*/

1637: int VecScatterBegin_PtoP_2(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1638: {
1639:   VecScatter_MPI_General *gen_to,*gen_from;
1640:   PetscScalar            *xv,*yv,*val,*svalues;
1641:   MPI_Request            *rwaits,*swaits;
1642:   int                    ierr,i,*indices,*sstarts,iend,j,nrecvs,nsends,idx;

1645:   VecGetArray(xin,&xv);
1646:   if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}
1647:   if (mode & SCATTER_REVERSE) {
1648:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
1649:     gen_from = (VecScatter_MPI_General*)ctx->todata;
1650:     rwaits   = gen_from->rev_requests;
1651:     swaits   = gen_to->rev_requests;
1652:   } else {
1653:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
1654:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1655:     rwaits   = gen_from->requests;
1656:     swaits   = gen_to->requests;
1657:   }
1658:   svalues  = gen_to->values;
1659:   nrecvs   = gen_from->n;
1660:   nsends   = gen_to->n;
1661:   indices  = gen_to->indices;
1662:   sstarts  = gen_to->starts;

1664:   if (!(mode & SCATTER_LOCAL)) {

1666:     if (gen_to->sendfirst) {
1667:       /* this version packs and sends one at a time */
1668:       val  = svalues;
1669:       for (i=0; i<nsends; i++) {
1670:         iend = sstarts[i+1]-sstarts[i];

1672:         for (j=0; j<iend; j++) {
1673:           idx     = *indices++;
1674:           val[0] = xv[idx];
1675:           val[1] = xv[idx+1];
1676:           val    += 2;
1677:         }
1678:         MPI_Start_isend(2*iend,swaits+i);
1679:       }
1680:     }

1682:     if (!gen_from->use_readyreceiver) {
1683:       /* post receives since they were not posted in VecScatterPostRecvs()   */
1684:       MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
1685:     }

1687:     if (!gen_to->sendfirst) {
1688:       /* this version packs all the messages together and sends */
1689:       /*
1690:       len  = 2*sstarts[nsends];
1691:       val  = svalues;
1692:       for (i=0; i<len; i += 2) {
1693:         idx     = *indices++;
1694:         val[0] = xv[idx];
1695:         val[1] = xv[idx+1];
1696:         val      += 2;
1697:       }
1698:       MPI_Startall_isend(len,nsends,swaits);
1699:       */

1701:       /* this version packs and sends one at a time */
1702:       val  = svalues;
1703:       for (i=0; i<nsends; i++) {
1704:         iend = sstarts[i+1]-sstarts[i];

1706:         for (j=0; j<iend; j++) {
1707:           idx     = *indices++;
1708:           val[0] = xv[idx];
1709:           val[1] = xv[idx+1];
1710:           val    += 2;
1711:         }
1712:         MPI_Start_isend(2*iend,swaits+i);
1713:       }
1714:     }
1715:   }

1717:   /* take care of local scatters */
1718:   if (gen_to->local.n) {
1719:     int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
1720:     int n       = gen_to->local.n,il,ir;
1721:     if (addv == INSERT_VALUES) {
1722:       if (gen_to->local.is_copy) {
1723:         PetscMemcpy(yv + gen_from->local.copy_start,xv + gen_to->local.copy_start,gen_to->local.copy_length);
1724:       } else {
1725:         for (i=0; i<n; i++) {
1726:           il = fslots[i]; ir = tslots[i];
1727:           yv[il]   = xv[ir];
1728:           yv[il+1] = xv[ir+1];
1729:         }
1730:       }
1731:     }  else if (addv == ADD_VALUES) {
1732:       for (i=0; i<n; i++) {
1733:         il = fslots[i]; ir = tslots[i];
1734:         yv[il]   += xv[ir];
1735:         yv[il+1] += xv[ir+1];
1736:       }
1737: #if !defined(PETSC_USE_COMPLEX)
1738:     }  else if (addv == MAX_VALUES) {
1739:       for (i=0; i<n; i++) {
1740:         il = fslots[i]; ir = tslots[i];
1741:         yv[il]   = PetscMax(yv[il],xv[ir]);
1742:         yv[il+1] = PetscMax(yv[il+1],xv[ir+1]);
1743:       }
1744: #endif
1745:     }  else {SETERRQ(1,"Wrong insert option");}
1746:   }
1747:   VecRestoreArray(xin,&xv);
1748:   if (xin != yin) {VecRestoreArray(yin,&yv);}
1749:   return(0);
1750: }

1752: /* --------------------------------------------------------------------------------------*/

1756: int VecScatterEnd_PtoP_2(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1757: {
1758:   VecScatter_MPI_General *gen_to,*gen_from;
1759:   PetscScalar            *rvalues,*yv,*val;
1760:   int                    ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices,idx;
1761:   MPI_Request            *rwaits,*swaits;
1762:   MPI_Status             rstatus,*sstatus;

1765:   if (mode & SCATTER_LOCAL) return(0);
1766:   VecGetArray(yin,&yv);

1768:   if (mode & SCATTER_REVERSE) {
1769:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
1770:     gen_from = (VecScatter_MPI_General*)ctx->todata;
1771:     rwaits   = gen_from->rev_requests;
1772:     swaits   = gen_to->rev_requests;
1773:     sstatus  = gen_from->sstatus;
1774:   } else {
1775:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
1776:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1777:     rwaits   = gen_from->requests;
1778:     swaits   = gen_to->requests;
1779:     sstatus  = gen_to->sstatus;
1780:   }
1781:   rvalues  = gen_from->values;
1782:   nrecvs   = gen_from->n;
1783:   nsends   = gen_to->n;
1784:   indices  = gen_from->indices;
1785:   rstarts  = gen_from->starts;

1787:   /*  wait on receives */
1788:   count = nrecvs;
1789:   while (count) {
1790:     MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus);
1791:     /* unpack receives into our local space */
1792:     val      = rvalues + 2*rstarts[imdex];
1793:     lindices = indices + rstarts[imdex];
1794:     n        = rstarts[imdex+1] - rstarts[imdex];
1795:     if (addv == INSERT_VALUES) {
1796:       for (i=0; i<n; i++) {
1797:         idx       = lindices[i];
1798:         yv[idx]   = val[0];
1799:         yv[idx+1] = val[1];
1800:         val      += 2;
1801:       }
1802:     } else if (addv == ADD_VALUES) {
1803:       for (i=0; i<n; i++) {
1804:         idx       = lindices[i];
1805:         yv[idx]   += val[0];
1806:         yv[idx+1] += val[1];
1807:         val       += 2;
1808:       }
1809: #if !defined(PETSC_USE_COMPLEX)
1810:     } else if (addv == MAX_VALUES) {
1811:       for (i=0; i<n; i++) {
1812:         idx       = lindices[i];
1813:         yv[idx]   = PetscMax(yv[idx],val[0]);
1814:         yv[idx+1] = PetscMax(yv[idx+1],val[1]);
1815:         val       += 2;
1816:       }
1817: #endif
1818:     }  else {SETERRQ(1,"Wrong insert option");}
1819:     count--;
1820:   }
1821:   /* wait on sends */
1822:   if (nsends) {
1823:     MPI_Waitall(nsends,swaits,sstatus);
1824:   }
1825:   VecRestoreArray(yin,&yv);
1826:   return(0);
1827: }

1829: /* ---------------------------------------------------------------------------------*/

1833: int VecScatterDestroy_PtoP_X(VecScatter ctx)
1834: {
1835:   VecScatter_MPI_General *gen_to   = (VecScatter_MPI_General*)ctx->todata;
1836:   VecScatter_MPI_General *gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1837:   int                    i,ierr;

1840:   if (gen_to->use_readyreceiver) {
1841:     /*
1842:        Since we have already posted the receives (ready-receiver mode) we
1843:        must cancel them before freeing the requests
1844:     */
1845:     for (i=0; i<gen_from->n; i++) {
1846:       MPI_Cancel(gen_from->requests+i);
1847:     }
1848:   }

1850:   if (gen_to->local.slots) {PetscFree(gen_to->local.slots);}
1851:   if (gen_from->local.slots) {PetscFree(gen_from->local.slots);}
1852:   if (gen_to->local.slots_nonmatching) {PetscFree(gen_to->local.slots_nonmatching);}
1853:   if (gen_from->local.slots_nonmatching) {PetscFree(gen_from->local.slots_nonmatching);}

1855:   /* release MPI resources obtained with MPI_Send_init() and MPI_Recv_init() */
1856:   /* 
1857:      IBM's PE version of MPI has a bug where freeing these requests corrupts
1858:      later message passing.
1859:   */
1860: #if !defined(PETSC_HAVE_BROKEN_REQUEST_FREE)
1861:   for (i=0; i<gen_to->n; i++) {
1862:     MPI_Request_free(gen_to->requests + i);
1863:     MPI_Request_free(gen_to->rev_requests + i);
1864:   }

1866:   /*
1867:       MPICH could not properly cancel requests, so in ready-receiver mode we
1868:     cannot free the requests. This may be fixed now; if not, wrap the following
1869:     code in an if (!gen_to->use_readyreceiver) { ... } block.
1870:   */
1871:   for (i=0; i<gen_from->n; i++) {
1872:     MPI_Request_free(gen_from->requests + i);
1873:     MPI_Request_free(gen_from->rev_requests + i);
1874:   }
1875: #endif
1876: 
1877:   PetscFree(gen_to->sstatus);
1878:   PetscFree(gen_to->values);
1879:   PetscFree(gen_to->rev_requests);
1880:   PetscFree(gen_to);
1881:   PetscFree(gen_from->values);
1882:   PetscFree(gen_from->rev_requests);
1883:   PetscFree(gen_from);
1884:   PetscLogObjectDestroy(ctx);
1885:   PetscHeaderDestroy(ctx);
1886:   return(0);
1887: }
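/*
   Lifecycle of the persistent MPI requests handled in this file, in terms of the
   routines above and below (a summary, not additional code):

     MPI_Send_init()/MPI_Recv_init()/MPI_Ssend_init()/MPI_Rsend_init()
        - requests created in VecScatterCreate_PtoS()
     MPI_Start_isend()/MPI_Startall_irecv()
        - requests started in the Begin routines (or in the postrecvs routine
          for ready-receiver mode)
     MPI_Waitany()/MPI_Waitall()
        - requests completed in the End routines
     MPI_Request_free()
        - requests released in the destroy routine above
*/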

1889: /* ==========================================================================================*/

1891: /*              create parallel to sequential scatter context                           */
1892: /*
1893:    bs indicates how many elements there are in each block. Normally
1894:    this would be 1.
1895: */
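/*
   For bs > 1 each stored index is treated as the offset of the first entry of a
   block, and the blocked Begin/End routines move bs consecutive vector entries
   per index; e.g. for bs = 3 the pack step (unrolled in VecScatterBegin_PtoP_3())
   is

      idx    = *indices++;
      val[0] = xv[idx]; val[1] = xv[idx+1]; val[2] = xv[idx+2];
*/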
1898: int VecScatterCreate_PtoS(int nx,int *inidx,int ny,int *inidy,Vec xin,Vec yin,int bs,VecScatter ctx)
1899: {
1900:   VecScatter_MPI_General *from,*to;
1901:   int                    *source,*lens,rank,*owners;
1902:   int                    size,*lowner,*start,lengthy;
1903:   int                    *nprocs,i,j,n,idx,nsends,nrecvs;
1904:   int                    *owner,*starts,count,tag,slen,ierr;
1905:   int                    *rvalues,*svalues,base,imdex,nmax,*values,len,*indx,nprocslocal;
1906:   MPI_Comm               comm;
1907:   MPI_Request            *send_waits,*recv_waits;
1908:   MPI_Status             recv_status,*send_status;
1909:   PetscMap               map;
1910:   PetscTruth             found;
1911: 
1913:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
1914:   PetscObjectGetComm((PetscObject)xin,&comm);
1915:   MPI_Comm_rank(comm,&rank);
1916:   MPI_Comm_size(comm,&size);
1917:   VecGetPetscMap(xin,&map);
1918:   PetscMapGetGlobalRange(map,&owners);

1920:   VecGetSize(yin,&lengthy);

1922:   /*  first count number of contributors to each processor */
1923:   PetscMalloc(2*size*sizeof(int),&nprocs);
1924:   PetscMemzero(nprocs,2*size*sizeof(int));
1925:   PetscMalloc((nx+1)*sizeof(int),&owner);
1926:   for (i=0; i<nx; i++) {
1927:     idx = inidx[i];
1928:     found = PETSC_FALSE;
1929:     for (j=0; j<size; j++) {
1930:       if (idx >= owners[j] && idx < owners[j+1]) {
1931:         nprocs[2*j]++; nprocs[2*j+1] = 1; owner[i] = j; found = PETSC_TRUE; break;
1932:       }
1933:     }
1934:     if (!found) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"Index %d out of range",idx);
1935:   }
1936:   nprocslocal    = nprocs[2*rank];
1937:   nprocs[2*rank] = nprocs[2*rank+1] = 0;
1938:   nsends         = 0;  for (i=0; i<size; i++) { nsends += nprocs[2*i+1];}

1940:   /* inform other processors of number of messages and max length*/
1941:   PetscMaxSum(comm,nprocs,&nmax,&nrecvs);

1943:   /* post receives:   */
1944:   PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int),&rvalues);
1945:   PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&recv_waits);
1946:   for (i=0; i<nrecvs; i++) {
1947:     MPI_Irecv((rvalues+nmax*i),nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
1948:   }

1950:   /* do sends:
1951:       1) starts[i] gives the starting index in svalues for the indices going to
1952:          the ith processor
1953:   */
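  /*
      Illustrative example of the layout assumed here: with size = 3, rank = 1
      (whose own count was zeroed above) and
          nprocs[] = {2,1, 0,0, 3,1}     (count,flag pairs, one pair per process)
      the prefix sums below give starts[] = {0,2,2}, so the two indices destined
      for process 0 are packed into svalues[0..1], the three for process 2 into
      svalues[2..4], and nsends = 2 messages are started.
  */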
1954:   PetscMalloc((nx+1)*sizeof(int),&svalues);
1955:   PetscMalloc((nsends+1)*sizeof(MPI_Request),&send_waits);
1956:   PetscMalloc((size+1)*sizeof(int),&starts);
1957:   starts[0]  = 0;
1958:   for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
1959:   for (i=0; i<nx; i++) {
1960:     if (owner[i] != rank) {
1961:       svalues[starts[owner[i]]++] = inidx[i];
1962:     }
1963:   }

1965:   starts[0] = 0;
1966:   for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
1967:   count = 0;
1968:   for (i=0; i<size; i++) {
1969:     if (nprocs[2*i+1]) {
1970:       MPI_Isend(svalues+starts[i],nprocs[2*i],MPI_INT,i,tag,comm,send_waits+count++);
1971:     }
1972:   }
1973:   PetscFree(starts);

1975:   base = owners[rank];

1977:   /*  wait on receives */
1978:   PetscMalloc(2*(nrecvs+1)*sizeof(int),&lens);
1979:   source = lens + nrecvs;
1980:   count  = nrecvs;
1981:   slen   = 0;
1982:   while (count) {
1983:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
1984:     /* unpack receives into our local space */
1985:     MPI_Get_count(&recv_status,MPI_INT,&n);
1986:     source[imdex]  = recv_status.MPI_SOURCE;
1987:     lens[imdex]    = n;
1988:     slen          += n;
1989:     count--;
1990:   }
1991:   PetscFree(recv_waits);
1992: 
1993:   /* allocate entire send scatter context */
1994:   PetscNew(VecScatter_MPI_General,&to);
1995:   PetscMemzero(to,sizeof(VecScatter_MPI_General));
1996:   PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&to->sendfirst); /* must come after the memzero or the flag would be zeroed */
1997:   PetscLogObjectMemory(ctx,sizeof(VecScatter_MPI_General));
1998:   len = slen*sizeof(int) + bs*slen*sizeof(PetscScalar) + (nrecvs+1)*sizeof(int) +
1999:         nrecvs*(sizeof(int) + sizeof(MPI_Request));
2000:   to->n         = nrecvs;
2001:   PetscMalloc(len,&to->values);
2002:   PetscLogObjectMemory(ctx,len);
2003:   to->requests  = (MPI_Request*)(to->values + bs*slen);
2004:   to->indices   = (int*)(to->requests + nrecvs);
2005:   to->starts    = (int*)(to->indices + slen);
2006:   to->procs     = (int*)(to->starts + nrecvs + 1);
2007:   PetscMalloc(2*(PetscMax(nrecvs,nsends)+1)*sizeof(MPI_Status),&to->sstatus);
2008:   to->rstatus   = to->sstatus + PetscMax(nrecvs,nsends) + 1;
2009:   PetscLogObjectMemory(ctx,2*(PetscMax(nrecvs,nsends)+1)*sizeof(MPI_Status));
2010:   ctx->todata   = (void*)to;
2011:   to->starts[0] = 0;

2013:   if (nrecvs) {
2014:     PetscMalloc(nrecvs*sizeof(int),&indx);
2015:     for (i=0; i<nrecvs; i++) indx[i] = i;
2016:     PetscSortIntWithPermutation(nrecvs,source,indx);

2018:     /* move the data into the send scatter */
2019:     for (i=0; i<nrecvs; i++) {
2020:       to->starts[i+1] = to->starts[i] + lens[indx[i]];
2021:       to->procs[i]    = source[indx[i]];
2022:       values = rvalues + indx[i]*nmax;
2023:       for (j=0; j<lens[indx[i]]; j++) {
2024:         to->indices[to->starts[i] + j] = values[j] - base;
2025:       }
2026:     }
2027:     PetscFree(indx);
2028:   }
2029:   PetscFree(rvalues);
2030:   PetscFree(lens);
2031: 
2032:   /* allocate entire receive scatter context */
2033:   PetscNew(VecScatter_MPI_General,&from);
2034:   PetscMemzero(from,sizeof(VecScatter_MPI_General));
2035:   PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&from->sendfirst); /* must come after the memzero or the flag would be zeroed */
2036:   PetscLogObjectMemory(ctx,sizeof(VecScatter_MPI_General));
2037:   len = ny*sizeof(int) + ny*bs*sizeof(PetscScalar) + (nsends+1)*sizeof(int) +
2038:         nsends*(sizeof(int) + sizeof(MPI_Request));
2039:   from->n        = nsends;
2040:   PetscMalloc(len,&from->values);
2041:   PetscLogObjectMemory(ctx,len);
2042:   from->requests = (MPI_Request*)(from->values + bs*ny);
2043:   from->indices  = (int*)(from->requests + nsends);
2044:   from->starts   = (int*)(from->indices + ny);
2045:   from->procs    = (int*)(from->starts + nsends + 1);
2046:   ctx->fromdata  = (void*)from;

2048:   /* move data into receive scatter */
2049:   PetscMalloc((size+nsends+1)*sizeof(int),&lowner);
2050:   start = lowner + size;
2051:   count = 0; from->starts[0] = start[0] = 0;
2052:   for (i=0; i<size; i++) {
2053:     if (nprocs[2*i+1]) {
2054:       lowner[i]            = count;
2055:       from->procs[count++] = i;
2056:       from->starts[count]  = start[count] = start[count-1] + nprocs[2*i];
2057:     }
2058:   }
2059:   for (i=0; i<nx; i++) {
2060:     if (owner[i] != rank) {
2061:       from->indices[start[lowner[owner[i]]]++] = inidy[i];
2062:       if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
2063:     }
2064:   }
2065:   PetscFree(lowner);
2066:   PetscFree(owner);
2067:   PetscFree(nprocs);
2068: 
2069:   /* wait on sends */
2070:   if (nsends) {
2071:     PetscMalloc(nsends*sizeof(MPI_Status),&send_status);
2072:     MPI_Waitall(nsends,send_waits,send_status);
2073:     PetscFree(send_status);
2074:   }
2075:   PetscFree(send_waits);
2076:   PetscFree(svalues);

2078:   if (nprocslocal) {
2079:     int nt;
2080:     /* we have a scatter to ourselves */
2081:     from->local.n     = to->local.n = nt = nprocslocal;
2082:     PetscMalloc(nt*sizeof(int),&from->local.slots);
2083:     PetscMalloc(nt*sizeof(int),&to->local.slots);
2084:     PetscLogObjectMemory(ctx,2*nt*sizeof(int));
2085:     nt = 0;
2086:     for (i=0; i<nx; i++) {
2087:       idx = inidx[i];
2088:       if (idx >= owners[rank] && idx < owners[rank+1]) {
2089:         to->local.slots[nt]     = idx - owners[rank];
2090:         from->local.slots[nt++] = inidy[i];
2091:         if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
2092:       }
2093:     }
2094:   } else {
2095:     from->local.n     = 0;
2096:     from->local.slots = 0;
2097:     to->local.n       = 0;
2098:     to->local.slots   = 0;
2099:   }
2100:   from->local.nonmatching_computed = PETSC_FALSE;
2101:   from->local.n_nonmatching        = 0;
2102:   from->local.slots_nonmatching    = 0;
2103:   to->local.nonmatching_computed   = PETSC_FALSE;
2104:   to->local.n_nonmatching          = 0;
2105:   to->local.slots_nonmatching      = 0;

2107:   to->type   = VEC_SCATTER_MPI_GENERAL;
2108:   from->type = VEC_SCATTER_MPI_GENERAL;

2110:   from->bs = bs;
2111:   to->bs   = bs;
2112:   if (bs > 1) {
2113:     PetscTruth  flg,flgs = PETSC_FALSE;
2114:     int         *sstarts = to->starts,  *rstarts = from->starts;
2115:     int         *sprocs  = to->procs,   *rprocs  = from->procs;
2116:     MPI_Request *swaits  = to->requests,*rwaits  = from->requests;
2117:     MPI_Request *rev_swaits,*rev_rwaits;
2118:     PetscScalar *Ssvalues = to->values, *Srvalues = from->values;

2120:     ctx->destroy   = VecScatterDestroy_PtoP_X;
2121:     ctx->copy      = VecScatterCopy_PtoP_X;
2122:     ctx->view      = VecScatterView_MPI;
2123: 
2124:     tag      = ctx->tag;
2125:     comm     = ctx->comm;

2127:     /* allocate additional wait variables for the "reverse" scatter */
2128:     PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&rev_rwaits);
2129:     to->rev_requests   = rev_rwaits;
2130:     PetscMalloc((nsends+1)*sizeof(MPI_Request),&rev_swaits);
2131:     from->rev_requests = rev_swaits;
2132:     PetscLogObjectMemory(ctx,(nsends+nrecvs+2)*sizeof(MPI_Request));

2134:     /* Register the receives that you will use later (sends for scatter reverse) */
2135:     PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&flgs);
2136:     if (flgs) {
2137:       PetscLogInfo(0,"VecScatterCreate_PtoS:Using VecScatter Ssend mode\n");
2138:     }
2139:     for (i=0; i<from->n; i++) {
2140:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,
2141:                            comm,rwaits+i);
2142:       if (!flgs) {
2143:         MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,
2144:                              comm,rev_swaits+i);
2145:       } else {
2146:         MPI_Ssend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,
2147:                               comm,rev_swaits+i);
2148:       }
2149:     }

2151:     PetscOptionsHasName(PETSC_NULL,"-vecscatter_rr",&flg);
2152:     if (flg) {
2153:       ctx->postrecvs           = VecScatterPostRecvs_PtoP_X;
2154:       to->use_readyreceiver    = PETSC_TRUE;
2155:       from->use_readyreceiver  = PETSC_TRUE;
2156:       for (i=0; i<to->n; i++) {
2157:         MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
2158:                               comm,swaits+i);
2159:       }
2160:       PetscLogInfo(0,"VecScatterCreate_PtoS:Using VecScatter ready receiver mode\n");
2161:     } else {
2162:       ctx->postrecvs           = 0;
2163:       to->use_readyreceiver    = PETSC_FALSE;
2164:       from->use_readyreceiver  = PETSC_FALSE;
2165:       for (i=0; i<to->n; i++) {
2166:         if (!flgs) {
2167:           MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
2168:                                comm,swaits+i);
2169:         } else {
2170:           MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
2171:                                comm,swaits+i);
2172:         }
2173:       }
2174:     }
2175:     /* Register receives for scatter reverse */
2176:     for (i=0; i<to->n; i++) {
2177:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
2178:                            comm,rev_rwaits+i);
2179:     }

2181:     PetscLogInfo(0,"VecScatterCreate_PtoS:Using blocksize %d scatter\n",bs);
2182:     switch (bs) {
2183:     case 12:
2184:       ctx->begin     = VecScatterBegin_PtoP_12;
2185:       ctx->end       = VecScatterEnd_PtoP_12;
2186:       break;
2187:     case 5:
2188:       ctx->begin     = VecScatterBegin_PtoP_5;
2189:       ctx->end       = VecScatterEnd_PtoP_5;
2190:       break;
2191:     case 4:
2192:       ctx->begin     = VecScatterBegin_PtoP_4;
2193:       ctx->end       = VecScatterEnd_PtoP_4;
2194:       break;
2195:     case 3:
2196:       ctx->begin     = VecScatterBegin_PtoP_3;
2197:       ctx->end       = VecScatterEnd_PtoP_3;
2198:       break;
2199:     case 2:
2200:       ctx->begin     = VecScatterBegin_PtoP_2;
2201:       ctx->end       = VecScatterEnd_PtoP_2;
2202:       break;
2203:     default:
2204:       SETERRQ(PETSC_ERR_SUP,"Blocksize not supported");
2205:     }
2206:   } else {
2207:     ctx->postrecvs = 0;
2208:     ctx->destroy   = VecScatterDestroy_PtoP;
2209:     ctx->begin     = VecScatterBegin_PtoP;
2210:     ctx->end       = VecScatterEnd_PtoP;
2211:     ctx->copy      = VecScatterCopy_PtoP;
2212:     ctx->view      = VecScatterView_MPI;
2213:   }

2215:   /* Check if the local scatter is actually a copy; important special case */
2216:   if (nprocslocal) {
2217:     VecScatterLocalOptimizeCopy_Private(&to->local,&from->local,bs);
2218:   }

2220:   return(0);
2221: }
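/*
   A typical calling sequence that exercises the routines set up above (a sketch
   assuming the standard PETSc 2.x vector scatter interface):

      VecScatterCreate(x,isx,y,isy,&ctx);                        ends up here for parallel-to-sequential index sets
      VecScatterBegin(x,y,INSERT_VALUES,SCATTER_FORWARD,ctx);    invokes the ctx->begin chosen above
      VecScatterEnd(x,y,INSERT_VALUES,SCATTER_FORWARD,ctx);      invokes the ctx->end chosen above
      VecScatterDestroy(ctx);                                    invokes the ctx->destroy chosen above
*/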

2223: /* ------------------------------------------------------------------------------------*/
2224: /*
2225:          Scatter from local Seq vectors to a parallel vector. 
2226: */
2229: int VecScatterCreate_StoP(int nx,int *inidx,int ny,int *inidy,Vec yin,VecScatter ctx)
2230: {
2231:   VecScatter_MPI_General *from,*to;
2232:   int                    *source,nprocslocal,*lens,rank = yin->stash.rank,*owners = yin->map->range;
2233:   int                    ierr,size = yin->stash.size,*lowner,*start;
2234:   int                    *nprocs,i,j,n,idx,nsends,nrecvs;
2235:   int                    *owner,*starts,count,tag,slen;
2236:   int                    *rvalues,*svalues,base,imdex,nmax,*values,len;
2237:   PetscTruth             found;
2238:   MPI_Comm               comm = yin->comm;
2239:   MPI_Request            *send_waits,*recv_waits;
2240:   MPI_Status             recv_status,*send_status;

2243:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
2244:   /*  first count number of contributors to each processor */
2245:   PetscMalloc(2*size*sizeof(int),&nprocs);
2246:   PetscMemzero(nprocs,2*size*sizeof(int));
2247:   PetscMalloc((nx+1)*sizeof(int),&owner);
2248:   for (i=0; i<nx; i++) {
2249:     idx = inidy[i];
2250:     found = PETSC_FALSE;
2251:     for (j=0; j<size; j++) {
2252:       if (idx >= owners[j] && idx < owners[j+1]) {
2253:         nprocs[2*j]++; nprocs[2*j+1] = 1; owner[i] = j; found = PETSC_TRUE; break;
2254:       }
2255:     }
2256:     if (!found) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"Index %d out of range",idx);
2257:   }
2258:   nprocslocal    = nprocs[2*rank];
2259:   nprocs[2*rank] = nprocs[2*rank+1] = 0;
2260:   nsends = 0;  for (i=0; i<size; i++) { nsends += nprocs[2*i+1];}

2262:   /* inform other processors of number of messages and max length*/
2263:   PetscMaxSum(comm,nprocs,&nmax,&nrecvs);

2265:   /* post receives:   */
2266:   PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int),&rvalues);
2267:   PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&recv_waits);
2268:   for (i=0; i<nrecvs; i++) {
2269:     MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
2270:   }

2272:   /* do sends:
2273:       1) starts[i] gives the starting index in svalues for the indices going to
2274:          the ith processor
2275:   */
2276:   PetscMalloc((nx+1)*sizeof(int),&svalues);
2277:   PetscMalloc((nsends+1)*sizeof(MPI_Request),&send_waits);
2278:   PetscMalloc((size+1)*sizeof(int),&starts);
2279:   starts[0]  = 0;
2280:   for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
2281:   for (i=0; i<nx; i++) {
2282:     if (owner[i] != rank) {
2283:       svalues[starts[owner[i]]++] = inidy[i];
2284:     }
2285:   }

2287:   starts[0] = 0;
2288:   for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
2289:   count = 0;
2290:   for (i=0; i<size; i++) {
2291:     if (nprocs[2*i+1]) {
2292:       MPI_Isend(svalues+starts[i],nprocs[2*i],MPI_INT,i,tag,comm,send_waits+count);
2293:       count++;
2294:     }
2295:   }
2296:   PetscFree(starts);

2298:   /* allocate entire send scatter context */
2299:   PetscNew(VecScatter_MPI_General,&to);
2300:   PetscMemzero(to,sizeof(VecScatter_MPI_General));
2301:   PetscLogObjectMemory(ctx,sizeof(VecScatter_MPI_General));
2302:   PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&to->sendfirst);
2303:   len  = ny*(sizeof(int) + sizeof(PetscScalar)) + (nsends+1)*sizeof(int) +
2304:         nsends*(sizeof(int) + sizeof(MPI_Request));
2305:   to->n        = nsends;
2306:   PetscMalloc(len,&to->values);
2307:   PetscLogObjectMemory(ctx,len);
2308:   to->requests = (MPI_Request*)(to->values + ny);
2309:   to->indices  = (int*)(to->requests + nsends);
2310:   to->starts   = (int*)(to->indices + ny);
2311:   to->procs    = (int*)(to->starts + nsends + 1);
2312:   PetscMalloc(2*(PetscMax(nsends,nrecvs) + 1)*sizeof(MPI_Status),&to->sstatus); /* room for both sstatus and the rstatus set below */
2313:   to->rstatus  = to->sstatus + PetscMax(nsends,nrecvs) + 1;
2314:   PetscLogObjectMemory(ctx,2*(PetscMax(nsends,nrecvs) + 1)*sizeof(MPI_Status));
2315:   ctx->todata  = (void*)to;

2317:   /* move data into send scatter context */
2318:   PetscMalloc((size+nsends+1)*sizeof(int),&lowner);
2319:   start         = lowner + size;
2320:   count         = 0;
2321:   to->starts[0] = start[0] = 0;
2322:   for (i=0; i<size; i++) {
2323:     if (nprocs[2*i+1]) {
2324:       lowner[i]          = count;
2325:       to->procs[count++] = i;
2326:       to->starts[count]  = start[count] = start[count-1] + nprocs[2*i];
2327:     }
2328:   }
2329:   for (i=0; i<nx; i++) {
2330:     if (owner[i] != rank) {
2331:       to->indices[start[lowner[owner[i]]]++] = inidx[i];
2332:     }
2333:   }
2334:   PetscFree(lowner);
2335:   PetscFree(owner);
2336:   PetscFree(nprocs);

2338:   base = owners[rank];

2340:   /*  wait on receives */
2341:   PetscMalloc(2*(nrecvs+1)*sizeof(int),&lens);
2342:   source = lens + nrecvs;
2343:   count  = nrecvs; slen = 0;
2344:   while (count) {
2345:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2346:     /* unpack receives into our local space */
2347:     MPI_Get_count(&recv_status,MPI_INT,&n);
2348:     source[imdex]  = recv_status.MPI_SOURCE;
2349:     lens[imdex]    = n;
2350:     slen          += n;
2351:     count--;
2352:   }
2353:   PetscFree(recv_waits);
2354: 
2355:   /* allocate entire receive scatter context */
2356:   PetscNew(VecScatter_MPI_General,&from);
2357:   PetscMemzero(from,sizeof(VecScatter_MPI_General));
2358:   PetscLogObjectMemory(ctx,sizeof(VecScatter_MPI_General));
2359:   PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&from->sendfirst);
2360:   len  = slen*(sizeof(int) + sizeof(PetscScalar)) + (nrecvs+1)*sizeof(int) +
2361:           nrecvs*(sizeof(int) + sizeof(MPI_Request));
2362:   from->n        = nrecvs;
2363:   PetscMalloc(len,&from->values);
2364:   PetscLogObjectMemory(ctx,len);
2365:   from->requests = (MPI_Request*)(from->values + slen);
2366:   from->indices  = (int*)(from->requests + nrecvs);
2367:   from->starts   = (int*)(from->indices + slen);
2368:   from->procs    = (int*)(from->starts + nrecvs + 1);
2369:   ctx->fromdata  = (void*)from;

2371:   /* move the data into the receive scatter context*/
2372:   from->starts[0] = 0;
2373:   for (i=0; i<nrecvs; i++) {
2374:     from->starts[i+1] = from->starts[i] + lens[i];
2375:     from->procs[i]    = source[i];
2376:     values            = rvalues + i*nmax;
2377:     for (j=0; j<lens[i]; j++) {
2378:       from->indices[from->starts[i] + j] = values[j] - base;
2379:     }
2380:   }
2381:   PetscFree(rvalues);
2382:   PetscFree(lens);
2383: 
2384:   /* wait on sends */
2385:   if (nsends) {
2386:     PetscMalloc(nsends*sizeof(MPI_Status),&send_status);
2387:     MPI_Waitall(nsends,send_waits,send_status);
2388:     PetscFree(send_status);
2389:   }
2390:   PetscFree(send_waits);
2391:   PetscFree(svalues);

2393:   if (nprocslocal) {
2394:     int nt;
2395:     /* we have a scatter to ourselves */
2396:     from->local.n     = to->local.n = nt = nprocslocal;
2397:     PetscMalloc(nt*sizeof(int),&from->local.slots);
2398:     PetscMalloc(nt*sizeof(int),&to->local.slots);
2399:     PetscLogObjectMemory(ctx,2*nt*sizeof(int));
2400:     nt = 0;
2401:     for (i=0; i<ny; i++) {
2402:       idx = inidy[i];
2403:       if (idx >= owners[rank] && idx < owners[rank+1]) {
2404:         from->local.slots[nt] = idx - owners[rank];
2405:         to->local.slots[nt++] = inidx[i];
2406:       }
2407:     }
2408:   } else {
2409:     from->local.n     = 0;
2410:     from->local.slots = 0;
2411:     to->local.n       = 0;
2412:     to->local.slots   = 0;

2414:   }
2415:   from->local.nonmatching_computed = PETSC_FALSE;
2416:   from->local.n_nonmatching        = 0;
2417:   from->local.slots_nonmatching    = 0;
2418:   to->local.nonmatching_computed   = PETSC_FALSE;
2419:   to->local.n_nonmatching          = 0;
2420:   to->local.slots_nonmatching      = 0;

2422:   to->type   = VEC_SCATTER_MPI_GENERAL;
2423:   from->type = VEC_SCATTER_MPI_GENERAL;

2425:   ctx->destroy    = VecScatterDestroy_PtoP;
2426:   ctx->postrecvs  = 0;
2427:   ctx->begin      = VecScatterBegin_PtoP;
2428:   ctx->end        = VecScatterEnd_PtoP;
2429:   ctx->copy       = 0;
2430:   ctx->view       = VecScatterView_MPI;

2432:   to->bs   = 1;
2433:   from->bs = 1;
2434:   return(0);
2435: }

2437: /* ---------------------------------------------------------------------------------*/
2440: int VecScatterCreate_PtoP(int nx,int *inidx,int ny,int *inidy,Vec xin,Vec yin,VecScatter ctx)
2441: {
2442:   int         *lens,rank,*owners = xin->map->range,size;
2443:   int         *nprocs,i,j,n,idx,nsends,nrecvs,*local_inidx,*local_inidy;
2444:   int         *owner,*starts,count,tag,slen,ierr;
2445:   int         *rvalues,*svalues,base,imdex,nmax,*values;
2446:   MPI_Comm    comm;
2447:   MPI_Request *send_waits,*recv_waits;
2448:   MPI_Status  recv_status;
2449:   PetscTruth  duplicate = PETSC_FALSE,found;

2452:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
2453:   PetscObjectGetComm((PetscObject)xin,&comm);
2454:   MPI_Comm_size(comm,&size);
2455:   MPI_Comm_rank(comm,&rank);
2456:   if (size == 1) {
2457:     VecScatterCreate_StoP(nx,inidx,ny,inidy,yin,ctx);
2458:     return(0);
2459:   }

2461:   /*
2462:      Each processor ships off its inidx[j] and inidy[j] to the appropriate processor;
2463:      every processor then calls VecScatterCreate_StoP() with the received pairs.
2464:   */
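  /*
     Illustrative example: if this process holds the pair (inidx,inidy) = (7,42)
     and global index 7 is owned by process 2, the pair is packed below as
     svalues[2*k] = 7, svalues[2*k+1] = 42 and shipped to process 2, which
     recovers it as (local_inidx,local_inidy) = (7 - owners[2], 42) before
     calling VecScatterCreate_StoP().
  */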
2465:   /*  first count number of contributors to each processor */
2466:   PetscMalloc(2*size*sizeof(int),&nprocs);
2467:   PetscMemzero(nprocs,2*size*sizeof(int));
2468:   PetscMalloc((nx+1)*sizeof(int),&owner);
2469:   for (i=0; i<nx; i++) {
2470:     idx   = inidx[i];
2471:     found = PETSC_FALSE;
2472:     for (j=0; j<size; j++) {
2473:       if (idx >= owners[j] && idx < owners[j+1]) {
2474:         nprocs[2*j]++; nprocs[2*j+1] = 1; owner[i] = j; found = PETSC_TRUE; break;
2475:       }
2476:     }
2477:     if (!found) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"Index %d out of range",idx);
2478:   }
2479:   nsends = 0;  for (i=0; i<size; i++) { nsends += nprocs[2*i+1];}

2481:   /* inform other processors of number of messages and max length*/
2482:   PetscMaxSum(comm,nprocs,&nmax,&nrecvs);

2484:   /* post receives:   */
2485:   PetscMalloc(2*(nrecvs+1)*(nmax+1)*sizeof(int),&rvalues);
2486:   PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&recv_waits);
2487:   for (i=0; i<nrecvs; i++) {
2488:     MPI_Irecv(rvalues+2*nmax*i,2*nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
2489:   }

2491:   /* do sends:
2492:       1) starts[i] gives the starting index in svalues for the index pairs going to
2493:          the ith processor
2494:   */
2495:   PetscMalloc(2*(nx+1)*sizeof(int),&svalues);
2496:   PetscMalloc((nsends+1)*sizeof(MPI_Request),&send_waits);
2497:   PetscMalloc((size+1)*sizeof(int),&starts);
2498:   starts[0]  = 0;
2499:   for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
2500:   for (i=0; i<nx; i++) {
2501:     svalues[2*starts[owner[i]]]       = inidx[i];
2502:     svalues[1 + 2*starts[owner[i]]++] = inidy[i];
2503:   }
2504:   PetscFree(owner);

2506:   starts[0] = 0;
2507:   for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
2508:   count = 0;
2509:   for (i=0; i<size; i++) {
2510:     if (nprocs[2*i+1]) {
2511:       MPI_Isend(svalues+2*starts[i],2*nprocs[2*i],MPI_INT,i,tag,comm,send_waits+count);
2512:       count++;
2513:     }
2514:   }
2515:   PetscFree(starts);
2516:   PetscFree(nprocs);

2518:   base = owners[rank];

2520:   /*  wait on receives */
2521:   PetscMalloc(2*(nrecvs+1)*sizeof(int),&lens);
2522:   count = nrecvs;
2523:   slen  = 0;
2524:   while (count) {
2525:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2526:     /* unpack receives into our local space */
2527:     MPI_Get_count(&recv_status,MPI_INT,&n);
2528:     lens[imdex]  =  n/2;
2529:     slen         += n/2;
2530:     count--;
2531:   }
2532:   PetscFree(recv_waits);
2533: 
2534:   PetscMalloc(2*(slen+1)*sizeof(int),&local_inidx);
2535:   local_inidy = local_inidx + slen;

2537:   count = 0;
2538:   for (i=0; i<nrecvs; i++) {
2539:     values = rvalues + 2*i*nmax;
2540:     for (j=0; j<lens[i]; j++) {
2541:       local_inidx[count]   = values[2*j] - base;
2542:       local_inidy[count++] = values[2*j+1];
2543:     }
2544:   }
2545:   PetscFree(rvalues);
2546:   PetscFree(lens);

2548:   /* wait on sends */
2549:   if (nsends) {
2550:     MPI_Status *send_status;
2551:     PetscMalloc(nsends*sizeof(MPI_Status),&send_status);
2552:     MPI_Waitall(nsends,send_waits,send_status);
2553:     PetscFree(send_status);
2554:   }
2555:   PetscFree(send_waits);
2556:   PetscFree(svalues);

2558:   /*
2559:      should sort local_inidx/local_inidy and remove duplicate pairs here; the disabled block below sketches one way to do it
2560:   */

2562: #if defined(do_it_slow)
2563:   /* sort on the from index */
2564:   PetscSortIntWithArray(slen,local_inidx,local_inidy);
2565:   start = 0;
2566:   while (start < slen) {
2567:     count = start+1;
2568:     last  = local_inidx[start];
2569:     while (count < slen && last == local_inidx[count]) count++;
2570:     if (count > start + 1) { /* found 2 or more same local_inidx[] in a row */
2571:       /* sort on to index */
2572:       PetscSortInt(count-start,local_inidy+start);
2573:     }
2574:     /* remove duplicates; not most efficient way, but probably good enough */
2575:     i = start;
2576:     while (i < count-1) {
2577:       if (local_inidy[i] != local_inidy[i+1]) {
2578:         i++;
2579:       } else { /* found a duplicate */
2580:         duplicate = PETSC_TRUE;
2581:         for (j=i; j<slen-1; j++) {
2582:           local_inidx[j] = local_inidx[j+1];
2583:           local_inidy[j] = local_inidy[j+1];
2584:         }
2585:         slen--;
2586:         count--;
2587:         /* printf("found dup %d %d\n",local_inidx[i],local_inidy[i]);*/
2588:       }
2589:     }
2590:     start = count;
2591:   }
2592: #endif
2593:   if (duplicate) {
2594:     PetscLogInfo(0,"VecScatterCreate_PtoP:Duplicate from/to indices passed in VecScatterCreate(); they are ignored\n");
2595:   }
2596:   VecScatterCreate_StoP(slen,local_inidx,slen,local_inidy,yin,ctx);
2597:   PetscFree(local_inidx);
2598:   return(0);
2599: }