Actual source code: baijov.c

  1: /*$Id: baijov.c,v 1.65 2001/08/06 21:15:42 bsmith Exp $*/

  3: /*
  4:    Routines to compute overlapping regions of a parallel MPI matrix
  5:   and to find submatrices that were shared across processors.
  6: */
 7:  #include "src/mat/impls/baij/mpi/mpibaij.h"
 8:  #include "petscbt.h"

 10: static int MatIncreaseOverlap_MPIBAIJ_Once(Mat,int,IS *);
 11: static int MatIncreaseOverlap_MPIBAIJ_Local(Mat,int,char **,int*,int**);
 12: static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat,int,int **,int**,int*);
 13: EXTERN int MatGetRow_MPIBAIJ(Mat,int,int*,int**,PetscScalar**);
 14: EXTERN int MatRestoreRow_MPIBAIJ(Mat,int,int*,int**,PetscScalar**);

 18: int MatIncreaseOverlap_MPIBAIJ(Mat C,int imax,IS is[],int ov)
 19: {
 20:   Mat_MPIBAIJ  *c = (Mat_MPIBAIJ*)C->data;
 21:   int          i,ierr,N=C->N, bs=c->bs;
 22:   IS           *is_new;

 25:   PetscMalloc(imax*sizeof(IS),&is_new);
 26:   /* Convert the indices into block format */
 27:   ISCompressIndicesGeneral(N,bs,imax,is,is_new);
 28:   if (ov < 0){ SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");}
 29:   for (i=0; i<ov; ++i) {
 30:     MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);
 31:   }
 32:   for (i=0; i<imax; i++) {ISDestroy(is[i]);}
 33:   ISExpandIndicesGeneral(N,bs,imax,is_new,is);
 34:   for (i=0; i<imax; i++) {ISDestroy(is_new[i]);}
 35:   PetscFree(is_new);
 36:   return(0);
 37: }
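
A minimal usage sketch (not part of this source file): the routine above is normally reached through the generic MatIncreaseOverlap() interface, which dispatches here for MPIBAIJ matrices; the variable names are illustrative.

    Mat C;        /* an assembled MPIBAIJ matrix                              */
    IS  sub[1];   /* index set of rows for one subdomain (point indices; the  */
                  /* routine above converts them to block form and back)      */
    int ov = 1;   /* number of overlap levels requested                       */

    MatIncreaseOverlap(C,1,sub,ov);
    /* sub[0] now also contains every index reachable within ov levels of overlap */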

 39: /*
 40:   Sample message format:
 41:   If a processor A wants processor B to process some elements corresponding
 42:   to index sets is[1], is[5]
 43:   mesg [0] = 2   (no of index sets in the mesg)
 44:   -----------  
 45:   mesg [1] = 1 => is[1]
 46:   mesg [2] = sizeof(is[1]);
 47:   -----------  
 48:   mesg [3] = 5  => is[5]
 49:   mesg [4] = sizeof(is[5]);
 50:   -----------
 51:   mesg [5]
 52:   mesg [n]  data(is[1])
 53:   -----------  
 54:   mesg[n+1]
 55:   mesg[m]  data(is[5])
 56:   -----------  
 57:   
 58:   Notes:
 59:   nrqs - no of requests sent (or to be sent out)
 60:   nrqr - no of requests received (which have to be or which have been processed)
 61: */
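
A short sketch (illustrative only) of how a receiver can walk a buffer laid out as above; mesg is the received array and process_row() is a hypothetical placeholder for whatever is done with each incoming row index.

    int nis,pos,j,k,is_no,count;
    nis = mesg[0];                 /* number of index sets in this message */
    pos = 2*nis + 1;               /* data starts right after the header   */
    for (j=1; j<=nis; j++) {
      is_no = mesg[2*j-1];         /* which index set these rows belong to */
      count = mesg[2*j];           /* how many row indices follow for it   */
      for (k=0; k<count; k++,pos++) process_row(is_no,mesg[pos]);
    }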
 64: static int MatIncreaseOverlap_MPIBAIJ_Once(Mat C,int imax,IS is[])
 65: {
 66:   Mat_MPIBAIJ  *c = (Mat_MPIBAIJ*)C->data;
 67:   int         **idx,*n,*w1,*w2,*w3,*w4,*rtable,**data,len,*idx_i;
 68:   int         size,rank,Mbs,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
 69:   int         *ctr,*pa,*tmp,nrqr,*isz,*isz1,**xdata,**rbuf2;
 70:   int         *onodes1,*olengths1,tag1,tag2,*onodes2,*olengths2;
 71:   PetscBT     *table;
 72:   MPI_Comm    comm;
 73:   MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
 74:   MPI_Status  *s_status,*recv_status;

 77:   comm   = C->comm;
 78:   size   = c->size;
 79:   rank   = c->rank;
 80:   Mbs    = c->Mbs;

 82:   PetscObjectGetNewTag((PetscObject)C,&tag1);
 83:   PetscObjectGetNewTag((PetscObject)C,&tag2);

 85:   len    = (imax+1)*sizeof(int*)+ (imax + Mbs)*sizeof(int);
 86:   PetscMalloc(len,&idx);
 87:   n      = (int*)(idx + imax);
 88:   rtable = n + imax;
 89: 
 90:   for (i=0; i<imax; i++) {
 91:     ISGetIndices(is[i],&idx[i]);
 92:     ISGetLocalSize(is[i],&n[i]);
 93:   }
 94: 
 95:   /* Create hash table for the mapping: row -> proc */
 96:   for (i=0,j=0; i<size; i++) {
 97:     len = c->rowners[i+1];
 98:     for (; j<len; j++) {
 99:       rtable[j] = i;
100:     }
101:   }

103:   /* evaluate communication - mesg to whom, length of mesg, and buffer space
104:      required. Based on this, buffers are allocated, and data copied into them*/
105:   PetscMalloc(size*4*sizeof(int),&w1);/*  mesg size */
106:   w2   = w1 + size;       /* if w2[i] marked, then a message to proc i*/
107:   w3   = w2 + size;       /* no of IS that need to be sent to proc i */
108:   w4   = w3 + size;       /* temp work space used in determining w1, w2, w3 */
109:   PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/
110:   for (i=0; i<imax; i++) {
111:     PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/
112:     idx_i = idx[i];
113:     len   = n[i];
114:     for (j=0; j<len; j++) {
115:       row  = idx_i[j];
116:       if (row < 0) {
117:         SETERRQ(1,"Index set cannot have negative entries");
118:       }
119:       proc = rtable[row];
120:       w4[proc]++;
121:     }
122:     for (j=0; j<size; j++){
123:       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
124:     }
125:   }

127:   nrqs     = 0;              /* no of outgoing messages */
128:   msz      = 0;              /* total mesg length (for all procs) */
129:   w1[rank] = 0;              /* no mesg sent to itself */
130:   w3[rank] = 0;
131:   for (i=0; i<size; i++) {
132:     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
133:   }
134:   /* pa - is list of processors to communicate with */
135:   PetscMalloc((nrqs+1)*sizeof(int),&pa);
136:   for (i=0,j=0; i<size; i++) {
137:     if (w1[i]) {pa[j] = i; j++;}
138:   }

140:   /* Each message would have a header = 1 + 2*(no of IS) + data */
141:   for (i=0; i<nrqs; i++) {
142:     j      = pa[i];
143:     w1[j] += w2[j] + 2*w3[j];
144:     msz   += w1[j];
145:   }
146: 
147:   /* Determine the number of messages to expect, their lengths, and the from-ids */
148:   PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
149:   PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);

151:   /* Now post the Irecvs corresponding to these messages */
152:   PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
153: 
154:   /* Allocate Memory for outgoing messages */
155:   len    = 2*size*sizeof(int*) + (size+msz)*sizeof(int);
156:   PetscMalloc(len,&outdat);
157:   ptr    = outdat + size;     /* Pointers to the data in outgoing buffers */
158:   PetscMemzero(outdat,2*size*sizeof(int*));
159:   tmp    = (int*)(outdat + 2*size);
160:   ctr    = tmp + msz;

162:   {
163:     int *iptr = tmp,ict  = 0;
164:     for (i=0; i<nrqs; i++) {
165:       j         = pa[i];
166:       iptr     +=  ict;
167:       outdat[j] = iptr;
168:       ict       = w1[j];
169:     }
170:   }

172:   /* Form the outgoing messages */
173:   /*plug in the headers*/
174:   for (i=0; i<nrqs; i++) {
175:     j            = pa[i];
176:     outdat[j][0] = 0;
177:     PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int));
178:     ptr[j]       = outdat[j] + 2*w3[j] + 1;
179:   }
180: 
181:   /* Memory for doing local proc's work*/
182:   {
183:     int  *d_p;
184:     char *t_p;

186:     len  = (imax)*(sizeof(PetscBT) + sizeof(int*)+ sizeof(int)) +
187:       (Mbs)*imax*sizeof(int)  + (Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1;
188:     PetscMalloc(len,&table);
189:     PetscMemzero(table,len);
190:     data = (int **)(table + imax);
191:     isz  = (int  *)(data  + imax);
192:     d_p  = (int  *)(isz   + imax);
193:     t_p  = (char *)(d_p   + Mbs*imax);
194:     for (i=0; i<imax; i++) {
195:       table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
196:       data[i]  = d_p + (Mbs)*i;
197:     }
198:   }

200:   /* Parse the IS and update local tables and the outgoing buf with the data*/
201:   {
202:     int     n_i,*data_i,isz_i,*outdat_j,ctr_j;
203:     PetscBT table_i;

205:     for (i=0; i<imax; i++) {
206:       PetscMemzero(ctr,size*sizeof(int));
207:       n_i     = n[i];
208:       table_i = table[i];
209:       idx_i   = idx[i];
210:       data_i  = data[i];
211:       isz_i   = isz[i];
212:       for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
213:         row  = idx_i[j];
214:         proc = rtable[row];
215:         if (proc != rank) { /* copy to the outgoing buffer */
216:           ctr[proc]++;
217:           *ptr[proc] = row;
218:           ptr[proc]++;
219:         }
220:         else { /* Update the local table */
221:           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
222:         }
223:       }
224:       /* Update the headers for the current IS */
225:       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
226:         if ((ctr_j = ctr[j])) {
227:           outdat_j        = outdat[j];
228:           k               = ++outdat_j[0];
229:           outdat_j[2*k]   = ctr_j;
230:           outdat_j[2*k-1] = i;
231:         }
232:       }
233:       isz[i] = isz_i;
234:     }
235:   }
236: 
237:   /*  Now  post the sends */
238:   PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
239:   for (i=0; i<nrqs; ++i) {
240:     j    = pa[i];
241:     MPI_Isend(outdat[j],w1[j],MPI_INT,j,tag1,comm,s_waits1+i);
242:   }
243: 
244:   /* No longer need the original indices*/
245:   for (i=0; i<imax; ++i) {
246:     ISRestoreIndices(is[i],idx+i);
247:   }
248:   PetscFree(idx);

250:   for (i=0; i<imax; ++i) {
251:     ISDestroy(is[i]);
252:   }
253: 
254:   /* Do Local work*/
255:   MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);

257:   /* Receive messages*/
258:   PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);
259:   MPI_Waitall(nrqr,r_waits1,recv_status);
260: 
261:   PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);
262:   MPI_Waitall(nrqs,s_waits1,s_status);

264:   /* Phase 1 sends are complete - deallocate buffers */
265:   PetscFree(outdat);
266:   PetscFree(w1);

268:   PetscMalloc((nrqr+1)*sizeof(int *),&xdata);
269:   PetscMalloc((nrqr+1)*sizeof(int),&isz1);
270:   MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
271:   PetscFree(rbuf);

273:   /* Send the data back*/
274:   /* Do a global reduction to know the buffer space req for incoming messages*/
275:   {
276:     int *rw1;
277: 
278:     PetscMalloc(size*sizeof(int),&rw1);
279:     PetscMemzero(rw1,size*sizeof(int));

281:     for (i=0; i<nrqr; ++i) {
282:       proc      = recv_status[i].MPI_SOURCE;
283:       if (proc != onodes1[i]) SETERRQ(1,"MPI_SOURCE mismatch");
284:       rw1[proc] = isz1[i];
285:     }
286: 
287:     PetscFree(onodes1);
288:     PetscFree(olengths1);

 290:     /* Determine the number of messages to expect, their lengths, and the from-ids */
291:     PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
292:     PetscFree(rw1);
293:   }
294:   /* Now post the Irecvs corresponding to these messages */
295:   PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
296: 
297:   /*  Now  post the sends */
298:   PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
299:   for (i=0; i<nrqr; ++i) {
300:     j    = recv_status[i].MPI_SOURCE;
301:     MPI_Isend(xdata[i],isz1[i],MPI_INT,j,tag2,comm,s_waits2+i);
302:   }

304:   /* receive work done on other processors*/
305:   {
306:     int         idex,is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
307:     PetscBT     table_i;
308:     MPI_Status  *status2;
309: 
310:     PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);

312:     for (i=0; i<nrqs; ++i) {
313:       MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
314:       /* Process the message*/
315:       rbuf2_i = rbuf2[idex];
316:       ct1     = 2*rbuf2_i[0]+1;
317:       jmax    = rbuf2[idex][0];
318:       for (j=1; j<=jmax; j++) {
319:         max     = rbuf2_i[2*j];
320:         is_no   = rbuf2_i[2*j-1];
321:         isz_i   = isz[is_no];
322:         data_i  = data[is_no];
323:         table_i = table[is_no];
324:         for (k=0; k<max; k++,ct1++) {
325:           row = rbuf2_i[ct1];
326:           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
327:         }
328:         isz[is_no] = isz_i;
329:       }
330:     }
331:     MPI_Waitall(nrqr,s_waits2,status2);
332:     PetscFree(status2);
333:   }
334: 
335:   for (i=0; i<imax; ++i) {
336:     ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);
337:   }
338: 
339: 
340:   PetscFree(onodes2);
341:   PetscFree(olengths2);

343:   PetscFree(pa);
344:   PetscFree(rbuf2);
345:   PetscFree(s_waits1);
346:   PetscFree(r_waits1);
347:   PetscFree(s_waits2);
348:   PetscFree(r_waits2);
349:   PetscFree(table);
350:   PetscFree(s_status);
351:   PetscFree(recv_status);
352:   PetscFree(xdata[0]);
353:   PetscFree(xdata);
354:   PetscFree(isz1);
355:   return(0);
356: }
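
The communication setup above follows a pattern that recurs in this file: count how many messages will arrive, exchange their lengths and sources, pre-post the receives, and only then issue the sends. A condensed sketch, assuming (as above) that w1[] holds per-rank message lengths, w2[] flags the ranks that will be sent to, nrqs counts them, and tag is an MPI tag from PetscObjectGetNewTag():

    int         nrqr,*onodes,*olengths,**rbuf;
    MPI_Request *r_waits;

    PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);                  /* how many messages will arrive */
    PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes,&olengths); /* their senders and lengths     */
    PetscPostIrecvInt(comm,tag,nrqr,onodes,olengths,&rbuf,&r_waits);/* pre-post the matching Irecvs  */
    /* ... pack the outgoing buffers, MPI_Isend() to each flagged rank, then MPI_Waitall() ... */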

360: /*  
 361:    MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap, to do
362:        the work on the local processor.

364:      Inputs:
365:       C      - MAT_MPIBAIJ;
366:       imax - total no of index sets processed at a time;
367:       table  - an array of char - size = Mbs bits.
368:       
369:      Output:
370:       isz    - array containing the count of the solution elements corresponding
371:                to each index set;
372:       data   - pointer to the solutions
373: */
374: static int MatIncreaseOverlap_MPIBAIJ_Local(Mat C,int imax,PetscBT *table,int *isz,int **data)
375: {
376:   Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
377:   Mat         A = c->A,B = c->B;
378:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
379:   int         start,end,val,max,rstart,cstart,*ai,*aj;
380:   int         *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
381:   PetscBT     table_i;

384:   rstart = c->rstart;
385:   cstart = c->cstart;
386:   ai     = a->i;
387:   aj     = a->j;
388:   bi     = b->i;
389:   bj     = b->j;
390:   garray = c->garray;

392: 
393:   for (i=0; i<imax; i++) {
394:     data_i  = data[i];
395:     table_i = table[i];
396:     isz_i   = isz[i];
397:     for (j=0,max=isz[i]; j<max; j++) {
398:       row   = data_i[j] - rstart;
399:       start = ai[row];
400:       end   = ai[row+1];
401:       for (k=start; k<end; k++) { /* Amat */
402:         val = aj[k] + cstart;
403:         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
404:       }
405:       start = bi[row];
406:       end   = bi[row+1];
407:       for (k=start; k<end; k++) { /* Bmat */
408:         val = garray[bj[k]];
409:         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
410:       }
411:     }
412:     isz[i] = isz_i;
413:   }
414:   return(0);
415: }
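
The duplicate filtering above relies on the PetscBT bit-table idiom: PetscBTLookupSet() sets a bit and returns its previous value, so a zero return means the index is being seen for the first time. A minimal self-contained sketch with illustrative sizes:

    PetscBT seen;
    int     i,nvals = 0,vals[8],candidates[5] = {3,7,3,1,7};

    PetscBTCreate(8,seen);                         /* one bit per possible index      */
    for (i=0; i<5; i++) {
      if (!PetscBTLookupSet(seen,candidates[i])) { /* set the bit, test its old value */
        vals[nvals++] = candidates[i];             /* keep each index only once       */
      }
    }
    PetscBTDestroy(seen);                          /* vals = {3,7,1}, nvals = 3       */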
418: /*     
 419:       MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
420:          and return the output

422:          Input:
423:            C    - the matrix
424:            nrqr - no of messages being processed.
 425:            rbuf - an array of pointers to the received requests
426:            
427:          Output:
428:            xdata - array of messages to be sent back
429:            isz1  - size of each message

 431:   For better efficiency perhaps we should malloc each xdata[i] separately;
 432: then if a realloc is required we need only copy the data for that one row
 433: rather than all previous rows, as happens now where a single large chunk of
 434: memory is used.

436: */
437: static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,int nrqr,int **rbuf,int **xdata,int * isz1)
438: {
439:   Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
440:   Mat         A = c->A,B = c->B;
441:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
442:   int         rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
443:   int         row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
444:   int         val,max1,max2,rank,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
445:   int         *rbuf_i,kmax,rbuf_0,ierr;
446:   PetscBT     xtable;

449:   rank   = c->rank;
450:   Mbs    = c->Mbs;
451:   rstart = c->rstart;
452:   cstart = c->cstart;
453:   ai     = a->i;
454:   aj     = a->j;
455:   bi     = b->i;
456:   bj     = b->j;
457:   garray = c->garray;
458: 
459: 
460:   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
461:     rbuf_i  =  rbuf[i];
462:     rbuf_0  =  rbuf_i[0];
463:     ct     += rbuf_0;
464:     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
465:   }
466: 
467:   if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
468:   else        max1 = 1;
469:   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
470:   PetscMalloc(mem_estimate*sizeof(int),&xdata[0]);
471:   ++no_malloc;
472:   PetscBTCreate(Mbs,xtable);
473:   PetscMemzero(isz1,nrqr*sizeof(int));
474: 
475:   ct3 = 0;
 476:   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
477:     rbuf_i =  rbuf[i];
478:     rbuf_0 =  rbuf_i[0];
479:     ct1    =  2*rbuf_0+1;
480:     ct2    =  ct1;
481:     ct3    += ct1;
482:     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
483:       PetscBTMemzero(Mbs,xtable);
484:       oct2 = ct2;
485:       kmax = rbuf_i[2*j];
486:       for (k=0; k<kmax; k++,ct1++) {
487:         row = rbuf_i[ct1];
488:         if (!PetscBTLookupSet(xtable,row)) {
489:           if (!(ct3 < mem_estimate)) {
490:             new_estimate = (int)(1.5*mem_estimate)+1;
491:             PetscMalloc(new_estimate * sizeof(int),&tmp);
492:             PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
493:             PetscFree(xdata[0]);
494:             xdata[0]     = tmp;
495:             mem_estimate = new_estimate; ++no_malloc;
496:             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
497:           }
498:           xdata[i][ct2++] = row;
499:           ct3++;
500:         }
501:       }
502:       for (k=oct2,max2=ct2; k<max2; k++)  {
503:         row   = xdata[i][k] - rstart;
504:         start = ai[row];
505:         end   = ai[row+1];
506:         for (l=start; l<end; l++) {
507:           val = aj[l] + cstart;
508:           if (!PetscBTLookupSet(xtable,val)) {
509:             if (!(ct3 < mem_estimate)) {
510:               new_estimate = (int)(1.5*mem_estimate)+1;
511:               PetscMalloc(new_estimate * sizeof(int),&tmp);
512:               PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
513:               PetscFree(xdata[0]);
514:               xdata[0]     = tmp;
515:               mem_estimate = new_estimate; ++no_malloc;
516:               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
517:             }
518:             xdata[i][ct2++] = val;
519:             ct3++;
520:           }
521:         }
522:         start = bi[row];
523:         end   = bi[row+1];
524:         for (l=start; l<end; l++) {
525:           val = garray[bj[l]];
526:           if (!PetscBTLookupSet(xtable,val)) {
527:             if (!(ct3 < mem_estimate)) {
528:               new_estimate = (int)(1.5*mem_estimate)+1;
529:               PetscMalloc(new_estimate * sizeof(int),&tmp);
530:               PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
531:               PetscFree(xdata[0]);
532:               xdata[0]     = tmp;
533:               mem_estimate = new_estimate; ++no_malloc;
534:               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
535:             }
536:             xdata[i][ct2++] = val;
537:             ct3++;
538:           }
539:         }
540:       }
541:       /* Update the header*/
542:       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
543:       xdata[i][2*j-1] = rbuf_i[2*j-1];
544:     }
545:     xdata[i][0] = rbuf_0;
546:     xdata[i+1]  = xdata[i] + ct2;
547:     isz1[i]     = ct2; /* size of each message */
548:   }
549:   PetscBTDestroy(xtable);
550:   PetscLogInfo(0,"MatIncreaseOverlap_MPIBAIJ:[%d] Allocated %d bytes, required %d, no of mallocs = %d\n",rank,mem_estimate,ct3,no_malloc);
551:   return(0);
552: }
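
All replies built by the routine above live in one contiguous allocation anchored at xdata[0]; the other xdata[i] are offsets into it. Whenever the running count ct3 reaches the estimate, the block is grown by 50% and the pointers of the messages assembled so far are rebased from isz1[]. The pattern, isolated for clarity with the same variable names as above:

    if (ct3 >= mem_estimate) {                    /* about to overflow the shared block */
      new_estimate = (int)(1.5*mem_estimate) + 1;
      PetscMalloc(new_estimate*sizeof(int),&tmp);
      PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
      PetscFree(xdata[0]);
      xdata[0]     = tmp;                         /* rebase the anchor ...              */
      mem_estimate = new_estimate;
      for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];  /* ... and the completed messages */
    }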

554: static int MatGetSubMatrices_MPIBAIJ_local(Mat,int,const IS[],const IS[],MatReuse,Mat *);

558: int MatGetSubMatrices_MPIBAIJ(Mat C,int ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
559: {
560:   IS          *isrow_new,*iscol_new;
561:   Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
562:   int         nmax,nstages_local,nstages,i,pos,max_no,ierr,N=C->N,bs=c->bs;

 565:   /* The compression and expansion should be avoided. It does not flag
 566:      errors and might change the indices, hence it is buggy */

568:   PetscMalloc(2*(ismax+1)*sizeof(IS),&isrow_new);
569:   iscol_new = isrow_new + ismax;
570:   ISCompressIndicesSorted(N,bs,ismax,isrow,isrow_new);
571:   ISCompressIndicesSorted(N,bs,ismax,iscol,iscol_new);

573:   /* Allocate memory to hold all the submatrices */
574:   if (scall != MAT_REUSE_MATRIX) {
575:     PetscMalloc((ismax+1)*sizeof(Mat),submat);
576:   }
577:   /* Determine the number of stages through which submatrices are done */
578:   nmax          = 20*1000000 / (c->Nbs * sizeof(int));
579:   if (!nmax) nmax = 1;
580:   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
581: 
582:   /* Make sure every processor loops through the nstages */
583:   MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm);
584:   for (i=0,pos=0; i<nstages; i++) {
585:     if (pos+nmax <= ismax) max_no = nmax;
586:     else if (pos == ismax) max_no = 0;
587:     else                   max_no = ismax-pos;
588:     MatGetSubMatrices_MPIBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,*submat+pos);
589:     pos += max_no;
590:   }
591: 
592:   for (i=0; i<ismax; i++) {
593:     ISDestroy(isrow_new[i]);
594:     ISDestroy(iscol_new[i]);
595:   }
596:   PetscFree(isrow_new);
597:   return(0);
598: }
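
An illustrative call sequence (not from this file) for the generic interface that dispatches to the routine above; the row and column index sets must be sorted, which the local routine below verifies.

    Mat C,*submat;
    IS  isrow[1],iscol[1];   /* sorted row/column index sets for one submatrix */

    MatGetSubMatrices(C,1,isrow,iscol,MAT_INITIAL_MATRIX,&submat);
    /* ... use submat[0], a sequential BAIJ matrix local to this process ... */
    MatGetSubMatrices(C,1,isrow,iscol,MAT_REUSE_MATRIX,&submat);   /* refill with current values */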

600: #if defined (PETSC_USE_CTABLE)
603: int PetscGetProc(const int gid, const int numprocs, const int proc_gnode[], int *proc)
604: {
605:   int nGlobalNd = proc_gnode[numprocs];
606:   int fproc = (int) ((float)gid * (float)numprocs / (float)nGlobalNd + 0.5);
607: 
609:   /* if(fproc < 0) SETERRQ(1,"fproc < 0");*/
610:   if (fproc > numprocs) fproc = numprocs;
611:   while (gid < proc_gnode[fproc] || gid >= proc_gnode[fproc+1]) {
612:     if (gid < proc_gnode[fproc]) fproc--;
613:     else                         fproc++;
614:   }
615:   /* if(fproc<0 || fproc>=numprocs) { SETERRQ(1,"fproc < 0 || fproc >= numprocs"); }*/
616:   *proc = fproc;
617:   return(0);
618: }
619: #endif
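
A worked example of the guess-then-walk search above, using hypothetical data:

    /* numprocs = 4, proc_gnode[] = {0,10,20,30,40}, gid = 27         */
    /* initial guess: fproc = (int)(27*4/40.0 + 0.5) = (int)3.2 = 3   */
    /* 27 < proc_gnode[3] = 30, so the loop decrements fproc to 2     */
    /* 20 <= 27 < 30 ends the loop and the owner returned is *proc = 2 */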

621: /* -------------------------------------------------------------------------*/
624: static int MatGetSubMatrices_MPIBAIJ_local(Mat C,int ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
625: {
626:   Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
627:   Mat         A = c->A;
628:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
629:   int         **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,start,end,size;
630:   int         **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,ierr,**rbuf1,row,proc;
631:   int         nrqs,msz,**ptr,idex,*req_size,*ctr,*pa,*tmp,tcol,nrqr;
632:   int         **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
633:   int         **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
634:   int         len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
635:   int         bs=c->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA,*cworkB;
636:   int         cstart = c->cstart,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
637:   int         *bmap = c->garray,ctmp,rstart=c->rstart,tag0,tag1,tag2,tag3;
638:   MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
639:   MPI_Request *r_waits4,*s_waits3,*s_waits4;
640:   MPI_Status  *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
641:   MPI_Status  *r_status3,*r_status4,*s_status4;
642:   MPI_Comm    comm;
643:   MatScalar   **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB;
644:   MatScalar   *a_a=a->a,*b_a=b->a;
645:   PetscTruth  flag;
646:   int         *onodes1,*olengths1;

648: #if defined (PETSC_USE_CTABLE)
649:   int tt;
650:   PetscTable  *rowmaps,*colmaps,lrow1_grow1,lcol1_gcol1;
651: #else
652:   int         **cmap,*cmap_i,*rtable,*rmap_i,**rmap, Mbs = c->Mbs;
653: #endif

656:   comm   = C->comm;
657:   tag0   = C->tag;
658:   size   = c->size;
659:   rank   = c->rank;
660: 
661:   /* Get some new tags to keep the communication clean */
662:   PetscObjectGetNewTag((PetscObject)C,&tag1);
663:   PetscObjectGetNewTag((PetscObject)C,&tag2);
664:   PetscObjectGetNewTag((PetscObject)C,&tag3);

666:   /* Check if the col indices are sorted */
667:   for (i=0; i<ismax; i++) {
668:     ISSorted(iscol[i],(PetscTruth*)&j);
669:     if (!j) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
670:   }

672:   len    = (2*ismax+1)*(sizeof(int*)+ sizeof(int));
673: #if !defined (PETSC_USE_CTABLE)
674:   len    += (Mbs+1)*sizeof(int);
675: #endif
676:   PetscMalloc(len,&irow);
677:   icol = irow + ismax;
678:   nrow = (int*)(icol + ismax);
679:   ncol = nrow + ismax;
680: #if !defined (PETSC_USE_CTABLE)
681:   rtable = ncol + ismax;
 682:   /* Create hash table for the mapping: row -> proc */
683:   for (i=0,j=0; i<size; i++) {
684:     jmax = c->rowners[i+1];
685:     for (; j<jmax; j++) {
686:       rtable[j] = i;
687:     }
688:   }
689: #endif
690: 
691:   for (i=0; i<ismax; i++) {
692:     ISGetIndices(isrow[i],&irow[i]);
693:     ISGetIndices(iscol[i],&icol[i]);
694:     ISGetLocalSize(isrow[i],&nrow[i]);
695:     ISGetLocalSize(iscol[i],&ncol[i]);
696:   }

 698:   /* evaluate communication - mesg to whom, length of mesg, and buffer space
 699:      required. Based on this, buffers are allocated, and data copied into them*/
700:   PetscMalloc(size*4*sizeof(int),&w1); /* mesg size */
701:   w2   = w1 + size;      /* if w2[i] marked, then a message to proc i*/
 702:   w3   = w2 + size;      /* no of IS that need to be sent to proc i */
703:   w4   = w3 + size;      /* temp work space used in determining w1, w2, w3 */
704:   PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/
705:   for (i=0; i<ismax; i++) {
706:     PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/
707:     jmax   = nrow[i];
708:     irow_i = irow[i];
709:     for (j=0; j<jmax; j++) {
710:       row  = irow_i[j];
711: #if defined (PETSC_USE_CTABLE)
712:       PetscGetProc(row,size,c->rowners,&proc);
713: #else
714:       proc = rtable[row];
715: #endif
716:       w4[proc]++;
717:     }
718:     for (j=0; j<size; j++) {
719:       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
720:     }
721:   }

723:   nrqs     = 0;              /* no of outgoing messages */
 724:   msz      = 0;              /* total mesg length for all procs */
 725:   w1[rank] = 0;              /* no mesg sent to itself */
726:   w3[rank] = 0;
727:   for (i=0; i<size; i++) {
728:     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
729:   }
730:   PetscMalloc((nrqs+1)*sizeof(int),&pa); /*(proc -array)*/
731:   for (i=0,j=0; i<size; i++) {
732:     if (w1[i]) { pa[j] = i; j++; }
733:   }

735:   /* Each message would have a header = 1 + 2*(no of IS) + data */
736:   for (i=0; i<nrqs; i++) {
737:     j     = pa[i];
738:     w1[j] += w2[j] + 2* w3[j];
739:     msz   += w1[j];
740:   }

 742:   /* Determine the number of messages to expect, their lengths, and the from-ids */
743:   PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
744:   PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);

746:   /* Now post the Irecvs corresponding to these messages */
747:   PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
748: 
749:   PetscFree(onodes1);
750:   PetscFree(olengths1);

752:   /* Allocate Memory for outgoing messages */
753:   len  = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int);
754:   PetscMalloc(len,&sbuf1);
755:   ptr  = sbuf1 + size;   /* Pointers to the data in outgoing buffers */
756:   PetscMemzero(sbuf1,2*size*sizeof(int*));
757:   /* allocate memory for outgoing data + buf to receive the first reply */
758:   tmp  = (int*)(ptr + size);
759:   ctr  = tmp + 2*msz;

761:   {
762:     int *iptr = tmp,ict = 0;
763:     for (i=0; i<nrqs; i++) {
764:       j         = pa[i];
765:       iptr     += ict;
766:       sbuf1[j]  = iptr;
767:       ict       = w1[j];
768:     }
769:   }

771:   /* Form the outgoing messages */
772:   /* Initialise the header space */
773:   for (i=0; i<nrqs; i++) {
774:     j           = pa[i];
775:     sbuf1[j][0] = 0;
776:     PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(int));
777:     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
778:   }
779: 
780:   /* Parse the isrow and copy data into outbuf */
781:   for (i=0; i<ismax; i++) {
782:     PetscMemzero(ctr,size*sizeof(int));
783:     irow_i = irow[i];
784:     jmax   = nrow[i];
785:     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
786:       row  = irow_i[j];
787: #if defined (PETSC_USE_CTABLE)
788:       PetscGetProc(row,size,c->rowners,&proc);
789: #else
790:       proc = rtable[row];
791: #endif
792:       if (proc != rank) { /* copy to the outgoing buf*/
793:         ctr[proc]++;
794:         *ptr[proc] = row;
795:         ptr[proc]++;
796:       }
797:     }
798:     /* Update the headers for the current IS */
799:     for (j=0; j<size; j++) { /* Can Optimise this loop too */
800:       if ((ctr_j = ctr[j])) {
801:         sbuf1_j        = sbuf1[j];
802:         k              = ++sbuf1_j[0];
803:         sbuf1_j[2*k]   = ctr_j;
804:         sbuf1_j[2*k-1] = i;
805:       }
806:     }
807:   }

809:   /*  Now  post the sends */
810:   PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
811:   for (i=0; i<nrqs; ++i) {
812:     j = pa[i];
813:     MPI_Isend(sbuf1[j],w1[j],MPI_INT,j,tag0,comm,s_waits1+i);
814:   }

 816:   /* Post Receives to capture the buffer size */
817:   PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);
818:   PetscMalloc((nrqs+1)*sizeof(int *),&rbuf2);
819:   rbuf2[0] = tmp + msz;
820:   for (i=1; i<nrqs; ++i) {
821:     j        = pa[i];
822:     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
823:   }
824:   for (i=0; i<nrqs; ++i) {
825:     j    = pa[i];
826:     MPI_Irecv(rbuf2[i],w1[j],MPI_INT,j,tag1,comm,r_waits2+i);
827:   }

829:   /* Send to other procs the buf size they should allocate */

831:   /* Receive messages*/
832:   PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
833:   PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);
834:   len        = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*);
835:   PetscMalloc(len,&sbuf2);
836:   req_size   = (int*)(sbuf2 + nrqr);
837:   req_source = req_size + nrqr;
838: 
839:   {
840:     Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
841:     int        *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;

843:     for (i=0; i<nrqr; ++i) {
844:       MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);
845:       req_size[idex] = 0;
846:       rbuf1_i         = rbuf1[idex];
847:       start           = 2*rbuf1_i[0] + 1;
848:       MPI_Get_count(r_status1+i,MPI_INT,&end);
849:       PetscMalloc(end*sizeof(int),&sbuf2[idex]);
850:       sbuf2_i         = sbuf2[idex];
851:       for (j=start; j<end; j++) {
852:         id               = rbuf1_i[j] - rstart;
853:         ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
854:         sbuf2_i[j]       = ncols;
855:         req_size[idex] += ncols;
856:       }
857:       req_source[idex] = r_status1[i].MPI_SOURCE;
858:       /* form the header */
859:       sbuf2_i[0]   = req_size[idex];
860:       for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
861:       MPI_Isend(sbuf2_i,end,MPI_INT,req_source[idex],tag1,comm,s_waits2+i);
862:     }
863:   }
864:   PetscFree(r_status1);
865:   PetscFree(r_waits1);

867:   /*  recv buffer sizes */
868:   /* Receive messages*/

870:   PetscMalloc((nrqs+1)*sizeof(int*),&rbuf3);
871:   PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);
872:   PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);
873:   PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);
874:   PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);

876:   for (i=0; i<nrqs; ++i) {
877:     MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);
878:     PetscMalloc(rbuf2[idex][0]*sizeof(int),&rbuf3[idex]);
879:     PetscMalloc(rbuf2[idex][0]*bs2*sizeof(MatScalar),&rbuf4[idex]);
880:     MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPI_INT,
881:                      r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);
882:     MPI_Irecv(rbuf4[idex],rbuf2[idex][0]*bs2,MPIU_MATSCALAR,
883:                      r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);
884:   }
885:   PetscFree(r_status2);
886:   PetscFree(r_waits2);
887: 
888:   /* Wait on sends1 and sends2 */
889:   PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);
890:   PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);

892:   MPI_Waitall(nrqs,s_waits1,s_status1);
893:   MPI_Waitall(nrqr,s_waits2,s_status2);
894:   PetscFree(s_status1);
895:   PetscFree(s_status2);
896:   PetscFree(s_waits1);
897:   PetscFree(s_waits2);

899:   /* Now allocate buffers for a->j, and send them off */
900:   PetscMalloc((nrqr+1)*sizeof(int *),&sbuf_aj);
901:   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
902:   PetscMalloc((j+1)*sizeof(int),&sbuf_aj[0]);
903:   for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
904: 
905:   PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);
906:   {
907:      for (i=0; i<nrqr; i++) {
908:       rbuf1_i   = rbuf1[i];
909:       sbuf_aj_i = sbuf_aj[i];
910:       ct1       = 2*rbuf1_i[0] + 1;
911:       ct2       = 0;
912:       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
913:         kmax = rbuf1[i][2*j];
914:         for (k=0; k<kmax; k++,ct1++) {
915:           row    = rbuf1_i[ct1] - rstart;
916:           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
917:           ncols  = nzA + nzB;
918:           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

920:           /* load the column indices for this row into cols*/
921:           cols  = sbuf_aj_i + ct2;
922:           for (l=0; l<nzB; l++) {
923:             if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[l] = ctmp;
924:             else break;
925:           }
926:           imark = l;
927:           for (l=0; l<nzA; l++)   cols[imark+l] = cstart + cworkA[l];
928:           for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
929:           ct2 += ncols;
930:         }
931:       }
932:       MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i);
933:     }
934:   }
935:   PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);
936:   PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);

938:   /* Allocate buffers for a->a, and send them off */
939:   PetscMalloc((nrqr+1)*sizeof(MatScalar *),&sbuf_aa);
940:   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
941:   PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);
942:   for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
943: 
944:   PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);
945:   {
946:     for (i=0; i<nrqr; i++) {
947:       rbuf1_i   = rbuf1[i];
948:       sbuf_aa_i = sbuf_aa[i];
949:       ct1       = 2*rbuf1_i[0]+1;
950:       ct2       = 0;
951:       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
952:         kmax = rbuf1_i[2*j];
953:         for (k=0; k<kmax; k++,ct1++) {
954:           row    = rbuf1_i[ct1] - rstart;
955:           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
956:           ncols  = nzA + nzB;
957:           cworkA = a_j + a_i[row];     cworkB = b_j + b_i[row];
958:           vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;

960:           /* load the column values for this row into vals*/
961:           vals  = sbuf_aa_i+ct2*bs2;
962:           for (l=0; l<nzB; l++) {
963:             if ((bmap[cworkB[l]]) < cstart) {
964:               PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
965:             }
966:             else break;
967:           }
968:           imark = l;
969:           for (l=0; l<nzA; l++) {
970:             PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));
971:           }
972:           for (l=imark; l<nzB; l++) {
973:             PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
974:           }
975:           ct2 += ncols;
976:         }
977:       }
978:       MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);
979:     }
980:   }
981:   PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);
982:   PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);
983:   PetscFree(rbuf1);

985:   /* Form the matrix */
986:   /* create col map */
987:   {
988:     int *icol_i;
989: #if defined (PETSC_USE_CTABLE)
 990:     /* Create col map*/
991:     PetscMalloc((1+ismax)*sizeof(PetscTable),&colmaps);
992:     for (i=0; i<ismax; i++) {
993:       PetscTableCreate(ncol[i]+1,&colmaps[i]);
994:     }
995: #else
996:     len     = (1+ismax)*sizeof(int*)+ ismax*c->Nbs*sizeof(int);
997:     PetscMalloc(len,&cmap);
998:     cmap[0] = (int *)(cmap + ismax);
999:     PetscMemzero(cmap[0],(1+ismax*c->Nbs)*sizeof(int));
1000:     for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; }
1001: #endif
1002:     for (i=0; i<ismax; i++) {
1003:       jmax   = ncol[i];
1004:       icol_i = icol[i];
1005: #if defined (PETSC_USE_CTABLE)
1006:       lcol1_gcol1 = colmaps[i];
1007:       for (j=0; j<jmax; j++) {
1008:         PetscTableAdd(lcol1_gcol1,icol_i[j]+1,j+1);
1009:       }
1010: #else
1011:       cmap_i = cmap[i];
1012:       for (j=0; j<jmax; j++) {
1013:         cmap_i[icol_i[j]] = j+1;
1014:       }
1015: #endif
1016:     }
1017:   }

1019:   /* Create lens which is required for MatCreate... */
1020:   for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1021:   len     = (1+ismax)*sizeof(int*)+ j*sizeof(int);
1022:   PetscMalloc(len,&lens);
1023:   lens[0] = (int *)(lens + ismax);
1024:   PetscMemzero(lens[0],j*sizeof(int));
1025:   for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1026: 
1027:   /* Update lens from local data */
1028:   for (i=0; i<ismax; i++) {
1029:     jmax   = nrow[i];
1030: #if defined (PETSC_USE_CTABLE)
1031:     lcol1_gcol1 = colmaps[i];
1032: #else
1033:     cmap_i = cmap[i];
1034: #endif
1035:     irow_i = irow[i];
1036:     lens_i = lens[i];
1037:     for (j=0; j<jmax; j++) {
1038:       row  = irow_i[j];
1039: #if defined (PETSC_USE_CTABLE)
1040:       PetscGetProc(row,size,c->rowners,&proc);
1041: #else
1042:       proc = rtable[row];
1043: #endif
1044:       if (proc == rank) {
1045:         /* Get indices from matA and then from matB */
1046:         row    = row - rstart;
1047:         nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1048:         cworkA =  a_j + a_i[row]; cworkB = b_j + b_i[row];
1049: #if defined (PETSC_USE_CTABLE)
1050:         for (k=0; k<nzA; k++) {
1051:           PetscTableFind(lcol1_gcol1,cstart+cworkA[k]+1,&tt);
1052:           if (tt) { lens_i[j]++; }
1053:         }
1054:         for (k=0; k<nzB; k++) {
1055:           PetscTableFind(lcol1_gcol1,bmap[cworkB[k]]+1,&tt);
1056:           if (tt) { lens_i[j]++; }
1057:         }
1058: #else
1059:         for (k=0; k<nzA; k++) {
1060:           if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++; }
1061:         }
1062:         for (k=0; k<nzB; k++) {
1063:           if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++; }
1064:         }
1065: #endif
1066:       }
1067:     }
1068:   }
1069: #if defined (PETSC_USE_CTABLE)
1070:   /* Create row map*/
1071:   PetscMalloc((1+ismax)*sizeof(PetscTable),&rowmaps);
1072:   for (i=0; i<ismax; i++){
1073:     PetscTableCreate(nrow[i]+1,&rowmaps[i]);
1074:   }
1075: #else
1076:   /* Create row map*/
1077:   len     = (1+ismax)*sizeof(int*)+ ismax*Mbs*sizeof(int);
1078:   PetscMalloc(len,&rmap);
1079:   rmap[0] = (int *)(rmap + ismax);
1080:   PetscMemzero(rmap[0],ismax*Mbs*sizeof(int));
1081:   for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + Mbs;}
1082: #endif
1083:   for (i=0; i<ismax; i++) {
1084:     irow_i = irow[i];
1085:     jmax   = nrow[i];
1086: #if defined (PETSC_USE_CTABLE)
1087:     lrow1_grow1 = rowmaps[i];
1088:     for (j=0; j<jmax; j++) {
1089:       PetscTableAdd(lrow1_grow1,irow_i[j]+1,j+1);
1090:     }
1091: #else
1092:     rmap_i = rmap[i];
1093:     for (j=0; j<jmax; j++) {
1094:       rmap_i[irow_i[j]] = j;
1095:     }
1096: #endif
1097:   }

1099:   /* Update lens from offproc data */
1100:   {
1101:     int *rbuf2_i,*rbuf3_i,*sbuf1_i;

1103:     for (tmp2=0; tmp2<nrqs; tmp2++) {
1104:       MPI_Waitany(nrqs,r_waits3,&i,r_status3+tmp2);
1105:       idex   = pa[i];
1106:       sbuf1_i = sbuf1[idex];
1107:       jmax    = sbuf1_i[0];
1108:       ct1     = 2*jmax+1;
1109:       ct2     = 0;
1110:       rbuf2_i = rbuf2[i];
1111:       rbuf3_i = rbuf3[i];
1112:       for (j=1; j<=jmax; j++) {
1113:         is_no   = sbuf1_i[2*j-1];
1114:         max1    = sbuf1_i[2*j];
1115:         lens_i  = lens[is_no];
1116: #if defined (PETSC_USE_CTABLE)
1117:         lcol1_gcol1 = colmaps[is_no];
1118:         lrow1_grow1 = rowmaps[is_no];
1119: #else
1120:         cmap_i  = cmap[is_no];
1121:         rmap_i  = rmap[is_no];
1122: #endif
1123:         for (k=0; k<max1; k++,ct1++) {
1124: #if defined (PETSC_USE_CTABLE)
1125:           PetscTableFind(lrow1_grow1,sbuf1_i[ct1]+1,&row);
1126:           row--;
1127:           if(row < 0) { SETERRQ(1,"row not found in table"); }
1128: #else
1129:           row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1130: #endif
1131:           max2 = rbuf2_i[ct1];
1132:           for (l=0; l<max2; l++,ct2++) {
1133: #if defined (PETSC_USE_CTABLE)
1134:             PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tt);
1135:             if (tt) {
1136:               lens_i[row]++;
1137:             }
1138: #else
1139:             if (cmap_i[rbuf3_i[ct2]]) {
1140:               lens_i[row]++;
1141:             }
1142: #endif
1143:           }
1144:         }
1145:       }
1146:     }
1147:   }
1148:   PetscFree(r_status3);
1149:   PetscFree(r_waits3);
1150:   MPI_Waitall(nrqr,s_waits3,s_status3);
1151:   PetscFree(s_status3);
1152:   PetscFree(s_waits3);

1154:   /* Create the submatrices */
1155:   if (scall == MAT_REUSE_MATRIX) {
1156:     /*
1157:         Assumes new rows are same length as the old rows, hence bug!
1158:     */
1159:     for (i=0; i<ismax; i++) {
1160:       mat = (Mat_SeqBAIJ *)(submats[i]->data);
1161:       if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || mat->bs != bs)) {
1162:         SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1163:       }
1164:       PetscMemcmp(mat->ilen,lens[i],mat->mbs *sizeof(int),&flag);
1165:       if (flag == PETSC_FALSE) {
1166:         SETERRQ(PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros");
1167:       }
1168:       /* Initialize the matrix as if empty */
1169:       PetscMemzero(mat->ilen,mat->mbs*sizeof(int));
1170:       submats[i]->factor = C->factor;
1171:     }
1172:   } else {
1173:     for (i=0; i<ismax; i++) {
1174:       MatCreate(PETSC_COMM_SELF,nrow[i]*bs,ncol[i]*bs,nrow[i]*bs,ncol[i]*bs,submats+i);
1175:       MatSetType(submats[i],A->type_name);
1176:       MatSeqBAIJSetPreallocation(submats[i],a->bs,0,lens[i]);
1177:       MatSeqSBAIJSetPreallocation(submats[i],a->bs,0,lens[i]);
1178:     }
1179:   }

1181:   /* Assemble the matrices */
1182:   /* First assemble the local rows */
1183:   {
1184:     int       ilen_row,*imat_ilen,*imat_j,*imat_i;
1185:     MatScalar *imat_a;
1186: 
1187:     for (i=0; i<ismax; i++) {
1188:       mat       = (Mat_SeqBAIJ*)submats[i]->data;
1189:       imat_ilen = mat->ilen;
1190:       imat_j    = mat->j;
1191:       imat_i    = mat->i;
1192:       imat_a    = mat->a;

1194: #if defined (PETSC_USE_CTABLE)
1195:       lcol1_gcol1 = colmaps[i];
1196:       lrow1_grow1 = rowmaps[i];
1197: #else
1198:       cmap_i    = cmap[i];
1199:       rmap_i    = rmap[i];
1200: #endif
1201:       irow_i    = irow[i];
1202:       jmax      = nrow[i];
1203:       for (j=0; j<jmax; j++) {
1204:         row      = irow_i[j];
1205: #if defined (PETSC_USE_CTABLE)
1206:         PetscGetProc(row,size,c->rowners,&proc);
1207: #else
1208:         proc = rtable[row];
1209: #endif
1210:         if (proc == rank) {
1211:           row      = row - rstart;
1212:           nzA      = a_i[row+1] - a_i[row];
1213:           nzB      = b_i[row+1] - b_i[row];
1214:           cworkA   = a_j + a_i[row];
1215:           cworkB   = b_j + b_i[row];
1216:           vworkA   = a_a + a_i[row]*bs2;
1217:           vworkB   = b_a + b_i[row]*bs2;
1218: #if defined (PETSC_USE_CTABLE)
1219:           PetscTableFind(lrow1_grow1,row+rstart+1,&row);
1220:           row--;
1221:           if (row < 0) { SETERRQ(1,"row not found in table"); }
1222: #else
1223:           row      = rmap_i[row + rstart];
1224: #endif
1225:           mat_i    = imat_i[row];
1226:           mat_a    = imat_a + mat_i*bs2;
1227:           mat_j    = imat_j + mat_i;
1228:           ilen_row = imat_ilen[row];

1230:           /* load the column indices for this row into cols*/
1231:           for (l=0; l<nzB; l++) {
1232:             if ((ctmp = bmap[cworkB[l]]) < cstart) {
1233: #if defined (PETSC_USE_CTABLE)
1234:               PetscTableFind(lcol1_gcol1,ctmp+1,&tcol);
1235:               if (tcol) {
1236: #else
1237:               if ((tcol = cmap_i[ctmp])) {
1238: #endif
1239:                 *mat_j++ = tcol - 1;
1240:                 PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1241:                 mat_a   += bs2;
1242:                 ilen_row++;
1243:               }
1244:             } else break;
1245:           }
1246:           imark = l;
1247:           for (l=0; l<nzA; l++) {
1248: #if defined (PETSC_USE_CTABLE)
1249:             PetscTableFind(lcol1_gcol1,cstart+cworkA[l]+1,&tcol);
1250:             if (tcol) {
1251: #else
1252:             if ((tcol = cmap_i[cstart + cworkA[l]])) {
1253: #endif
1254:               *mat_j++ = tcol - 1;
1255:               PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));
1256:               mat_a   += bs2;
1257:               ilen_row++;
1258:             }
1259:           }
1260:           for (l=imark; l<nzB; l++) {
1261: #if defined (PETSC_USE_CTABLE)
1262:             PetscTableFind(lcol1_gcol1,bmap[cworkB[l]]+1,&tcol);
1263:             if (tcol) {
1264: #else
1265:             if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1266: #endif
1267:               *mat_j++ = tcol - 1;
1268:               PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1269:               mat_a   += bs2;
1270:               ilen_row++;
1271:             }
1272:           }
1273:           imat_ilen[row] = ilen_row;
1274:         }
1275:       }
1276: 
1277:     }
1278:   }

1280:   /*   Now assemble the off proc rows*/
1281:   {
1282:     int       *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1283:     int       *imat_j,*imat_i;
1284:     MatScalar *imat_a,*rbuf4_i;

1286:     for (tmp2=0; tmp2<nrqs; tmp2++) {
1287:       MPI_Waitany(nrqs,r_waits4,&i,r_status4+tmp2);
1288:       idex   = pa[i];
1289:       sbuf1_i = sbuf1[idex];
1290:       jmax    = sbuf1_i[0];
1291:       ct1     = 2*jmax + 1;
1292:       ct2     = 0;
1293:       rbuf2_i = rbuf2[i];
1294:       rbuf3_i = rbuf3[i];
1295:       rbuf4_i = rbuf4[i];
1296:       for (j=1; j<=jmax; j++) {
1297:         is_no     = sbuf1_i[2*j-1];
1298: #if defined (PETSC_USE_CTABLE)
1299:         lrow1_grow1 = rowmaps[is_no];
1300:         lcol1_gcol1 = colmaps[is_no];
1301: #else
1302:         rmap_i    = rmap[is_no];
1303:         cmap_i    = cmap[is_no];
1304: #endif
1305:         mat       = (Mat_SeqBAIJ*)submats[is_no]->data;
1306:         imat_ilen = mat->ilen;
1307:         imat_j    = mat->j;
1308:         imat_i    = mat->i;
1309:         imat_a    = mat->a;
1310:         max1      = sbuf1_i[2*j];
1311:         for (k=0; k<max1; k++,ct1++) {
1312:           row   = sbuf1_i[ct1];
1313: #if defined (PETSC_USE_CTABLE)
1314:           PetscTableFind(lrow1_grow1,row+1,&row);
1315:           row--;
1316:           if(row < 0) { SETERRQ(1,"row not found in table"); }
1317: #else
1318:           row   = rmap_i[row];
1319: #endif
1320:           ilen  = imat_ilen[row];
1321:           mat_i = imat_i[row];
1322:           mat_a = imat_a + mat_i*bs2;
1323:           mat_j = imat_j + mat_i;
1324:           max2 = rbuf2_i[ct1];
1325:           for (l=0; l<max2; l++,ct2++) {
1326: #if defined (PETSC_USE_CTABLE)
1327:             PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tcol);
1328:             if (tcol) {
1329: #else
1330:             if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1331: #endif
1332:               *mat_j++    = tcol - 1;
1333:               /* *mat_a++= rbuf4_i[ct2]; */
1334:               PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));
1335:               mat_a      += bs2;
1336:               ilen++;
1337:             }
1338:           }
1339:           imat_ilen[row] = ilen;
1340:         }
1341:       }
1342:     }
1343:   }
1344:   PetscFree(r_status4);
1345:   PetscFree(r_waits4);
1346:   MPI_Waitall(nrqr,s_waits4,s_status4);
1347:   PetscFree(s_waits4);
1348:   PetscFree(s_status4);

1350:   /* Restore the indices */
1351:   for (i=0; i<ismax; i++) {
1352:     ISRestoreIndices(isrow[i],irow+i);
1353:     ISRestoreIndices(iscol[i],icol+i);
1354:   }

1356:   /* Destroy allocated memory */
1357:   PetscFree(irow);
1358:   PetscFree(w1);
1359:   PetscFree(pa);

1361:   PetscFree(sbuf1);
1362:   PetscFree(rbuf2);
1363:   for (i=0; i<nrqr; ++i) {
1364:     PetscFree(sbuf2[i]);
1365:   }
1366:   for (i=0; i<nrqs; ++i) {
1367:     PetscFree(rbuf3[i]);
1368:     PetscFree(rbuf4[i]);
1369:   }

1371:   PetscFree(sbuf2);
1372:   PetscFree(rbuf3);
1373:   PetscFree(rbuf4);
1374:   PetscFree(sbuf_aj[0]);
1375:   PetscFree(sbuf_aj);
1376:   PetscFree(sbuf_aa[0]);
1377:   PetscFree(sbuf_aa);

1379: #if defined (PETSC_USE_CTABLE)
1380:   for (i=0; i<ismax; i++){
1381:     PetscTableDelete(rowmaps[i]);
1382:     PetscTableDelete(colmaps[i]);
1383:   }
1384:   PetscFree(colmaps);
1385:   PetscFree(rowmaps);
1386: #else
1387:   PetscFree(rmap);
1388:   PetscFree(cmap);
1389: #endif
1390:   PetscFree(lens);

1392:   for (i=0; i<ismax; i++) {
1393:     MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1394:     MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1395:   }

1397:   return(0);
1398: }
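
A closing note on the lookup convention used while assembling the submatrices (non-CTABLE branch): each cmap[i][] maps a global block column to its local column number plus one, so a zero entry means the column is not part of submatrix i and can be tested directly. A sketch of that idiom with illustrative names:

    int gcol,tcol,*cmap_i = cmap[i];      /* one dense map per requested submatrix    */

    for (gcol=0; gcol<c->Nbs; gcol++) {
      if ((tcol = cmap_i[gcol])) {        /* nonzero: global column gcol is selected  */
        /* its 0-based local column index is tcol - 1 */
      }
    }

The PETSC_USE_CTABLE branch realizes the same convention with a hash table (PetscTableAdd/PetscTableFind), avoiding the ismax*Nbs dense maps.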