Actual source code: baijov.c
1: /*$Id: baijov.c,v 1.65 2001/08/06 21:15:42 bsmith Exp $*/
3: /*
4:    Routines to compute overlapping regions of a parallel MPI matrix
5:    and to find submatrices that are shared across processors.
6: */
7: #include "src/mat/impls/baij/mpi/mpibaij.h"
8: #include "petscbt.h"
10: static int MatIncreaseOverlap_MPIBAIJ_Once(Mat,int,IS *);
11: static int MatIncreaseOverlap_MPIBAIJ_Local(Mat,int,char **,int*,int**);
12: static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat,int,int **,int**,int*);
13: EXTERN int MatGetRow_MPIBAIJ(Mat,int,int*,int**,PetscScalar**);
14: EXTERN int MatRestoreRow_MPIBAIJ(Mat,int,int*,int**,PetscScalar**);
18: int MatIncreaseOverlap_MPIBAIJ(Mat C,int imax,IS is[],int ov)
19: {
20: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
21: int i,ierr,N=C->N, bs=c->bs;
22: IS *is_new;
25: PetscMalloc(imax*sizeof(IS),&is_new);
26: /* Convert the indices into block format */
27: ISCompressIndicesGeneral(N,bs,imax,is,is_new);
28: if (ov < 0){ SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");}
29: for (i=0; i<ov; ++i) {
30: MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);
31: }
32: for (i=0; i<imax; i++) {ISDestroy(is[i]);}
33: ISExpandIndicesGeneral(N,bs,imax,is_new,is);
34: for (i=0; i<imax; i++) {ISDestroy(is_new[i]);}
35: PetscFree(is_new);
36: return(0);
37: }
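/*
   A minimal usage sketch (illustration only, not part of the original routines):
   callers reach the routine above through the generic MatIncreaseOverlap()
   interface.  The stride index set over the locally owned rows is an arbitrary
   illustrative choice.
*/
#include "petscmat.h"

int IncreaseOverlapExample(Mat A,int rstart,int nlocal)
{
  IS  is;
  int ierr;

  /* start from the locally owned rows of A (arbitrary choice for this example) */
  ierr = ISCreateStride(PETSC_COMM_SELF,nlocal,rstart,1,&is);CHKERRQ(ierr);
  /* one level of overlap; for an MPIBAIJ matrix this dispatches to
     MatIncreaseOverlap_MPIBAIJ() above */
  ierr = MatIncreaseOverlap(A,1,&is,1);CHKERRQ(ierr);
  /* ... use the enlarged index set, e.g. with MatGetSubMatrices() ... */
  ierr = ISDestroy(is);CHKERRQ(ierr);
  return 0;
}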
39: /*
40:   Sample message format:
41:   If a processor A wants processor B to process some elements corresponding
42:   to index sets is[1], is[5]
43:   mesg [0] = 2   (no of index sets in the mesg)
44:   -----------
45:   mesg [1] = 1 => is[1]
46:   mesg [2] = sizeof(is[1]);
47:   -----------
48:   mesg [3] = 5 => is[5]
49:   mesg [4] = sizeof(is[5]);
50:   -----------
51:   mesg [5]
52:   mesg [n]   data(is[1])
53:   -----------
54:   mesg[n+1]
55:   mesg[m]    data(is[5])
56:   -----------
57: 
58:   Notes:
59:   nrqs - no of requests sent (or to be sent out)
60:   nrqr - no of requests received (which have to be, or have been, processed)
61: */
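/*
   A small stand-alone sketch (illustration only, not part of the original file)
   of walking a message laid out as above: the header occupies the first
   1 + 2*(no of index sets) integers and the row data follows immediately.
*/
/* Returns the total length (header + data) of one message in the format above;
   this equals the w1[j] length that the sender passes to MPI_Isend() below.    */
int OverlapMessageLength(const int *mesg)
{
  int nis = mesg[0];      /* number of index sets packed in this message */
  int ct  = 2*nis + 1;    /* row data begins right after the header      */
  int j;

  for (j=1; j<=nis; j++) {
    /* mesg[2*j-1] names the index set, mesg[2*j] gives its entry count */
    ct += mesg[2*j];
  }
  return ct;
}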
64: static int MatIncreaseOverlap_MPIBAIJ_Once(Mat C,int imax,IS is[])
65: {
66: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
67: int **idx,*n,*w1,*w2,*w3,*w4,*rtable,**data,len,*idx_i;
68: int size,rank,Mbs,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
69: int *ctr,*pa,*tmp,nrqr,*isz,*isz1,**xdata,**rbuf2;
70: int *onodes1,*olengths1,tag1,tag2,*onodes2,*olengths2;
71: PetscBT *table;
72: MPI_Comm comm;
73: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
74: MPI_Status *s_status,*recv_status;
77: comm = C->comm;
78: size = c->size;
79: rank = c->rank;
80: Mbs = c->Mbs;
82: PetscObjectGetNewTag((PetscObject)C,&tag1);
83: PetscObjectGetNewTag((PetscObject)C,&tag2);
85: len = (imax+1)*sizeof(int*)+ (imax + Mbs)*sizeof(int);
86: PetscMalloc(len,&idx);
87: n = (int*)(idx + imax);
88: rtable = n + imax;
89:
90: for (i=0; i<imax; i++) {
91: ISGetIndices(is[i],&idx[i]);
92: ISGetLocalSize(is[i],&n[i]);
93: }
94:
95: /* Create hash table for the mapping :row -> proc*/
96: for (i=0,j=0; i<size; i++) {
97: len = c->rowners[i+1];
98: for (; j<len; j++) {
99: rtable[j] = i;
100: }
101: }
103: /* evaluate communication - mesg to whom, length of mesg, and buffer space
104: required. Based on this, buffers are allocated, and data copied into them*/
105: PetscMalloc(size*4*sizeof(int),&w1);/* mesg size */
106: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
107: w3 = w2 + size; /* no of IS that needs to be sent to proc i */
108: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
109: PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/
110: for (i=0; i<imax; i++) {
111: PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/
112: idx_i = idx[i];
113: len = n[i];
114: for (j=0; j<len; j++) {
115: row = idx_i[j];
116: if (row < 0) {
117: SETERRQ(1,"Index set cannot have negative entries");
118: }
119: proc = rtable[row];
120: w4[proc]++;
121: }
122: for (j=0; j<size; j++){
123: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
124: }
125: }
127: nrqs = 0; /* no of outgoing messages */
128: msz = 0; /* total mesg length (for all procs) */
129: w1[rank] = 0; /* no mesg sent to itself */
130: w3[rank] = 0;
131: for (i=0; i<size; i++) {
132: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
133: }
134: /* pa - is list of processors to communicate with */
135: PetscMalloc((nrqs+1)*sizeof(int),&pa);
136: for (i=0,j=0; i<size; i++) {
137: if (w1[i]) {pa[j] = i; j++;}
138: }
140: /* Each message would have a header = 1 + 2*(no of IS) + data */
141: for (i=0; i<nrqs; i++) {
142: j = pa[i];
143: w1[j] += w2[j] + 2*w3[j];
144: msz += w1[j];
145: }
146:
147: /* Determine the number of messages to expect, their lengths, and their from-ids */
148: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
149: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
151: /* Now post the Irecvs corresponding to these messages */
152: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
153:
154: /* Allocate Memory for outgoing messages */
155: len = 2*size*sizeof(int*) + (size+msz)*sizeof(int);
156: PetscMalloc(len,&outdat);
157: ptr = outdat + size; /* Pointers to the data in outgoing buffers */
158: PetscMemzero(outdat,2*size*sizeof(int*));
159: tmp = (int*)(outdat + 2*size);
160: ctr = tmp + msz;
162: {
163: int *iptr = tmp,ict = 0;
164: for (i=0; i<nrqs; i++) {
165: j = pa[i];
166: iptr += ict;
167: outdat[j] = iptr;
168: ict = w1[j];
169: }
170: }
172: /* Form the outgoing messages */
173: /*plug in the headers*/
174: for (i=0; i<nrqs; i++) {
175: j = pa[i];
176: outdat[j][0] = 0;
177: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int));
178: ptr[j] = outdat[j] + 2*w3[j] + 1;
179: }
180:
181: /* Memory for doing local proc's work*/
182: {
183: int *d_p;
184: char *t_p;
186: len = (imax)*(sizeof(PetscBT) + sizeof(int*)+ sizeof(int)) +
187: (Mbs)*imax*sizeof(int) + (Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1;
188: PetscMalloc(len,&table);
189: PetscMemzero(table,len);
190: data = (int **)(table + imax);
191: isz = (int *)(data + imax);
192: d_p = (int *)(isz + imax);
193: t_p = (char *)(d_p + Mbs*imax);
194: for (i=0; i<imax; i++) {
195: table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
196: data[i] = d_p + (Mbs)*i;
197: }
198: }
200: /* Parse the IS and update local tables and the outgoing buf with the data*/
201: {
202: int n_i,*data_i,isz_i,*outdat_j,ctr_j;
203: PetscBT table_i;
205: for (i=0; i<imax; i++) {
206: PetscMemzero(ctr,size*sizeof(int));
207: n_i = n[i];
208: table_i = table[i];
209: idx_i = idx[i];
210: data_i = data[i];
211: isz_i = isz[i];
212: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
213: row = idx_i[j];
214: proc = rtable[row];
215: if (proc != rank) { /* copy to the outgoing buffer */
216: ctr[proc]++;
217: *ptr[proc] = row;
218: ptr[proc]++;
219: }
220: else { /* Update the local table */
221: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
222: }
223: }
224: /* Update the headers for the current IS */
225: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
226: if ((ctr_j = ctr[j])) {
227: outdat_j = outdat[j];
228: k = ++outdat_j[0];
229: outdat_j[2*k] = ctr_j;
230: outdat_j[2*k-1] = i;
231: }
232: }
233: isz[i] = isz_i;
234: }
235: }
236:
237: /* Now post the sends */
238: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
239: for (i=0; i<nrqs; ++i) {
240: j = pa[i];
241: MPI_Isend(outdat[j],w1[j],MPI_INT,j,tag1,comm,s_waits1+i);
242: }
243:
244: /* No longer need the original indices*/
245: for (i=0; i<imax; ++i) {
246: ISRestoreIndices(is[i],idx+i);
247: }
248: PetscFree(idx);
250: for (i=0; i<imax; ++i) {
251: ISDestroy(is[i]);
252: }
253:
254: /* Do Local work*/
255: MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);
257: /* Receive messages*/
258: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);
259: MPI_Waitall(nrqr,r_waits1,recv_status);
260:
261: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);
262: MPI_Waitall(nrqs,s_waits1,s_status);
264: /* Phase 1 sends are complete - deallocate buffers */
265: PetscFree(outdat);
266: PetscFree(w1);
268: PetscMalloc((nrqr+1)*sizeof(int *),&xdata);
269: PetscMalloc((nrqr+1)*sizeof(int),&isz1);
270: MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
271: PetscFree(rbuf);
273: /* Send the data back*/
274: /* Do a global reduction to know the buffer space req for incoming messages*/
275: {
276: int *rw1;
277:
278: PetscMalloc(size*sizeof(int),&rw1);
279: PetscMemzero(rw1,size*sizeof(int));
281: for (i=0; i<nrqr; ++i) {
282: proc = recv_status[i].MPI_SOURCE;
283: if (proc != onodes1[i]) SETERRQ(1,"MPI_SOURCE mismatch");
284: rw1[proc] = isz1[i];
285: }
286:
287: PetscFree(onodes1);
288: PetscFree(olengths1);
290: /* Determine the number of messages to expect, their lengths, and their from-ids */
291: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
292: PetscFree(rw1);
293: }
294: /* Now post the Irecvs corresponding to these messages */
295: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
296:
297: /* Now post the sends */
298: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
299: for (i=0; i<nrqr; ++i) {
300: j = recv_status[i].MPI_SOURCE;
301: MPI_Isend(xdata[i],isz1[i],MPI_INT,j,tag2,comm,s_waits2+i);
302: }
304: /* receive work done on other processors*/
305: {
306: int idex,is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
307: PetscBT table_i;
308: MPI_Status *status2;
309:
310: PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);
312: for (i=0; i<nrqs; ++i) {
313: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
314: /* Process the message*/
315: rbuf2_i = rbuf2[idex];
316: ct1 = 2*rbuf2_i[0]+1;
317: jmax = rbuf2[idex][0];
318: for (j=1; j<=jmax; j++) {
319: max = rbuf2_i[2*j];
320: is_no = rbuf2_i[2*j-1];
321: isz_i = isz[is_no];
322: data_i = data[is_no];
323: table_i = table[is_no];
324: for (k=0; k<max; k++,ct1++) {
325: row = rbuf2_i[ct1];
326: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
327: }
328: isz[is_no] = isz_i;
329: }
330: }
331: MPI_Waitall(nrqr,s_waits2,status2);
332: PetscFree(status2);
333: }
334:
335: for (i=0; i<imax; ++i) {
336: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);
337: }
338:
339:
340: PetscFree(onodes2);
341: PetscFree(olengths2);
343: PetscFree(pa);
344: PetscFree(rbuf2);
345: PetscFree(s_waits1);
346: PetscFree(r_waits1);
347: PetscFree(s_waits2);
348: PetscFree(r_waits2);
349: PetscFree(table);
350: PetscFree(s_status);
351: PetscFree(recv_status);
352: PetscFree(xdata[0]);
353: PetscFree(xdata);
354: PetscFree(isz1);
355: return(0);
356: }
360: /*
361:    MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap_MPIBAIJ_Once, to do
362:    the work on the local processor.
364:      Inputs:
365:       C      - the matrix (of type MPIBAIJ);
366:       imax   - total no of index sets processed at a time;
367:       table  - an array of PetscBT bit tables, each Mbs bits long.
368: 
369:      Output:
370:       isz    - array containing the count of the solution elements corresponding
371:                to each index set;
372:       data   - pointer to the solutions
373: */
374: static int MatIncreaseOverlap_MPIBAIJ_Local(Mat C,int imax,PetscBT *table,int *isz,int **data)
375: {
376: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
377: Mat A = c->A,B = c->B;
378: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
379: int start,end,val,max,rstart,cstart,*ai,*aj;
380: int *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
381: PetscBT table_i;
384: rstart = c->rstart;
385: cstart = c->cstart;
386: ai = a->i;
387: aj = a->j;
388: bi = b->i;
389: bj = b->j;
390: garray = c->garray;
392:
393: for (i=0; i<imax; i++) {
394: data_i = data[i];
395: table_i = table[i];
396: isz_i = isz[i];
397: for (j=0,max=isz[i]; j<max; j++) {
398: row = data_i[j] - rstart;
399: start = ai[row];
400: end = ai[row+1];
401: for (k=start; k<end; k++) { /* Amat */
402: val = aj[k] + cstart;
403: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
404: }
405: start = bi[row];
406: end = bi[row+1];
407: for (k=start; k<end; k++) { /* Bmat */
408: val = garray[bj[k]];
409: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
410: }
411: }
412: isz[i] = isz_i;
413: }
414: return(0);
415: }
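/*
   The routine above relies on the PetscBT "lookup and set" idiom:
   PetscBTLookupSet() returns the previous value of a bit and then sets it, so
   each block column is appended to data[i] at most once.  A stand-alone sketch
   of that idiom, assuming a bit table created with PetscBTCreate(Mbs,table)
   and pre-seeded with the entries already present in data[] (as is done in
   MatIncreaseOverlap_MPIBAIJ_Once above):
*/
int AppendUniqueExample(PetscBT table,const int *idx,int n,int *data,int *isz)
{
  int i,cnt = *isz;

  for (i=0; i<n; i++) {
    /* PetscBTLookupSet() tests the bit and sets it in one call */
    if (!PetscBTLookupSet(table,idx[i])) data[cnt++] = idx[i];
  }
  *isz = cnt;
  return 0;
}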
418: /*
419:      MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
420:          and return the output
422:          Input:
423:            C    - the matrix
424:            nrqr - no of messages being processed.
425:            rbuf - an array of pointers to the received requests
426: 
427:          Output:
428:            xdata - array of messages to be sent back
429:            isz1  - size of each message
431:    For better efficiency perhaps we should malloc separately each xdata[i],
432: then if a realloc is required we need only copy the data for that one row
433: rather than all previous rows as it is now where a single large chunk of
434: memory is used (see the grow-and-rebase sketch after this routine).
436: */
437: static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,int nrqr,int **rbuf,int **xdata,int * isz1)
438: {
439: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
440: Mat A = c->A,B = c->B;
441: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
442: int rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
443: int row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
444: int val,max1,max2,rank,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
445: int *rbuf_i,kmax,rbuf_0,ierr;
446: PetscBT xtable;
449: rank = c->rank;
450: Mbs = c->Mbs;
451: rstart = c->rstart;
452: cstart = c->cstart;
453: ai = a->i;
454: aj = a->j;
455: bi = b->i;
456: bj = b->j;
457: garray = c->garray;
458:
459:
460: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
461: rbuf_i = rbuf[i];
462: rbuf_0 = rbuf_i[0];
463: ct += rbuf_0;
464: for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
465: }
466:
467: if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
468: else max1 = 1;
469: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
470: PetscMalloc(mem_estimate*sizeof(int),&xdata[0]);
471: ++no_malloc;
472: PetscBTCreate(Mbs,xtable);
473: PetscMemzero(isz1,nrqr*sizeof(int));
474:
475: ct3 = 0;
476: for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
477: rbuf_i = rbuf[i];
478: rbuf_0 = rbuf_i[0];
479: ct1 = 2*rbuf_0+1;
480: ct2 = ct1;
481: ct3 += ct1;
482: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
483: PetscBTMemzero(Mbs,xtable);
484: oct2 = ct2;
485: kmax = rbuf_i[2*j];
486: for (k=0; k<kmax; k++,ct1++) {
487: row = rbuf_i[ct1];
488: if (!PetscBTLookupSet(xtable,row)) {
489: if (!(ct3 < mem_estimate)) {
490: new_estimate = (int)(1.5*mem_estimate)+1;
491: PetscMalloc(new_estimate * sizeof(int),&tmp);
492: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
493: PetscFree(xdata[0]);
494: xdata[0] = tmp;
495: mem_estimate = new_estimate; ++no_malloc;
496: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
497: }
498: xdata[i][ct2++] = row;
499: ct3++;
500: }
501: }
502: for (k=oct2,max2=ct2; k<max2; k++) {
503: row = xdata[i][k] - rstart;
504: start = ai[row];
505: end = ai[row+1];
506: for (l=start; l<end; l++) {
507: val = aj[l] + cstart;
508: if (!PetscBTLookupSet(xtable,val)) {
509: if (!(ct3 < mem_estimate)) {
510: new_estimate = (int)(1.5*mem_estimate)+1;
511: PetscMalloc(new_estimate * sizeof(int),&tmp);
512: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
513: PetscFree(xdata[0]);
514: xdata[0] = tmp;
515: mem_estimate = new_estimate; ++no_malloc;
516: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
517: }
518: xdata[i][ct2++] = val;
519: ct3++;
520: }
521: }
522: start = bi[row];
523: end = bi[row+1];
524: for (l=start; l<end; l++) {
525: val = garray[bj[l]];
526: if (!PetscBTLookupSet(xtable,val)) {
527: if (!(ct3 < mem_estimate)) {
528: new_estimate = (int)(1.5*mem_estimate)+1;
529: PetscMalloc(new_estimate * sizeof(int),&tmp);
530: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
531: PetscFree(xdata[0]);
532: xdata[0] = tmp;
533: mem_estimate = new_estimate; ++no_malloc;
534: for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
535: }
536: xdata[i][ct2++] = val;
537: ct3++;
538: }
539: }
540: }
541: /* Update the header*/
542: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
543: xdata[i][2*j-1] = rbuf_i[2*j-1];
544: }
545: xdata[i][0] = rbuf_0;
546: xdata[i+1] = xdata[i] + ct2;
547: isz1[i] = ct2; /* size of each message */
548: }
549: PetscBTDestroy(xtable);
550: PetscLogInfo(0,"MatIncreaseOverlap_MPIBAIJ:[%d] Allocated %d bytes, required %d, no of mallocs = %d\n",rank,mem_estimate,ct3,no_malloc);
551: return(0);
552: }
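/*
   A sketch (illustration only) of the grow-and-rebase step that appears three
   times in the routine above and is mentioned in its header comment: when the
   single buffer xdata[0] runs out of space it is enlarged by 1.5x, the old
   contents are copied over, and the start pointers of the messages already
   packed are recomputed from the recorded sizes isz1[].
*/
int GrowXdataExample(int **xdata,const int *isz1,int i,int *mem_estimate)
{
  int *tmp,ctr,new_estimate = (int)(1.5*(*mem_estimate)) + 1;

  PetscMalloc(new_estimate*sizeof(int),&tmp);
  PetscMemcpy(tmp,xdata[0],(*mem_estimate)*sizeof(int));
  PetscFree(xdata[0]);
  xdata[0]      = tmp;
  *mem_estimate = new_estimate;
  /* messages 0..i-1 are complete; rebase the start pointers of messages 1..i */
  for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
  return 0;
}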
554: static int MatGetSubMatrices_MPIBAIJ_local(Mat,int,const IS[],const IS[],MatReuse,Mat *);
558: int MatGetSubMatrices_MPIBAIJ(Mat C,int ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
559: {
560: IS *isrow_new,*iscol_new;
561: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
562: int nmax,nstages_local,nstages,i,pos,max_no,ierr,N=C->N,bs=c->bs;
565: /* The compression and expansion should be avoided. It doesn't point
566: out errors and might change the indices, hence it is buggy */
568: PetscMalloc(2*(ismax+1)*sizeof(IS),&isrow_new);
569: iscol_new = isrow_new + ismax;
570: ISCompressIndicesSorted(N,bs,ismax,isrow,isrow_new);
571: ISCompressIndicesSorted(N,bs,ismax,iscol,iscol_new);
573: /* Allocate memory to hold all the submatrices */
574: if (scall != MAT_REUSE_MATRIX) {
575: PetscMalloc((ismax+1)*sizeof(Mat),submat);
576: }
577: /* Determine the number of stages through which submatrices are done */
578: nmax = 20*1000000 / (c->Nbs * sizeof(int));
579: if (!nmax) nmax = 1;
580: nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
581:
582: /* Make sure every processor loops through the nstages */
583: MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm);
584: for (i=0,pos=0; i<nstages; i++) {
585: if (pos+nmax <= ismax) max_no = nmax;
586: else if (pos == ismax) max_no = 0;
587: else max_no = ismax-pos;
588: MatGetSubMatrices_MPIBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,*submat+pos);
589: pos += max_no;
590: }
591:
592: for (i=0; i<ismax; i++) {
593: ISDestroy(isrow_new[i]);
594: ISDestroy(iscol_new[i]);
595: }
596: PetscFree(isrow_new);
597: return(0);
598: }
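/*
   A minimal usage sketch (illustration only, not part of the original routines):
   the routine above is reached through the generic MatGetSubMatrices() interface;
   each process supplies its own sequential row and (sorted) column index sets and
   gets back sequential submatrices.
*/
#include "petscmat.h"

int GetSubMatrixExample(Mat A,IS isrow,IS iscol)
{
  Mat *submat;
  int ierr;

  /* iscol must be sorted - this is checked in MatGetSubMatrices_MPIBAIJ_local() */
  ierr = MatGetSubMatrices(A,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
  /* ... work with the sequential submatrix submat[0] ... */
  ierr = MatDestroyMatrices(1,&submat);CHKERRQ(ierr);
  return 0;
}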
600: #if defined (PETSC_USE_CTABLE)
603: int PetscGetProc(const int gid, const int numprocs, const int proc_gnode[], int *proc)
604: {
605: int nGlobalNd = proc_gnode[numprocs];
606: int fproc = (int) ((float)gid * (float)numprocs / (float)nGlobalNd + 0.5);
607:
609: /* if(fproc < 0) SETERRQ(1,"fproc < 0");*/
610: if (fproc > numprocs) fproc = numprocs;
611: while (gid < proc_gnode[fproc] || gid >= proc_gnode[fproc+1]) {
612: if (gid < proc_gnode[fproc]) fproc--;
613: else fproc++;
614: }
615: /* if(fproc<0 || fproc>=numprocs) { SETERRQ(1,"fproc < 0 || fproc >= numprocs"); }*/
616: *proc = fproc;
617: return(0);
618: }
619: #endif
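/*
   A tiny worked example of PetscGetProc() above, using a hypothetical layout of
   3 processes owning 4, 5 and 3 block rows; proc_gnode[] holds the cumulative
   ownership ranges, just like the c->rowners array passed by the callers in
   this file.
*/
#if defined (PETSC_USE_CTABLE)
int PetscGetProcExample(void)
{
  int proc_gnode[] = {0,4,9,12};  /* proc p owns block rows [proc_gnode[p],proc_gnode[p+1]) */
  int proc;

  PetscGetProc(10,3,proc_gnode,&proc);  /* 9 <= 10 < 12  =>  proc == 2 */
  PetscGetProc(4,3,proc_gnode,&proc);   /* 4 <=  4 <  9  =>  proc == 1 */
  return 0;
}
#endif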
621: /* -------------------------------------------------------------------------*/
624: static int MatGetSubMatrices_MPIBAIJ_local(Mat C,int ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
625: {
626: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
627: Mat A = c->A;
628: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
629: int **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,start,end,size;
630: int **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,ierr,**rbuf1,row,proc;
631: int nrqs,msz,**ptr,idex,*req_size,*ctr,*pa,*tmp,tcol,nrqr;
632: int **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
633: int **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
634: int len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
635: int bs=c->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA,*cworkB;
636: int cstart = c->cstart,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
637: int *bmap = c->garray,ctmp,rstart=c->rstart,tag0,tag1,tag2,tag3;
638: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
639: MPI_Request *r_waits4,*s_waits3,*s_waits4;
640: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
641: MPI_Status *r_status3,*r_status4,*s_status4;
642: MPI_Comm comm;
643: MatScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB;
644: MatScalar *a_a=a->a,*b_a=b->a;
645: PetscTruth flag;
646: int *onodes1,*olengths1;
648: #if defined (PETSC_USE_CTABLE)
649: int tt;
650: PetscTable *rowmaps,*colmaps,lrow1_grow1,lcol1_gcol1;
651: #else
652: int **cmap,*cmap_i,*rtable,*rmap_i,**rmap, Mbs = c->Mbs;
653: #endif
656: comm = C->comm;
657: tag0 = C->tag;
658: size = c->size;
659: rank = c->rank;
660:
661: /* Get some new tags to keep the communication clean */
662: PetscObjectGetNewTag((PetscObject)C,&tag1);
663: PetscObjectGetNewTag((PetscObject)C,&tag2);
664: PetscObjectGetNewTag((PetscObject)C,&tag3);
666: /* Check if the col indices are sorted */
667: for (i=0; i<ismax; i++) {
668: ISSorted(iscol[i],(PetscTruth*)&j);
669: if (!j) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
670: }
672: len = (2*ismax+1)*(sizeof(int*)+ sizeof(int));
673: #if !defined (PETSC_USE_CTABLE)
674: len += (Mbs+1)*sizeof(int);
675: #endif
676: PetscMalloc(len,&irow);
677: icol = irow + ismax;
678: nrow = (int*)(icol + ismax);
679: ncol = nrow + ismax;
680: #if !defined (PETSC_USE_CTABLE)
681: rtable = ncol + ismax;
682: /* Create hash table for the mapping :row -> proc*/
683: for (i=0,j=0; i<size; i++) {
684: jmax = c->rowners[i+1];
685: for (; j<jmax; j++) {
686: rtable[j] = i;
687: }
688: }
689: #endif
690:
691: for (i=0; i<ismax; i++) {
692: ISGetIndices(isrow[i],&irow[i]);
693: ISGetIndices(iscol[i],&icol[i]);
694: ISGetLocalSize(isrow[i],&nrow[i]);
695: ISGetLocalSize(iscol[i],&ncol[i]);
696: }
698: /* evaluate communication - mesg to whom, length of mesg, and buffer space
699: required. Based on this, buffers are allocated, and data copied into them*/
700: PetscMalloc(size*4*sizeof(int),&w1); /* mesg size */
701: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
702: w3 = w2 + size; /* no of IS that needs to be sent to proc i */
703: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
704: PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/
705: for (i=0; i<ismax; i++) {
706: PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/
707: jmax = nrow[i];
708: irow_i = irow[i];
709: for (j=0; j<jmax; j++) {
710: row = irow_i[j];
711: #if defined (PETSC_USE_CTABLE)
712: PetscGetProc(row,size,c->rowners,&proc);
713: #else
714: proc = rtable[row];
715: #endif
716: w4[proc]++;
717: }
718: for (j=0; j<size; j++) {
719: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
720: }
721: }
723: nrqs = 0; /* no of outgoing messages */
724: msz = 0; /* total mesg length (for all procs) */
725: w1[rank] = 0; /* no mesg sent to itself */
726: w3[rank] = 0;
727: for (i=0; i<size; i++) {
728: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
729: }
730: PetscMalloc((nrqs+1)*sizeof(int),&pa); /*(proc -array)*/
731: for (i=0,j=0; i<size; i++) {
732: if (w1[i]) { pa[j] = i; j++; }
733: }
735: /* Each message would have a header = 1 + 2*(no of IS) + data */
736: for (i=0; i<nrqs; i++) {
737: j = pa[i];
738: w1[j] += w2[j] + 2* w3[j];
739: msz += w1[j];
740: }
742: /* Determine the number of messages to expect, their lengths, and their from-ids */
743: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
744: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
746: /* Now post the Irecvs corresponding to these messages */
747: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
748:
749: PetscFree(onodes1);
750: PetscFree(olengths1);
752: /* Allocate Memory for outgoing messages */
753: len = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int);
754: PetscMalloc(len,&sbuf1);
755: ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */
756: PetscMemzero(sbuf1,2*size*sizeof(int*));
757: /* allocate memory for outgoing data + buf to receive the first reply */
758: tmp = (int*)(ptr + size);
759: ctr = tmp + 2*msz;
761: {
762: int *iptr = tmp,ict = 0;
763: for (i=0; i<nrqs; i++) {
764: j = pa[i];
765: iptr += ict;
766: sbuf1[j] = iptr;
767: ict = w1[j];
768: }
769: }
771: /* Form the outgoing messages */
772: /* Initialise the header space */
773: for (i=0; i<nrqs; i++) {
774: j = pa[i];
775: sbuf1[j][0] = 0;
776: PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(int));
777: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
778: }
779:
780: /* Parse the isrow and copy data into outbuf */
781: for (i=0; i<ismax; i++) {
782: PetscMemzero(ctr,size*sizeof(int));
783: irow_i = irow[i];
784: jmax = nrow[i];
785: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
786: row = irow_i[j];
787: #if defined (PETSC_USE_CTABLE)
788: PetscGetProc(row,size,c->rowners,&proc);
789: #else
790: proc = rtable[row];
791: #endif
792: if (proc != rank) { /* copy to the outgoing buf*/
793: ctr[proc]++;
794: *ptr[proc] = row;
795: ptr[proc]++;
796: }
797: }
798: /* Update the headers for the current IS */
799: for (j=0; j<size; j++) { /* Can Optimise this loop too */
800: if ((ctr_j = ctr[j])) {
801: sbuf1_j = sbuf1[j];
802: k = ++sbuf1_j[0];
803: sbuf1_j[2*k] = ctr_j;
804: sbuf1_j[2*k-1] = i;
805: }
806: }
807: }
809: /* Now post the sends */
810: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
811: for (i=0; i<nrqs; ++i) {
812: j = pa[i];
813: MPI_Isend(sbuf1[j],w1[j],MPI_INT,j,tag0,comm,s_waits1+i);
814: }
816: /* Post Receives to capture the buffer size */
817: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);
818: PetscMalloc((nrqs+1)*sizeof(int *),&rbuf2);
819: rbuf2[0] = tmp + msz;
820: for (i=1; i<nrqs; ++i) {
821: j = pa[i];
822: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
823: }
824: for (i=0; i<nrqs; ++i) {
825: j = pa[i];
826: MPI_Irecv(rbuf2[i],w1[j],MPI_INT,j,tag1,comm,r_waits2+i);
827: }
829: /* Send to other procs the buf size they should allocate */
831: /* Receive messages*/
832: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
833: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);
834: len = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*);
835: PetscMalloc(len,&sbuf2);
836: req_size = (int*)(sbuf2 + nrqr);
837: req_source = req_size + nrqr;
838:
839: {
840: Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
841: int *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;
843: for (i=0; i<nrqr; ++i) {
844: MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);
845: req_size[idex] = 0;
846: rbuf1_i = rbuf1[idex];
847: start = 2*rbuf1_i[0] + 1;
848: MPI_Get_count(r_status1+i,MPI_INT,&end);
849: PetscMalloc(end*sizeof(int),&sbuf2[idex]);
850: sbuf2_i = sbuf2[idex];
851: for (j=start; j<end; j++) {
852: id = rbuf1_i[j] - rstart;
853: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
854: sbuf2_i[j] = ncols;
855: req_size[idex] += ncols;
856: }
857: req_source[idex] = r_status1[i].MPI_SOURCE;
858: /* form the header */
859: sbuf2_i[0] = req_size[idex];
860: for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
861: MPI_Isend(sbuf2_i,end,MPI_INT,req_source[idex],tag1,comm,s_waits2+i);
862: }
863: }
864: PetscFree(r_status1);
865: PetscFree(r_waits1);
867: /* recv buffer sizes */
868: /* Receive messages*/
870: PetscMalloc((nrqs+1)*sizeof(int*),&rbuf3);
871: PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);
872: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);
873: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);
874: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);
876: for (i=0; i<nrqs; ++i) {
877: MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);
878: PetscMalloc(rbuf2[idex][0]*sizeof(int),&rbuf3[idex]);
879: PetscMalloc(rbuf2[idex][0]*bs2*sizeof(MatScalar),&rbuf4[idex]);
880: MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPI_INT,
881: r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);
882: MPI_Irecv(rbuf4[idex],rbuf2[idex][0]*bs2,MPIU_MATSCALAR,
883: r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);
884: }
885: PetscFree(r_status2);
886: PetscFree(r_waits2);
887:
888: /* Wait on sends1 and sends2 */
889: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);
890: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);
892: MPI_Waitall(nrqs,s_waits1,s_status1);
893: MPI_Waitall(nrqr,s_waits2,s_status2);
894: PetscFree(s_status1);
895: PetscFree(s_status2);
896: PetscFree(s_waits1);
897: PetscFree(s_waits2);
899: /* Now allocate buffers for a->j, and send them off */
900: PetscMalloc((nrqr+1)*sizeof(int *),&sbuf_aj);
901: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
902: PetscMalloc((j+1)*sizeof(int),&sbuf_aj[0]);
903: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
904:
905: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);
906: {
907: for (i=0; i<nrqr; i++) {
908: rbuf1_i = rbuf1[i];
909: sbuf_aj_i = sbuf_aj[i];
910: ct1 = 2*rbuf1_i[0] + 1;
911: ct2 = 0;
912: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
913: kmax = rbuf1[i][2*j];
914: for (k=0; k<kmax; k++,ct1++) {
915: row = rbuf1_i[ct1] - rstart;
916: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
917: ncols = nzA + nzB;
918: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
920: /* load the column indices for this row into cols*/
921: cols = sbuf_aj_i + ct2;
922: for (l=0; l<nzB; l++) {
923: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
924: else break;
925: }
926: imark = l;
927: for (l=0; l<nzA; l++) cols[imark+l] = cstart + cworkA[l];
928: for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
929: ct2 += ncols;
930: }
931: }
932: MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i);
933: }
934: }
935: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);
936: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);
938: /* Allocate buffers for a->a, and send them off */
939: PetscMalloc((nrqr+1)*sizeof(MatScalar *),&sbuf_aa);
940: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
941: PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);
942: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
943:
944: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);
945: {
946: for (i=0; i<nrqr; i++) {
947: rbuf1_i = rbuf1[i];
948: sbuf_aa_i = sbuf_aa[i];
949: ct1 = 2*rbuf1_i[0]+1;
950: ct2 = 0;
951: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
952: kmax = rbuf1_i[2*j];
953: for (k=0; k<kmax; k++,ct1++) {
954: row = rbuf1_i[ct1] - rstart;
955: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
956: ncols = nzA + nzB;
957: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
958: vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;
960: /* load the column values for this row into vals*/
961: vals = sbuf_aa_i+ct2*bs2;
962: for (l=0; l<nzB; l++) {
963: if ((bmap[cworkB[l]]) < cstart) {
964: PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
965: }
966: else break;
967: }
968: imark = l;
969: for (l=0; l<nzA; l++) {
970: PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));
971: }
972: for (l=imark; l<nzB; l++) {
973: PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
974: }
975: ct2 += ncols;
976: }
977: }
978: MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);
979: }
980: }
981: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);
982: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);
983: PetscFree(rbuf1);
985: /* Form the matrix */
986: /* create col map */
987: {
988: int *icol_i;
989: #if defined (PETSC_USE_CTABLE)
990: /* Create col map*/
991: PetscMalloc((1+ismax)*sizeof(PetscTable),&colmaps);
992: for (i=0; i<ismax; i++) {
993: PetscTableCreate(ncol[i]+1,&colmaps[i]);
994: }
995: #else
996: len = (1+ismax)*sizeof(int*)+ ismax*c->Nbs*sizeof(int);
997: PetscMalloc(len,&cmap);
998: cmap[0] = (int *)(cmap + ismax);
999: PetscMemzero(cmap[0],(1+ismax*c->Nbs)*sizeof(int));
1000: for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; }
1001: #endif
1002: for (i=0; i<ismax; i++) {
1003: jmax = ncol[i];
1004: icol_i = icol[i];
1005: #if defined (PETSC_USE_CTABLE)
1006: lcol1_gcol1 = colmaps[i];
1007: for (j=0; j<jmax; j++) {
1008: PetscTableAdd(lcol1_gcol1,icol_i[j]+1,j+1);
1009: }
1010: #else
1011: cmap_i = cmap[i];
1012: for (j=0; j<jmax; j++) {
1013: cmap_i[icol_i[j]] = j+1;
1014: }
1015: #endif
1016: }
1017: }
1019: /* Create lens which is required for MatCreate... */
1020: for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1021: len = (1+ismax)*sizeof(int*)+ j*sizeof(int);
1022: PetscMalloc(len,&lens);
1023: lens[0] = (int *)(lens + ismax);
1024: PetscMemzero(lens[0],j*sizeof(int));
1025: for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1026:
1027: /* Update lens from local data */
1028: for (i=0; i<ismax; i++) {
1029: jmax = nrow[i];
1030: #if defined (PETSC_USE_CTABLE)
1031: lcol1_gcol1 = colmaps[i];
1032: #else
1033: cmap_i = cmap[i];
1034: #endif
1035: irow_i = irow[i];
1036: lens_i = lens[i];
1037: for (j=0; j<jmax; j++) {
1038: row = irow_i[j];
1039: #if defined (PETSC_USE_CTABLE)
1040: PetscGetProc(row,size,c->rowners,&proc);
1041: #else
1042: proc = rtable[row];
1043: #endif
1044: if (proc == rank) {
1045: /* Get indices from matA and then from matB */
1046: row = row - rstart;
1047: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1048: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1049: #if defined (PETSC_USE_CTABLE)
1050: for (k=0; k<nzA; k++) {
1051: PetscTableFind(lcol1_gcol1,cstart+cworkA[k]+1,&tt);
1052: if (tt) { lens_i[j]++; }
1053: }
1054: for (k=0; k<nzB; k++) {
1055: PetscTableFind(lcol1_gcol1,bmap[cworkB[k]]+1,&tt);
1056: if (tt) { lens_i[j]++; }
1057: }
1058: #else
1059: for (k=0; k<nzA; k++) {
1060: if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++; }
1061: }
1062: for (k=0; k<nzB; k++) {
1063: if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++; }
1064: }
1065: #endif
1066: }
1067: }
1068: }
1069: #if defined (PETSC_USE_CTABLE)
1070: /* Create row map*/
1071: PetscMalloc((1+ismax)*sizeof(PetscTable),&rowmaps);
1072: for (i=0; i<ismax; i++){
1073: PetscTableCreate(nrow[i]+1,&rowmaps[i]);
1074: }
1075: #else
1076: /* Create row map*/
1077: len = (1+ismax)*sizeof(int*)+ ismax*Mbs*sizeof(int);
1078: PetscMalloc(len,&rmap);
1079: rmap[0] = (int *)(rmap + ismax);
1080: PetscMemzero(rmap[0],ismax*Mbs*sizeof(int));
1081: for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + Mbs;}
1082: #endif
1083: for (i=0; i<ismax; i++) {
1084: irow_i = irow[i];
1085: jmax = nrow[i];
1086: #if defined (PETSC_USE_CTABLE)
1087: lrow1_grow1 = rowmaps[i];
1088: for (j=0; j<jmax; j++) {
1089: PetscTableAdd(lrow1_grow1,irow_i[j]+1,j+1);
1090: }
1091: #else
1092: rmap_i = rmap[i];
1093: for (j=0; j<jmax; j++) {
1094: rmap_i[irow_i[j]] = j;
1095: }
1096: #endif
1097: }
1099: /* Update lens from offproc data */
1100: {
1101: int *rbuf2_i,*rbuf3_i,*sbuf1_i;
1103: for (tmp2=0; tmp2<nrqs; tmp2++) {
1104: MPI_Waitany(nrqs,r_waits3,&i,r_status3+tmp2);
1105: idex = pa[i];
1106: sbuf1_i = sbuf1[idex];
1107: jmax = sbuf1_i[0];
1108: ct1 = 2*jmax+1;
1109: ct2 = 0;
1110: rbuf2_i = rbuf2[i];
1111: rbuf3_i = rbuf3[i];
1112: for (j=1; j<=jmax; j++) {
1113: is_no = sbuf1_i[2*j-1];
1114: max1 = sbuf1_i[2*j];
1115: lens_i = lens[is_no];
1116: #if defined (PETSC_USE_CTABLE)
1117: lcol1_gcol1 = colmaps[is_no];
1118: lrow1_grow1 = rowmaps[is_no];
1119: #else
1120: cmap_i = cmap[is_no];
1121: rmap_i = rmap[is_no];
1122: #endif
1123: for (k=0; k<max1; k++,ct1++) {
1124: #if defined (PETSC_USE_CTABLE)
1125: PetscTableFind(lrow1_grow1,sbuf1_i[ct1]+1,&row);
1126: row--;
1127: if(row < 0) { SETERRQ(1,"row not found in table"); }
1128: #else
1129: row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1130: #endif
1131: max2 = rbuf2_i[ct1];
1132: for (l=0; l<max2; l++,ct2++) {
1133: #if defined (PETSC_USE_CTABLE)
1134: PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tt);
1135: if (tt) {
1136: lens_i[row]++;
1137: }
1138: #else
1139: if (cmap_i[rbuf3_i[ct2]]) {
1140: lens_i[row]++;
1141: }
1142: #endif
1143: }
1144: }
1145: }
1146: }
1147: }
1148: PetscFree(r_status3);
1149: PetscFree(r_waits3);
1150: MPI_Waitall(nrqr,s_waits3,s_status3);
1151: PetscFree(s_status3);
1152: PetscFree(s_waits3);
1154: /* Create the submatrices */
1155: if (scall == MAT_REUSE_MATRIX) {
1156: /*
1157: Assumes new rows are the same length as the old rows, hence a bug!
1158: */
1159: for (i=0; i<ismax; i++) {
1160: mat = (Mat_SeqBAIJ *)(submats[i]->data);
1161: if (mat->mbs != nrow[i] || mat->nbs != ncol[i] || mat->bs != bs) {
1162: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1163: }
1164: PetscMemcmp(mat->ilen,lens[i],mat->mbs *sizeof(int),&flag);
1165: if (flag == PETSC_FALSE) {
1166: SETERRQ(PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros");
1167: }
1168: /* Initial matrix as if empty */
1169: PetscMemzero(mat->ilen,mat->mbs*sizeof(int));
1170: submats[i]->factor = C->factor;
1171: }
1172: } else {
1173: for (i=0; i<ismax; i++) {
1174: MatCreate(PETSC_COMM_SELF,nrow[i]*bs,ncol[i]*bs,nrow[i]*bs,ncol[i]*bs,submats+i);
1175: MatSetType(submats[i],A->type_name);
1176: MatSeqBAIJSetPreallocation(submats[i],a->bs,0,lens[i]);
1177: MatSeqSBAIJSetPreallocation(submats[i],a->bs,0,lens[i]);
1178: }
1179: }
1181: /* Assemble the matrices */
1182: /* First assemble the local rows */
1183: {
1184: int ilen_row,*imat_ilen,*imat_j,*imat_i;
1185: MatScalar *imat_a;
1186:
1187: for (i=0; i<ismax; i++) {
1188: mat = (Mat_SeqBAIJ*)submats[i]->data;
1189: imat_ilen = mat->ilen;
1190: imat_j = mat->j;
1191: imat_i = mat->i;
1192: imat_a = mat->a;
1194: #if defined (PETSC_USE_CTABLE)
1195: lcol1_gcol1 = colmaps[i];
1196: lrow1_grow1 = rowmaps[i];
1197: #else
1198: cmap_i = cmap[i];
1199: rmap_i = rmap[i];
1200: #endif
1201: irow_i = irow[i];
1202: jmax = nrow[i];
1203: for (j=0; j<jmax; j++) {
1204: row = irow_i[j];
1205: #if defined (PETSC_USE_CTABLE)
1206: PetscGetProc(row,size,c->rowners,&proc);
1207: #else
1208: proc = rtable[row];
1209: #endif
1210: if (proc == rank) {
1211: row = row - rstart;
1212: nzA = a_i[row+1] - a_i[row];
1213: nzB = b_i[row+1] - b_i[row];
1214: cworkA = a_j + a_i[row];
1215: cworkB = b_j + b_i[row];
1216: vworkA = a_a + a_i[row]*bs2;
1217: vworkB = b_a + b_i[row]*bs2;
1218: #if defined (PETSC_USE_CTABLE)
1219: PetscTableFind(lrow1_grow1,row+rstart+1,&row);
1220: row--;
1221: if (row < 0) { SETERRQ(1,"row not found in table"); }
1222: #else
1223: row = rmap_i[row + rstart];
1224: #endif
1225: mat_i = imat_i[row];
1226: mat_a = imat_a + mat_i*bs2;
1227: mat_j = imat_j + mat_i;
1228: ilen_row = imat_ilen[row];
1230: /* load the column indices for this row into cols*/
1231: for (l=0; l<nzB; l++) {
1232: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1233: #if defined (PETSC_USE_CTABLE)
1234: PetscTableFind(lcol1_gcol1,ctmp+1,&tcol);
1235: if (tcol) {
1236: #else
1237: if ((tcol = cmap_i[ctmp])) {
1238: #endif
1239: *mat_j++ = tcol - 1;
1240: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1241: mat_a += bs2;
1242: ilen_row++;
1243: }
1244: } else break;
1245: }
1246: imark = l;
1247: for (l=0; l<nzA; l++) {
1248: #if defined (PETSC_USE_CTABLE)
1249: PetscTableFind(lcol1_gcol1,cstart+cworkA[l]+1,&tcol);
1250: if (tcol) {
1251: #else
1252: if ((tcol = cmap_i[cstart + cworkA[l]])) {
1253: #endif
1254: *mat_j++ = tcol - 1;
1255: PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));
1256: mat_a += bs2;
1257: ilen_row++;
1258: }
1259: }
1260: for (l=imark; l<nzB; l++) {
1261: #if defined (PETSC_USE_CTABLE)
1262: PetscTableFind(lcol1_gcol1,bmap[cworkB[l]]+1,&tcol);
1263: if (tcol) {
1264: #else
1265: if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1266: #endif
1267: *mat_j++ = tcol - 1;
1268: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1269: mat_a += bs2;
1270: ilen_row++;
1271: }
1272: }
1273: imat_ilen[row] = ilen_row;
1274: }
1275: }
1276:
1277: }
1278: }
1280: /* Now assemble the off proc rows*/
1281: {
1282: int *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1283: int *imat_j,*imat_i;
1284: MatScalar *imat_a,*rbuf4_i;
1286: for (tmp2=0; tmp2<nrqs; tmp2++) {
1287: MPI_Waitany(nrqs,r_waits4,&i,r_status4+tmp2);
1288: idex = pa[i];
1289: sbuf1_i = sbuf1[idex];
1290: jmax = sbuf1_i[0];
1291: ct1 = 2*jmax + 1;
1292: ct2 = 0;
1293: rbuf2_i = rbuf2[i];
1294: rbuf3_i = rbuf3[i];
1295: rbuf4_i = rbuf4[i];
1296: for (j=1; j<=jmax; j++) {
1297: is_no = sbuf1_i[2*j-1];
1298: #if defined (PETSC_USE_CTABLE)
1299: lrow1_grow1 = rowmaps[is_no];
1300: lcol1_gcol1 = colmaps[is_no];
1301: #else
1302: rmap_i = rmap[is_no];
1303: cmap_i = cmap[is_no];
1304: #endif
1305: mat = (Mat_SeqBAIJ*)submats[is_no]->data;
1306: imat_ilen = mat->ilen;
1307: imat_j = mat->j;
1308: imat_i = mat->i;
1309: imat_a = mat->a;
1310: max1 = sbuf1_i[2*j];
1311: for (k=0; k<max1; k++,ct1++) {
1312: row = sbuf1_i[ct1];
1313: #if defined (PETSC_USE_CTABLE)
1314: PetscTableFind(lrow1_grow1,row+1,&row);
1315: row--;
1316: if(row < 0) { SETERRQ(1,"row not found in table"); }
1317: #else
1318: row = rmap_i[row];
1319: #endif
1320: ilen = imat_ilen[row];
1321: mat_i = imat_i[row];
1322: mat_a = imat_a + mat_i*bs2;
1323: mat_j = imat_j + mat_i;
1324: max2 = rbuf2_i[ct1];
1325: for (l=0; l<max2; l++,ct2++) {
1326: #if defined (PETSC_USE_CTABLE)
1327: PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tcol);
1328: if (tcol) {
1329: #else
1330: if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1331: #endif
1332: *mat_j++ = tcol - 1;
1333: /* *mat_a++= rbuf4_i[ct2]; */
1334: PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));
1335: mat_a += bs2;
1336: ilen++;
1337: }
1338: }
1339: imat_ilen[row] = ilen;
1340: }
1341: }
1342: }
1343: }
1344: PetscFree(r_status4);
1345: PetscFree(r_waits4);
1346: MPI_Waitall(nrqr,s_waits4,s_status4);
1347: PetscFree(s_waits4);
1348: PetscFree(s_status4);
1350: /* Restore the indices */
1351: for (i=0; i<ismax; i++) {
1352: ISRestoreIndices(isrow[i],irow+i);
1353: ISRestoreIndices(iscol[i],icol+i);
1354: }
1356: /* Destroy allocated memory */
1357: PetscFree(irow);
1358: PetscFree(w1);
1359: PetscFree(pa);
1361: PetscFree(sbuf1);
1362: PetscFree(rbuf2);
1363: for (i=0; i<nrqr; ++i) {
1364: PetscFree(sbuf2[i]);
1365: }
1366: for (i=0; i<nrqs; ++i) {
1367: PetscFree(rbuf3[i]);
1368: PetscFree(rbuf4[i]);
1369: }
1371: PetscFree(sbuf2);
1372: PetscFree(rbuf3);
1373: PetscFree(rbuf4);
1374: PetscFree(sbuf_aj[0]);
1375: PetscFree(sbuf_aj);
1376: PetscFree(sbuf_aa[0]);
1377: PetscFree(sbuf_aa);
1379: #if defined (PETSC_USE_CTABLE)
1380: for (i=0; i<ismax; i++){
1381: PetscTableDelete(rowmaps[i]);
1382: PetscTableDelete(colmaps[i]);
1383: }
1384: PetscFree(colmaps);
1385: PetscFree(rowmaps);
1386: #else
1387: PetscFree(rmap);
1388: PetscFree(cmap);
1389: #endif
1390: PetscFree(lens);
1392: for (i=0; i<ismax; i++) {
1393: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1394: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1395: }
1397: return(0);
1398: }