Actual source code: aijnode.c

  1: /*$Id: aijnode.c,v 1.128 2001/08/07 03:02:47 balay Exp $*/
  2: /*
  3:   This file provides high performance routines for the AIJ (compressed row)
  4:   format by taking advantage of rows with identical nonzero structure (I-nodes).
  5: */
 6:  #include src/mat/impls/aij/seq/aij.h

  8: EXTERN int Mat_AIJ_CheckInode(Mat,PetscTruth);
  9: EXTERN int MatSolve_SeqAIJ_Inode(Mat,Vec,Vec);
 10: EXTERN int MatLUFactorNumeric_SeqAIJ_Inode(Mat,Mat *);

 12: EXTERN int MatMult_SeqAIJ(Mat,Vec,Vec);
 13: EXTERN int MatMultAdd_SeqAIJ(Mat,Vec,Vec,Vec);
 14: EXTERN int MatSolve_SeqAIJ(Mat,Vec,Vec);
 15: EXTERN int MatLUFactorNumeric_SeqAIJ(Mat,Mat*);
 16: EXTERN int MatGetRowIJ_SeqAIJ(Mat,int,PetscTruth,int*,int*[],int*[],PetscTruth*);
 17: EXTERN int MatRestoreRowIJ_SeqAIJ(Mat,int,PetscTruth,int*,int*[],int*[],PetscTruth*);
 18: EXTERN int MatGetColumnIJ_SeqAIJ(Mat,int,PetscTruth,int*,int*[],int*[],PetscTruth*);
 19: EXTERN int MatRestoreColumnIJ_SeqAIJ(Mat,int,PetscTruth,int*,int*[],int*[],PetscTruth*);

 23: static int Mat_AIJ_CreateColInode(Mat A,int* size,int ** ns)
 24: {
 25:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
 26:   int        ierr,i,count,m,n,min_mn,*ns_row,*ns_col;

 29:   n      = A->n;
 30:   m      = A->m;
 31:   ns_row = a->inode.size;
 32: 
 33:   min_mn = (m < n) ? m : n;
 34:   if (!ns) {
 35:     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
 36:     for(; count+1 < n; count++,i++);
 37:     if (count < n)  {
 38:       i++;
 39:     }
 40:     *size = i;
 41:     return(0);
 42:   }
 43:   PetscMalloc((n+1)*sizeof(int),&ns_col);
 44: 
 45:   /* Use the same row structure wherever feasible. */
 46:   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
 47:     ns_col[i] = ns_row[i];
 48:   }

 50:   /* if m < n; pad up the remainder with inode_limit */
 51:   for(; count+1 < n; count++,i++) {
 52:     ns_col[i] = 1;
 53:   }
 54:   /* The last node is the odd ball. padd it up with the remaining rows; */
 55:   if (count < n)  {
 56:     ns_col[i] = n - count;
 57:     i++;
 58:   } else if (count > n) {
 59:     /* Adjust for the over estimation */
 60:     ns_col[i-1] += n - count;
 61:   }
 62:   *size = i;
 63:   *ns   = ns_col;
 64:   return(0);
 65: }


 68: /*
 69:       This builds symmetric version of nonzero structure,
 70: */
 73: static int MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,int *iia[],int *jja[],int ishift,int oshift)
 74: {
 75:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
 76:   int        *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n,ierr;
 77:   int        *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;

 80:   nslim_row = a->inode.node_count;
 81:   m         = A->m;
 82:   n         = A->n;
 83:   if (m != n) SETERRQ(1,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix shoul be square");
 84: 
 85:   /* Use the row_inode as column_inode */
 86:   nslim_col = nslim_row;
 87:   ns_col    = ns_row;

 89:   /* allocate space for reformated inode structure */
 90:   PetscMalloc((nslim_col+1)*sizeof(int),&tns);
 91:   PetscMalloc((n+1)*sizeof(int),&tvc);
 92:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];

 94:   for (i1=0,col=0; i1<nslim_col; ++i1){
 95:     nsz = ns_col[i1];
 96:     for (i2=0; i2<nsz; ++i2,++col)
 97:       tvc[col] = i1;
 98:   }
 99:   /* allocate space for row pointers */
100:   PetscMalloc((nslim_row+1)*sizeof(int),&ia);
101:   *iia = ia;
102:   PetscMemzero(ia,(nslim_row+1)*sizeof(int));
103:   PetscMalloc((nslim_row+1)*sizeof(int),&work);

105:   /* determine the number of columns in each row */
106:   ia[0] = oshift;
107:   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {

109:     j    = aj + ai[row] + ishift;
110:     jmax = aj + ai[row+1] + ishift;
111:     i2   = 0;
112:     col  = *j++ + ishift;
113:     i2   = tvc[col];
114:     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
115:       ia[i1+1]++;
116:       ia[i2+1]++;
117:       i2++;                     /* Start col of next node */
118:       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
119:       i2 = tvc[col];
120:     }
121:     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
122:   }

124:   /* shift ia[i] to point to next row */
125:   for (i1=1; i1<nslim_row+1; i1++) {
126:     row        = ia[i1-1];
127:     ia[i1]    += row;
128:     work[i1-1] = row - oshift;
129:   }

131:   /* allocate space for column pointers */
132:   nz   = ia[nslim_row] + (!ishift);
133:   PetscMalloc(nz*sizeof(int),&ja);
134:   *jja = ja;

136:  /* loop over lower triangular part putting into ja */
137:   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
138:     j    = aj + ai[row] + ishift;
139:     jmax = aj + ai[row+1] + ishift;
140:     i2   = 0;                     /* Col inode index */
141:     col  = *j++ + ishift;
142:     i2   = tvc[col];
143:     while (i2<i1 && j<jmax) {
144:       ja[work[i2]++] = i1 + oshift;
145:       ja[work[i1]++] = i2 + oshift;
146:       ++i2;
147:       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
148:       i2 = tvc[col];
149:     }
150:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;

152:   }
153:   PetscFree(work);
154:   PetscFree(tns);
155:   PetscFree(tvc);
156:   return(0);
157: }

159: /*
160:       This builds nonsymmetric version of nonzero structure,
161: */
164: static int MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,int *iia[],int *jja[],int ishift,int oshift)
165: {
166:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
167:   int        *work,*ia,*ja,*j,nz,nslim_row,n,row,col,ierr,*ns_col,nslim_col;
168:   int        *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

171:   nslim_row = a->inode.node_count;
172:   n         = A->n;

174:   /* Create The column_inode for this matrix */
175:   Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
176: 
177:   /* allocate space for reformated column_inode structure */
178:   PetscMalloc((nslim_col +1)*sizeof(int),&tns);
179:   PetscMalloc((n +1)*sizeof(int),&tvc);
180:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

182:   for (i1=0,col=0; i1<nslim_col; ++i1){
183:     nsz = ns_col[i1];
184:     for (i2=0; i2<nsz; ++i2,++col)
185:       tvc[col] = i1;
186:   }
187:   /* allocate space for row pointers */
188:   PetscMalloc((nslim_row+1)*sizeof(int),&ia);
189:   *iia = ia;
190:   PetscMemzero(ia,(nslim_row+1)*sizeof(int));
191:   PetscMalloc((nslim_row+1)*sizeof(int),&work);

193:   /* determine the number of columns in each row */
194:   ia[0] = oshift;
195:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
196:     j   = aj + ai[row] + ishift;
197:     col = *j++ + ishift;
198:     i2  = tvc[col];
199:     nz  = ai[row+1] - ai[row];
200:     while (nz-- > 0) {           /* off-diagonal elemets */
201:       ia[i1+1]++;
202:       i2++;                     /* Start col of next node */
203:       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
204:       i2 = tvc[col];
205:     }
206:   }

208:   /* shift ia[i] to point to next row */
209:   for (i1=1; i1<nslim_row+1; i1++) {
210:     row        = ia[i1-1];
211:     ia[i1]    += row;
212:     work[i1-1] = row - oshift;
213:   }

215:   /* allocate space for column pointers */
216:   nz   = ia[nslim_row] + (!ishift);
217:   PetscMalloc(nz*sizeof(int),&ja);
218:   *jja = ja;

220:  /* loop over matrix putting into ja */
221:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
222:     j   = aj + ai[row] + ishift;
223:     i2  = 0;                     /* Col inode index */
224:     col = *j++ + ishift;
225:     i2  = tvc[col];
226:     nz  = ai[row+1] - ai[row];
227:     while (nz-- > 0) {
228:       ja[work[i1]++] = i2 + oshift;
229:       ++i2;
230:       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
231:       i2 = tvc[col];
232:     }
233:   }
234:   PetscFree(ns_col);
235:   PetscFree(work);
236:   PetscFree(tns);
237:   PetscFree(tvc);
238:   return(0);
239: }

243: static int MatGetRowIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int *ia[],int *ja[],PetscTruth *done)
244: {
245:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
246:   int        ierr;

249:   *n     = a->inode.node_count;
250:   if (!ia) return(0);

252:   if (symmetric) {
253:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
254:   } else {
255:     MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
256:   }
257:   return(0);
258: }

262: static int MatRestoreRowIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int *ia[],int *ja[],PetscTruth *done)
263: {

267:   if (!ia) return(0);
268:   PetscFree(*ia);
269:   PetscFree(*ja);
270:   return(0);
271: }

273: /* ----------------------------------------------------------- */

277: static int MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,int *iia[],int *jja[],int ishift,int oshift)
278: {
279:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
280:   int *work,*ia,*ja,*j,nz,nslim_row, n,row,col,ierr,*ns_col,nslim_col;
281:   int *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

284:   nslim_row = a->inode.node_count;
285:   n         = A->n;

287:   /* Create The column_inode for this matrix */
288:   Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
289: 
290:   /* allocate space for reformated column_inode structure */
291:   PetscMalloc((nslim_col + 1)*sizeof(int),&tns);
292:   PetscMalloc((n + 1)*sizeof(int),&tvc);
293:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

295:   for (i1=0,col=0; i1<nslim_col; ++i1){
296:     nsz = ns_col[i1];
297:     for (i2=0; i2<nsz; ++i2,++col)
298:       tvc[col] = i1;
299:   }
300:   /* allocate space for column pointers */
301:   PetscMalloc((nslim_col+1)*sizeof(int),&ia);
302:   *iia = ia;
303:   PetscMemzero(ia,(nslim_col+1)*sizeof(int));
304:   PetscMalloc((nslim_col+1)*sizeof(int),&work);

306:   /* determine the number of columns in each row */
307:   ia[0] = oshift;
308:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
309:     j   = aj + ai[row] + ishift;
310:     col = *j++ + ishift;
311:     i2  = tvc[col];
312:     nz  = ai[row+1] - ai[row];
313:     while (nz-- > 0) {           /* off-diagonal elemets */
314:       /* ia[i1+1]++; */
315:       ia[i2+1]++;
316:       i2++;
317:       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
318:       i2 = tvc[col];
319:     }
320:   }

322:   /* shift ia[i] to point to next col */
323:   for (i1=1; i1<nslim_col+1; i1++) {
324:     col        = ia[i1-1];
325:     ia[i1]    += col;
326:     work[i1-1] = col - oshift;
327:   }

329:   /* allocate space for column pointers */
330:   nz   = ia[nslim_col] + (!ishift);
331:   PetscMalloc(nz*sizeof(int),&ja);
332:   *jja = ja;

334:  /* loop over matrix putting into ja */
335:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
336:     j   = aj + ai[row] + ishift;
337:     i2  = 0;                     /* Col inode index */
338:     col = *j++ + ishift;
339:     i2  = tvc[col];
340:     nz  = ai[row+1] - ai[row];
341:     while (nz-- > 0) {
342:       /* ja[work[i1]++] = i2 + oshift; */
343:       ja[work[i2]++] = i1 + oshift;
344:       i2++;
345:       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
346:       i2 = tvc[col];
347:     }
348:   }
349:   PetscFree(ns_col);
350:   PetscFree(work);
351:   PetscFree(tns);
352:   PetscFree(tvc);
353:   return(0);
354: }

358: static int MatGetColumnIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int *ia[],int *ja[],PetscTruth *done)
359: {

363:   Mat_AIJ_CreateColInode(A,n,PETSC_NULL);
364:   if (!ia) return(0);

366:   if (symmetric) {
367:     /* Since the indices are symmetric it does'nt matter */
368:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
369:   } else {
370:     MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
371:   }
372:   return(0);
373: }

377: static int MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int *ia[],int *ja[],PetscTruth *done)
378: {

382:   if (!ia) return(0);
383:   PetscFree(*ia);
384:   PetscFree(*ja);
385:   return(0);
386: }

388: /* ----------------------------------------------------------- */

392: static int MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
393: {
394:   Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data;
395:   PetscScalar  sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
396:   PetscScalar  *v1,*v2,*v3,*v4,*v5,*x,*y;
397:   int          ierr,*idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
398: 
399: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
400: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
401: #endif

404:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
405:   node_max = a->inode.node_count;
406:   ns       = a->inode.size;     /* Node Size array */
407:   VecGetArray(xx,&x);
408:   VecGetArray(yy,&y);
409:   idx  = a->j;
410:   v1   = a->a;
411:   ii   = a->i;

413:   for (i = 0,row = 0; i< node_max; ++i){
414:     nsz  = ns[i];
415:     n    = ii[1] - ii[0];
416:     ii  += nsz;
417:     sz   = n;                   /* No of non zeros in this row */
418:                                 /* Switch on the size of Node */
419:     switch (nsz){               /* Each loop in 'case' is unrolled */
420:     case 1 :
421:       sum1  = 0;
422: 
423:       for(n = 0; n< sz-1; n+=2) {
424:         i1   = idx[0];          /* The instructions are ordered to */
425:         i2   = idx[1];          /* make the compiler's job easy */
426:         idx += 2;
427:         tmp0 = x[i1];
428:         tmp1 = x[i2];
429:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
430:        }
431: 
432:       if (n == sz-1){          /* Take care of the last nonzero  */
433:         tmp0  = x[*idx++];
434:         sum1 += *v1++ * tmp0;
435:       }
436:       y[row++]=sum1;
437:       break;
438:     case 2:
439:       sum1  = 0;
440:       sum2  = 0;
441:       v2    = v1 + n;
442: 
443:       for (n = 0; n< sz-1; n+=2) {
444:         i1   = idx[0];
445:         i2   = idx[1];
446:         idx += 2;
447:         tmp0 = x[i1];
448:         tmp1 = x[i2];
449:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
450:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
451:       }
452:       if (n == sz-1){
453:         tmp0  = x[*idx++];
454:         sum1 += *v1++ * tmp0;
455:         sum2 += *v2++ * tmp0;
456:       }
457:       y[row++]=sum1;
458:       y[row++]=sum2;
459:       v1      =v2;              /* Since the next block to be processed starts there*/
460:       idx    +=sz;
461:       break;
462:     case 3:
463:       sum1  = 0;
464:       sum2  = 0;
465:       sum3  = 0;
466:       v2    = v1 + n;
467:       v3    = v2 + n;
468: 
469:       for (n = 0; n< sz-1; n+=2) {
470:         i1   = idx[0];
471:         i2   = idx[1];
472:         idx += 2;
473:         tmp0 = x[i1];
474:         tmp1 = x[i2];
475:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
476:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
477:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
478:       }
479:       if (n == sz-1){
480:         tmp0  = x[*idx++];
481:         sum1 += *v1++ * tmp0;
482:         sum2 += *v2++ * tmp0;
483:         sum3 += *v3++ * tmp0;
484:       }
485:       y[row++]=sum1;
486:       y[row++]=sum2;
487:       y[row++]=sum3;
488:       v1       =v3;             /* Since the next block to be processed starts there*/
489:       idx     +=2*sz;
490:       break;
491:     case 4:
492:       sum1  = 0;
493:       sum2  = 0;
494:       sum3  = 0;
495:       sum4  = 0;
496:       v2    = v1 + n;
497:       v3    = v2 + n;
498:       v4    = v3 + n;
499: 
500:       for (n = 0; n< sz-1; n+=2) {
501:         i1   = idx[0];
502:         i2   = idx[1];
503:         idx += 2;
504:         tmp0 = x[i1];
505:         tmp1 = x[i2];
506:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
507:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
508:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
509:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
510:       }
511:       if (n == sz-1){
512:         tmp0  = x[*idx++];
513:         sum1 += *v1++ * tmp0;
514:         sum2 += *v2++ * tmp0;
515:         sum3 += *v3++ * tmp0;
516:         sum4 += *v4++ * tmp0;
517:       }
518:       y[row++]=sum1;
519:       y[row++]=sum2;
520:       y[row++]=sum3;
521:       y[row++]=sum4;
522:       v1      =v4;              /* Since the next block to be processed starts there*/
523:       idx    +=3*sz;
524:       break;
525:     case 5:
526:       sum1  = 0;
527:       sum2  = 0;
528:       sum3  = 0;
529:       sum4  = 0;
530:       sum5  = 0;
531:       v2    = v1 + n;
532:       v3    = v2 + n;
533:       v4    = v3 + n;
534:       v5    = v4 + n;
535: 
536:       for (n = 0; n<sz-1; n+=2) {
537:         i1   = idx[0];
538:         i2   = idx[1];
539:         idx += 2;
540:         tmp0 = x[i1];
541:         tmp1 = x[i2];
542:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
543:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
544:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
545:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
546:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
547:       }
548:       if (n == sz-1){
549:         tmp0  = x[*idx++];
550:         sum1 += *v1++ * tmp0;
551:         sum2 += *v2++ * tmp0;
552:         sum3 += *v3++ * tmp0;
553:         sum4 += *v4++ * tmp0;
554:         sum5 += *v5++ * tmp0;
555:       }
556:       y[row++]=sum1;
557:       y[row++]=sum2;
558:       y[row++]=sum3;
559:       y[row++]=sum4;
560:       y[row++]=sum5;
561:       v1      =v5;       /* Since the next block to be processed starts there */
562:       idx    +=4*sz;
563:       break;
564:     default :
565:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
566:     }
567:   }
568:   VecRestoreArray(xx,&x);
569:   VecRestoreArray(yy,&y);
570:   PetscLogFlops(2*a->nz - A->m);
571:   return(0);
572: }
573: /* ----------------------------------------------------------- */
574: /* Almost same code as the MatMult_SeqAij_Inode() */
577: static int MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
578: {
579:   Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data;
580:   PetscScalar  sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
581:   PetscScalar  *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
582:   int          ierr,*idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
583: 
585:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
586:   node_max = a->inode.node_count;
587:   ns       = a->inode.size;     /* Node Size array */
588:   VecGetArray(xx,&x);
589:   VecGetArray(yy,&y);
590:   if (zz != yy) {
591:     VecGetArray(zz,&z);
592:   } else {
593:     z = y;
594:   }
595:   zt = z;

597:   idx  = a->j;
598:   v1   = a->a;
599:   ii   = a->i;

601:   for (i = 0,row = 0; i< node_max; ++i){
602:     nsz  = ns[i];
603:     n    = ii[1] - ii[0];
604:     ii  += nsz;
605:     sz   = n;                   /* No of non zeros in this row */
606:                                 /* Switch on the size of Node */
607:     switch (nsz){               /* Each loop in 'case' is unrolled */
608:     case 1 :
609:       sum1  = *zt++;
610: 
611:       for(n = 0; n< sz-1; n+=2) {
612:         i1   = idx[0];          /* The instructions are ordered to */
613:         i2   = idx[1];          /* make the compiler's job easy */
614:         idx += 2;
615:         tmp0 = x[i1];
616:         tmp1 = x[i2];
617:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
618:        }
619: 
620:       if(n   == sz-1){          /* Take care of the last nonzero  */
621:         tmp0  = x[*idx++];
622:         sum1 += *v1++ * tmp0;
623:       }
624:       y[row++]=sum1;
625:       break;
626:     case 2:
627:       sum1  = *zt++;
628:       sum2  = *zt++;
629:       v2    = v1 + n;
630: 
631:       for(n = 0; n< sz-1; n+=2) {
632:         i1   = idx[0];
633:         i2   = idx[1];
634:         idx += 2;
635:         tmp0 = x[i1];
636:         tmp1 = x[i2];
637:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
638:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
639:       }
640:       if(n   == sz-1){
641:         tmp0  = x[*idx++];
642:         sum1 += *v1++ * tmp0;
643:         sum2 += *v2++ * tmp0;
644:       }
645:       y[row++]=sum1;
646:       y[row++]=sum2;
647:       v1      =v2;              /* Since the next block to be processed starts there*/
648:       idx    +=sz;
649:       break;
650:     case 3:
651:       sum1  = *zt++;
652:       sum2  = *zt++;
653:       sum3  = *zt++;
654:       v2    = v1 + n;
655:       v3    = v2 + n;
656: 
657:       for (n = 0; n< sz-1; n+=2) {
658:         i1   = idx[0];
659:         i2   = idx[1];
660:         idx += 2;
661:         tmp0 = x[i1];
662:         tmp1 = x[i2];
663:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
664:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
665:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
666:       }
667:       if (n == sz-1){
668:         tmp0  = x[*idx++];
669:         sum1 += *v1++ * tmp0;
670:         sum2 += *v2++ * tmp0;
671:         sum3 += *v3++ * tmp0;
672:       }
673:       y[row++]=sum1;
674:       y[row++]=sum2;
675:       y[row++]=sum3;
676:       v1       =v3;             /* Since the next block to be processed starts there*/
677:       idx     +=2*sz;
678:       break;
679:     case 4:
680:       sum1  = *zt++;
681:       sum2  = *zt++;
682:       sum3  = *zt++;
683:       sum4  = *zt++;
684:       v2    = v1 + n;
685:       v3    = v2 + n;
686:       v4    = v3 + n;
687: 
688:       for (n = 0; n< sz-1; n+=2) {
689:         i1   = idx[0];
690:         i2   = idx[1];
691:         idx += 2;
692:         tmp0 = x[i1];
693:         tmp1 = x[i2];
694:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
695:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
696:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
697:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
698:       }
699:       if (n == sz-1){
700:         tmp0  = x[*idx++];
701:         sum1 += *v1++ * tmp0;
702:         sum2 += *v2++ * tmp0;
703:         sum3 += *v3++ * tmp0;
704:         sum4 += *v4++ * tmp0;
705:       }
706:       y[row++]=sum1;
707:       y[row++]=sum2;
708:       y[row++]=sum3;
709:       y[row++]=sum4;
710:       v1      =v4;              /* Since the next block to be processed starts there*/
711:       idx    +=3*sz;
712:       break;
713:     case 5:
714:       sum1  = *zt++;
715:       sum2  = *zt++;
716:       sum3  = *zt++;
717:       sum4  = *zt++;
718:       sum5  = *zt++;
719:       v2    = v1 + n;
720:       v3    = v2 + n;
721:       v4    = v3 + n;
722:       v5    = v4 + n;
723: 
724:       for (n = 0; n<sz-1; n+=2) {
725:         i1   = idx[0];
726:         i2   = idx[1];
727:         idx += 2;
728:         tmp0 = x[i1];
729:         tmp1 = x[i2];
730:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
731:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
732:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
733:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
734:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
735:       }
736:       if(n   == sz-1){
737:         tmp0  = x[*idx++];
738:         sum1 += *v1++ * tmp0;
739:         sum2 += *v2++ * tmp0;
740:         sum3 += *v3++ * tmp0;
741:         sum4 += *v4++ * tmp0;
742:         sum5 += *v5++ * tmp0;
743:       }
744:       y[row++]=sum1;
745:       y[row++]=sum2;
746:       y[row++]=sum3;
747:       y[row++]=sum4;
748:       y[row++]=sum5;
749:       v1      =v5;       /* Since the next block to be processed starts there */
750:       idx    +=4*sz;
751:       break;
752:     default :
753:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
754:     }
755:   }
756:   VecRestoreArray(xx,&x);
757:   VecRestoreArray(yy,&y);
758:   if (zz != yy) {
759:     VecRestoreArray(zz,&z);
760:   }
761:   PetscLogFlops(2*a->nz);
762:   return(0);
763: }
764: /* ----------------------------------------------------------- */
765: EXTERN int MatColoringPatch_SeqAIJ_Inode(Mat,int,int,const ISColoringValue[],ISColoring *);

767: /*
768:     samestructure indicates that the matrix has not changed its nonzero structure so we 
769:     do not need to recompute the inodes 
770: */
773: int Mat_AIJ_CheckInode(Mat A,PetscTruth samestructure)
774: {
775:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
776:   int        ierr,i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
777:   PetscTruth flag,flg;

780:   if (samestructure && a->inode.checked) return(0);

782:   a->inode.checked = PETSC_TRUE;

784:   /* Notes: We set a->inode.limit=5 in MatCreateSeqAIJ(). */
785:   if (!a->inode.use) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to MatSetOption(MAT_DO_NOT_USE_INODES\n"); return(0);}
786:   PetscOptionsHasName(A->prefix,"-mat_aij_no_inode",&flg);
787:   if (flg) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to -mat_aij_no_inode\n");return(0);}
788:   PetscOptionsHasName(A->prefix,"-mat_no_unroll",&flg);
789:   if (flg) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to -mat_no_unroll\n");return(0);}
790:   PetscOptionsGetInt(A->prefix,"-mat_aij_inode_limit",&a->inode.limit,PETSC_NULL);
791:   if (a->inode.limit > a->inode.max_limit) a->inode.limit = a->inode.max_limit;
792:   m = A->m;
793:   if (a->inode.size) {ns = a->inode.size;}
794:   else {PetscMalloc((m+1)*sizeof(int),&ns);}

796:   i          = 0;
797:   node_count = 0;
798:   idx        = a->j;
799:   ii         = a->i;
800:   while (i < m){                /* For each row */
801:     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
802:     /* Limits the number of elements in a node to 'a->inode.limit' */
803:     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
804:       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
805:       if (nzy != nzx) break;
806:       idy  += nzx;             /* Same nonzero pattern */
807:       PetscMemcmp(idx,idy,nzx*sizeof(int),&flag);
808:       if (flag == PETSC_FALSE) break;
809:     }
810:     ns[node_count++] = blk_size;
811:     idx += blk_size*nzx;
812:     i    = j;
813:   }
814:   /* If not enough inodes found,, do not use inode version of the routines */
815:   if (!a->inode.size && m && node_count > 0.9*m) {
816:     PetscFree(ns);
817:     A->ops->mult            = MatMult_SeqAIJ;
818:     A->ops->multadd         = MatMultAdd_SeqAIJ;
819:     A->ops->solve           = MatSolve_SeqAIJ;
820:     A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ;
821:     A->ops->getrowij        = MatGetRowIJ_SeqAIJ;
822:     A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ;
823:     A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ;
824:     A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
825:     A->ops->coloringpatch   = 0;
826:     a->inode.node_count     = 0;
827:     a->inode.size           = 0;
828:     PetscLogInfo(A,"Mat_AIJ_CheckInode: Found %d nodes out of %d rows. Not using Inode routines\n",node_count,m);
829:   } else {
830:     A->ops->mult            = MatMult_SeqAIJ_Inode;
831:     A->ops->multadd         = MatMultAdd_SeqAIJ_Inode;
832:     A->ops->solve           = MatSolve_SeqAIJ_Inode;
833:     A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ_Inode;
834:     A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
835:     A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
836:     A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
837:     A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
838:     A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
839:     a->inode.node_count     = node_count;
840:     a->inode.size           = ns;
841:     PetscLogInfo(A,"Mat_AIJ_CheckInode: Found %d nodes of %d. Limit used: %d. Using Inode routines\n",node_count,m,a->inode.limit);
842:   }
843:   return(0);
844: }

846: /* ----------------------------------------------------------- */
849: int MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
850: {
851:   Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data;
852:   IS          iscol = a->col,isrow = a->row;
853:   int         *r,*c,ierr,i,j,n = A->m,*ai = a->i,nz,*a_j = a->j;
854:   int         node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
855:   PetscScalar *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1;
856:   PetscScalar sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5;

859:   if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
860:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
861:   node_max = a->inode.node_count;
862:   ns       = a->inode.size;     /* Node Size array */

864:   VecGetArray(bb,&b);
865:   VecGetArray(xx,&x);
866:   tmp  = a->solve_work;
867: 
868:   ISGetIndices(isrow,&rout); r = rout;
869:   ISGetIndices(iscol,&cout); c = cout + (n-1);
870: 
871:   /* forward solve the lower triangular */
872:   tmps = tmp ;
873:   aa   = a_a ;
874:   aj   = a_j ;
875:   ad   = a->diag;

877:   for (i = 0,row = 0; i< node_max; ++i){
878:     nsz = ns[i];
879:     aii = ai[row];
880:     v1  = aa + aii;
881:     vi  = aj + aii;
882:     nz  = ad[row]- aii;
883: 
884:     switch (nsz){               /* Each loop in 'case' is unrolled */
885:     case 1 :
886:       sum1 = b[*r++];
887:       /*      while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
888:       for(j=0; j<nz-1; j+=2){
889:         i0   = vi[0];
890:         i1   = vi[1];
891:         vi  +=2;
892:         tmp0 = tmps[i0];
893:         tmp1 = tmps[i1];
894:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
895:       }
896:       if(j == nz-1){
897:         tmp0 = tmps[*vi++];
898:         sum1 -= *v1++ *tmp0;
899:       }
900:       tmp[row ++]=sum1;
901:       break;
902:     case 2:
903:       sum1 = b[*r++];
904:       sum2 = b[*r++];
905:       v2   = aa + ai[row+1];

907:       for(j=0; j<nz-1; j+=2){
908:         i0   = vi[0];
909:         i1   = vi[1];
910:         vi  +=2;
911:         tmp0 = tmps[i0];
912:         tmp1 = tmps[i1];
913:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
914:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
915:       }
916:       if(j == nz-1){
917:         tmp0 = tmps[*vi++];
918:         sum1 -= *v1++ *tmp0;
919:         sum2 -= *v2++ *tmp0;
920:       }
921:       sum2 -= *v2++ * sum1;
922:       tmp[row ++]=sum1;
923:       tmp[row ++]=sum2;
924:       break;
925:     case 3:
926:       sum1 = b[*r++];
927:       sum2 = b[*r++];
928:       sum3 = b[*r++];
929:       v2   = aa + ai[row+1];
930:       v3   = aa + ai[row+2];
931: 
932:       for (j=0; j<nz-1; j+=2){
933:         i0   = vi[0];
934:         i1   = vi[1];
935:         vi  +=2;
936:         tmp0 = tmps[i0];
937:         tmp1 = tmps[i1];
938:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
939:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
940:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
941:       }
942:       if (j == nz-1){
943:         tmp0 = tmps[*vi++];
944:         sum1 -= *v1++ *tmp0;
945:         sum2 -= *v2++ *tmp0;
946:         sum3 -= *v3++ *tmp0;
947:       }
948:       sum2 -= *v2++ * sum1;
949:       sum3 -= *v3++ * sum1;
950:       sum3 -= *v3++ * sum2;
951:       tmp[row ++]=sum1;
952:       tmp[row ++]=sum2;
953:       tmp[row ++]=sum3;
954:       break;
955: 
956:     case 4:
957:       sum1 = b[*r++];
958:       sum2 = b[*r++];
959:       sum3 = b[*r++];
960:       sum4 = b[*r++];
961:       v2   = aa + ai[row+1];
962:       v3   = aa + ai[row+2];
963:       v4   = aa + ai[row+3];
964: 
965:       for (j=0; j<nz-1; j+=2){
966:         i0   = vi[0];
967:         i1   = vi[1];
968:         vi  +=2;
969:         tmp0 = tmps[i0];
970:         tmp1 = tmps[i1];
971:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
972:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
973:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
974:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
975:       }
976:       if (j == nz-1){
977:         tmp0 = tmps[*vi++];
978:         sum1 -= *v1++ *tmp0;
979:         sum2 -= *v2++ *tmp0;
980:         sum3 -= *v3++ *tmp0;
981:         sum4 -= *v4++ *tmp0;
982:       }
983:       sum2 -= *v2++ * sum1;
984:       sum3 -= *v3++ * sum1;
985:       sum4 -= *v4++ * sum1;
986:       sum3 -= *v3++ * sum2;
987:       sum4 -= *v4++ * sum2;
988:       sum4 -= *v4++ * sum3;
989: 
990:       tmp[row ++]=sum1;
991:       tmp[row ++]=sum2;
992:       tmp[row ++]=sum3;
993:       tmp[row ++]=sum4;
994:       break;
995:     case 5:
996:       sum1 = b[*r++];
997:       sum2 = b[*r++];
998:       sum3 = b[*r++];
999:       sum4 = b[*r++];
1000:       sum5 = b[*r++];
1001:       v2   = aa + ai[row+1];
1002:       v3   = aa + ai[row+2];
1003:       v4   = aa + ai[row+3];
1004:       v5   = aa + ai[row+4];
1005: 
1006:       for (j=0; j<nz-1; j+=2){
1007:         i0   = vi[0];
1008:         i1   = vi[1];
1009:         vi  +=2;
1010:         tmp0 = tmps[i0];
1011:         tmp1 = tmps[i1];
1012:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1013:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1014:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1015:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1016:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1017:       }
1018:       if (j == nz-1){
1019:         tmp0 = tmps[*vi++];
1020:         sum1 -= *v1++ *tmp0;
1021:         sum2 -= *v2++ *tmp0;
1022:         sum3 -= *v3++ *tmp0;
1023:         sum4 -= *v4++ *tmp0;
1024:         sum5 -= *v5++ *tmp0;
1025:       }

1027:       sum2 -= *v2++ * sum1;
1028:       sum3 -= *v3++ * sum1;
1029:       sum4 -= *v4++ * sum1;
1030:       sum5 -= *v5++ * sum1;
1031:       sum3 -= *v3++ * sum2;
1032:       sum4 -= *v4++ * sum2;
1033:       sum5 -= *v5++ * sum2;
1034:       sum4 -= *v4++ * sum3;
1035:       sum5 -= *v5++ * sum3;
1036:       sum5 -= *v5++ * sum4;
1037: 
1038:       tmp[row ++]=sum1;
1039:       tmp[row ++]=sum2;
1040:       tmp[row ++]=sum3;
1041:       tmp[row ++]=sum4;
1042:       tmp[row ++]=sum5;
1043:       break;
1044:     default:
1045:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1046:     }
1047:   }
1048:   /* backward solve the upper triangular */
1049:   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
1050:     nsz = ns[i];
1051:     aii = ai[row+1] -1;
1052:     v1  = aa + aii;
1053:     vi  = aj + aii;
1054:     nz  = aii- ad[row];
1055:     switch (nsz){               /* Each loop in 'case' is unrolled */
1056:     case 1 :
1057:       sum1 = tmp[row];

1059:       for(j=nz ; j>1; j-=2){
1060:         i0   = vi[0];
1061:         i1   = vi[-1];
1062:         vi  -=2;
1063:         tmp0 = tmps[i0];
1064:         tmp1 = tmps[i1];
1065:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1066:       }
1067:       if (j==1){
1068:         tmp0  = tmps[*vi--];
1069:         sum1 -= *v1-- * tmp0;
1070:       }
1071:       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1072:       break;
1073:     case 2 :
1074:       sum1 = tmp[row];
1075:       sum2 = tmp[row -1];
1076:       v2   = aa + ai[row]-1;
1077:       for (j=nz ; j>1; j-=2){
1078:         i0   = vi[0];
1079:         i1   = vi[-1];
1080:         vi  -=2;
1081:         tmp0 = tmps[i0];
1082:         tmp1 = tmps[i1];
1083:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1084:         sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1085:       }
1086:       if (j==1){
1087:         tmp0  = tmps[*vi--];
1088:         sum1 -= *v1-- * tmp0;
1089:         sum2 -= *v2-- * tmp0;
1090:       }
1091: 
1092:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1093:       sum2   -= *v2-- * tmp0;
1094:       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1095:       break;
1096:     case 3 :
1097:       sum1 = tmp[row];
1098:       sum2 = tmp[row -1];
1099:       sum3 = tmp[row -2];
1100:       v2   = aa + ai[row]-1;
1101:       v3   = aa + ai[row -1]-1;
1102:       for (j=nz ; j>1; j-=2){
1103:         i0   = vi[0];
1104:         i1   = vi[-1];
1105:         vi  -=2;
1106:         tmp0 = tmps[i0];
1107:         tmp1 = tmps[i1];
1108:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1109:         sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1110:         sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1111:       }
1112:       if (j==1){
1113:         tmp0  = tmps[*vi--];
1114:         sum1 -= *v1-- * tmp0;
1115:         sum2 -= *v2-- * tmp0;
1116:         sum3 -= *v3-- * tmp0;
1117:       }
1118:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1119:       sum2   -= *v2-- * tmp0;
1120:       sum3   -= *v3-- * tmp0;
1121:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1122:       sum3   -= *v3-- * tmp0;
1123:       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1124: 
1125:       break;
1126:     case 4 :
1127:       sum1 = tmp[row];
1128:       sum2 = tmp[row -1];
1129:       sum3 = tmp[row -2];
1130:       sum4 = tmp[row -3];
1131:       v2   = aa + ai[row]-1;
1132:       v3   = aa + ai[row -1]-1;
1133:       v4   = aa + ai[row -2]-1;

1135:       for (j=nz ; j>1; j-=2){
1136:         i0   = vi[0];
1137:         i1   = vi[-1];
1138:         vi  -=2;
1139:         tmp0 = tmps[i0];
1140:         tmp1 = tmps[i1];
1141:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1142:         sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1143:         sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1144:         sum4 -= v4[0] * tmp0 + v4[-1] * tmp1; v4 -= 2;
1145:       }
1146:       if (j==1){
1147:         tmp0  = tmps[*vi--];
1148:         sum1 -= *v1-- * tmp0;
1149:         sum2 -= *v2-- * tmp0;
1150:         sum3 -= *v3-- * tmp0;
1151:         sum4 -= *v4-- * tmp0;
1152:       }

1154:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1155:       sum2   -= *v2-- * tmp0;
1156:       sum3   -= *v3-- * tmp0;
1157:       sum4   -= *v4-- * tmp0;
1158:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1159:       sum3   -= *v3-- * tmp0;
1160:       sum4   -= *v4-- * tmp0;
1161:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1162:       sum4   -= *v4-- * tmp0;
1163:       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1164:       break;
1165:     case 5 :
1166:       sum1 = tmp[row];
1167:       sum2 = tmp[row -1];
1168:       sum3 = tmp[row -2];
1169:       sum4 = tmp[row -3];
1170:       sum5 = tmp[row -4];
1171:       v2   = aa + ai[row]-1;
1172:       v3   = aa + ai[row -1]-1;
1173:       v4   = aa + ai[row -2]-1;
1174:       v5   = aa + ai[row -3]-1;
1175:       for (j=nz ; j>1; j-=2){
1176:         i0   = vi[0];
1177:         i1   = vi[-1];
1178:         vi  -=2;
1179:         tmp0 = tmps[i0];
1180:         tmp1 = tmps[i1];
1181:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1182:         sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1183:         sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1184:         sum4 -= v4[0] * tmp0 + v4[-1] * tmp1; v4 -= 2;
1185:         sum5 -= v5[0] * tmp0 + v5[-1] * tmp1; v5 -= 2;
1186:       }
1187:       if (j==1){
1188:         tmp0  = tmps[*vi--];
1189:         sum1 -= *v1-- * tmp0;
1190:         sum2 -= *v2-- * tmp0;
1191:         sum3 -= *v3-- * tmp0;
1192:         sum4 -= *v4-- * tmp0;
1193:         sum5 -= *v5-- * tmp0;
1194:       }

1196:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1197:       sum2   -= *v2-- * tmp0;
1198:       sum3   -= *v3-- * tmp0;
1199:       sum4   -= *v4-- * tmp0;
1200:       sum5   -= *v5-- * tmp0;
1201:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1202:       sum3   -= *v3-- * tmp0;
1203:       sum4   -= *v4-- * tmp0;
1204:       sum5   -= *v5-- * tmp0;
1205:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1206:       sum4   -= *v4-- * tmp0;
1207:       sum5   -= *v5-- * tmp0;
1208:       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1209:       sum5   -= *v5-- * tmp0;
1210:       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1211:       break;
1212:     default:
1213:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1214:     }
1215:   }
1216:   ISRestoreIndices(isrow,&rout);
1217:   ISRestoreIndices(iscol,&cout);
1218:   VecRestoreArray(bb,&b);
1219:   VecRestoreArray(xx,&x);
1220:   PetscLogFlops(2*a->nz - A->n);
1221:   return(0);
1222: }


1227: int MatLUFactorNumeric_SeqAIJ_Inode(Mat A,Mat *B)
1228: {
1229:   Mat          C = *B;
1230:   Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ *)C->data;
1231:   IS           iscol = b->col,isrow = b->row,isicol = b->icol;
1232:   int          *r,*ic,*c,ierr,n = A->m,*bi = b->i;
1233:   int          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,row,prow;
1234:   int          *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nsz;
1235:   int          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj,ndamp = 0;
1236:   PetscScalar  *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3;
1237:   PetscScalar  tmp,*ba = b->a,*aa = a->a,*pv,*rtmps1,*rtmps2,*rtmps3;
1238:   PetscReal    damping = b->lu_damping,zeropivot = b->lu_zeropivot;
1239:   PetscTruth   damp;

1242:   ISGetIndices(isrow,&r);
1243:   ISGetIndices(iscol,&c);
1244:   ISGetIndices(isicol,&ic);
1245:   PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);
1246:   PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));
1247:   ics    = ic ; rtmps1 = rtmp1 ;
1248:   rtmp2  = rtmp1 + n;  rtmps2 = rtmp2 ;
1249:   rtmp3  = rtmp2 + n;  rtmps3 = rtmp3 ;
1250: 
1251:   node_max = a->inode.node_count;
1252:   ns       = a->inode.size ;
1253:   if (!ns){
1254:     SETERRQ(1,"Matrix without inode information");
1255:   }

1257:   /* If max inode size > 3, split it into two inodes.*/
1258:   /* also map the inode sizes according to the ordering */
1259:   PetscMalloc((n+1)* sizeof(int),&tmp_vec1);
1260:   for (i=0,j=0; i<node_max; ++i,++j){
1261:     if (ns[i]>3) {
1262:       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1263:       ++j;
1264:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1265:     } else {
1266:       tmp_vec1[j] = ns[i];
1267:     }
1268:   }
1269:   /* Use the correct node_max */
1270:   node_max = j;

1272:   /* Now reorder the inode info based on mat re-ordering info */
1273:   /* First create a row -> inode_size_array_index map */
1274:   PetscMalloc(n*sizeof(int)+1,&nsmap);
1275:   PetscMalloc(node_max*sizeof(int)+1,&tmp_vec2);
1276:   for (i=0,row=0; i<node_max; i++) {
1277:     nsz = tmp_vec1[i];
1278:     for (j=0; j<nsz; j++,row++) {
1279:       nsmap[row] = i;
1280:     }
1281:   }
1282:   /* Using nsmap, create a reordered ns structure */
1283:   for (i=0,j=0; i< node_max; i++) {
1284:     nsz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1285:     tmp_vec2[i] = nsz;
1286:     j        += nsz;
1287:   }
1288:   PetscFree(nsmap);
1289:   PetscFree(tmp_vec1);
1290:   /* Now use the correct ns */
1291:   ns = tmp_vec2;


1294:   do {
1295:     damp = PETSC_FALSE;
1296:     /* Now loop over each block-row, and do the factorization */
1297:     for (i=0,row=0; i<node_max; i++) {
1298:       nsz   = ns[i];
1299:       nz    = bi[row+1] - bi[row];
1300:       bjtmp = bj + bi[row];
1301: 
1302:       switch (nsz){
1303:       case 1:
1304:         for  (j=0; j<nz; j++){
1305:           idx         = bjtmp[j];
1306:           rtmps1[idx] = 0.0;
1307:         }
1308: 
1309:         /* load in initial (unfactored row) */
1310:         idx   = r[row];
1311:         nz    = ai[idx+1] - ai[idx];
1312:         ajtmp = aj + ai[idx];
1313:         v1    = aa + ai[idx];

1315:         for (j=0; j<nz; j++) {
1316:           idx        = ics[ajtmp[j]];
1317:           rtmp1[idx] = v1[j];
1318:           if (ndamp && ajtmp[j] == r[row]) {
1319:             rtmp1[idx] += damping;
1320:           }
1321:         }
1322:         prow = *bjtmp++ ;
1323:         while (prow < row) {
1324:           pc1 = rtmp1 + prow;
1325:           if (*pc1 != 0.0){
1326:             pv   = ba + bd[prow];
1327:             pj   = nbj + bd[prow];
1328:             mul1 = *pc1 * *pv++;
1329:             *pc1 = mul1;
1330:             nz   = bi[prow+1] - bd[prow] - 1;
1331:             PetscLogFlops(2*nz);
1332:             for (j=0; j<nz; j++) {
1333:               tmp = pv[j];
1334:               idx = pj[j];
1335:               rtmps1[idx] -= mul1 * tmp;
1336:             }
1337:           }
1338:           prow = *bjtmp++ ;
1339:         }
1340:         nz  = bi[row+1] - bi[row];
1341:         pj  = bj + bi[row];
1342:         pc1 = ba + bi[row];
1343:         if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1344:           if (damping) {
1345:             if (ndamp) damping *= 2.0;
1346:             damp = PETSC_TRUE;
1347:             ndamp++;
1348:             goto endofwhile;
1349:           } else {
1350:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1351:           }
1352:         }
1353:         rtmp1[row] = 1.0/rtmp1[row];
1354:         for (j=0; j<nz; j++) {
1355:           idx    = pj[j];
1356:           pc1[j] = rtmps1[idx];
1357:         }
1358:         break;
1359: 
1360:       case 2:
1361:         for  (j=0; j<nz; j++) {
1362:           idx         = bjtmp[j];
1363:           rtmps1[idx] = 0.0;
1364:           rtmps2[idx] = 0.0;
1365:         }
1366: 
1367:         /* load in initial (unfactored row) */
1368:         idx   = r[row];
1369:         nz    = ai[idx+1] - ai[idx];
1370:         ajtmp = aj + ai[idx];
1371:         v1    = aa + ai[idx];
1372:         v2    = aa + ai[idx+1];
1373: 
1374:         for (j=0; j<nz; j++) {
1375:           idx        = ics[ajtmp[j]];
1376:           rtmp1[idx] = v1[j];
1377:           rtmp2[idx] = v2[j];
1378:           if (ndamp && ajtmp[j] == r[row]) {
1379:             rtmp1[idx] += damping;
1380:           }
1381:           if (ndamp && ajtmp[j] == r[row+1]) {
1382:             rtmp2[idx] += damping;
1383:           }
1384:         }
1385:         prow = *bjtmp++ ;
1386:         while (prow < row) {
1387:           pc1 = rtmp1 + prow;
1388:           pc2 = rtmp2 + prow;
1389:           if (*pc1 != 0.0 || *pc2 != 0.0){
1390:             pv   = ba + bd[prow];
1391:             pj   = nbj + bd[prow];
1392:             mul1 = *pc1 * *pv;
1393:             mul2 = *pc2 * *pv;
1394:             ++pv;
1395:             *pc1 = mul1;
1396:             *pc2 = mul2;
1397: 
1398:             nz   = bi[prow+1] - bd[prow] - 1;
1399:             PetscLogFlops(2*2*nz);
1400:             for (j=0; j<nz; j++) {
1401:               tmp = pv[j];
1402:               idx = pj[j];
1403:               rtmps1[idx] -= mul1 * tmp;
1404:               rtmps2[idx] -= mul2 * tmp;
1405:             }
1406:           }
1407:           prow = *bjtmp++ ;
1408:         }
1409:         /* Now take care of the odd element*/
1410:         pc1 = rtmp1 + prow;
1411:         pc2 = rtmp2 + prow;
1412:         if (*pc2 != 0.0){
1413:           pj   = nbj + bd[prow];
1414:           if (PetscAbsScalar(*pc1) < zeropivot) {
1415:             if (damping) {
1416:               if (ndamp) damping *= 2.0;
1417:               damp = PETSC_TRUE;
1418:               ndamp++;
1419:               goto endofwhile;
1420:             } else {
1421:               SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",prow,PetscAbsScalar(*pc1),zeropivot);
1422:             }
1423:           }
1424:           mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1425:           *pc2 = mul2;
1426:           nz   = bi[prow+1] - bd[prow] - 1;
1427:           PetscLogFlops(2*nz);
1428:           for (j=0; j<nz; j++) {
1429:             idx = pj[j] ;
1430:             tmp = rtmp1[idx];
1431:             rtmp2[idx] -= mul2 * tmp;
1432:           }
1433:         }
1434: 
1435:         nz  = bi[row+1] - bi[row];
1436:         pj  = bj + bi[row];
1437:         pc1 = ba + bi[row];
1438:         pc2 = ba + bi[row+1];
1439:         if (PetscAbsScalar(rtmp1[row]) < zeropivot || PetscAbsScalar(rtmp2[row+1]) < zeropivot) {
1440:           if (damping) {
1441:             if (ndamp) damping *= 2.0;
1442:             damp = PETSC_TRUE;
1443:             ndamp++;
1444:             goto endofwhile;
1445:           } else if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1446:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1447:           } else {
1448:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row+1,PetscAbsScalar(rtmp2[row+1]),zeropivot);
1449:           }
1450:         }
1451:         rtmp1[row]   = 1.0/rtmp1[row];
1452:         rtmp2[row+1] = 1.0/rtmp2[row+1];
1453:         for (j=0; j<nz; j++) {
1454:           idx    = pj[j];
1455:           pc1[j] = rtmps1[idx];
1456:           pc2[j] = rtmps2[idx];
1457:         }
1458:         break;

1460:       case 3:
1461:         for  (j=0; j<nz; j++) {
1462:           idx         = bjtmp[j];
1463:           rtmps1[idx] = 0.0;
1464:           rtmps2[idx] = 0.0;
1465:           rtmps3[idx] = 0.0;
1466:         }
1467:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1468:         idx   = r[row];
1469:         nz    = ai[idx+1] - ai[idx];
1470:         ajtmp = aj + ai[idx];
1471:         v1    = aa + ai[idx];
1472:         v2    = aa + ai[idx+1];
1473:         v3    = aa + ai[idx+2];
1474:         for (j=0; j<nz; j++) {
1475:           idx        = ics[ajtmp[j]];
1476:           rtmp1[idx] = v1[j];
1477:           rtmp2[idx] = v2[j];
1478:           rtmp3[idx] = v3[j];
1479:           if (ndamp && ajtmp[j] == r[row]) {
1480:             rtmp1[idx] += damping;
1481:           }
1482:           if (ndamp && ajtmp[j] == r[row+1]) {
1483:             rtmp2[idx] += damping;
1484:           }
1485:           if (ndamp && ajtmp[j] == r[row+2]) {
1486:             rtmp3[idx] += damping;
1487:           }
1488:         }
1489:         /* loop over all pivot row blocks above this row block */
1490:         prow = *bjtmp++ ;
1491:         while (prow < row) {
1492:           pc1 = rtmp1 + prow;
1493:           pc2 = rtmp2 + prow;
1494:           pc3 = rtmp3 + prow;
1495:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1496:             pv   = ba  + bd[prow];
1497:             pj   = nbj + bd[prow];
1498:             mul1 = *pc1 * *pv;
1499:             mul2 = *pc2 * *pv;
1500:             mul3 = *pc3 * *pv;
1501:             ++pv;
1502:             *pc1 = mul1;
1503:             *pc2 = mul2;
1504:             *pc3 = mul3;
1505: 
1506:             nz   = bi[prow+1] - bd[prow] - 1;
1507:             PetscLogFlops(3*2*nz);
1508:             /* update this row based on pivot row */
1509:             for (j=0; j<nz; j++) {
1510:               tmp = pv[j];
1511:               idx = pj[j];
1512:               rtmps1[idx] -= mul1 * tmp;
1513:               rtmps2[idx] -= mul2 * tmp;
1514:               rtmps3[idx] -= mul3 * tmp;
1515:             }
1516:           }
1517:           prow = *bjtmp++ ;
1518:         }
1519:         /* Now take care of diagonal block in this set of rows */
1520:         pc1 = rtmp1 + prow;
1521:         pc2 = rtmp2 + prow;
1522:         pc3 = rtmp3 + prow;
1523:         if (*pc2 != 0.0 || *pc3 != 0.0){
1524:           pj   = nbj + bd[prow];
1525:           if (PetscAbsScalar(*pc1) < zeropivot) {
1526:             if (damping) {
1527:               if (ndamp) damping *= 2.0;
1528:               damp = PETSC_TRUE;
1529:               ndamp++;
1530:               goto endofwhile;
1531:             } else {
1532:               SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",prow,PetscAbsScalar(*pc1),zeropivot);
1533:             }
1534:           }
1535:           mul2 = (*pc2)/(*pc1);
1536:           mul3 = (*pc3)/(*pc1);
1537:           *pc2 = mul2;
1538:           *pc3 = mul3;
1539:           nz   = bi[prow+1] - bd[prow] - 1;
1540:           PetscLogFlops(2*2*nz);
1541:           for (j=0; j<nz; j++) {
1542:             idx = pj[j] ;
1543:             tmp = rtmp1[idx];
1544:             rtmp2[idx] -= mul2 * tmp;
1545:             rtmp3[idx] -= mul3 * tmp;
1546:           }
1547:         }
1548:         ++prow;
1549:         pc2 = rtmp2 + prow;
1550:         pc3 = rtmp3 + prow;
1551:         if (*pc3 != 0.0){
1552:           pj   = nbj + bd[prow];
1553:           pj   = nbj + bd[prow];
1554:           if (PetscAbsScalar(*pc2) < zeropivot) {
1555:             if (damping) {
1556:               if (ndamp) damping *= 2.0;
1557:               damp = PETSC_TRUE;
1558:               ndamp++;
1559:               goto endofwhile;
1560:             } else {
1561:               SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",prow,PetscAbsScalar(*pc2),zeropivot);
1562:             }
1563:           }
1564:           mul3 = (*pc3)/(*pc2);
1565:           *pc3 = mul3;
1566:           nz   = bi[prow+1] - bd[prow] - 1;
1567:           PetscLogFlops(2*2*nz);
1568:           for (j=0; j<nz; j++) {
1569:             idx = pj[j] ;
1570:             tmp = rtmp2[idx];
1571:             rtmp3[idx] -= mul3 * tmp;
1572:           }
1573:         }
1574:         nz  = bi[row+1] - bi[row];
1575:         pj  = bj + bi[row];
1576:         pc1 = ba + bi[row];
1577:         pc2 = ba + bi[row+1];
1578:         pc3 = ba + bi[row+2];
1579:         if (PetscAbsScalar(rtmp1[row]) < zeropivot || PetscAbsScalar(rtmp2[row+1]) < zeropivot || PetscAbsScalar(rtmp3[row+2]) < zeropivot) {
1580:           if (damping) {
1581:             if (ndamp) damping *= 2.0;
1582:             damp = PETSC_TRUE;
1583:             ndamp++;
1584:             goto endofwhile;
1585:           } else if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1586:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1587:           } else if (PetscAbsScalar(rtmp2[row+1]) < zeropivot) {
1588:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row+1,PetscAbsScalar(rtmp2[row+1]),zeropivot);
1589:         } else {
1590:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row+2,PetscAbsScalar(rtmp3[row+2]),zeropivot);
1591:           }
1592:         }
1593:         rtmp1[row]   = 1.0/rtmp1[row];
1594:         rtmp2[row+1] = 1.0/rtmp2[row+1];
1595:         rtmp3[row+2] = 1.0/rtmp3[row+2];
1596:         /* copy row entries from dense representation to sparse */
1597:         for (j=0; j<nz; j++) {
1598:           idx    = pj[j];
1599:           pc1[j] = rtmps1[idx];
1600:           pc2[j] = rtmps2[idx];
1601:           pc3[j] = rtmps3[idx];
1602:         }
1603:         break;

1605:       default:
1606:         SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1607:       }
1608:       row += nsz;                 /* Update the row */
1609:     }
1610:     endofwhile:;
1611:   } while (damp);
1612:   PetscFree(rtmp1);
1613:   PetscFree(tmp_vec2);
1614:   ISRestoreIndices(isicol,&ic);
1615:   ISRestoreIndices(isrow,&r);
1616:   ISRestoreIndices(iscol,&c);
1617:   C->factor      = FACTOR_LU;
1618:   C->assembled   = PETSC_TRUE;
1619:   if (ndamp || b->lu_damping) {
1620:     PetscLogInfo(0,"MatLUFactorNumerical_SeqAIJ_Inode: number of damping tries %d damping value %g\n",ndamp,damping);
1621:   }
1622:   PetscLogFlops(C->n);
1623:   return(0);
1624: }

1626: /*
1627:      This is really ugly. if inodes are used this replaces the 
1628:   permutations with ones that correspond to rows/cols of the matrix
1629:   rather then inode blocks
1630: */
1633: int MatAdjustForInodes(Mat A,IS *rperm,IS *cperm)
1634: {
1635:   int ierr,(*f)(Mat,IS*,IS*);

1638:   PetscObjectQueryFunction((PetscObject)A,"MatAdjustForInodes_C",(void (**)(void))&f);
1639:   if (f) {
1640:     (*f)(A,rperm,cperm);
1641:   }
1642:   return(0);
1643: }

1645: EXTERN_C_BEGIN
1648: int MatAdjustForInodes_SeqAIJ(Mat A,IS *rperm,IS *cperm)
1649: {
1650:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1651:   int        ierr,m = A->m,n = A->n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
1652:   int        row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
1653:   int        nslim_col,*ns_col;
1654:   IS         ris = *rperm,cis = *cperm;

1657:   if (!a->inode.size) return(0); /* no inodes so return */
1658:   if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */

1660:   Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
1661:   PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(int),&tns);
1662:   PetscMalloc((m+n+1)*sizeof(int),&permr);
1663:   permc = permr + m;

1665:   ISGetIndices(ris,&ridx);
1666:   ISGetIndices(cis,&cidx);

1668:   /* Form the inode structure for the rows of permuted matric using inv perm*/
1669:   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];

1671:   /* Construct the permutations for rows*/
1672:   for (i=0,row = 0; i<nslim_row; ++i){
1673:     indx      = ridx[i];
1674:     start_val = tns[indx];
1675:     end_val   = tns[indx + 1];
1676:     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
1677:   }

1679:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
1680:   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];

1682:  /* Construct permutations for columns */
1683:   for (i=0,col=0; i<nslim_col; ++i){
1684:     indx      = cidx[i];
1685:     start_val = tns[indx];
1686:     end_val   = tns[indx + 1];
1687:     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
1688:   }

1690:   ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);
1691:   ISSetPermutation(*rperm);
1692:   ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);
1693:   ISSetPermutation(*cperm);
1694: 
1695:   ISRestoreIndices(ris,&ridx);
1696:   ISRestoreIndices(cis,&cidx);

1698:   PetscFree(ns_col);
1699:   PetscFree(permr);
1700:   ISDestroy(cis);
1701:   ISDestroy(ris);
1702:   PetscFree(tns);
1703:   return(0);
1704: }
1705: EXTERN_C_END

1709: /*@C
1710:    MatSeqAIJGetInodeSizes - Returns the inode information of the SeqAIJ matrix.

1712:    Collective on Mat

1714:    Input Parameter:
1715: .  A - the SeqAIJ matrix.

1717:    Output Parameter:
1718: +  node_count - no of inodes present in the matrix.
1719: .  sizes      - an array of size node_count,with sizes of each inode.
1720: -  limit      - the max size used to generate the inodes.

1722:    Level: advanced

1724:    Notes: This routine returns some internal storage information
1725:    of the matrix, it is intended to be used by advanced users.
1726:    It should be called after the matrix is assembled.
1727:    The contents of the sizes[] array should not be changed.

1729: .keywords: matrix, seqaij, get, inode

1731: .seealso: MatGetInfo()
1732: @*/
1733: int MatSeqAIJGetInodeSizes(Mat A,int *node_count,int *sizes[],int *limit)
1734: {
1735:   int ierr,(*f)(Mat,int*,int*[],int*);

1738:   PetscObjectQueryFunction((PetscObject)A,"MatSeqAIJGetInodeSizes_C",(void (**)(void))&f);
1739:   if (f) {
1740:     (*f)(A,node_count,sizes,limit);
1741:   }
1742:   return(0);
1743: }

1745: EXTERN_C_BEGIN
1748: int MatSeqAIJGetInodeSizes_SeqAIJ(Mat A,int *node_count,int *sizes[],int *limit)
1749: {
1750:   Mat_SeqAIJ *a;

1753:   a           = (Mat_SeqAIJ*)A->data;
1754:   *node_count = a->inode.node_count;
1755:   *sizes      = a->inode.size;
1756:   *limit      = a->inode.limit;
1757:   return(0);
1758: }
1759: EXTERN_C_END