Actual source code: aijnode.c
1: /*$Id: aijnode.c,v 1.128 2001/08/07 03:02:47 balay Exp $*/
2: /*
3: This file provides high performance routines for the AIJ (compressed row)
4: format by taking advantage of rows with identical nonzero structure (I-nodes).
5: */
6: #include src/mat/impls/aij/seq/aij.h
8: EXTERN int Mat_AIJ_CheckInode(Mat,PetscTruth);
9: EXTERN int MatSolve_SeqAIJ_Inode(Mat,Vec,Vec);
10: EXTERN int MatLUFactorNumeric_SeqAIJ_Inode(Mat,Mat *);
12: EXTERN int MatMult_SeqAIJ(Mat,Vec,Vec);
13: EXTERN int MatMultAdd_SeqAIJ(Mat,Vec,Vec,Vec);
14: EXTERN int MatSolve_SeqAIJ(Mat,Vec,Vec);
15: EXTERN int MatLUFactorNumeric_SeqAIJ(Mat,Mat*);
16: EXTERN int MatGetRowIJ_SeqAIJ(Mat,int,PetscTruth,int*,int*[],int*[],PetscTruth*);
17: EXTERN int MatRestoreRowIJ_SeqAIJ(Mat,int,PetscTruth,int*,int*[],int*[],PetscTruth*);
18: EXTERN int MatGetColumnIJ_SeqAIJ(Mat,int,PetscTruth,int*,int*[],int*[],PetscTruth*);
19: EXTERN int MatRestoreColumnIJ_SeqAIJ(Mat,int,PetscTruth,int*,int*[],int*[],PetscTruth*);
23: static int Mat_AIJ_CreateColInode(Mat A,int* size,int ** ns)
24: {
25: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
26: int ierr,i,count,m,n,min_mn,*ns_row,*ns_col;
29: n = A->n;
30: m = A->m;
31: ns_row = a->inode.size;
32:
33: min_mn = (m < n) ? m : n;
34: if (!ns) {
35: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
36: for(; count+1 < n; count++,i++);
37: if (count < n) {
38: i++;
39: }
40: *size = i;
41: return(0);
42: }
43: PetscMalloc((n+1)*sizeof(int),&ns_col);
44:
45: /* Use the same row structure wherever feasible. */
46: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
47: ns_col[i] = ns_row[i];
48: }
50: /* if m < n; pad up the remainder with inode_limit */
51: for(; count+1 < n; count++,i++) {
52: ns_col[i] = 1;
53: }
54: /* The last node is the odd ball. padd it up with the remaining rows; */
55: if (count < n) {
56: ns_col[i] = n - count;
57: i++;
58: } else if (count > n) {
59: /* Adjust for the over estimation */
60: ns_col[i-1] += n - count;
61: }
62: *size = i;
63: *ns = ns_col;
64: return(0);
65: }
68: /*
69: This builds symmetric version of nonzero structure,
70: */
73: static int MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,int *iia[],int *jja[],int ishift,int oshift)
74: {
75: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
76: int *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n,ierr;
77: int *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
80: nslim_row = a->inode.node_count;
81: m = A->m;
82: n = A->n;
83: if (m != n) SETERRQ(1,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix shoul be square");
84:
85: /* Use the row_inode as column_inode */
86: nslim_col = nslim_row;
87: ns_col = ns_row;
89: /* allocate space for reformated inode structure */
90: PetscMalloc((nslim_col+1)*sizeof(int),&tns);
91: PetscMalloc((n+1)*sizeof(int),&tvc);
92: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
94: for (i1=0,col=0; i1<nslim_col; ++i1){
95: nsz = ns_col[i1];
96: for (i2=0; i2<nsz; ++i2,++col)
97: tvc[col] = i1;
98: }
99: /* allocate space for row pointers */
100: PetscMalloc((nslim_row+1)*sizeof(int),&ia);
101: *iia = ia;
102: PetscMemzero(ia,(nslim_row+1)*sizeof(int));
103: PetscMalloc((nslim_row+1)*sizeof(int),&work);
105: /* determine the number of columns in each row */
106: ia[0] = oshift;
107: for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
109: j = aj + ai[row] + ishift;
110: jmax = aj + ai[row+1] + ishift;
111: i2 = 0;
112: col = *j++ + ishift;
113: i2 = tvc[col];
114: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
115: ia[i1+1]++;
116: ia[i2+1]++;
117: i2++; /* Start col of next node */
118: while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
119: i2 = tvc[col];
120: }
121: if(i2 == i1) ia[i2+1]++; /* now the diagonal element */
122: }
124: /* shift ia[i] to point to next row */
125: for (i1=1; i1<nslim_row+1; i1++) {
126: row = ia[i1-1];
127: ia[i1] += row;
128: work[i1-1] = row - oshift;
129: }
131: /* allocate space for column pointers */
132: nz = ia[nslim_row] + (!ishift);
133: PetscMalloc(nz*sizeof(int),&ja);
134: *jja = ja;
136: /* loop over lower triangular part putting into ja */
137: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
138: j = aj + ai[row] + ishift;
139: jmax = aj + ai[row+1] + ishift;
140: i2 = 0; /* Col inode index */
141: col = *j++ + ishift;
142: i2 = tvc[col];
143: while (i2<i1 && j<jmax) {
144: ja[work[i2]++] = i1 + oshift;
145: ja[work[i1]++] = i2 + oshift;
146: ++i2;
147: while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
148: i2 = tvc[col];
149: }
150: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
152: }
153: PetscFree(work);
154: PetscFree(tns);
155: PetscFree(tvc);
156: return(0);
157: }
159: /*
160: This builds nonsymmetric version of nonzero structure,
161: */
164: static int MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,int *iia[],int *jja[],int ishift,int oshift)
165: {
166: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
167: int *work,*ia,*ja,*j,nz,nslim_row,n,row,col,ierr,*ns_col,nslim_col;
168: int *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
171: nslim_row = a->inode.node_count;
172: n = A->n;
174: /* Create The column_inode for this matrix */
175: Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
176:
177: /* allocate space for reformated column_inode structure */
178: PetscMalloc((nslim_col +1)*sizeof(int),&tns);
179: PetscMalloc((n +1)*sizeof(int),&tvc);
180: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
182: for (i1=0,col=0; i1<nslim_col; ++i1){
183: nsz = ns_col[i1];
184: for (i2=0; i2<nsz; ++i2,++col)
185: tvc[col] = i1;
186: }
187: /* allocate space for row pointers */
188: PetscMalloc((nslim_row+1)*sizeof(int),&ia);
189: *iia = ia;
190: PetscMemzero(ia,(nslim_row+1)*sizeof(int));
191: PetscMalloc((nslim_row+1)*sizeof(int),&work);
193: /* determine the number of columns in each row */
194: ia[0] = oshift;
195: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
196: j = aj + ai[row] + ishift;
197: col = *j++ + ishift;
198: i2 = tvc[col];
199: nz = ai[row+1] - ai[row];
200: while (nz-- > 0) { /* off-diagonal elemets */
201: ia[i1+1]++;
202: i2++; /* Start col of next node */
203: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
204: i2 = tvc[col];
205: }
206: }
208: /* shift ia[i] to point to next row */
209: for (i1=1; i1<nslim_row+1; i1++) {
210: row = ia[i1-1];
211: ia[i1] += row;
212: work[i1-1] = row - oshift;
213: }
215: /* allocate space for column pointers */
216: nz = ia[nslim_row] + (!ishift);
217: PetscMalloc(nz*sizeof(int),&ja);
218: *jja = ja;
220: /* loop over matrix putting into ja */
221: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
222: j = aj + ai[row] + ishift;
223: i2 = 0; /* Col inode index */
224: col = *j++ + ishift;
225: i2 = tvc[col];
226: nz = ai[row+1] - ai[row];
227: while (nz-- > 0) {
228: ja[work[i1]++] = i2 + oshift;
229: ++i2;
230: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
231: i2 = tvc[col];
232: }
233: }
234: PetscFree(ns_col);
235: PetscFree(work);
236: PetscFree(tns);
237: PetscFree(tvc);
238: return(0);
239: }
243: static int MatGetRowIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int *ia[],int *ja[],PetscTruth *done)
244: {
245: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
246: int ierr;
249: *n = a->inode.node_count;
250: if (!ia) return(0);
252: if (symmetric) {
253: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
254: } else {
255: MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
256: }
257: return(0);
258: }
262: static int MatRestoreRowIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int *ia[],int *ja[],PetscTruth *done)
263: {
267: if (!ia) return(0);
268: PetscFree(*ia);
269: PetscFree(*ja);
270: return(0);
271: }
273: /* ----------------------------------------------------------- */
277: static int MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,int *iia[],int *jja[],int ishift,int oshift)
278: {
279: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
280: int *work,*ia,*ja,*j,nz,nslim_row, n,row,col,ierr,*ns_col,nslim_col;
281: int *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
284: nslim_row = a->inode.node_count;
285: n = A->n;
287: /* Create The column_inode for this matrix */
288: Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
289:
290: /* allocate space for reformated column_inode structure */
291: PetscMalloc((nslim_col + 1)*sizeof(int),&tns);
292: PetscMalloc((n + 1)*sizeof(int),&tvc);
293: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
295: for (i1=0,col=0; i1<nslim_col; ++i1){
296: nsz = ns_col[i1];
297: for (i2=0; i2<nsz; ++i2,++col)
298: tvc[col] = i1;
299: }
300: /* allocate space for column pointers */
301: PetscMalloc((nslim_col+1)*sizeof(int),&ia);
302: *iia = ia;
303: PetscMemzero(ia,(nslim_col+1)*sizeof(int));
304: PetscMalloc((nslim_col+1)*sizeof(int),&work);
306: /* determine the number of columns in each row */
307: ia[0] = oshift;
308: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
309: j = aj + ai[row] + ishift;
310: col = *j++ + ishift;
311: i2 = tvc[col];
312: nz = ai[row+1] - ai[row];
313: while (nz-- > 0) { /* off-diagonal elemets */
314: /* ia[i1+1]++; */
315: ia[i2+1]++;
316: i2++;
317: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
318: i2 = tvc[col];
319: }
320: }
322: /* shift ia[i] to point to next col */
323: for (i1=1; i1<nslim_col+1; i1++) {
324: col = ia[i1-1];
325: ia[i1] += col;
326: work[i1-1] = col - oshift;
327: }
329: /* allocate space for column pointers */
330: nz = ia[nslim_col] + (!ishift);
331: PetscMalloc(nz*sizeof(int),&ja);
332: *jja = ja;
334: /* loop over matrix putting into ja */
335: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
336: j = aj + ai[row] + ishift;
337: i2 = 0; /* Col inode index */
338: col = *j++ + ishift;
339: i2 = tvc[col];
340: nz = ai[row+1] - ai[row];
341: while (nz-- > 0) {
342: /* ja[work[i1]++] = i2 + oshift; */
343: ja[work[i2]++] = i1 + oshift;
344: i2++;
345: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
346: i2 = tvc[col];
347: }
348: }
349: PetscFree(ns_col);
350: PetscFree(work);
351: PetscFree(tns);
352: PetscFree(tvc);
353: return(0);
354: }
358: static int MatGetColumnIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int *ia[],int *ja[],PetscTruth *done)
359: {
363: Mat_AIJ_CreateColInode(A,n,PETSC_NULL);
364: if (!ia) return(0);
366: if (symmetric) {
367: /* Since the indices are symmetric it does'nt matter */
368: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
369: } else {
370: MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
371: }
372: return(0);
373: }
377: static int MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int *ia[],int *ja[],PetscTruth *done)
378: {
382: if (!ia) return(0);
383: PetscFree(*ia);
384: PetscFree(*ja);
385: return(0);
386: }
388: /* ----------------------------------------------------------- */
392: static int MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
393: {
394: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
395: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
396: PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y;
397: int ierr,*idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
398:
399: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
400: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
401: #endif
404: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
405: node_max = a->inode.node_count;
406: ns = a->inode.size; /* Node Size array */
407: VecGetArray(xx,&x);
408: VecGetArray(yy,&y);
409: idx = a->j;
410: v1 = a->a;
411: ii = a->i;
413: for (i = 0,row = 0; i< node_max; ++i){
414: nsz = ns[i];
415: n = ii[1] - ii[0];
416: ii += nsz;
417: sz = n; /* No of non zeros in this row */
418: /* Switch on the size of Node */
419: switch (nsz){ /* Each loop in 'case' is unrolled */
420: case 1 :
421: sum1 = 0;
422:
423: for(n = 0; n< sz-1; n+=2) {
424: i1 = idx[0]; /* The instructions are ordered to */
425: i2 = idx[1]; /* make the compiler's job easy */
426: idx += 2;
427: tmp0 = x[i1];
428: tmp1 = x[i2];
429: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
430: }
431:
432: if (n == sz-1){ /* Take care of the last nonzero */
433: tmp0 = x[*idx++];
434: sum1 += *v1++ * tmp0;
435: }
436: y[row++]=sum1;
437: break;
438: case 2:
439: sum1 = 0;
440: sum2 = 0;
441: v2 = v1 + n;
442:
443: for (n = 0; n< sz-1; n+=2) {
444: i1 = idx[0];
445: i2 = idx[1];
446: idx += 2;
447: tmp0 = x[i1];
448: tmp1 = x[i2];
449: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
450: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
451: }
452: if (n == sz-1){
453: tmp0 = x[*idx++];
454: sum1 += *v1++ * tmp0;
455: sum2 += *v2++ * tmp0;
456: }
457: y[row++]=sum1;
458: y[row++]=sum2;
459: v1 =v2; /* Since the next block to be processed starts there*/
460: idx +=sz;
461: break;
462: case 3:
463: sum1 = 0;
464: sum2 = 0;
465: sum3 = 0;
466: v2 = v1 + n;
467: v3 = v2 + n;
468:
469: for (n = 0; n< sz-1; n+=2) {
470: i1 = idx[0];
471: i2 = idx[1];
472: idx += 2;
473: tmp0 = x[i1];
474: tmp1 = x[i2];
475: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
476: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
477: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
478: }
479: if (n == sz-1){
480: tmp0 = x[*idx++];
481: sum1 += *v1++ * tmp0;
482: sum2 += *v2++ * tmp0;
483: sum3 += *v3++ * tmp0;
484: }
485: y[row++]=sum1;
486: y[row++]=sum2;
487: y[row++]=sum3;
488: v1 =v3; /* Since the next block to be processed starts there*/
489: idx +=2*sz;
490: break;
491: case 4:
492: sum1 = 0;
493: sum2 = 0;
494: sum3 = 0;
495: sum4 = 0;
496: v2 = v1 + n;
497: v3 = v2 + n;
498: v4 = v3 + n;
499:
500: for (n = 0; n< sz-1; n+=2) {
501: i1 = idx[0];
502: i2 = idx[1];
503: idx += 2;
504: tmp0 = x[i1];
505: tmp1 = x[i2];
506: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
507: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
508: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
509: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
510: }
511: if (n == sz-1){
512: tmp0 = x[*idx++];
513: sum1 += *v1++ * tmp0;
514: sum2 += *v2++ * tmp0;
515: sum3 += *v3++ * tmp0;
516: sum4 += *v4++ * tmp0;
517: }
518: y[row++]=sum1;
519: y[row++]=sum2;
520: y[row++]=sum3;
521: y[row++]=sum4;
522: v1 =v4; /* Since the next block to be processed starts there*/
523: idx +=3*sz;
524: break;
525: case 5:
526: sum1 = 0;
527: sum2 = 0;
528: sum3 = 0;
529: sum4 = 0;
530: sum5 = 0;
531: v2 = v1 + n;
532: v3 = v2 + n;
533: v4 = v3 + n;
534: v5 = v4 + n;
535:
536: for (n = 0; n<sz-1; n+=2) {
537: i1 = idx[0];
538: i2 = idx[1];
539: idx += 2;
540: tmp0 = x[i1];
541: tmp1 = x[i2];
542: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
543: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
544: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
545: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
546: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
547: }
548: if (n == sz-1){
549: tmp0 = x[*idx++];
550: sum1 += *v1++ * tmp0;
551: sum2 += *v2++ * tmp0;
552: sum3 += *v3++ * tmp0;
553: sum4 += *v4++ * tmp0;
554: sum5 += *v5++ * tmp0;
555: }
556: y[row++]=sum1;
557: y[row++]=sum2;
558: y[row++]=sum3;
559: y[row++]=sum4;
560: y[row++]=sum5;
561: v1 =v5; /* Since the next block to be processed starts there */
562: idx +=4*sz;
563: break;
564: default :
565: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
566: }
567: }
568: VecRestoreArray(xx,&x);
569: VecRestoreArray(yy,&y);
570: PetscLogFlops(2*a->nz - A->m);
571: return(0);
572: }
573: /* ----------------------------------------------------------- */
574: /* Almost same code as the MatMult_SeqAij_Inode() */
577: static int MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
578: {
579: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
580: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
581: PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
582: int ierr,*idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
583:
585: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
586: node_max = a->inode.node_count;
587: ns = a->inode.size; /* Node Size array */
588: VecGetArray(xx,&x);
589: VecGetArray(yy,&y);
590: if (zz != yy) {
591: VecGetArray(zz,&z);
592: } else {
593: z = y;
594: }
595: zt = z;
597: idx = a->j;
598: v1 = a->a;
599: ii = a->i;
601: for (i = 0,row = 0; i< node_max; ++i){
602: nsz = ns[i];
603: n = ii[1] - ii[0];
604: ii += nsz;
605: sz = n; /* No of non zeros in this row */
606: /* Switch on the size of Node */
607: switch (nsz){ /* Each loop in 'case' is unrolled */
608: case 1 :
609: sum1 = *zt++;
610:
611: for(n = 0; n< sz-1; n+=2) {
612: i1 = idx[0]; /* The instructions are ordered to */
613: i2 = idx[1]; /* make the compiler's job easy */
614: idx += 2;
615: tmp0 = x[i1];
616: tmp1 = x[i2];
617: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
618: }
619:
620: if(n == sz-1){ /* Take care of the last nonzero */
621: tmp0 = x[*idx++];
622: sum1 += *v1++ * tmp0;
623: }
624: y[row++]=sum1;
625: break;
626: case 2:
627: sum1 = *zt++;
628: sum2 = *zt++;
629: v2 = v1 + n;
630:
631: for(n = 0; n< sz-1; n+=2) {
632: i1 = idx[0];
633: i2 = idx[1];
634: idx += 2;
635: tmp0 = x[i1];
636: tmp1 = x[i2];
637: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
638: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
639: }
640: if(n == sz-1){
641: tmp0 = x[*idx++];
642: sum1 += *v1++ * tmp0;
643: sum2 += *v2++ * tmp0;
644: }
645: y[row++]=sum1;
646: y[row++]=sum2;
647: v1 =v2; /* Since the next block to be processed starts there*/
648: idx +=sz;
649: break;
650: case 3:
651: sum1 = *zt++;
652: sum2 = *zt++;
653: sum3 = *zt++;
654: v2 = v1 + n;
655: v3 = v2 + n;
656:
657: for (n = 0; n< sz-1; n+=2) {
658: i1 = idx[0];
659: i2 = idx[1];
660: idx += 2;
661: tmp0 = x[i1];
662: tmp1 = x[i2];
663: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
664: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
665: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
666: }
667: if (n == sz-1){
668: tmp0 = x[*idx++];
669: sum1 += *v1++ * tmp0;
670: sum2 += *v2++ * tmp0;
671: sum3 += *v3++ * tmp0;
672: }
673: y[row++]=sum1;
674: y[row++]=sum2;
675: y[row++]=sum3;
676: v1 =v3; /* Since the next block to be processed starts there*/
677: idx +=2*sz;
678: break;
679: case 4:
680: sum1 = *zt++;
681: sum2 = *zt++;
682: sum3 = *zt++;
683: sum4 = *zt++;
684: v2 = v1 + n;
685: v3 = v2 + n;
686: v4 = v3 + n;
687:
688: for (n = 0; n< sz-1; n+=2) {
689: i1 = idx[0];
690: i2 = idx[1];
691: idx += 2;
692: tmp0 = x[i1];
693: tmp1 = x[i2];
694: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
695: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
696: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
697: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
698: }
699: if (n == sz-1){
700: tmp0 = x[*idx++];
701: sum1 += *v1++ * tmp0;
702: sum2 += *v2++ * tmp0;
703: sum3 += *v3++ * tmp0;
704: sum4 += *v4++ * tmp0;
705: }
706: y[row++]=sum1;
707: y[row++]=sum2;
708: y[row++]=sum3;
709: y[row++]=sum4;
710: v1 =v4; /* Since the next block to be processed starts there*/
711: idx +=3*sz;
712: break;
713: case 5:
714: sum1 = *zt++;
715: sum2 = *zt++;
716: sum3 = *zt++;
717: sum4 = *zt++;
718: sum5 = *zt++;
719: v2 = v1 + n;
720: v3 = v2 + n;
721: v4 = v3 + n;
722: v5 = v4 + n;
723:
724: for (n = 0; n<sz-1; n+=2) {
725: i1 = idx[0];
726: i2 = idx[1];
727: idx += 2;
728: tmp0 = x[i1];
729: tmp1 = x[i2];
730: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
731: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
732: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
733: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
734: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
735: }
736: if(n == sz-1){
737: tmp0 = x[*idx++];
738: sum1 += *v1++ * tmp0;
739: sum2 += *v2++ * tmp0;
740: sum3 += *v3++ * tmp0;
741: sum4 += *v4++ * tmp0;
742: sum5 += *v5++ * tmp0;
743: }
744: y[row++]=sum1;
745: y[row++]=sum2;
746: y[row++]=sum3;
747: y[row++]=sum4;
748: y[row++]=sum5;
749: v1 =v5; /* Since the next block to be processed starts there */
750: idx +=4*sz;
751: break;
752: default :
753: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
754: }
755: }
756: VecRestoreArray(xx,&x);
757: VecRestoreArray(yy,&y);
758: if (zz != yy) {
759: VecRestoreArray(zz,&z);
760: }
761: PetscLogFlops(2*a->nz);
762: return(0);
763: }
764: /* ----------------------------------------------------------- */
765: EXTERN int MatColoringPatch_SeqAIJ_Inode(Mat,int,int,const ISColoringValue[],ISColoring *);
767: /*
768: samestructure indicates that the matrix has not changed its nonzero structure so we
769: do not need to recompute the inodes
770: */
773: int Mat_AIJ_CheckInode(Mat A,PetscTruth samestructure)
774: {
775: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
776: int ierr,i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
777: PetscTruth flag,flg;
780: if (samestructure && a->inode.checked) return(0);
782: a->inode.checked = PETSC_TRUE;
784: /* Notes: We set a->inode.limit=5 in MatCreateSeqAIJ(). */
785: if (!a->inode.use) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to MatSetOption(MAT_DO_NOT_USE_INODES\n"); return(0);}
786: PetscOptionsHasName(A->prefix,"-mat_aij_no_inode",&flg);
787: if (flg) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to -mat_aij_no_inode\n");return(0);}
788: PetscOptionsHasName(A->prefix,"-mat_no_unroll",&flg);
789: if (flg) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to -mat_no_unroll\n");return(0);}
790: PetscOptionsGetInt(A->prefix,"-mat_aij_inode_limit",&a->inode.limit,PETSC_NULL);
791: if (a->inode.limit > a->inode.max_limit) a->inode.limit = a->inode.max_limit;
792: m = A->m;
793: if (a->inode.size) {ns = a->inode.size;}
794: else {PetscMalloc((m+1)*sizeof(int),&ns);}
796: i = 0;
797: node_count = 0;
798: idx = a->j;
799: ii = a->i;
800: while (i < m){ /* For each row */
801: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
802: /* Limits the number of elements in a node to 'a->inode.limit' */
803: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
804: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
805: if (nzy != nzx) break;
806: idy += nzx; /* Same nonzero pattern */
807: PetscMemcmp(idx,idy,nzx*sizeof(int),&flag);
808: if (flag == PETSC_FALSE) break;
809: }
810: ns[node_count++] = blk_size;
811: idx += blk_size*nzx;
812: i = j;
813: }
814: /* If not enough inodes found,, do not use inode version of the routines */
815: if (!a->inode.size && m && node_count > 0.9*m) {
816: PetscFree(ns);
817: A->ops->mult = MatMult_SeqAIJ;
818: A->ops->multadd = MatMultAdd_SeqAIJ;
819: A->ops->solve = MatSolve_SeqAIJ;
820: A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ;
821: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
822: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
823: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
824: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
825: A->ops->coloringpatch = 0;
826: a->inode.node_count = 0;
827: a->inode.size = 0;
828: PetscLogInfo(A,"Mat_AIJ_CheckInode: Found %d nodes out of %d rows. Not using Inode routines\n",node_count,m);
829: } else {
830: A->ops->mult = MatMult_SeqAIJ_Inode;
831: A->ops->multadd = MatMultAdd_SeqAIJ_Inode;
832: A->ops->solve = MatSolve_SeqAIJ_Inode;
833: A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ_Inode;
834: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
835: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
836: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
837: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
838: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
839: a->inode.node_count = node_count;
840: a->inode.size = ns;
841: PetscLogInfo(A,"Mat_AIJ_CheckInode: Found %d nodes of %d. Limit used: %d. Using Inode routines\n",node_count,m,a->inode.limit);
842: }
843: return(0);
844: }
846: /* ----------------------------------------------------------- */
849: int MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
850: {
851: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
852: IS iscol = a->col,isrow = a->row;
853: int *r,*c,ierr,i,j,n = A->m,*ai = a->i,nz,*a_j = a->j;
854: int node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
855: PetscScalar *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1;
856: PetscScalar sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5;
859: if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
860: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
861: node_max = a->inode.node_count;
862: ns = a->inode.size; /* Node Size array */
864: VecGetArray(bb,&b);
865: VecGetArray(xx,&x);
866: tmp = a->solve_work;
867:
868: ISGetIndices(isrow,&rout); r = rout;
869: ISGetIndices(iscol,&cout); c = cout + (n-1);
870:
871: /* forward solve the lower triangular */
872: tmps = tmp ;
873: aa = a_a ;
874: aj = a_j ;
875: ad = a->diag;
877: for (i = 0,row = 0; i< node_max; ++i){
878: nsz = ns[i];
879: aii = ai[row];
880: v1 = aa + aii;
881: vi = aj + aii;
882: nz = ad[row]- aii;
883:
884: switch (nsz){ /* Each loop in 'case' is unrolled */
885: case 1 :
886: sum1 = b[*r++];
887: /* while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
888: for(j=0; j<nz-1; j+=2){
889: i0 = vi[0];
890: i1 = vi[1];
891: vi +=2;
892: tmp0 = tmps[i0];
893: tmp1 = tmps[i1];
894: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
895: }
896: if(j == nz-1){
897: tmp0 = tmps[*vi++];
898: sum1 -= *v1++ *tmp0;
899: }
900: tmp[row ++]=sum1;
901: break;
902: case 2:
903: sum1 = b[*r++];
904: sum2 = b[*r++];
905: v2 = aa + ai[row+1];
907: for(j=0; j<nz-1; j+=2){
908: i0 = vi[0];
909: i1 = vi[1];
910: vi +=2;
911: tmp0 = tmps[i0];
912: tmp1 = tmps[i1];
913: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
914: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
915: }
916: if(j == nz-1){
917: tmp0 = tmps[*vi++];
918: sum1 -= *v1++ *tmp0;
919: sum2 -= *v2++ *tmp0;
920: }
921: sum2 -= *v2++ * sum1;
922: tmp[row ++]=sum1;
923: tmp[row ++]=sum2;
924: break;
925: case 3:
926: sum1 = b[*r++];
927: sum2 = b[*r++];
928: sum3 = b[*r++];
929: v2 = aa + ai[row+1];
930: v3 = aa + ai[row+2];
931:
932: for (j=0; j<nz-1; j+=2){
933: i0 = vi[0];
934: i1 = vi[1];
935: vi +=2;
936: tmp0 = tmps[i0];
937: tmp1 = tmps[i1];
938: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
939: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
940: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
941: }
942: if (j == nz-1){
943: tmp0 = tmps[*vi++];
944: sum1 -= *v1++ *tmp0;
945: sum2 -= *v2++ *tmp0;
946: sum3 -= *v3++ *tmp0;
947: }
948: sum2 -= *v2++ * sum1;
949: sum3 -= *v3++ * sum1;
950: sum3 -= *v3++ * sum2;
951: tmp[row ++]=sum1;
952: tmp[row ++]=sum2;
953: tmp[row ++]=sum3;
954: break;
955:
956: case 4:
957: sum1 = b[*r++];
958: sum2 = b[*r++];
959: sum3 = b[*r++];
960: sum4 = b[*r++];
961: v2 = aa + ai[row+1];
962: v3 = aa + ai[row+2];
963: v4 = aa + ai[row+3];
964:
965: for (j=0; j<nz-1; j+=2){
966: i0 = vi[0];
967: i1 = vi[1];
968: vi +=2;
969: tmp0 = tmps[i0];
970: tmp1 = tmps[i1];
971: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
972: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
973: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
974: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
975: }
976: if (j == nz-1){
977: tmp0 = tmps[*vi++];
978: sum1 -= *v1++ *tmp0;
979: sum2 -= *v2++ *tmp0;
980: sum3 -= *v3++ *tmp0;
981: sum4 -= *v4++ *tmp0;
982: }
983: sum2 -= *v2++ * sum1;
984: sum3 -= *v3++ * sum1;
985: sum4 -= *v4++ * sum1;
986: sum3 -= *v3++ * sum2;
987: sum4 -= *v4++ * sum2;
988: sum4 -= *v4++ * sum3;
989:
990: tmp[row ++]=sum1;
991: tmp[row ++]=sum2;
992: tmp[row ++]=sum3;
993: tmp[row ++]=sum4;
994: break;
995: case 5:
996: sum1 = b[*r++];
997: sum2 = b[*r++];
998: sum3 = b[*r++];
999: sum4 = b[*r++];
1000: sum5 = b[*r++];
1001: v2 = aa + ai[row+1];
1002: v3 = aa + ai[row+2];
1003: v4 = aa + ai[row+3];
1004: v5 = aa + ai[row+4];
1005:
1006: for (j=0; j<nz-1; j+=2){
1007: i0 = vi[0];
1008: i1 = vi[1];
1009: vi +=2;
1010: tmp0 = tmps[i0];
1011: tmp1 = tmps[i1];
1012: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1013: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1014: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1015: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1016: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1017: }
1018: if (j == nz-1){
1019: tmp0 = tmps[*vi++];
1020: sum1 -= *v1++ *tmp0;
1021: sum2 -= *v2++ *tmp0;
1022: sum3 -= *v3++ *tmp0;
1023: sum4 -= *v4++ *tmp0;
1024: sum5 -= *v5++ *tmp0;
1025: }
1027: sum2 -= *v2++ * sum1;
1028: sum3 -= *v3++ * sum1;
1029: sum4 -= *v4++ * sum1;
1030: sum5 -= *v5++ * sum1;
1031: sum3 -= *v3++ * sum2;
1032: sum4 -= *v4++ * sum2;
1033: sum5 -= *v5++ * sum2;
1034: sum4 -= *v4++ * sum3;
1035: sum5 -= *v5++ * sum3;
1036: sum5 -= *v5++ * sum4;
1037:
1038: tmp[row ++]=sum1;
1039: tmp[row ++]=sum2;
1040: tmp[row ++]=sum3;
1041: tmp[row ++]=sum4;
1042: tmp[row ++]=sum5;
1043: break;
1044: default:
1045: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1046: }
1047: }
1048: /* backward solve the upper triangular */
1049: for (i=node_max -1 ,row = n-1 ; i>=0; i--){
1050: nsz = ns[i];
1051: aii = ai[row+1] -1;
1052: v1 = aa + aii;
1053: vi = aj + aii;
1054: nz = aii- ad[row];
1055: switch (nsz){ /* Each loop in 'case' is unrolled */
1056: case 1 :
1057: sum1 = tmp[row];
1059: for(j=nz ; j>1; j-=2){
1060: i0 = vi[0];
1061: i1 = vi[-1];
1062: vi -=2;
1063: tmp0 = tmps[i0];
1064: tmp1 = tmps[i1];
1065: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1066: }
1067: if (j==1){
1068: tmp0 = tmps[*vi--];
1069: sum1 -= *v1-- * tmp0;
1070: }
1071: x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1072: break;
1073: case 2 :
1074: sum1 = tmp[row];
1075: sum2 = tmp[row -1];
1076: v2 = aa + ai[row]-1;
1077: for (j=nz ; j>1; j-=2){
1078: i0 = vi[0];
1079: i1 = vi[-1];
1080: vi -=2;
1081: tmp0 = tmps[i0];
1082: tmp1 = tmps[i1];
1083: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1084: sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1085: }
1086: if (j==1){
1087: tmp0 = tmps[*vi--];
1088: sum1 -= *v1-- * tmp0;
1089: sum2 -= *v2-- * tmp0;
1090: }
1091:
1092: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1093: sum2 -= *v2-- * tmp0;
1094: x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1095: break;
1096: case 3 :
1097: sum1 = tmp[row];
1098: sum2 = tmp[row -1];
1099: sum3 = tmp[row -2];
1100: v2 = aa + ai[row]-1;
1101: v3 = aa + ai[row -1]-1;
1102: for (j=nz ; j>1; j-=2){
1103: i0 = vi[0];
1104: i1 = vi[-1];
1105: vi -=2;
1106: tmp0 = tmps[i0];
1107: tmp1 = tmps[i1];
1108: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1109: sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1110: sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1111: }
1112: if (j==1){
1113: tmp0 = tmps[*vi--];
1114: sum1 -= *v1-- * tmp0;
1115: sum2 -= *v2-- * tmp0;
1116: sum3 -= *v3-- * tmp0;
1117: }
1118: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1119: sum2 -= *v2-- * tmp0;
1120: sum3 -= *v3-- * tmp0;
1121: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1122: sum3 -= *v3-- * tmp0;
1123: x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1124:
1125: break;
1126: case 4 :
1127: sum1 = tmp[row];
1128: sum2 = tmp[row -1];
1129: sum3 = tmp[row -2];
1130: sum4 = tmp[row -3];
1131: v2 = aa + ai[row]-1;
1132: v3 = aa + ai[row -1]-1;
1133: v4 = aa + ai[row -2]-1;
1135: for (j=nz ; j>1; j-=2){
1136: i0 = vi[0];
1137: i1 = vi[-1];
1138: vi -=2;
1139: tmp0 = tmps[i0];
1140: tmp1 = tmps[i1];
1141: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1142: sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1143: sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1144: sum4 -= v4[0] * tmp0 + v4[-1] * tmp1; v4 -= 2;
1145: }
1146: if (j==1){
1147: tmp0 = tmps[*vi--];
1148: sum1 -= *v1-- * tmp0;
1149: sum2 -= *v2-- * tmp0;
1150: sum3 -= *v3-- * tmp0;
1151: sum4 -= *v4-- * tmp0;
1152: }
1154: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1155: sum2 -= *v2-- * tmp0;
1156: sum3 -= *v3-- * tmp0;
1157: sum4 -= *v4-- * tmp0;
1158: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1159: sum3 -= *v3-- * tmp0;
1160: sum4 -= *v4-- * tmp0;
1161: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1162: sum4 -= *v4-- * tmp0;
1163: x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1164: break;
1165: case 5 :
1166: sum1 = tmp[row];
1167: sum2 = tmp[row -1];
1168: sum3 = tmp[row -2];
1169: sum4 = tmp[row -3];
1170: sum5 = tmp[row -4];
1171: v2 = aa + ai[row]-1;
1172: v3 = aa + ai[row -1]-1;
1173: v4 = aa + ai[row -2]-1;
1174: v5 = aa + ai[row -3]-1;
1175: for (j=nz ; j>1; j-=2){
1176: i0 = vi[0];
1177: i1 = vi[-1];
1178: vi -=2;
1179: tmp0 = tmps[i0];
1180: tmp1 = tmps[i1];
1181: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1182: sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1183: sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1184: sum4 -= v4[0] * tmp0 + v4[-1] * tmp1; v4 -= 2;
1185: sum5 -= v5[0] * tmp0 + v5[-1] * tmp1; v5 -= 2;
1186: }
1187: if (j==1){
1188: tmp0 = tmps[*vi--];
1189: sum1 -= *v1-- * tmp0;
1190: sum2 -= *v2-- * tmp0;
1191: sum3 -= *v3-- * tmp0;
1192: sum4 -= *v4-- * tmp0;
1193: sum5 -= *v5-- * tmp0;
1194: }
1196: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1197: sum2 -= *v2-- * tmp0;
1198: sum3 -= *v3-- * tmp0;
1199: sum4 -= *v4-- * tmp0;
1200: sum5 -= *v5-- * tmp0;
1201: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1202: sum3 -= *v3-- * tmp0;
1203: sum4 -= *v4-- * tmp0;
1204: sum5 -= *v5-- * tmp0;
1205: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1206: sum4 -= *v4-- * tmp0;
1207: sum5 -= *v5-- * tmp0;
1208: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1209: sum5 -= *v5-- * tmp0;
1210: x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1211: break;
1212: default:
1213: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1214: }
1215: }
1216: ISRestoreIndices(isrow,&rout);
1217: ISRestoreIndices(iscol,&cout);
1218: VecRestoreArray(bb,&b);
1219: VecRestoreArray(xx,&x);
1220: PetscLogFlops(2*a->nz - A->n);
1221: return(0);
1222: }
1227: int MatLUFactorNumeric_SeqAIJ_Inode(Mat A,Mat *B)
1228: {
1229: Mat C = *B;
1230: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ *)C->data;
1231: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1232: int *r,*ic,*c,ierr,n = A->m,*bi = b->i;
1233: int *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,row,prow;
1234: int *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nsz;
1235: int *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj,ndamp = 0;
1236: PetscScalar *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3;
1237: PetscScalar tmp,*ba = b->a,*aa = a->a,*pv,*rtmps1,*rtmps2,*rtmps3;
1238: PetscReal damping = b->lu_damping,zeropivot = b->lu_zeropivot;
1239: PetscTruth damp;
1242: ISGetIndices(isrow,&r);
1243: ISGetIndices(iscol,&c);
1244: ISGetIndices(isicol,&ic);
1245: PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);
1246: PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));
1247: ics = ic ; rtmps1 = rtmp1 ;
1248: rtmp2 = rtmp1 + n; rtmps2 = rtmp2 ;
1249: rtmp3 = rtmp2 + n; rtmps3 = rtmp3 ;
1250:
1251: node_max = a->inode.node_count;
1252: ns = a->inode.size ;
1253: if (!ns){
1254: SETERRQ(1,"Matrix without inode information");
1255: }
1257: /* If max inode size > 3, split it into two inodes.*/
1258: /* also map the inode sizes according to the ordering */
1259: PetscMalloc((n+1)* sizeof(int),&tmp_vec1);
1260: for (i=0,j=0; i<node_max; ++i,++j){
1261: if (ns[i]>3) {
1262: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1263: ++j;
1264: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1265: } else {
1266: tmp_vec1[j] = ns[i];
1267: }
1268: }
1269: /* Use the correct node_max */
1270: node_max = j;
1272: /* Now reorder the inode info based on mat re-ordering info */
1273: /* First create a row -> inode_size_array_index map */
1274: PetscMalloc(n*sizeof(int)+1,&nsmap);
1275: PetscMalloc(node_max*sizeof(int)+1,&tmp_vec2);
1276: for (i=0,row=0; i<node_max; i++) {
1277: nsz = tmp_vec1[i];
1278: for (j=0; j<nsz; j++,row++) {
1279: nsmap[row] = i;
1280: }
1281: }
1282: /* Using nsmap, create a reordered ns structure */
1283: for (i=0,j=0; i< node_max; i++) {
1284: nsz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1285: tmp_vec2[i] = nsz;
1286: j += nsz;
1287: }
1288: PetscFree(nsmap);
1289: PetscFree(tmp_vec1);
1290: /* Now use the correct ns */
1291: ns = tmp_vec2;
1294: do {
1295: damp = PETSC_FALSE;
1296: /* Now loop over each block-row, and do the factorization */
1297: for (i=0,row=0; i<node_max; i++) {
1298: nsz = ns[i];
1299: nz = bi[row+1] - bi[row];
1300: bjtmp = bj + bi[row];
1301:
1302: switch (nsz){
1303: case 1:
1304: for (j=0; j<nz; j++){
1305: idx = bjtmp[j];
1306: rtmps1[idx] = 0.0;
1307: }
1308:
1309: /* load in initial (unfactored row) */
1310: idx = r[row];
1311: nz = ai[idx+1] - ai[idx];
1312: ajtmp = aj + ai[idx];
1313: v1 = aa + ai[idx];
1315: for (j=0; j<nz; j++) {
1316: idx = ics[ajtmp[j]];
1317: rtmp1[idx] = v1[j];
1318: if (ndamp && ajtmp[j] == r[row]) {
1319: rtmp1[idx] += damping;
1320: }
1321: }
1322: prow = *bjtmp++ ;
1323: while (prow < row) {
1324: pc1 = rtmp1 + prow;
1325: if (*pc1 != 0.0){
1326: pv = ba + bd[prow];
1327: pj = nbj + bd[prow];
1328: mul1 = *pc1 * *pv++;
1329: *pc1 = mul1;
1330: nz = bi[prow+1] - bd[prow] - 1;
1331: PetscLogFlops(2*nz);
1332: for (j=0; j<nz; j++) {
1333: tmp = pv[j];
1334: idx = pj[j];
1335: rtmps1[idx] -= mul1 * tmp;
1336: }
1337: }
1338: prow = *bjtmp++ ;
1339: }
1340: nz = bi[row+1] - bi[row];
1341: pj = bj + bi[row];
1342: pc1 = ba + bi[row];
1343: if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1344: if (damping) {
1345: if (ndamp) damping *= 2.0;
1346: damp = PETSC_TRUE;
1347: ndamp++;
1348: goto endofwhile;
1349: } else {
1350: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1351: }
1352: }
1353: rtmp1[row] = 1.0/rtmp1[row];
1354: for (j=0; j<nz; j++) {
1355: idx = pj[j];
1356: pc1[j] = rtmps1[idx];
1357: }
1358: break;
1359:
1360: case 2:
1361: for (j=0; j<nz; j++) {
1362: idx = bjtmp[j];
1363: rtmps1[idx] = 0.0;
1364: rtmps2[idx] = 0.0;
1365: }
1366:
1367: /* load in initial (unfactored row) */
1368: idx = r[row];
1369: nz = ai[idx+1] - ai[idx];
1370: ajtmp = aj + ai[idx];
1371: v1 = aa + ai[idx];
1372: v2 = aa + ai[idx+1];
1373:
1374: for (j=0; j<nz; j++) {
1375: idx = ics[ajtmp[j]];
1376: rtmp1[idx] = v1[j];
1377: rtmp2[idx] = v2[j];
1378: if (ndamp && ajtmp[j] == r[row]) {
1379: rtmp1[idx] += damping;
1380: }
1381: if (ndamp && ajtmp[j] == r[row+1]) {
1382: rtmp2[idx] += damping;
1383: }
1384: }
1385: prow = *bjtmp++ ;
1386: while (prow < row) {
1387: pc1 = rtmp1 + prow;
1388: pc2 = rtmp2 + prow;
1389: if (*pc1 != 0.0 || *pc2 != 0.0){
1390: pv = ba + bd[prow];
1391: pj = nbj + bd[prow];
1392: mul1 = *pc1 * *pv;
1393: mul2 = *pc2 * *pv;
1394: ++pv;
1395: *pc1 = mul1;
1396: *pc2 = mul2;
1397:
1398: nz = bi[prow+1] - bd[prow] - 1;
1399: PetscLogFlops(2*2*nz);
1400: for (j=0; j<nz; j++) {
1401: tmp = pv[j];
1402: idx = pj[j];
1403: rtmps1[idx] -= mul1 * tmp;
1404: rtmps2[idx] -= mul2 * tmp;
1405: }
1406: }
1407: prow = *bjtmp++ ;
1408: }
1409: /* Now take care of the odd element*/
1410: pc1 = rtmp1 + prow;
1411: pc2 = rtmp2 + prow;
1412: if (*pc2 != 0.0){
1413: pj = nbj + bd[prow];
1414: if (PetscAbsScalar(*pc1) < zeropivot) {
1415: if (damping) {
1416: if (ndamp) damping *= 2.0;
1417: damp = PETSC_TRUE;
1418: ndamp++;
1419: goto endofwhile;
1420: } else {
1421: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",prow,PetscAbsScalar(*pc1),zeropivot);
1422: }
1423: }
1424: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1425: *pc2 = mul2;
1426: nz = bi[prow+1] - bd[prow] - 1;
1427: PetscLogFlops(2*nz);
1428: for (j=0; j<nz; j++) {
1429: idx = pj[j] ;
1430: tmp = rtmp1[idx];
1431: rtmp2[idx] -= mul2 * tmp;
1432: }
1433: }
1434:
1435: nz = bi[row+1] - bi[row];
1436: pj = bj + bi[row];
1437: pc1 = ba + bi[row];
1438: pc2 = ba + bi[row+1];
1439: if (PetscAbsScalar(rtmp1[row]) < zeropivot || PetscAbsScalar(rtmp2[row+1]) < zeropivot) {
1440: if (damping) {
1441: if (ndamp) damping *= 2.0;
1442: damp = PETSC_TRUE;
1443: ndamp++;
1444: goto endofwhile;
1445: } else if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1446: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1447: } else {
1448: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row+1,PetscAbsScalar(rtmp2[row+1]),zeropivot);
1449: }
1450: }
1451: rtmp1[row] = 1.0/rtmp1[row];
1452: rtmp2[row+1] = 1.0/rtmp2[row+1];
1453: for (j=0; j<nz; j++) {
1454: idx = pj[j];
1455: pc1[j] = rtmps1[idx];
1456: pc2[j] = rtmps2[idx];
1457: }
1458: break;
1460: case 3:
1461: for (j=0; j<nz; j++) {
1462: idx = bjtmp[j];
1463: rtmps1[idx] = 0.0;
1464: rtmps2[idx] = 0.0;
1465: rtmps3[idx] = 0.0;
1466: }
1467: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1468: idx = r[row];
1469: nz = ai[idx+1] - ai[idx];
1470: ajtmp = aj + ai[idx];
1471: v1 = aa + ai[idx];
1472: v2 = aa + ai[idx+1];
1473: v3 = aa + ai[idx+2];
1474: for (j=0; j<nz; j++) {
1475: idx = ics[ajtmp[j]];
1476: rtmp1[idx] = v1[j];
1477: rtmp2[idx] = v2[j];
1478: rtmp3[idx] = v3[j];
1479: if (ndamp && ajtmp[j] == r[row]) {
1480: rtmp1[idx] += damping;
1481: }
1482: if (ndamp && ajtmp[j] == r[row+1]) {
1483: rtmp2[idx] += damping;
1484: }
1485: if (ndamp && ajtmp[j] == r[row+2]) {
1486: rtmp3[idx] += damping;
1487: }
1488: }
1489: /* loop over all pivot row blocks above this row block */
1490: prow = *bjtmp++ ;
1491: while (prow < row) {
1492: pc1 = rtmp1 + prow;
1493: pc2 = rtmp2 + prow;
1494: pc3 = rtmp3 + prow;
1495: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1496: pv = ba + bd[prow];
1497: pj = nbj + bd[prow];
1498: mul1 = *pc1 * *pv;
1499: mul2 = *pc2 * *pv;
1500: mul3 = *pc3 * *pv;
1501: ++pv;
1502: *pc1 = mul1;
1503: *pc2 = mul2;
1504: *pc3 = mul3;
1505:
1506: nz = bi[prow+1] - bd[prow] - 1;
1507: PetscLogFlops(3*2*nz);
1508: /* update this row based on pivot row */
1509: for (j=0; j<nz; j++) {
1510: tmp = pv[j];
1511: idx = pj[j];
1512: rtmps1[idx] -= mul1 * tmp;
1513: rtmps2[idx] -= mul2 * tmp;
1514: rtmps3[idx] -= mul3 * tmp;
1515: }
1516: }
1517: prow = *bjtmp++ ;
1518: }
1519: /* Now take care of diagonal block in this set of rows */
1520: pc1 = rtmp1 + prow;
1521: pc2 = rtmp2 + prow;
1522: pc3 = rtmp3 + prow;
1523: if (*pc2 != 0.0 || *pc3 != 0.0){
1524: pj = nbj + bd[prow];
1525: if (PetscAbsScalar(*pc1) < zeropivot) {
1526: if (damping) {
1527: if (ndamp) damping *= 2.0;
1528: damp = PETSC_TRUE;
1529: ndamp++;
1530: goto endofwhile;
1531: } else {
1532: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",prow,PetscAbsScalar(*pc1),zeropivot);
1533: }
1534: }
1535: mul2 = (*pc2)/(*pc1);
1536: mul3 = (*pc3)/(*pc1);
1537: *pc2 = mul2;
1538: *pc3 = mul3;
1539: nz = bi[prow+1] - bd[prow] - 1;
1540: PetscLogFlops(2*2*nz);
1541: for (j=0; j<nz; j++) {
1542: idx = pj[j] ;
1543: tmp = rtmp1[idx];
1544: rtmp2[idx] -= mul2 * tmp;
1545: rtmp3[idx] -= mul3 * tmp;
1546: }
1547: }
1548: ++prow;
1549: pc2 = rtmp2 + prow;
1550: pc3 = rtmp3 + prow;
1551: if (*pc3 != 0.0){
1552: pj = nbj + bd[prow];
1553: pj = nbj + bd[prow];
1554: if (PetscAbsScalar(*pc2) < zeropivot) {
1555: if (damping) {
1556: if (ndamp) damping *= 2.0;
1557: damp = PETSC_TRUE;
1558: ndamp++;
1559: goto endofwhile;
1560: } else {
1561: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",prow,PetscAbsScalar(*pc2),zeropivot);
1562: }
1563: }
1564: mul3 = (*pc3)/(*pc2);
1565: *pc3 = mul3;
1566: nz = bi[prow+1] - bd[prow] - 1;
1567: PetscLogFlops(2*2*nz);
1568: for (j=0; j<nz; j++) {
1569: idx = pj[j] ;
1570: tmp = rtmp2[idx];
1571: rtmp3[idx] -= mul3 * tmp;
1572: }
1573: }
1574: nz = bi[row+1] - bi[row];
1575: pj = bj + bi[row];
1576: pc1 = ba + bi[row];
1577: pc2 = ba + bi[row+1];
1578: pc3 = ba + bi[row+2];
1579: if (PetscAbsScalar(rtmp1[row]) < zeropivot || PetscAbsScalar(rtmp2[row+1]) < zeropivot || PetscAbsScalar(rtmp3[row+2]) < zeropivot) {
1580: if (damping) {
1581: if (ndamp) damping *= 2.0;
1582: damp = PETSC_TRUE;
1583: ndamp++;
1584: goto endofwhile;
1585: } else if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1586: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1587: } else if (PetscAbsScalar(rtmp2[row+1]) < zeropivot) {
1588: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row+1,PetscAbsScalar(rtmp2[row+1]),zeropivot);
1589: } else {
1590: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row+2,PetscAbsScalar(rtmp3[row+2]),zeropivot);
1591: }
1592: }
1593: rtmp1[row] = 1.0/rtmp1[row];
1594: rtmp2[row+1] = 1.0/rtmp2[row+1];
1595: rtmp3[row+2] = 1.0/rtmp3[row+2];
1596: /* copy row entries from dense representation to sparse */
1597: for (j=0; j<nz; j++) {
1598: idx = pj[j];
1599: pc1[j] = rtmps1[idx];
1600: pc2[j] = rtmps2[idx];
1601: pc3[j] = rtmps3[idx];
1602: }
1603: break;
1605: default:
1606: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1607: }
1608: row += nsz; /* Update the row */
1609: }
1610: endofwhile:;
1611: } while (damp);
1612: PetscFree(rtmp1);
1613: PetscFree(tmp_vec2);
1614: ISRestoreIndices(isicol,&ic);
1615: ISRestoreIndices(isrow,&r);
1616: ISRestoreIndices(iscol,&c);
1617: C->factor = FACTOR_LU;
1618: C->assembled = PETSC_TRUE;
1619: if (ndamp || b->lu_damping) {
1620: PetscLogInfo(0,"MatLUFactorNumerical_SeqAIJ_Inode: number of damping tries %d damping value %g\n",ndamp,damping);
1621: }
1622: PetscLogFlops(C->n);
1623: return(0);
1624: }
1626: /*
1627: This is really ugly. if inodes are used this replaces the
1628: permutations with ones that correspond to rows/cols of the matrix
1629: rather then inode blocks
1630: */
1633: int MatAdjustForInodes(Mat A,IS *rperm,IS *cperm)
1634: {
1635: int ierr,(*f)(Mat,IS*,IS*);
1638: PetscObjectQueryFunction((PetscObject)A,"MatAdjustForInodes_C",(void (**)(void))&f);
1639: if (f) {
1640: (*f)(A,rperm,cperm);
1641: }
1642: return(0);
1643: }
1645: EXTERN_C_BEGIN
1648: int MatAdjustForInodes_SeqAIJ(Mat A,IS *rperm,IS *cperm)
1649: {
1650: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1651: int ierr,m = A->m,n = A->n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
1652: int row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
1653: int nslim_col,*ns_col;
1654: IS ris = *rperm,cis = *cperm;
1657: if (!a->inode.size) return(0); /* no inodes so return */
1658: if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */
1660: Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
1661: PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(int),&tns);
1662: PetscMalloc((m+n+1)*sizeof(int),&permr);
1663: permc = permr + m;
1665: ISGetIndices(ris,&ridx);
1666: ISGetIndices(cis,&cidx);
1668: /* Form the inode structure for the rows of permuted matric using inv perm*/
1669: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
1671: /* Construct the permutations for rows*/
1672: for (i=0,row = 0; i<nslim_row; ++i){
1673: indx = ridx[i];
1674: start_val = tns[indx];
1675: end_val = tns[indx + 1];
1676: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
1677: }
1679: /* Form the inode structure for the columns of permuted matrix using inv perm*/
1680: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
1682: /* Construct permutations for columns */
1683: for (i=0,col=0; i<nslim_col; ++i){
1684: indx = cidx[i];
1685: start_val = tns[indx];
1686: end_val = tns[indx + 1];
1687: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
1688: }
1690: ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);
1691: ISSetPermutation(*rperm);
1692: ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);
1693: ISSetPermutation(*cperm);
1694:
1695: ISRestoreIndices(ris,&ridx);
1696: ISRestoreIndices(cis,&cidx);
1698: PetscFree(ns_col);
1699: PetscFree(permr);
1700: ISDestroy(cis);
1701: ISDestroy(ris);
1702: PetscFree(tns);
1703: return(0);
1704: }
1705: EXTERN_C_END
1709: /*@C
1710: MatSeqAIJGetInodeSizes - Returns the inode information of the SeqAIJ matrix.
1712: Collective on Mat
1714: Input Parameter:
1715: . A - the SeqAIJ matrix.
1717: Output Parameter:
1718: + node_count - no of inodes present in the matrix.
1719: . sizes - an array of size node_count,with sizes of each inode.
1720: - limit - the max size used to generate the inodes.
1722: Level: advanced
1724: Notes: This routine returns some internal storage information
1725: of the matrix, it is intended to be used by advanced users.
1726: It should be called after the matrix is assembled.
1727: The contents of the sizes[] array should not be changed.
1729: .keywords: matrix, seqaij, get, inode
1731: .seealso: MatGetInfo()
1732: @*/
1733: int MatSeqAIJGetInodeSizes(Mat A,int *node_count,int *sizes[],int *limit)
1734: {
1735: int ierr,(*f)(Mat,int*,int*[],int*);
1738: PetscObjectQueryFunction((PetscObject)A,"MatSeqAIJGetInodeSizes_C",(void (**)(void))&f);
1739: if (f) {
1740: (*f)(A,node_count,sizes,limit);
1741: }
1742: return(0);
1743: }
1745: EXTERN_C_BEGIN
1748: int MatSeqAIJGetInodeSizes_SeqAIJ(Mat A,int *node_count,int *sizes[],int *limit)
1749: {
1750: Mat_SeqAIJ *a;
1753: a = (Mat_SeqAIJ*)A->data;
1754: *node_count = a->inode.node_count;
1755: *sizes = a->inode.size;
1756: *limit = a->inode.limit;
1757: return(0);
1758: }
1759: EXTERN_C_END