Actual source code: baij2.c
1: /*$Id: baij2.c,v 1.75 2001/09/07 20:09:49 bsmith Exp $*/
3: #include src/mat/impls/baij/seq/baij.h
4: #include src/inline/spops.h
5: #include src/inline/ilu.h
6: #include petscbt.h
10: int MatIncreaseOverlap_SeqBAIJ(Mat A,int is_max,IS is[],int ov)
11: {
12: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
13: int row,i,j,k,l,m,n,*idx,ierr,*nidx,isz,val,ival;
14: int start,end,*ai,*aj,bs,*nidx2;
15: PetscBT table;
18: m = a->mbs;
19: ai = a->i;
20: aj = a->j;
21: bs = a->bs;
23: if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
25: PetscBTCreate(m,table);
26: PetscMalloc((m+1)*sizeof(int),&nidx);
27: PetscMalloc((A->m+1)*sizeof(int),&nidx2);
29: for (i=0; i<is_max; i++) {
30: /* Initialise the two local arrays */
31: isz = 0;
32: PetscBTMemzero(m,table);
33:
34: /* Extract the indices, assume there can be duplicate entries */
35: ISGetIndices(is[i],&idx);
36: ISGetLocalSize(is[i],&n);
38: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
39: for (j=0; j<n ; ++j){
40: ival = idx[j]/bs; /* convert the indices into block indices */
41: if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42: if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
43: }
44: ISRestoreIndices(is[i],&idx);
45: ISDestroy(is[i]);
46:
47: k = 0;
48: for (j=0; j<ov; j++){ /* for each overlap*/
49: n = isz;
50: for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
51: row = nidx[k];
52: start = ai[row];
53: end = ai[row+1];
54: for (l = start; l<end ; l++){
55: val = aj[l];
56: if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
57: }
58: }
59: }
60: /* expand the Index Set */
61: for (j=0; j<isz; j++) {
62: for (k=0; k<bs; k++)
63: nidx2[j*bs+k] = nidx[j]*bs+k;
64: }
65: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
66: }
67: PetscBTDestroy(table);
68: PetscFree(nidx);
69: PetscFree(nidx2);
70: return(0);
71: }
75: int MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
76: {
77: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
78: int *smap,i,k,kstart,kend,ierr,oldcols = a->nbs,*lens;
79: int row,mat_i,*mat_j,tcol,*mat_ilen;
80: int *irow,*icol,nrows,ncols,*ssmap,bs=a->bs,bs2=a->bs2;
81: int *aj = a->j,*ai = a->i;
82: MatScalar *mat_a;
83: Mat C;
84: PetscTruth flag;
87: ISSorted(iscol,(PetscTruth*)&i);
88: if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
90: ISGetIndices(isrow,&irow);
91: ISGetIndices(iscol,&icol);
92: ISGetLocalSize(isrow,&nrows);
93: ISGetLocalSize(iscol,&ncols);
95: PetscMalloc((1+oldcols)*sizeof(int),&smap);
96: ssmap = smap;
97: PetscMalloc((1+nrows)*sizeof(int),&lens);
98: PetscMemzero(smap,oldcols*sizeof(int));
99: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
100: /* determine lens of each row */
101: for (i=0; i<nrows; i++) {
102: kstart = ai[irow[i]];
103: kend = kstart + a->ilen[irow[i]];
104: lens[i] = 0;
105: for (k=kstart; k<kend; k++) {
106: if (ssmap[aj[k]]) {
107: lens[i]++;
108: }
109: }
110: }
111: /* Create and fill new matrix */
112: if (scall == MAT_REUSE_MATRIX) {
113: c = (Mat_SeqBAIJ *)((*B)->data);
115: if (c->mbs!=nrows || c->nbs!=ncols || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
116: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);
117: if (flag == PETSC_FALSE) {
118: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
119: }
120: PetscMemzero(c->ilen,c->mbs*sizeof(int));
121: C = *B;
122: } else {
123: MatCreate(A->comm,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);
124: MatSetType(C,A->type_name);
125: MatSeqBAIJSetPreallocation(C,bs,0,lens);
126: }
127: c = (Mat_SeqBAIJ *)(C->data);
128: for (i=0; i<nrows; i++) {
129: row = irow[i];
130: kstart = ai[row];
131: kend = kstart + a->ilen[row];
132: mat_i = c->i[i];
133: mat_j = c->j + mat_i;
134: mat_a = c->a + mat_i*bs2;
135: mat_ilen = c->ilen + i;
136: for (k=kstart; k<kend; k++) {
137: if ((tcol=ssmap[a->j[k]])) {
138: *mat_j++ = tcol - 1;
139: PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
140: mat_a += bs2;
141: (*mat_ilen)++;
142: }
143: }
144: }
145:
146: /* Free work space */
147: ISRestoreIndices(iscol,&icol);
148: PetscFree(smap);
149: PetscFree(lens);
150: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
151: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
152:
153: ISRestoreIndices(isrow,&irow);
154: *B = C;
155: return(0);
156: }
160: int MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
161: {
162: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
163: IS is1,is2;
164: int *vary,*iary,*irow,*icol,nrows,ncols,i,ierr,bs=a->bs,count;
167: ISGetIndices(isrow,&irow);
168: ISGetIndices(iscol,&icol);
169: ISGetLocalSize(isrow,&nrows);
170: ISGetLocalSize(iscol,&ncols);
171:
172: /* Verify if the indices corespond to each element in a block
173: and form the IS with compressed IS */
174: PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);
175: iary = vary + a->mbs;
176: PetscMemzero(vary,(a->mbs)*sizeof(int));
177: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
178: count = 0;
179: for (i=0; i<a->mbs; i++) {
180: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
181: if (vary[i]==bs) iary[count++] = i;
182: }
183: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
184:
185: PetscMemzero(vary,(a->mbs)*sizeof(int));
186: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
187: count = 0;
188: for (i=0; i<a->mbs; i++) {
189: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Internal error in PETSc");
190: if (vary[i]==bs) iary[count++] = i;
191: }
192: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);
193: ISRestoreIndices(isrow,&irow);
194: ISRestoreIndices(iscol,&icol);
195: PetscFree(vary);
197: MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B);
198: ISDestroy(is1);
199: ISDestroy(is2);
200: return(0);
201: }
205: int MatGetSubMatrices_SeqBAIJ(Mat A,int n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
206: {
207: int ierr,i;
210: if (scall == MAT_INITIAL_MATRIX) {
211: PetscMalloc((n+1)*sizeof(Mat),B);
212: }
214: for (i=0; i<n; i++) {
215: MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
216: }
217: return(0);
218: }
221: /* -------------------------------------------------------*/
222: /* Should check that shapes of vectors and matrices match */
223: /* -------------------------------------------------------*/
224: #include petscblaslapack.h
228: int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
229: {
230: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
231: PetscScalar *x,*z,sum;
232: MatScalar *v;
233: int mbs=a->mbs,i,*idx,*ii,n,ierr;
236: VecGetArray(xx,&x);
237: VecGetArray(zz,&z);
239: idx = a->j;
240: v = a->a;
241: ii = a->i;
243: for (i=0; i<mbs; i++) {
244: n = ii[1] - ii[0]; ii++;
245: sum = 0.0;
246: while (n--) sum += *v++ * x[*idx++];
247: z[i] = sum;
248: }
249: VecRestoreArray(xx,&x);
250: VecRestoreArray(zz,&z);
251: PetscLogFlops(2*a->nz - A->m);
252: return(0);
253: }
257: int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
258: {
259: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
260: PetscScalar *x,*z,*xb,sum1,sum2;
261: PetscScalar x1,x2;
262: MatScalar *v;
263: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
266: VecGetArray(xx,&x);
267: VecGetArray(zz,&z);
269: idx = a->j;
270: v = a->a;
271: ii = a->i;
273: for (i=0; i<mbs; i++) {
274: n = ii[1] - ii[0]; ii++;
275: sum1 = 0.0; sum2 = 0.0;
276: for (j=0; j<n; j++) {
277: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
278: sum1 += v[0]*x1 + v[2]*x2;
279: sum2 += v[1]*x1 + v[3]*x2;
280: v += 4;
281: }
282: z[0] = sum1; z[1] = sum2;
283: z += 2;
284: }
285: VecRestoreArray(xx,&x);
286: VecRestoreArray(zz,&z);
287: PetscLogFlops(8*a->nz - A->m);
288: return(0);
289: }
293: int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
294: {
295: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
296: PetscScalar *x,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
297: MatScalar *v;
298: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
300: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
301: #pragma disjoint(*v,*z,*xb)
302: #endif
305: VecGetArray(xx,&x);
306: VecGetArray(zz,&z);
308: idx = a->j;
309: v = a->a;
310: ii = a->i;
312: for (i=0; i<mbs; i++) {
313: n = ii[1] - ii[0]; ii++;
314: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
315: for (j=0; j<n; j++) {
316: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
317: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
318: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
319: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
320: v += 9;
321: }
322: z[0] = sum1; z[1] = sum2; z[2] = sum3;
323: z += 3;
324: }
325: VecRestoreArray(xx,&x);
326: VecRestoreArray(zz,&z);
327: PetscLogFlops(18*a->nz - A->m);
328: return(0);
329: }
333: int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
334: {
335: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
336: PetscScalar *x,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
337: MatScalar *v;
338: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
341: VecGetArray(xx,&x);
342: VecGetArray(zz,&z);
344: idx = a->j;
345: v = a->a;
346: ii = a->i;
348: for (i=0; i<mbs; i++) {
349: n = ii[1] - ii[0]; ii++;
350: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
351: for (j=0; j<n; j++) {
352: xb = x + 4*(*idx++);
353: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
354: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
355: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
356: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
357: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
358: v += 16;
359: }
360: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
361: z += 4;
362: }
363: VecRestoreArray(xx,&x);
364: VecRestoreArray(zz,&z);
365: PetscLogFlops(32*a->nz - A->m);
366: return(0);
367: }
371: int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
372: {
373: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
374: PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*xb,*z,*x;
375: MatScalar *v;
376: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
379: VecGetArray(xx,&x);
380: VecGetArray(zz,&z);
382: idx = a->j;
383: v = a->a;
384: ii = a->i;
386: for (i=0; i<mbs; i++) {
387: n = ii[1] - ii[0]; ii++;
388: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
389: for (j=0; j<n; j++) {
390: xb = x + 5*(*idx++);
391: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
392: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
393: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
394: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
395: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
396: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
397: v += 25;
398: }
399: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
400: z += 5;
401: }
402: VecRestoreArray(xx,&x);
403: VecRestoreArray(zz,&z);
404: PetscLogFlops(50*a->nz - A->m);
405: return(0);
406: }
411: int MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
412: {
413: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
414: PetscScalar *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
415: PetscScalar x1,x2,x3,x4,x5,x6;
416: MatScalar *v;
417: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
420: VecGetArray(xx,&x);
421: VecGetArray(zz,&z);
423: idx = a->j;
424: v = a->a;
425: ii = a->i;
427: for (i=0; i<mbs; i++) {
428: n = ii[1] - ii[0]; ii++;
429: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
430: for (j=0; j<n; j++) {
431: xb = x + 6*(*idx++);
432: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
433: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
434: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
435: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
436: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
437: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
438: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
439: v += 36;
440: }
441: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
442: z += 6;
443: }
445: VecRestoreArray(xx,&x);
446: VecRestoreArray(zz,&z);
447: PetscLogFlops(72*a->nz - A->m);
448: return(0);
449: }
452: int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
453: {
454: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
455: PetscScalar *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
456: PetscScalar x1,x2,x3,x4,x5,x6,x7;
457: MatScalar *v;
458: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
461: VecGetArray(xx,&x);
462: VecGetArray(zz,&z);
464: idx = a->j;
465: v = a->a;
466: ii = a->i;
468: for (i=0; i<mbs; i++) {
469: n = ii[1] - ii[0]; ii++;
470: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
471: for (j=0; j<n; j++) {
472: xb = x + 7*(*idx++);
473: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
474: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
475: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
476: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
477: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
478: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
479: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
480: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
481: v += 49;
482: }
483: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
484: z += 7;
485: }
487: VecRestoreArray(xx,&x);
488: VecRestoreArray(zz,&z);
489: PetscLogFlops(98*a->nz - A->m);
490: return(0);
491: }
493: /*
494: This will not work with MatScalar == float because it calls the BLAS
495: */
498: int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
499: {
500: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
501: PetscScalar *x,*z,*xb,*work,*workt;
502: MatScalar *v;
503: int ierr,mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
504: int ncols,k;
507: VecGetArray(xx,&x);
508: VecGetArray(zz,&z);
510: idx = a->j;
511: v = a->a;
512: ii = a->i;
515: if (!a->mult_work) {
516: k = PetscMax(A->m,A->n);
517: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
518: }
519: work = a->mult_work;
520: for (i=0; i<mbs; i++) {
521: n = ii[1] - ii[0]; ii++;
522: ncols = n*bs;
523: workt = work;
524: for (j=0; j<n; j++) {
525: xb = x + bs*(*idx++);
526: for (k=0; k<bs; k++) workt[k] = xb[k];
527: workt += bs;
528: }
529: Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
530: /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
531: v += n*bs2;
532: z += bs;
533: }
534: VecRestoreArray(xx,&x);
535: VecRestoreArray(zz,&z);
536: PetscLogFlops(2*a->nz*bs2 - A->m);
537: return(0);
538: }
542: int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
543: {
544: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
545: PetscScalar *x,*y,*z,sum;
546: MatScalar *v;
547: int ierr,mbs=a->mbs,i,*idx,*ii,n;
550: VecGetArray(xx,&x);
551: VecGetArray(yy,&y);
552: if (zz != yy) {
553: VecGetArray(zz,&z);
554: } else {
555: z = y;
556: }
558: idx = a->j;
559: v = a->a;
560: ii = a->i;
562: for (i=0; i<mbs; i++) {
563: n = ii[1] - ii[0]; ii++;
564: sum = y[i];
565: while (n--) sum += *v++ * x[*idx++];
566: z[i] = sum;
567: }
568: VecRestoreArray(xx,&x);
569: VecRestoreArray(yy,&y);
570: if (zz != yy) {
571: VecRestoreArray(zz,&z);
572: }
573: PetscLogFlops(2*a->nz);
574: return(0);
575: }
579: int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
580: {
581: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
582: PetscScalar *x,*y,*z,*xb,sum1,sum2;
583: PetscScalar x1,x2;
584: MatScalar *v;
585: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
588: VecGetArray(xx,&x);
589: VecGetArray(yy,&y);
590: if (zz != yy) {
591: VecGetArray(zz,&z);
592: } else {
593: z = y;
594: }
596: idx = a->j;
597: v = a->a;
598: ii = a->i;
600: for (i=0; i<mbs; i++) {
601: n = ii[1] - ii[0]; ii++;
602: sum1 = y[0]; sum2 = y[1];
603: for (j=0; j<n; j++) {
604: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
605: sum1 += v[0]*x1 + v[2]*x2;
606: sum2 += v[1]*x1 + v[3]*x2;
607: v += 4;
608: }
609: z[0] = sum1; z[1] = sum2;
610: z += 2; y += 2;
611: }
612: VecRestoreArray(xx,&x);
613: VecRestoreArray(yy,&y);
614: if (zz != yy) {
615: VecRestoreArray(zz,&z);
616: }
617: PetscLogFlops(4*a->nz);
618: return(0);
619: }
623: int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
624: {
625: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
626: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
627: MatScalar *v;
628: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
631: VecGetArray(xx,&x);
632: VecGetArray(yy,&y);
633: if (zz != yy) {
634: VecGetArray(zz,&z);
635: } else {
636: z = y;
637: }
639: idx = a->j;
640: v = a->a;
641: ii = a->i;
643: for (i=0; i<mbs; i++) {
644: n = ii[1] - ii[0]; ii++;
645: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
646: for (j=0; j<n; j++) {
647: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
648: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
649: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
650: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
651: v += 9;
652: }
653: z[0] = sum1; z[1] = sum2; z[2] = sum3;
654: z += 3; y += 3;
655: }
656: VecRestoreArray(xx,&x);
657: VecRestoreArray(yy,&y);
658: if (zz != yy) {
659: VecRestoreArray(zz,&z);
660: }
661: PetscLogFlops(18*a->nz);
662: return(0);
663: }
667: int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
668: {
669: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
670: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
671: MatScalar *v;
672: int ierr,mbs=a->mbs,i,*idx,*ii;
673: int j,n;
676: VecGetArray(xx,&x);
677: VecGetArray(yy,&y);
678: if (zz != yy) {
679: VecGetArray(zz,&z);
680: } else {
681: z = y;
682: }
684: idx = a->j;
685: v = a->a;
686: ii = a->i;
688: for (i=0; i<mbs; i++) {
689: n = ii[1] - ii[0]; ii++;
690: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
691: for (j=0; j<n; j++) {
692: xb = x + 4*(*idx++);
693: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
694: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
695: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
696: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
697: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
698: v += 16;
699: }
700: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
701: z += 4; y += 4;
702: }
703: VecRestoreArray(xx,&x);
704: VecRestoreArray(yy,&y);
705: if (zz != yy) {
706: VecRestoreArray(zz,&z);
707: }
708: PetscLogFlops(32*a->nz);
709: return(0);
710: }
714: int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
715: {
716: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
717: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
718: MatScalar *v;
719: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
722: VecGetArray(xx,&x);
723: VecGetArray(yy,&y);
724: if (zz != yy) {
725: VecGetArray(zz,&z);
726: } else {
727: z = y;
728: }
730: idx = a->j;
731: v = a->a;
732: ii = a->i;
734: for (i=0; i<mbs; i++) {
735: n = ii[1] - ii[0]; ii++;
736: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
737: for (j=0; j<n; j++) {
738: xb = x + 5*(*idx++);
739: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
740: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
741: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
742: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
743: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
744: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
745: v += 25;
746: }
747: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
748: z += 5; y += 5;
749: }
750: VecRestoreArray(xx,&x);
751: VecRestoreArray(yy,&y);
752: if (zz != yy) {
753: VecRestoreArray(zz,&z);
754: }
755: PetscLogFlops(50*a->nz);
756: return(0);
757: }
760: int MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
761: {
762: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
763: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
764: PetscScalar x1,x2,x3,x4,x5,x6;
765: MatScalar *v;
766: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
769: VecGetArray(xx,&x);
770: VecGetArray(yy,&y);
771: if (zz != yy) {
772: VecGetArray(zz,&z);
773: } else {
774: z = y;
775: }
777: idx = a->j;
778: v = a->a;
779: ii = a->i;
781: for (i=0; i<mbs; i++) {
782: n = ii[1] - ii[0]; ii++;
783: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
784: for (j=0; j<n; j++) {
785: xb = x + 6*(*idx++);
786: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
787: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
788: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
789: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
790: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
791: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
792: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
793: v += 36;
794: }
795: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
796: z += 6; y += 6;
797: }
798: VecRestoreArray(xx,&x);
799: VecRestoreArray(yy,&y);
800: if (zz != yy) {
801: VecRestoreArray(zz,&z);
802: }
803: PetscLogFlops(72*a->nz);
804: return(0);
805: }
809: int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
810: {
811: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
812: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
813: PetscScalar x1,x2,x3,x4,x5,x6,x7;
814: MatScalar *v;
815: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
818: VecGetArray(xx,&x);
819: VecGetArray(yy,&y);
820: if (zz != yy) {
821: VecGetArray(zz,&z);
822: } else {
823: z = y;
824: }
826: idx = a->j;
827: v = a->a;
828: ii = a->i;
830: for (i=0; i<mbs; i++) {
831: n = ii[1] - ii[0]; ii++;
832: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
833: for (j=0; j<n; j++) {
834: xb = x + 7*(*idx++);
835: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
836: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
837: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
838: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
839: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
840: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
841: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
842: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
843: v += 49;
844: }
845: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
846: z += 7; y += 7;
847: }
848: VecRestoreArray(xx,&x);
849: VecRestoreArray(yy,&y);
850: if (zz != yy) {
851: VecRestoreArray(zz,&z);
852: }
853: PetscLogFlops(98*a->nz);
854: return(0);
855: }
859: int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
860: {
861: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
862: PetscScalar *x,*z,*xb,*work,*workt,*y;
863: MatScalar *v;
864: int mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
865: int ncols,k;
868: VecGetArray(xx,&x);
869: VecGetArray(zz,&z);
870: if (zz != yy) {
871: VecGetArray(yy,&y);
872: PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
873: VecRestoreArray(yy,&y);
874: }
876: idx = a->j;
877: v = a->a;
878: ii = a->i;
881: if (!a->mult_work) {
882: k = PetscMax(A->m,A->n);
883: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
884: }
885: work = a->mult_work;
886: for (i=0; i<mbs; i++) {
887: n = ii[1] - ii[0]; ii++;
888: ncols = n*bs;
889: workt = work;
890: for (j=0; j<n; j++) {
891: xb = x + bs*(*idx++);
892: for (k=0; k<bs; k++) workt[k] = xb[k];
893: workt += bs;
894: }
895: Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
896: /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
897: v += n*bs2;
898: z += bs;
899: }
900: VecRestoreArray(xx,&x);
901: VecRestoreArray(zz,&z);
902: PetscLogFlops(2*a->nz*bs2);
903: return(0);
904: }
908: int MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
909: {
910: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
911: PetscScalar *xg,*zg,*zb,zero = 0.0;
912: PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
913: MatScalar *v;
914: int mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval;
915: int bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
918: VecSet(&zero,zz);
919: VecGetArray(xx,&xg); x = xg;
920: VecGetArray(zz,&zg); z = zg;
922: idx = a->j;
923: v = a->a;
924: ii = a->i;
925: xb = x;
926: switch (bs) {
927: case 1:
928: for (i=0; i<mbs; i++) {
929: n = ii[1] - ii[0]; ii++;
930: x1 = xb[0];
931: ib = idx + ai[i];
932: for (j=0; j<n; j++) {
933: rval = ib[j];
934: z[rval] += *v * x1;
935: v++;
936: }
937: xb++;
938: }
939: break;
940: case 2:
941: for (i=0; i<mbs; i++) {
942: n = ii[1] - ii[0]; ii++;
943: x1 = xb[0]; x2 = xb[1];
944: ib = idx + ai[i];
945: for (j=0; j<n; j++) {
946: rval = ib[j]*2;
947: z[rval++] += v[0]*x1 + v[1]*x2;
948: z[rval] += v[2]*x1 + v[3]*x2;
949: v += 4;
950: }
951: xb += 2;
952: }
953: break;
954: case 3:
955: for (i=0; i<mbs; i++) {
956: n = ii[1] - ii[0]; ii++;
957: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
958: ib = idx + ai[i];
959: for (j=0; j<n; j++) {
960: rval = ib[j]*3;
961: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
962: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
963: z[rval] += v[6]*x1 + v[7]*x2 + v[8]*x3;
964: v += 9;
965: }
966: xb += 3;
967: }
968: break;
969: case 4:
970: for (i=0; i<mbs; i++) {
971: n = ii[1] - ii[0]; ii++;
972: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
973: ib = idx + ai[i];
974: for (j=0; j<n; j++) {
975: rval = ib[j]*4;
976: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
977: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
978: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
979: z[rval] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
980: v += 16;
981: }
982: xb += 4;
983: }
984: break;
985: case 5:
986: for (i=0; i<mbs; i++) {
987: n = ii[1] - ii[0]; ii++;
988: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
989: x4 = xb[3]; x5 = xb[4];
990: ib = idx + ai[i];
991: for (j=0; j<n; j++) {
992: rval = ib[j]*5;
993: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
994: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
995: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
996: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
997: z[rval] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
998: v += 25;
999: }
1000: xb += 5;
1001: }
1002: break;
1003: case 6:
1004: for (i=0; i<mbs; i++) {
1005: n = ii[1] - ii[0]; ii++;
1006: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1007: x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1008: ib = idx + ai[i];
1009: for (j=0; j<n; j++) {
1010: rval = ib[j]*6;
1011: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
1012: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
1013: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1014: z[rval++] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1015: z[rval++] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1016: z[rval] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1017: v += 36;
1018: }
1019: xb += 6;
1020: }
1021: break;
1022: case 7:
1023: for (i=0; i<mbs; i++) {
1024: n = ii[1] - ii[0]; ii++;
1025: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1026: x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1027: ib = idx + ai[i];
1028: for (j=0; j<n; j++) {
1029: rval = ib[j]*7;
1030: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
1031: z[rval++] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1032: z[rval++] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1033: z[rval++] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1034: z[rval++] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1035: z[rval++] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1036: z[rval] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1037: v += 49;
1038: }
1039: xb += 7;
1040: }
1041: break;
1042: default: { /* block sizes larger then 7 by 7 are handled by BLAS */
1043: int ncols,k;
1044: PetscScalar *work,*workt;
1046: if (!a->mult_work) {
1047: k = PetscMax(A->m,A->n);
1048: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1049: }
1050: work = a->mult_work;
1051: for (i=0; i<mbs; i++) {
1052: n = ii[1] - ii[0]; ii++;
1053: ncols = n*bs;
1054: PetscMemzero(work,ncols*sizeof(PetscScalar));
1055: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1056: /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1057: v += n*bs2;
1058: x += bs;
1059: workt = work;
1060: for (j=0; j<n; j++) {
1061: zb = z + bs*(*idx++);
1062: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1063: workt += bs;
1064: }
1065: }
1066: }
1067: }
1068: VecRestoreArray(xx,&xg);
1069: VecRestoreArray(zz,&zg);
1070: PetscLogFlops(2*a->nz*a->bs2 - A->n);
1071: return(0);
1072: }
1076: int MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1078: {
1079: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1080: PetscScalar *xg,*zg,*zb,*x,*z,*xb,x1,x2,x3,x4,x5;
1081: MatScalar *v;
1082: int mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1085: if (yy != zz) { VecCopy(yy,zz); }
1086: VecGetArray(xx,&xg); x = xg;
1087: VecGetArray(zz,&zg); z = zg;
1090: idx = a->j;
1091: v = a->a;
1092: ii = a->i;
1093: xb = x;
1095: switch (bs) {
1096: case 1:
1097: for (i=0; i<mbs; i++) {
1098: n = ii[1] - ii[0]; ii++;
1099: x1 = xb[0];
1100: ib = idx + ai[i];
1101: for (j=0; j<n; j++) {
1102: rval = ib[j];
1103: z[rval] += *v * x1;
1104: v++;
1105: }
1106: xb++;
1107: }
1108: break;
1109: case 2:
1110: for (i=0; i<mbs; i++) {
1111: n = ii[1] - ii[0]; ii++;
1112: x1 = xb[0]; x2 = xb[1];
1113: ib = idx + ai[i];
1114: for (j=0; j<n; j++) {
1115: rval = ib[j]*2;
1116: z[rval++] += v[0]*x1 + v[1]*x2;
1117: z[rval++] += v[2]*x1 + v[3]*x2;
1118: v += 4;
1119: }
1120: xb += 2;
1121: }
1122: break;
1123: case 3:
1124: for (i=0; i<mbs; i++) {
1125: n = ii[1] - ii[0]; ii++;
1126: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1127: ib = idx + ai[i];
1128: for (j=0; j<n; j++) {
1129: rval = ib[j]*3;
1130: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1131: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1132: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1133: v += 9;
1134: }
1135: xb += 3;
1136: }
1137: break;
1138: case 4:
1139: for (i=0; i<mbs; i++) {
1140: n = ii[1] - ii[0]; ii++;
1141: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1142: ib = idx + ai[i];
1143: for (j=0; j<n; j++) {
1144: rval = ib[j]*4;
1145: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1146: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1147: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1148: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1149: v += 16;
1150: }
1151: xb += 4;
1152: }
1153: break;
1154: case 5:
1155: for (i=0; i<mbs; i++) {
1156: n = ii[1] - ii[0]; ii++;
1157: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1158: x4 = xb[3]; x5 = xb[4];
1159: ib = idx + ai[i];
1160: for (j=0; j<n; j++) {
1161: rval = ib[j]*5;
1162: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1163: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1164: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1165: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1166: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1167: v += 25;
1168: }
1169: xb += 5;
1170: }
1171: break;
1172: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
1173: int ncols,k;
1174: PetscScalar *work,*workt;
1176: if (!a->mult_work) {
1177: k = PetscMax(A->m,A->n);
1178: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1179: }
1180: work = a->mult_work;
1181: for (i=0; i<mbs; i++) {
1182: n = ii[1] - ii[0]; ii++;
1183: ncols = n*bs;
1184: PetscMemzero(work,ncols*sizeof(PetscScalar));
1185: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1186: /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1187: v += n*bs2;
1188: x += bs;
1189: workt = work;
1190: for (j=0; j<n; j++) {
1191: zb = z + bs*(*idx++);
1192: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1193: workt += bs;
1194: }
1195: }
1196: }
1197: }
1198: VecRestoreArray(xx,&xg);
1199: VecRestoreArray(zz,&zg);
1200: PetscLogFlops(2*a->nz*a->bs2);
1201: return(0);
1202: }
1206: int MatScale_SeqBAIJ(const PetscScalar *alpha,Mat inA)
1207: {
1208: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
1209: int totalnz = a->bs2*a->nz;
1210: #if defined(PETSC_USE_MAT_SINGLE)
1211: int i;
1212: #else
1213: int one = 1;
1214: #endif
1217: #if defined(PETSC_USE_MAT_SINGLE)
1218: for (i=0; i<totalnz; i++) a->a[i] *= *alpha;
1219: #else
1220: BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one);
1221: #endif
1222: PetscLogFlops(totalnz);
1223: return(0);
1224: }
1228: int MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1229: {
1230: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1231: MatScalar *v = a->a;
1232: PetscReal sum = 0.0;
1233: int i,j,k,bs = a->bs,nz=a->nz,bs2=a->bs2,k1;
1236: if (type == NORM_FROBENIUS) {
1237: for (i=0; i< bs2*nz; i++) {
1238: #if defined(PETSC_USE_COMPLEX)
1239: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1240: #else
1241: sum += (*v)*(*v); v++;
1242: #endif
1243: }
1244: *norm = sqrt(sum);
1245: } else if (type == NORM_INFINITY) { /* maximum row sum */
1246: *norm = 0.0;
1247: for (k=0; k<bs; k++) {
1248: for (j=0; j<a->mbs; j++) {
1249: v = a->a + bs2*a->i[j] + k;
1250: sum = 0.0;
1251: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1252: for (k1=0; k1<bs; k1++){
1253: sum += PetscAbsScalar(*v);
1254: v += bs;
1255: }
1256: }
1257: if (sum > *norm) *norm = sum;
1258: }
1259: }
1260: } else {
1261: SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1262: }
1263: return(0);
1264: }
1269: int MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1270: {
1271: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1272: int ierr;
1275: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1276: if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1277: *flg = PETSC_FALSE;
1278: return(0);
1279: }
1280:
1281: /* if the a->i are the same */
1282: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);
1283: if (*flg == PETSC_FALSE) {
1284: return(0);
1285: }
1286:
1287: /* if a->j are the same */
1288: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);
1289: if (*flg == PETSC_FALSE) {
1290: return(0);
1291: }
1292: /* if a->a are the same */
1293: PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);
1294: return(0);
1295:
1296: }
1300: int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1301: {
1302: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1303: int ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1304: PetscScalar *x,zero = 0.0;
1305: MatScalar *aa,*aa_j;
1308: if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1309: bs = a->bs;
1310: aa = a->a;
1311: ai = a->i;
1312: aj = a->j;
1313: ambs = a->mbs;
1314: bs2 = a->bs2;
1316: VecSet(&zero,v);
1317: VecGetArray(v,&x);
1318: VecGetLocalSize(v,&n);
1319: if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1320: for (i=0; i<ambs; i++) {
1321: for (j=ai[i]; j<ai[i+1]; j++) {
1322: if (aj[j] == i) {
1323: row = i*bs;
1324: aa_j = aa+j*bs2;
1325: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1326: break;
1327: }
1328: }
1329: }
1330: VecRestoreArray(v,&x);
1331: return(0);
1332: }
1336: int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1337: {
1338: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1339: PetscScalar *l,*r,x,*li,*ri;
1340: MatScalar *aa,*v;
1341: int ierr,i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1344: ai = a->i;
1345: aj = a->j;
1346: aa = a->a;
1347: m = A->m;
1348: n = A->n;
1349: bs = a->bs;
1350: mbs = a->mbs;
1351: bs2 = a->bs2;
1352: if (ll) {
1353: VecGetArray(ll,&l);
1354: VecGetLocalSize(ll,&lm);
1355: if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1356: for (i=0; i<mbs; i++) { /* for each block row */
1357: M = ai[i+1] - ai[i];
1358: li = l + i*bs;
1359: v = aa + bs2*ai[i];
1360: for (j=0; j<M; j++) { /* for each block */
1361: for (k=0; k<bs2; k++) {
1362: (*v++) *= li[k%bs];
1363: }
1364: }
1365: }
1366: VecRestoreArray(ll,&l);
1367: PetscLogFlops(a->nz);
1368: }
1369:
1370: if (rr) {
1371: VecGetArray(rr,&r);
1372: VecGetLocalSize(rr,&rn);
1373: if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1374: for (i=0; i<mbs; i++) { /* for each block row */
1375: M = ai[i+1] - ai[i];
1376: v = aa + bs2*ai[i];
1377: for (j=0; j<M; j++) { /* for each block */
1378: ri = r + bs*aj[ai[i]+j];
1379: for (k=0; k<bs; k++) {
1380: x = ri[k];
1381: for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1382: }
1383: }
1384: }
1385: VecRestoreArray(rr,&r);
1386: PetscLogFlops(a->nz);
1387: }
1388: return(0);
1389: }
1394: int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1395: {
1396: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1399: info->rows_global = (double)A->m;
1400: info->columns_global = (double)A->n;
1401: info->rows_local = (double)A->m;
1402: info->columns_local = (double)A->n;
1403: info->block_size = a->bs2;
1404: info->nz_allocated = a->maxnz;
1405: info->nz_used = a->bs2*a->nz;
1406: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
1407: info->assemblies = A->num_ass;
1408: info->mallocs = a->reallocs;
1409: info->memory = A->mem;
1410: if (A->factor) {
1411: info->fill_ratio_given = A->info.fill_ratio_given;
1412: info->fill_ratio_needed = A->info.fill_ratio_needed;
1413: info->factor_mallocs = A->info.factor_mallocs;
1414: } else {
1415: info->fill_ratio_given = 0;
1416: info->fill_ratio_needed = 0;
1417: info->factor_mallocs = 0;
1418: }
1419: return(0);
1420: }
1425: int MatZeroEntries_SeqBAIJ(Mat A)
1426: {
1427: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1428: int ierr;
1431: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1432: return(0);
1433: }