Actual source code: baij2.c

  1: /*$Id: baij2.c,v 1.75 2001/09/07 20:09:49 bsmith Exp $*/

 3:  #include src/mat/impls/baij/seq/baij.h
 4:  #include src/inline/spops.h
 5:  #include src/inline/ilu.h
 6:  #include petscbt.h

 10: int MatIncreaseOverlap_SeqBAIJ(Mat A,int is_max,IS is[],int ov)
 11: {
 12:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
 13:   int         row,i,j,k,l,m,n,*idx,ierr,*nidx,isz,val,ival;
 14:   int         start,end,*ai,*aj,bs,*nidx2;
 15:   PetscBT     table;

 18:   m     = a->mbs;
 19:   ai    = a->i;
 20:   aj    = a->j;
 21:   bs    = a->bs;

 23:   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");

 25:   PetscBTCreate(m,table);
 26:   PetscMalloc((m+1)*sizeof(int),&nidx);
 27:   PetscMalloc((A->m+1)*sizeof(int),&nidx2);

 29:   for (i=0; i<is_max; i++) {
 30:     /* Initialise the two local arrays */
 31:     isz  = 0;
 32:     PetscBTMemzero(m,table);
 33: 
 34:     /* Extract the indices, assume there can be duplicate entries */
 35:     ISGetIndices(is[i],&idx);
 36:     ISGetLocalSize(is[i],&n);

 38:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 39:     for (j=0; j<n ; ++j){
 40:       ival = idx[j]/bs; /* convert the indices into block indices */
 41:       if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 42:       if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
 43:     }
 44:     ISRestoreIndices(is[i],&idx);
 45:     ISDestroy(is[i]);
 46: 
 47:     k = 0;
 48:     for (j=0; j<ov; j++){ /* for each overlap*/
 49:       n = isz;
 50:       for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
 51:         row   = nidx[k];
 52:         start = ai[row];
 53:         end   = ai[row+1];
 54:         for (l = start; l<end ; l++){
 55:           val = aj[l];
 56:           if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
 57:         }
 58:       }
 59:     }
 60:     /* expand the Index Set */
 61:     for (j=0; j<isz; j++) {
 62:       for (k=0; k<bs; k++)
 63:         nidx2[j*bs+k] = nidx[j]*bs+k;
 64:     }
 65:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
 66:   }
 67:   PetscBTDestroy(table);
 68:   PetscFree(nidx);
 69:   PetscFree(nidx2);
 70:   return(0);
 71: }

 75: int MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
 76: {
 77:   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ*)A->data,*c;
 78:   int          *smap,i,k,kstart,kend,ierr,oldcols = a->nbs,*lens;
 79:   int          row,mat_i,*mat_j,tcol,*mat_ilen;
 80:   int          *irow,*icol,nrows,ncols,*ssmap,bs=a->bs,bs2=a->bs2;
 81:   int          *aj = a->j,*ai = a->i;
 82:   MatScalar    *mat_a;
 83:   Mat          C;
 84:   PetscTruth   flag;

 87:   ISSorted(iscol,(PetscTruth*)&i);
 88:   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");

 90:   ISGetIndices(isrow,&irow);
 91:   ISGetIndices(iscol,&icol);
 92:   ISGetLocalSize(isrow,&nrows);
 93:   ISGetLocalSize(iscol,&ncols);

 95:   PetscMalloc((1+oldcols)*sizeof(int),&smap);
 96:   ssmap = smap;
 97:   PetscMalloc((1+nrows)*sizeof(int),&lens);
 98:   PetscMemzero(smap,oldcols*sizeof(int));
 99:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
100:   /* determine lens of each row */
101:   for (i=0; i<nrows; i++) {
102:     kstart  = ai[irow[i]];
103:     kend    = kstart + a->ilen[irow[i]];
104:     lens[i] = 0;
105:       for (k=kstart; k<kend; k++) {
106:         if (ssmap[aj[k]]) {
107:           lens[i]++;
108:         }
109:       }
110:     }
111:   /* Create and fill new matrix */
112:   if (scall == MAT_REUSE_MATRIX) {
113:     c = (Mat_SeqBAIJ *)((*B)->data);

115:     if (c->mbs!=nrows || c->nbs!=ncols || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
116:     PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);
117:     if (flag == PETSC_FALSE) {
118:       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
119:     }
120:     PetscMemzero(c->ilen,c->mbs*sizeof(int));
121:     C = *B;
122:   } else {
123:     MatCreate(A->comm,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);
124:     MatSetType(C,A->type_name);
125:     MatSeqBAIJSetPreallocation(C,bs,0,lens);
126:   }
127:   c = (Mat_SeqBAIJ *)(C->data);
128:   for (i=0; i<nrows; i++) {
129:     row    = irow[i];
130:     kstart = ai[row];
131:     kend   = kstart + a->ilen[row];
132:     mat_i  = c->i[i];
133:     mat_j  = c->j + mat_i;
134:     mat_a  = c->a + mat_i*bs2;
135:     mat_ilen = c->ilen + i;
136:     for (k=kstart; k<kend; k++) {
137:       if ((tcol=ssmap[a->j[k]])) {
138:         *mat_j++ = tcol - 1;
139:         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
140:         mat_a   += bs2;
141:         (*mat_ilen)++;
142:       }
143:     }
144:   }
145: 
146:   /* Free work space */
147:   ISRestoreIndices(iscol,&icol);
148:   PetscFree(smap);
149:   PetscFree(lens);
150:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
151:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
152: 
153:   ISRestoreIndices(isrow,&irow);
154:   *B = C;
155:   return(0);
156: }

160: int MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
161: {
162:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
163:   IS          is1,is2;
164:   int         *vary,*iary,*irow,*icol,nrows,ncols,i,ierr,bs=a->bs,count;

167:   ISGetIndices(isrow,&irow);
168:   ISGetIndices(iscol,&icol);
169:   ISGetLocalSize(isrow,&nrows);
170:   ISGetLocalSize(iscol,&ncols);
171: 
172:   /* Verify if the indices corespond to each element in a block 
173:    and form the IS with compressed IS */
174:   PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);
175:   iary = vary + a->mbs;
176:   PetscMemzero(vary,(a->mbs)*sizeof(int));
177:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
178:   count = 0;
179:   for (i=0; i<a->mbs; i++) {
180:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
181:     if (vary[i]==bs) iary[count++] = i;
182:   }
183:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
184: 
185:   PetscMemzero(vary,(a->mbs)*sizeof(int));
186:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
187:   count = 0;
188:   for (i=0; i<a->mbs; i++) {
189:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Internal error in PETSc");
190:     if (vary[i]==bs) iary[count++] = i;
191:   }
192:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);
193:   ISRestoreIndices(isrow,&irow);
194:   ISRestoreIndices(iscol,&icol);
195:   PetscFree(vary);

197:   MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B);
198:   ISDestroy(is1);
199:   ISDestroy(is2);
200:   return(0);
201: }

205: int MatGetSubMatrices_SeqBAIJ(Mat A,int n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
206: {
207:   int ierr,i;

210:   if (scall == MAT_INITIAL_MATRIX) {
211:     PetscMalloc((n+1)*sizeof(Mat),B);
212:   }

214:   for (i=0; i<n; i++) {
215:     MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
216:   }
217:   return(0);
218: }


221: /* -------------------------------------------------------*/
222: /* Should check that shapes of vectors and matrices match */
223: /* -------------------------------------------------------*/
224:  #include petscblaslapack.h

228: int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
229: {
230:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
231:   PetscScalar     *x,*z,sum;
232:   MatScalar       *v;
233:   int             mbs=a->mbs,i,*idx,*ii,n,ierr;

236:   VecGetArray(xx,&x);
237:   VecGetArray(zz,&z);

239:   idx   = a->j;
240:   v     = a->a;
241:   ii    = a->i;

243:   for (i=0; i<mbs; i++) {
244:     n    = ii[1] - ii[0]; ii++;
245:     sum  = 0.0;
246:     while (n--) sum += *v++ * x[*idx++];
247:     z[i] = sum;
248:   }
249:   VecRestoreArray(xx,&x);
250:   VecRestoreArray(zz,&z);
251:   PetscLogFlops(2*a->nz - A->m);
252:   return(0);
253: }

257: int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
258: {
259:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
260:   PetscScalar     *x,*z,*xb,sum1,sum2;
261:   PetscScalar     x1,x2;
262:   MatScalar       *v;
263:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

266:   VecGetArray(xx,&x);
267:   VecGetArray(zz,&z);

269:   idx   = a->j;
270:   v     = a->a;
271:   ii    = a->i;

273:   for (i=0; i<mbs; i++) {
274:     n  = ii[1] - ii[0]; ii++;
275:     sum1 = 0.0; sum2 = 0.0;
276:     for (j=0; j<n; j++) {
277:       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
278:       sum1 += v[0]*x1 + v[2]*x2;
279:       sum2 += v[1]*x1 + v[3]*x2;
280:       v += 4;
281:     }
282:     z[0] = sum1; z[1] = sum2;
283:     z += 2;
284:   }
285:   VecRestoreArray(xx,&x);
286:   VecRestoreArray(zz,&z);
287:   PetscLogFlops(8*a->nz - A->m);
288:   return(0);
289: }

293: int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
294: {
295:   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ*)A->data;
296:   PetscScalar  *x,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
297:   MatScalar    *v;
298:   int          ierr,mbs=a->mbs,i,*idx,*ii,j,n;

300: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
301: #pragma disjoint(*v,*z,*xb)
302: #endif

305:   VecGetArray(xx,&x);
306:   VecGetArray(zz,&z);

308:   idx   = a->j;
309:   v     = a->a;
310:   ii    = a->i;

312:   for (i=0; i<mbs; i++) {
313:     n  = ii[1] - ii[0]; ii++;
314:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
315:     for (j=0; j<n; j++) {
316:       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
317:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
318:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
319:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
320:       v += 9;
321:     }
322:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
323:     z += 3;
324:   }
325:   VecRestoreArray(xx,&x);
326:   VecRestoreArray(zz,&z);
327:   PetscLogFlops(18*a->nz - A->m);
328:   return(0);
329: }

333: int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
334: {
335:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
336:   PetscScalar     *x,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
337:   MatScalar       *v;
338:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

341:   VecGetArray(xx,&x);
342:   VecGetArray(zz,&z);

344:   idx   = a->j;
345:   v     = a->a;
346:   ii    = a->i;

348:   for (i=0; i<mbs; i++) {
349:     n  = ii[1] - ii[0]; ii++;
350:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
351:     for (j=0; j<n; j++) {
352:       xb = x + 4*(*idx++);
353:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
354:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
355:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
356:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
357:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
358:       v += 16;
359:     }
360:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
361:     z += 4;
362:   }
363:   VecRestoreArray(xx,&x);
364:   VecRestoreArray(zz,&z);
365:   PetscLogFlops(32*a->nz - A->m);
366:   return(0);
367: }

371: int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
372: {
373:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
374:   PetscScalar     sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*xb,*z,*x;
375:   MatScalar       *v;
376:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

379:   VecGetArray(xx,&x);
380:   VecGetArray(zz,&z);

382:   idx   = a->j;
383:   v     = a->a;
384:   ii    = a->i;

386:   for (i=0; i<mbs; i++) {
387:     n  = ii[1] - ii[0]; ii++;
388:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
389:     for (j=0; j<n; j++) {
390:       xb = x + 5*(*idx++);
391:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
392:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
393:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
394:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
395:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
396:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
397:       v += 25;
398:     }
399:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
400:     z += 5;
401:   }
402:   VecRestoreArray(xx,&x);
403:   VecRestoreArray(zz,&z);
404:   PetscLogFlops(50*a->nz - A->m);
405:   return(0);
406: }


411: int MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
412: {
413:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
414:   PetscScalar     *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
415:   PetscScalar     x1,x2,x3,x4,x5,x6;
416:   MatScalar       *v;
417:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

420:   VecGetArray(xx,&x);
421:   VecGetArray(zz,&z);

423:   idx   = a->j;
424:   v     = a->a;
425:   ii    = a->i;

427:   for (i=0; i<mbs; i++) {
428:     n  = ii[1] - ii[0]; ii++;
429:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
430:     for (j=0; j<n; j++) {
431:       xb = x + 6*(*idx++);
432:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
433:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
434:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
435:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
436:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
437:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
438:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
439:       v += 36;
440:     }
441:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
442:     z += 6;
443:   }

445:   VecRestoreArray(xx,&x);
446:   VecRestoreArray(zz,&z);
447:   PetscLogFlops(72*a->nz - A->m);
448:   return(0);
449: }
452: int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
453: {
454:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
455:   PetscScalar     *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
456:   PetscScalar     x1,x2,x3,x4,x5,x6,x7;
457:   MatScalar       *v;
458:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

461:   VecGetArray(xx,&x);
462:   VecGetArray(zz,&z);

464:   idx   = a->j;
465:   v     = a->a;
466:   ii    = a->i;

468:   for (i=0; i<mbs; i++) {
469:     n  = ii[1] - ii[0]; ii++;
470:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
471:     for (j=0; j<n; j++) {
472:       xb = x + 7*(*idx++);
473:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
474:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
475:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
476:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
477:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
478:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
479:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
480:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
481:       v += 49;
482:     }
483:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
484:     z += 7;
485:   }

487:   VecRestoreArray(xx,&x);
488:   VecRestoreArray(zz,&z);
489:   PetscLogFlops(98*a->nz - A->m);
490:   return(0);
491: }

493: /*
494:     This will not work with MatScalar == float because it calls the BLAS
495: */
498: int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
499: {
500:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
501:   PetscScalar     *x,*z,*xb,*work,*workt;
502:   MatScalar       *v;
503:   int             ierr,mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
504:   int             ncols,k;

507:   VecGetArray(xx,&x);
508:   VecGetArray(zz,&z);

510:   idx   = a->j;
511:   v     = a->a;
512:   ii    = a->i;


515:   if (!a->mult_work) {
516:     k    = PetscMax(A->m,A->n);
517:     PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
518:   }
519:   work = a->mult_work;
520:   for (i=0; i<mbs; i++) {
521:     n     = ii[1] - ii[0]; ii++;
522:     ncols = n*bs;
523:     workt = work;
524:     for (j=0; j<n; j++) {
525:       xb = x + bs*(*idx++);
526:       for (k=0; k<bs; k++) workt[k] = xb[k];
527:       workt += bs;
528:     }
529:     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
530:     /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
531:     v += n*bs2;
532:     z += bs;
533:   }
534:   VecRestoreArray(xx,&x);
535:   VecRestoreArray(zz,&z);
536:   PetscLogFlops(2*a->nz*bs2 - A->m);
537:   return(0);
538: }

542: int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
543: {
544:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
545:   PetscScalar     *x,*y,*z,sum;
546:   MatScalar       *v;
547:   int             ierr,mbs=a->mbs,i,*idx,*ii,n;

550:   VecGetArray(xx,&x);
551:   VecGetArray(yy,&y);
552:   if (zz != yy) {
553:     VecGetArray(zz,&z);
554:   } else {
555:     z = y;
556:   }

558:   idx   = a->j;
559:   v     = a->a;
560:   ii    = a->i;

562:   for (i=0; i<mbs; i++) {
563:     n    = ii[1] - ii[0]; ii++;
564:     sum  = y[i];
565:     while (n--) sum += *v++ * x[*idx++];
566:     z[i] = sum;
567:   }
568:   VecRestoreArray(xx,&x);
569:   VecRestoreArray(yy,&y);
570:   if (zz != yy) {
571:     VecRestoreArray(zz,&z);
572:   }
573:   PetscLogFlops(2*a->nz);
574:   return(0);
575: }

579: int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
580: {
581:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
582:   PetscScalar     *x,*y,*z,*xb,sum1,sum2;
583:   PetscScalar     x1,x2;
584:   MatScalar       *v;
585:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

588:   VecGetArray(xx,&x);
589:   VecGetArray(yy,&y);
590:   if (zz != yy) {
591:     VecGetArray(zz,&z);
592:   } else {
593:     z = y;
594:   }

596:   idx   = a->j;
597:   v     = a->a;
598:   ii    = a->i;

600:   for (i=0; i<mbs; i++) {
601:     n  = ii[1] - ii[0]; ii++;
602:     sum1 = y[0]; sum2 = y[1];
603:     for (j=0; j<n; j++) {
604:       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
605:       sum1 += v[0]*x1 + v[2]*x2;
606:       sum2 += v[1]*x1 + v[3]*x2;
607:       v += 4;
608:     }
609:     z[0] = sum1; z[1] = sum2;
610:     z += 2; y += 2;
611:   }
612:   VecRestoreArray(xx,&x);
613:   VecRestoreArray(yy,&y);
614:   if (zz != yy) {
615:     VecRestoreArray(zz,&z);
616:   }
617:   PetscLogFlops(4*a->nz);
618:   return(0);
619: }

623: int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
624: {
625:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
626:   PetscScalar     *x,*y,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
627:   MatScalar       *v;
628:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

631:   VecGetArray(xx,&x);
632:   VecGetArray(yy,&y);
633:   if (zz != yy) {
634:     VecGetArray(zz,&z);
635:   } else {
636:     z = y;
637:   }

639:   idx   = a->j;
640:   v     = a->a;
641:   ii    = a->i;

643:   for (i=0; i<mbs; i++) {
644:     n  = ii[1] - ii[0]; ii++;
645:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
646:     for (j=0; j<n; j++) {
647:       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
648:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
649:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
650:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
651:       v += 9;
652:     }
653:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
654:     z += 3; y += 3;
655:   }
656:   VecRestoreArray(xx,&x);
657:   VecRestoreArray(yy,&y);
658:   if (zz != yy) {
659:     VecRestoreArray(zz,&z);
660:   }
661:   PetscLogFlops(18*a->nz);
662:   return(0);
663: }

667: int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
668: {
669:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
670:   PetscScalar     *x,*y,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
671:   MatScalar       *v;
672:   int             ierr,mbs=a->mbs,i,*idx,*ii;
673:   int             j,n;

676:   VecGetArray(xx,&x);
677:   VecGetArray(yy,&y);
678:   if (zz != yy) {
679:     VecGetArray(zz,&z);
680:   } else {
681:     z = y;
682:   }

684:   idx   = a->j;
685:   v     = a->a;
686:   ii    = a->i;

688:   for (i=0; i<mbs; i++) {
689:     n  = ii[1] - ii[0]; ii++;
690:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
691:     for (j=0; j<n; j++) {
692:       xb = x + 4*(*idx++);
693:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
694:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
695:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
696:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
697:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
698:       v += 16;
699:     }
700:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
701:     z += 4; y += 4;
702:   }
703:   VecRestoreArray(xx,&x);
704:   VecRestoreArray(yy,&y);
705:   if (zz != yy) {
706:     VecRestoreArray(zz,&z);
707:   }
708:   PetscLogFlops(32*a->nz);
709:   return(0);
710: }

714: int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
715: {
716:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
717:   PetscScalar     *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
718:   MatScalar       *v;
719:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

722:   VecGetArray(xx,&x);
723:   VecGetArray(yy,&y);
724:   if (zz != yy) {
725:     VecGetArray(zz,&z);
726:   } else {
727:     z = y;
728:   }

730:   idx   = a->j;
731:   v     = a->a;
732:   ii    = a->i;

734:   for (i=0; i<mbs; i++) {
735:     n  = ii[1] - ii[0]; ii++;
736:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
737:     for (j=0; j<n; j++) {
738:       xb = x + 5*(*idx++);
739:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
740:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
741:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
742:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
743:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
744:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
745:       v += 25;
746:     }
747:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
748:     z += 5; y += 5;
749:   }
750:   VecRestoreArray(xx,&x);
751:   VecRestoreArray(yy,&y);
752:   if (zz != yy) {
753:     VecRestoreArray(zz,&z);
754:   }
755:   PetscLogFlops(50*a->nz);
756:   return(0);
757: }
760: int MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
761: {
762:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
763:   PetscScalar     *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
764:   PetscScalar     x1,x2,x3,x4,x5,x6;
765:   MatScalar       *v;
766:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

769:   VecGetArray(xx,&x);
770:   VecGetArray(yy,&y);
771:   if (zz != yy) {
772:     VecGetArray(zz,&z);
773:   } else {
774:     z = y;
775:   }

777:   idx   = a->j;
778:   v     = a->a;
779:   ii    = a->i;

781:   for (i=0; i<mbs; i++) {
782:     n  = ii[1] - ii[0]; ii++;
783:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
784:     for (j=0; j<n; j++) {
785:       xb = x + 6*(*idx++);
786:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
787:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
788:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
789:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
790:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
791:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
792:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
793:       v += 36;
794:     }
795:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
796:     z += 6; y += 6;
797:   }
798:   VecRestoreArray(xx,&x);
799:   VecRestoreArray(yy,&y);
800:   if (zz != yy) {
801:     VecRestoreArray(zz,&z);
802:   }
803:   PetscLogFlops(72*a->nz);
804:   return(0);
805: }

809: int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
810: {
811:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
812:   PetscScalar     *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
813:   PetscScalar     x1,x2,x3,x4,x5,x6,x7;
814:   MatScalar       *v;
815:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

818:   VecGetArray(xx,&x);
819:   VecGetArray(yy,&y);
820:   if (zz != yy) {
821:     VecGetArray(zz,&z);
822:   } else {
823:     z = y;
824:   }

826:   idx   = a->j;
827:   v     = a->a;
828:   ii    = a->i;

830:   for (i=0; i<mbs; i++) {
831:     n  = ii[1] - ii[0]; ii++;
832:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
833:     for (j=0; j<n; j++) {
834:       xb = x + 7*(*idx++);
835:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
836:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
837:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
838:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
839:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
840:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
841:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
842:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
843:       v += 49;
844:     }
845:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
846:     z += 7; y += 7;
847:   }
848:   VecRestoreArray(xx,&x);
849:   VecRestoreArray(yy,&y);
850:   if (zz != yy) {
851:     VecRestoreArray(zz,&z);
852:   }
853:   PetscLogFlops(98*a->nz);
854:   return(0);
855: }

859: int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
860: {
861:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
862:   PetscScalar    *x,*z,*xb,*work,*workt,*y;
863:   MatScalar      *v;
864:   int            mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
865:   int            ncols,k;

868:   VecGetArray(xx,&x);
869:   VecGetArray(zz,&z);
870:   if (zz != yy) {
871:     VecGetArray(yy,&y);
872:     PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
873:     VecRestoreArray(yy,&y);
874:   }

876:   idx   = a->j;
877:   v     = a->a;
878:   ii    = a->i;


881:   if (!a->mult_work) {
882:     k    = PetscMax(A->m,A->n);
883:     PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
884:   }
885:   work = a->mult_work;
886:   for (i=0; i<mbs; i++) {
887:     n     = ii[1] - ii[0]; ii++;
888:     ncols = n*bs;
889:     workt = work;
890:     for (j=0; j<n; j++) {
891:       xb = x + bs*(*idx++);
892:       for (k=0; k<bs; k++) workt[k] = xb[k];
893:       workt += bs;
894:     }
895:     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
896:     /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
897:     v += n*bs2;
898:     z += bs;
899:   }
900:   VecRestoreArray(xx,&x);
901:   VecRestoreArray(zz,&z);
902:   PetscLogFlops(2*a->nz*bs2);
903:   return(0);
904: }

908: int MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
909: {
910:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
911:   PetscScalar     *xg,*zg,*zb,zero = 0.0;
912:   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
913:   MatScalar       *v;
914:   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval;
915:   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;

918:   VecSet(&zero,zz);
919:   VecGetArray(xx,&xg); x = xg;
920:   VecGetArray(zz,&zg); z = zg;

922:   idx   = a->j;
923:   v     = a->a;
924:   ii    = a->i;
925:   xb    = x;
926:   switch (bs) {
927:   case 1:
928:     for (i=0; i<mbs; i++) {
929:       n  = ii[1] - ii[0]; ii++;
930:       x1 = xb[0];
931:       ib = idx + ai[i];
932:       for (j=0; j<n; j++) {
933:         rval    = ib[j];
934:         z[rval] += *v * x1;
935:         v++;
936:       }
937:       xb++;
938:     }
939:     break;
940:   case 2:
941:     for (i=0; i<mbs; i++) {
942:       n  = ii[1] - ii[0]; ii++;
943:       x1 = xb[0]; x2 = xb[1];
944:       ib = idx + ai[i];
945:       for (j=0; j<n; j++) {
946:         rval      = ib[j]*2;
947:         z[rval++] += v[0]*x1 + v[1]*x2;
948:         z[rval]   += v[2]*x1 + v[3]*x2;
949:         v  += 4;
950:       }
951:       xb += 2;
952:     }
953:     break;
954:   case 3:
955:     for (i=0; i<mbs; i++) {
956:       n  = ii[1] - ii[0]; ii++;
957:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
958:       ib = idx + ai[i];
959:       for (j=0; j<n; j++) {
960:         rval      = ib[j]*3;
961:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
962:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
963:         z[rval]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
964:         v  += 9;
965:       }
966:       xb += 3;
967:     }
968:     break;
969:   case 4:
970:     for (i=0; i<mbs; i++) {
971:       n  = ii[1] - ii[0]; ii++;
972:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
973:       ib = idx + ai[i];
974:       for (j=0; j<n; j++) {
975:         rval      = ib[j]*4;
976:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
977:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
978:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
979:         z[rval]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
980:         v  += 16;
981:       }
982:       xb += 4;
983:     }
984:     break;
985:   case 5:
986:     for (i=0; i<mbs; i++) {
987:       n  = ii[1] - ii[0]; ii++;
988:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
989:       x4 = xb[3]; x5 = xb[4];
990:       ib = idx + ai[i];
991:       for (j=0; j<n; j++) {
992:         rval      = ib[j]*5;
993:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
994:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
995:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
996:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
997:         z[rval]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
998:         v  += 25;
999:       }
1000:       xb += 5;
1001:     }
1002:     break;
1003:   case 6:
1004:     for (i=0; i<mbs; i++) {
1005:       n  = ii[1] - ii[0]; ii++;
1006:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1007:       x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1008:       ib = idx + ai[i];
1009:       for (j=0; j<n; j++) {
1010:         rval      = ib[j]*6;
1011:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 + v[4]*x5 + v[5]*x6;
1012:         z[rval++] +=  v[6]*x1 +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1013:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1014:         z[rval++] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1015:         z[rval++] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1016:         z[rval]   += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1017:         v  += 36;
1018:       }
1019:       xb += 6;
1020:     }
1021:     break;
1022:   case 7:
1023:     for (i=0; i<mbs; i++) {
1024:       n  = ii[1] - ii[0]; ii++;
1025:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1026:       x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1027:       ib = idx + ai[i];
1028:       for (j=0; j<n; j++) {
1029:         rval      = ib[j]*7;
1030:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
1031:         z[rval++] +=  v[7]*x1 +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1032:         z[rval++] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1033:         z[rval++] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1034:         z[rval++] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1035:         z[rval++] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1036:         z[rval]   += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1037:         v  += 49;
1038:       }
1039:       xb += 7;
1040:     }
1041:     break;
1042:   default: {       /* block sizes larger then 7 by 7 are handled by BLAS */
1043:       int          ncols,k;
1044:       PetscScalar  *work,*workt;

1046:       if (!a->mult_work) {
1047:         k = PetscMax(A->m,A->n);
1048:         PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1049:       }
1050:       work = a->mult_work;
1051:       for (i=0; i<mbs; i++) {
1052:         n     = ii[1] - ii[0]; ii++;
1053:         ncols = n*bs;
1054:         PetscMemzero(work,ncols*sizeof(PetscScalar));
1055:         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1056:         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1057:         v += n*bs2;
1058:         x += bs;
1059:         workt = work;
1060:         for (j=0; j<n; j++) {
1061:           zb = z + bs*(*idx++);
1062:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1063:           workt += bs;
1064:         }
1065:       }
1066:     }
1067:   }
1068:   VecRestoreArray(xx,&xg);
1069:   VecRestoreArray(zz,&zg);
1070:   PetscLogFlops(2*a->nz*a->bs2 - A->n);
1071:   return(0);
1072: }

1076: int MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)

1078: {
1079:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
1080:   PetscScalar     *xg,*zg,*zb,*x,*z,*xb,x1,x2,x3,x4,x5;
1081:   MatScalar       *v;
1082:   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;

1085:   if (yy != zz) { VecCopy(yy,zz); }
1086:   VecGetArray(xx,&xg); x = xg;
1087:   VecGetArray(zz,&zg); z = zg;


1090:   idx   = a->j;
1091:   v     = a->a;
1092:   ii    = a->i;
1093:   xb    = x;

1095:   switch (bs) {
1096:   case 1:
1097:     for (i=0; i<mbs; i++) {
1098:       n  = ii[1] - ii[0]; ii++;
1099:       x1 = xb[0];
1100:       ib = idx + ai[i];
1101:       for (j=0; j<n; j++) {
1102:         rval    = ib[j];
1103:         z[rval] += *v * x1;
1104:         v++;
1105:       }
1106:       xb++;
1107:     }
1108:     break;
1109:   case 2:
1110:     for (i=0; i<mbs; i++) {
1111:       n  = ii[1] - ii[0]; ii++;
1112:       x1 = xb[0]; x2 = xb[1];
1113:       ib = idx + ai[i];
1114:       for (j=0; j<n; j++) {
1115:         rval      = ib[j]*2;
1116:         z[rval++] += v[0]*x1 + v[1]*x2;
1117:         z[rval++] += v[2]*x1 + v[3]*x2;
1118:         v  += 4;
1119:       }
1120:       xb += 2;
1121:     }
1122:     break;
1123:   case 3:
1124:     for (i=0; i<mbs; i++) {
1125:       n  = ii[1] - ii[0]; ii++;
1126:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1127:       ib = idx + ai[i];
1128:       for (j=0; j<n; j++) {
1129:         rval      = ib[j]*3;
1130:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1131:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1132:         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1133:         v  += 9;
1134:       }
1135:       xb += 3;
1136:     }
1137:     break;
1138:   case 4:
1139:     for (i=0; i<mbs; i++) {
1140:       n  = ii[1] - ii[0]; ii++;
1141:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1142:       ib = idx + ai[i];
1143:       for (j=0; j<n; j++) {
1144:         rval      = ib[j]*4;
1145:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1146:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1147:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1148:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1149:         v  += 16;
1150:       }
1151:       xb += 4;
1152:     }
1153:     break;
1154:   case 5:
1155:     for (i=0; i<mbs; i++) {
1156:       n  = ii[1] - ii[0]; ii++;
1157:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1158:       x4 = xb[3]; x5 = xb[4];
1159:       ib = idx + ai[i];
1160:       for (j=0; j<n; j++) {
1161:         rval      = ib[j]*5;
1162:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1163:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1164:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1165:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1166:         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1167:         v  += 25;
1168:       }
1169:       xb += 5;
1170:     }
1171:     break;
1172:   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1173:       int          ncols,k;
1174:       PetscScalar  *work,*workt;

1176:       if (!a->mult_work) {
1177:         k = PetscMax(A->m,A->n);
1178:         PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1179:       }
1180:       work = a->mult_work;
1181:       for (i=0; i<mbs; i++) {
1182:         n     = ii[1] - ii[0]; ii++;
1183:         ncols = n*bs;
1184:         PetscMemzero(work,ncols*sizeof(PetscScalar));
1185:         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1186:         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1187:         v += n*bs2;
1188:         x += bs;
1189:         workt = work;
1190:         for (j=0; j<n; j++) {
1191:           zb = z + bs*(*idx++);
1192:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1193:           workt += bs;
1194:         }
1195:       }
1196:     }
1197:   }
1198:   VecRestoreArray(xx,&xg);
1199:   VecRestoreArray(zz,&zg);
1200:   PetscLogFlops(2*a->nz*a->bs2);
1201:   return(0);
1202: }

1206: int MatScale_SeqBAIJ(const PetscScalar *alpha,Mat inA)
1207: {
1208:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
1209:   int         totalnz = a->bs2*a->nz;
1210: #if defined(PETSC_USE_MAT_SINGLE)
1211:   int         i;
1212: #else
1213:   int         one = 1;
1214: #endif

1217: #if defined(PETSC_USE_MAT_SINGLE)
1218:   for (i=0; i<totalnz; i++) a->a[i] *= *alpha;
1219: #else
1220:   BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one);
1221: #endif
1222:   PetscLogFlops(totalnz);
1223:   return(0);
1224: }

1228: int MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1229: {
1230:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1231:   MatScalar   *v = a->a;
1232:   PetscReal   sum = 0.0;
1233:   int         i,j,k,bs = a->bs,nz=a->nz,bs2=a->bs2,k1;

1236:   if (type == NORM_FROBENIUS) {
1237:     for (i=0; i< bs2*nz; i++) {
1238: #if defined(PETSC_USE_COMPLEX)
1239:       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1240: #else
1241:       sum += (*v)*(*v); v++;
1242: #endif
1243:     }
1244:     *norm = sqrt(sum);
1245:   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1246:     *norm = 0.0;
1247:     for (k=0; k<bs; k++) {
1248:       for (j=0; j<a->mbs; j++) {
1249:         v = a->a + bs2*a->i[j] + k;
1250:         sum = 0.0;
1251:         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1252:           for (k1=0; k1<bs; k1++){
1253:             sum += PetscAbsScalar(*v);
1254:             v   += bs;
1255:           }
1256:         }
1257:         if (sum > *norm) *norm = sum;
1258:       }
1259:     }
1260:   } else {
1261:     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1262:   }
1263:   return(0);
1264: }


1269: int MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1270: {
1271:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1272:   int         ierr;

1275:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1276:   if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1277:     *flg = PETSC_FALSE;
1278:     return(0);
1279:   }
1280: 
1281:   /* if the a->i are the same */
1282:   PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);
1283:   if (*flg == PETSC_FALSE) {
1284:     return(0);
1285:   }
1286: 
1287:   /* if a->j are the same */
1288:   PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);
1289:   if (*flg == PETSC_FALSE) {
1290:     return(0);
1291:   }
1292:   /* if a->a are the same */
1293:   PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);
1294:   return(0);
1295: 
1296: }

1300: int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1301: {
1302:   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ*)A->data;
1303:   int          ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1304:   PetscScalar  *x,zero = 0.0;
1305:   MatScalar    *aa,*aa_j;

1308:   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1309:   bs   = a->bs;
1310:   aa   = a->a;
1311:   ai   = a->i;
1312:   aj   = a->j;
1313:   ambs = a->mbs;
1314:   bs2  = a->bs2;

1316:   VecSet(&zero,v);
1317:   VecGetArray(v,&x);
1318:   VecGetLocalSize(v,&n);
1319:   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1320:   for (i=0; i<ambs; i++) {
1321:     for (j=ai[i]; j<ai[i+1]; j++) {
1322:       if (aj[j] == i) {
1323:         row  = i*bs;
1324:         aa_j = aa+j*bs2;
1325:         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1326:         break;
1327:       }
1328:     }
1329:   }
1330:   VecRestoreArray(v,&x);
1331:   return(0);
1332: }

1336: int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1337: {
1338:   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ*)A->data;
1339:   PetscScalar  *l,*r,x,*li,*ri;
1340:   MatScalar    *aa,*v;
1341:   int          ierr,i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;

1344:   ai  = a->i;
1345:   aj  = a->j;
1346:   aa  = a->a;
1347:   m   = A->m;
1348:   n   = A->n;
1349:   bs  = a->bs;
1350:   mbs = a->mbs;
1351:   bs2 = a->bs2;
1352:   if (ll) {
1353:     VecGetArray(ll,&l);
1354:     VecGetLocalSize(ll,&lm);
1355:     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1356:     for (i=0; i<mbs; i++) { /* for each block row */
1357:       M  = ai[i+1] - ai[i];
1358:       li = l + i*bs;
1359:       v  = aa + bs2*ai[i];
1360:       for (j=0; j<M; j++) { /* for each block */
1361:         for (k=0; k<bs2; k++) {
1362:           (*v++) *= li[k%bs];
1363:         }
1364:       }
1365:     }
1366:     VecRestoreArray(ll,&l);
1367:     PetscLogFlops(a->nz);
1368:   }
1369: 
1370:   if (rr) {
1371:     VecGetArray(rr,&r);
1372:     VecGetLocalSize(rr,&rn);
1373:     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1374:     for (i=0; i<mbs; i++) { /* for each block row */
1375:       M  = ai[i+1] - ai[i];
1376:       v  = aa + bs2*ai[i];
1377:       for (j=0; j<M; j++) { /* for each block */
1378:         ri = r + bs*aj[ai[i]+j];
1379:         for (k=0; k<bs; k++) {
1380:           x = ri[k];
1381:           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1382:         }
1383:       }
1384:     }
1385:     VecRestoreArray(rr,&r);
1386:     PetscLogFlops(a->nz);
1387:   }
1388:   return(0);
1389: }


1394: int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1395: {
1396:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;

1399:   info->rows_global    = (double)A->m;
1400:   info->columns_global = (double)A->n;
1401:   info->rows_local     = (double)A->m;
1402:   info->columns_local  = (double)A->n;
1403:   info->block_size     = a->bs2;
1404:   info->nz_allocated   = a->maxnz;
1405:   info->nz_used        = a->bs2*a->nz;
1406:   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1407:   info->assemblies   = A->num_ass;
1408:   info->mallocs      = a->reallocs;
1409:   info->memory       = A->mem;
1410:   if (A->factor) {
1411:     info->fill_ratio_given  = A->info.fill_ratio_given;
1412:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1413:     info->factor_mallocs    = A->info.factor_mallocs;
1414:   } else {
1415:     info->fill_ratio_given  = 0;
1416:     info->fill_ratio_needed = 0;
1417:     info->factor_mallocs    = 0;
1418:   }
1419:   return(0);
1420: }


1425: int MatZeroEntries_SeqBAIJ(Mat A)
1426: {
1427:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1428:   int         ierr;

1431:   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1432:   return(0);
1433: }