Actual source code: sbaijfact2.c

  1: /*$Id: sbaijfact2.c,v 1.2.1.41 2001/08/07 03:03:01 balay Exp $*/
  2: /*
  3:     Factorization code for SBAIJ format. 
  4: */

 6:  #include src/mat/impls/sbaij/seq/sbaij.h
 7:  #include src/mat/impls/baij/seq/baij.h
 8:  #include src/inline/ilu.h
 9:  #include src/inline/dot.h

 13: int MatSolveTranspose_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
 14: {
 16:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 17: }

 21: int MatSolveTranspose_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
 22: {
 24:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 25: }

 29: int MatSolveTranspose_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
 30: {
 32:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 33: }

 37: int MatSolveTranspose_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
 38: {
 40:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 41: }

 45: int MatSolveTranspose_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
 46: {
 48:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 49: }

 53: int MatSolveTranspose_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
 54: {
 56:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 57: }

 61: int MatSolveTranspose_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
 62: {
 64:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 65: }

 69: int MatSolveTranspose_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
 70: {
 72:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 73: }

 77: int MatSolveTranspose_SeqSBAIJ_2(Mat A,Vec bb,Vec xx)
 78: {
 80:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 81: }

 85: int MatSolveTranspose_SeqSBAIJ_3(Mat A,Vec bb,Vec xx)
 86: {
 88:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 89: }

 93: int MatSolveTranspose_SeqSBAIJ_4(Mat A,Vec bb,Vec xx)
 94: {
 96:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 97: }

101: int MatSolveTranspose_SeqSBAIJ_5(Mat A,Vec bb,Vec xx)
102: {
104:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
105: }

109: int MatSolveTranspose_SeqSBAIJ_6(Mat A,Vec bb,Vec xx)
110: {
112:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
113: }

117: int MatSolveTranspose_SeqSBAIJ_7(Mat A,Vec bb,Vec xx)
118: {
120:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
121: }

125: int MatSolve_SeqSBAIJ_N(Mat A,Vec bb,Vec xx)
126: {
127:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
128:   IS              isrow=a->row;
129:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
130:   int             nz,*vj,k,ierr,*r,idx,k1;
131:   int             bs=a->bs,bs2 = a->bs2;
132:   MatScalar       *aa=a->a,*v,*diag;
133:   PetscScalar     *x,*xk,*xj,*b,*xk_tmp,*t;

136:   VecGetArray(bb,&b);
137:   VecGetArray(xx,&x);
138:   t  = a->solve_work;
139:   ISGetIndices(isrow,&r);
140:   PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);

142:   /* solve U^T * D * y = b by forward substitution */
143:   xk = t;
144:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
145:     idx   = bs*r[k];
146:     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
147:   }
148:   for (k=0; k<mbs; k++){
149:     v  = aa + bs2*ai[k];
150:     xk = t + k*bs;      /* Dk*xk = k-th block of x */
151:     PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
152:     nz = ai[k+1] - ai[k];
153:     vj = aj + ai[k];
154:     xj = t + (*vj)*bs;  /* *vj-th block of x, *vj>k */
155:     while (nz--) {
156:       /* x(:) += U(k,:)^T*(Dk*xk) */
157:       Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
158:       vj++; xj = t + (*vj)*bs;
159:       v += bs2;
160:     }
161:     /* xk = inv(Dk)*(Dk*xk) */
162:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
163:     Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
164:   }

166:   /* solve U*x = y by back substitution */
167:   for (k=mbs-1; k>=0; k--){
168:     v  = aa + bs2*ai[k];
169:     xk = t + k*bs;        /* xk */
170:     nz = ai[k+1] - ai[k];
171:     vj = aj + ai[k];
172:     xj = t + (*vj)*bs;
173:     while (nz--) {
174:       /* xk += U(k,:)*x(:) */
175:       Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
176:       vj++;
177:       v += bs2; xj = t + (*vj)*bs;
178:     }
179:     idx   = bs*r[k];
180:     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
181:   }

183:   PetscFree(xk_tmp);
184:   VecRestoreArray(bb,&b);
185:   VecRestoreArray(xx,&x);
186:   PetscLogFlops(bs2*(2*a->nz + mbs));
187:   return(0);
188: }

192: int MatSolve_SeqSBAIJ_N_NaturalOrdering(Mat A,Vec bb,Vec xx)
193: {
194:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
195:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
196:   int             nz,*vj,k,ierr;
197:   int             bs=a->bs,bs2 = a->bs2;
198:   MatScalar       *aa=a->a,*v,*diag;
199:   PetscScalar     *x,*xk,*xj,*b,*xk_tmp;

202: 
203:   VecGetArray(bb,&b);
204:   VecGetArray(xx,&x);

206:   PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);

208:   /* solve U^T * D * y = b by forward substitution */
209:   PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
210:   for (k=0; k<mbs; k++){
211:     v  = aa + bs2*ai[k];
212:     xk = x + k*bs;      /* Dk*xk = k-th block of x */
213:     PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
214:     nz = ai[k+1] - ai[k];
215:     vj = aj + ai[k];
216:     xj = x + (*vj)*bs;  /* *vj-th block of x, *vj>k */
217:     while (nz--) {
218:       /* x(:) += U(k,:)^T*(Dk*xk) */
219:       Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
220:       vj++; xj = x + (*vj)*bs;
221:       v += bs2;
222:     }
223:     /* xk = inv(Dk)*(Dk*xk) */
224:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
225:     Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
226:   }

228:   /* solve U*x = y by back substitution */
229:   for (k=mbs-1; k>=0; k--){
230:     v  = aa + bs2*ai[k];
231:     xk = x + k*bs;        /* xk */
232:     nz = ai[k+1] - ai[k];
233:     vj = aj + ai[k];
234:     xj = x + (*vj)*bs;
235:     while (nz--) {
236:       /* xk += U(k,:)*x(:) */
237:       Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
238:       vj++;
239:       v += bs2; xj = x + (*vj)*bs;
240:     }
241:   }

243:   PetscFree(xk_tmp);
244:   VecRestoreArray(bb,&b);
245:   VecRestoreArray(xx,&x);
246:   PetscLogFlops(bs2*(2*a->nz + mbs));
247:   return(0);
248: }

252: int MatSolve_SeqSBAIJ_7(Mat A,Vec bb,Vec xx)
253: {
254:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
255:   IS              isrow=a->row;
256:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
257:   int             nz,*vj,k,ierr,*r,idx;
258:   MatScalar       *aa=a->a,*v,*d;
259:   PetscScalar     *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp;

262:   VecGetArray(bb,&b);
263:   VecGetArray(xx,&x);
264:   t  = a->solve_work;
265:   ISGetIndices(isrow,&r);

267:   /* solve U^T * D * y = b by forward substitution */
268:   tp = t;
269:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
270:     idx   = 7*r[k];
271:     tp[0] = b[idx];
272:     tp[1] = b[idx+1];
273:     tp[2] = b[idx+2];
274:     tp[3] = b[idx+3];
275:     tp[4] = b[idx+4];
276:     tp[5] = b[idx+5];
277:     tp[6] = b[idx+6];
278:     tp += 7;
279:   }
280: 
281:   for (k=0; k<mbs; k++){
282:     v  = aa + 49*ai[k];
283:     vj = aj + ai[k];
284:     tp = t + k*7;
285:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
286:     nz = ai[k+1] - ai[k];
287:     tp = t + (*vj)*7;
288:     while (nz--) {
289:       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
290:       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
291:       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
292:       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
293:       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
294:       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
295:       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
296:       vj++; tp = t + (*vj)*7;
297:       v += 49;
298:     }

300:     /* xk = inv(Dk)*(Dk*xk) */
301:     d  = aa+k*49;          /* ptr to inv(Dk) */
302:     tp    = t + k*7;
303:     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
304:     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
305:     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
306:     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
307:     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
308:     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
309:     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
310:   }

312:   /* solve U*x = y by back substitution */
313:   for (k=mbs-1; k>=0; k--){
314:     v  = aa + 49*ai[k];
315:     vj = aj + ai[k];
316:     tp    = t + k*7;
317:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
318:     nz = ai[k+1] - ai[k];
319: 
320:     tp = t + (*vj)*7;
321:     while (nz--) {
322:       /* xk += U(k,:)*x(:) */
323:       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
324:       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
325:       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
326:       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
327:       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
328:       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
329:       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
330:       vj++; tp = t + (*vj)*7;
331:       v += 49;
332:     }
333:     tp    = t + k*7;
334:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
335:     idx   = 7*r[k];
336:     x[idx]     = x0;
337:     x[idx+1]   = x1;
338:     x[idx+2]   = x2;
339:     x[idx+3]   = x3;
340:     x[idx+4]   = x4;
341:     x[idx+5]   = x5;
342:     x[idx+6]   = x6;
343:   }

345:   VecRestoreArray(bb,&b);
346:   VecRestoreArray(xx,&x);
347:   PetscLogFlops(49*(2*a->nz + mbs));
348:   return(0);
349: }

353: int MatSolve_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
354: {
355:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
356:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
357:   MatScalar       *aa=a->a,*v,*d;
358:   PetscScalar     *x,*xp,*b,x0,x1,x2,x3,x4,x5,x6;
359:   int             nz,*vj,k,ierr;

362: 
363:   VecGetArray(bb,&b);
364:   VecGetArray(xx,&x);
365: 
366:   /* solve U^T * D * y = b by forward substitution */
367:   PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar)); /* x <- b */
368:   for (k=0; k<mbs; k++){
369:     v  = aa + 49*ai[k];
370:     xp = x + k*7;
371:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
372:     nz = ai[k+1] - ai[k];
373:     vj = aj + ai[k];
374:     xp = x + (*vj)*7;
375:     while (nz--) {
376:       /* x(:) += U(k,:)^T*(Dk*xk) */
377:       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
378:       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
379:       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
380:       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
381:       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
382:       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
383:       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
384:       vj++; xp = x + (*vj)*7;
385:       v += 49;
386:     }
387:     /* xk = inv(Dk)*(Dk*xk) */
388:     d  = aa+k*49;          /* ptr to inv(Dk) */
389:     xp = x + k*7;
390:     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
391:     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
392:     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
393:     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
394:     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
395:     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
396:     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
397:   }

399:   /* solve U*x = y by back substitution */
400:   for (k=mbs-1; k>=0; k--){
401:     v  = aa + 49*ai[k];
402:     xp = x + k*7;
403:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
404:     nz = ai[k+1] - ai[k];
405:     vj = aj + ai[k];
406:     xp = x + (*vj)*7;
407:     while (nz--) {
408:       /* xk += U(k,:)*x(:) */
409:       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
410:       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
411:       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
412:       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
413:       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
414:       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
415:       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
416:       vj++;
417:       v += 49; xp = x + (*vj)*7;
418:     }
419:     xp = x + k*7;
420:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
421:   }

423:   VecRestoreArray(bb,&b);
424:   VecRestoreArray(xx,&x);
425:   PetscLogFlops(49*(2*a->nz + mbs));
426:   return(0);
427: }

431: int MatSolve_SeqSBAIJ_6(Mat A,Vec bb,Vec xx)
432: {
433:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
434:   IS              isrow=a->row;
435:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
436:   int             nz,*vj,k,ierr,*r,idx;
437:   MatScalar       *aa=a->a,*v,*d;
438:   PetscScalar     *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp;

441:   VecGetArray(bb,&b);
442:   VecGetArray(xx,&x);
443:   t  = a->solve_work;
444:   ISGetIndices(isrow,&r);

446:   /* solve U^T * D * y = b by forward substitution */
447:   tp = t;
448:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
449:     idx   = 6*r[k];
450:     tp[0] = b[idx];
451:     tp[1] = b[idx+1];
452:     tp[2] = b[idx+2];
453:     tp[3] = b[idx+3];
454:     tp[4] = b[idx+4];
455:     tp[5] = b[idx+5];
456:     tp += 6;
457:   }
458: 
459:   for (k=0; k<mbs; k++){
460:     v  = aa + 36*ai[k];
461:     vj = aj + ai[k];
462:     tp = t + k*6;
463:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
464:     nz = ai[k+1] - ai[k];
465:     tp = t + (*vj)*6;
466:     while (nz--) {
467:       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
468:       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
469:       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
470:       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
471:       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
472:       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
473:       vj++; tp = t + (*vj)*6;
474:       v += 36;
475:     }

477:     /* xk = inv(Dk)*(Dk*xk) */
478:     d  = aa+k*36;          /* ptr to inv(Dk) */
479:     tp    = t + k*6;
480:     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
481:     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
482:     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
483:     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
484:     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
485:     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
486:   }

488:   /* solve U*x = y by back substitution */
489:   for (k=mbs-1; k>=0; k--){
490:     v  = aa + 36*ai[k];
491:     vj = aj + ai[k];
492:     tp    = t + k*6;
493:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  /* xk */
494:     nz = ai[k+1] - ai[k];
495: 
496:     tp = t + (*vj)*6;
497:     while (nz--) {
498:       /* xk += U(k,:)*x(:) */
499:       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
500:       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
501:       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
502:       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
503:       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
504:       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
505:       vj++; tp = t + (*vj)*6;
506:       v += 36;
507:     }
508:     tp    = t + k*6;
509:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
510:     idx   = 6*r[k];
511:     x[idx]     = x0;
512:     x[idx+1]   = x1;
513:     x[idx+2]   = x2;
514:     x[idx+3]   = x3;
515:     x[idx+4]   = x4;
516:     x[idx+5]   = x5;
517:   }

519:   VecRestoreArray(bb,&b);
520:   VecRestoreArray(xx,&x);
521:   PetscLogFlops(36*(2*a->nz + mbs));
522:   return(0);
523: }

527: int MatSolve_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
528: {
529:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
530:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
531:   MatScalar       *aa=a->a,*v,*d;
532:   PetscScalar     *x,*xp,*b,x0,x1,x2,x3,x4,x5;
533:   int             nz,*vj,k,ierr;

536: 
537:   VecGetArray(bb,&b);
538:   VecGetArray(xx,&x);
539: 
540:   /* solve U^T * D * y = b by forward substitution */
541:   PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
542:   for (k=0; k<mbs; k++){
543:     v  = aa + 36*ai[k];
544:     xp = x + k*6;
545:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
546:     nz = ai[k+1] - ai[k];
547:     vj = aj + ai[k];
548:     xp = x + (*vj)*6;
549:     while (nz--) {
550:       /* x(:) += U(k,:)^T*(Dk*xk) */
551:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
552:       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
553:       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
554:       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
555:       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
556:       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
557:       vj++; xp = x + (*vj)*6;
558:       v += 36;
559:     }
560:     /* xk = inv(Dk)*(Dk*xk) */
561:     d  = aa+k*36;          /* ptr to inv(Dk) */
562:     xp = x + k*6;
563:     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
564:     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
565:     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
566:     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
567:     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
568:     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
569:   }

571:   /* solve U*x = y by back substitution */
572:   for (k=mbs-1; k>=0; k--){
573:     v  = aa + 36*ai[k];
574:     xp = x + k*6;
575:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
576:     nz = ai[k+1] - ai[k];
577:     vj = aj + ai[k];
578:     xp = x + (*vj)*6;
579:     while (nz--) {
580:       /* xk += U(k,:)*x(:) */
581:       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
582:       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
583:       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
584:       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
585:       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
586:       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
587:       vj++;
588:       v += 36; xp = x + (*vj)*6;
589:     }
590:     xp = x + k*6;
591:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
592:   }

594:   VecRestoreArray(bb,&b);
595:   VecRestoreArray(xx,&x);
596:   PetscLogFlops(36*(2*a->nz + mbs));
597:   return(0);
598: }

602: int MatSolve_SeqSBAIJ_5(Mat A,Vec bb,Vec xx)
603: {
604:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
605:   IS              isrow=a->row;
606:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
607:   int             nz,*vj,k,ierr,*r,idx;
608:   MatScalar       *aa=a->a,*v,*diag;
609:   PetscScalar     *x,*b,x0,x1,x2,x3,x4,*t,*tp;

612:   VecGetArray(bb,&b);
613:   VecGetArray(xx,&x);
614:   t  = a->solve_work;
615:   ISGetIndices(isrow,&r);

617:   /* solve U^T * D * y = b by forward substitution */
618:   tp = t;
619:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
620:     idx   = 5*r[k];
621:     tp[0] = b[idx];
622:     tp[1] = b[idx+1];
623:     tp[2] = b[idx+2];
624:     tp[3] = b[idx+3];
625:     tp[4] = b[idx+4];
626:     tp += 5;
627:   }
628: 
629:   for (k=0; k<mbs; k++){
630:     v  = aa + 25*ai[k];
631:     vj = aj + ai[k];
632:     tp = t + k*5;
633:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
634:     nz = ai[k+1] - ai[k];

636:     tp = t + (*vj)*5;
637:     while (nz--) {
638:       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
639:       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
640:       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
641:       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
642:       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
643:       vj++; tp = t + (*vj)*5;
644:       v += 25;
645:     }

647:     /* xk = inv(Dk)*(Dk*xk) */
648:     diag  = aa+k*25;          /* ptr to inv(Dk) */
649:     tp    = t + k*5;
650:       tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
651:       tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
652:       tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
653:       tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
654:       tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
655:   }

657:   /* solve U*x = y by back substitution */
658:   for (k=mbs-1; k>=0; k--){
659:     v  = aa + 25*ai[k];
660:     vj = aj + ai[k];
661:     tp    = t + k*5;
662:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];/* xk */
663:     nz = ai[k+1] - ai[k];
664: 
665:     tp = t + (*vj)*5;
666:     while (nz--) {
667:       /* xk += U(k,:)*x(:) */
668:       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
669:       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
670:       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
671:       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
672:       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
673:       vj++; tp = t + (*vj)*5;
674:       v += 25;
675:     }
676:     tp    = t + k*5;
677:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
678:     idx   = 5*r[k];
679:     x[idx]     = x0;
680:     x[idx+1]   = x1;
681:     x[idx+2]   = x2;
682:     x[idx+3]   = x3;
683:     x[idx+4]   = x4;
684:   }

686:   VecRestoreArray(bb,&b);
687:   VecRestoreArray(xx,&x);
688:   PetscLogFlops(25*(2*a->nz + mbs));
689:   return(0);
690: }

694: int MatSolve_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
695: {
696:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
697:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
698:   MatScalar       *aa=a->a,*v,*diag;
699:   PetscScalar     *x,*xp,*b,x0,x1,x2,x3,x4;
700:   int             nz,*vj,k,ierr;

703: 
704:   VecGetArray(bb,&b);
705:   VecGetArray(xx,&x);

707:   /* solve U^T * D * y = b by forward substitution */
708:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
709:   for (k=0; k<mbs; k++){
710:     v  = aa + 25*ai[k];
711:     xp = x + k*5;
712:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
713:     nz = ai[k+1] - ai[k];
714:     vj = aj + ai[k];
715:     xp = x + (*vj)*5;
716:     while (nz--) {
717:       /* x(:) += U(k,:)^T*(Dk*xk) */
718:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
719:       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
720:       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
721:       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
722:       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
723:       vj++; xp = x + (*vj)*5;
724:       v += 25;
725:     }
726:     /* xk = inv(Dk)*(Dk*xk) */
727:     diag = aa+k*25;          /* ptr to inv(Dk) */
728:     xp   = x + k*5;
729:     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
730:     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
731:     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
732:     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
733:     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
734:   }

736:   /* solve U*x = y by back substitution */
737:   for (k=mbs-1; k>=0; k--){
738:     v  = aa + 25*ai[k];
739:     xp = x + k*5;
740:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* xk */
741:     nz = ai[k+1] - ai[k];
742:     vj = aj + ai[k];
743:     xp = x + (*vj)*5;
744:     while (nz--) {
745:       /* xk += U(k,:)*x(:) */
746:       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
747:       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
748:       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
749:       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
750:       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
751:       vj++;
752:       v += 25; xp = x + (*vj)*5;
753:     }
754:     xp = x + k*5;
755:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
756:   }

758:   VecRestoreArray(bb,&b);
759:   VecRestoreArray(xx,&x);
760:   PetscLogFlops(25*(2*a->nz + mbs));
761:   return(0);
762: }

766: int MatSolve_SeqSBAIJ_4(Mat A,Vec bb,Vec xx)
767: {
768:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
769:   IS              isrow=a->row;
770:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
771:   int             nz,*vj,k,ierr,*r,idx;
772:   MatScalar       *aa=a->a,*v,*diag;
773:   PetscScalar     *x,*b,x0,x1,x2,x3,*t,*tp;

776:   VecGetArray(bb,&b);
777:   VecGetArray(xx,&x);
778:   t  = a->solve_work;
779:   ISGetIndices(isrow,&r);

781:   /* solve U^T * D * y = b by forward substitution */
782:   tp = t;
783:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
784:     idx   = 4*r[k];
785:     tp[0] = b[idx];
786:     tp[1] = b[idx+1];
787:     tp[2] = b[idx+2];
788:     tp[3] = b[idx+3];
789:     tp += 4;
790:   }
791: 
792:   for (k=0; k<mbs; k++){
793:     v  = aa + 16*ai[k];
794:     vj = aj + ai[k];
795:     tp = t + k*4;
796:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
797:     nz = ai[k+1] - ai[k];

799:     tp = t + (*vj)*4;
800:     while (nz--) {
801:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
802:       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
803:       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
804:       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
805:       vj++; tp = t + (*vj)*4;
806:       v += 16;
807:     }

809:     /* xk = inv(Dk)*(Dk*xk) */
810:     diag  = aa+k*16;          /* ptr to inv(Dk) */
811:     tp    = t + k*4;
812:     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
813:     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
814:     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
815:     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
816:   }

818:   /* solve U*x = y by back substitution */
819:   for (k=mbs-1; k>=0; k--){
820:     v  = aa + 16*ai[k];
821:     vj = aj + ai[k];
822:     tp    = t + k*4;
823:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
824:     nz = ai[k+1] - ai[k];
825: 
826:     tp = t + (*vj)*4;
827:     while (nz--) {
828:       /* xk += U(k,:)*x(:) */
829:       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
830:       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
831:       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
832:       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
833:       vj++; tp = t + (*vj)*4;
834:       v += 16;
835:     }
836:     tp    = t + k*4;
837:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
838:     idx        = 4*r[k];
839:     x[idx]     = x0;
840:     x[idx+1]   = x1;
841:     x[idx+2]   = x2;
842:     x[idx+3]   = x3;
843:   }

845:   VecRestoreArray(bb,&b);
846:   VecRestoreArray(xx,&x);
847:   PetscLogFlops(16*(2*a->nz + mbs));
848:   return(0);
849: }

851: /*
852:    Special case where the matrix was factored in the natural ordering. 
853:    This eliminates the need for the column and row permutation.
854: */
857: int MatSolve_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
858: {
859:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
860:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
861:   MatScalar       *aa=a->a,*v,*diag;
862:   PetscScalar     *x,*xp,*b,x0,x1,x2,x3;
863:   int             nz,*vj,k,ierr;

866: 
867:   VecGetArray(bb,&b);
868:   VecGetArray(xx,&x);

870:   /* solve U^T * D * y = b by forward substitution */
871:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
872:   for (k=0; k<mbs; k++){
873:     v  = aa + 16*ai[k];
874:     xp = x + k*4;
875:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
876:     nz = ai[k+1] - ai[k];
877:     vj = aj + ai[k];
878:     xp = x + (*vj)*4;
879:     while (nz--) {
880:       /* x(:) += U(k,:)^T*(Dk*xk) */
881:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
882:       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
883:       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
884:       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
885:       vj++; xp = x + (*vj)*4;
886:       v += 16;
887:     }
888:     /* xk = inv(Dk)*(Dk*xk) */
889:     diag = aa+k*16;          /* ptr to inv(Dk) */
890:     xp   = x + k*4;
891:     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
892:     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
893:     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
894:     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
895:   }

897:   /* solve U*x = y by back substitution */
898:   for (k=mbs-1; k>=0; k--){
899:     v  = aa + 16*ai[k];
900:     xp = x + k*4;
901:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
902:     nz = ai[k+1] - ai[k];
903:     vj = aj + ai[k];
904:     xp = x + (*vj)*4;
905:     while (nz--) {
906:       /* xk += U(k,:)*x(:) */
907:       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
908:       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
909:       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
910:       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
911:       vj++;
912:       v += 16; xp = x + (*vj)*4;
913:     }
914:     xp = x + k*4;
915:     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
916:   }

918:   VecRestoreArray(bb,&b);
919:   VecRestoreArray(xx,&x);
920:   PetscLogFlops(16*(2*a->nz + mbs));
921:   return(0);
922: }

926: int MatSolve_SeqSBAIJ_3(Mat A,Vec bb,Vec xx)
927: {
928:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
929:   IS              isrow=a->row;
930:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
931:   int             nz,*vj,k,ierr,*r,idx;
932:   MatScalar       *aa=a->a,*v,*diag;
933:   PetscScalar     *x,*b,x0,x1,x2,*t,*tp;

936:   VecGetArray(bb,&b);
937:   VecGetArray(xx,&x);
938:   t  = a->solve_work;
939:   ISGetIndices(isrow,&r);

941:   /* solve U^T * D * y = b by forward substitution */
942:   tp = t;
943:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
944:     idx   = 3*r[k];
945:     tp[0] = b[idx];
946:     tp[1] = b[idx+1];
947:     tp[2] = b[idx+2];
948:     tp += 3;
949:   }
950: 
951:   for (k=0; k<mbs; k++){
952:     v  = aa + 9*ai[k];
953:     vj = aj + ai[k];
954:     tp = t + k*3;
955:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
956:     nz = ai[k+1] - ai[k];

958:     tp = t + (*vj)*3;
959:     while (nz--) {
960:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
961:       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
962:       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
963:       vj++; tp = t + (*vj)*3;
964:       v += 9;
965:     }

967:     /* xk = inv(Dk)*(Dk*xk) */
968:     diag  = aa+k*9;          /* ptr to inv(Dk) */
969:     tp    = t + k*3;
970:     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
971:     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
972:     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
973:   }

975:   /* solve U*x = y by back substitution */
976:   for (k=mbs-1; k>=0; k--){
977:     v  = aa + 9*ai[k];
978:     vj = aj + ai[k];
979:     tp    = t + k*3;
980:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
981:     nz = ai[k+1] - ai[k];
982: 
983:     tp = t + (*vj)*3;
984:     while (nz--) {
985:       /* xk += U(k,:)*x(:) */
986:       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
987:       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
988:       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
989:       vj++; tp = t + (*vj)*3;
990:       v += 9;
991:     }
992:     tp    = t + k*3;
993:     tp[0] = x0; tp[1] = x1; tp[2] = x2;
994:     idx      = 3*r[k];
995:     x[idx]   = x0;
996:     x[idx+1] = x1;
997:     x[idx+2] = x2;
998:   }

1000:   VecRestoreArray(bb,&b);
1001:   VecRestoreArray(xx,&x);
1002:   PetscLogFlops(9*(2*a->nz + mbs));
1003:   return(0);
1004: }

1006: /*
1007:    Special case where the matrix was factored in the natural ordering. 
1008:    This eliminates the need for the column and row permutation.
1009: */
1012: int MatSolve_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1013: {
1014:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
1015:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
1016:   MatScalar       *aa=a->a,*v,*diag;
1017:   PetscScalar     *x,*xp,*b,x0,x1,x2;
1018:   int             nz,*vj,k,ierr;

1021: 
1022:   VecGetArray(bb,&b);
1023:   VecGetArray(xx,&x);

1025:   /* solve U^T * D * y = b by forward substitution */
1026:   PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1027:   for (k=0; k<mbs; k++){
1028:     v  = aa + 9*ai[k];
1029:     xp = x + k*3;
1030:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1031:     nz = ai[k+1] - ai[k];
1032:     vj = aj + ai[k];
1033:     xp = x + (*vj)*3;
1034:     while (nz--) {
1035:       /* x(:) += U(k,:)^T*(Dk*xk) */
1036:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1037:       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1038:       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1039:       vj++; xp = x + (*vj)*3;
1040:       v += 9;
1041:     }
1042:     /* xk = inv(Dk)*(Dk*xk) */
1043:     diag = aa+k*9;          /* ptr to inv(Dk) */
1044:     xp   = x + k*3;
1045:     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1046:     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1047:     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1048:   }

1050:   /* solve U*x = y by back substitution */
1051:   for (k=mbs-1; k>=0; k--){
1052:     v  = aa + 9*ai[k];
1053:     xp = x + k*3;
1054:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1055:     nz = ai[k+1] - ai[k];
1056:     vj = aj + ai[k];
1057:     xp = x + (*vj)*3;
1058:     while (nz--) {
1059:       /* xk += U(k,:)*x(:) */
1060:       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1061:       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1062:       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1063:       vj++;
1064:       v += 9; xp = x + (*vj)*3;
1065:     }
1066:     xp = x + k*3;
1067:     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1068:   }

1070:   VecRestoreArray(bb,&b);
1071:   VecRestoreArray(xx,&x);
1072:   PetscLogFlops(9*(2*a->nz + mbs));
1073:   return(0);
1074: }

1078: int MatSolve_SeqSBAIJ_2(Mat A,Vec bb,Vec xx)
1079: {
1080:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ *)A->data;
1081:   IS              isrow=a->row;
1082:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
1083:   int             nz,*vj,k,k2,ierr,*r,idx;
1084:   MatScalar       *aa=a->a,*v,*diag;
1085:   PetscScalar     *x,*b,x0,x1,*t;

1088:   VecGetArray(bb,&b);
1089:   VecGetArray(xx,&x);
1090:   t  = a->solve_work;
1091:   /* printf("called MatSolve_SeqSBAIJ_2\n"); */
1092:   ISGetIndices(isrow,&r);

1094:   /* solve U^T * D * y = perm(b) by forward substitution */
1095:   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1096:     idx = 2*r[k];
1097:     t[k*2]   = b[idx];
1098:     t[k*2+1] = b[idx+1];
1099:   }
1100:   for (k=0; k<mbs; k++){
1101:     v  = aa + 4*ai[k];
1102:     vj = aj + ai[k];
1103:     k2 = k*2;
1104:     x0 = t[k2]; x1 = t[k2+1];
1105:     nz = ai[k+1] - ai[k];
1106:     while (nz--) {
1107:       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1108:       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1109:       vj++; v += 4;
1110:     }
1111:     diag = aa+k*4;  /* ptr to inv(Dk) */
1112:     t[k2]   = diag[0]*x0 + diag[2]*x1;
1113:     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1114:   }

1116:   /* solve U*x = y by back substitution */
1117:   for (k=mbs-1; k>=0; k--){
1118:     v  = aa + 4*ai[k];
1119:     vj = aj + ai[k];
1120:     k2 = k*2;
1121:     x0 = t[k2]; x1 = t[k2+1];
1122:     nz = ai[k+1] - ai[k];
1123:     while (nz--) {
1124:       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1125:       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1126:       vj++; v += 4;
1127:     }
1128:     t[k2]    = x0;
1129:     t[k2+1]  = x1;
1130:     idx      = 2*r[k];
1131:     x[idx]   = x0;
1132:     x[idx+1] = x1;
1133:   }

1135:   ISRestoreIndices(isrow,&r);
1136:   VecRestoreArray(bb,&b);
1137:   VecRestoreArray(xx,&x);
1138:   PetscLogFlops(4*(2*a->nz + mbs));
1139:   return(0);
1140: }

1142: /*
1143:    Special case where the matrix was factored in the natural ordering. 
1144:    This eliminates the need for the column and row permutation.
1145: */
1148: int MatSolve_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1149: {
1150:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
1151:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
1152:   MatScalar       *aa=a->a,*v,*diag;
1153:   PetscScalar     *x,*b,x0,x1;
1154:   int             nz,*vj,k,k2,ierr;

1157: 
1158:   VecGetArray(bb,&b);
1159:   VecGetArray(xx,&x);

1161:   /* solve U^T * D * y = b by forward substitution */
1162:   PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1163:   for (k=0; k<mbs; k++){
1164:     v  = aa + 4*ai[k];
1165:     vj = aj + ai[k];
1166:     k2 = k*2;
1167:     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1168:     nz = ai[k+1] - ai[k];
1169: 
1170:     while (nz--) {
1171:       /* x(:) += U(k,:)^T*(Dk*xk) */
1172:       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1173:       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1174:       vj++; v += 4;
1175:     }
1176:     /* xk = inv(Dk)*(Dk*xk) */
1177:     diag = aa+k*4;          /* ptr to inv(Dk) */
1178:     x[k2]   = diag[0]*x0 + diag[2]*x1;
1179:     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1180:   }

1182:   /* solve U*x = y by back substitution */
1183:   for (k=mbs-1; k>=0; k--){
1184:     v  = aa + 4*ai[k];
1185:     vj = aj + ai[k];
1186:     k2 = k*2;
1187:     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1188:     nz = ai[k+1] - ai[k];
1189:     while (nz--) {
1190:       /* xk += U(k,:)*x(:) */
1191:       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1192:       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1193:       vj++; v += 4;
1194:     }
1195:     x[k2]     = x0;
1196:     x[k2+1]   = x1;
1197:   }

1199:   VecRestoreArray(bb,&b);
1200:   VecRestoreArray(xx,&x);
1201:   PetscLogFlops(4*(2*a->nz + mbs)); /* bs2*(2*a->nz + mbs) */
1202:   return(0);
1203: }

1207: int MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1208: {
1209:   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ *)A->data;
1210:   IS              isrow=a->row;
1211:   int             mbs=a->mbs,*ai=a->i,*aj=a->j,ierr,*rip;
1212:   MatScalar       *aa=a->a,*v;
1213:   PetscScalar     *x,*b,xk,*t;
1214:   int             nz,*vj,k;

1217:   if (!mbs) return(0);

1219:   VecGetArray(bb,&b);
1220:   VecGetArray(xx,&x);
1221:   t    = a->solve_work;

1223:   ISGetIndices(isrow,&rip);
1224: 
1225:   /* solve U^T*D*y = perm(b) by forward substitution */
1226:   for (k=0; k<mbs; k++) t[k] = b[rip[k]];
1227:   for (k=0; k<mbs; k++){
1228:     v  = aa + ai[k];
1229:     vj = aj + ai[k];
1230:     xk = t[k];
1231:     nz = ai[k+1] - ai[k];
1232:     while (nz--) t[*vj++] += (*v++) * xk;
1233:     t[k] = xk*aa[k];  /* note: aa[k] = 1/D(k) */
1234:   }

1236:   /* solve U*x = y by back substitution */
1237:   for (k=mbs-1; k>=0; k--){
1238:     v  = aa + ai[k];
1239:     vj = aj + ai[k];
1240:     xk = t[k];
1241:     nz = ai[k+1] - ai[k];
1242:     while (nz--) xk += (*v++) * t[*vj++];
1243:     t[k]      = xk;
1244:     x[rip[k]] = xk;
1245:   }

1247:   ISRestoreIndices(isrow,&rip);
1248:   VecRestoreArray(bb,&b);
1249:   VecRestoreArray(xx,&x);
1250:   PetscLogFlops(4*a->nz + A->m);
1251:   return(0);
1252: }

1256: int MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1257: {
1258:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1259:   int          ierr;

1262:   if (a->bs == 1) {
1263:     MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1264:   } else {
1265:     IS              isrow=a->row;
1266:     int             mbs=a->mbs,*ai=a->i,*aj=a->j,*rip,i;
1267:     MatScalar       *aa=a->a,*v;
1268:     PetscScalar     *x,*b,*t;
1269:     int             nz,*vj,k,n;
1270:     if (bb->n > a->solves_work_n) {
1271:       if (a->solves_work) {PetscFree(a->solves_work);}
1272:       PetscMalloc(bb->n*A->m*sizeof(PetscScalar),&a->solves_work);
1273:       a->solves_work_n = bb->n;
1274:     }
1275:     n    = bb->n;
1276:     VecGetArray(bb->v,&b);
1277:     VecGetArray(xx->v,&x);
1278:     t    = a->solves_work;

1280:     ISGetIndices(isrow,&rip);
1281: 
1282:     /* solve U^T*D*y = perm(b) by forward substitution */
1283:     for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rip[k]+i*mbs];} /* values are stored interlaced in t */
1284:     for (k=0; k<mbs; k++){
1285:       v  = aa + ai[k];
1286:       vj = aj + ai[k];
1287:       nz = ai[k+1] - ai[k];
1288:       while (nz--) {
1289:         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1290:         v++;vj++;
1291:       }
1292:       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
1293:     }
1294: 
1295:     /* solve U*x = y by back substitution */
1296:     for (k=mbs-1; k>=0; k--){
1297:       v  = aa + ai[k];
1298:       vj = aj + ai[k];
1299:       nz = ai[k+1] - ai[k];
1300:       while (nz--) {
1301:         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1302:         v++;vj++;
1303:       }
1304:       for (i=0; i<n; i++) x[rip[k]+i*mbs] = t[n*k+i];
1305:     }

1307:     ISRestoreIndices(isrow,&rip);
1308:     VecRestoreArray(bb->v,&b);
1309:     VecRestoreArray(xx->v,&x);
1310:     PetscLogFlops(bb->n*(4*a->nz + A->m));
1311:   }
1312:   return(0);
1313: }

1315: /*
1316:       Special case where the matrix was ILU(0) factored in the natural
1317:    ordering. This eliminates the need for the column and row permutation.
1318: */
1321: int MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1322: {
1323:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1324:   int             mbs=a->mbs,*ai=a->i,*aj=a->j,ierr;
1325:   MatScalar       *aa=a->a,*v;
1326:   PetscScalar     *x,*b,xk;
1327:   int             nz,*vj,k;

1330:   VecGetArray(bb,&b);
1331:   VecGetArray(xx,&x);
1332: 
1333:   /* solve U^T*D*y = b by forward substitution */
1334:   PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
1335:   for (k=0; k<mbs; k++){
1336:     v  = aa + ai[k] + 1;
1337:     vj = aj + ai[k] + 1;
1338:     xk = x[k];
1339:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
1340:     while (nz--) x[*vj++] += (*v++) * xk;
1341:     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
1342:   }

1344:   /* solve U*x = y by back substitution */
1345:   for (k=mbs-2; k>=0; k--){
1346:     v  = aa + ai[k] + 1;
1347:     vj = aj + ai[k] + 1;
1348:     xk = x[k];
1349:     nz = ai[k+1] - ai[k] - 1;
1350:     while (nz--) xk += (*v++) * x[*vj++];
1351:     x[k] = xk;
1352:   }

1354:   VecRestoreArray(bb,&b);
1355:   VecRestoreArray(xx,&x);
1356:   PetscLogFlops(4*a->nz + A->m);
1357:   return(0);
1358: }

1360: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
1363: int MatICCFactorSymbolic_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info,Mat *B)
1364: {
1365:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
1366:   int         *rip,ierr,i,mbs = a->mbs,*ai = a->i,*aj = a->j;
1367:   int         *jutmp,bs = a->bs,bs2=a->bs2;
1368:   int         m,realloc = 0,*levtmp;
1369:   int         *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd,*jl;
1370:   int         incrlev,*lev,shift,prow,nz;
1371:   int         *il,ili,nextprow;
1372:   PetscReal   f = info->fill,levels = info->levels;
1373:   PetscTruth  perm_identity;

1376:   /* check whether perm is the identity mapping */
1377:   ISIdentity(perm,&perm_identity);

1379:   /* special case that simply copies fill pattern */
1380:   if (!levels && perm_identity && bs==1) {
1381:     MatDuplicate_SeqSBAIJ(A,MAT_DO_NOT_COPY_VALUES,B);
1382:     (*B)->factor    = FACTOR_CHOLESKY;
1383:     b               = (Mat_SeqSBAIJ*)(*B)->data;
1384:     b->row          = perm;
1385:     b->icol         = perm;
1386:     b->factor_damping   = info->damping;
1387:     b->factor_shift     = info->shift;
1388:     b->factor_zeropivot = info->zeropivot;
1389:     PetscObjectReference((PetscObject)perm);
1390:     PetscObjectReference((PetscObject)perm);
1391:     PetscMalloc(((*B)->m+1)*sizeof(PetscScalar),&b->solve_work);
1392:     (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1393:     (*B)->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1394:     return(0);
1395:   }

1397:   /* --- inplace icc(levels), levels>0, ie, *B has same data structure as A --- */
1398:   if (levels > 0 && perm_identity && bs==1 ){
1399:     if (!perm_identity) a->permute = PETSC_TRUE;
1400: 
1401:   ISGetIndices(perm,&rip);
1402: 
1403:   if (perm_identity){ /* without permutation */
1404:     ai = a->i; aj = a->j;
1405:   } else {            /* non-trivial permutation */
1406:     MatReorderingSeqSBAIJ(A,perm);
1407:     ai = a->inew; aj = a->jnew;
1408:   }
1409: 
1410:   /* initialization */
1411:   PetscMalloc((mbs+1)*sizeof(int),&iu);
1412:   umax  = (int)(f*ai[mbs] + 1);
1413:   PetscMalloc(umax*sizeof(int),&lev);
1414:   PetscMalloc(umax*sizeof(int),&ju);
1415:   iu[0] = 0;
1416:   juidx = 0; /* index for ju */
1417:   PetscMalloc((4*mbs+1)*sizeof(int),&jl); /* linked list for getting pivot row */
1418:   q      = jl + mbs;   /* linked list for col index of active row */
1419:   levtmp = q + mbs;
1420:   il     = levtmp + mbs;
1421:   for (i=0; i<mbs; i++){
1422:     jl[i] = mbs;
1423:     q[i]  = 0;
1424:     il[i] = 0;
1425:   }

1427:   /* for each row k */
1428:   for (k=0; k<mbs; k++){
1429:     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
1430:     q[k] = mbs;
1431:     /* initialize nonzero structure of k-th row to row rip[k] of A */
1432:     jmin = ai[rip[k]] +1; /* exclude diag[k] */
1433:     jmax = ai[rip[k]+1];
1434:     for (j=jmin; j<jmax; j++){
1435:       vj = rip[aj[j]]; /* col. value */
1436:       if(vj > k){
1437:         qm = k;
1438:         do {
1439:           m  = qm; qm = q[m];
1440:         } while(qm < vj);
1441:         if (qm == vj) {
1442:           SETERRQ(1," error: duplicate entry in A\n");
1443:         }
1444:         nzk++;
1445:         q[m]  = vj;
1446:         q[vj] = qm;
1447:         levtmp[vj] = 0;   /* initialize lev for nonzero element */
1448:       } /* if(vj > k) */
1449:     } /* for (j=jmin; j<jmax; j++) */

1451:     /* modify nonzero structure of k-th row by computing fill-in
1452:        for each row i to be merged in */
1453:     prow = k;
1454:     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
1455: 
1456:     while (prow < k){
1457:       nextprow = jl[prow];
1458:       /* merge row prow into k-th row */
1459:       ili  = il[prow];
1460:       jmin = ili + 1;  /* points to 2nd nzero entry in U(prow,k:mbs-1) */
1461:       jmax = iu[prow+1];
1462:       qm   = k;
1463:       for (j=jmin; j<jmax; j++){
1464:         vj = ju[j];
1465:         incrlev = lev[j] + 1;
1466:         if (incrlev > levels) continue;
1467:         do {
1468:           m = qm; qm = q[m];
1469:         } while (qm < vj);
1470:         if (qm != vj){  /* a fill */
1471:           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
1472:           levtmp[vj]  = incrlev;
1473:         } else {
1474:           if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
1475:         }
1476:       }
1477:       if (jmin < jmax){ /* update il and jl */
1478:         il[prow] = jmin;
1479:         j = ju[jmin];
1480:         jl[prow] = jl[j]; jl[j] = prow;
1481:       }
1482:       prow = nextprow;
1483:     }
1484: 
1485:     /* add the first nonzero element in U(k, k+1:mbs-1) to jl */
1486:     if (nzk > 0){
1487:       i = q[k]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1488:       jl[k] = jl[i]; jl[i] = k;
1489:       il[k] = iu[k] + 1;
1490:     }
1491:     iu[k+1] = iu[k] + nzk + 1;  /* include diag[k] */

1493:     /* allocate more space to ju if needed */
1494:     if (iu[k+1] > umax) {
1495:       /* estimate how much additional space we will need */
1496:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
1497:       /* just double the memory each time */
1498:       maxadd = umax;
1499:       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
1500:       umax += maxadd;

1502:       /* allocate a longer ju */
1503:       PetscMalloc(umax*sizeof(int),&jutmp);
1504:       PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));
1505:       PetscFree(ju);
1506:       ju   = jutmp;

1508:       PetscMalloc(umax*sizeof(int),&jutmp);
1509:       PetscMemcpy(jutmp,lev,(iu[k])*sizeof(int));
1510:       PetscFree(lev);
1511:       lev      = jutmp;
1512:       realloc += 2; /* count how many times we realloc */
1513:     }

1515:     /* save nonzero structure of k-th row in ju */
1516:     ju[juidx]  = k; /* diag[k] */
1517:     lev[juidx] = 0;
1518:     juidx++;
1519:     i = k;
1520:     while (nzk --) {
1521:       i           = q[i];
1522:       ju[juidx] = i;
1523:       lev[juidx] = levtmp[i];
1524:       juidx++;
1525:     }
1526:   } /* end of for (k=0; k<mbs; k++) */

1528:   if (ai[mbs] != 0) {
1529:     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1530:     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
1531:     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af);
1532:     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af);
1533:     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
1534:   } else {
1535:      PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
1536:   }

1538:   ISRestoreIndices(perm,&rip);
1539:   PetscFree(jl);
1540:   PetscFree(lev);

1542:   /* put together the new matrix */
1543:   MatCreate(A->comm,bs*mbs,bs*mbs,bs*mbs,bs*mbs,B);
1544:   MatSetType(*B,A->type_name);
1545:   MatSeqSBAIJSetPreallocation(*B,bs,0,PETSC_NULL);

1547:   /* PetscLogObjectParent(*B,iperm); */
1548:   b = (Mat_SeqSBAIJ*)(*B)->data;
1549:   PetscFree(b->imax);
1550:   b->singlemalloc = PETSC_FALSE;
1551:   /* the next line frees the default space generated by the Create() */
1552:   PetscFree(b->a);
1553:   PetscFree(b->ilen);
1554:   PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
1555:   b->j    = ju;
1556:   b->i    = iu;
1557:   b->diag = 0;
1558:   b->ilen = 0;
1559:   b->imax = 0;
1560:   b->row  = perm;
1561:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1562:   PetscObjectReference((PetscObject)perm);
1563:   b->icol = perm;
1564:   PetscObjectReference((PetscObject)perm);
1565:   PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
1566:   /* In b structure:  Free imax, ilen, old a, old j.  
1567:      Allocate idnew, solve_work, new a, new j */
1568:   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
1569:   b->maxnz          = b->nz = iu[mbs];
1570:   b->factor_damping   = info->damping;
1571:   b->factor_shift     = info->shift;
1572:   b->factor_zeropivot = info->zeropivot;

1574:   (*B)->factor                 = FACTOR_CHOLESKY;
1575:   (*B)->info.factor_mallocs    = realloc;
1576:   (*B)->info.fill_ratio_given  = f;
1577:   if (ai[mbs] != 0) {
1578:     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1579:   } else {
1580:     (*B)->info.fill_ratio_needed = 0.0;
1581:   }


1584:   (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1585:   (*B)->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1586:   PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
1587: 
1588:   return(0);
1589:   } /* end of if (levels > 0 && perm_identity && bs==1 ) */

1591:   if (!perm_identity) a->permute = PETSC_TRUE;
1592:   if (perm_identity){
1593:     ai = a->i; aj = a->j;
1594:   } else { /*  non-trivial permutation */
1595:     MatReorderingSeqSBAIJ(A, perm);
1596:     ai = a->inew; aj = a->jnew;
1597:   }
1598: 
1599:   /* initialization */
1600:   ISGetIndices(perm,&rip);
1601:   umax  = (int)(f*ai[mbs] + 1);
1602:   PetscMalloc(umax*sizeof(int),&lev);
1603:   umax += mbs + 1;
1604:   shift = mbs + 1;
1605:   PetscMalloc((mbs+1)*sizeof(int),&iu);
1606:   PetscMalloc(umax*sizeof(int),&ju);
1607:   iu[0] = mbs + 1;
1608:   juidx = mbs + 1;
1609:   /* prowl: linked list for pivot row */
1610:   PetscMalloc((3*mbs+1)*sizeof(int),&prowl);
1611:   /* q: linked list for col index */
1612:   q       = prowl + mbs;
1613:   levtmp  = q     + mbs;
1614: 
1615:   for (i=0; i<mbs; i++){
1616:     prowl[i] = mbs;
1617:     q[i] = 0;
1618:   }

1620:   /* for each row k */
1621:   for (k=0; k<mbs; k++){
1622:     nzk  = 0;
1623:     q[k] = mbs;
1624:     /* copy current row into linked list */
1625:     nz = ai[rip[k]+1] - ai[rip[k]];
1626:     j = ai[rip[k]];
1627:     while (nz--){
1628:       vj = rip[aj[j++]];
1629:       if (vj > k){
1630:         qm = k;
1631:         do {
1632:           m  = qm; qm = q[m];
1633:         } while(qm < vj);
1634:         if (qm == vj) {
1635:           SETERRQ(1," error: duplicate entry in A\n");
1636:         }
1637:         nzk++;
1638:         q[m]       = vj;
1639:         q[vj]      = qm;
1640:         levtmp[vj] = 0;   /* initialize lev for nonzero element */
1641:       }
1642:     }

1644:     /* modify nonzero structure of k-th row by computing fill-in
1645:        for each row prow to be merged in */
1646:     prow = k;
1647:     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
1648: 
1649:     while (prow < k){
1650:       /* merge row prow into k-th row */
1651:       jmin = iu[prow] + 1;
1652:       jmax = iu[prow+1];
1653:       qm = k;
1654:       for (j=jmin; j<jmax; j++){
1655:         incrlev = lev[j-shift] + 1;
1656:         if (incrlev > levels) continue;

1658:         vj      = ju[j];
1659:         do {
1660:           m = qm; qm = q[m];
1661:         } while (qm < vj);
1662:         if (qm != vj){      /* a fill */
1663:           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
1664:           levtmp[vj] = incrlev;
1665:         } else {
1666:           if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
1667:         }
1668:       }
1669:       prow = prowl[prow]; /* next pivot row */
1670:     }
1671: 
1672:     /* add k to row list for first nonzero element in k-th row */
1673:     if (nzk > 1){
1674:       i = q[k]; /* col value of first nonzero element in k_th row of U */
1675:       prowl[k] = prowl[i]; prowl[i] = k;
1676:     }
1677:     iu[k+1] = iu[k] + nzk;

1679:     /* allocate more space to ju and lev if needed */
1680:     if (iu[k+1] > umax) {
1681:       /* estimate how much additional space we will need */
1682:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
1683:       /* just double the memory each time */
1684:       maxadd = umax;
1685:       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
1686:       umax += maxadd;

1688:       /* allocate a longer ju */
1689:       PetscMalloc(umax*sizeof(int),&jutmp);
1690:       PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));
1691:       PetscFree(ju);
1692:       ju       = jutmp;

1694:       PetscMalloc(umax*sizeof(int),&jutmp);
1695:       PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(int));
1696:       PetscFree(lev);
1697:       lev      = jutmp;
1698:       realloc += 2; /* count how many times we realloc */
1699:     }

1701:     /* save nonzero structure of k-th row in ju */
1702:     i=k;
1703:     while (nzk --) {
1704:       i                = q[i];
1705:       ju[juidx]        = i;
1706:       lev[juidx-shift] = levtmp[i];
1707:       juidx++;
1708:     }
1709:   }
1710: 
1711:   if (ai[mbs] != 0) {
1712:     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1713:     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
1714:     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Run with -pc_icc_fill %g or use \n",af);
1715:     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:PCICCSetFill(pc,%g);\n",af);
1716:     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:for best performance.\n");
1717:   } else {
1718:     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
1719:   }

1721:   ISRestoreIndices(perm,&rip);
1722:   PetscFree(prowl);
1723:   PetscFree(lev);

1725:   /* put together the new matrix */
1726:   MatCreate(A->comm,bs*mbs,bs*mbs,bs*mbs,bs*mbs,B);
1727:   MatSetType(*B,A->type_name);
1728:   MatSeqSBAIJSetPreallocation(*B,bs,0,PETSC_NULL);

1730:   /* PetscLogObjectParent(*B,iperm); */
1731:   b    = (Mat_SeqSBAIJ*)(*B)->data;
1732:   PetscFree(b->imax);
1733:   b->singlemalloc = PETSC_FALSE;
1734:   /* the next line frees the default space generated by the Create() */
1735:   PetscFree(b->a);
1736:   PetscFree(b->ilen);
1737:   PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
1738:   b->j    = ju;
1739:   b->i    = iu;
1740:   b->diag = 0;
1741:   b->ilen = 0;
1742:   b->imax = 0;
1743: 
1744:   if (b->row) {
1745:     ISDestroy(b->row);
1746:   }
1747:   if (b->icol) {
1748:     ISDestroy(b->icol);
1749:   }
1750:   b->row  = perm;
1751:   b->icol = perm;
1752:   PetscObjectReference((PetscObject)perm);
1753:   PetscObjectReference((PetscObject)perm);
1754:   PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
1755:   /* In b structure:  Free imax, ilen, old a, old j.  
1756:      Allocate idnew, solve_work, new a, new j */
1757:   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
1758:   b->maxnz = b->nz = iu[mbs];
1759: 
1760:   (*B)->factor                 = FACTOR_CHOLESKY;
1761:   (*B)->info.factor_mallocs    = realloc;
1762:   (*B)->info.fill_ratio_given  = f;
1763:   if (ai[mbs] != 0) {
1764:     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1765:   } else {
1766:     (*B)->info.fill_ratio_needed = 0.0;
1767:   }

1769:   if (perm_identity){
1770:     switch (bs) {
1771:       case 1:
1772:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1773:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1774:         (*B)->ops->solves                = MatSolves_SeqSBAIJ_1;
1775:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJl:Using special in-place natural ordering factor and solve BS=1\n");
1776:         break;
1777:       case 2:
1778:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
1779:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_2_NaturalOrdering;
1780:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n");
1781:         break;
1782:       case 3:
1783:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
1784:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_3_NaturalOrdering;
1785:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n");
1786:         break;
1787:       case 4:
1788:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
1789:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_4_NaturalOrdering;
1790:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n");
1791:         break;
1792:       case 5:
1793:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
1794:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_5_NaturalOrdering;
1795:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n");
1796:         break;
1797:       case 6:
1798:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
1799:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_6_NaturalOrdering;
1800:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n");
1801:         break;
1802:       case 7:
1803:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
1804:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_7_NaturalOrdering;
1805:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n");
1806:       break;
1807:       default:
1808:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
1809:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_N_NaturalOrdering;
1810:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n");
1811:       break;
1812:     }
1813:   }

1815:   return(0);
1816: }