Actual source code: sbaijfact2.c
1: /*$Id: sbaijfact2.c,v 1.2.1.41 2001/08/07 03:03:01 balay Exp $*/
2: /*
3: Factorization code for SBAIJ format.
4: */
6: #include src/mat/impls/sbaij/seq/sbaij.h
7: #include src/mat/impls/baij/seq/baij.h
8: #include src/inline/ilu.h
9: #include src/inline/dot.h
13: int MatSolveTranspose_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14: {
16: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
17: }
21: int MatSolveTranspose_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
22: {
24: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
25: }
29: int MatSolveTranspose_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
30: {
32: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
33: }
37: int MatSolveTranspose_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
38: {
40: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
41: }
45: int MatSolveTranspose_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
46: {
48: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
49: }
53: int MatSolveTranspose_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
54: {
56: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
57: }
61: int MatSolveTranspose_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
62: {
64: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
65: }
69: int MatSolveTranspose_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
70: {
72: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
73: }
77: int MatSolveTranspose_SeqSBAIJ_2(Mat A,Vec bb,Vec xx)
78: {
80: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
81: }
85: int MatSolveTranspose_SeqSBAIJ_3(Mat A,Vec bb,Vec xx)
86: {
88: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
89: }
93: int MatSolveTranspose_SeqSBAIJ_4(Mat A,Vec bb,Vec xx)
94: {
96: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
97: }
101: int MatSolveTranspose_SeqSBAIJ_5(Mat A,Vec bb,Vec xx)
102: {
104: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
105: }
109: int MatSolveTranspose_SeqSBAIJ_6(Mat A,Vec bb,Vec xx)
110: {
112: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
113: }
117: int MatSolveTranspose_SeqSBAIJ_7(Mat A,Vec bb,Vec xx)
118: {
120: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
121: }
125: int MatSolve_SeqSBAIJ_N(Mat A,Vec bb,Vec xx)
126: {
127: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
128: IS isrow=a->row;
129: int mbs=a->mbs,*ai=a->i,*aj=a->j;
130: int nz,*vj,k,ierr,*r,idx,k1;
131: int bs=a->bs,bs2 = a->bs2;
132: MatScalar *aa=a->a,*v,*diag;
133: PetscScalar *x,*xk,*xj,*b,*xk_tmp,*t;
136: VecGetArray(bb,&b);
137: VecGetArray(xx,&x);
138: t = a->solve_work;
139: ISGetIndices(isrow,&r);
140: PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);
142: /* solve U^T * D * y = b by forward substitution */
143: xk = t;
144: for (k=0; k<mbs; k++) { /* t <- perm(b) */
145: idx = bs*r[k];
146: for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
147: }
148: for (k=0; k<mbs; k++){
149: v = aa + bs2*ai[k];
150: xk = t + k*bs; /* Dk*xk = k-th block of x */
151: PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
152: nz = ai[k+1] - ai[k];
153: vj = aj + ai[k];
154: xj = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
155: while (nz--) {
156: /* x(:) += U(k,:)^T*(Dk*xk) */
157: Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
158: vj++; xj = t + (*vj)*bs;
159: v += bs2;
160: }
161: /* xk = inv(Dk)*(Dk*xk) */
162: diag = aa+k*bs2; /* ptr to inv(Dk) */
163: Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
164: }
166: /* solve U*x = y by back substitution */
167: for (k=mbs-1; k>=0; k--){
168: v = aa + bs2*ai[k];
169: xk = t + k*bs; /* xk */
170: nz = ai[k+1] - ai[k];
171: vj = aj + ai[k];
172: xj = t + (*vj)*bs;
173: while (nz--) {
174: /* xk += U(k,:)*x(:) */
175: Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
176: vj++;
177: v += bs2; xj = t + (*vj)*bs;
178: }
179: idx = bs*r[k];
180: for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
181: }
183: PetscFree(xk_tmp);
184: VecRestoreArray(bb,&b);
185: VecRestoreArray(xx,&x);
186: PetscLogFlops(bs2*(2*a->nz + mbs));
187: return(0);
188: }
192: int MatSolve_SeqSBAIJ_N_NaturalOrdering(Mat A,Vec bb,Vec xx)
193: {
194: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
195: int mbs=a->mbs,*ai=a->i,*aj=a->j;
196: int nz,*vj,k,ierr;
197: int bs=a->bs,bs2 = a->bs2;
198: MatScalar *aa=a->a,*v,*diag;
199: PetscScalar *x,*xk,*xj,*b,*xk_tmp;
202:
203: VecGetArray(bb,&b);
204: VecGetArray(xx,&x);
206: PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);
208: /* solve U^T * D * y = b by forward substitution */
209: PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
210: for (k=0; k<mbs; k++){
211: v = aa + bs2*ai[k];
212: xk = x + k*bs; /* Dk*xk = k-th block of x */
213: PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
214: nz = ai[k+1] - ai[k];
215: vj = aj + ai[k];
216: xj = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
217: while (nz--) {
218: /* x(:) += U(k,:)^T*(Dk*xk) */
219: Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
220: vj++; xj = x + (*vj)*bs;
221: v += bs2;
222: }
223: /* xk = inv(Dk)*(Dk*xk) */
224: diag = aa+k*bs2; /* ptr to inv(Dk) */
225: Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
226: }
228: /* solve U*x = y by back substitution */
229: for (k=mbs-1; k>=0; k--){
230: v = aa + bs2*ai[k];
231: xk = x + k*bs; /* xk */
232: nz = ai[k+1] - ai[k];
233: vj = aj + ai[k];
234: xj = x + (*vj)*bs;
235: while (nz--) {
236: /* xk += U(k,:)*x(:) */
237: Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
238: vj++;
239: v += bs2; xj = x + (*vj)*bs;
240: }
241: }
243: PetscFree(xk_tmp);
244: VecRestoreArray(bb,&b);
245: VecRestoreArray(xx,&x);
246: PetscLogFlops(bs2*(2*a->nz + mbs));
247: return(0);
248: }
252: int MatSolve_SeqSBAIJ_7(Mat A,Vec bb,Vec xx)
253: {
254: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
255: IS isrow=a->row;
256: int mbs=a->mbs,*ai=a->i,*aj=a->j;
257: int nz,*vj,k,ierr,*r,idx;
258: MatScalar *aa=a->a,*v,*d;
259: PetscScalar *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
262: VecGetArray(bb,&b);
263: VecGetArray(xx,&x);
264: t = a->solve_work;
265: ISGetIndices(isrow,&r);
267: /* solve U^T * D * y = b by forward substitution */
268: tp = t;
269: for (k=0; k<mbs; k++) { /* t <- perm(b) */
270: idx = 7*r[k];
271: tp[0] = b[idx];
272: tp[1] = b[idx+1];
273: tp[2] = b[idx+2];
274: tp[3] = b[idx+3];
275: tp[4] = b[idx+4];
276: tp[5] = b[idx+5];
277: tp[6] = b[idx+6];
278: tp += 7;
279: }
280:
281: for (k=0; k<mbs; k++){
282: v = aa + 49*ai[k];
283: vj = aj + ai[k];
284: tp = t + k*7;
285: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
286: nz = ai[k+1] - ai[k];
287: tp = t + (*vj)*7;
288: while (nz--) {
289: tp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
290: tp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
291: tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
292: tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
293: tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
294: tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
295: tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
296: vj++; tp = t + (*vj)*7;
297: v += 49;
298: }
300: /* xk = inv(Dk)*(Dk*xk) */
301: d = aa+k*49; /* ptr to inv(Dk) */
302: tp = t + k*7;
303: tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
304: tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
305: tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
306: tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
307: tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
308: tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
309: tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
310: }
312: /* solve U*x = y by back substitution */
313: for (k=mbs-1; k>=0; k--){
314: v = aa + 49*ai[k];
315: vj = aj + ai[k];
316: tp = t + k*7;
317: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; /* xk */
318: nz = ai[k+1] - ai[k];
319:
320: tp = t + (*vj)*7;
321: while (nz--) {
322: /* xk += U(k,:)*x(:) */
323: x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
324: x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
325: x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
326: x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
327: x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
328: x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
329: x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
330: vj++; tp = t + (*vj)*7;
331: v += 49;
332: }
333: tp = t + k*7;
334: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
335: idx = 7*r[k];
336: x[idx] = x0;
337: x[idx+1] = x1;
338: x[idx+2] = x2;
339: x[idx+3] = x3;
340: x[idx+4] = x4;
341: x[idx+5] = x5;
342: x[idx+6] = x6;
343: }
345: VecRestoreArray(bb,&b);
346: VecRestoreArray(xx,&x);
347: PetscLogFlops(49*(2*a->nz + mbs));
348: return(0);
349: }
353: int MatSolve_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
354: {
355: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
356: int mbs=a->mbs,*ai=a->i,*aj=a->j;
357: MatScalar *aa=a->a,*v,*d;
358: PetscScalar *x,*xp,*b,x0,x1,x2,x3,x4,x5,x6;
359: int nz,*vj,k,ierr;
362:
363: VecGetArray(bb,&b);
364: VecGetArray(xx,&x);
365:
366: /* solve U^T * D * y = b by forward substitution */
367: PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar)); /* x <- b */
368: for (k=0; k<mbs; k++){
369: v = aa + 49*ai[k];
370: xp = x + k*7;
371: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
372: nz = ai[k+1] - ai[k];
373: vj = aj + ai[k];
374: xp = x + (*vj)*7;
375: while (nz--) {
376: /* x(:) += U(k,:)^T*(Dk*xk) */
377: xp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
378: xp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
379: xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
380: xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
381: xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
382: xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
383: xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
384: vj++; xp = x + (*vj)*7;
385: v += 49;
386: }
387: /* xk = inv(Dk)*(Dk*xk) */
388: d = aa+k*49; /* ptr to inv(Dk) */
389: xp = x + k*7;
390: xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
391: xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
392: xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
393: xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
394: xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
395: xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
396: xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
397: }
399: /* solve U*x = y by back substitution */
400: for (k=mbs-1; k>=0; k--){
401: v = aa + 49*ai[k];
402: xp = x + k*7;
403: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
404: nz = ai[k+1] - ai[k];
405: vj = aj + ai[k];
406: xp = x + (*vj)*7;
407: while (nz--) {
408: /* xk += U(k,:)*x(:) */
409: x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
410: x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
411: x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
412: x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
413: x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
414: x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
415: x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
416: vj++;
417: v += 49; xp = x + (*vj)*7;
418: }
419: xp = x + k*7;
420: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
421: }
423: VecRestoreArray(bb,&b);
424: VecRestoreArray(xx,&x);
425: PetscLogFlops(49*(2*a->nz + mbs));
426: return(0);
427: }
431: int MatSolve_SeqSBAIJ_6(Mat A,Vec bb,Vec xx)
432: {
433: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
434: IS isrow=a->row;
435: int mbs=a->mbs,*ai=a->i,*aj=a->j;
436: int nz,*vj,k,ierr,*r,idx;
437: MatScalar *aa=a->a,*v,*d;
438: PetscScalar *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp;
441: VecGetArray(bb,&b);
442: VecGetArray(xx,&x);
443: t = a->solve_work;
444: ISGetIndices(isrow,&r);
446: /* solve U^T * D * y = b by forward substitution */
447: tp = t;
448: for (k=0; k<mbs; k++) { /* t <- perm(b) */
449: idx = 6*r[k];
450: tp[0] = b[idx];
451: tp[1] = b[idx+1];
452: tp[2] = b[idx+2];
453: tp[3] = b[idx+3];
454: tp[4] = b[idx+4];
455: tp[5] = b[idx+5];
456: tp += 6;
457: }
458:
459: for (k=0; k<mbs; k++){
460: v = aa + 36*ai[k];
461: vj = aj + ai[k];
462: tp = t + k*6;
463: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
464: nz = ai[k+1] - ai[k];
465: tp = t + (*vj)*6;
466: while (nz--) {
467: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
468: tp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
469: tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
470: tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
471: tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
472: tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
473: vj++; tp = t + (*vj)*6;
474: v += 36;
475: }
477: /* xk = inv(Dk)*(Dk*xk) */
478: d = aa+k*36; /* ptr to inv(Dk) */
479: tp = t + k*6;
480: tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
481: tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
482: tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
483: tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
484: tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
485: tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
486: }
488: /* solve U*x = y by back substitution */
489: for (k=mbs-1; k>=0; k--){
490: v = aa + 36*ai[k];
491: vj = aj + ai[k];
492: tp = t + k*6;
493: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
494: nz = ai[k+1] - ai[k];
495:
496: tp = t + (*vj)*6;
497: while (nz--) {
498: /* xk += U(k,:)*x(:) */
499: x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
500: x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
501: x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
502: x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
503: x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
504: x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
505: vj++; tp = t + (*vj)*6;
506: v += 36;
507: }
508: tp = t + k*6;
509: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
510: idx = 6*r[k];
511: x[idx] = x0;
512: x[idx+1] = x1;
513: x[idx+2] = x2;
514: x[idx+3] = x3;
515: x[idx+4] = x4;
516: x[idx+5] = x5;
517: }
519: VecRestoreArray(bb,&b);
520: VecRestoreArray(xx,&x);
521: PetscLogFlops(36*(2*a->nz + mbs));
522: return(0);
523: }
527: int MatSolve_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
528: {
529: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
530: int mbs=a->mbs,*ai=a->i,*aj=a->j;
531: MatScalar *aa=a->a,*v,*d;
532: PetscScalar *x,*xp,*b,x0,x1,x2,x3,x4,x5;
533: int nz,*vj,k,ierr;
536:
537: VecGetArray(bb,&b);
538: VecGetArray(xx,&x);
539:
540: /* solve U^T * D * y = b by forward substitution */
541: PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
542: for (k=0; k<mbs; k++){
543: v = aa + 36*ai[k];
544: xp = x + k*6;
545: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
546: nz = ai[k+1] - ai[k];
547: vj = aj + ai[k];
548: xp = x + (*vj)*6;
549: while (nz--) {
550: /* x(:) += U(k,:)^T*(Dk*xk) */
551: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
552: xp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
553: xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
554: xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
555: xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
556: xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
557: vj++; xp = x + (*vj)*6;
558: v += 36;
559: }
560: /* xk = inv(Dk)*(Dk*xk) */
561: d = aa+k*36; /* ptr to inv(Dk) */
562: xp = x + k*6;
563: xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
564: xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
565: xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
566: xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
567: xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
568: xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
569: }
571: /* solve U*x = y by back substitution */
572: for (k=mbs-1; k>=0; k--){
573: v = aa + 36*ai[k];
574: xp = x + k*6;
575: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
576: nz = ai[k+1] - ai[k];
577: vj = aj + ai[k];
578: xp = x + (*vj)*6;
579: while (nz--) {
580: /* xk += U(k,:)*x(:) */
581: x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
582: x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
583: x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
584: x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
585: x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
586: x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
587: vj++;
588: v += 36; xp = x + (*vj)*6;
589: }
590: xp = x + k*6;
591: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
592: }
594: VecRestoreArray(bb,&b);
595: VecRestoreArray(xx,&x);
596: PetscLogFlops(36*(2*a->nz + mbs));
597: return(0);
598: }
602: int MatSolve_SeqSBAIJ_5(Mat A,Vec bb,Vec xx)
603: {
604: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
605: IS isrow=a->row;
606: int mbs=a->mbs,*ai=a->i,*aj=a->j;
607: int nz,*vj,k,ierr,*r,idx;
608: MatScalar *aa=a->a,*v,*diag;
609: PetscScalar *x,*b,x0,x1,x2,x3,x4,*t,*tp;
612: VecGetArray(bb,&b);
613: VecGetArray(xx,&x);
614: t = a->solve_work;
615: ISGetIndices(isrow,&r);
617: /* solve U^T * D * y = b by forward substitution */
618: tp = t;
619: for (k=0; k<mbs; k++) { /* t <- perm(b) */
620: idx = 5*r[k];
621: tp[0] = b[idx];
622: tp[1] = b[idx+1];
623: tp[2] = b[idx+2];
624: tp[3] = b[idx+3];
625: tp[4] = b[idx+4];
626: tp += 5;
627: }
628:
629: for (k=0; k<mbs; k++){
630: v = aa + 25*ai[k];
631: vj = aj + ai[k];
632: tp = t + k*5;
633: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
634: nz = ai[k+1] - ai[k];
636: tp = t + (*vj)*5;
637: while (nz--) {
638: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
639: tp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
640: tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
641: tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
642: tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
643: vj++; tp = t + (*vj)*5;
644: v += 25;
645: }
647: /* xk = inv(Dk)*(Dk*xk) */
648: diag = aa+k*25; /* ptr to inv(Dk) */
649: tp = t + k*5;
650: tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
651: tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
652: tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
653: tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
654: tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
655: }
657: /* solve U*x = y by back substitution */
658: for (k=mbs-1; k>=0; k--){
659: v = aa + 25*ai[k];
660: vj = aj + ai[k];
661: tp = t + k*5;
662: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];/* xk */
663: nz = ai[k+1] - ai[k];
664:
665: tp = t + (*vj)*5;
666: while (nz--) {
667: /* xk += U(k,:)*x(:) */
668: x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
669: x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
670: x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
671: x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
672: x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
673: vj++; tp = t + (*vj)*5;
674: v += 25;
675: }
676: tp = t + k*5;
677: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
678: idx = 5*r[k];
679: x[idx] = x0;
680: x[idx+1] = x1;
681: x[idx+2] = x2;
682: x[idx+3] = x3;
683: x[idx+4] = x4;
684: }
686: VecRestoreArray(bb,&b);
687: VecRestoreArray(xx,&x);
688: PetscLogFlops(25*(2*a->nz + mbs));
689: return(0);
690: }
694: int MatSolve_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
695: {
696: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
697: int mbs=a->mbs,*ai=a->i,*aj=a->j;
698: MatScalar *aa=a->a,*v,*diag;
699: PetscScalar *x,*xp,*b,x0,x1,x2,x3,x4;
700: int nz,*vj,k,ierr;
703:
704: VecGetArray(bb,&b);
705: VecGetArray(xx,&x);
707: /* solve U^T * D * y = b by forward substitution */
708: PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
709: for (k=0; k<mbs; k++){
710: v = aa + 25*ai[k];
711: xp = x + k*5;
712: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
713: nz = ai[k+1] - ai[k];
714: vj = aj + ai[k];
715: xp = x + (*vj)*5;
716: while (nz--) {
717: /* x(:) += U(k,:)^T*(Dk*xk) */
718: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
719: xp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
720: xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
721: xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
722: xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
723: vj++; xp = x + (*vj)*5;
724: v += 25;
725: }
726: /* xk = inv(Dk)*(Dk*xk) */
727: diag = aa+k*25; /* ptr to inv(Dk) */
728: xp = x + k*5;
729: xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
730: xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
731: xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
732: xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
733: xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
734: }
736: /* solve U*x = y by back substitution */
737: for (k=mbs-1; k>=0; k--){
738: v = aa + 25*ai[k];
739: xp = x + k*5;
740: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* xk */
741: nz = ai[k+1] - ai[k];
742: vj = aj + ai[k];
743: xp = x + (*vj)*5;
744: while (nz--) {
745: /* xk += U(k,:)*x(:) */
746: x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
747: x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
748: x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
749: x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
750: x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
751: vj++;
752: v += 25; xp = x + (*vj)*5;
753: }
754: xp = x + k*5;
755: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
756: }
758: VecRestoreArray(bb,&b);
759: VecRestoreArray(xx,&x);
760: PetscLogFlops(25*(2*a->nz + mbs));
761: return(0);
762: }
766: int MatSolve_SeqSBAIJ_4(Mat A,Vec bb,Vec xx)
767: {
768: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
769: IS isrow=a->row;
770: int mbs=a->mbs,*ai=a->i,*aj=a->j;
771: int nz,*vj,k,ierr,*r,idx;
772: MatScalar *aa=a->a,*v,*diag;
773: PetscScalar *x,*b,x0,x1,x2,x3,*t,*tp;
776: VecGetArray(bb,&b);
777: VecGetArray(xx,&x);
778: t = a->solve_work;
779: ISGetIndices(isrow,&r);
781: /* solve U^T * D * y = b by forward substitution */
782: tp = t;
783: for (k=0; k<mbs; k++) { /* t <- perm(b) */
784: idx = 4*r[k];
785: tp[0] = b[idx];
786: tp[1] = b[idx+1];
787: tp[2] = b[idx+2];
788: tp[3] = b[idx+3];
789: tp += 4;
790: }
791:
792: for (k=0; k<mbs; k++){
793: v = aa + 16*ai[k];
794: vj = aj + ai[k];
795: tp = t + k*4;
796: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
797: nz = ai[k+1] - ai[k];
799: tp = t + (*vj)*4;
800: while (nz--) {
801: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
802: tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
803: tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
804: tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
805: vj++; tp = t + (*vj)*4;
806: v += 16;
807: }
809: /* xk = inv(Dk)*(Dk*xk) */
810: diag = aa+k*16; /* ptr to inv(Dk) */
811: tp = t + k*4;
812: tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
813: tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
814: tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
815: tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
816: }
818: /* solve U*x = y by back substitution */
819: for (k=mbs-1; k>=0; k--){
820: v = aa + 16*ai[k];
821: vj = aj + ai[k];
822: tp = t + k*4;
823: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
824: nz = ai[k+1] - ai[k];
825:
826: tp = t + (*vj)*4;
827: while (nz--) {
828: /* xk += U(k,:)*x(:) */
829: x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
830: x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
831: x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
832: x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
833: vj++; tp = t + (*vj)*4;
834: v += 16;
835: }
836: tp = t + k*4;
837: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
838: idx = 4*r[k];
839: x[idx] = x0;
840: x[idx+1] = x1;
841: x[idx+2] = x2;
842: x[idx+3] = x3;
843: }
845: VecRestoreArray(bb,&b);
846: VecRestoreArray(xx,&x);
847: PetscLogFlops(16*(2*a->nz + mbs));
848: return(0);
849: }
851: /*
852: Special case where the matrix was factored in the natural ordering.
853: This eliminates the need for the column and row permutation.
854: */
857: int MatSolve_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
858: {
859: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
860: int mbs=a->mbs,*ai=a->i,*aj=a->j;
861: MatScalar *aa=a->a,*v,*diag;
862: PetscScalar *x,*xp,*b,x0,x1,x2,x3;
863: int nz,*vj,k,ierr;
866:
867: VecGetArray(bb,&b);
868: VecGetArray(xx,&x);
870: /* solve U^T * D * y = b by forward substitution */
871: PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
872: for (k=0; k<mbs; k++){
873: v = aa + 16*ai[k];
874: xp = x + k*4;
875: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
876: nz = ai[k+1] - ai[k];
877: vj = aj + ai[k];
878: xp = x + (*vj)*4;
879: while (nz--) {
880: /* x(:) += U(k,:)^T*(Dk*xk) */
881: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
882: xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
883: xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
884: xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
885: vj++; xp = x + (*vj)*4;
886: v += 16;
887: }
888: /* xk = inv(Dk)*(Dk*xk) */
889: diag = aa+k*16; /* ptr to inv(Dk) */
890: xp = x + k*4;
891: xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
892: xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
893: xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
894: xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
895: }
897: /* solve U*x = y by back substitution */
898: for (k=mbs-1; k>=0; k--){
899: v = aa + 16*ai[k];
900: xp = x + k*4;
901: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
902: nz = ai[k+1] - ai[k];
903: vj = aj + ai[k];
904: xp = x + (*vj)*4;
905: while (nz--) {
906: /* xk += U(k,:)*x(:) */
907: x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
908: x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
909: x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
910: x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
911: vj++;
912: v += 16; xp = x + (*vj)*4;
913: }
914: xp = x + k*4;
915: xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
916: }
918: VecRestoreArray(bb,&b);
919: VecRestoreArray(xx,&x);
920: PetscLogFlops(16*(2*a->nz + mbs));
921: return(0);
922: }
926: int MatSolve_SeqSBAIJ_3(Mat A,Vec bb,Vec xx)
927: {
928: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
929: IS isrow=a->row;
930: int mbs=a->mbs,*ai=a->i,*aj=a->j;
931: int nz,*vj,k,ierr,*r,idx;
932: MatScalar *aa=a->a,*v,*diag;
933: PetscScalar *x,*b,x0,x1,x2,*t,*tp;
936: VecGetArray(bb,&b);
937: VecGetArray(xx,&x);
938: t = a->solve_work;
939: ISGetIndices(isrow,&r);
941: /* solve U^T * D * y = b by forward substitution */
942: tp = t;
943: for (k=0; k<mbs; k++) { /* t <- perm(b) */
944: idx = 3*r[k];
945: tp[0] = b[idx];
946: tp[1] = b[idx+1];
947: tp[2] = b[idx+2];
948: tp += 3;
949: }
950:
951: for (k=0; k<mbs; k++){
952: v = aa + 9*ai[k];
953: vj = aj + ai[k];
954: tp = t + k*3;
955: x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
956: nz = ai[k+1] - ai[k];
958: tp = t + (*vj)*3;
959: while (nz--) {
960: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
961: tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
962: tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
963: vj++; tp = t + (*vj)*3;
964: v += 9;
965: }
967: /* xk = inv(Dk)*(Dk*xk) */
968: diag = aa+k*9; /* ptr to inv(Dk) */
969: tp = t + k*3;
970: tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
971: tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
972: tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
973: }
975: /* solve U*x = y by back substitution */
976: for (k=mbs-1; k>=0; k--){
977: v = aa + 9*ai[k];
978: vj = aj + ai[k];
979: tp = t + k*3;
980: x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; /* xk */
981: nz = ai[k+1] - ai[k];
982:
983: tp = t + (*vj)*3;
984: while (nz--) {
985: /* xk += U(k,:)*x(:) */
986: x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
987: x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
988: x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
989: vj++; tp = t + (*vj)*3;
990: v += 9;
991: }
992: tp = t + k*3;
993: tp[0] = x0; tp[1] = x1; tp[2] = x2;
994: idx = 3*r[k];
995: x[idx] = x0;
996: x[idx+1] = x1;
997: x[idx+2] = x2;
998: }
1000: VecRestoreArray(bb,&b);
1001: VecRestoreArray(xx,&x);
1002: PetscLogFlops(9*(2*a->nz + mbs));
1003: return(0);
1004: }
1006: /*
1007: Special case where the matrix was factored in the natural ordering.
1008: This eliminates the need for the column and row permutation.
1009: */
1012: int MatSolve_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1013: {
1014: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1015: int mbs=a->mbs,*ai=a->i,*aj=a->j;
1016: MatScalar *aa=a->a,*v,*diag;
1017: PetscScalar *x,*xp,*b,x0,x1,x2;
1018: int nz,*vj,k,ierr;
1021:
1022: VecGetArray(bb,&b);
1023: VecGetArray(xx,&x);
1025: /* solve U^T * D * y = b by forward substitution */
1026: PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1027: for (k=0; k<mbs; k++){
1028: v = aa + 9*ai[k];
1029: xp = x + k*3;
1030: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1031: nz = ai[k+1] - ai[k];
1032: vj = aj + ai[k];
1033: xp = x + (*vj)*3;
1034: while (nz--) {
1035: /* x(:) += U(k,:)^T*(Dk*xk) */
1036: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1037: xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1038: xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1039: vj++; xp = x + (*vj)*3;
1040: v += 9;
1041: }
1042: /* xk = inv(Dk)*(Dk*xk) */
1043: diag = aa+k*9; /* ptr to inv(Dk) */
1044: xp = x + k*3;
1045: xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1046: xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1047: xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1048: }
1050: /* solve U*x = y by back substitution */
1051: for (k=mbs-1; k>=0; k--){
1052: v = aa + 9*ai[k];
1053: xp = x + k*3;
1054: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* xk */
1055: nz = ai[k+1] - ai[k];
1056: vj = aj + ai[k];
1057: xp = x + (*vj)*3;
1058: while (nz--) {
1059: /* xk += U(k,:)*x(:) */
1060: x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1061: x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1062: x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1063: vj++;
1064: v += 9; xp = x + (*vj)*3;
1065: }
1066: xp = x + k*3;
1067: xp[0] = x0; xp[1] = x1; xp[2] = x2;
1068: }
1070: VecRestoreArray(bb,&b);
1071: VecRestoreArray(xx,&x);
1072: PetscLogFlops(9*(2*a->nz + mbs));
1073: return(0);
1074: }
1078: int MatSolve_SeqSBAIJ_2(Mat A,Vec bb,Vec xx)
1079: {
1080: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ *)A->data;
1081: IS isrow=a->row;
1082: int mbs=a->mbs,*ai=a->i,*aj=a->j;
1083: int nz,*vj,k,k2,ierr,*r,idx;
1084: MatScalar *aa=a->a,*v,*diag;
1085: PetscScalar *x,*b,x0,x1,*t;
1088: VecGetArray(bb,&b);
1089: VecGetArray(xx,&x);
1090: t = a->solve_work;
1091: /* printf("called MatSolve_SeqSBAIJ_2\n"); */
1092: ISGetIndices(isrow,&r);
1094: /* solve U^T * D * y = perm(b) by forward substitution */
1095: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1096: idx = 2*r[k];
1097: t[k*2] = b[idx];
1098: t[k*2+1] = b[idx+1];
1099: }
1100: for (k=0; k<mbs; k++){
1101: v = aa + 4*ai[k];
1102: vj = aj + ai[k];
1103: k2 = k*2;
1104: x0 = t[k2]; x1 = t[k2+1];
1105: nz = ai[k+1] - ai[k];
1106: while (nz--) {
1107: t[(*vj)*2] += v[0]*x0 + v[1]*x1;
1108: t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1109: vj++; v += 4;
1110: }
1111: diag = aa+k*4; /* ptr to inv(Dk) */
1112: t[k2] = diag[0]*x0 + diag[2]*x1;
1113: t[k2+1] = diag[1]*x0 + diag[3]*x1;
1114: }
1116: /* solve U*x = y by back substitution */
1117: for (k=mbs-1; k>=0; k--){
1118: v = aa + 4*ai[k];
1119: vj = aj + ai[k];
1120: k2 = k*2;
1121: x0 = t[k2]; x1 = t[k2+1];
1122: nz = ai[k+1] - ai[k];
1123: while (nz--) {
1124: x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1125: x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1126: vj++; v += 4;
1127: }
1128: t[k2] = x0;
1129: t[k2+1] = x1;
1130: idx = 2*r[k];
1131: x[idx] = x0;
1132: x[idx+1] = x1;
1133: }
1135: ISRestoreIndices(isrow,&r);
1136: VecRestoreArray(bb,&b);
1137: VecRestoreArray(xx,&x);
1138: PetscLogFlops(4*(2*a->nz + mbs));
1139: return(0);
1140: }
1142: /*
1143: Special case where the matrix was factored in the natural ordering.
1144: This eliminates the need for the column and row permutation.
1145: */
1148: int MatSolve_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1149: {
1150: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1151: int mbs=a->mbs,*ai=a->i,*aj=a->j;
1152: MatScalar *aa=a->a,*v,*diag;
1153: PetscScalar *x,*b,x0,x1;
1154: int nz,*vj,k,k2,ierr;
1157:
1158: VecGetArray(bb,&b);
1159: VecGetArray(xx,&x);
1161: /* solve U^T * D * y = b by forward substitution */
1162: PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1163: for (k=0; k<mbs; k++){
1164: v = aa + 4*ai[k];
1165: vj = aj + ai[k];
1166: k2 = k*2;
1167: x0 = x[k2]; x1 = x[k2+1]; /* Dk*xk = k-th block of x */
1168: nz = ai[k+1] - ai[k];
1169:
1170: while (nz--) {
1171: /* x(:) += U(k,:)^T*(Dk*xk) */
1172: x[(*vj)*2] += v[0]*x0 + v[1]*x1;
1173: x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1174: vj++; v += 4;
1175: }
1176: /* xk = inv(Dk)*(Dk*xk) */
1177: diag = aa+k*4; /* ptr to inv(Dk) */
1178: x[k2] = diag[0]*x0 + diag[2]*x1;
1179: x[k2+1] = diag[1]*x0 + diag[3]*x1;
1180: }
1182: /* solve U*x = y by back substitution */
1183: for (k=mbs-1; k>=0; k--){
1184: v = aa + 4*ai[k];
1185: vj = aj + ai[k];
1186: k2 = k*2;
1187: x0 = x[k2]; x1 = x[k2+1]; /* xk */
1188: nz = ai[k+1] - ai[k];
1189: while (nz--) {
1190: /* xk += U(k,:)*x(:) */
1191: x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1192: x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1193: vj++; v += 4;
1194: }
1195: x[k2] = x0;
1196: x[k2+1] = x1;
1197: }
1199: VecRestoreArray(bb,&b);
1200: VecRestoreArray(xx,&x);
1201: PetscLogFlops(4*(2*a->nz + mbs)); /* bs2*(2*a->nz + mbs) */
1202: return(0);
1203: }
1207: int MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1208: {
1209: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1210: IS isrow=a->row;
1211: int mbs=a->mbs,*ai=a->i,*aj=a->j,ierr,*rip;
1212: MatScalar *aa=a->a,*v;
1213: PetscScalar *x,*b,xk,*t;
1214: int nz,*vj,k;
1217: if (!mbs) return(0);
1219: VecGetArray(bb,&b);
1220: VecGetArray(xx,&x);
1221: t = a->solve_work;
1223: ISGetIndices(isrow,&rip);
1224:
1225: /* solve U^T*D*y = perm(b) by forward substitution */
1226: for (k=0; k<mbs; k++) t[k] = b[rip[k]];
1227: for (k=0; k<mbs; k++){
1228: v = aa + ai[k];
1229: vj = aj + ai[k];
1230: xk = t[k];
1231: nz = ai[k+1] - ai[k];
1232: while (nz--) t[*vj++] += (*v++) * xk;
1233: t[k] = xk*aa[k]; /* note: aa[k] = 1/D(k) */
1234: }
1236: /* solve U*x = y by back substitution */
1237: for (k=mbs-1; k>=0; k--){
1238: v = aa + ai[k];
1239: vj = aj + ai[k];
1240: xk = t[k];
1241: nz = ai[k+1] - ai[k];
1242: while (nz--) xk += (*v++) * t[*vj++];
1243: t[k] = xk;
1244: x[rip[k]] = xk;
1245: }
1247: ISRestoreIndices(isrow,&rip);
1248: VecRestoreArray(bb,&b);
1249: VecRestoreArray(xx,&x);
1250: PetscLogFlops(4*a->nz + A->m);
1251: return(0);
1252: }
1256: int MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1257: {
1258: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1259: int ierr;
1262: if (a->bs == 1) {
1263: MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1264: } else {
1265: IS isrow=a->row;
1266: int mbs=a->mbs,*ai=a->i,*aj=a->j,*rip,i;
1267: MatScalar *aa=a->a,*v;
1268: PetscScalar *x,*b,*t;
1269: int nz,*vj,k,n;
1270: if (bb->n > a->solves_work_n) {
1271: if (a->solves_work) {PetscFree(a->solves_work);}
1272: PetscMalloc(bb->n*A->m*sizeof(PetscScalar),&a->solves_work);
1273: a->solves_work_n = bb->n;
1274: }
1275: n = bb->n;
1276: VecGetArray(bb->v,&b);
1277: VecGetArray(xx->v,&x);
1278: t = a->solves_work;
1280: ISGetIndices(isrow,&rip);
1281:
1282: /* solve U^T*D*y = perm(b) by forward substitution */
1283: for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rip[k]+i*mbs];} /* values are stored interlaced in t */
1284: for (k=0; k<mbs; k++){
1285: v = aa + ai[k];
1286: vj = aj + ai[k];
1287: nz = ai[k+1] - ai[k];
1288: while (nz--) {
1289: for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1290: v++;vj++;
1291: }
1292: for (i=0; i<n; i++) t[n*k+i] *= aa[k]; /* note: aa[k] = 1/D(k) */
1293: }
1294:
1295: /* solve U*x = y by back substitution */
1296: for (k=mbs-1; k>=0; k--){
1297: v = aa + ai[k];
1298: vj = aj + ai[k];
1299: nz = ai[k+1] - ai[k];
1300: while (nz--) {
1301: for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1302: v++;vj++;
1303: }
1304: for (i=0; i<n; i++) x[rip[k]+i*mbs] = t[n*k+i];
1305: }
1307: ISRestoreIndices(isrow,&rip);
1308: VecRestoreArray(bb->v,&b);
1309: VecRestoreArray(xx->v,&x);
1310: PetscLogFlops(bb->n*(4*a->nz + A->m));
1311: }
1312: return(0);
1313: }
1315: /*
1316: Special case where the matrix was ILU(0) factored in the natural
1317: ordering. This eliminates the need for the column and row permutation.
1318: */
1321: int MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1322: {
1323: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1324: int mbs=a->mbs,*ai=a->i,*aj=a->j,ierr;
1325: MatScalar *aa=a->a,*v;
1326: PetscScalar *x,*b,xk;
1327: int nz,*vj,k;
1330: VecGetArray(bb,&b);
1331: VecGetArray(xx,&x);
1332:
1333: /* solve U^T*D*y = b by forward substitution */
1334: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
1335: for (k=0; k<mbs; k++){
1336: v = aa + ai[k] + 1;
1337: vj = aj + ai[k] + 1;
1338: xk = x[k];
1339: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
1340: while (nz--) x[*vj++] += (*v++) * xk;
1341: x[k] = xk*aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
1342: }
1344: /* solve U*x = y by back substitution */
1345: for (k=mbs-2; k>=0; k--){
1346: v = aa + ai[k] + 1;
1347: vj = aj + ai[k] + 1;
1348: xk = x[k];
1349: nz = ai[k+1] - ai[k] - 1;
1350: while (nz--) xk += (*v++) * x[*vj++];
1351: x[k] = xk;
1352: }
1354: VecRestoreArray(bb,&b);
1355: VecRestoreArray(xx,&x);
1356: PetscLogFlops(4*a->nz + A->m);
1357: return(0);
1358: }
1360: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
1363: int MatICCFactorSymbolic_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info,Mat *B)
1364: {
1365: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
1366: int *rip,ierr,i,mbs = a->mbs,*ai = a->i,*aj = a->j;
1367: int *jutmp,bs = a->bs,bs2=a->bs2;
1368: int m,realloc = 0,*levtmp;
1369: int *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd,*jl;
1370: int incrlev,*lev,shift,prow,nz;
1371: int *il,ili,nextprow;
1372: PetscReal f = info->fill,levels = info->levels;
1373: PetscTruth perm_identity;
1376: /* check whether perm is the identity mapping */
1377: ISIdentity(perm,&perm_identity);
1379: /* special case that simply copies fill pattern */
1380: if (!levels && perm_identity && bs==1) {
1381: MatDuplicate_SeqSBAIJ(A,MAT_DO_NOT_COPY_VALUES,B);
1382: (*B)->factor = FACTOR_CHOLESKY;
1383: b = (Mat_SeqSBAIJ*)(*B)->data;
1384: b->row = perm;
1385: b->icol = perm;
1386: b->factor_damping = info->damping;
1387: b->factor_shift = info->shift;
1388: b->factor_zeropivot = info->zeropivot;
1389: PetscObjectReference((PetscObject)perm);
1390: PetscObjectReference((PetscObject)perm);
1391: PetscMalloc(((*B)->m+1)*sizeof(PetscScalar),&b->solve_work);
1392: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1393: (*B)->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1394: return(0);
1395: }
1397: /* --- inplace icc(levels), levels>0, ie, *B has same data structure as A --- */
1398: if (levels > 0 && perm_identity && bs==1 ){
1399: if (!perm_identity) a->permute = PETSC_TRUE;
1400:
1401: ISGetIndices(perm,&rip);
1402:
1403: if (perm_identity){ /* without permutation */
1404: ai = a->i; aj = a->j;
1405: } else { /* non-trivial permutation */
1406: MatReorderingSeqSBAIJ(A,perm);
1407: ai = a->inew; aj = a->jnew;
1408: }
1409:
1410: /* initialization */
1411: PetscMalloc((mbs+1)*sizeof(int),&iu);
1412: umax = (int)(f*ai[mbs] + 1);
1413: PetscMalloc(umax*sizeof(int),&lev);
1414: PetscMalloc(umax*sizeof(int),&ju);
1415: iu[0] = 0;
1416: juidx = 0; /* index for ju */
1417: PetscMalloc((4*mbs+1)*sizeof(int),&jl); /* linked list for getting pivot row */
1418: q = jl + mbs; /* linked list for col index of active row */
1419: levtmp = q + mbs;
1420: il = levtmp + mbs;
1421: for (i=0; i<mbs; i++){
1422: jl[i] = mbs;
1423: q[i] = 0;
1424: il[i] = 0;
1425: }
1427: /* for each row k */
1428: for (k=0; k<mbs; k++){
1429: nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
1430: q[k] = mbs;
1431: /* initialize nonzero structure of k-th row to row rip[k] of A */
1432: jmin = ai[rip[k]] +1; /* exclude diag[k] */
1433: jmax = ai[rip[k]+1];
1434: for (j=jmin; j<jmax; j++){
1435: vj = rip[aj[j]]; /* col. value */
1436: if(vj > k){
1437: qm = k;
1438: do {
1439: m = qm; qm = q[m];
1440: } while(qm < vj);
1441: if (qm == vj) {
1442: SETERRQ(1," error: duplicate entry in A\n");
1443: }
1444: nzk++;
1445: q[m] = vj;
1446: q[vj] = qm;
1447: levtmp[vj] = 0; /* initialize lev for nonzero element */
1448: } /* if(vj > k) */
1449: } /* for (j=jmin; j<jmax; j++) */
1451: /* modify nonzero structure of k-th row by computing fill-in
1452: for each row i to be merged in */
1453: prow = k;
1454: prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
1455:
1456: while (prow < k){
1457: nextprow = jl[prow];
1458: /* merge row prow into k-th row */
1459: ili = il[prow];
1460: jmin = ili + 1; /* points to 2nd nzero entry in U(prow,k:mbs-1) */
1461: jmax = iu[prow+1];
1462: qm = k;
1463: for (j=jmin; j<jmax; j++){
1464: vj = ju[j];
1465: incrlev = lev[j] + 1;
1466: if (incrlev > levels) continue;
1467: do {
1468: m = qm; qm = q[m];
1469: } while (qm < vj);
1470: if (qm != vj){ /* a fill */
1471: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
1472: levtmp[vj] = incrlev;
1473: } else {
1474: if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
1475: }
1476: }
1477: if (jmin < jmax){ /* update il and jl */
1478: il[prow] = jmin;
1479: j = ju[jmin];
1480: jl[prow] = jl[j]; jl[j] = prow;
1481: }
1482: prow = nextprow;
1483: }
1484:
1485: /* add the first nonzero element in U(k, k+1:mbs-1) to jl */
1486: if (nzk > 0){
1487: i = q[k]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1488: jl[k] = jl[i]; jl[i] = k;
1489: il[k] = iu[k] + 1;
1490: }
1491: iu[k+1] = iu[k] + nzk + 1; /* include diag[k] */
1493: /* allocate more space to ju if needed */
1494: if (iu[k+1] > umax) {
1495: /* estimate how much additional space we will need */
1496: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
1497: /* just double the memory each time */
1498: maxadd = umax;
1499: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
1500: umax += maxadd;
1502: /* allocate a longer ju */
1503: PetscMalloc(umax*sizeof(int),&jutmp);
1504: PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));
1505: PetscFree(ju);
1506: ju = jutmp;
1508: PetscMalloc(umax*sizeof(int),&jutmp);
1509: PetscMemcpy(jutmp,lev,(iu[k])*sizeof(int));
1510: PetscFree(lev);
1511: lev = jutmp;
1512: realloc += 2; /* count how many times we realloc */
1513: }
1515: /* save nonzero structure of k-th row in ju */
1516: ju[juidx] = k; /* diag[k] */
1517: lev[juidx] = 0;
1518: juidx++;
1519: i = k;
1520: while (nzk --) {
1521: i = q[i];
1522: ju[juidx] = i;
1523: lev[juidx] = levtmp[i];
1524: juidx++;
1525: }
1526: } /* end of for (k=0; k<mbs; k++) */
1528: if (ai[mbs] != 0) {
1529: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1530: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
1531: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af);
1532: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af);
1533: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
1534: } else {
1535: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
1536: }
1538: ISRestoreIndices(perm,&rip);
1539: PetscFree(jl);
1540: PetscFree(lev);
1542: /* put together the new matrix */
1543: MatCreate(A->comm,bs*mbs,bs*mbs,bs*mbs,bs*mbs,B);
1544: MatSetType(*B,A->type_name);
1545: MatSeqSBAIJSetPreallocation(*B,bs,0,PETSC_NULL);
1547: /* PetscLogObjectParent(*B,iperm); */
1548: b = (Mat_SeqSBAIJ*)(*B)->data;
1549: PetscFree(b->imax);
1550: b->singlemalloc = PETSC_FALSE;
1551: /* the next line frees the default space generated by the Create() */
1552: PetscFree(b->a);
1553: PetscFree(b->ilen);
1554: PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
1555: b->j = ju;
1556: b->i = iu;
1557: b->diag = 0;
1558: b->ilen = 0;
1559: b->imax = 0;
1560: b->row = perm;
1561: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1562: PetscObjectReference((PetscObject)perm);
1563: b->icol = perm;
1564: PetscObjectReference((PetscObject)perm);
1565: PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
1566: /* In b structure: Free imax, ilen, old a, old j.
1567: Allocate idnew, solve_work, new a, new j */
1568: PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
1569: b->maxnz = b->nz = iu[mbs];
1570: b->factor_damping = info->damping;
1571: b->factor_shift = info->shift;
1572: b->factor_zeropivot = info->zeropivot;
1574: (*B)->factor = FACTOR_CHOLESKY;
1575: (*B)->info.factor_mallocs = realloc;
1576: (*B)->info.fill_ratio_given = f;
1577: if (ai[mbs] != 0) {
1578: (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1579: } else {
1580: (*B)->info.fill_ratio_needed = 0.0;
1581: }
1584: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1585: (*B)->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1586: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
1587:
1588: return(0);
1589: } /* end of if (levels > 0 && perm_identity && bs==1 ) */
1591: if (!perm_identity) a->permute = PETSC_TRUE;
1592: if (perm_identity){
1593: ai = a->i; aj = a->j;
1594: } else { /* non-trivial permutation */
1595: MatReorderingSeqSBAIJ(A, perm);
1596: ai = a->inew; aj = a->jnew;
1597: }
1598:
1599: /* initialization */
1600: ISGetIndices(perm,&rip);
1601: umax = (int)(f*ai[mbs] + 1);
1602: PetscMalloc(umax*sizeof(int),&lev);
1603: umax += mbs + 1;
1604: shift = mbs + 1;
1605: PetscMalloc((mbs+1)*sizeof(int),&iu);
1606: PetscMalloc(umax*sizeof(int),&ju);
1607: iu[0] = mbs + 1;
1608: juidx = mbs + 1;
1609: /* prowl: linked list for pivot row */
1610: PetscMalloc((3*mbs+1)*sizeof(int),&prowl);
1611: /* q: linked list for col index */
1612: q = prowl + mbs;
1613: levtmp = q + mbs;
1614:
1615: for (i=0; i<mbs; i++){
1616: prowl[i] = mbs;
1617: q[i] = 0;
1618: }
1620: /* for each row k */
1621: for (k=0; k<mbs; k++){
1622: nzk = 0;
1623: q[k] = mbs;
1624: /* copy current row into linked list */
1625: nz = ai[rip[k]+1] - ai[rip[k]];
1626: j = ai[rip[k]];
1627: while (nz--){
1628: vj = rip[aj[j++]];
1629: if (vj > k){
1630: qm = k;
1631: do {
1632: m = qm; qm = q[m];
1633: } while(qm < vj);
1634: if (qm == vj) {
1635: SETERRQ(1," error: duplicate entry in A\n");
1636: }
1637: nzk++;
1638: q[m] = vj;
1639: q[vj] = qm;
1640: levtmp[vj] = 0; /* initialize lev for nonzero element */
1641: }
1642: }
1644: /* modify nonzero structure of k-th row by computing fill-in
1645: for each row prow to be merged in */
1646: prow = k;
1647: prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
1648:
1649: while (prow < k){
1650: /* merge row prow into k-th row */
1651: jmin = iu[prow] + 1;
1652: jmax = iu[prow+1];
1653: qm = k;
1654: for (j=jmin; j<jmax; j++){
1655: incrlev = lev[j-shift] + 1;
1656: if (incrlev > levels) continue;
1658: vj = ju[j];
1659: do {
1660: m = qm; qm = q[m];
1661: } while (qm < vj);
1662: if (qm != vj){ /* a fill */
1663: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
1664: levtmp[vj] = incrlev;
1665: } else {
1666: if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
1667: }
1668: }
1669: prow = prowl[prow]; /* next pivot row */
1670: }
1671:
1672: /* add k to row list for first nonzero element in k-th row */
1673: if (nzk > 1){
1674: i = q[k]; /* col value of first nonzero element in k_th row of U */
1675: prowl[k] = prowl[i]; prowl[i] = k;
1676: }
1677: iu[k+1] = iu[k] + nzk;
1679: /* allocate more space to ju and lev if needed */
1680: if (iu[k+1] > umax) {
1681: /* estimate how much additional space we will need */
1682: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
1683: /* just double the memory each time */
1684: maxadd = umax;
1685: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
1686: umax += maxadd;
1688: /* allocate a longer ju */
1689: PetscMalloc(umax*sizeof(int),&jutmp);
1690: PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));
1691: PetscFree(ju);
1692: ju = jutmp;
1694: PetscMalloc(umax*sizeof(int),&jutmp);
1695: PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(int));
1696: PetscFree(lev);
1697: lev = jutmp;
1698: realloc += 2; /* count how many times we realloc */
1699: }
1701: /* save nonzero structure of k-th row in ju */
1702: i=k;
1703: while (nzk --) {
1704: i = q[i];
1705: ju[juidx] = i;
1706: lev[juidx-shift] = levtmp[i];
1707: juidx++;
1708: }
1709: }
1710:
1711: if (ai[mbs] != 0) {
1712: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1713: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
1714: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Run with -pc_icc_fill %g or use \n",af);
1715: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:PCICCSetFill(pc,%g);\n",af);
1716: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:for best performance.\n");
1717: } else {
1718: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
1719: }
1721: ISRestoreIndices(perm,&rip);
1722: PetscFree(prowl);
1723: PetscFree(lev);
1725: /* put together the new matrix */
1726: MatCreate(A->comm,bs*mbs,bs*mbs,bs*mbs,bs*mbs,B);
1727: MatSetType(*B,A->type_name);
1728: MatSeqSBAIJSetPreallocation(*B,bs,0,PETSC_NULL);
1730: /* PetscLogObjectParent(*B,iperm); */
1731: b = (Mat_SeqSBAIJ*)(*B)->data;
1732: PetscFree(b->imax);
1733: b->singlemalloc = PETSC_FALSE;
1734: /* the next line frees the default space generated by the Create() */
1735: PetscFree(b->a);
1736: PetscFree(b->ilen);
1737: PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
1738: b->j = ju;
1739: b->i = iu;
1740: b->diag = 0;
1741: b->ilen = 0;
1742: b->imax = 0;
1743:
1744: if (b->row) {
1745: ISDestroy(b->row);
1746: }
1747: if (b->icol) {
1748: ISDestroy(b->icol);
1749: }
1750: b->row = perm;
1751: b->icol = perm;
1752: PetscObjectReference((PetscObject)perm);
1753: PetscObjectReference((PetscObject)perm);
1754: PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
1755: /* In b structure: Free imax, ilen, old a, old j.
1756: Allocate idnew, solve_work, new a, new j */
1757: PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
1758: b->maxnz = b->nz = iu[mbs];
1759:
1760: (*B)->factor = FACTOR_CHOLESKY;
1761: (*B)->info.factor_mallocs = realloc;
1762: (*B)->info.fill_ratio_given = f;
1763: if (ai[mbs] != 0) {
1764: (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1765: } else {
1766: (*B)->info.fill_ratio_needed = 0.0;
1767: }
1769: if (perm_identity){
1770: switch (bs) {
1771: case 1:
1772: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1773: (*B)->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1774: (*B)->ops->solves = MatSolves_SeqSBAIJ_1;
1775: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJl:Using special in-place natural ordering factor and solve BS=1\n");
1776: break;
1777: case 2:
1778: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
1779: (*B)->ops->solve = MatSolve_SeqSBAIJ_2_NaturalOrdering;
1780: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n");
1781: break;
1782: case 3:
1783: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
1784: (*B)->ops->solve = MatSolve_SeqSBAIJ_3_NaturalOrdering;
1785: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n");
1786: break;
1787: case 4:
1788: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
1789: (*B)->ops->solve = MatSolve_SeqSBAIJ_4_NaturalOrdering;
1790: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n");
1791: break;
1792: case 5:
1793: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
1794: (*B)->ops->solve = MatSolve_SeqSBAIJ_5_NaturalOrdering;
1795: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n");
1796: break;
1797: case 6:
1798: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
1799: (*B)->ops->solve = MatSolve_SeqSBAIJ_6_NaturalOrdering;
1800: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n");
1801: break;
1802: case 7:
1803: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
1804: (*B)->ops->solve = MatSolve_SeqSBAIJ_7_NaturalOrdering;
1805: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n");
1806: break;
1807: default:
1808: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
1809: (*B)->ops->solve = MatSolve_SeqSBAIJ_N_NaturalOrdering;
1810: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n");
1811: break;
1812: }
1813: }
1815: return(0);
1816: }