Actual source code: baijfact2.c
1: /*$Id: baijfact2.c,v 1.72 2001/09/11 16:32:33 bsmith Exp $*/
2: /*
3: Factorization code for BAIJ format.
4: */
6: #include src/mat/impls/baij/seq/baij.h
7: #include src/inline/ilu.h
8: #include src/inline/dot.h
12: int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
13: {
14: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
15: int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
16: int *diag = a->diag;
17: MatScalar *aa=a->a,*v;
18: PetscScalar s1,*x,*b;
21: VecCopy(bb,xx);
22: VecGetArray(bb,&b);
23: VecGetArray(xx,&x);
24:
25: /* forward solve the U^T */
26: for (i=0; i<n; i++) {
28: v = aa + diag[i];
29: /* multiply by the inverse of the block diagonal */
30: s1 = (*v++)*x[i];
31: vi = aj + diag[i] + 1;
32: nz = ai[i+1] - diag[i] - 1;
33: while (nz--) {
34: x[*vi++] -= (*v++)*s1;
35: }
36: x[i] = s1;
37: }
38: /* backward solve the L^T */
39: for (i=n-1; i>=0; i--){
40: v = aa + diag[i] - 1;
41: vi = aj + diag[i] - 1;
42: nz = diag[i] - ai[i];
43: s1 = x[i];
44: while (nz--) {
45: x[*vi--] -= (*v--)*s1;
46: }
47: }
48: VecRestoreArray(bb,&b);
49: VecRestoreArray(xx,&x);
50: PetscLogFlops(2*(a->nz) - A->n);
51: return(0);
52: }
56: int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
57: {
58: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
59: int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
60: int *diag = a->diag,oidx;
61: MatScalar *aa=a->a,*v;
62: PetscScalar s1,s2,x1,x2;
63: PetscScalar *x,*b;
66: VecCopy(bb,xx);
67: VecGetArray(bb,&b);
68: VecGetArray(xx,&x);
70: /* forward solve the U^T */
71: idx = 0;
72: for (i=0; i<n; i++) {
74: v = aa + 4*diag[i];
75: /* multiply by the inverse of the block diagonal */
76: x1 = x[idx]; x2 = x[1+idx];
77: s1 = v[0]*x1 + v[1]*x2;
78: s2 = v[2]*x1 + v[3]*x2;
79: v += 4;
81: vi = aj + diag[i] + 1;
82: nz = ai[i+1] - diag[i] - 1;
83: while (nz--) {
84: oidx = 2*(*vi++);
85: x[oidx] -= v[0]*s1 + v[1]*s2;
86: x[oidx+1] -= v[2]*s1 + v[3]*s2;
87: v += 4;
88: }
89: x[idx] = s1;x[1+idx] = s2;
90: idx += 2;
91: }
92: /* backward solve the L^T */
93: for (i=n-1; i>=0; i--){
94: v = aa + 4*diag[i] - 4;
95: vi = aj + diag[i] - 1;
96: nz = diag[i] - ai[i];
97: idt = 2*i;
98: s1 = x[idt]; s2 = x[1+idt];
99: while (nz--) {
100: idx = 2*(*vi--);
101: x[idx] -= v[0]*s1 + v[1]*s2;
102: x[idx+1] -= v[2]*s1 + v[3]*s2;
103: v -= 4;
104: }
105: }
106: VecRestoreArray(bb,&b);
107: VecRestoreArray(xx,&x);
108: PetscLogFlops(2*4*(a->nz) - 2*A->n);
109: return(0);
110: }
114: int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
115: {
116: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
117: int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
118: int *diag = a->diag,oidx;
119: MatScalar *aa=a->a,*v;
120: PetscScalar s1,s2,s3,x1,x2,x3;
121: PetscScalar *x,*b;
124: VecCopy(bb,xx);
125: VecGetArray(bb,&b);
126: VecGetArray(xx,&x);
128: /* forward solve the U^T */
129: idx = 0;
130: for (i=0; i<n; i++) {
132: v = aa + 9*diag[i];
133: /* multiply by the inverse of the block diagonal */
134: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
135: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
136: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
137: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
138: v += 9;
140: vi = aj + diag[i] + 1;
141: nz = ai[i+1] - diag[i] - 1;
142: while (nz--) {
143: oidx = 3*(*vi++);
144: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
145: x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
146: x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
147: v += 9;
148: }
149: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;
150: idx += 3;
151: }
152: /* backward solve the L^T */
153: for (i=n-1; i>=0; i--){
154: v = aa + 9*diag[i] - 9;
155: vi = aj + diag[i] - 1;
156: nz = diag[i] - ai[i];
157: idt = 3*i;
158: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];
159: while (nz--) {
160: idx = 3*(*vi--);
161: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
162: x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
163: x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
164: v -= 9;
165: }
166: }
167: VecRestoreArray(bb,&b);
168: VecRestoreArray(xx,&x);
169: PetscLogFlops(2*9*(a->nz) - 3*A->n);
170: return(0);
171: }
175: int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
176: {
177: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
178: int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
179: int *diag = a->diag,oidx;
180: MatScalar *aa=a->a,*v;
181: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
182: PetscScalar *x,*b;
185: VecCopy(bb,xx);
186: VecGetArray(bb,&b);
187: VecGetArray(xx,&x);
189: /* forward solve the U^T */
190: idx = 0;
191: for (i=0; i<n; i++) {
193: v = aa + 16*diag[i];
194: /* multiply by the inverse of the block diagonal */
195: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
196: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
197: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
198: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
199: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
200: v += 16;
202: vi = aj + diag[i] + 1;
203: nz = ai[i+1] - diag[i] - 1;
204: while (nz--) {
205: oidx = 4*(*vi++);
206: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
207: x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
208: x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
209: x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
210: v += 16;
211: }
212: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
213: idx += 4;
214: }
215: /* backward solve the L^T */
216: for (i=n-1; i>=0; i--){
217: v = aa + 16*diag[i] - 16;
218: vi = aj + diag[i] - 1;
219: nz = diag[i] - ai[i];
220: idt = 4*i;
221: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
222: while (nz--) {
223: idx = 4*(*vi--);
224: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
225: x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
226: x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
227: x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
228: v -= 16;
229: }
230: }
231: VecRestoreArray(bb,&b);
232: VecRestoreArray(xx,&x);
233: PetscLogFlops(2*16*(a->nz) - 4*A->n);
234: return(0);
235: }
239: int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
240: {
241: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
242: int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
243: int *diag = a->diag,oidx;
244: MatScalar *aa=a->a,*v;
245: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
246: PetscScalar *x,*b;
249: VecCopy(bb,xx);
250: VecGetArray(bb,&b);
251: VecGetArray(xx,&x);
253: /* forward solve the U^T */
254: idx = 0;
255: for (i=0; i<n; i++) {
257: v = aa + 25*diag[i];
258: /* multiply by the inverse of the block diagonal */
259: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
260: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
261: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
262: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
263: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
264: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
265: v += 25;
267: vi = aj + diag[i] + 1;
268: nz = ai[i+1] - diag[i] - 1;
269: while (nz--) {
270: oidx = 5*(*vi++);
271: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
272: x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
273: x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
274: x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
275: x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
276: v += 25;
277: }
278: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
279: idx += 5;
280: }
281: /* backward solve the L^T */
282: for (i=n-1; i>=0; i--){
283: v = aa + 25*diag[i] - 25;
284: vi = aj + diag[i] - 1;
285: nz = diag[i] - ai[i];
286: idt = 5*i;
287: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
288: while (nz--) {
289: idx = 5*(*vi--);
290: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
291: x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
292: x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
293: x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
294: x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
295: v -= 25;
296: }
297: }
298: VecRestoreArray(bb,&b);
299: VecRestoreArray(xx,&x);
300: PetscLogFlops(2*25*(a->nz) - 5*A->n);
301: return(0);
302: }
306: int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
307: {
308: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
309: int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
310: int *diag = a->diag,oidx;
311: MatScalar *aa=a->a,*v;
312: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
313: PetscScalar *x,*b;
316: VecCopy(bb,xx);
317: VecGetArray(bb,&b);
318: VecGetArray(xx,&x);
320: /* forward solve the U^T */
321: idx = 0;
322: for (i=0; i<n; i++) {
324: v = aa + 36*diag[i];
325: /* multiply by the inverse of the block diagonal */
326: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
327: x6 = x[5+idx];
328: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
329: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
330: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
331: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
332: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
333: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
334: v += 36;
336: vi = aj + diag[i] + 1;
337: nz = ai[i+1] - diag[i] - 1;
338: while (nz--) {
339: oidx = 6*(*vi++);
340: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
341: x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
342: x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
343: x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
344: x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
345: x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
346: v += 36;
347: }
348: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
349: x[5+idx] = s6;
350: idx += 6;
351: }
352: /* backward solve the L^T */
353: for (i=n-1; i>=0; i--){
354: v = aa + 36*diag[i] - 36;
355: vi = aj + diag[i] - 1;
356: nz = diag[i] - ai[i];
357: idt = 6*i;
358: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
359: s6 = x[5+idt];
360: while (nz--) {
361: idx = 6*(*vi--);
362: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
363: x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
364: x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
365: x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
366: x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
367: x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
368: v -= 36;
369: }
370: }
371: VecRestoreArray(bb,&b);
372: VecRestoreArray(xx,&x);
373: PetscLogFlops(2*36*(a->nz) - 6*A->n);
374: return(0);
375: }
379: int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
380: {
381: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
382: int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
383: int *diag = a->diag,oidx;
384: MatScalar *aa=a->a,*v;
385: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
386: PetscScalar *x,*b;
389: VecCopy(bb,xx);
390: VecGetArray(bb,&b);
391: VecGetArray(xx,&x);
393: /* forward solve the U^T */
394: idx = 0;
395: for (i=0; i<n; i++) {
397: v = aa + 49*diag[i];
398: /* multiply by the inverse of the block diagonal */
399: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
400: x6 = x[5+idx]; x7 = x[6+idx];
401: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
402: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
403: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
404: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
405: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
406: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
407: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
408: v += 49;
410: vi = aj + diag[i] + 1;
411: nz = ai[i+1] - diag[i] - 1;
412: while (nz--) {
413: oidx = 7*(*vi++);
414: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
415: x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
416: x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
417: x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
418: x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
419: x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
420: x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
421: v += 49;
422: }
423: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
424: x[5+idx] = s6;x[6+idx] = s7;
425: idx += 7;
426: }
427: /* backward solve the L^T */
428: for (i=n-1; i>=0; i--){
429: v = aa + 49*diag[i] - 49;
430: vi = aj + diag[i] - 1;
431: nz = diag[i] - ai[i];
432: idt = 7*i;
433: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
434: s6 = x[5+idt];s7 = x[6+idt];
435: while (nz--) {
436: idx = 7*(*vi--);
437: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
438: x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
439: x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
440: x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
441: x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
442: x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
443: x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
444: v -= 49;
445: }
446: }
447: VecRestoreArray(bb,&b);
448: VecRestoreArray(xx,&x);
449: PetscLogFlops(2*49*(a->nz) - 7*A->n);
450: return(0);
451: }
453: /*---------------------------------------------------------------------------------------------*/
456: int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
457: {
458: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
459: IS iscol=a->col,isrow=a->row;
460: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
461: int *diag = a->diag;
462: MatScalar *aa=a->a,*v;
463: PetscScalar s1,*x,*b,*t;
466: VecGetArray(bb,&b);
467: VecGetArray(xx,&x);
468: t = a->solve_work;
470: ISGetIndices(isrow,&rout); r = rout;
471: ISGetIndices(iscol,&cout); c = cout;
473: /* copy the b into temp work space according to permutation */
474: for (i=0; i<n; i++) {
475: t[i] = b[c[i]];
476: }
478: /* forward solve the U^T */
479: for (i=0; i<n; i++) {
481: v = aa + diag[i];
482: /* multiply by the inverse of the block diagonal */
483: s1 = (*v++)*t[i];
484: vi = aj + diag[i] + 1;
485: nz = ai[i+1] - diag[i] - 1;
486: while (nz--) {
487: t[*vi++] -= (*v++)*s1;
488: }
489: t[i] = s1;
490: }
491: /* backward solve the L^T */
492: for (i=n-1; i>=0; i--){
493: v = aa + diag[i] - 1;
494: vi = aj + diag[i] - 1;
495: nz = diag[i] - ai[i];
496: s1 = t[i];
497: while (nz--) {
498: t[*vi--] -= (*v--)*s1;
499: }
500: }
502: /* copy t into x according to permutation */
503: for (i=0; i<n; i++) {
504: x[r[i]] = t[i];
505: }
507: ISRestoreIndices(isrow,&rout);
508: ISRestoreIndices(iscol,&cout);
509: VecRestoreArray(bb,&b);
510: VecRestoreArray(xx,&x);
511: PetscLogFlops(2*(a->nz) - A->n);
512: return(0);
513: }
517: int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
518: {
519: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
520: IS iscol=a->col,isrow=a->row;
521: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
522: int *diag = a->diag,ii,ic,ir,oidx;
523: MatScalar *aa=a->a,*v;
524: PetscScalar s1,s2,x1,x2;
525: PetscScalar *x,*b,*t;
528: VecGetArray(bb,&b);
529: VecGetArray(xx,&x);
530: t = a->solve_work;
532: ISGetIndices(isrow,&rout); r = rout;
533: ISGetIndices(iscol,&cout); c = cout;
535: /* copy the b into temp work space according to permutation */
536: ii = 0;
537: for (i=0; i<n; i++) {
538: ic = 2*c[i];
539: t[ii] = b[ic];
540: t[ii+1] = b[ic+1];
541: ii += 2;
542: }
544: /* forward solve the U^T */
545: idx = 0;
546: for (i=0; i<n; i++) {
548: v = aa + 4*diag[i];
549: /* multiply by the inverse of the block diagonal */
550: x1 = t[idx]; x2 = t[1+idx];
551: s1 = v[0]*x1 + v[1]*x2;
552: s2 = v[2]*x1 + v[3]*x2;
553: v += 4;
555: vi = aj + diag[i] + 1;
556: nz = ai[i+1] - diag[i] - 1;
557: while (nz--) {
558: oidx = 2*(*vi++);
559: t[oidx] -= v[0]*s1 + v[1]*s2;
560: t[oidx+1] -= v[2]*s1 + v[3]*s2;
561: v += 4;
562: }
563: t[idx] = s1;t[1+idx] = s2;
564: idx += 2;
565: }
566: /* backward solve the L^T */
567: for (i=n-1; i>=0; i--){
568: v = aa + 4*diag[i] - 4;
569: vi = aj + diag[i] - 1;
570: nz = diag[i] - ai[i];
571: idt = 2*i;
572: s1 = t[idt]; s2 = t[1+idt];
573: while (nz--) {
574: idx = 2*(*vi--);
575: t[idx] -= v[0]*s1 + v[1]*s2;
576: t[idx+1] -= v[2]*s1 + v[3]*s2;
577: v -= 4;
578: }
579: }
581: /* copy t into x according to permutation */
582: ii = 0;
583: for (i=0; i<n; i++) {
584: ir = 2*r[i];
585: x[ir] = t[ii];
586: x[ir+1] = t[ii+1];
587: ii += 2;
588: }
590: ISRestoreIndices(isrow,&rout);
591: ISRestoreIndices(iscol,&cout);
592: VecRestoreArray(bb,&b);
593: VecRestoreArray(xx,&x);
594: PetscLogFlops(2*4*(a->nz) - 2*A->n);
595: return(0);
596: }
600: int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
601: {
602: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
603: IS iscol=a->col,isrow=a->row;
604: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
605: int *diag = a->diag,ii,ic,ir,oidx;
606: MatScalar *aa=a->a,*v;
607: PetscScalar s1,s2,s3,x1,x2,x3;
608: PetscScalar *x,*b,*t;
611: VecGetArray(bb,&b);
612: VecGetArray(xx,&x);
613: t = a->solve_work;
615: ISGetIndices(isrow,&rout); r = rout;
616: ISGetIndices(iscol,&cout); c = cout;
618: /* copy the b into temp work space according to permutation */
619: ii = 0;
620: for (i=0; i<n; i++) {
621: ic = 3*c[i];
622: t[ii] = b[ic];
623: t[ii+1] = b[ic+1];
624: t[ii+2] = b[ic+2];
625: ii += 3;
626: }
628: /* forward solve the U^T */
629: idx = 0;
630: for (i=0; i<n; i++) {
632: v = aa + 9*diag[i];
633: /* multiply by the inverse of the block diagonal */
634: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
635: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
636: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
637: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
638: v += 9;
640: vi = aj + diag[i] + 1;
641: nz = ai[i+1] - diag[i] - 1;
642: while (nz--) {
643: oidx = 3*(*vi++);
644: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
645: t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
646: t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
647: v += 9;
648: }
649: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
650: idx += 3;
651: }
652: /* backward solve the L^T */
653: for (i=n-1; i>=0; i--){
654: v = aa + 9*diag[i] - 9;
655: vi = aj + diag[i] - 1;
656: nz = diag[i] - ai[i];
657: idt = 3*i;
658: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
659: while (nz--) {
660: idx = 3*(*vi--);
661: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
662: t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
663: t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
664: v -= 9;
665: }
666: }
668: /* copy t into x according to permutation */
669: ii = 0;
670: for (i=0; i<n; i++) {
671: ir = 3*r[i];
672: x[ir] = t[ii];
673: x[ir+1] = t[ii+1];
674: x[ir+2] = t[ii+2];
675: ii += 3;
676: }
678: ISRestoreIndices(isrow,&rout);
679: ISRestoreIndices(iscol,&cout);
680: VecRestoreArray(bb,&b);
681: VecRestoreArray(xx,&x);
682: PetscLogFlops(2*9*(a->nz) - 3*A->n);
683: return(0);
684: }
688: int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
689: {
690: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
691: IS iscol=a->col,isrow=a->row;
692: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
693: int *diag = a->diag,ii,ic,ir,oidx;
694: MatScalar *aa=a->a,*v;
695: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
696: PetscScalar *x,*b,*t;
699: VecGetArray(bb,&b);
700: VecGetArray(xx,&x);
701: t = a->solve_work;
703: ISGetIndices(isrow,&rout); r = rout;
704: ISGetIndices(iscol,&cout); c = cout;
706: /* copy the b into temp work space according to permutation */
707: ii = 0;
708: for (i=0; i<n; i++) {
709: ic = 4*c[i];
710: t[ii] = b[ic];
711: t[ii+1] = b[ic+1];
712: t[ii+2] = b[ic+2];
713: t[ii+3] = b[ic+3];
714: ii += 4;
715: }
717: /* forward solve the U^T */
718: idx = 0;
719: for (i=0; i<n; i++) {
721: v = aa + 16*diag[i];
722: /* multiply by the inverse of the block diagonal */
723: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
724: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
725: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
726: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
727: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
728: v += 16;
730: vi = aj + diag[i] + 1;
731: nz = ai[i+1] - diag[i] - 1;
732: while (nz--) {
733: oidx = 4*(*vi++);
734: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
735: t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
736: t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
737: t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
738: v += 16;
739: }
740: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
741: idx += 4;
742: }
743: /* backward solve the L^T */
744: for (i=n-1; i>=0; i--){
745: v = aa + 16*diag[i] - 16;
746: vi = aj + diag[i] - 1;
747: nz = diag[i] - ai[i];
748: idt = 4*i;
749: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
750: while (nz--) {
751: idx = 4*(*vi--);
752: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
753: t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
754: t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
755: t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
756: v -= 16;
757: }
758: }
760: /* copy t into x according to permutation */
761: ii = 0;
762: for (i=0; i<n; i++) {
763: ir = 4*r[i];
764: x[ir] = t[ii];
765: x[ir+1] = t[ii+1];
766: x[ir+2] = t[ii+2];
767: x[ir+3] = t[ii+3];
768: ii += 4;
769: }
771: ISRestoreIndices(isrow,&rout);
772: ISRestoreIndices(iscol,&cout);
773: VecRestoreArray(bb,&b);
774: VecRestoreArray(xx,&x);
775: PetscLogFlops(2*16*(a->nz) - 4*A->n);
776: return(0);
777: }
781: int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
782: {
783: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
784: IS iscol=a->col,isrow=a->row;
785: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
786: int *diag = a->diag,ii,ic,ir,oidx;
787: MatScalar *aa=a->a,*v;
788: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
789: PetscScalar *x,*b,*t;
792: VecGetArray(bb,&b);
793: VecGetArray(xx,&x);
794: t = a->solve_work;
796: ISGetIndices(isrow,&rout); r = rout;
797: ISGetIndices(iscol,&cout); c = cout;
799: /* copy the b into temp work space according to permutation */
800: ii = 0;
801: for (i=0; i<n; i++) {
802: ic = 5*c[i];
803: t[ii] = b[ic];
804: t[ii+1] = b[ic+1];
805: t[ii+2] = b[ic+2];
806: t[ii+3] = b[ic+3];
807: t[ii+4] = b[ic+4];
808: ii += 5;
809: }
811: /* forward solve the U^T */
812: idx = 0;
813: for (i=0; i<n; i++) {
815: v = aa + 25*diag[i];
816: /* multiply by the inverse of the block diagonal */
817: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
818: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
819: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
820: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
821: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
822: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
823: v += 25;
825: vi = aj + diag[i] + 1;
826: nz = ai[i+1] - diag[i] - 1;
827: while (nz--) {
828: oidx = 5*(*vi++);
829: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
830: t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
831: t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
832: t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
833: t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
834: v += 25;
835: }
836: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
837: idx += 5;
838: }
839: /* backward solve the L^T */
840: for (i=n-1; i>=0; i--){
841: v = aa + 25*diag[i] - 25;
842: vi = aj + diag[i] - 1;
843: nz = diag[i] - ai[i];
844: idt = 5*i;
845: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
846: while (nz--) {
847: idx = 5*(*vi--);
848: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
849: t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
850: t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
851: t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
852: t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
853: v -= 25;
854: }
855: }
857: /* copy t into x according to permutation */
858: ii = 0;
859: for (i=0; i<n; i++) {
860: ir = 5*r[i];
861: x[ir] = t[ii];
862: x[ir+1] = t[ii+1];
863: x[ir+2] = t[ii+2];
864: x[ir+3] = t[ii+3];
865: x[ir+4] = t[ii+4];
866: ii += 5;
867: }
869: ISRestoreIndices(isrow,&rout);
870: ISRestoreIndices(iscol,&cout);
871: VecRestoreArray(bb,&b);
872: VecRestoreArray(xx,&x);
873: PetscLogFlops(2*25*(a->nz) - 5*A->n);
874: return(0);
875: }
879: int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
880: {
881: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
882: IS iscol=a->col,isrow=a->row;
883: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
884: int *diag = a->diag,ii,ic,ir,oidx;
885: MatScalar *aa=a->a,*v;
886: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
887: PetscScalar *x,*b,*t;
890: VecGetArray(bb,&b);
891: VecGetArray(xx,&x);
892: t = a->solve_work;
894: ISGetIndices(isrow,&rout); r = rout;
895: ISGetIndices(iscol,&cout); c = cout;
897: /* copy the b into temp work space according to permutation */
898: ii = 0;
899: for (i=0; i<n; i++) {
900: ic = 6*c[i];
901: t[ii] = b[ic];
902: t[ii+1] = b[ic+1];
903: t[ii+2] = b[ic+2];
904: t[ii+3] = b[ic+3];
905: t[ii+4] = b[ic+4];
906: t[ii+5] = b[ic+5];
907: ii += 6;
908: }
910: /* forward solve the U^T */
911: idx = 0;
912: for (i=0; i<n; i++) {
914: v = aa + 36*diag[i];
915: /* multiply by the inverse of the block diagonal */
916: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
917: x6 = t[5+idx];
918: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
919: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
920: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
921: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
922: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
923: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
924: v += 36;
926: vi = aj + diag[i] + 1;
927: nz = ai[i+1] - diag[i] - 1;
928: while (nz--) {
929: oidx = 6*(*vi++);
930: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
931: t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
932: t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
933: t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
934: t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
935: t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
936: v += 36;
937: }
938: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
939: t[5+idx] = s6;
940: idx += 6;
941: }
942: /* backward solve the L^T */
943: for (i=n-1; i>=0; i--){
944: v = aa + 36*diag[i] - 36;
945: vi = aj + diag[i] - 1;
946: nz = diag[i] - ai[i];
947: idt = 6*i;
948: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
949: s6 = t[5+idt];
950: while (nz--) {
951: idx = 6*(*vi--);
952: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
953: t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
954: t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
955: t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
956: t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
957: t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
958: v -= 36;
959: }
960: }
962: /* copy t into x according to permutation */
963: ii = 0;
964: for (i=0; i<n; i++) {
965: ir = 6*r[i];
966: x[ir] = t[ii];
967: x[ir+1] = t[ii+1];
968: x[ir+2] = t[ii+2];
969: x[ir+3] = t[ii+3];
970: x[ir+4] = t[ii+4];
971: x[ir+5] = t[ii+5];
972: ii += 6;
973: }
975: ISRestoreIndices(isrow,&rout);
976: ISRestoreIndices(iscol,&cout);
977: VecRestoreArray(bb,&b);
978: VecRestoreArray(xx,&x);
979: PetscLogFlops(2*36*(a->nz) - 6*A->n);
980: return(0);
981: }
985: int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
986: {
987: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
988: IS iscol=a->col,isrow=a->row;
989: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
990: int *diag = a->diag,ii,ic,ir,oidx;
991: MatScalar *aa=a->a,*v;
992: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
993: PetscScalar *x,*b,*t;
996: VecGetArray(bb,&b);
997: VecGetArray(xx,&x);
998: t = a->solve_work;
1000: ISGetIndices(isrow,&rout); r = rout;
1001: ISGetIndices(iscol,&cout); c = cout;
1003: /* copy the b into temp work space according to permutation */
1004: ii = 0;
1005: for (i=0; i<n; i++) {
1006: ic = 7*c[i];
1007: t[ii] = b[ic];
1008: t[ii+1] = b[ic+1];
1009: t[ii+2] = b[ic+2];
1010: t[ii+3] = b[ic+3];
1011: t[ii+4] = b[ic+4];
1012: t[ii+5] = b[ic+5];
1013: t[ii+6] = b[ic+6];
1014: ii += 7;
1015: }
1017: /* forward solve the U^T */
1018: idx = 0;
1019: for (i=0; i<n; i++) {
1021: v = aa + 49*diag[i];
1022: /* multiply by the inverse of the block diagonal */
1023: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1024: x6 = t[5+idx]; x7 = t[6+idx];
1025: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
1026: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1027: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1028: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1029: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1030: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1031: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1032: v += 49;
1034: vi = aj + diag[i] + 1;
1035: nz = ai[i+1] - diag[i] - 1;
1036: while (nz--) {
1037: oidx = 7*(*vi++);
1038: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1039: t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1040: t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1041: t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1042: t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1043: t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1044: t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1045: v += 49;
1046: }
1047: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1048: t[5+idx] = s6;t[6+idx] = s7;
1049: idx += 7;
1050: }
1051: /* backward solve the L^T */
1052: for (i=n-1; i>=0; i--){
1053: v = aa + 49*diag[i] - 49;
1054: vi = aj + diag[i] - 1;
1055: nz = diag[i] - ai[i];
1056: idt = 7*i;
1057: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1058: s6 = t[5+idt];s7 = t[6+idt];
1059: while (nz--) {
1060: idx = 7*(*vi--);
1061: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1062: t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1063: t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1064: t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1065: t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1066: t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1067: t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1068: v -= 49;
1069: }
1070: }
1072: /* copy t into x according to permutation */
1073: ii = 0;
1074: for (i=0; i<n; i++) {
1075: ir = 7*r[i];
1076: x[ir] = t[ii];
1077: x[ir+1] = t[ii+1];
1078: x[ir+2] = t[ii+2];
1079: x[ir+3] = t[ii+3];
1080: x[ir+4] = t[ii+4];
1081: x[ir+5] = t[ii+5];
1082: x[ir+6] = t[ii+6];
1083: ii += 7;
1084: }
1086: ISRestoreIndices(isrow,&rout);
1087: ISRestoreIndices(iscol,&cout);
1088: VecRestoreArray(bb,&b);
1089: VecRestoreArray(xx,&x);
1090: PetscLogFlops(2*49*(a->nz) - 7*A->n);
1091: return(0);
1092: }
1094: /* ----------------------------------------------------------- */
1097: int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1098: {
1099: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1100: IS iscol=a->col,isrow=a->row;
1101: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1102: int nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
1103: MatScalar *aa=a->a,*v;
1104: PetscScalar *x,*b,*s,*t,*ls;
1107: VecGetArray(bb,&b);
1108: VecGetArray(xx,&x);
1109: t = a->solve_work;
1111: ISGetIndices(isrow,&rout); r = rout;
1112: ISGetIndices(iscol,&cout); c = cout + (n-1);
1114: /* forward solve the lower triangular */
1115: PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
1116: for (i=1; i<n; i++) {
1117: v = aa + bs2*ai[i];
1118: vi = aj + ai[i];
1119: nz = a->diag[i] - ai[i];
1120: s = t + bs*i;
1121: PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
1122: while (nz--) {
1123: Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1124: v += bs2;
1125: }
1126: }
1127: /* backward solve the upper triangular */
1128: ls = a->solve_work + A->n;
1129: for (i=n-1; i>=0; i--){
1130: v = aa + bs2*(a->diag[i] + 1);
1131: vi = aj + a->diag[i] + 1;
1132: nz = ai[i+1] - a->diag[i] - 1;
1133: PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
1134: while (nz--) {
1135: Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1136: v += bs2;
1137: }
1138: Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1139: PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
1140: }
1142: ISRestoreIndices(isrow,&rout);
1143: ISRestoreIndices(iscol,&cout);
1144: VecRestoreArray(bb,&b);
1145: VecRestoreArray(xx,&x);
1146: PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n);
1147: return(0);
1148: }
1152: int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1153: {
1154: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1155: IS iscol=a->col,isrow=a->row;
1156: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1157: int *diag = a->diag;
1158: MatScalar *aa=a->a,*v;
1159: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1160: PetscScalar *x,*b,*t;
1163: VecGetArray(bb,&b);
1164: VecGetArray(xx,&x);
1165: t = a->solve_work;
1167: ISGetIndices(isrow,&rout); r = rout;
1168: ISGetIndices(iscol,&cout); c = cout + (n-1);
1170: /* forward solve the lower triangular */
1171: idx = 7*(*r++);
1172: t[0] = b[idx]; t[1] = b[1+idx];
1173: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1174: t[5] = b[5+idx]; t[6] = b[6+idx];
1176: for (i=1; i<n; i++) {
1177: v = aa + 49*ai[i];
1178: vi = aj + ai[i];
1179: nz = diag[i] - ai[i];
1180: idx = 7*(*r++);
1181: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1182: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1183: while (nz--) {
1184: idx = 7*(*vi++);
1185: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
1186: x4 = t[3+idx];x5 = t[4+idx];
1187: x6 = t[5+idx];x7 = t[6+idx];
1188: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1189: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1190: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1191: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1192: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1193: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1194: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1195: v += 49;
1196: }
1197: idx = 7*i;
1198: t[idx] = s1;t[1+idx] = s2;
1199: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1200: t[5+idx] = s6;t[6+idx] = s7;
1201: }
1202: /* backward solve the upper triangular */
1203: for (i=n-1; i>=0; i--){
1204: v = aa + 49*diag[i] + 49;
1205: vi = aj + diag[i] + 1;
1206: nz = ai[i+1] - diag[i] - 1;
1207: idt = 7*i;
1208: s1 = t[idt]; s2 = t[1+idt];
1209: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1210: s6 = t[5+idt];s7 = t[6+idt];
1211: while (nz--) {
1212: idx = 7*(*vi++);
1213: x1 = t[idx]; x2 = t[1+idx];
1214: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1215: x6 = t[5+idx]; x7 = t[6+idx];
1216: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1217: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1218: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1219: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1220: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1221: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1222: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1223: v += 49;
1224: }
1225: idc = 7*(*c--);
1226: v = aa + 49*diag[i];
1227: x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+
1228: v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1229: x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1230: v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1231: x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1232: v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1233: x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1234: v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1235: x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1236: v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1237: x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1238: v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1239: x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1240: v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1241: }
1243: ISRestoreIndices(isrow,&rout);
1244: ISRestoreIndices(iscol,&cout);
1245: VecRestoreArray(bb,&b);
1246: VecRestoreArray(xx,&x);
1247: PetscLogFlops(2*49*(a->nz) - 7*A->n);
1248: return(0);
1249: }
1253: int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1254: {
1255: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1256: int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1257: int ierr,*diag = a->diag,jdx;
1258: MatScalar *aa=a->a,*v;
1259: PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1262: VecGetArray(bb,&b);
1263: VecGetArray(xx,&x);
1264: /* forward solve the lower triangular */
1265: idx = 0;
1266: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
1267: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1268: x[6] = b[6+idx];
1269: for (i=1; i<n; i++) {
1270: v = aa + 49*ai[i];
1271: vi = aj + ai[i];
1272: nz = diag[i] - ai[i];
1273: idx = 7*i;
1274: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1275: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1276: s7 = b[6+idx];
1277: while (nz--) {
1278: jdx = 7*(*vi++);
1279: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
1280: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1281: x7 = x[6+jdx];
1282: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1283: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1284: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1285: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1286: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1287: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1288: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1289: v += 49;
1290: }
1291: x[idx] = s1;
1292: x[1+idx] = s2;
1293: x[2+idx] = s3;
1294: x[3+idx] = s4;
1295: x[4+idx] = s5;
1296: x[5+idx] = s6;
1297: x[6+idx] = s7;
1298: }
1299: /* backward solve the upper triangular */
1300: for (i=n-1; i>=0; i--){
1301: v = aa + 49*diag[i] + 49;
1302: vi = aj + diag[i] + 1;
1303: nz = ai[i+1] - diag[i] - 1;
1304: idt = 7*i;
1305: s1 = x[idt]; s2 = x[1+idt];
1306: s3 = x[2+idt]; s4 = x[3+idt];
1307: s5 = x[4+idt]; s6 = x[5+idt];
1308: s7 = x[6+idt];
1309: while (nz--) {
1310: idx = 7*(*vi++);
1311: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
1312: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1313: x7 = x[6+idx];
1314: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1315: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1316: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1317: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1318: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1319: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1320: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1321: v += 49;
1322: }
1323: v = aa + 49*diag[i];
1324: x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4
1325: + v[28]*s5 + v[35]*s6 + v[42]*s7;
1326: x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4
1327: + v[29]*s5 + v[36]*s6 + v[43]*s7;
1328: x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4
1329: + v[30]*s5 + v[37]*s6 + v[44]*s7;
1330: x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4
1331: + v[31]*s5 + v[38]*s6 + v[45]*s7;
1332: x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4
1333: + v[32]*s5 + v[39]*s6 + v[46]*s7;
1334: x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4
1335: + v[33]*s5 + v[40]*s6 + v[47]*s7;
1336: x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4
1337: + v[34]*s5 + v[41]*s6 + v[48]*s7;
1338: }
1340: VecRestoreArray(bb,&b);
1341: VecRestoreArray(xx,&x);
1342: PetscLogFlops(2*36*(a->nz) - 6*A->n);
1343: return(0);
1344: }
1348: int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1349: {
1350: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1351: IS iscol=a->col,isrow=a->row;
1352: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1353: int *diag = a->diag;
1354: MatScalar *aa=a->a,*v;
1355: PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1358: VecGetArray(bb,&b);
1359: VecGetArray(xx,&x);
1360: t = a->solve_work;
1362: ISGetIndices(isrow,&rout); r = rout;
1363: ISGetIndices(iscol,&cout); c = cout + (n-1);
1365: /* forward solve the lower triangular */
1366: idx = 6*(*r++);
1367: t[0] = b[idx]; t[1] = b[1+idx];
1368: t[2] = b[2+idx]; t[3] = b[3+idx];
1369: t[4] = b[4+idx]; t[5] = b[5+idx];
1370: for (i=1; i<n; i++) {
1371: v = aa + 36*ai[i];
1372: vi = aj + ai[i];
1373: nz = diag[i] - ai[i];
1374: idx = 6*(*r++);
1375: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1376: s5 = b[4+idx]; s6 = b[5+idx];
1377: while (nz--) {
1378: idx = 6*(*vi++);
1379: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1380: x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1381: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1382: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1383: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1384: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1385: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1386: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1387: v += 36;
1388: }
1389: idx = 6*i;
1390: t[idx] = s1;t[1+idx] = s2;
1391: t[2+idx] = s3;t[3+idx] = s4;
1392: t[4+idx] = s5;t[5+idx] = s6;
1393: }
1394: /* backward solve the upper triangular */
1395: for (i=n-1; i>=0; i--){
1396: v = aa + 36*diag[i] + 36;
1397: vi = aj + diag[i] + 1;
1398: nz = ai[i+1] - diag[i] - 1;
1399: idt = 6*i;
1400: s1 = t[idt]; s2 = t[1+idt];
1401: s3 = t[2+idt];s4 = t[3+idt];
1402: s5 = t[4+idt];s6 = t[5+idt];
1403: while (nz--) {
1404: idx = 6*(*vi++);
1405: x1 = t[idx]; x2 = t[1+idx];
1406: x3 = t[2+idx]; x4 = t[3+idx];
1407: x5 = t[4+idx]; x6 = t[5+idx];
1408: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1409: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1410: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1411: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1412: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1413: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1414: v += 36;
1415: }
1416: idc = 6*(*c--);
1417: v = aa + 36*diag[i];
1418: x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+
1419: v[18]*s4+v[24]*s5+v[30]*s6;
1420: x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1421: v[19]*s4+v[25]*s5+v[31]*s6;
1422: x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1423: v[20]*s4+v[26]*s5+v[32]*s6;
1424: x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1425: v[21]*s4+v[27]*s5+v[33]*s6;
1426: x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1427: v[22]*s4+v[28]*s5+v[34]*s6;
1428: x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1429: v[23]*s4+v[29]*s5+v[35]*s6;
1430: }
1432: ISRestoreIndices(isrow,&rout);
1433: ISRestoreIndices(iscol,&cout);
1434: VecRestoreArray(bb,&b);
1435: VecRestoreArray(xx,&x);
1436: PetscLogFlops(2*36*(a->nz) - 6*A->n);
1437: return(0);
1438: }
1442: int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1443: {
1444: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1445: int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1446: int ierr,*diag = a->diag,jdx;
1447: MatScalar *aa=a->a,*v;
1448: PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1451: VecGetArray(bb,&b);
1452: VecGetArray(xx,&x);
1453: /* forward solve the lower triangular */
1454: idx = 0;
1455: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
1456: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1457: for (i=1; i<n; i++) {
1458: v = aa + 36*ai[i];
1459: vi = aj + ai[i];
1460: nz = diag[i] - ai[i];
1461: idx = 6*i;
1462: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1463: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1464: while (nz--) {
1465: jdx = 6*(*vi++);
1466: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
1467: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1468: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1469: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1470: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1471: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1472: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1473: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1474: v += 36;
1475: }
1476: x[idx] = s1;
1477: x[1+idx] = s2;
1478: x[2+idx] = s3;
1479: x[3+idx] = s4;
1480: x[4+idx] = s5;
1481: x[5+idx] = s6;
1482: }
1483: /* backward solve the upper triangular */
1484: for (i=n-1; i>=0; i--){
1485: v = aa + 36*diag[i] + 36;
1486: vi = aj + diag[i] + 1;
1487: nz = ai[i+1] - diag[i] - 1;
1488: idt = 6*i;
1489: s1 = x[idt]; s2 = x[1+idt];
1490: s3 = x[2+idt]; s4 = x[3+idt];
1491: s5 = x[4+idt]; s6 = x[5+idt];
1492: while (nz--) {
1493: idx = 6*(*vi++);
1494: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
1495: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1496: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1497: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1498: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1499: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1500: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1501: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1502: v += 36;
1503: }
1504: v = aa + 36*diag[i];
1505: x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1506: x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1507: x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1508: x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1509: x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1510: x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1511: }
1513: VecRestoreArray(bb,&b);
1514: VecRestoreArray(xx,&x);
1515: PetscLogFlops(2*36*(a->nz) - 6*A->n);
1516: return(0);
1517: }
1521: int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1522: {
1523: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1524: IS iscol=a->col,isrow=a->row;
1525: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1526: int *diag = a->diag;
1527: MatScalar *aa=a->a,*v;
1528: PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1531: VecGetArray(bb,&b);
1532: VecGetArray(xx,&x);
1533: t = a->solve_work;
1535: ISGetIndices(isrow,&rout); r = rout;
1536: ISGetIndices(iscol,&cout); c = cout + (n-1);
1538: /* forward solve the lower triangular */
1539: idx = 5*(*r++);
1540: t[0] = b[idx]; t[1] = b[1+idx];
1541: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1542: for (i=1; i<n; i++) {
1543: v = aa + 25*ai[i];
1544: vi = aj + ai[i];
1545: nz = diag[i] - ai[i];
1546: idx = 5*(*r++);
1547: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1548: s5 = b[4+idx];
1549: while (nz--) {
1550: idx = 5*(*vi++);
1551: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
1552: x4 = t[3+idx];x5 = t[4+idx];
1553: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1554: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1555: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1556: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1557: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1558: v += 25;
1559: }
1560: idx = 5*i;
1561: t[idx] = s1;t[1+idx] = s2;
1562: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1563: }
1564: /* backward solve the upper triangular */
1565: for (i=n-1; i>=0; i--){
1566: v = aa + 25*diag[i] + 25;
1567: vi = aj + diag[i] + 1;
1568: nz = ai[i+1] - diag[i] - 1;
1569: idt = 5*i;
1570: s1 = t[idt]; s2 = t[1+idt];
1571: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1572: while (nz--) {
1573: idx = 5*(*vi++);
1574: x1 = t[idx]; x2 = t[1+idx];
1575: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1576: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1577: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1578: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1579: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1580: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1581: v += 25;
1582: }
1583: idc = 5*(*c--);
1584: v = aa + 25*diag[i];
1585: x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+
1586: v[15]*s4+v[20]*s5;
1587: x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1588: v[16]*s4+v[21]*s5;
1589: x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1590: v[17]*s4+v[22]*s5;
1591: x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1592: v[18]*s4+v[23]*s5;
1593: x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1594: v[19]*s4+v[24]*s5;
1595: }
1597: ISRestoreIndices(isrow,&rout);
1598: ISRestoreIndices(iscol,&cout);
1599: VecRestoreArray(bb,&b);
1600: VecRestoreArray(xx,&x);
1601: PetscLogFlops(2*25*(a->nz) - 5*A->n);
1602: return(0);
1603: }
1607: int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1608: {
1609: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1610: int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1611: int ierr,*diag = a->diag,jdx;
1612: MatScalar *aa=a->a,*v;
1613: PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1616: VecGetArray(bb,&b);
1617: VecGetArray(xx,&x);
1618: /* forward solve the lower triangular */
1619: idx = 0;
1620: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1621: for (i=1; i<n; i++) {
1622: v = aa + 25*ai[i];
1623: vi = aj + ai[i];
1624: nz = diag[i] - ai[i];
1625: idx = 5*i;
1626: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1627: while (nz--) {
1628: jdx = 5*(*vi++);
1629: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1630: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1631: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1632: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1633: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1634: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1635: v += 25;
1636: }
1637: x[idx] = s1;
1638: x[1+idx] = s2;
1639: x[2+idx] = s3;
1640: x[3+idx] = s4;
1641: x[4+idx] = s5;
1642: }
1643: /* backward solve the upper triangular */
1644: for (i=n-1; i>=0; i--){
1645: v = aa + 25*diag[i] + 25;
1646: vi = aj + diag[i] + 1;
1647: nz = ai[i+1] - diag[i] - 1;
1648: idt = 5*i;
1649: s1 = x[idt]; s2 = x[1+idt];
1650: s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1651: while (nz--) {
1652: idx = 5*(*vi++);
1653: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1654: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1655: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1656: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1657: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1658: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1659: v += 25;
1660: }
1661: v = aa + 25*diag[i];
1662: x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5;
1663: x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5;
1664: x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5;
1665: x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5;
1666: x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5;
1667: }
1669: VecRestoreArray(bb,&b);
1670: VecRestoreArray(xx,&x);
1671: PetscLogFlops(2*25*(a->nz) - 5*A->n);
1672: return(0);
1673: }
1677: int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1678: {
1679: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1680: IS iscol=a->col,isrow=a->row;
1681: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1682: int *diag = a->diag;
1683: MatScalar *aa=a->a,*v;
1684: PetscScalar *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1687: VecGetArray(bb,&b);
1688: VecGetArray(xx,&x);
1689: t = a->solve_work;
1691: ISGetIndices(isrow,&rout); r = rout;
1692: ISGetIndices(iscol,&cout); c = cout + (n-1);
1694: /* forward solve the lower triangular */
1695: idx = 4*(*r++);
1696: t[0] = b[idx]; t[1] = b[1+idx];
1697: t[2] = b[2+idx]; t[3] = b[3+idx];
1698: for (i=1; i<n; i++) {
1699: v = aa + 16*ai[i];
1700: vi = aj + ai[i];
1701: nz = diag[i] - ai[i];
1702: idx = 4*(*r++);
1703: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1704: while (nz--) {
1705: idx = 4*(*vi++);
1706: x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1707: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1708: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1709: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1710: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1711: v += 16;
1712: }
1713: idx = 4*i;
1714: t[idx] = s1;t[1+idx] = s2;
1715: t[2+idx] = s3;t[3+idx] = s4;
1716: }
1717: /* backward solve the upper triangular */
1718: for (i=n-1; i>=0; i--){
1719: v = aa + 16*diag[i] + 16;
1720: vi = aj + diag[i] + 1;
1721: nz = ai[i+1] - diag[i] - 1;
1722: idt = 4*i;
1723: s1 = t[idt]; s2 = t[1+idt];
1724: s3 = t[2+idt];s4 = t[3+idt];
1725: while (nz--) {
1726: idx = 4*(*vi++);
1727: x1 = t[idx]; x2 = t[1+idx];
1728: x3 = t[2+idx]; x4 = t[3+idx];
1729: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1730: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1731: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1732: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1733: v += 16;
1734: }
1735: idc = 4*(*c--);
1736: v = aa + 16*diag[i];
1737: x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1738: x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1739: x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1740: x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1741: }
1743: ISRestoreIndices(isrow,&rout);
1744: ISRestoreIndices(iscol,&cout);
1745: VecRestoreArray(bb,&b);
1746: VecRestoreArray(xx,&x);
1747: PetscLogFlops(2*16*(a->nz) - 4*A->n);
1748: return(0);
1749: }
1753: int MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1754: {
1755: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1756: IS iscol=a->col,isrow=a->row;
1757: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1758: int *diag = a->diag;
1759: MatScalar *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1760: PetscScalar *x,*b;
1763: VecGetArray(bb,&b);
1764: VecGetArray(xx,&x);
1765: t = (MatScalar *)a->solve_work;
1767: ISGetIndices(isrow,&rout); r = rout;
1768: ISGetIndices(iscol,&cout); c = cout + (n-1);
1770: /* forward solve the lower triangular */
1771: idx = 4*(*r++);
1772: t[0] = (MatScalar)b[idx];
1773: t[1] = (MatScalar)b[1+idx];
1774: t[2] = (MatScalar)b[2+idx];
1775: t[3] = (MatScalar)b[3+idx];
1776: for (i=1; i<n; i++) {
1777: v = aa + 16*ai[i];
1778: vi = aj + ai[i];
1779: nz = diag[i] - ai[i];
1780: idx = 4*(*r++);
1781: s1 = (MatScalar)b[idx];
1782: s2 = (MatScalar)b[1+idx];
1783: s3 = (MatScalar)b[2+idx];
1784: s4 = (MatScalar)b[3+idx];
1785: while (nz--) {
1786: idx = 4*(*vi++);
1787: x1 = t[idx];
1788: x2 = t[1+idx];
1789: x3 = t[2+idx];
1790: x4 = t[3+idx];
1791: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1792: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1793: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1794: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1795: v += 16;
1796: }
1797: idx = 4*i;
1798: t[idx] = s1;
1799: t[1+idx] = s2;
1800: t[2+idx] = s3;
1801: t[3+idx] = s4;
1802: }
1803: /* backward solve the upper triangular */
1804: for (i=n-1; i>=0; i--){
1805: v = aa + 16*diag[i] + 16;
1806: vi = aj + diag[i] + 1;
1807: nz = ai[i+1] - diag[i] - 1;
1808: idt = 4*i;
1809: s1 = t[idt];
1810: s2 = t[1+idt];
1811: s3 = t[2+idt];
1812: s4 = t[3+idt];
1813: while (nz--) {
1814: idx = 4*(*vi++);
1815: x1 = t[idx];
1816: x2 = t[1+idx];
1817: x3 = t[2+idx];
1818: x4 = t[3+idx];
1819: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1820: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1821: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1822: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1823: v += 16;
1824: }
1825: idc = 4*(*c--);
1826: v = aa + 16*diag[i];
1827: t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1828: t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1829: t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1830: t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1831: x[idc] = (PetscScalar)t[idt];
1832: x[1+idc] = (PetscScalar)t[1+idt];
1833: x[2+idc] = (PetscScalar)t[2+idt];
1834: x[3+idc] = (PetscScalar)t[3+idt];
1835: }
1837: ISRestoreIndices(isrow,&rout);
1838: ISRestoreIndices(iscol,&cout);
1839: VecRestoreArray(bb,&b);
1840: VecRestoreArray(xx,&x);
1841: PetscLogFlops(2*16*(a->nz) - 4*A->n);
1842: return(0);
1843: }
1845: #if defined (PETSC_HAVE_SSE)
1847: #include PETSC_HAVE_SSE
1851: int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1852: {
1853: /*
1854: Note: This code uses demotion of double
1855: to float when performing the mixed-mode computation.
1856: This may not be numerically reasonable for all applications.
1857: */
1858: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1859: IS iscol=a->col,isrow=a->row;
1860: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1861: int *diag = a->diag,ai16;
1862: MatScalar *aa=a->a,*v;
1863: PetscScalar *x,*b,*t;
1865: /* Make space in temp stack for 16 Byte Aligned arrays */
1866: float ssealignedspace[11],*tmps,*tmpx;
1867: unsigned long offset;
1868:
1870: SSE_SCOPE_BEGIN;
1872: offset = (unsigned long)ssealignedspace % 16;
1873: if (offset) offset = (16 - offset)/4;
1874: tmps = &ssealignedspace[offset];
1875: tmpx = &ssealignedspace[offset+4];
1876: PREFETCH_NTA(aa+16*ai[1]);
1878: VecGetArray(bb,&b);
1879: VecGetArray(xx,&x);
1880: t = a->solve_work;
1882: ISGetIndices(isrow,&rout); r = rout;
1883: ISGetIndices(iscol,&cout); c = cout + (n-1);
1885: /* forward solve the lower triangular */
1886: idx = 4*(*r++);
1887: t[0] = b[idx]; t[1] = b[1+idx];
1888: t[2] = b[2+idx]; t[3] = b[3+idx];
1889: v = aa + 16*ai[1];
1891: for (i=1; i<n;) {
1892: PREFETCH_NTA(&v[8]);
1893: vi = aj + ai[i];
1894: nz = diag[i] - ai[i];
1895: idx = 4*(*r++);
1897: /* Demote sum from double to float */
1898: CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1899: LOAD_PS(tmps,XMM7);
1901: while (nz--) {
1902: PREFETCH_NTA(&v[16]);
1903: idx = 4*(*vi++);
1904:
1905: /* Demote solution (so far) from double to float */
1906: CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
1908: /* 4x4 Matrix-Vector product with negative accumulation: */
1909: SSE_INLINE_BEGIN_2(tmpx,v)
1910: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1912: /* First Column */
1913: SSE_COPY_PS(XMM0,XMM6)
1914: SSE_SHUFFLE(XMM0,XMM0,0x00)
1915: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1916: SSE_SUB_PS(XMM7,XMM0)
1917:
1918: /* Second Column */
1919: SSE_COPY_PS(XMM1,XMM6)
1920: SSE_SHUFFLE(XMM1,XMM1,0x55)
1921: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1922: SSE_SUB_PS(XMM7,XMM1)
1923:
1924: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1925:
1926: /* Third Column */
1927: SSE_COPY_PS(XMM2,XMM6)
1928: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1929: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1930: SSE_SUB_PS(XMM7,XMM2)
1932: /* Fourth Column */
1933: SSE_COPY_PS(XMM3,XMM6)
1934: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1935: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1936: SSE_SUB_PS(XMM7,XMM3)
1937: SSE_INLINE_END_2
1938:
1939: v += 16;
1940: }
1941: idx = 4*i;
1942: v = aa + 16*ai[++i];
1943: PREFETCH_NTA(v);
1944: STORE_PS(tmps,XMM7);
1946: /* Promote result from float to double */
1947: CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1948: }
1949: /* backward solve the upper triangular */
1950: idt = 4*(n-1);
1951: ai16 = 16*diag[n-1];
1952: v = aa + ai16 + 16;
1953: for (i=n-1; i>=0;){
1954: PREFETCH_NTA(&v[8]);
1955: vi = aj + diag[i] + 1;
1956: nz = ai[i+1] - diag[i] - 1;
1957:
1958: /* Demote accumulator from double to float */
1959: CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1960: LOAD_PS(tmps,XMM7);
1962: while (nz--) {
1963: PREFETCH_NTA(&v[16]);
1964: idx = 4*(*vi++);
1966: /* Demote solution (so far) from double to float */
1967: CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
1969: /* 4x4 Matrix-Vector Product with negative accumulation: */
1970: SSE_INLINE_BEGIN_2(tmpx,v)
1971: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1973: /* First Column */
1974: SSE_COPY_PS(XMM0,XMM6)
1975: SSE_SHUFFLE(XMM0,XMM0,0x00)
1976: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1977: SSE_SUB_PS(XMM7,XMM0)
1979: /* Second Column */
1980: SSE_COPY_PS(XMM1,XMM6)
1981: SSE_SHUFFLE(XMM1,XMM1,0x55)
1982: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1983: SSE_SUB_PS(XMM7,XMM1)
1985: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1986:
1987: /* Third Column */
1988: SSE_COPY_PS(XMM2,XMM6)
1989: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1990: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1991: SSE_SUB_PS(XMM7,XMM2)
1993: /* Fourth Column */
1994: SSE_COPY_PS(XMM3,XMM6)
1995: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1996: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1997: SSE_SUB_PS(XMM7,XMM3)
1998: SSE_INLINE_END_2
1999: v += 16;
2000: }
2001: v = aa + ai16;
2002: ai16 = 16*diag[--i];
2003: PREFETCH_NTA(aa+ai16+16);
2004: /*
2005: Scale the result by the diagonal 4x4 block,
2006: which was inverted as part of the factorization
2007: */
2008: SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2009: /* First Column */
2010: SSE_COPY_PS(XMM0,XMM7)
2011: SSE_SHUFFLE(XMM0,XMM0,0x00)
2012: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2014: /* Second Column */
2015: SSE_COPY_PS(XMM1,XMM7)
2016: SSE_SHUFFLE(XMM1,XMM1,0x55)
2017: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2018: SSE_ADD_PS(XMM0,XMM1)
2020: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2021:
2022: /* Third Column */
2023: SSE_COPY_PS(XMM2,XMM7)
2024: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2025: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2026: SSE_ADD_PS(XMM0,XMM2)
2028: /* Fourth Column */
2029: SSE_COPY_PS(XMM3,XMM7)
2030: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2031: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2032: SSE_ADD_PS(XMM0,XMM3)
2033:
2034: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2035: SSE_INLINE_END_3
2037: /* Promote solution from float to double */
2038: CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
2040: /* Apply reordering to t and stream into x. */
2041: /* This way, x doesn't pollute the cache. */
2042: /* Be careful with size: 2 doubles = 4 floats! */
2043: idc = 4*(*c--);
2044: SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2045: /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */
2046: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2047: SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2048: /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2049: SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2050: SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2051: SSE_INLINE_END_2
2052: v = aa + ai16 + 16;
2053: idt -= 4;
2054: }
2056: ISRestoreIndices(isrow,&rout);
2057: ISRestoreIndices(iscol,&cout);
2058: VecRestoreArray(bb,&b);
2059: VecRestoreArray(xx,&x);
2060: PetscLogFlops(2*16*(a->nz) - 4*A->n);
2061: SSE_SCOPE_END;
2062: return(0);
2063: }
2065: #endif
2068: /*
2069: Special case where the matrix was ILU(0) factored in the natural
2070: ordering. This eliminates the need for the column and row permutation.
2071: */
2074: int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2075: {
2076: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2077: int n=a->mbs,*ai=a->i,*aj=a->j;
2078: int ierr,*diag = a->diag;
2079: MatScalar *aa=a->a;
2080: PetscScalar *x,*b;
2083: VecGetArray(bb,&b);
2084: VecGetArray(xx,&x);
2086: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2087: {
2088: static PetscScalar w[2000]; /* very BAD need to fix */
2089: fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2090: }
2091: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2092: {
2093: static PetscScalar w[2000]; /* very BAD need to fix */
2094: fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2095: }
2096: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2097: fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2098: #else
2099: {
2100: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
2101: MatScalar *v;
2102: int jdx,idt,idx,nz,*vi,i,ai16;
2104: /* forward solve the lower triangular */
2105: idx = 0;
2106: x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2107: for (i=1; i<n; i++) {
2108: v = aa + 16*ai[i];
2109: vi = aj + ai[i];
2110: nz = diag[i] - ai[i];
2111: idx += 4;
2112: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2113: while (nz--) {
2114: jdx = 4*(*vi++);
2115: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2116: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2117: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2118: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2119: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2120: v += 16;
2121: }
2122: x[idx] = s1;
2123: x[1+idx] = s2;
2124: x[2+idx] = s3;
2125: x[3+idx] = s4;
2126: }
2127: /* backward solve the upper triangular */
2128: idt = 4*(n-1);
2129: for (i=n-1; i>=0; i--){
2130: ai16 = 16*diag[i];
2131: v = aa + ai16 + 16;
2132: vi = aj + diag[i] + 1;
2133: nz = ai[i+1] - diag[i] - 1;
2134: s1 = x[idt]; s2 = x[1+idt];
2135: s3 = x[2+idt];s4 = x[3+idt];
2136: while (nz--) {
2137: idx = 4*(*vi++);
2138: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx];
2139: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2140: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2141: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2142: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2143: v += 16;
2144: }
2145: v = aa + ai16;
2146: x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
2147: x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;
2148: x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2149: x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2150: idt -= 4;
2151: }
2152: }
2153: #endif
2155: VecRestoreArray(bb,&b);
2156: VecRestoreArray(xx,&x);
2157: PetscLogFlops(2*16*(a->nz) - 4*A->n);
2158: return(0);
2159: }
2163: int MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2164: {
2165: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2166: int n=a->mbs,*ai=a->i,*aj=a->j;
2167: int ierr,*diag = a->diag;
2168: MatScalar *aa=a->a;
2169: PetscScalar *x,*b;
2172: VecGetArray(bb,&b);
2173: VecGetArray(xx,&x);
2175: {
2176: MatScalar s1,s2,s3,s4,x1,x2,x3,x4;
2177: MatScalar *v,*t=(MatScalar *)x;
2178: int jdx,idt,idx,nz,*vi,i,ai16;
2180: /* forward solve the lower triangular */
2181: idx = 0;
2182: t[0] = (MatScalar)b[0];
2183: t[1] = (MatScalar)b[1];
2184: t[2] = (MatScalar)b[2];
2185: t[3] = (MatScalar)b[3];
2186: for (i=1; i<n; i++) {
2187: v = aa + 16*ai[i];
2188: vi = aj + ai[i];
2189: nz = diag[i] - ai[i];
2190: idx += 4;
2191: s1 = (MatScalar)b[idx];
2192: s2 = (MatScalar)b[1+idx];
2193: s3 = (MatScalar)b[2+idx];
2194: s4 = (MatScalar)b[3+idx];
2195: while (nz--) {
2196: jdx = 4*(*vi++);
2197: x1 = t[jdx];
2198: x2 = t[1+jdx];
2199: x3 = t[2+jdx];
2200: x4 = t[3+jdx];
2201: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2202: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2203: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2204: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2205: v += 16;
2206: }
2207: t[idx] = s1;
2208: t[1+idx] = s2;
2209: t[2+idx] = s3;
2210: t[3+idx] = s4;
2211: }
2212: /* backward solve the upper triangular */
2213: idt = 4*(n-1);
2214: for (i=n-1; i>=0; i--){
2215: ai16 = 16*diag[i];
2216: v = aa + ai16 + 16;
2217: vi = aj + diag[i] + 1;
2218: nz = ai[i+1] - diag[i] - 1;
2219: s1 = t[idt];
2220: s2 = t[1+idt];
2221: s3 = t[2+idt];
2222: s4 = t[3+idt];
2223: while (nz--) {
2224: idx = 4*(*vi++);
2225: x1 = (MatScalar)x[idx];
2226: x2 = (MatScalar)x[1+idx];
2227: x3 = (MatScalar)x[2+idx];
2228: x4 = (MatScalar)x[3+idx];
2229: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2230: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2231: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2232: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2233: v += 16;
2234: }
2235: v = aa + ai16;
2236: x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4);
2237: x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4);
2238: x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2239: x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2240: idt -= 4;
2241: }
2242: }
2244: VecRestoreArray(bb,&b);
2245: VecRestoreArray(xx,&x);
2246: PetscLogFlops(2*16*(a->nz) - 4*A->n);
2247: return(0);
2248: }
2250: #if defined (PETSC_HAVE_SSE)
2252: #include PETSC_HAVE_SSE
2255: int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2256: {
2257: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2258: unsigned short *aj=(unsigned short *)a->j;
2259: int ierr,*ai=a->i,n=a->mbs,*diag = a->diag;
2260: MatScalar *aa=a->a;
2261: PetscScalar *x,*b;
2264: SSE_SCOPE_BEGIN;
2265: /*
2266: Note: This code currently uses demotion of double
2267: to float when performing the mixed-mode computation.
2268: This may not be numerically reasonable for all applications.
2269: */
2270: PREFETCH_NTA(aa+16*ai[1]);
2272: VecGetArray(bb,&b);
2273: VecGetArray(xx,&x);
2274: {
2275: /* x will first be computed in single precision then promoted inplace to double */
2276: MatScalar *v,*t=(MatScalar *)x;
2277: int nz,i,idt,ai16;
2278: unsigned int jdx,idx;
2279: unsigned short *vi;
2280: /* Forward solve the lower triangular factor. */
2282: /* First block is the identity. */
2283: idx = 0;
2284: CONVERT_DOUBLE4_FLOAT4(t,b);
2285: v = aa + 16*((unsigned int)ai[1]);
2287: for (i=1; i<n;) {
2288: PREFETCH_NTA(&v[8]);
2289: vi = aj + ai[i];
2290: nz = diag[i] - ai[i];
2291: idx += 4;
2293: /* Demote RHS from double to float. */
2294: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2295: LOAD_PS(&t[idx],XMM7);
2297: while (nz--) {
2298: PREFETCH_NTA(&v[16]);
2299: jdx = 4*((unsigned int)(*vi++));
2300:
2301: /* 4x4 Matrix-Vector product with negative accumulation: */
2302: SSE_INLINE_BEGIN_2(&t[jdx],v)
2303: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2305: /* First Column */
2306: SSE_COPY_PS(XMM0,XMM6)
2307: SSE_SHUFFLE(XMM0,XMM0,0x00)
2308: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2309: SSE_SUB_PS(XMM7,XMM0)
2311: /* Second Column */
2312: SSE_COPY_PS(XMM1,XMM6)
2313: SSE_SHUFFLE(XMM1,XMM1,0x55)
2314: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2315: SSE_SUB_PS(XMM7,XMM1)
2317: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2318:
2319: /* Third Column */
2320: SSE_COPY_PS(XMM2,XMM6)
2321: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2322: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2323: SSE_SUB_PS(XMM7,XMM2)
2325: /* Fourth Column */
2326: SSE_COPY_PS(XMM3,XMM6)
2327: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2328: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2329: SSE_SUB_PS(XMM7,XMM3)
2330: SSE_INLINE_END_2
2331:
2332: v += 16;
2333: }
2334: v = aa + 16*ai[++i];
2335: PREFETCH_NTA(v);
2336: STORE_PS(&t[idx],XMM7);
2337: }
2339: /* Backward solve the upper triangular factor.*/
2341: idt = 4*(n-1);
2342: ai16 = 16*diag[n-1];
2343: v = aa + ai16 + 16;
2344: for (i=n-1; i>=0;){
2345: PREFETCH_NTA(&v[8]);
2346: vi = aj + diag[i] + 1;
2347: nz = ai[i+1] - diag[i] - 1;
2348:
2349: LOAD_PS(&t[idt],XMM7);
2351: while (nz--) {
2352: PREFETCH_NTA(&v[16]);
2353: idx = 4*((unsigned int)(*vi++));
2355: /* 4x4 Matrix-Vector Product with negative accumulation: */
2356: SSE_INLINE_BEGIN_2(&t[idx],v)
2357: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2359: /* First Column */
2360: SSE_COPY_PS(XMM0,XMM6)
2361: SSE_SHUFFLE(XMM0,XMM0,0x00)
2362: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2363: SSE_SUB_PS(XMM7,XMM0)
2365: /* Second Column */
2366: SSE_COPY_PS(XMM1,XMM6)
2367: SSE_SHUFFLE(XMM1,XMM1,0x55)
2368: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2369: SSE_SUB_PS(XMM7,XMM1)
2371: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2372:
2373: /* Third Column */
2374: SSE_COPY_PS(XMM2,XMM6)
2375: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2376: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2377: SSE_SUB_PS(XMM7,XMM2)
2379: /* Fourth Column */
2380: SSE_COPY_PS(XMM3,XMM6)
2381: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2382: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2383: SSE_SUB_PS(XMM7,XMM3)
2384: SSE_INLINE_END_2
2385: v += 16;
2386: }
2387: v = aa + ai16;
2388: ai16 = 16*diag[--i];
2389: PREFETCH_NTA(aa+ai16+16);
2390: /*
2391: Scale the result by the diagonal 4x4 block,
2392: which was inverted as part of the factorization
2393: */
2394: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2395: /* First Column */
2396: SSE_COPY_PS(XMM0,XMM7)
2397: SSE_SHUFFLE(XMM0,XMM0,0x00)
2398: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2400: /* Second Column */
2401: SSE_COPY_PS(XMM1,XMM7)
2402: SSE_SHUFFLE(XMM1,XMM1,0x55)
2403: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2404: SSE_ADD_PS(XMM0,XMM1)
2406: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2407:
2408: /* Third Column */
2409: SSE_COPY_PS(XMM2,XMM7)
2410: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2411: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2412: SSE_ADD_PS(XMM0,XMM2)
2414: /* Fourth Column */
2415: SSE_COPY_PS(XMM3,XMM7)
2416: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2417: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2418: SSE_ADD_PS(XMM0,XMM3)
2420: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2421: SSE_INLINE_END_3
2423: v = aa + ai16 + 16;
2424: idt -= 4;
2425: }
2427: /* Convert t from single precision back to double precision (inplace)*/
2428: idt = 4*(n-1);
2429: for (i=n-1;i>=0;i--) {
2430: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2431: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2432: PetscScalar *xtemp=&x[idt];
2433: MatScalar *ttemp=&t[idt];
2434: xtemp[3] = (PetscScalar)ttemp[3];
2435: xtemp[2] = (PetscScalar)ttemp[2];
2436: xtemp[1] = (PetscScalar)ttemp[1];
2437: xtemp[0] = (PetscScalar)ttemp[0];
2438: idt -= 4;
2439: }
2441: } /* End of artificial scope. */
2442: VecRestoreArray(bb,&b);
2443: VecRestoreArray(xx,&x);
2444: PetscLogFlops(2*16*(a->nz) - 4*A->n);
2445: SSE_SCOPE_END;
2446: return(0);
2447: }
2451: int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2452: {
2453: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2454: int *aj=a->j;
2455: int ierr,*ai=a->i,n=a->mbs,*diag = a->diag;
2456: MatScalar *aa=a->a;
2457: PetscScalar *x,*b;
2460: SSE_SCOPE_BEGIN;
2461: /*
2462: Note: This code currently uses demotion of double
2463: to float when performing the mixed-mode computation.
2464: This may not be numerically reasonable for all applications.
2465: */
2466: PREFETCH_NTA(aa+16*ai[1]);
2468: VecGetArray(bb,&b);
2469: VecGetArray(xx,&x);
2470: {
2471: /* x will first be computed in single precision then promoted inplace to double */
2472: MatScalar *v,*t=(MatScalar *)x;
2473: int nz,i,idt,ai16;
2474: int jdx,idx;
2475: int *vi;
2476: /* Forward solve the lower triangular factor. */
2478: /* First block is the identity. */
2479: idx = 0;
2480: CONVERT_DOUBLE4_FLOAT4(t,b);
2481: v = aa + 16*ai[1];
2483: for (i=1; i<n;) {
2484: PREFETCH_NTA(&v[8]);
2485: vi = aj + ai[i];
2486: nz = diag[i] - ai[i];
2487: idx += 4;
2489: /* Demote RHS from double to float. */
2490: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2491: LOAD_PS(&t[idx],XMM7);
2493: while (nz--) {
2494: PREFETCH_NTA(&v[16]);
2495: jdx = 4*(*vi++);
2496: /* jdx = *vi++; */
2497:
2498: /* 4x4 Matrix-Vector product with negative accumulation: */
2499: SSE_INLINE_BEGIN_2(&t[jdx],v)
2500: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2502: /* First Column */
2503: SSE_COPY_PS(XMM0,XMM6)
2504: SSE_SHUFFLE(XMM0,XMM0,0x00)
2505: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2506: SSE_SUB_PS(XMM7,XMM0)
2508: /* Second Column */
2509: SSE_COPY_PS(XMM1,XMM6)
2510: SSE_SHUFFLE(XMM1,XMM1,0x55)
2511: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2512: SSE_SUB_PS(XMM7,XMM1)
2514: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2515:
2516: /* Third Column */
2517: SSE_COPY_PS(XMM2,XMM6)
2518: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2519: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2520: SSE_SUB_PS(XMM7,XMM2)
2522: /* Fourth Column */
2523: SSE_COPY_PS(XMM3,XMM6)
2524: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2525: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2526: SSE_SUB_PS(XMM7,XMM3)
2527: SSE_INLINE_END_2
2528:
2529: v += 16;
2530: }
2531: v = aa + 16*ai[++i];
2532: PREFETCH_NTA(v);
2533: STORE_PS(&t[idx],XMM7);
2534: }
2536: /* Backward solve the upper triangular factor.*/
2538: idt = 4*(n-1);
2539: ai16 = 16*diag[n-1];
2540: v = aa + ai16 + 16;
2541: for (i=n-1; i>=0;){
2542: PREFETCH_NTA(&v[8]);
2543: vi = aj + diag[i] + 1;
2544: nz = ai[i+1] - diag[i] - 1;
2545:
2546: LOAD_PS(&t[idt],XMM7);
2548: while (nz--) {
2549: PREFETCH_NTA(&v[16]);
2550: idx = 4*(*vi++);
2551: /* idx = *vi++; */
2553: /* 4x4 Matrix-Vector Product with negative accumulation: */
2554: SSE_INLINE_BEGIN_2(&t[idx],v)
2555: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2557: /* First Column */
2558: SSE_COPY_PS(XMM0,XMM6)
2559: SSE_SHUFFLE(XMM0,XMM0,0x00)
2560: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2561: SSE_SUB_PS(XMM7,XMM0)
2563: /* Second Column */
2564: SSE_COPY_PS(XMM1,XMM6)
2565: SSE_SHUFFLE(XMM1,XMM1,0x55)
2566: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2567: SSE_SUB_PS(XMM7,XMM1)
2569: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2570:
2571: /* Third Column */
2572: SSE_COPY_PS(XMM2,XMM6)
2573: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2574: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2575: SSE_SUB_PS(XMM7,XMM2)
2577: /* Fourth Column */
2578: SSE_COPY_PS(XMM3,XMM6)
2579: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2580: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2581: SSE_SUB_PS(XMM7,XMM3)
2582: SSE_INLINE_END_2
2583: v += 16;
2584: }
2585: v = aa + ai16;
2586: ai16 = 16*diag[--i];
2587: PREFETCH_NTA(aa+ai16+16);
2588: /*
2589: Scale the result by the diagonal 4x4 block,
2590: which was inverted as part of the factorization
2591: */
2592: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2593: /* First Column */
2594: SSE_COPY_PS(XMM0,XMM7)
2595: SSE_SHUFFLE(XMM0,XMM0,0x00)
2596: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2598: /* Second Column */
2599: SSE_COPY_PS(XMM1,XMM7)
2600: SSE_SHUFFLE(XMM1,XMM1,0x55)
2601: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2602: SSE_ADD_PS(XMM0,XMM1)
2604: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2605:
2606: /* Third Column */
2607: SSE_COPY_PS(XMM2,XMM7)
2608: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2609: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2610: SSE_ADD_PS(XMM0,XMM2)
2612: /* Fourth Column */
2613: SSE_COPY_PS(XMM3,XMM7)
2614: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2615: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2616: SSE_ADD_PS(XMM0,XMM3)
2618: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2619: SSE_INLINE_END_3
2621: v = aa + ai16 + 16;
2622: idt -= 4;
2623: }
2625: /* Convert t from single precision back to double precision (inplace)*/
2626: idt = 4*(n-1);
2627: for (i=n-1;i>=0;i--) {
2628: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2629: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2630: PetscScalar *xtemp=&x[idt];
2631: MatScalar *ttemp=&t[idt];
2632: xtemp[3] = (PetscScalar)ttemp[3];
2633: xtemp[2] = (PetscScalar)ttemp[2];
2634: xtemp[1] = (PetscScalar)ttemp[1];
2635: xtemp[0] = (PetscScalar)ttemp[0];
2636: idt -= 4;
2637: }
2639: } /* End of artificial scope. */
2640: VecRestoreArray(bb,&b);
2641: VecRestoreArray(xx,&x);
2642: PetscLogFlops(2*16*(a->nz) - 4*A->n);
2643: SSE_SCOPE_END;
2644: return(0);
2645: }
2647: #endif
2650: int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2651: {
2652: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2653: IS iscol=a->col,isrow=a->row;
2654: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2655: int *diag = a->diag;
2656: MatScalar *aa=a->a,*v;
2657: PetscScalar *x,*b,s1,s2,s3,x1,x2,x3,*t;
2660: VecGetArray(bb,&b);
2661: VecGetArray(xx,&x);
2662: t = a->solve_work;
2664: ISGetIndices(isrow,&rout); r = rout;
2665: ISGetIndices(iscol,&cout); c = cout + (n-1);
2667: /* forward solve the lower triangular */
2668: idx = 3*(*r++);
2669: t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2670: for (i=1; i<n; i++) {
2671: v = aa + 9*ai[i];
2672: vi = aj + ai[i];
2673: nz = diag[i] - ai[i];
2674: idx = 3*(*r++);
2675: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2676: while (nz--) {
2677: idx = 3*(*vi++);
2678: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2679: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2680: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2681: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2682: v += 9;
2683: }
2684: idx = 3*i;
2685: t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2686: }
2687: /* backward solve the upper triangular */
2688: for (i=n-1; i>=0; i--){
2689: v = aa + 9*diag[i] + 9;
2690: vi = aj + diag[i] + 1;
2691: nz = ai[i+1] - diag[i] - 1;
2692: idt = 3*i;
2693: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2694: while (nz--) {
2695: idx = 3*(*vi++);
2696: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2697: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2698: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2699: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2700: v += 9;
2701: }
2702: idc = 3*(*c--);
2703: v = aa + 9*diag[i];
2704: x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
2705: x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2706: x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2707: }
2708: ISRestoreIndices(isrow,&rout);
2709: ISRestoreIndices(iscol,&cout);
2710: VecRestoreArray(bb,&b);
2711: VecRestoreArray(xx,&x);
2712: PetscLogFlops(2*9*(a->nz) - 3*A->n);
2713: return(0);
2714: }
2716: /*
2717: Special case where the matrix was ILU(0) factored in the natural
2718: ordering. This eliminates the need for the column and row permutation.
2719: */
2722: int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2723: {
2724: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2725: int n=a->mbs,*ai=a->i,*aj=a->j;
2726: int ierr,*diag = a->diag;
2727: MatScalar *aa=a->a,*v;
2728: PetscScalar *x,*b,s1,s2,s3,x1,x2,x3;
2729: int jdx,idt,idx,nz,*vi,i;
2732: VecGetArray(bb,&b);
2733: VecGetArray(xx,&x);
2736: /* forward solve the lower triangular */
2737: idx = 0;
2738: x[0] = b[0]; x[1] = b[1]; x[2] = b[2];
2739: for (i=1; i<n; i++) {
2740: v = aa + 9*ai[i];
2741: vi = aj + ai[i];
2742: nz = diag[i] - ai[i];
2743: idx += 3;
2744: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];
2745: while (nz--) {
2746: jdx = 3*(*vi++);
2747: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2748: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2749: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2750: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2751: v += 9;
2752: }
2753: x[idx] = s1;
2754: x[1+idx] = s2;
2755: x[2+idx] = s3;
2756: }
2757: /* backward solve the upper triangular */
2758: for (i=n-1; i>=0; i--){
2759: v = aa + 9*diag[i] + 9;
2760: vi = aj + diag[i] + 1;
2761: nz = ai[i+1] - diag[i] - 1;
2762: idt = 3*i;
2763: s1 = x[idt]; s2 = x[1+idt];
2764: s3 = x[2+idt];
2765: while (nz--) {
2766: idx = 3*(*vi++);
2767: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx];
2768: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2769: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2770: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2771: v += 9;
2772: }
2773: v = aa + 9*diag[i];
2774: x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
2775: x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2776: x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2777: }
2779: VecRestoreArray(bb,&b);
2780: VecRestoreArray(xx,&x);
2781: PetscLogFlops(2*9*(a->nz) - 3*A->n);
2782: return(0);
2783: }
2787: int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2788: {
2789: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2790: IS iscol=a->col,isrow=a->row;
2791: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2792: int *diag = a->diag;
2793: MatScalar *aa=a->a,*v;
2794: PetscScalar *x,*b,s1,s2,x1,x2,*t;
2797: VecGetArray(bb,&b);
2798: VecGetArray(xx,&x);
2799: t = a->solve_work;
2801: ISGetIndices(isrow,&rout); r = rout;
2802: ISGetIndices(iscol,&cout); c = cout + (n-1);
2804: /* forward solve the lower triangular */
2805: idx = 2*(*r++);
2806: t[0] = b[idx]; t[1] = b[1+idx];
2807: for (i=1; i<n; i++) {
2808: v = aa + 4*ai[i];
2809: vi = aj + ai[i];
2810: nz = diag[i] - ai[i];
2811: idx = 2*(*r++);
2812: s1 = b[idx]; s2 = b[1+idx];
2813: while (nz--) {
2814: idx = 2*(*vi++);
2815: x1 = t[idx]; x2 = t[1+idx];
2816: s1 -= v[0]*x1 + v[2]*x2;
2817: s2 -= v[1]*x1 + v[3]*x2;
2818: v += 4;
2819: }
2820: idx = 2*i;
2821: t[idx] = s1; t[1+idx] = s2;
2822: }
2823: /* backward solve the upper triangular */
2824: for (i=n-1; i>=0; i--){
2825: v = aa + 4*diag[i] + 4;
2826: vi = aj + diag[i] + 1;
2827: nz = ai[i+1] - diag[i] - 1;
2828: idt = 2*i;
2829: s1 = t[idt]; s2 = t[1+idt];
2830: while (nz--) {
2831: idx = 2*(*vi++);
2832: x1 = t[idx]; x2 = t[1+idx];
2833: s1 -= v[0]*x1 + v[2]*x2;
2834: s2 -= v[1]*x1 + v[3]*x2;
2835: v += 4;
2836: }
2837: idc = 2*(*c--);
2838: v = aa + 4*diag[i];
2839: x[idc] = t[idt] = v[0]*s1 + v[2]*s2;
2840: x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2841: }
2842: ISRestoreIndices(isrow,&rout);
2843: ISRestoreIndices(iscol,&cout);
2844: VecRestoreArray(bb,&b);
2845: VecRestoreArray(xx,&x);
2846: PetscLogFlops(2*4*(a->nz) - 2*A->n);
2847: return(0);
2848: }
2850: /*
2851: Special case where the matrix was ILU(0) factored in the natural
2852: ordering. This eliminates the need for the column and row permutation.
2853: */
2856: int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2857: {
2858: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2859: int n=a->mbs,*ai=a->i,*aj=a->j;
2860: int ierr,*diag = a->diag;
2861: MatScalar *aa=a->a,*v;
2862: PetscScalar *x,*b,s1,s2,x1,x2;
2863: int jdx,idt,idx,nz,*vi,i;
2866: VecGetArray(bb,&b);
2867: VecGetArray(xx,&x);
2869: /* forward solve the lower triangular */
2870: idx = 0;
2871: x[0] = b[0]; x[1] = b[1];
2872: for (i=1; i<n; i++) {
2873: v = aa + 4*ai[i];
2874: vi = aj + ai[i];
2875: nz = diag[i] - ai[i];
2876: idx += 2;
2877: s1 = b[idx];s2 = b[1+idx];
2878: while (nz--) {
2879: jdx = 2*(*vi++);
2880: x1 = x[jdx];x2 = x[1+jdx];
2881: s1 -= v[0]*x1 + v[2]*x2;
2882: s2 -= v[1]*x1 + v[3]*x2;
2883: v += 4;
2884: }
2885: x[idx] = s1;
2886: x[1+idx] = s2;
2887: }
2888: /* backward solve the upper triangular */
2889: for (i=n-1; i>=0; i--){
2890: v = aa + 4*diag[i] + 4;
2891: vi = aj + diag[i] + 1;
2892: nz = ai[i+1] - diag[i] - 1;
2893: idt = 2*i;
2894: s1 = x[idt]; s2 = x[1+idt];
2895: while (nz--) {
2896: idx = 2*(*vi++);
2897: x1 = x[idx]; x2 = x[1+idx];
2898: s1 -= v[0]*x1 + v[2]*x2;
2899: s2 -= v[1]*x1 + v[3]*x2;
2900: v += 4;
2901: }
2902: v = aa + 4*diag[i];
2903: x[idt] = v[0]*s1 + v[2]*s2;
2904: x[1+idt] = v[1]*s1 + v[3]*s2;
2905: }
2907: VecRestoreArray(bb,&b);
2908: VecRestoreArray(xx,&x);
2909: PetscLogFlops(2*4*(a->nz) - 2*A->n);
2910: return(0);
2911: }
2915: int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2916: {
2917: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2918: IS iscol=a->col,isrow=a->row;
2919: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2920: int *diag = a->diag;
2921: MatScalar *aa=a->a,*v;
2922: PetscScalar *x,*b,s1,*t;
2925: if (!n) return(0);
2927: VecGetArray(bb,&b);
2928: VecGetArray(xx,&x);
2929: t = a->solve_work;
2931: ISGetIndices(isrow,&rout); r = rout;
2932: ISGetIndices(iscol,&cout); c = cout + (n-1);
2934: /* forward solve the lower triangular */
2935: t[0] = b[*r++];
2936: for (i=1; i<n; i++) {
2937: v = aa + ai[i];
2938: vi = aj + ai[i];
2939: nz = diag[i] - ai[i];
2940: s1 = b[*r++];
2941: while (nz--) {
2942: s1 -= (*v++)*t[*vi++];
2943: }
2944: t[i] = s1;
2945: }
2946: /* backward solve the upper triangular */
2947: for (i=n-1; i>=0; i--){
2948: v = aa + diag[i] + 1;
2949: vi = aj + diag[i] + 1;
2950: nz = ai[i+1] - diag[i] - 1;
2951: s1 = t[i];
2952: while (nz--) {
2953: s1 -= (*v++)*t[*vi++];
2954: }
2955: x[*c--] = t[i] = aa[diag[i]]*s1;
2956: }
2958: ISRestoreIndices(isrow,&rout);
2959: ISRestoreIndices(iscol,&cout);
2960: VecRestoreArray(bb,&b);
2961: VecRestoreArray(xx,&x);
2962: PetscLogFlops(2*1*(a->nz) - A->n);
2963: return(0);
2964: }
2965: /*
2966: Special case where the matrix was ILU(0) factored in the natural
2967: ordering. This eliminates the need for the column and row permutation.
2968: */
2971: int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2972: {
2973: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2974: int n=a->mbs,*ai=a->i,*aj=a->j;
2975: int ierr,*diag = a->diag;
2976: MatScalar *aa=a->a;
2977: PetscScalar *x,*b;
2978: PetscScalar s1,x1;
2979: MatScalar *v;
2980: int jdx,idt,idx,nz,*vi,i;
2983: VecGetArray(bb,&b);
2984: VecGetArray(xx,&x);
2986: /* forward solve the lower triangular */
2987: idx = 0;
2988: x[0] = b[0];
2989: for (i=1; i<n; i++) {
2990: v = aa + ai[i];
2991: vi = aj + ai[i];
2992: nz = diag[i] - ai[i];
2993: idx += 1;
2994: s1 = b[idx];
2995: while (nz--) {
2996: jdx = *vi++;
2997: x1 = x[jdx];
2998: s1 -= v[0]*x1;
2999: v += 1;
3000: }
3001: x[idx] = s1;
3002: }
3003: /* backward solve the upper triangular */
3004: for (i=n-1; i>=0; i--){
3005: v = aa + diag[i] + 1;
3006: vi = aj + diag[i] + 1;
3007: nz = ai[i+1] - diag[i] - 1;
3008: idt = i;
3009: s1 = x[idt];
3010: while (nz--) {
3011: idx = *vi++;
3012: x1 = x[idx];
3013: s1 -= v[0]*x1;
3014: v += 1;
3015: }
3016: v = aa + diag[i];
3017: x[idt] = v[0]*s1;
3018: }
3019: VecRestoreArray(bb,&b);
3020: VecRestoreArray(xx,&x);
3021: PetscLogFlops(2*(a->nz) - A->n);
3022: return(0);
3023: }
3025: /* ----------------------------------------------------------------*/
3026: /*
3027: This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3028: except that the data structure of Mat_SeqAIJ is slightly different.
3029: Not a good example of code reuse.
3030: */
3031: EXTERN int MatMissingDiagonal_SeqBAIJ(Mat);
3035: int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact)
3036: {
3037: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
3038: IS isicol;
3039: int *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j;
3040: int *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
3041: int *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0;
3042: int incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
3043: PetscTruth col_identity,row_identity;
3044: PetscReal f;
3047: f = info->fill;
3048: levels = (int)info->levels;
3049: diagonal_fill = (int)info->diagonal_fill;
3050: ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
3051: ISIdentity(isrow,&row_identity);
3052: ISIdentity(iscol,&col_identity);
3054: if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */
3055: MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);
3056: (*fact)->factor = FACTOR_LU;
3057: b = (Mat_SeqBAIJ*)(*fact)->data;
3058: if (!b->diag) {
3059: MatMarkDiagonal_SeqBAIJ(*fact);
3060: }
3061: MatMissingDiagonal_SeqBAIJ(*fact);
3062: b->row = isrow;
3063: b->col = iscol;
3064: PetscObjectReference((PetscObject)isrow);
3065: PetscObjectReference((PetscObject)iscol);
3066: b->icol = isicol;
3067: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3068: PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);
3069: } else { /* general case perform the symbolic factorization */
3070: ISGetIndices(isrow,&r);
3071: ISGetIndices(isicol,&ic);
3073: /* get new row pointers */
3074: PetscMalloc((n+1)*sizeof(int),&ainew);
3075: ainew[0] = 0;
3076: /* don't know how many column pointers are needed so estimate */
3077: jmax = (int)(f*ai[n] + 1);
3078: PetscMalloc((jmax)*sizeof(int),&ajnew);
3079: /* ajfill is level of fill for each fill entry */
3080: PetscMalloc((jmax)*sizeof(int),&ajfill);
3081: /* fill is a linked list of nonzeros in active row */
3082: PetscMalloc((n+1)*sizeof(int),&fill);
3083: /* im is level for each filled value */
3084: PetscMalloc((n+1)*sizeof(int),&im);
3085: /* dloc is location of diagonal in factor */
3086: PetscMalloc((n+1)*sizeof(int),&dloc);
3087: dloc[0] = 0;
3088: for (prow=0; prow<n; prow++) {
3090: /* copy prow into linked list */
3091: nzf = nz = ai[r[prow]+1] - ai[r[prow]];
3092: if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
3093: xi = aj + ai[r[prow]];
3094: fill[n] = n;
3095: fill[prow] = -1; /* marker for diagonal entry */
3096: while (nz--) {
3097: fm = n;
3098: idx = ic[*xi++];
3099: do {
3100: m = fm;
3101: fm = fill[m];
3102: } while (fm < idx);
3103: fill[m] = idx;
3104: fill[idx] = fm;
3105: im[idx] = 0;
3106: }
3108: /* make sure diagonal entry is included */
3109: if (diagonal_fill && fill[prow] == -1) {
3110: fm = n;
3111: while (fill[fm] < prow) fm = fill[fm];
3112: fill[prow] = fill[fm]; /* insert diagonal into linked list */
3113: fill[fm] = prow;
3114: im[prow] = 0;
3115: nzf++;
3116: dcount++;
3117: }
3119: nzi = 0;
3120: row = fill[n];
3121: while (row < prow) {
3122: incrlev = im[row] + 1;
3123: nz = dloc[row];
3124: xi = ajnew + ainew[row] + nz + 1;
3125: flev = ajfill + ainew[row] + nz + 1;
3126: nnz = ainew[row+1] - ainew[row] - nz - 1;
3127: fm = row;
3128: while (nnz-- > 0) {
3129: idx = *xi++;
3130: if (*flev + incrlev > levels) {
3131: flev++;
3132: continue;
3133: }
3134: do {
3135: m = fm;
3136: fm = fill[m];
3137: } while (fm < idx);
3138: if (fm != idx) {
3139: im[idx] = *flev + incrlev;
3140: fill[m] = idx;
3141: fill[idx] = fm;
3142: fm = idx;
3143: nzf++;
3144: } else {
3145: if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3146: }
3147: flev++;
3148: }
3149: row = fill[row];
3150: nzi++;
3151: }
3152: /* copy new filled row into permanent storage */
3153: ainew[prow+1] = ainew[prow] + nzf;
3154: if (ainew[prow+1] > jmax) {
3156: /* estimate how much additional space we will need */
3157: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3158: /* just double the memory each time */
3159: int maxadd = jmax;
3160: /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3161: if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3162: jmax += maxadd;
3164: /* allocate a longer ajnew and ajfill */
3165: PetscMalloc(jmax*sizeof(int),&xi);
3166: PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));
3167: PetscFree(ajnew);
3168: ajnew = xi;
3169: PetscMalloc(jmax*sizeof(int),&xi);
3170: PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));
3171: PetscFree(ajfill);
3172: ajfill = xi;
3173: reallocate++; /* count how many reallocations are needed */
3174: }
3175: xi = ajnew + ainew[prow];
3176: flev = ajfill + ainew[prow];
3177: dloc[prow] = nzi;
3178: fm = fill[n];
3179: while (nzf--) {
3180: *xi++ = fm;
3181: *flev++ = im[fm];
3182: fm = fill[fm];
3183: }
3184: /* make sure row has diagonal entry */
3185: if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3186: SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\
3187: try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
3188: }
3189: }
3190: PetscFree(ajfill);
3191: ISRestoreIndices(isrow,&r);
3192: ISRestoreIndices(isicol,&ic);
3193: PetscFree(fill);
3194: PetscFree(im);
3196: {
3197: PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3198: PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",reallocate,f,af);
3199: PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
3200: PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
3201: PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
3202: if (diagonal_fill) {
3203: PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount);
3204: }
3205: }
3207: /* put together the new matrix */
3208: MatCreate(A->comm,bs*n,bs*n,bs*n,bs*n,fact);
3209: MatSetType(*fact,A->type_name);
3210: MatSeqBAIJSetPreallocation(*fact,bs,0,PETSC_NULL);
3211: PetscLogObjectParent(*fact,isicol);
3212: b = (Mat_SeqBAIJ*)(*fact)->data;
3213: PetscFree(b->imax);
3214: b->singlemalloc = PETSC_FALSE;
3215: len = bs2*ainew[n]*sizeof(MatScalar);
3216: /* the next line frees the default space generated by the Create() */
3217: PetscFree(b->a);
3218: PetscFree(b->ilen);
3219: PetscMalloc(len,&b->a);
3220: b->j = ajnew;
3221: b->i = ainew;
3222: for (i=0; i<n; i++) dloc[i] += ainew[i];
3223: b->diag = dloc;
3224: b->ilen = 0;
3225: b->imax = 0;
3226: b->row = isrow;
3227: b->col = iscol;
3228: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3229: PetscObjectReference((PetscObject)isrow);
3230: PetscObjectReference((PetscObject)iscol);
3231: b->icol = isicol;
3232: PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
3233: /* In b structure: Free imax, ilen, old a, old j.
3234: Allocate dloc, solve_work, new a, new j */
3235: PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar));
3236: b->maxnz = b->nz = ainew[n];
3237: (*fact)->factor = FACTOR_LU;
3239: (*fact)->info.factor_mallocs = reallocate;
3240: (*fact)->info.fill_ratio_given = f;
3241: (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3242: }
3244: if (row_identity && col_identity) {
3245: MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);
3246: }
3247: return(0);
3248: }
3252: int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3253: {
3254: /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3255: /* int i,*AJ=a->j,nz=a->nz; */
3257: /* Undo Column scaling */
3258: /* while (nz--) { */
3259: /* AJ[i] = AJ[i]/4; */
3260: /* } */
3261: /* This should really invoke a push/pop logic, but we don't have that yet. */
3262: A->ops->setunfactored = PETSC_NULL;
3263: return(0);
3264: }
3268: int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3269: {
3270: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3271: int *AJ=a->j,nz=a->nz;
3272: unsigned short *aj=(unsigned short *)AJ;
3274: /* Is this really necessary? */
3275: while (nz--) {
3276: AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3277: }
3278: A->ops->setunfactored = PETSC_NULL;
3279: return(0);
3280: }
3284: int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
3285: {
3286: /*
3287: Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3288: with natural ordering
3289: */
3290: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
3293: inA->ops->solve = MatSolve_SeqBAIJ_Update;
3294: inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_Update;
3295: switch (a->bs) {
3296: case 1:
3297: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
3298: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n");
3299: break;
3300: case 2:
3301: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
3302: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n");
3303: break;
3304: case 3:
3305: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
3306: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n");
3307: break;
3308: case 4:
3309: #if defined(PETSC_USE_MAT_SINGLE)
3310: {
3311: PetscTruth sse_enabled_local;
3312: int ierr;
3313: PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);
3314: if (sse_enabled_local) {
3315: # if defined(PETSC_HAVE_SSE)
3316: int i,*AJ=a->j,nz=a->nz,n=a->mbs;
3317: if (n==(unsigned short)n) {
3318: unsigned short *aj=(unsigned short *)AJ;
3319: for (i=0;i<nz;i++) {
3320: aj[i] = (unsigned short)AJ[i];
3321: }
3322: inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3323: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3324: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n");
3325: } else {
3326: /* Scale the column indices for easier indexing in MatSolve. */
3327: /* for (i=0;i<nz;i++) { */
3328: /* AJ[i] = AJ[i]*4; */
3329: /* } */
3330: inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE;
3331: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
3332: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n");
3333: }
3334: # else
3335: /* This should never be reached. If so, problem in PetscSSEIsEnabled. */
3336: SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3337: # endif
3338: } else {
3339: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3340: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3341: }
3342: }
3343: #else
3344: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3345: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3346: #endif
3347: break;
3348: case 5:
3349: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
3350: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n");
3351: break;
3352: case 6:
3353: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
3354: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n");
3355: break;
3356: case 7:
3357: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
3358: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n");
3359: break;
3360: }
3361: return(0);
3362: }
3366: int MatSeqBAIJ_UpdateSolvers(Mat A)
3367: {
3368: /*
3369: Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3370: with natural ordering
3371: */
3372: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3373: IS row = a->row, col = a->col;
3374: PetscTruth row_identity, col_identity;
3375: PetscTruth use_natural;
3376: int ierr;
3380: use_natural = PETSC_FALSE;
3381: if (row && col) {
3382: ISIdentity(row,&row_identity);
3383: ISIdentity(col,&col_identity);
3385: if (row_identity && col_identity) {
3386: use_natural = PETSC_TRUE;
3387: }
3388: } else {
3389: use_natural = PETSC_TRUE;
3390: }
3392: switch (a->bs) {
3393: case 1:
3394: if (use_natural) {
3395: A->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering;
3396: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
3397: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n");
3398: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3399: } else {
3400: A->ops->solve = MatSolve_SeqBAIJ_1;
3401: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
3402: }
3403: break;
3404: case 2:
3405: if (use_natural) {
3406: A->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering;
3407: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
3408: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n");
3409: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3410: } else {
3411: A->ops->solve = MatSolve_SeqBAIJ_2;
3412: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
3413: }
3414: break;
3415: case 3:
3416: if (use_natural) {
3417: A->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering;
3418: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
3419: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n");
3420: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3421: } else {
3422: A->ops->solve = MatSolve_SeqBAIJ_3;
3423: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3;
3424: }
3425: break;
3426: case 4:
3427: {
3428: PetscTruth sse_enabled_local;
3429: PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);
3430: if (use_natural) {
3431: #if defined(PETSC_USE_MAT_SINGLE)
3432: if (sse_enabled_local) { /* Natural + Single + SSE */
3433: # if defined(PETSC_HAVE_SSE)
3434: int n=a->mbs;
3435: if (n==(unsigned short)n) {
3436: A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj;
3437: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n");
3438: } else {
3439: A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
3440: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n");
3441: }
3442: # else
3443: /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3444: SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3445: # endif
3446: } else { /* Natural + Single */
3447: A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion;
3448: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n");
3449: }
3450: #else
3451: A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
3452: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3453: #endif
3454: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3455: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3456: } else { /* Arbitrary ordering */
3457: #if defined(PETSC_USE_MAT_SINGLE)
3458: if (sse_enabled_local) { /* Arbitrary + Single + SSE */
3459: # if defined(PETSC_HAVE_SSE)
3460: A->ops->solve = MatSolve_SeqBAIJ_4_SSE_Demotion;
3461: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n");
3462: # else
3463: /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3464: SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3465: # endif
3466: } else { /* Arbitrary + Single */
3467: A->ops->solve = MatSolve_SeqBAIJ_4_Demotion;
3468: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n");
3469: }
3470: #else
3471: A->ops->solve = MatSolve_SeqBAIJ_4;
3472: #endif
3473: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
3474: }
3475: }
3476: break;
3477: case 5:
3478: if (use_natural) {
3479: A->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering;
3480: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3481: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3482: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3483: } else {
3484: A->ops->solve = MatSolve_SeqBAIJ_5;
3485: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5;
3486: }
3487: break;
3488: case 6:
3489: if (use_natural) {
3490: A->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering;
3491: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3492: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3493: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3494: } else {
3495: A->ops->solve = MatSolve_SeqBAIJ_6;
3496: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6;
3497: }
3498: break;
3499: case 7:
3500: if (use_natural) {
3501: A->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering;
3502: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3503: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3504: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3505: } else {
3506: A->ops->solve = MatSolve_SeqBAIJ_7;
3507: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7;
3508: }
3509: break;
3510: default:
3511: A->ops->solve = MatSolve_SeqBAIJ_N;
3512: break;
3513: }
3514: return(0);
3515: }
3519: int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3523: MatSeqBAIJ_UpdateSolvers(A);
3524: if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3525: (*A->ops->solve)(A,x,y);
3526: } else {
3527: SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3528: }
3529: return(0);
3530: }
3534: int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3538: MatSeqBAIJ_UpdateSolvers(A);
3539: (*A->ops->solvetranspose)(A,x,y);
3540: return(0);
3541: }