Actual source code: baijfact2.c

  1: /*$Id: baijfact2.c,v 1.72 2001/09/11 16:32:33 bsmith Exp $*/
  2: /*
  3:     Factorization code for BAIJ format. 
  4: */

 6:  #include src/mat/impls/baij/seq/baij.h
 7:  #include src/inline/ilu.h
 8:  #include src/inline/dot.h

 12: int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
 13: {
 14:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
 15:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
 16:   int             *diag = a->diag;
 17:   MatScalar       *aa=a->a,*v;
 18:   PetscScalar     s1,*x,*b;

 21:   VecCopy(bb,xx);
 22:   VecGetArray(bb,&b);
 23:   VecGetArray(xx,&x);
 24: 
 25:   /* forward solve the U^T */
 26:   for (i=0; i<n; i++) {

 28:     v     = aa + diag[i];
 29:     /* multiply by the inverse of the block diagonal */
 30:     s1    = (*v++)*x[i];
 31:     vi    = aj + diag[i] + 1;
 32:     nz    = ai[i+1] - diag[i] - 1;
 33:     while (nz--) {
 34:       x[*vi++]  -= (*v++)*s1;
 35:     }
 36:     x[i]   = s1;
 37:   }
 38:   /* backward solve the L^T */
 39:   for (i=n-1; i>=0; i--){
 40:     v    = aa + diag[i] - 1;
 41:     vi   = aj + diag[i] - 1;
 42:     nz   = diag[i] - ai[i];
 43:     s1   = x[i];
 44:     while (nz--) {
 45:       x[*vi--]   -=  (*v--)*s1;
 46:     }
 47:   }
 48:   VecRestoreArray(bb,&b);
 49:   VecRestoreArray(xx,&x);
 50:   PetscLogFlops(2*(a->nz) - A->n);
 51:   return(0);
 52: }

 56: int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
 57: {
 58:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
 59:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
 60:   int             *diag = a->diag,oidx;
 61:   MatScalar       *aa=a->a,*v;
 62:   PetscScalar     s1,s2,x1,x2;
 63:   PetscScalar     *x,*b;

 66:   VecCopy(bb,xx);
 67:   VecGetArray(bb,&b);
 68:   VecGetArray(xx,&x);

 70:   /* forward solve the U^T */
 71:   idx = 0;
 72:   for (i=0; i<n; i++) {

 74:     v     = aa + 4*diag[i];
 75:     /* multiply by the inverse of the block diagonal */
 76:     x1 = x[idx];   x2 = x[1+idx];
 77:     s1 = v[0]*x1  +  v[1]*x2;
 78:     s2 = v[2]*x1  +  v[3]*x2;
 79:     v += 4;

 81:     vi    = aj + diag[i] + 1;
 82:     nz    = ai[i+1] - diag[i] - 1;
 83:     while (nz--) {
 84:       oidx = 2*(*vi++);
 85:       x[oidx]   -= v[0]*s1  +  v[1]*s2;
 86:       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
 87:       v  += 4;
 88:     }
 89:     x[idx]   = s1;x[1+idx] = s2;
 90:     idx += 2;
 91:   }
 92:   /* backward solve the L^T */
 93:   for (i=n-1; i>=0; i--){
 94:     v    = aa + 4*diag[i] - 4;
 95:     vi   = aj + diag[i] - 1;
 96:     nz   = diag[i] - ai[i];
 97:     idt  = 2*i;
 98:     s1   = x[idt];  s2 = x[1+idt];
 99:     while (nz--) {
100:       idx   = 2*(*vi--);
101:       x[idx]   -=  v[0]*s1 +  v[1]*s2;
102:       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
103:       v -= 4;
104:     }
105:   }
106:   VecRestoreArray(bb,&b);
107:   VecRestoreArray(xx,&x);
108:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
109:   return(0);
110: }

114: int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
115: {
116:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
117:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
118:   int             *diag = a->diag,oidx;
119:   MatScalar       *aa=a->a,*v;
120:   PetscScalar     s1,s2,s3,x1,x2,x3;
121:   PetscScalar     *x,*b;

124:   VecCopy(bb,xx);
125:   VecGetArray(bb,&b);
126:   VecGetArray(xx,&x);

128:   /* forward solve the U^T */
129:   idx = 0;
130:   for (i=0; i<n; i++) {

132:     v     = aa + 9*diag[i];
133:     /* multiply by the inverse of the block diagonal */
134:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
135:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
136:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
137:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
138:     v += 9;

140:     vi    = aj + diag[i] + 1;
141:     nz    = ai[i+1] - diag[i] - 1;
142:     while (nz--) {
143:       oidx = 3*(*vi++);
144:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
145:       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
146:       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
147:       v  += 9;
148:     }
149:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
150:     idx += 3;
151:   }
152:   /* backward solve the L^T */
153:   for (i=n-1; i>=0; i--){
154:     v    = aa + 9*diag[i] - 9;
155:     vi   = aj + diag[i] - 1;
156:     nz   = diag[i] - ai[i];
157:     idt  = 3*i;
158:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
159:     while (nz--) {
160:       idx   = 3*(*vi--);
161:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
162:       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
163:       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
164:       v -= 9;
165:     }
166:   }
167:   VecRestoreArray(bb,&b);
168:   VecRestoreArray(xx,&x);
169:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
170:   return(0);
171: }

175: int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
176: {
177:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
178:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
179:   int             *diag = a->diag,oidx;
180:   MatScalar       *aa=a->a,*v;
181:   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
182:   PetscScalar     *x,*b;

185:   VecCopy(bb,xx);
186:   VecGetArray(bb,&b);
187:   VecGetArray(xx,&x);

189:   /* forward solve the U^T */
190:   idx = 0;
191:   for (i=0; i<n; i++) {

193:     v     = aa + 16*diag[i];
194:     /* multiply by the inverse of the block diagonal */
195:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
196:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
197:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
198:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
199:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
200:     v += 16;

202:     vi    = aj + diag[i] + 1;
203:     nz    = ai[i+1] - diag[i] - 1;
204:     while (nz--) {
205:       oidx = 4*(*vi++);
206:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
207:       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
208:       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
209:       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
210:       v  += 16;
211:     }
212:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
213:     idx += 4;
214:   }
215:   /* backward solve the L^T */
216:   for (i=n-1; i>=0; i--){
217:     v    = aa + 16*diag[i] - 16;
218:     vi   = aj + diag[i] - 1;
219:     nz   = diag[i] - ai[i];
220:     idt  = 4*i;
221:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
222:     while (nz--) {
223:       idx   = 4*(*vi--);
224:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
225:       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
226:       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
227:       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
228:       v -= 16;
229:     }
230:   }
231:   VecRestoreArray(bb,&b);
232:   VecRestoreArray(xx,&x);
233:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
234:   return(0);
235: }

239: int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
240: {
241:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
242:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
243:   int             *diag = a->diag,oidx;
244:   MatScalar       *aa=a->a,*v;
245:   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
246:   PetscScalar     *x,*b;

249:   VecCopy(bb,xx);
250:   VecGetArray(bb,&b);
251:   VecGetArray(xx,&x);

253:   /* forward solve the U^T */
254:   idx = 0;
255:   for (i=0; i<n; i++) {

257:     v     = aa + 25*diag[i];
258:     /* multiply by the inverse of the block diagonal */
259:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
260:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
261:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
262:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
263:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
264:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
265:     v += 25;

267:     vi    = aj + diag[i] + 1;
268:     nz    = ai[i+1] - diag[i] - 1;
269:     while (nz--) {
270:       oidx = 5*(*vi++);
271:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
272:       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
273:       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
274:       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
275:       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
276:       v  += 25;
277:     }
278:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
279:     idx += 5;
280:   }
281:   /* backward solve the L^T */
282:   for (i=n-1; i>=0; i--){
283:     v    = aa + 25*diag[i] - 25;
284:     vi   = aj + diag[i] - 1;
285:     nz   = diag[i] - ai[i];
286:     idt  = 5*i;
287:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
288:     while (nz--) {
289:       idx   = 5*(*vi--);
290:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
291:       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
292:       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
293:       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
294:       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
295:       v -= 25;
296:     }
297:   }
298:   VecRestoreArray(bb,&b);
299:   VecRestoreArray(xx,&x);
300:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
301:   return(0);
302: }

306: int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
307: {
308:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
309:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
310:   int             *diag = a->diag,oidx;
311:   MatScalar       *aa=a->a,*v;
312:   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
313:   PetscScalar     *x,*b;

316:   VecCopy(bb,xx);
317:   VecGetArray(bb,&b);
318:   VecGetArray(xx,&x);

320:   /* forward solve the U^T */
321:   idx = 0;
322:   for (i=0; i<n; i++) {

324:     v     = aa + 36*diag[i];
325:     /* multiply by the inverse of the block diagonal */
326:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
327:     x6    = x[5+idx];
328:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
329:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
330:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
331:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
332:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
333:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
334:     v += 36;

336:     vi    = aj + diag[i] + 1;
337:     nz    = ai[i+1] - diag[i] - 1;
338:     while (nz--) {
339:       oidx = 6*(*vi++);
340:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
341:       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
342:       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
343:       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
344:       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
345:       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
346:       v  += 36;
347:     }
348:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
349:     x[5+idx] = s6;
350:     idx += 6;
351:   }
352:   /* backward solve the L^T */
353:   for (i=n-1; i>=0; i--){
354:     v    = aa + 36*diag[i] - 36;
355:     vi   = aj + diag[i] - 1;
356:     nz   = diag[i] - ai[i];
357:     idt  = 6*i;
358:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
359:     s6 = x[5+idt];
360:     while (nz--) {
361:       idx   = 6*(*vi--);
362:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
363:       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
364:       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
365:       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
366:       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
367:       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
368:       v -= 36;
369:     }
370:   }
371:   VecRestoreArray(bb,&b);
372:   VecRestoreArray(xx,&x);
373:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
374:   return(0);
375: }

379: int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
380: {
381:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
382:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
383:   int             *diag = a->diag,oidx;
384:   MatScalar       *aa=a->a,*v;
385:   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
386:   PetscScalar     *x,*b;

389:   VecCopy(bb,xx);
390:   VecGetArray(bb,&b);
391:   VecGetArray(xx,&x);

393:   /* forward solve the U^T */
394:   idx = 0;
395:   for (i=0; i<n; i++) {

397:     v     = aa + 49*diag[i];
398:     /* multiply by the inverse of the block diagonal */
399:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
400:     x6    = x[5+idx]; x7 = x[6+idx];
401:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
402:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
403:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
404:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
405:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
406:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
407:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
408:     v += 49;

410:     vi    = aj + diag[i] + 1;
411:     nz    = ai[i+1] - diag[i] - 1;
412:     while (nz--) {
413:       oidx = 7*(*vi++);
414:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
415:       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
416:       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
417:       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
418:       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
419:       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
420:       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
421:       v  += 49;
422:     }
423:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
424:     x[5+idx] = s6;x[6+idx] = s7;
425:     idx += 7;
426:   }
427:   /* backward solve the L^T */
428:   for (i=n-1; i>=0; i--){
429:     v    = aa + 49*diag[i] - 49;
430:     vi   = aj + diag[i] - 1;
431:     nz   = diag[i] - ai[i];
432:     idt  = 7*i;
433:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
434:     s6 = x[5+idt];s7 = x[6+idt];
435:     while (nz--) {
436:       idx   = 7*(*vi--);
437:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
438:       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
439:       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
440:       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
441:       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
442:       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
443:       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
444:       v -= 49;
445:     }
446:   }
447:   VecRestoreArray(bb,&b);
448:   VecRestoreArray(xx,&x);
449:   PetscLogFlops(2*49*(a->nz) - 7*A->n);
450:   return(0);
451: }

453: /*---------------------------------------------------------------------------------------------*/
456: int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
457: {
458:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
459:   IS              iscol=a->col,isrow=a->row;
460:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
461:   int             *diag = a->diag;
462:   MatScalar       *aa=a->a,*v;
463:   PetscScalar     s1,*x,*b,*t;

466:   VecGetArray(bb,&b);
467:   VecGetArray(xx,&x);
468:   t  = a->solve_work;

470:   ISGetIndices(isrow,&rout); r = rout;
471:   ISGetIndices(iscol,&cout); c = cout;

473:   /* copy the b into temp work space according to permutation */
474:   for (i=0; i<n; i++) {
475:     t[i] = b[c[i]];
476:   }

478:   /* forward solve the U^T */
479:   for (i=0; i<n; i++) {

481:     v     = aa + diag[i];
482:     /* multiply by the inverse of the block diagonal */
483:     s1    = (*v++)*t[i];
484:     vi    = aj + diag[i] + 1;
485:     nz    = ai[i+1] - diag[i] - 1;
486:     while (nz--) {
487:       t[*vi++]  -= (*v++)*s1;
488:     }
489:     t[i]   = s1;
490:   }
491:   /* backward solve the L^T */
492:   for (i=n-1; i>=0; i--){
493:     v    = aa + diag[i] - 1;
494:     vi   = aj + diag[i] - 1;
495:     nz   = diag[i] - ai[i];
496:     s1   = t[i];
497:     while (nz--) {
498:       t[*vi--]   -=  (*v--)*s1;
499:     }
500:   }

502:   /* copy t into x according to permutation */
503:   for (i=0; i<n; i++) {
504:     x[r[i]]   = t[i];
505:   }

507:   ISRestoreIndices(isrow,&rout);
508:   ISRestoreIndices(iscol,&cout);
509:   VecRestoreArray(bb,&b);
510:   VecRestoreArray(xx,&x);
511:   PetscLogFlops(2*(a->nz) - A->n);
512:   return(0);
513: }

517: int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
518: {
519:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
520:   IS              iscol=a->col,isrow=a->row;
521:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
522:   int             *diag = a->diag,ii,ic,ir,oidx;
523:   MatScalar       *aa=a->a,*v;
524:   PetscScalar     s1,s2,x1,x2;
525:   PetscScalar     *x,*b,*t;

528:   VecGetArray(bb,&b);
529:   VecGetArray(xx,&x);
530:   t  = a->solve_work;

532:   ISGetIndices(isrow,&rout); r = rout;
533:   ISGetIndices(iscol,&cout); c = cout;

535:   /* copy the b into temp work space according to permutation */
536:   ii = 0;
537:   for (i=0; i<n; i++) {
538:     ic      = 2*c[i];
539:     t[ii]   = b[ic];
540:     t[ii+1] = b[ic+1];
541:     ii += 2;
542:   }

544:   /* forward solve the U^T */
545:   idx = 0;
546:   for (i=0; i<n; i++) {

548:     v     = aa + 4*diag[i];
549:     /* multiply by the inverse of the block diagonal */
550:     x1    = t[idx];   x2 = t[1+idx];
551:     s1 = v[0]*x1  +  v[1]*x2;
552:     s2 = v[2]*x1  +  v[3]*x2;
553:     v += 4;

555:     vi    = aj + diag[i] + 1;
556:     nz    = ai[i+1] - diag[i] - 1;
557:     while (nz--) {
558:       oidx = 2*(*vi++);
559:       t[oidx]   -= v[0]*s1  +  v[1]*s2;
560:       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
561:       v  += 4;
562:     }
563:     t[idx]   = s1;t[1+idx] = s2;
564:     idx += 2;
565:   }
566:   /* backward solve the L^T */
567:   for (i=n-1; i>=0; i--){
568:     v    = aa + 4*diag[i] - 4;
569:     vi   = aj + diag[i] - 1;
570:     nz   = diag[i] - ai[i];
571:     idt  = 2*i;
572:     s1 = t[idt];  s2 = t[1+idt];
573:     while (nz--) {
574:       idx   = 2*(*vi--);
575:       t[idx]   -=  v[0]*s1 +  v[1]*s2;
576:       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
577:       v -= 4;
578:     }
579:   }

581:   /* copy t into x according to permutation */
582:   ii = 0;
583:   for (i=0; i<n; i++) {
584:     ir      = 2*r[i];
585:     x[ir]   = t[ii];
586:     x[ir+1] = t[ii+1];
587:     ii += 2;
588:   }

590:   ISRestoreIndices(isrow,&rout);
591:   ISRestoreIndices(iscol,&cout);
592:   VecRestoreArray(bb,&b);
593:   VecRestoreArray(xx,&x);
594:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
595:   return(0);
596: }

600: int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
601: {
602:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
603:   IS              iscol=a->col,isrow=a->row;
604:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
605:   int             *diag = a->diag,ii,ic,ir,oidx;
606:   MatScalar       *aa=a->a,*v;
607:   PetscScalar     s1,s2,s3,x1,x2,x3;
608:   PetscScalar     *x,*b,*t;

611:   VecGetArray(bb,&b);
612:   VecGetArray(xx,&x);
613:   t  = a->solve_work;

615:   ISGetIndices(isrow,&rout); r = rout;
616:   ISGetIndices(iscol,&cout); c = cout;

618:   /* copy the b into temp work space according to permutation */
619:   ii = 0;
620:   for (i=0; i<n; i++) {
621:     ic      = 3*c[i];
622:     t[ii]   = b[ic];
623:     t[ii+1] = b[ic+1];
624:     t[ii+2] = b[ic+2];
625:     ii += 3;
626:   }

628:   /* forward solve the U^T */
629:   idx = 0;
630:   for (i=0; i<n; i++) {

632:     v     = aa + 9*diag[i];
633:     /* multiply by the inverse of the block diagonal */
634:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
635:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
636:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
637:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
638:     v += 9;

640:     vi    = aj + diag[i] + 1;
641:     nz    = ai[i+1] - diag[i] - 1;
642:     while (nz--) {
643:       oidx = 3*(*vi++);
644:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
645:       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
646:       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
647:       v  += 9;
648:     }
649:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
650:     idx += 3;
651:   }
652:   /* backward solve the L^T */
653:   for (i=n-1; i>=0; i--){
654:     v    = aa + 9*diag[i] - 9;
655:     vi   = aj + diag[i] - 1;
656:     nz   = diag[i] - ai[i];
657:     idt  = 3*i;
658:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
659:     while (nz--) {
660:       idx   = 3*(*vi--);
661:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
662:       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
663:       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
664:       v -= 9;
665:     }
666:   }

668:   /* copy t into x according to permutation */
669:   ii = 0;
670:   for (i=0; i<n; i++) {
671:     ir      = 3*r[i];
672:     x[ir]   = t[ii];
673:     x[ir+1] = t[ii+1];
674:     x[ir+2] = t[ii+2];
675:     ii += 3;
676:   }

678:   ISRestoreIndices(isrow,&rout);
679:   ISRestoreIndices(iscol,&cout);
680:   VecRestoreArray(bb,&b);
681:   VecRestoreArray(xx,&x);
682:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
683:   return(0);
684: }

688: int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
689: {
690:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
691:   IS              iscol=a->col,isrow=a->row;
692:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
693:   int             *diag = a->diag,ii,ic,ir,oidx;
694:   MatScalar       *aa=a->a,*v;
695:   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
696:   PetscScalar     *x,*b,*t;

699:   VecGetArray(bb,&b);
700:   VecGetArray(xx,&x);
701:   t  = a->solve_work;

703:   ISGetIndices(isrow,&rout); r = rout;
704:   ISGetIndices(iscol,&cout); c = cout;

706:   /* copy the b into temp work space according to permutation */
707:   ii = 0;
708:   for (i=0; i<n; i++) {
709:     ic      = 4*c[i];
710:     t[ii]   = b[ic];
711:     t[ii+1] = b[ic+1];
712:     t[ii+2] = b[ic+2];
713:     t[ii+3] = b[ic+3];
714:     ii += 4;
715:   }

717:   /* forward solve the U^T */
718:   idx = 0;
719:   for (i=0; i<n; i++) {

721:     v     = aa + 16*diag[i];
722:     /* multiply by the inverse of the block diagonal */
723:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
724:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
725:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
726:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
727:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
728:     v += 16;

730:     vi    = aj + diag[i] + 1;
731:     nz    = ai[i+1] - diag[i] - 1;
732:     while (nz--) {
733:       oidx = 4*(*vi++);
734:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
735:       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
736:       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
737:       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
738:       v  += 16;
739:     }
740:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
741:     idx += 4;
742:   }
743:   /* backward solve the L^T */
744:   for (i=n-1; i>=0; i--){
745:     v    = aa + 16*diag[i] - 16;
746:     vi   = aj + diag[i] - 1;
747:     nz   = diag[i] - ai[i];
748:     idt  = 4*i;
749:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
750:     while (nz--) {
751:       idx   = 4*(*vi--);
752:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
753:       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
754:       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
755:       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
756:       v -= 16;
757:     }
758:   }

760:   /* copy t into x according to permutation */
761:   ii = 0;
762:   for (i=0; i<n; i++) {
763:     ir      = 4*r[i];
764:     x[ir]   = t[ii];
765:     x[ir+1] = t[ii+1];
766:     x[ir+2] = t[ii+2];
767:     x[ir+3] = t[ii+3];
768:     ii += 4;
769:   }

771:   ISRestoreIndices(isrow,&rout);
772:   ISRestoreIndices(iscol,&cout);
773:   VecRestoreArray(bb,&b);
774:   VecRestoreArray(xx,&x);
775:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
776:   return(0);
777: }

781: int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
782: {
783:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
784:   IS              iscol=a->col,isrow=a->row;
785:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
786:   int             *diag = a->diag,ii,ic,ir,oidx;
787:   MatScalar       *aa=a->a,*v;
788:   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
789:   PetscScalar     *x,*b,*t;

792:   VecGetArray(bb,&b);
793:   VecGetArray(xx,&x);
794:   t  = a->solve_work;

796:   ISGetIndices(isrow,&rout); r = rout;
797:   ISGetIndices(iscol,&cout); c = cout;

799:   /* copy the b into temp work space according to permutation */
800:   ii = 0;
801:   for (i=0; i<n; i++) {
802:     ic      = 5*c[i];
803:     t[ii]   = b[ic];
804:     t[ii+1] = b[ic+1];
805:     t[ii+2] = b[ic+2];
806:     t[ii+3] = b[ic+3];
807:     t[ii+4] = b[ic+4];
808:     ii += 5;
809:   }

811:   /* forward solve the U^T */
812:   idx = 0;
813:   for (i=0; i<n; i++) {

815:     v     = aa + 25*diag[i];
816:     /* multiply by the inverse of the block diagonal */
817:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
818:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
819:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
820:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
821:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
822:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
823:     v += 25;

825:     vi    = aj + diag[i] + 1;
826:     nz    = ai[i+1] - diag[i] - 1;
827:     while (nz--) {
828:       oidx = 5*(*vi++);
829:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
830:       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
831:       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
832:       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
833:       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
834:       v  += 25;
835:     }
836:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
837:     idx += 5;
838:   }
839:   /* backward solve the L^T */
840:   for (i=n-1; i>=0; i--){
841:     v    = aa + 25*diag[i] - 25;
842:     vi   = aj + diag[i] - 1;
843:     nz   = diag[i] - ai[i];
844:     idt  = 5*i;
845:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
846:     while (nz--) {
847:       idx   = 5*(*vi--);
848:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
849:       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
850:       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
851:       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
852:       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
853:       v -= 25;
854:     }
855:   }

857:   /* copy t into x according to permutation */
858:   ii = 0;
859:   for (i=0; i<n; i++) {
860:     ir      = 5*r[i];
861:     x[ir]   = t[ii];
862:     x[ir+1] = t[ii+1];
863:     x[ir+2] = t[ii+2];
864:     x[ir+3] = t[ii+3];
865:     x[ir+4] = t[ii+4];
866:     ii += 5;
867:   }

869:   ISRestoreIndices(isrow,&rout);
870:   ISRestoreIndices(iscol,&cout);
871:   VecRestoreArray(bb,&b);
872:   VecRestoreArray(xx,&x);
873:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
874:   return(0);
875: }

879: int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
880: {
881:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
882:   IS              iscol=a->col,isrow=a->row;
883:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
884:   int             *diag = a->diag,ii,ic,ir,oidx;
885:   MatScalar       *aa=a->a,*v;
886:   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
887:   PetscScalar     *x,*b,*t;

890:   VecGetArray(bb,&b);
891:   VecGetArray(xx,&x);
892:   t  = a->solve_work;

894:   ISGetIndices(isrow,&rout); r = rout;
895:   ISGetIndices(iscol,&cout); c = cout;

897:   /* copy the b into temp work space according to permutation */
898:   ii = 0;
899:   for (i=0; i<n; i++) {
900:     ic      = 6*c[i];
901:     t[ii]   = b[ic];
902:     t[ii+1] = b[ic+1];
903:     t[ii+2] = b[ic+2];
904:     t[ii+3] = b[ic+3];
905:     t[ii+4] = b[ic+4];
906:     t[ii+5] = b[ic+5];
907:     ii += 6;
908:   }

910:   /* forward solve the U^T */
911:   idx = 0;
912:   for (i=0; i<n; i++) {

914:     v     = aa + 36*diag[i];
915:     /* multiply by the inverse of the block diagonal */
916:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
917:     x6    = t[5+idx];
918:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
919:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
920:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
921:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
922:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
923:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
924:     v += 36;

926:     vi    = aj + diag[i] + 1;
927:     nz    = ai[i+1] - diag[i] - 1;
928:     while (nz--) {
929:       oidx = 6*(*vi++);
930:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
931:       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
932:       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
933:       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
934:       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
935:       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
936:       v  += 36;
937:     }
938:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
939:     t[5+idx] = s6;
940:     idx += 6;
941:   }
942:   /* backward solve the L^T */
943:   for (i=n-1; i>=0; i--){
944:     v    = aa + 36*diag[i] - 36;
945:     vi   = aj + diag[i] - 1;
946:     nz   = diag[i] - ai[i];
947:     idt  = 6*i;
948:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
949:     s6 = t[5+idt];
950:     while (nz--) {
951:       idx   = 6*(*vi--);
952:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
953:       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
954:       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
955:       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
956:       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
957:       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
958:       v -= 36;
959:     }
960:   }

962:   /* copy t into x according to permutation */
963:   ii = 0;
964:   for (i=0; i<n; i++) {
965:     ir      = 6*r[i];
966:     x[ir]   = t[ii];
967:     x[ir+1] = t[ii+1];
968:     x[ir+2] = t[ii+2];
969:     x[ir+3] = t[ii+3];
970:     x[ir+4] = t[ii+4];
971:     x[ir+5] = t[ii+5];
972:     ii += 6;
973:   }

975:   ISRestoreIndices(isrow,&rout);
976:   ISRestoreIndices(iscol,&cout);
977:   VecRestoreArray(bb,&b);
978:   VecRestoreArray(xx,&x);
979:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
980:   return(0);
981: }

985: int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
986: {
987:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
988:   IS              iscol=a->col,isrow=a->row;
989:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
990:   int             *diag = a->diag,ii,ic,ir,oidx;
991:   MatScalar       *aa=a->a,*v;
992:   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
993:   PetscScalar     *x,*b,*t;

996:   VecGetArray(bb,&b);
997:   VecGetArray(xx,&x);
998:   t  = a->solve_work;

1000:   ISGetIndices(isrow,&rout); r = rout;
1001:   ISGetIndices(iscol,&cout); c = cout;

1003:   /* copy the b into temp work space according to permutation */
1004:   ii = 0;
1005:   for (i=0; i<n; i++) {
1006:     ic      = 7*c[i];
1007:     t[ii]   = b[ic];
1008:     t[ii+1] = b[ic+1];
1009:     t[ii+2] = b[ic+2];
1010:     t[ii+3] = b[ic+3];
1011:     t[ii+4] = b[ic+4];
1012:     t[ii+5] = b[ic+5];
1013:     t[ii+6] = b[ic+6];
1014:     ii += 7;
1015:   }

1017:   /* forward solve the U^T */
1018:   idx = 0;
1019:   for (i=0; i<n; i++) {

1021:     v     = aa + 49*diag[i];
1022:     /* multiply by the inverse of the block diagonal */
1023:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1024:     x6    = t[5+idx]; x7 = t[6+idx];
1025:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1026:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1027:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1028:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1029:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1030:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1031:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1032:     v += 49;

1034:     vi    = aj + diag[i] + 1;
1035:     nz    = ai[i+1] - diag[i] - 1;
1036:     while (nz--) {
1037:       oidx = 7*(*vi++);
1038:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1039:       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1040:       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1041:       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1042:       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1043:       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1044:       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1045:       v  += 49;
1046:     }
1047:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1048:     t[5+idx] = s6;t[6+idx] = s7;
1049:     idx += 7;
1050:   }
1051:   /* backward solve the L^T */
1052:   for (i=n-1; i>=0; i--){
1053:     v    = aa + 49*diag[i] - 49;
1054:     vi   = aj + diag[i] - 1;
1055:     nz   = diag[i] - ai[i];
1056:     idt  = 7*i;
1057:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1058:     s6 = t[5+idt];s7 = t[6+idt];
1059:     while (nz--) {
1060:       idx   = 7*(*vi--);
1061:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1062:       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1063:       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1064:       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1065:       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1066:       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1067:       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1068:       v -= 49;
1069:     }
1070:   }

1072:   /* copy t into x according to permutation */
1073:   ii = 0;
1074:   for (i=0; i<n; i++) {
1075:     ir      = 7*r[i];
1076:     x[ir]   = t[ii];
1077:     x[ir+1] = t[ii+1];
1078:     x[ir+2] = t[ii+2];
1079:     x[ir+3] = t[ii+3];
1080:     x[ir+4] = t[ii+4];
1081:     x[ir+5] = t[ii+5];
1082:     x[ir+6] = t[ii+6];
1083:     ii += 7;
1084:   }

1086:   ISRestoreIndices(isrow,&rout);
1087:   ISRestoreIndices(iscol,&cout);
1088:   VecRestoreArray(bb,&b);
1089:   VecRestoreArray(xx,&x);
1090:   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1091:   return(0);
1092: }

1094: /* ----------------------------------------------------------- */
1097: int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1098: {
1099:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1100:   IS              iscol=a->col,isrow=a->row;
1101:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1102:   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
1103:   MatScalar       *aa=a->a,*v;
1104:   PetscScalar     *x,*b,*s,*t,*ls;

1107:   VecGetArray(bb,&b);
1108:   VecGetArray(xx,&x);
1109:   t  = a->solve_work;

1111:   ISGetIndices(isrow,&rout); r = rout;
1112:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1114:   /* forward solve the lower triangular */
1115:   PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
1116:   for (i=1; i<n; i++) {
1117:     v   = aa + bs2*ai[i];
1118:     vi  = aj + ai[i];
1119:     nz  = a->diag[i] - ai[i];
1120:     s = t + bs*i;
1121:     PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
1122:     while (nz--) {
1123:       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1124:       v += bs2;
1125:     }
1126:   }
1127:   /* backward solve the upper triangular */
1128:   ls = a->solve_work + A->n;
1129:   for (i=n-1; i>=0; i--){
1130:     v   = aa + bs2*(a->diag[i] + 1);
1131:     vi  = aj + a->diag[i] + 1;
1132:     nz  = ai[i+1] - a->diag[i] - 1;
1133:     PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
1134:     while (nz--) {
1135:       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1136:       v += bs2;
1137:     }
1138:     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1139:     PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
1140:   }

1142:   ISRestoreIndices(isrow,&rout);
1143:   ISRestoreIndices(iscol,&cout);
1144:   VecRestoreArray(bb,&b);
1145:   VecRestoreArray(xx,&x);
1146:   PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n);
1147:   return(0);
1148: }

1152: int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1153: {
1154:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1155:   IS              iscol=a->col,isrow=a->row;
1156:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1157:   int             *diag = a->diag;
1158:   MatScalar       *aa=a->a,*v;
1159:   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1160:   PetscScalar     *x,*b,*t;

1163:   VecGetArray(bb,&b);
1164:   VecGetArray(xx,&x);
1165:   t  = a->solve_work;

1167:   ISGetIndices(isrow,&rout); r = rout;
1168:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1170:   /* forward solve the lower triangular */
1171:   idx    = 7*(*r++);
1172:   t[0] = b[idx];   t[1] = b[1+idx];
1173:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1174:   t[5] = b[5+idx]; t[6] = b[6+idx];

1176:   for (i=1; i<n; i++) {
1177:     v     = aa + 49*ai[i];
1178:     vi    = aj + ai[i];
1179:     nz    = diag[i] - ai[i];
1180:     idx   = 7*(*r++);
1181:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1182:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1183:     while (nz--) {
1184:       idx   = 7*(*vi++);
1185:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1186:       x4    = t[3+idx];x5 = t[4+idx];
1187:       x6    = t[5+idx];x7 = t[6+idx];
1188:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1189:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1190:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1191:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1192:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1193:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1194:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1195:       v += 49;
1196:     }
1197:     idx = 7*i;
1198:     t[idx]   = s1;t[1+idx] = s2;
1199:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1200:     t[5+idx] = s6;t[6+idx] = s7;
1201:   }
1202:   /* backward solve the upper triangular */
1203:   for (i=n-1; i>=0; i--){
1204:     v    = aa + 49*diag[i] + 49;
1205:     vi   = aj + diag[i] + 1;
1206:     nz   = ai[i+1] - diag[i] - 1;
1207:     idt  = 7*i;
1208:     s1 = t[idt];  s2 = t[1+idt];
1209:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1210:     s6 = t[5+idt];s7 = t[6+idt];
1211:     while (nz--) {
1212:       idx   = 7*(*vi++);
1213:       x1    = t[idx];   x2 = t[1+idx];
1214:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1215:       x6    = t[5+idx]; x7 = t[6+idx];
1216:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1217:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1218:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1219:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1220:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1221:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1222:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1223:       v += 49;
1224:     }
1225:     idc = 7*(*c--);
1226:     v   = aa + 49*diag[i];
1227:     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1228:                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1229:     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1230:                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1231:     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1232:                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1233:     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1234:                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1235:     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1236:                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1237:     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1238:                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1239:     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1240:                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1241:   }

1243:   ISRestoreIndices(isrow,&rout);
1244:   ISRestoreIndices(iscol,&cout);
1245:   VecRestoreArray(bb,&b);
1246:   VecRestoreArray(xx,&x);
1247:   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1248:   return(0);
1249: }

1253: int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1254: {
1255:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1256:   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1257:   int             ierr,*diag = a->diag,jdx;
1258:   MatScalar       *aa=a->a,*v;
1259:   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;

1262:   VecGetArray(bb,&b);
1263:   VecGetArray(xx,&x);
1264:   /* forward solve the lower triangular */
1265:   idx    = 0;
1266:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1267:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1268:   x[6] = b[6+idx];
1269:   for (i=1; i<n; i++) {
1270:     v     =  aa + 49*ai[i];
1271:     vi    =  aj + ai[i];
1272:     nz    =  diag[i] - ai[i];
1273:     idx   =  7*i;
1274:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1275:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1276:     s7  =  b[6+idx];
1277:     while (nz--) {
1278:       jdx   = 7*(*vi++);
1279:       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1280:       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1281:       x7    = x[6+jdx];
1282:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1283:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1284:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1285:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1286:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1287:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1288:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1289:       v += 49;
1290:      }
1291:     x[idx]   = s1;
1292:     x[1+idx] = s2;
1293:     x[2+idx] = s3;
1294:     x[3+idx] = s4;
1295:     x[4+idx] = s5;
1296:     x[5+idx] = s6;
1297:     x[6+idx] = s7;
1298:   }
1299:   /* backward solve the upper triangular */
1300:   for (i=n-1; i>=0; i--){
1301:     v    = aa + 49*diag[i] + 49;
1302:     vi   = aj + diag[i] + 1;
1303:     nz   = ai[i+1] - diag[i] - 1;
1304:     idt  = 7*i;
1305:     s1 = x[idt];   s2 = x[1+idt];
1306:     s3 = x[2+idt]; s4 = x[3+idt];
1307:     s5 = x[4+idt]; s6 = x[5+idt];
1308:     s7 = x[6+idt];
1309:     while (nz--) {
1310:       idx   = 7*(*vi++);
1311:       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1312:       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1313:       x7    = x[6+idx];
1314:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1315:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1316:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1317:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1318:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1319:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1320:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1321:       v += 49;
1322:     }
1323:     v        = aa + 49*diag[i];
1324:     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1325:                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1326:     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1327:                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1328:     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1329:                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1330:     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1331:                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1332:     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1333:                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1334:     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1335:                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1336:     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1337:                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1338:   }

1340:   VecRestoreArray(bb,&b);
1341:   VecRestoreArray(xx,&x);
1342:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1343:   return(0);
1344: }

1348: int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1349: {
1350:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1351:   IS              iscol=a->col,isrow=a->row;
1352:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1353:   int             *diag = a->diag;
1354:   MatScalar       *aa=a->a,*v;
1355:   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;

1358:   VecGetArray(bb,&b);
1359:   VecGetArray(xx,&x);
1360:   t  = a->solve_work;

1362:   ISGetIndices(isrow,&rout); r = rout;
1363:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1365:   /* forward solve the lower triangular */
1366:   idx    = 6*(*r++);
1367:   t[0] = b[idx];   t[1] = b[1+idx];
1368:   t[2] = b[2+idx]; t[3] = b[3+idx];
1369:   t[4] = b[4+idx]; t[5] = b[5+idx];
1370:   for (i=1; i<n; i++) {
1371:     v     = aa + 36*ai[i];
1372:     vi    = aj + ai[i];
1373:     nz    = diag[i] - ai[i];
1374:     idx   = 6*(*r++);
1375:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1376:     s5  = b[4+idx]; s6 = b[5+idx];
1377:     while (nz--) {
1378:       idx   = 6*(*vi++);
1379:       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1380:       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1381:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1382:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1383:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1384:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1385:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1386:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1387:       v += 36;
1388:     }
1389:     idx = 6*i;
1390:     t[idx]   = s1;t[1+idx] = s2;
1391:     t[2+idx] = s3;t[3+idx] = s4;
1392:     t[4+idx] = s5;t[5+idx] = s6;
1393:   }
1394:   /* backward solve the upper triangular */
1395:   for (i=n-1; i>=0; i--){
1396:     v    = aa + 36*diag[i] + 36;
1397:     vi   = aj + diag[i] + 1;
1398:     nz   = ai[i+1] - diag[i] - 1;
1399:     idt  = 6*i;
1400:     s1 = t[idt];  s2 = t[1+idt];
1401:     s3 = t[2+idt];s4 = t[3+idt];
1402:     s5 = t[4+idt];s6 = t[5+idt];
1403:     while (nz--) {
1404:       idx   = 6*(*vi++);
1405:       x1    = t[idx];   x2 = t[1+idx];
1406:       x3    = t[2+idx]; x4 = t[3+idx];
1407:       x5    = t[4+idx]; x6 = t[5+idx];
1408:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1409:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1410:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1411:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1412:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1413:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1414:       v += 36;
1415:     }
1416:     idc = 6*(*c--);
1417:     v   = aa + 36*diag[i];
1418:     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1419:                                  v[18]*s4+v[24]*s5+v[30]*s6;
1420:     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1421:                                  v[19]*s4+v[25]*s5+v[31]*s6;
1422:     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1423:                                  v[20]*s4+v[26]*s5+v[32]*s6;
1424:     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1425:                                  v[21]*s4+v[27]*s5+v[33]*s6;
1426:     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1427:                                  v[22]*s4+v[28]*s5+v[34]*s6;
1428:     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1429:                                  v[23]*s4+v[29]*s5+v[35]*s6;
1430:   }

1432:   ISRestoreIndices(isrow,&rout);
1433:   ISRestoreIndices(iscol,&cout);
1434:   VecRestoreArray(bb,&b);
1435:   VecRestoreArray(xx,&x);
1436:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1437:   return(0);
1438: }

1442: int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1443: {
1444:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1445:   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1446:   int             ierr,*diag = a->diag,jdx;
1447:   MatScalar       *aa=a->a,*v;
1448:   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;

1451:   VecGetArray(bb,&b);
1452:   VecGetArray(xx,&x);
1453:   /* forward solve the lower triangular */
1454:   idx    = 0;
1455:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1456:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1457:   for (i=1; i<n; i++) {
1458:     v     =  aa + 36*ai[i];
1459:     vi    =  aj + ai[i];
1460:     nz    =  diag[i] - ai[i];
1461:     idx   =  6*i;
1462:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1463:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1464:     while (nz--) {
1465:       jdx   = 6*(*vi++);
1466:       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1467:       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1468:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1469:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1470:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1471:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1472:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1473:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1474:       v += 36;
1475:      }
1476:     x[idx]   = s1;
1477:     x[1+idx] = s2;
1478:     x[2+idx] = s3;
1479:     x[3+idx] = s4;
1480:     x[4+idx] = s5;
1481:     x[5+idx] = s6;
1482:   }
1483:   /* backward solve the upper triangular */
1484:   for (i=n-1; i>=0; i--){
1485:     v    = aa + 36*diag[i] + 36;
1486:     vi   = aj + diag[i] + 1;
1487:     nz   = ai[i+1] - diag[i] - 1;
1488:     idt  = 6*i;
1489:     s1 = x[idt];   s2 = x[1+idt];
1490:     s3 = x[2+idt]; s4 = x[3+idt];
1491:     s5 = x[4+idt]; s6 = x[5+idt];
1492:     while (nz--) {
1493:       idx   = 6*(*vi++);
1494:       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1495:       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1496:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1497:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1498:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1499:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1500:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1501:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1502:       v += 36;
1503:     }
1504:     v        = aa + 36*diag[i];
1505:     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1506:     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1507:     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1508:     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1509:     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1510:     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1511:   }

1513:   VecRestoreArray(bb,&b);
1514:   VecRestoreArray(xx,&x);
1515:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1516:   return(0);
1517: }

1521: int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1522: {
1523:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1524:   IS              iscol=a->col,isrow=a->row;
1525:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1526:   int             *diag = a->diag;
1527:   MatScalar       *aa=a->a,*v;
1528:   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;

1531:   VecGetArray(bb,&b);
1532:   VecGetArray(xx,&x);
1533:   t  = a->solve_work;

1535:   ISGetIndices(isrow,&rout); r = rout;
1536:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1538:   /* forward solve the lower triangular */
1539:   idx    = 5*(*r++);
1540:   t[0] = b[idx];   t[1] = b[1+idx];
1541:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1542:   for (i=1; i<n; i++) {
1543:     v     = aa + 25*ai[i];
1544:     vi    = aj + ai[i];
1545:     nz    = diag[i] - ai[i];
1546:     idx   = 5*(*r++);
1547:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1548:     s5  = b[4+idx];
1549:     while (nz--) {
1550:       idx   = 5*(*vi++);
1551:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1552:       x4    = t[3+idx];x5 = t[4+idx];
1553:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1554:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1555:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1556:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1557:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1558:       v += 25;
1559:     }
1560:     idx = 5*i;
1561:     t[idx]   = s1;t[1+idx] = s2;
1562:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1563:   }
1564:   /* backward solve the upper triangular */
1565:   for (i=n-1; i>=0; i--){
1566:     v    = aa + 25*diag[i] + 25;
1567:     vi   = aj + diag[i] + 1;
1568:     nz   = ai[i+1] - diag[i] - 1;
1569:     idt  = 5*i;
1570:     s1 = t[idt];  s2 = t[1+idt];
1571:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1572:     while (nz--) {
1573:       idx   = 5*(*vi++);
1574:       x1    = t[idx];   x2 = t[1+idx];
1575:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1576:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1577:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1578:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1579:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1580:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1581:       v += 25;
1582:     }
1583:     idc = 5*(*c--);
1584:     v   = aa + 25*diag[i];
1585:     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1586:                                  v[15]*s4+v[20]*s5;
1587:     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1588:                                  v[16]*s4+v[21]*s5;
1589:     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1590:                                  v[17]*s4+v[22]*s5;
1591:     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1592:                                  v[18]*s4+v[23]*s5;
1593:     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1594:                                  v[19]*s4+v[24]*s5;
1595:   }

1597:   ISRestoreIndices(isrow,&rout);
1598:   ISRestoreIndices(iscol,&cout);
1599:   VecRestoreArray(bb,&b);
1600:   VecRestoreArray(xx,&x);
1601:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1602:   return(0);
1603: }

1607: int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1608: {
1609:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1610:   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1611:   int             ierr,*diag = a->diag,jdx;
1612:   MatScalar       *aa=a->a,*v;
1613:   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;

1616:   VecGetArray(bb,&b);
1617:   VecGetArray(xx,&x);
1618:   /* forward solve the lower triangular */
1619:   idx    = 0;
1620:   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1621:   for (i=1; i<n; i++) {
1622:     v     =  aa + 25*ai[i];
1623:     vi    =  aj + ai[i];
1624:     nz    =  diag[i] - ai[i];
1625:     idx   =  5*i;
1626:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1627:     while (nz--) {
1628:       jdx   = 5*(*vi++);
1629:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1630:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1631:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1632:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1633:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1634:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1635:       v    += 25;
1636:     }
1637:     x[idx]   = s1;
1638:     x[1+idx] = s2;
1639:     x[2+idx] = s3;
1640:     x[3+idx] = s4;
1641:     x[4+idx] = s5;
1642:   }
1643:   /* backward solve the upper triangular */
1644:   for (i=n-1; i>=0; i--){
1645:     v    = aa + 25*diag[i] + 25;
1646:     vi   = aj + diag[i] + 1;
1647:     nz   = ai[i+1] - diag[i] - 1;
1648:     idt  = 5*i;
1649:     s1 = x[idt];  s2 = x[1+idt];
1650:     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1651:     while (nz--) {
1652:       idx   = 5*(*vi++);
1653:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1654:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1655:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1656:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1657:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1658:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1659:       v    += 25;
1660:     }
1661:     v        = aa + 25*diag[i];
1662:     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1663:     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1664:     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1665:     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1666:     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1667:   }

1669:   VecRestoreArray(bb,&b);
1670:   VecRestoreArray(xx,&x);
1671:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1672:   return(0);
1673: }

1677: int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1678: {
1679:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1680:   IS              iscol=a->col,isrow=a->row;
1681:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1682:   int             *diag = a->diag;
1683:   MatScalar       *aa=a->a,*v;
1684:   PetscScalar     *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;

1687:   VecGetArray(bb,&b);
1688:   VecGetArray(xx,&x);
1689:   t  = a->solve_work;

1691:   ISGetIndices(isrow,&rout); r = rout;
1692:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1694:   /* forward solve the lower triangular */
1695:   idx    = 4*(*r++);
1696:   t[0] = b[idx];   t[1] = b[1+idx];
1697:   t[2] = b[2+idx]; t[3] = b[3+idx];
1698:   for (i=1; i<n; i++) {
1699:     v     = aa + 16*ai[i];
1700:     vi    = aj + ai[i];
1701:     nz    = diag[i] - ai[i];
1702:     idx   = 4*(*r++);
1703:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1704:     while (nz--) {
1705:       idx   = 4*(*vi++);
1706:       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1707:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1708:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1709:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1710:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1711:       v    += 16;
1712:     }
1713:     idx        = 4*i;
1714:     t[idx]   = s1;t[1+idx] = s2;
1715:     t[2+idx] = s3;t[3+idx] = s4;
1716:   }
1717:   /* backward solve the upper triangular */
1718:   for (i=n-1; i>=0; i--){
1719:     v    = aa + 16*diag[i] + 16;
1720:     vi   = aj + diag[i] + 1;
1721:     nz   = ai[i+1] - diag[i] - 1;
1722:     idt  = 4*i;
1723:     s1 = t[idt];  s2 = t[1+idt];
1724:     s3 = t[2+idt];s4 = t[3+idt];
1725:     while (nz--) {
1726:       idx   = 4*(*vi++);
1727:       x1    = t[idx];   x2 = t[1+idx];
1728:       x3    = t[2+idx]; x4 = t[3+idx];
1729:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1730:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1731:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1732:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1733:       v += 16;
1734:     }
1735:     idc      = 4*(*c--);
1736:     v        = aa + 16*diag[i];
1737:     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1738:     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1739:     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1740:     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1741:   }

1743:   ISRestoreIndices(isrow,&rout);
1744:   ISRestoreIndices(iscol,&cout);
1745:   VecRestoreArray(bb,&b);
1746:   VecRestoreArray(xx,&x);
1747:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1748:   return(0);
1749: }

1753: int MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1754: {
1755:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1756:   IS              iscol=a->col,isrow=a->row;
1757:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1758:   int             *diag = a->diag;
1759:   MatScalar       *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1760:   PetscScalar     *x,*b;

1763:   VecGetArray(bb,&b);
1764:   VecGetArray(xx,&x);
1765:   t  = (MatScalar *)a->solve_work;

1767:   ISGetIndices(isrow,&rout); r = rout;
1768:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1770:   /* forward solve the lower triangular */
1771:   idx    = 4*(*r++);
1772:   t[0] = (MatScalar)b[idx];
1773:   t[1] = (MatScalar)b[1+idx];
1774:   t[2] = (MatScalar)b[2+idx];
1775:   t[3] = (MatScalar)b[3+idx];
1776:   for (i=1; i<n; i++) {
1777:     v     = aa + 16*ai[i];
1778:     vi    = aj + ai[i];
1779:     nz    = diag[i] - ai[i];
1780:     idx   = 4*(*r++);
1781:     s1 = (MatScalar)b[idx];
1782:     s2 = (MatScalar)b[1+idx];
1783:     s3 = (MatScalar)b[2+idx];
1784:     s4 = (MatScalar)b[3+idx];
1785:     while (nz--) {
1786:       idx   = 4*(*vi++);
1787:       x1  = t[idx];
1788:       x2  = t[1+idx];
1789:       x3  = t[2+idx];
1790:       x4  = t[3+idx];
1791:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1792:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1793:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1794:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1795:       v    += 16;
1796:     }
1797:     idx        = 4*i;
1798:     t[idx]   = s1;
1799:     t[1+idx] = s2;
1800:     t[2+idx] = s3;
1801:     t[3+idx] = s4;
1802:   }
1803:   /* backward solve the upper triangular */
1804:   for (i=n-1; i>=0; i--){
1805:     v    = aa + 16*diag[i] + 16;
1806:     vi   = aj + diag[i] + 1;
1807:     nz   = ai[i+1] - diag[i] - 1;
1808:     idt  = 4*i;
1809:     s1 = t[idt];
1810:     s2 = t[1+idt];
1811:     s3 = t[2+idt];
1812:     s4 = t[3+idt];
1813:     while (nz--) {
1814:       idx   = 4*(*vi++);
1815:       x1  = t[idx];
1816:       x2  = t[1+idx];
1817:       x3  = t[2+idx];
1818:       x4  = t[3+idx];
1819:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1820:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1821:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1822:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1823:       v += 16;
1824:     }
1825:     idc      = 4*(*c--);
1826:     v        = aa + 16*diag[i];
1827:     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1828:     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1829:     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1830:     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1831:     x[idc]   = (PetscScalar)t[idt];
1832:     x[1+idc] = (PetscScalar)t[1+idt];
1833:     x[2+idc] = (PetscScalar)t[2+idt];
1834:     x[3+idc] = (PetscScalar)t[3+idt];
1835:  }

1837:   ISRestoreIndices(isrow,&rout);
1838:   ISRestoreIndices(iscol,&cout);
1839:   VecRestoreArray(bb,&b);
1840:   VecRestoreArray(xx,&x);
1841:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1842:   return(0);
1843: }

1845: #if defined (PETSC_HAVE_SSE)

1847: #include PETSC_HAVE_SSE

1851: int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1852: {
1853:   /* 
1854:      Note: This code uses demotion of double
1855:      to float when performing the mixed-mode computation.
1856:      This may not be numerically reasonable for all applications.
1857:   */
1858:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1859:   IS              iscol=a->col,isrow=a->row;
1860:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1861:   int             *diag = a->diag,ai16;
1862:   MatScalar       *aa=a->a,*v;
1863:   PetscScalar     *x,*b,*t;

1865:   /* Make space in temp stack for 16 Byte Aligned arrays */
1866:   float           ssealignedspace[11],*tmps,*tmpx;
1867:   unsigned long   offset;
1868: 
1870:   SSE_SCOPE_BEGIN;

1872:     offset = (unsigned long)ssealignedspace % 16;
1873:     if (offset) offset = (16 - offset)/4;
1874:     tmps = &ssealignedspace[offset];
1875:     tmpx = &ssealignedspace[offset+4];
1876:     PREFETCH_NTA(aa+16*ai[1]);

1878:     VecGetArray(bb,&b);
1879:     VecGetArray(xx,&x);
1880:     t  = a->solve_work;

1882:     ISGetIndices(isrow,&rout); r = rout;
1883:     ISGetIndices(iscol,&cout); c = cout + (n-1);

1885:     /* forward solve the lower triangular */
1886:     idx  = 4*(*r++);
1887:     t[0] = b[idx];   t[1] = b[1+idx];
1888:     t[2] = b[2+idx]; t[3] = b[3+idx];
1889:     v    =  aa + 16*ai[1];

1891:     for (i=1; i<n;) {
1892:       PREFETCH_NTA(&v[8]);
1893:       vi   =  aj      + ai[i];
1894:       nz   =  diag[i] - ai[i];
1895:       idx  =  4*(*r++);

1897:       /* Demote sum from double to float */
1898:       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1899:       LOAD_PS(tmps,XMM7);

1901:       while (nz--) {
1902:         PREFETCH_NTA(&v[16]);
1903:         idx = 4*(*vi++);
1904: 
1905:         /* Demote solution (so far) from double to float */
1906:         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);

1908:         /* 4x4 Matrix-Vector product with negative accumulation: */
1909:         SSE_INLINE_BEGIN_2(tmpx,v)
1910:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1912:           /* First Column */
1913:           SSE_COPY_PS(XMM0,XMM6)
1914:           SSE_SHUFFLE(XMM0,XMM0,0x00)
1915:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1916:           SSE_SUB_PS(XMM7,XMM0)
1917: 
1918:           /* Second Column */
1919:           SSE_COPY_PS(XMM1,XMM6)
1920:           SSE_SHUFFLE(XMM1,XMM1,0x55)
1921:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1922:           SSE_SUB_PS(XMM7,XMM1)
1923: 
1924:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1925: 
1926:           /* Third Column */
1927:           SSE_COPY_PS(XMM2,XMM6)
1928:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1929:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1930:           SSE_SUB_PS(XMM7,XMM2)

1932:           /* Fourth Column */
1933:           SSE_COPY_PS(XMM3,XMM6)
1934:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1935:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1936:           SSE_SUB_PS(XMM7,XMM3)
1937:         SSE_INLINE_END_2
1938: 
1939:         v  += 16;
1940:       }
1941:       idx = 4*i;
1942:       v   = aa + 16*ai[++i];
1943:       PREFETCH_NTA(v);
1944:       STORE_PS(tmps,XMM7);

1946:       /* Promote result from float to double */
1947:       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1948:     }
1949:     /* backward solve the upper triangular */
1950:     idt  = 4*(n-1);
1951:     ai16 = 16*diag[n-1];
1952:     v    = aa + ai16 + 16;
1953:     for (i=n-1; i>=0;){
1954:       PREFETCH_NTA(&v[8]);
1955:       vi = aj + diag[i] + 1;
1956:       nz = ai[i+1] - diag[i] - 1;
1957: 
1958:       /* Demote accumulator from double to float */
1959:       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1960:       LOAD_PS(tmps,XMM7);

1962:       while (nz--) {
1963:         PREFETCH_NTA(&v[16]);
1964:         idx = 4*(*vi++);

1966:         /* Demote solution (so far) from double to float */
1967:         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

1969:         /* 4x4 Matrix-Vector Product with negative accumulation: */
1970:         SSE_INLINE_BEGIN_2(tmpx,v)
1971:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1973:           /* First Column */
1974:           SSE_COPY_PS(XMM0,XMM6)
1975:           SSE_SHUFFLE(XMM0,XMM0,0x00)
1976:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1977:           SSE_SUB_PS(XMM7,XMM0)

1979:           /* Second Column */
1980:           SSE_COPY_PS(XMM1,XMM6)
1981:           SSE_SHUFFLE(XMM1,XMM1,0x55)
1982:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1983:           SSE_SUB_PS(XMM7,XMM1)

1985:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1986: 
1987:           /* Third Column */
1988:           SSE_COPY_PS(XMM2,XMM6)
1989:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1990:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1991:           SSE_SUB_PS(XMM7,XMM2)

1993:           /* Fourth Column */
1994:           SSE_COPY_PS(XMM3,XMM6)
1995:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1996:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1997:           SSE_SUB_PS(XMM7,XMM3)
1998:         SSE_INLINE_END_2
1999:         v  += 16;
2000:       }
2001:       v    = aa + ai16;
2002:       ai16 = 16*diag[--i];
2003:       PREFETCH_NTA(aa+ai16+16);
2004:       /* 
2005:          Scale the result by the diagonal 4x4 block, 
2006:          which was inverted as part of the factorization
2007:       */
2008:       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2009:         /* First Column */
2010:         SSE_COPY_PS(XMM0,XMM7)
2011:         SSE_SHUFFLE(XMM0,XMM0,0x00)
2012:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

2014:         /* Second Column */
2015:         SSE_COPY_PS(XMM1,XMM7)
2016:         SSE_SHUFFLE(XMM1,XMM1,0x55)
2017:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2018:         SSE_ADD_PS(XMM0,XMM1)

2020:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2021: 
2022:         /* Third Column */
2023:         SSE_COPY_PS(XMM2,XMM7)
2024:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2025:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2026:         SSE_ADD_PS(XMM0,XMM2)

2028:         /* Fourth Column */
2029:         SSE_COPY_PS(XMM3,XMM7)
2030:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2031:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2032:         SSE_ADD_PS(XMM0,XMM3)
2033: 
2034:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2035:       SSE_INLINE_END_3

2037:       /* Promote solution from float to double */
2038:       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);

2040:       /* Apply reordering to t and stream into x.    */
2041:       /* This way, x doesn't pollute the cache.      */
2042:       /* Be careful with size: 2 doubles = 4 floats! */
2043:       idc  = 4*(*c--);
2044:       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2045:         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2046:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2047:         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2048:         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2049:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2050:         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2051:       SSE_INLINE_END_2
2052:       v    = aa + ai16 + 16;
2053:       idt -= 4;
2054:     }

2056:     ISRestoreIndices(isrow,&rout);
2057:     ISRestoreIndices(iscol,&cout);
2058:     VecRestoreArray(bb,&b);
2059:     VecRestoreArray(xx,&x);
2060:     PetscLogFlops(2*16*(a->nz) - 4*A->n);
2061:   SSE_SCOPE_END;
2062:   return(0);
2063: }

2065: #endif


2068: /*
2069:       Special case where the matrix was ILU(0) factored in the natural
2070:    ordering. This eliminates the need for the column and row permutation.
2071: */
2074: int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2075: {
2076:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2077:   int             n=a->mbs,*ai=a->i,*aj=a->j;
2078:   int             ierr,*diag = a->diag;
2079:   MatScalar       *aa=a->a;
2080:   PetscScalar     *x,*b;

2083:   VecGetArray(bb,&b);
2084:   VecGetArray(xx,&x);

2086: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2087:   {
2088:     static PetscScalar w[2000]; /* very BAD need to fix */
2089:     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2090:   }
2091: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2092:   {
2093:     static PetscScalar w[2000]; /* very BAD need to fix */
2094:     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2095:   }
2096: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2097:   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2098: #else
2099:   {
2100:     PetscScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2101:     MatScalar    *v;
2102:     int          jdx,idt,idx,nz,*vi,i,ai16;

2104:   /* forward solve the lower triangular */
2105:   idx    = 0;
2106:   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2107:   for (i=1; i<n; i++) {
2108:     v     =  aa      + 16*ai[i];
2109:     vi    =  aj      + ai[i];
2110:     nz    =  diag[i] - ai[i];
2111:     idx   +=  4;
2112:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2113:     while (nz--) {
2114:       jdx   = 4*(*vi++);
2115:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2116:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2117:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2118:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2119:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2120:       v    += 16;
2121:     }
2122:     x[idx]   = s1;
2123:     x[1+idx] = s2;
2124:     x[2+idx] = s3;
2125:     x[3+idx] = s4;
2126:   }
2127:   /* backward solve the upper triangular */
2128:   idt = 4*(n-1);
2129:   for (i=n-1; i>=0; i--){
2130:     ai16 = 16*diag[i];
2131:     v    = aa + ai16 + 16;
2132:     vi   = aj + diag[i] + 1;
2133:     nz   = ai[i+1] - diag[i] - 1;
2134:     s1 = x[idt];  s2 = x[1+idt];
2135:     s3 = x[2+idt];s4 = x[3+idt];
2136:     while (nz--) {
2137:       idx   = 4*(*vi++);
2138:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2139:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2140:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2141:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2142:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2143:       v    += 16;
2144:     }
2145:     v        = aa + ai16;
2146:     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2147:     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2148:     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2149:     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2150:     idt -= 4;
2151:   }
2152:   }
2153: #endif

2155:   VecRestoreArray(bb,&b);
2156:   VecRestoreArray(xx,&x);
2157:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2158:   return(0);
2159: }

2163: int MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2164: {
2165:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2166:   int             n=a->mbs,*ai=a->i,*aj=a->j;
2167:   int             ierr,*diag = a->diag;
2168:   MatScalar       *aa=a->a;
2169:   PetscScalar     *x,*b;

2172:   VecGetArray(bb,&b);
2173:   VecGetArray(xx,&x);

2175:   {
2176:     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2177:     MatScalar  *v,*t=(MatScalar *)x;
2178:     int        jdx,idt,idx,nz,*vi,i,ai16;

2180:     /* forward solve the lower triangular */
2181:     idx  = 0;
2182:     t[0] = (MatScalar)b[0];
2183:     t[1] = (MatScalar)b[1];
2184:     t[2] = (MatScalar)b[2];
2185:     t[3] = (MatScalar)b[3];
2186:     for (i=1; i<n; i++) {
2187:       v     =  aa      + 16*ai[i];
2188:       vi    =  aj      + ai[i];
2189:       nz    =  diag[i] - ai[i];
2190:       idx   +=  4;
2191:       s1 = (MatScalar)b[idx];
2192:       s2 = (MatScalar)b[1+idx];
2193:       s3 = (MatScalar)b[2+idx];
2194:       s4 = (MatScalar)b[3+idx];
2195:       while (nz--) {
2196:         jdx = 4*(*vi++);
2197:         x1  = t[jdx];
2198:         x2  = t[1+jdx];
2199:         x3  = t[2+jdx];
2200:         x4  = t[3+jdx];
2201:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2202:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2203:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2204:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2205:         v    += 16;
2206:       }
2207:       t[idx]   = s1;
2208:       t[1+idx] = s2;
2209:       t[2+idx] = s3;
2210:       t[3+idx] = s4;
2211:     }
2212:     /* backward solve the upper triangular */
2213:     idt = 4*(n-1);
2214:     for (i=n-1; i>=0; i--){
2215:       ai16 = 16*diag[i];
2216:       v    = aa + ai16 + 16;
2217:       vi   = aj + diag[i] + 1;
2218:       nz   = ai[i+1] - diag[i] - 1;
2219:       s1   = t[idt];
2220:       s2   = t[1+idt];
2221:       s3   = t[2+idt];
2222:       s4   = t[3+idt];
2223:       while (nz--) {
2224:         idx = 4*(*vi++);
2225:         x1  = (MatScalar)x[idx];
2226:         x2  = (MatScalar)x[1+idx];
2227:         x3  = (MatScalar)x[2+idx];
2228:         x4  = (MatScalar)x[3+idx];
2229:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2230:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2231:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2232:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2233:         v    += 16;
2234:       }
2235:       v        = aa + ai16;
2236:       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2237:       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2238:       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2239:       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2240:       idt -= 4;
2241:     }
2242:   }

2244:   VecRestoreArray(bb,&b);
2245:   VecRestoreArray(xx,&x);
2246:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2247:   return(0);
2248: }

2250: #if defined (PETSC_HAVE_SSE)

2252: #include PETSC_HAVE_SSE
2255: int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2256: {
2257:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2258:   unsigned short *aj=(unsigned short *)a->j;
2259:   int            ierr,*ai=a->i,n=a->mbs,*diag = a->diag;
2260:   MatScalar      *aa=a->a;
2261:   PetscScalar    *x,*b;

2264:   SSE_SCOPE_BEGIN;
2265:   /* 
2266:      Note: This code currently uses demotion of double
2267:      to float when performing the mixed-mode computation.
2268:      This may not be numerically reasonable for all applications.
2269:   */
2270:   PREFETCH_NTA(aa+16*ai[1]);

2272:   VecGetArray(bb,&b);
2273:   VecGetArray(xx,&x);
2274:   {
2275:     /* x will first be computed in single precision then promoted inplace to double */
2276:     MatScalar      *v,*t=(MatScalar *)x;
2277:     int            nz,i,idt,ai16;
2278:     unsigned int   jdx,idx;
2279:     unsigned short *vi;
2280:     /* Forward solve the lower triangular factor. */

2282:     /* First block is the identity. */
2283:     idx  = 0;
2284:     CONVERT_DOUBLE4_FLOAT4(t,b);
2285:     v    =  aa + 16*((unsigned int)ai[1]);

2287:     for (i=1; i<n;) {
2288:       PREFETCH_NTA(&v[8]);
2289:       vi   =  aj      + ai[i];
2290:       nz   =  diag[i] - ai[i];
2291:       idx +=  4;

2293:       /* Demote RHS from double to float. */
2294:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2295:       LOAD_PS(&t[idx],XMM7);

2297:       while (nz--) {
2298:         PREFETCH_NTA(&v[16]);
2299:         jdx = 4*((unsigned int)(*vi++));
2300: 
2301:         /* 4x4 Matrix-Vector product with negative accumulation: */
2302:         SSE_INLINE_BEGIN_2(&t[jdx],v)
2303:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2305:           /* First Column */
2306:           SSE_COPY_PS(XMM0,XMM6)
2307:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2308:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2309:           SSE_SUB_PS(XMM7,XMM0)

2311:           /* Second Column */
2312:           SSE_COPY_PS(XMM1,XMM6)
2313:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2314:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2315:           SSE_SUB_PS(XMM7,XMM1)

2317:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2318: 
2319:           /* Third Column */
2320:           SSE_COPY_PS(XMM2,XMM6)
2321:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2322:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2323:           SSE_SUB_PS(XMM7,XMM2)

2325:           /* Fourth Column */
2326:           SSE_COPY_PS(XMM3,XMM6)
2327:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2328:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2329:           SSE_SUB_PS(XMM7,XMM3)
2330:         SSE_INLINE_END_2
2331: 
2332:         v  += 16;
2333:       }
2334:       v    =  aa + 16*ai[++i];
2335:       PREFETCH_NTA(v);
2336:       STORE_PS(&t[idx],XMM7);
2337:     }

2339:     /* Backward solve the upper triangular factor.*/

2341:     idt  = 4*(n-1);
2342:     ai16 = 16*diag[n-1];
2343:     v    = aa + ai16 + 16;
2344:     for (i=n-1; i>=0;){
2345:       PREFETCH_NTA(&v[8]);
2346:       vi = aj + diag[i] + 1;
2347:       nz = ai[i+1] - diag[i] - 1;
2348: 
2349:       LOAD_PS(&t[idt],XMM7);

2351:       while (nz--) {
2352:         PREFETCH_NTA(&v[16]);
2353:         idx = 4*((unsigned int)(*vi++));

2355:         /* 4x4 Matrix-Vector Product with negative accumulation: */
2356:         SSE_INLINE_BEGIN_2(&t[idx],v)
2357:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2359:           /* First Column */
2360:           SSE_COPY_PS(XMM0,XMM6)
2361:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2362:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2363:           SSE_SUB_PS(XMM7,XMM0)

2365:           /* Second Column */
2366:           SSE_COPY_PS(XMM1,XMM6)
2367:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2368:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2369:           SSE_SUB_PS(XMM7,XMM1)

2371:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2372: 
2373:           /* Third Column */
2374:           SSE_COPY_PS(XMM2,XMM6)
2375:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2376:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2377:           SSE_SUB_PS(XMM7,XMM2)

2379:           /* Fourth Column */
2380:           SSE_COPY_PS(XMM3,XMM6)
2381:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2382:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2383:           SSE_SUB_PS(XMM7,XMM3)
2384:         SSE_INLINE_END_2
2385:         v  += 16;
2386:       }
2387:       v    = aa + ai16;
2388:       ai16 = 16*diag[--i];
2389:       PREFETCH_NTA(aa+ai16+16);
2390:       /* 
2391:          Scale the result by the diagonal 4x4 block, 
2392:          which was inverted as part of the factorization
2393:       */
2394:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2395:         /* First Column */
2396:         SSE_COPY_PS(XMM0,XMM7)
2397:         SSE_SHUFFLE(XMM0,XMM0,0x00)
2398:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

2400:         /* Second Column */
2401:         SSE_COPY_PS(XMM1,XMM7)
2402:         SSE_SHUFFLE(XMM1,XMM1,0x55)
2403:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2404:         SSE_ADD_PS(XMM0,XMM1)

2406:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2407: 
2408:         /* Third Column */
2409:         SSE_COPY_PS(XMM2,XMM7)
2410:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2411:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2412:         SSE_ADD_PS(XMM0,XMM2)

2414:         /* Fourth Column */
2415:         SSE_COPY_PS(XMM3,XMM7)
2416:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2417:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2418:         SSE_ADD_PS(XMM0,XMM3)

2420:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2421:       SSE_INLINE_END_3

2423:       v    = aa + ai16 + 16;
2424:       idt -= 4;
2425:     }

2427:     /* Convert t from single precision back to double precision (inplace)*/
2428:     idt = 4*(n-1);
2429:     for (i=n-1;i>=0;i--) {
2430:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2431:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2432:       PetscScalar *xtemp=&x[idt];
2433:       MatScalar   *ttemp=&t[idt];
2434:       xtemp[3] = (PetscScalar)ttemp[3];
2435:       xtemp[2] = (PetscScalar)ttemp[2];
2436:       xtemp[1] = (PetscScalar)ttemp[1];
2437:       xtemp[0] = (PetscScalar)ttemp[0];
2438:       idt -= 4;
2439:     }

2441:   } /* End of artificial scope. */
2442:   VecRestoreArray(bb,&b);
2443:   VecRestoreArray(xx,&x);
2444:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2445:   SSE_SCOPE_END;
2446:   return(0);
2447: }

2451: int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2452: {
2453:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2454:   int            *aj=a->j;
2455:   int            ierr,*ai=a->i,n=a->mbs,*diag = a->diag;
2456:   MatScalar      *aa=a->a;
2457:   PetscScalar    *x,*b;

2460:   SSE_SCOPE_BEGIN;
2461:   /* 
2462:      Note: This code currently uses demotion of double
2463:      to float when performing the mixed-mode computation.
2464:      This may not be numerically reasonable for all applications.
2465:   */
2466:   PREFETCH_NTA(aa+16*ai[1]);

2468:   VecGetArray(bb,&b);
2469:   VecGetArray(xx,&x);
2470:   {
2471:     /* x will first be computed in single precision then promoted inplace to double */
2472:     MatScalar *v,*t=(MatScalar *)x;
2473:     int       nz,i,idt,ai16;
2474:     int       jdx,idx;
2475:     int       *vi;
2476:     /* Forward solve the lower triangular factor. */

2478:     /* First block is the identity. */
2479:     idx  = 0;
2480:     CONVERT_DOUBLE4_FLOAT4(t,b);
2481:     v    =  aa + 16*ai[1];

2483:     for (i=1; i<n;) {
2484:       PREFETCH_NTA(&v[8]);
2485:       vi   =  aj      + ai[i];
2486:       nz   =  diag[i] - ai[i];
2487:       idx +=  4;

2489:       /* Demote RHS from double to float. */
2490:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2491:       LOAD_PS(&t[idx],XMM7);

2493:       while (nz--) {
2494:         PREFETCH_NTA(&v[16]);
2495:         jdx = 4*(*vi++);
2496: /*          jdx = *vi++; */
2497: 
2498:         /* 4x4 Matrix-Vector product with negative accumulation: */
2499:         SSE_INLINE_BEGIN_2(&t[jdx],v)
2500:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2502:           /* First Column */
2503:           SSE_COPY_PS(XMM0,XMM6)
2504:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2505:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2506:           SSE_SUB_PS(XMM7,XMM0)

2508:           /* Second Column */
2509:           SSE_COPY_PS(XMM1,XMM6)
2510:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2511:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2512:           SSE_SUB_PS(XMM7,XMM1)

2514:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2515: 
2516:           /* Third Column */
2517:           SSE_COPY_PS(XMM2,XMM6)
2518:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2519:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2520:           SSE_SUB_PS(XMM7,XMM2)

2522:           /* Fourth Column */
2523:           SSE_COPY_PS(XMM3,XMM6)
2524:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2525:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2526:           SSE_SUB_PS(XMM7,XMM3)
2527:         SSE_INLINE_END_2
2528: 
2529:         v  += 16;
2530:       }
2531:       v    =  aa + 16*ai[++i];
2532:       PREFETCH_NTA(v);
2533:       STORE_PS(&t[idx],XMM7);
2534:     }

2536:     /* Backward solve the upper triangular factor.*/

2538:     idt  = 4*(n-1);
2539:     ai16 = 16*diag[n-1];
2540:     v    = aa + ai16 + 16;
2541:     for (i=n-1; i>=0;){
2542:       PREFETCH_NTA(&v[8]);
2543:       vi = aj + diag[i] + 1;
2544:       nz = ai[i+1] - diag[i] - 1;
2545: 
2546:       LOAD_PS(&t[idt],XMM7);

2548:       while (nz--) {
2549:         PREFETCH_NTA(&v[16]);
2550:         idx = 4*(*vi++);
2551: /*          idx = *vi++; */

2553:         /* 4x4 Matrix-Vector Product with negative accumulation: */
2554:         SSE_INLINE_BEGIN_2(&t[idx],v)
2555:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2557:           /* First Column */
2558:           SSE_COPY_PS(XMM0,XMM6)
2559:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2560:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2561:           SSE_SUB_PS(XMM7,XMM0)

2563:           /* Second Column */
2564:           SSE_COPY_PS(XMM1,XMM6)
2565:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2566:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2567:           SSE_SUB_PS(XMM7,XMM1)

2569:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2570: 
2571:           /* Third Column */
2572:           SSE_COPY_PS(XMM2,XMM6)
2573:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2574:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2575:           SSE_SUB_PS(XMM7,XMM2)

2577:           /* Fourth Column */
2578:           SSE_COPY_PS(XMM3,XMM6)
2579:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2580:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2581:           SSE_SUB_PS(XMM7,XMM3)
2582:         SSE_INLINE_END_2
2583:         v  += 16;
2584:       }
2585:       v    = aa + ai16;
2586:       ai16 = 16*diag[--i];
2587:       PREFETCH_NTA(aa+ai16+16);
2588:       /* 
2589:          Scale the result by the diagonal 4x4 block, 
2590:          which was inverted as part of the factorization
2591:       */
2592:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2593:         /* First Column */
2594:         SSE_COPY_PS(XMM0,XMM7)
2595:         SSE_SHUFFLE(XMM0,XMM0,0x00)
2596:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

2598:         /* Second Column */
2599:         SSE_COPY_PS(XMM1,XMM7)
2600:         SSE_SHUFFLE(XMM1,XMM1,0x55)
2601:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2602:         SSE_ADD_PS(XMM0,XMM1)

2604:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2605: 
2606:         /* Third Column */
2607:         SSE_COPY_PS(XMM2,XMM7)
2608:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2609:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2610:         SSE_ADD_PS(XMM0,XMM2)

2612:         /* Fourth Column */
2613:         SSE_COPY_PS(XMM3,XMM7)
2614:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2615:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2616:         SSE_ADD_PS(XMM0,XMM3)

2618:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2619:       SSE_INLINE_END_3

2621:       v    = aa + ai16 + 16;
2622:       idt -= 4;
2623:     }

2625:     /* Convert t from single precision back to double precision (inplace)*/
2626:     idt = 4*(n-1);
2627:     for (i=n-1;i>=0;i--) {
2628:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2629:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2630:       PetscScalar *xtemp=&x[idt];
2631:       MatScalar   *ttemp=&t[idt];
2632:       xtemp[3] = (PetscScalar)ttemp[3];
2633:       xtemp[2] = (PetscScalar)ttemp[2];
2634:       xtemp[1] = (PetscScalar)ttemp[1];
2635:       xtemp[0] = (PetscScalar)ttemp[0];
2636:       idt -= 4;
2637:     }

2639:   } /* End of artificial scope. */
2640:   VecRestoreArray(bb,&b);
2641:   VecRestoreArray(xx,&x);
2642:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2643:   SSE_SCOPE_END;
2644:   return(0);
2645: }

2647: #endif
2650: int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2651: {
2652:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2653:   IS              iscol=a->col,isrow=a->row;
2654:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2655:   int             *diag = a->diag;
2656:   MatScalar       *aa=a->a,*v;
2657:   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3,*t;

2660:   VecGetArray(bb,&b);
2661:   VecGetArray(xx,&x);
2662:   t  = a->solve_work;

2664:   ISGetIndices(isrow,&rout); r = rout;
2665:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2667:   /* forward solve the lower triangular */
2668:   idx    = 3*(*r++);
2669:   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2670:   for (i=1; i<n; i++) {
2671:     v     = aa + 9*ai[i];
2672:     vi    = aj + ai[i];
2673:     nz    = diag[i] - ai[i];
2674:     idx   = 3*(*r++);
2675:     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2676:     while (nz--) {
2677:       idx   = 3*(*vi++);
2678:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2679:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2680:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2681:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2682:       v += 9;
2683:     }
2684:     idx = 3*i;
2685:     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2686:   }
2687:   /* backward solve the upper triangular */
2688:   for (i=n-1; i>=0; i--){
2689:     v    = aa + 9*diag[i] + 9;
2690:     vi   = aj + diag[i] + 1;
2691:     nz   = ai[i+1] - diag[i] - 1;
2692:     idt  = 3*i;
2693:     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2694:     while (nz--) {
2695:       idx   = 3*(*vi++);
2696:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2697:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2698:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2699:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2700:       v += 9;
2701:     }
2702:     idc = 3*(*c--);
2703:     v   = aa + 9*diag[i];
2704:     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2705:     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2706:     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2707:   }
2708:   ISRestoreIndices(isrow,&rout);
2709:   ISRestoreIndices(iscol,&cout);
2710:   VecRestoreArray(bb,&b);
2711:   VecRestoreArray(xx,&x);
2712:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2713:   return(0);
2714: }

2716: /*
2717:       Special case where the matrix was ILU(0) factored in the natural
2718:    ordering. This eliminates the need for the column and row permutation.
2719: */
2722: int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2723: {
2724:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2725:   int             n=a->mbs,*ai=a->i,*aj=a->j;
2726:   int             ierr,*diag = a->diag;
2727:   MatScalar       *aa=a->a,*v;
2728:   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3;
2729:   int             jdx,idt,idx,nz,*vi,i;

2732:   VecGetArray(bb,&b);
2733:   VecGetArray(xx,&x);


2736:   /* forward solve the lower triangular */
2737:   idx    = 0;
2738:   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2739:   for (i=1; i<n; i++) {
2740:     v     =  aa      + 9*ai[i];
2741:     vi    =  aj      + ai[i];
2742:     nz    =  diag[i] - ai[i];
2743:     idx   +=  3;
2744:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2745:     while (nz--) {
2746:       jdx   = 3*(*vi++);
2747:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2748:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2749:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2750:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2751:       v    += 9;
2752:     }
2753:     x[idx]   = s1;
2754:     x[1+idx] = s2;
2755:     x[2+idx] = s3;
2756:   }
2757:   /* backward solve the upper triangular */
2758:   for (i=n-1; i>=0; i--){
2759:     v    = aa + 9*diag[i] + 9;
2760:     vi   = aj + diag[i] + 1;
2761:     nz   = ai[i+1] - diag[i] - 1;
2762:     idt  = 3*i;
2763:     s1 = x[idt];  s2 = x[1+idt];
2764:     s3 = x[2+idt];
2765:     while (nz--) {
2766:       idx   = 3*(*vi++);
2767:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2768:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2769:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2770:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2771:       v    += 9;
2772:     }
2773:     v        = aa +  9*diag[i];
2774:     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2775:     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2776:     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2777:   }

2779:   VecRestoreArray(bb,&b);
2780:   VecRestoreArray(xx,&x);
2781:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2782:   return(0);
2783: }

2787: int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2788: {
2789:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2790:   IS              iscol=a->col,isrow=a->row;
2791:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2792:   int             *diag = a->diag;
2793:   MatScalar       *aa=a->a,*v;
2794:   PetscScalar     *x,*b,s1,s2,x1,x2,*t;

2797:   VecGetArray(bb,&b);
2798:   VecGetArray(xx,&x);
2799:   t  = a->solve_work;

2801:   ISGetIndices(isrow,&rout); r = rout;
2802:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2804:   /* forward solve the lower triangular */
2805:   idx    = 2*(*r++);
2806:   t[0] = b[idx]; t[1] = b[1+idx];
2807:   for (i=1; i<n; i++) {
2808:     v     = aa + 4*ai[i];
2809:     vi    = aj + ai[i];
2810:     nz    = diag[i] - ai[i];
2811:     idx   = 2*(*r++);
2812:     s1  = b[idx]; s2 = b[1+idx];
2813:     while (nz--) {
2814:       idx   = 2*(*vi++);
2815:       x1    = t[idx]; x2 = t[1+idx];
2816:       s1 -= v[0]*x1 + v[2]*x2;
2817:       s2 -= v[1]*x1 + v[3]*x2;
2818:       v += 4;
2819:     }
2820:     idx = 2*i;
2821:     t[idx] = s1; t[1+idx] = s2;
2822:   }
2823:   /* backward solve the upper triangular */
2824:   for (i=n-1; i>=0; i--){
2825:     v    = aa + 4*diag[i] + 4;
2826:     vi   = aj + diag[i] + 1;
2827:     nz   = ai[i+1] - diag[i] - 1;
2828:     idt  = 2*i;
2829:     s1 = t[idt]; s2 = t[1+idt];
2830:     while (nz--) {
2831:       idx   = 2*(*vi++);
2832:       x1    = t[idx]; x2 = t[1+idx];
2833:       s1 -= v[0]*x1 + v[2]*x2;
2834:       s2 -= v[1]*x1 + v[3]*x2;
2835:       v += 4;
2836:     }
2837:     idc = 2*(*c--);
2838:     v   = aa + 4*diag[i];
2839:     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2840:     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2841:   }
2842:   ISRestoreIndices(isrow,&rout);
2843:   ISRestoreIndices(iscol,&cout);
2844:   VecRestoreArray(bb,&b);
2845:   VecRestoreArray(xx,&x);
2846:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2847:   return(0);
2848: }

2850: /*
2851:       Special case where the matrix was ILU(0) factored in the natural
2852:    ordering. This eliminates the need for the column and row permutation.
2853: */
2856: int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2857: {
2858:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2859:   int             n=a->mbs,*ai=a->i,*aj=a->j;
2860:   int             ierr,*diag = a->diag;
2861:   MatScalar       *aa=a->a,*v;
2862:   PetscScalar     *x,*b,s1,s2,x1,x2;
2863:   int             jdx,idt,idx,nz,*vi,i;

2866:   VecGetArray(bb,&b);
2867:   VecGetArray(xx,&x);

2869:   /* forward solve the lower triangular */
2870:   idx    = 0;
2871:   x[0]   = b[0]; x[1] = b[1];
2872:   for (i=1; i<n; i++) {
2873:     v     =  aa      + 4*ai[i];
2874:     vi    =  aj      + ai[i];
2875:     nz    =  diag[i] - ai[i];
2876:     idx   +=  2;
2877:     s1  =  b[idx];s2 = b[1+idx];
2878:     while (nz--) {
2879:       jdx   = 2*(*vi++);
2880:       x1    = x[jdx];x2 = x[1+jdx];
2881:       s1 -= v[0]*x1 + v[2]*x2;
2882:       s2 -= v[1]*x1 + v[3]*x2;
2883:       v    += 4;
2884:     }
2885:     x[idx]   = s1;
2886:     x[1+idx] = s2;
2887:   }
2888:   /* backward solve the upper triangular */
2889:   for (i=n-1; i>=0; i--){
2890:     v    = aa + 4*diag[i] + 4;
2891:     vi   = aj + diag[i] + 1;
2892:     nz   = ai[i+1] - diag[i] - 1;
2893:     idt  = 2*i;
2894:     s1 = x[idt];  s2 = x[1+idt];
2895:     while (nz--) {
2896:       idx   = 2*(*vi++);
2897:       x1    = x[idx];   x2 = x[1+idx];
2898:       s1 -= v[0]*x1 + v[2]*x2;
2899:       s2 -= v[1]*x1 + v[3]*x2;
2900:       v    += 4;
2901:     }
2902:     v        = aa +  4*diag[i];
2903:     x[idt]   = v[0]*s1 + v[2]*s2;
2904:     x[1+idt] = v[1]*s1 + v[3]*s2;
2905:   }

2907:   VecRestoreArray(bb,&b);
2908:   VecRestoreArray(xx,&x);
2909:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2910:   return(0);
2911: }

2915: int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2916: {
2917:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2918:   IS              iscol=a->col,isrow=a->row;
2919:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2920:   int             *diag = a->diag;
2921:   MatScalar       *aa=a->a,*v;
2922:   PetscScalar     *x,*b,s1,*t;

2925:   if (!n) return(0);

2927:   VecGetArray(bb,&b);
2928:   VecGetArray(xx,&x);
2929:   t  = a->solve_work;

2931:   ISGetIndices(isrow,&rout); r = rout;
2932:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2934:   /* forward solve the lower triangular */
2935:   t[0] = b[*r++];
2936:   for (i=1; i<n; i++) {
2937:     v     = aa + ai[i];
2938:     vi    = aj + ai[i];
2939:     nz    = diag[i] - ai[i];
2940:     s1  = b[*r++];
2941:     while (nz--) {
2942:       s1 -= (*v++)*t[*vi++];
2943:     }
2944:     t[i] = s1;
2945:   }
2946:   /* backward solve the upper triangular */
2947:   for (i=n-1; i>=0; i--){
2948:     v    = aa + diag[i] + 1;
2949:     vi   = aj + diag[i] + 1;
2950:     nz   = ai[i+1] - diag[i] - 1;
2951:     s1 = t[i];
2952:     while (nz--) {
2953:       s1 -= (*v++)*t[*vi++];
2954:     }
2955:     x[*c--] = t[i] = aa[diag[i]]*s1;
2956:   }

2958:   ISRestoreIndices(isrow,&rout);
2959:   ISRestoreIndices(iscol,&cout);
2960:   VecRestoreArray(bb,&b);
2961:   VecRestoreArray(xx,&x);
2962:   PetscLogFlops(2*1*(a->nz) - A->n);
2963:   return(0);
2964: }
2965: /*
2966:       Special case where the matrix was ILU(0) factored in the natural
2967:    ordering. This eliminates the need for the column and row permutation.
2968: */
2971: int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2972: {
2973:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2974:   int             n=a->mbs,*ai=a->i,*aj=a->j;
2975:   int             ierr,*diag = a->diag;
2976:   MatScalar       *aa=a->a;
2977:   PetscScalar     *x,*b;
2978:   PetscScalar     s1,x1;
2979:   MatScalar       *v;
2980:   int             jdx,idt,idx,nz,*vi,i;

2983:   VecGetArray(bb,&b);
2984:   VecGetArray(xx,&x);

2986:   /* forward solve the lower triangular */
2987:   idx    = 0;
2988:   x[0]   = b[0];
2989:   for (i=1; i<n; i++) {
2990:     v     =  aa      + ai[i];
2991:     vi    =  aj      + ai[i];
2992:     nz    =  diag[i] - ai[i];
2993:     idx   +=  1;
2994:     s1  =  b[idx];
2995:     while (nz--) {
2996:       jdx   = *vi++;
2997:       x1    = x[jdx];
2998:       s1 -= v[0]*x1;
2999:       v    += 1;
3000:     }
3001:     x[idx]   = s1;
3002:   }
3003:   /* backward solve the upper triangular */
3004:   for (i=n-1; i>=0; i--){
3005:     v    = aa + diag[i] + 1;
3006:     vi   = aj + diag[i] + 1;
3007:     nz   = ai[i+1] - diag[i] - 1;
3008:     idt  = i;
3009:     s1 = x[idt];
3010:     while (nz--) {
3011:       idx   = *vi++;
3012:       x1    = x[idx];
3013:       s1 -= v[0]*x1;
3014:       v    += 1;
3015:     }
3016:     v        = aa +  diag[i];
3017:     x[idt]   = v[0]*s1;
3018:   }
3019:   VecRestoreArray(bb,&b);
3020:   VecRestoreArray(xx,&x);
3021:   PetscLogFlops(2*(a->nz) - A->n);
3022:   return(0);
3023: }

3025: /* ----------------------------------------------------------------*/
3026: /*
3027:      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3028:    except that the data structure of Mat_SeqAIJ is slightly different.
3029:    Not a good example of code reuse.
3030: */
3031: EXTERN int MatMissingDiagonal_SeqBAIJ(Mat);

3035: int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact)
3036: {
3037:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
3038:   IS          isicol;
3039:   int         *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j;
3040:   int         *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
3041:   int         *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0;
3042:   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
3043:   PetscTruth  col_identity,row_identity;
3044:   PetscReal   f;

3047:   f             = info->fill;
3048:   levels        = (int)info->levels;
3049:   diagonal_fill = (int)info->diagonal_fill;
3050:   ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
3051:   ISIdentity(isrow,&row_identity);
3052:   ISIdentity(iscol,&col_identity);

3054:   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
3055:     MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);
3056:     (*fact)->factor = FACTOR_LU;
3057:     b               = (Mat_SeqBAIJ*)(*fact)->data;
3058:     if (!b->diag) {
3059:       MatMarkDiagonal_SeqBAIJ(*fact);
3060:     }
3061:     MatMissingDiagonal_SeqBAIJ(*fact);
3062:     b->row        = isrow;
3063:     b->col        = iscol;
3064:     PetscObjectReference((PetscObject)isrow);
3065:     PetscObjectReference((PetscObject)iscol);
3066:     b->icol       = isicol;
3067:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3068:     PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);
3069:   } else { /* general case perform the symbolic factorization */
3070:     ISGetIndices(isrow,&r);
3071:     ISGetIndices(isicol,&ic);

3073:     /* get new row pointers */
3074:     PetscMalloc((n+1)*sizeof(int),&ainew);
3075:     ainew[0] = 0;
3076:     /* don't know how many column pointers are needed so estimate */
3077:     jmax = (int)(f*ai[n] + 1);
3078:     PetscMalloc((jmax)*sizeof(int),&ajnew);
3079:     /* ajfill is level of fill for each fill entry */
3080:     PetscMalloc((jmax)*sizeof(int),&ajfill);
3081:     /* fill is a linked list of nonzeros in active row */
3082:     PetscMalloc((n+1)*sizeof(int),&fill);
3083:     /* im is level for each filled value */
3084:     PetscMalloc((n+1)*sizeof(int),&im);
3085:     /* dloc is location of diagonal in factor */
3086:     PetscMalloc((n+1)*sizeof(int),&dloc);
3087:     dloc[0]  = 0;
3088:     for (prow=0; prow<n; prow++) {

3090:       /* copy prow into linked list */
3091:       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
3092:       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
3093:       xi         = aj + ai[r[prow]];
3094:       fill[n]    = n;
3095:       fill[prow] = -1; /* marker for diagonal entry */
3096:       while (nz--) {
3097:         fm  = n;
3098:         idx = ic[*xi++];
3099:         do {
3100:           m  = fm;
3101:           fm = fill[m];
3102:         } while (fm < idx);
3103:         fill[m]   = idx;
3104:         fill[idx] = fm;
3105:         im[idx]   = 0;
3106:       }

3108:       /* make sure diagonal entry is included */
3109:       if (diagonal_fill && fill[prow] == -1) {
3110:         fm = n;
3111:         while (fill[fm] < prow) fm = fill[fm];
3112:         fill[prow] = fill[fm];  /* insert diagonal into linked list */
3113:         fill[fm]   = prow;
3114:         im[prow]   = 0;
3115:         nzf++;
3116:         dcount++;
3117:       }

3119:       nzi = 0;
3120:       row = fill[n];
3121:       while (row < prow) {
3122:         incrlev = im[row] + 1;
3123:         nz      = dloc[row];
3124:         xi      = ajnew  + ainew[row] + nz + 1;
3125:         flev    = ajfill + ainew[row] + nz + 1;
3126:         nnz     = ainew[row+1] - ainew[row] - nz - 1;
3127:         fm      = row;
3128:         while (nnz-- > 0) {
3129:           idx = *xi++;
3130:           if (*flev + incrlev > levels) {
3131:             flev++;
3132:             continue;
3133:           }
3134:           do {
3135:             m  = fm;
3136:             fm = fill[m];
3137:           } while (fm < idx);
3138:           if (fm != idx) {
3139:             im[idx]   = *flev + incrlev;
3140:             fill[m]   = idx;
3141:             fill[idx] = fm;
3142:             fm        = idx;
3143:             nzf++;
3144:           } else {
3145:             if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3146:           }
3147:           flev++;
3148:         }
3149:         row = fill[row];
3150:         nzi++;
3151:       }
3152:       /* copy new filled row into permanent storage */
3153:       ainew[prow+1] = ainew[prow] + nzf;
3154:       if (ainew[prow+1] > jmax) {

3156:         /* estimate how much additional space we will need */
3157:         /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3158:         /* just double the memory each time */
3159:         int maxadd = jmax;
3160:         /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3161:         if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3162:         jmax += maxadd;

3164:         /* allocate a longer ajnew and ajfill */
3165:         PetscMalloc(jmax*sizeof(int),&xi);
3166:         PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));
3167:         PetscFree(ajnew);
3168:         ajnew = xi;
3169:         PetscMalloc(jmax*sizeof(int),&xi);
3170:         PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));
3171:         PetscFree(ajfill);
3172:         ajfill = xi;
3173:         reallocate++; /* count how many reallocations are needed */
3174:       }
3175:       xi          = ajnew + ainew[prow];
3176:       flev        = ajfill + ainew[prow];
3177:       dloc[prow]  = nzi;
3178:       fm          = fill[n];
3179:       while (nzf--) {
3180:         *xi++   = fm;
3181:         *flev++ = im[fm];
3182:         fm      = fill[fm];
3183:       }
3184:       /* make sure row has diagonal entry */
3185:       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3186:         SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\
3187:     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
3188:       }
3189:     }
3190:     PetscFree(ajfill);
3191:     ISRestoreIndices(isrow,&r);
3192:     ISRestoreIndices(isicol,&ic);
3193:     PetscFree(fill);
3194:     PetscFree(im);

3196:     {
3197:       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3198:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",reallocate,f,af);
3199:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
3200:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
3201:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
3202:       if (diagonal_fill) {
3203:         PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount);
3204:       }
3205:     }

3207:     /* put together the new matrix */
3208:     MatCreate(A->comm,bs*n,bs*n,bs*n,bs*n,fact);
3209:     MatSetType(*fact,A->type_name);
3210:     MatSeqBAIJSetPreallocation(*fact,bs,0,PETSC_NULL);
3211:     PetscLogObjectParent(*fact,isicol);
3212:     b = (Mat_SeqBAIJ*)(*fact)->data;
3213:     PetscFree(b->imax);
3214:     b->singlemalloc = PETSC_FALSE;
3215:     len = bs2*ainew[n]*sizeof(MatScalar);
3216:     /* the next line frees the default space generated by the Create() */
3217:     PetscFree(b->a);
3218:     PetscFree(b->ilen);
3219:     PetscMalloc(len,&b->a);
3220:     b->j          = ajnew;
3221:     b->i          = ainew;
3222:     for (i=0; i<n; i++) dloc[i] += ainew[i];
3223:     b->diag       = dloc;
3224:     b->ilen       = 0;
3225:     b->imax       = 0;
3226:     b->row        = isrow;
3227:     b->col        = iscol;
3228:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3229:     PetscObjectReference((PetscObject)isrow);
3230:     PetscObjectReference((PetscObject)iscol);
3231:     b->icol       = isicol;
3232:     PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
3233:     /* In b structure:  Free imax, ilen, old a, old j.  
3234:        Allocate dloc, solve_work, new a, new j */
3235:     PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar));
3236:     b->maxnz          = b->nz = ainew[n];
3237:     (*fact)->factor   = FACTOR_LU;

3239:     (*fact)->info.factor_mallocs    = reallocate;
3240:     (*fact)->info.fill_ratio_given  = f;
3241:     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3242:   }

3244:   if (row_identity && col_identity) {
3245:     MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);
3246:   }
3247:   return(0);
3248: }

3252: int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3253: {
3254:   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3255:   /* int i,*AJ=a->j,nz=a->nz; */
3257:   /* Undo Column scaling */
3258: /*    while (nz--) { */
3259: /*      AJ[i] = AJ[i]/4; */
3260: /*    } */
3261:   /* This should really invoke a push/pop logic, but we don't have that yet. */
3262:   A->ops->setunfactored = PETSC_NULL;
3263:   return(0);
3264: }

3268: int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3269: {
3270:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3271:   int *AJ=a->j,nz=a->nz;
3272:   unsigned short *aj=(unsigned short *)AJ;
3274:   /* Is this really necessary? */
3275:   while (nz--) {
3276:     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3277:   }
3278:   A->ops->setunfactored = PETSC_NULL;
3279:   return(0);
3280: }

3284: int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
3285: {
3286:   /*
3287:       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 
3288:       with natural ordering
3289:   */
3290:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;

3293:   inA->ops->solve             = MatSolve_SeqBAIJ_Update;
3294:   inA->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_Update;
3295:   switch (a->bs) {
3296:   case 1:
3297:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
3298:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n");
3299:     break;
3300:   case 2:
3301:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
3302:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n");
3303:     break;
3304:   case 3:
3305:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
3306:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n");
3307:     break;
3308:   case 4:
3309: #if defined(PETSC_USE_MAT_SINGLE)
3310:     {
3311:       PetscTruth  sse_enabled_local;
3312:       int         ierr;
3313:       PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);
3314:       if (sse_enabled_local) {
3315: #  if defined(PETSC_HAVE_SSE)
3316:         int i,*AJ=a->j,nz=a->nz,n=a->mbs;
3317:         if (n==(unsigned short)n) {
3318:           unsigned short *aj=(unsigned short *)AJ;
3319:           for (i=0;i<nz;i++) {
3320:             aj[i] = (unsigned short)AJ[i];
3321:           }
3322:           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3323:           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3324:           PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n");
3325:         } else {
3326:         /* Scale the column indices for easier indexing in MatSolve. */
3327: /*            for (i=0;i<nz;i++) { */
3328: /*              AJ[i] = AJ[i]*4; */
3329: /*            } */
3330:           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE;
3331:           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
3332:           PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n");
3333:         }
3334: #  else
3335:       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3336:         SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3337: #  endif
3338:       } else {
3339:         inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3340:         PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3341:       }
3342:     }
3343: #else
3344:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3345:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3346: #endif
3347:     break;
3348:   case 5:
3349:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
3350:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n");
3351:     break;
3352:   case 6:
3353:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
3354:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n");
3355:     break;
3356:   case 7:
3357:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
3358:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n");
3359:     break;
3360:   }
3361:   return(0);
3362: }

3366: int MatSeqBAIJ_UpdateSolvers(Mat A)
3367: {
3368:   /*
3369:       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 
3370:       with natural ordering
3371:   */
3372:   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
3373:   IS          row = a->row, col = a->col;
3374:   PetscTruth  row_identity, col_identity;
3375:   PetscTruth  use_natural;
3376:   int         ierr;


3380:   use_natural = PETSC_FALSE;
3381:   if (row && col) {
3382:     ISIdentity(row,&row_identity);
3383:     ISIdentity(col,&col_identity);

3385:     if (row_identity && col_identity) {
3386:       use_natural = PETSC_TRUE;
3387:     }
3388:   } else {
3389:     use_natural = PETSC_TRUE;
3390:   }

3392:   switch (a->bs) {
3393:   case 1:
3394:     if (use_natural) {
3395:       A->ops->solve           = MatSolve_SeqBAIJ_1_NaturalOrdering;
3396:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
3397:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n");
3398:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3399:     } else {
3400:       A->ops->solve           = MatSolve_SeqBAIJ_1;
3401:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1;
3402:     }
3403:     break;
3404:   case 2:
3405:     if (use_natural) {
3406:       A->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
3407:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
3408:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n");
3409:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3410:     } else {
3411:       A->ops->solve           = MatSolve_SeqBAIJ_2;
3412:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2;
3413:     }
3414:     break;
3415:   case 3:
3416:     if (use_natural) {
3417:       A->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
3418:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
3419:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n");
3420:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3421:     } else {
3422:       A->ops->solve           = MatSolve_SeqBAIJ_3;
3423:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3;
3424:     }
3425:     break;
3426:   case 4:
3427:     {
3428:       PetscTruth sse_enabled_local;
3429:       PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);
3430:       if (use_natural) {
3431: #if defined(PETSC_USE_MAT_SINGLE)
3432:         if (sse_enabled_local) { /* Natural + Single + SSE */
3433: #  if defined(PETSC_HAVE_SSE)
3434:           int n=a->mbs;
3435:           if (n==(unsigned short)n) {
3436:             A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj;
3437:             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n");
3438:           } else {
3439:             A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
3440:             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n");
3441:           }
3442: #  else
3443:           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3444:           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3445: #  endif
3446:         } else { /* Natural + Single */
3447:           A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion;
3448:           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n");
3449:         }
3450: #else
3451:         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
3452:         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3453: #endif
3454:         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3455:         PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3456:       } else { /* Arbitrary ordering */
3457: #if defined(PETSC_USE_MAT_SINGLE)
3458:         if (sse_enabled_local) { /* Arbitrary + Single + SSE */
3459: #  if defined(PETSC_HAVE_SSE)
3460:           A->ops->solve         = MatSolve_SeqBAIJ_4_SSE_Demotion;
3461:           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n");
3462: #  else
3463:           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3464:           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3465: #  endif
3466:         } else { /* Arbitrary + Single */
3467:           A->ops->solve         = MatSolve_SeqBAIJ_4_Demotion;
3468:           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n");
3469:         }
3470: #else
3471:         A->ops->solve           = MatSolve_SeqBAIJ_4;
3472: #endif
3473:         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4;
3474:       }
3475:     }
3476:     break;
3477:   case 5:
3478:     if (use_natural) {
3479:       A->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
3480:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3481:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3482:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3483:     } else {
3484:       A->ops->solve           = MatSolve_SeqBAIJ_5;
3485:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5;
3486:     }
3487:     break;
3488:   case 6:
3489:     if (use_natural) {
3490:       A->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
3491:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3492:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3493:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3494:     } else {
3495:       A->ops->solve           = MatSolve_SeqBAIJ_6;
3496:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6;
3497:     }
3498:     break;
3499:   case 7:
3500:     if (use_natural) {
3501:       A->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
3502:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3503:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3504:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3505:     } else {
3506:       A->ops->solve           = MatSolve_SeqBAIJ_7;
3507:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7;
3508:     }
3509:     break;
3510:   default:
3511:     A->ops->solve             = MatSolve_SeqBAIJ_N;
3512:     break;
3513:   }
3514:   return(0);
3515: }

3519: int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) {

3523:   MatSeqBAIJ_UpdateSolvers(A);
3524:   if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3525:     (*A->ops->solve)(A,x,y);
3526:   } else {
3527:     SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3528:   }
3529:   return(0);
3530: }

3534: int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) {

3538:   MatSeqBAIJ_UpdateSolvers(A);
3539:   (*A->ops->solvetranspose)(A,x,y);
3540:   return(0);
3541: }