Actual source code: ilu.h
1: /* $Id: ilu.h,v 1.29 2001/08/07 03:01:46 balay Exp $ */
2: /*
3: Kernels used in sparse ILU (and LU) and in the resulting triangular
4: solves. These are for block algorithms where the block sizes are on
5: the order of 2-6+.
7: There are TWO versions of the macros below.
8: 1) standard for MatScalar == PetscScalar use the standard BLAS for
9: block size larger than 7 and
10: 2) handcoded Fortran single precision for the matrices, since BLAS
11: does not have some arguments in single and some in double.
13: */
17: #include petscblaslapack.h
19: /*
20: These are C kernels,they are contained in
21: src/mat/impls/baij/seq
22: */
24: EXTERN int LINPACKdgefa(MatScalar *,int,int *);
25: EXTERN int LINPACKdgedi(MatScalar *,int,int *,MatScalar*);
26: EXTERN int Kernel_A_gets_inverse_A_2(MatScalar *);
27: EXTERN int Kernel_A_gets_inverse_A_3(MatScalar *);
29: #define Kernel_A_gets_inverse_A_4_nopivot(mat) 0;\
30: {\
31: MatScalar d, di;\
32: \
33: di = mat[0];\
34: mat[0] = d = 1.0 / di;\
35: mat[4] *= -d;\
36: mat[8] *= -d;\
37: mat[12] *= -d;\
38: mat[1] *= d;\
39: mat[2] *= d;\
40: mat[3] *= d;\
41: mat[5] += mat[4] * mat[1] * di;\
42: mat[6] += mat[4] * mat[2] * di;\
43: mat[7] += mat[4] * mat[3] * di;\
44: mat[9] += mat[8] * mat[1] * di;\
45: mat[10] += mat[8] * mat[2] * di;\
46: mat[11] += mat[8] * mat[3] * di;\
47: mat[13] += mat[12] * mat[1] * di;\
48: mat[14] += mat[12] * mat[2] * di;\
49: mat[15] += mat[12] * mat[3] * di;\
50: di = mat[5];\
51: mat[5] = d = 1.0 / di;\
52: mat[1] *= -d;\
53: mat[9] *= -d;\
54: mat[13] *= -d;\
55: mat[4] *= d;\
56: mat[6] *= d;\
57: mat[7] *= d;\
58: mat[0] += mat[1] * mat[4] * di;\
59: mat[2] += mat[1] * mat[6] * di;\
60: mat[3] += mat[1] * mat[7] * di;\
61: mat[8] += mat[9] * mat[4] * di;\
62: mat[10] += mat[9] * mat[6] * di;\
63: mat[11] += mat[9] * mat[7] * di;\
64: mat[12] += mat[13] * mat[4] * di;\
65: mat[14] += mat[13] * mat[6] * di;\
66: mat[15] += mat[13] * mat[7] * di;\
67: di = mat[10];\
68: mat[10] = d = 1.0 / di;\
69: mat[2] *= -d;\
70: mat[6] *= -d;\
71: mat[14] *= -d;\
72: mat[8] *= d;\
73: mat[9] *= d;\
74: mat[11] *= d;\
75: mat[0] += mat[2] * mat[8] * di;\
76: mat[1] += mat[2] * mat[9] * di;\
77: mat[3] += mat[2] * mat[11] * di;\
78: mat[4] += mat[6] * mat[8] * di;\
79: mat[5] += mat[6] * mat[9] * di;\
80: mat[7] += mat[6] * mat[11] * di;\
81: mat[12] += mat[14] * mat[8] * di;\
82: mat[13] += mat[14] * mat[9] * di;\
83: mat[15] += mat[14] * mat[11] * di;\
84: di = mat[15];\
85: mat[15] = d = 1.0 / di;\
86: mat[3] *= -d;\
87: mat[7] *= -d;\
88: mat[11] *= -d;\
89: mat[12] *= d;\
90: mat[13] *= d;\
91: mat[14] *= d;\
92: mat[0] += mat[3] * mat[12] * di;\
93: mat[1] += mat[3] * mat[13] * di;\
94: mat[2] += mat[3] * mat[14] * di;\
95: mat[4] += mat[7] * mat[12] * di;\
96: mat[5] += mat[7] * mat[13] * di;\
97: mat[6] += mat[7] * mat[14] * di;\
98: mat[8] += mat[11] * mat[12] * di;\
99: mat[9] += mat[11] * mat[13] * di;\
100: mat[10] += mat[11] * mat[14] * di;\
101: }
103: EXTERN int Kernel_A_gets_inverse_A_4(MatScalar *);
104: # if defined(PETSC_HAVE_SSE)
105: EXTERN int Kernel_A_gets_inverse_A_4_SSE(MatScalar *);
106: # endif
108: EXTERN int Kernel_A_gets_inverse_A_5(MatScalar *);
109: EXTERN int Kernel_A_gets_inverse_A_6(MatScalar *);
110: EXTERN int Kernel_A_gets_inverse_A_7(MatScalar *);
113: /*
114: A = inv(A) A_gets_inverse_A
116: A - square bs by bs array stored in column major order
117: pivots - integer work array of length bs
118: W - bs by 1 work array
119: */
120: #define Kernel_A_gets_inverse_A(bs,A,pivots,W)\
121: { \
122: LINPACKdgefa((A),(bs),(pivots)); \
123: LINPACKdgedi((A),(bs),(pivots),(W)); \
124: }
126: /* -----------------------------------------------------------------------*/
128: #if !defined(PETSC_USE_MAT_SINGLE)
129: /*
130: Version that calls the BLAS directly
131: */
132: /*
133: A = A * B A_gets_A_times_B
135: A, B - square bs by bs arrays stored in column major order
136: W - square bs by bs work array
138: */
139: #define Kernel_A_gets_A_times_B(bs,A,B,W) \
140: { \
141: PetscScalar _one = 1.0,_zero = 0.0; \
142: int _ierr; \
143: _PetscMemcpy((W),(A),(bs)*(bs)*sizeof(MatScalar));CHKERRQ(_ierr); \
144: BLgemm_("N","N",&(bs),&(bs),&(bs),&_one,(W),&(bs),(B),&(bs),&_zero,(A),&(bs));\
145: }
147: /*
149: A = A - B * C A_gets_A_minus_B_times_C
151: A, B, C - square bs by bs arrays stored in column major order
152: */
153: #define Kernel_A_gets_A_minus_B_times_C(bs,A,B,C) \
154: { \
155: PetscScalar _mone = -1.0,_one = 1.0; \
156: BLgemm_("N","N",&(bs),&(bs),&(bs),&_mone,(B),&(bs),(C),&(bs),&_one,(A),&(bs));\
157: }
159: /*
161: A = A + B^T * C A_gets_A_plus_Btranspose_times_C
163: A, B, C - square bs by bs arrays stored in column major order
164: */
165: #define Kernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C) \
166: { \
167: PetscScalar _one = 1.0; \
168: BLgemm_("T","N",&(bs),&(bs),&(bs),&_one,(B),&(bs),(C),&(bs),&_one,(A),&(bs));\
169: }
171: /*
172: v = v + A^T w v_gets_v_plus_Atranspose_times_w
174: v - array of length bs
175: A - square bs by bs array
176: w - array of length bs
177: */
178: #define Kernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w) \
179: { \
180: PetscScalar _one = 1.0; \
181: int _ione = 1; \
182: LAgemv_("T",&(bs),&(bs),&_one,A,&(bs),w,&_ione,&_one,v,&_ione); \
183: }
185: /*
186: v = v - A w v_gets_v_minus_A_times_w
188: v - array of length bs
189: A - square bs by bs array
190: w - array of length bs
191: */
192: #define Kernel_v_gets_v_minus_A_times_w(bs,v,A,w) \
193: { \
194: PetscScalar _mone = -1.0,_one = 1.0; \
195: int _ione = 1; \
196: LAgemv_("N",&(bs),&(bs),&_mone,A,&(bs),w,&_ione,&_one,v,&_ione); \
197: }
199: /*
200: v = v + A w v_gets_v_plus_A_times_w
202: v - array of length bs
203: A - square bs by bs array
204: w - array of length bs
205: */
206: #define Kernel_v_gets_v_plus_A_times_w(bs,v,A,w) \
207: { \
208: PetscScalar _one = 1.0; \
209: int _ione = 1; \
210: LAgemv_("N",&(bs),&(bs),&_one,A,&(bs),w,&_ione,&_one,v,&_ione); \
211: }
213: /*
214: v = v + A w v_gets_v_plus_Ar_times_w
216: v - array of length bs
217: A - square bs by bs array
218: w - array of length bs
219: */
220: #define Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,v,A,w) \
221: { \
222: PetscScalar _one = 1.0; \
223: int _ione = 1; \
224: LAgemv_("N",&(bs),&(ncols),&_one,A,&(bs),v,&_ione,&_one,w,&_ione); \
225: }
227: /*
228: w = A v w_gets_A_times_v
230: v - array of length bs
231: A - square bs by bs array
232: w - array of length bs
233: */
234: #define Kernel_w_gets_A_times_v(bs,v,A,w) \
235: { \
236: PetscScalar _zero = 0.0,_one = 1.0; \
237: int _ione = 1; \
238: LAgemv_("N",&(bs),&(bs),&_one,A,&(bs),v,&_ione,&_zero,w,&_ione); \
239: }
241: /*
242: z = A*x
243: */
244: #define Kernel_w_gets_Ar_times_v(bs,ncols,x,A,z) \
245: { \
246: PetscScalar _one = 1.0,_zero = 0.0; \
247: int _ione = 1; \
248: LAgemv_("N",&bs,&ncols,&_one,A,&bs,x,&_ione,&_zero,z,&_ione); \
249: }
251: /*
252: z = trans(A)*x
253: */
254: #define Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) \
255: { \
256: PetscScalar _one = 1.0; \
257: int _ione = 1; \
258: LAgemv_("T",&bs,&ncols,&_one,A,&bs,x,&_ione,&_one,z,&_ione); \
259: }
261: #else
262: /*
263: Version that calls Fortran routines; can handle different precision
264: of matrix (array) and vectors
265: */
266: /*
267: These are Fortran kernels: They replace certain BLAS routines but
268: have some arguments that may be single precision,rather than double
269: These routines are provided in src/fortran/kernels/sgemv.F
270: They are pretty pitiful but get the job done. The intention is
271: that for important block sizes (currently 1,2,3,4,5,6,7) custom inlined
272: code is used.
273: */
274: #ifdef PETSC_HAVE_FORTRAN_CAPS
275: #define msgemv_ MSGEMV
276: #define msgemvp_ MSGEMVP
277: #define msgemvm_ MSGEMVM
278: #define msgemvt_ MSGEMVT
279: #define msgemmi_ MSGEMMI
280: #define msgemm_ MSGEMM
281: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
282: #define msgemv_ msgemv
283: #define msgemvp_ msgemvp
284: #define msgemvm_ msgemvm
285: #define msgemvt_ msgemvt
286: #define msgemmi_ msgemmi
287: #define msgemm_ msgemm
288: #endif
289: EXTERN_C_BEGIN
290: EXTERN void msgemv_(int *,int *,MatScalar*,PetscScalar*,PetscScalar*);
291: EXTERN void msgemvp_(int *,int *,MatScalar*,PetscScalar*,PetscScalar*);
292: EXTERN void msgemvm_(int *,int *,MatScalar*,PetscScalar*,PetscScalar*);
293: EXTERN void msgemvt_(int *,int *,MatScalar*,PetscScalar*,PetscScalar*);
294: EXTERN void msgemmi_(int *,MatScalar*,MatScalar*,MatScalar*);
295: EXTERN void msgemm_(int *,MatScalar*,MatScalar*,MatScalar*);
296: EXTERN_C_END
298: /*
299: A = A * B A_gets_A_times_B
301: A, B - square bs by bs arrays stored in column major order
302: W - square bs by bs work array
304: */
305: #define Kernel_A_gets_A_times_B(bs,A,B,W) \
306: { \
307: int _ierr; \
308: _PetscMemcpy((W),(A),(bs)*(bs)*sizeof(MatScalar));CHKERRQ(_ierr); \
309: msgemmi_(&bs,A,B,W); \
310: }
312: /*
314: A = A - B * C A_gets_A_minus_B_times_C
316: A, B, C - square bs by bs arrays stored in column major order
317: */
318: #define Kernel_A_gets_A_minus_B_times_C(bs,A,B,C) \
319: { \
320: msgemm_(&bs,A,B,C); \
321: }
323: /*
324: v = v - A w v_gets_v_minus_A_times_w
326: v - array of length bs
327: A - square bs by bs array
328: w - array of length bs
329: */
330: #define Kernel_v_gets_v_minus_A_times_w(bs,v,A,w) \
331: { \
332: msgemvm_(&bs,&bs,A,w,v); \
333: }
335: /*
336: v = v + A w v_gets_v_plus_A_times_w
338: v - array of length bs
339: A - square bs by bs array
340: w - array of length bs
341: */
342: #define Kernel_v_gets_v_plus_A_times_w(bs,v,A,w) \
343: { \
344: msgemvp_(&bs,&bs,A,w,v);\
345: }
347: /*
348: v = v + A w v_gets_v_plus_Ar_times_w
350: v - array of length bs
351: A - square bs by bs array
352: w - array of length bs
353: */
354: #define Kernel_w_gets_w_plus_Ar_times_v(bs,ncol,v,A,w) \
355: { \
356: msgemvp_(&bs,&ncol,A,v,w);\
357: }
359: /*
360: w = A v w_gets_A_times_v
362: v - array of length bs
363: A - square bs by bs array
364: w - array of length bs
365: */
366: #define Kernel_w_gets_A_times_v(bs,v,A,w) \
367: { \
368: msgemv_(&bs,&bs,A,v,w);\
369: }
370:
371: /*
372: z = A*x
373: */
374: #define Kernel_w_gets_Ar_times_v(bs,ncols,x,A,z) \
375: { \
376: msgemv_(&bs,&ncols,A,x,z);\
377: }
379: /*
380: z = trans(A)*x
381: */
382: #define Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) \
383: { \
384: msgemvt_(&bs,&ncols,A,x,z);\
385: }
387: /* These do not work yet */
388: #define Kernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C)
389: #define Kernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w)
392: #endif
394: #endif