Actual source code: axpy.h
1: /* $Id: axpy.h,v 1.19 2001/08/07 03:01:46 balay Exp $ */
/*
   These are macros for daxpy-like operations.  The format is
       APXY(U,Alpha,P,n)
   for
       U += Alpha * P

   In addition, versions that process 2, 3, and 4 vectors are provided;
   these can give significantly better use of memory resources than
   successive calls to the regular daxpy.
*/
14: #ifndef APXY
#include "petscblaslapack.h"
18: #if defined(PETSC_HAVE_FORTRAN_CAPS)
19: #define fortrancopy_ FORTRANCOPY
20: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
21: #define fortrancopy_ fortrancopy
22: #endif
23: EXTERN_C_BEGIN
24: extern void fortrancopy_(int*,PetscScalar*,PetscScalar*);
25: EXTERN_C_END
27: #if defined(PETSC_HAVE_FORTRAN_CAPS)
28: #define fortranzero_ FORTRANZERO
29: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
30: #define fortranzero_ fortranzero
31: #endif
32: EXTERN_C_BEGIN
33: extern void fortranzero_(int*,PetscScalar*);
34: EXTERN_C_END
/*
   fortranaypx_ - declaration of an external Fortran kernel; only declared
   when PETSC_USE_FORTRAN_KERNEL_AYPX is defined.

   The inner #define lines map the C identifier onto the symbol name used by
   the Fortran compiler: upper case when PETSC_HAVE_FORTRAN_CAPS is defined,
   and without the trailing underscore when PETSC_HAVE_FORTRAN_UNDERSCORE is
   not defined.

   NOTE(review): the argument meaning (presumably length, scalar, x, y) is not
   visible in this header - confirm against the Fortran implementation.
*/
#if defined(PETSC_USE_FORTRAN_KERNEL_AYPX)
#if defined(PETSC_HAVE_FORTRAN_CAPS)
#define fortranaypx_ FORTRANAYPX
#elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
#define fortranaypx_ fortranaypx
#endif
EXTERN_C_BEGIN
extern void fortranaypx_(int*,const PetscScalar*,PetscScalar*,PetscScalar*);
EXTERN_C_END
#endif
/*
   fortranwaxpy_ - declaration of an external Fortran kernel; only declared
   when PETSC_USE_FORTRAN_KERNEL_WAXPY is defined.

   The inner #define lines map the C identifier onto the symbol name used by
   the Fortran compiler: upper case when PETSC_HAVE_FORTRAN_CAPS is defined,
   and without the trailing underscore when PETSC_HAVE_FORTRAN_UNDERSCORE is
   not defined.

   NOTE(review): the argument meaning (presumably length, scalar, x, y, w) is
   not visible in this header - confirm against the Fortran implementation.
*/
#if defined(PETSC_USE_FORTRAN_KERNEL_WAXPY)
#if defined(PETSC_HAVE_FORTRAN_CAPS)
#define fortranwaxpy_ FORTRANWAXPY
#elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
#define fortranwaxpy_ fortranwaxpy
#endif
EXTERN_C_BEGIN
extern void fortranwaxpy_(int*,const PetscScalar*,PetscScalar*,PetscScalar*,PetscScalar*);
EXTERN_C_END
#endif
59: #if defined(PETSC_USE_FORTRAN_KERNEL_MAXPY)
61: #if defined(PETSC_HAVE_FORTRAN_CAPS)
62: #define fortranmaxpy4_ FORTRANMAXPY4
63: #define fortranmaxpy3_ FORTRANMAXPY3
64: #define fortranmaxpy2_ FORTRANMAXPY2
65: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
66: #define fortranmaxpy4_ fortranmaxpy4
67: #define fortranmaxpy3_ fortranmaxpy3
68: #define fortranmaxpy2_ fortranmaxpy2
69: #endif
71: EXTERN_C_BEGIN
72: EXTERN void fortranmaxpy4_(void *,void *,void *,void *,void *,void *,void *,void *,void *,int *);
73: EXTERN void fortranmaxpy3_(void *,void *,void *,void *,void *,void *,void *,int *);
74: EXTERN void fortranmaxpy2_(void *,void *,void *,void *,void *,int *);
75: EXTERN_C_END
/*
   Fortran-kernel variants of APXY (U += a1*p1 [+ a2*p2 ...]).

   APXY forwards to BLaxpy_ with both strides set to 1; APXY2/3/4 forward to
   fortranmaxpy2_/3_/4_.  The scalars a1..a4 and the length n are passed by
   address, so each of them must be an lvalue.
*/
#define APXY(U,a1,p1,n) { \
  int apxy_one_ = 1; \
  BLaxpy_(&(n),&(a1),(p1),&apxy_one_,(U),&apxy_one_); \
}
#define APXY2(U,a1,a2,p1,p2,n) { \
  fortranmaxpy2_((U),&(a1),&(a2),(p1),(p2),&(n)); \
}
#define APXY3(U,a1,a2,a3,p1,p2,p3,n) { \
  fortranmaxpy3_((U),&(a1),&(a2),&(a3),(p1),(p2),(p3),&(n)); \
}
#define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) { \
  fortranmaxpy4_((U),&(a1),&(a2),&(a3),&(a4),(p1),(p2),(p3),(p4),&(n)); \
}
86: #elif defined(PETSC_USE_UNROLL_KERNELS)
/*
   Unrolled-by-four kernels:  U[i] += a1*p1[i] (+ a2*p2[i] ...),  i = 0..n-1.

   The switch peels off the n%4 leading entries (the cases fall through on
   purpose); the while loop then handles the rest four entries at a time.
   Subtracting 4 from n after a non-empty peel leaves n = 4*k + (n%4) - 4,
   which stays positive for exactly k passes of the loop.

   Side effects on the arguments: U, the p vectors and n are modified in
   place (for n >= 0 the pointers end one past the last processed entry), so
   each must be a modifiable lvalue.  Scalar arguments are parenthesized so
   that expressions such as a+b scale the vector as a whole; they are
   evaluated once per processed entry.
*/
#define APXY(U,Alpha,P,n) { \
  switch ((n) & 0x3) { \
  case 3: *(U)++ += (Alpha) * *(P)++; /* fall through */ \
  case 2: *(U)++ += (Alpha) * *(P)++; /* fall through */ \
  case 1: *(U)++ += (Alpha) * *(P)++; \
          (n) -= 4;                   /* fall through */ \
  case 0: break; \
  } \
  while ((n) > 0) { \
    (U)[0] += (Alpha) * (P)[0]; \
    (U)[1] += (Alpha) * (P)[1]; \
    (U)[2] += (Alpha) * (P)[2]; \
    (U)[3] += (Alpha) * (P)[3]; \
    (U) += 4; (P) += 4; (n) -= 4; \
  } \
}
#define APXY2(U,a1,a2,p1,p2,n) { \
  switch ((n) & 0x3) { \
  case 3: *(U)++ += (a1) * *(p1)++ + (a2) * *(p2)++; /* fall through */ \
  case 2: *(U)++ += (a1) * *(p1)++ + (a2) * *(p2)++; /* fall through */ \
  case 1: *(U)++ += (a1) * *(p1)++ + (a2) * *(p2)++; \
          (n) -= 4;                                  /* fall through */ \
  case 0: break; \
  } \
  while ((n) > 0) { \
    (U)[0] += (a1)*(p1)[0] + (a2)*(p2)[0]; \
    (U)[1] += (a1)*(p1)[1] + (a2)*(p2)[1]; \
    (U)[2] += (a1)*(p1)[2] + (a2)*(p2)[2]; \
    (U)[3] += (a1)*(p1)[3] + (a2)*(p2)[3]; \
    (U) += 4; (p1) += 4; (p2) += 4; (n) -= 4; \
  } \
}
#define APXY3(U,a1,a2,a3,p1,p2,p3,n) { \
  switch ((n) & 0x3) { \
  case 3: *(U)++ += (a1) * *(p1)++ + (a2) * *(p2)++ + (a3) * *(p3)++; /* fall through */ \
  case 2: *(U)++ += (a1) * *(p1)++ + (a2) * *(p2)++ + (a3) * *(p3)++; /* fall through */ \
  case 1: *(U)++ += (a1) * *(p1)++ + (a2) * *(p2)++ + (a3) * *(p3)++; \
          (n) -= 4;                                                   /* fall through */ \
  case 0: break; \
  } \
  while ((n) > 0) { \
    (U)[0] += (a1)*(p1)[0] + (a2)*(p2)[0] + (a3)*(p3)[0]; \
    (U)[1] += (a1)*(p1)[1] + (a2)*(p2)[1] + (a3)*(p3)[1]; \
    (U)[2] += (a1)*(p1)[2] + (a2)*(p2)[2] + (a3)*(p3)[2]; \
    (U)[3] += (a1)*(p1)[3] + (a2)*(p2)[3] + (a3)*(p3)[3]; \
    (U) += 4; (p1) += 4; (p2) += 4; (p3) += 4; (n) -= 4; \
  } \
}
#define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) { \
  switch ((n) & 0x3) { \
  case 3: *(U)++ += (a1) * *(p1)++ + (a2) * *(p2)++ + (a3) * *(p3)++ + (a4) * *(p4)++; /* fall through */ \
  case 2: *(U)++ += (a1) * *(p1)++ + (a2) * *(p2)++ + (a3) * *(p3)++ + (a4) * *(p4)++; /* fall through */ \
  case 1: *(U)++ += (a1) * *(p1)++ + (a2) * *(p2)++ + (a3) * *(p3)++ + (a4) * *(p4)++; \
          (n) -= 4;                                                                    /* fall through */ \
  case 0: break; \
  } \
  while ((n) > 0) { \
    (U)[0] += (a1)*(p1)[0] + (a2)*(p2)[0] + (a3)*(p3)[0] + (a4)*(p4)[0]; \
    (U)[1] += (a1)*(p1)[1] + (a2)*(p2)[1] + (a3)*(p3)[1] + (a4)*(p4)[1]; \
    (U)[2] += (a1)*(p1)[2] + (a2)*(p2)[2] + (a3)*(p3)[2] + (a4)*(p4)[2]; \
    (U)[3] += (a1)*(p1)[3] + (a2)*(p2)[3] + (a3)*(p3)[3] + (a4)*(p4)[3]; \
    (U) += 4; (p1) += 4; (p2) += 4; (p3) += 4; (p4) += 4; (n) -= 4; \
  } \
}
123: #elif defined(PETSC_USE_WHILE_KERNELS)
/*
   While-loop kernels:  U[i] += a1*p1[i] (+ a2*p2[i] ...),  i = 0..n-1.

   Side effects on the arguments: n is counted down (it is left at -1 when it
   started non-negative) and U and the p vectors are advanced past the
   processed entries, so each must be a modifiable lvalue.  Scalar arguments
   are parenthesized so that expressions such as a+b scale the vector as a
   whole; they are evaluated once per entry.
*/
#define APXY(U,a1,p1,n) { \
  while ((n)--) *(U)++ += (a1) * *(p1)++; \
}
#define APXY2(U,a1,a2,p1,p2,n) { \
  while ((n)--) *(U)++ += (a1) * *(p1)++ + (a2) * *(p2)++; \
}
#define APXY3(U,a1,a2,a3,p1,p2,p3,n) { \
  while ((n)--) *(U)++ += (a1) * *(p1)++ + (a2) * *(p2)++ + (a3) * *(p3)++; \
}
#define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) { \
  while ((n)--) *(U)++ += (a1) * *(p1)++ + (a2) * *(p2)++ + (a3) * *(p3)++ + (a4) * *(p4)++; \
}
134: #elif defined(PETSC_USE_BLAS_KERNELS)
/*
   BLAS-backed kernels for U += a1*p1 (+ a2*p2 ...).

   APXY  calls BLaxpy_ (the same wrapper name the Fortran-kernel variant of
         this file uses) with both strides set to 1.  a1 and n are passed by
         address and must be lvalues.
   APXY2 calls LAgemv_ with p1 as the matrix, 2 columns and leading dimension
         p2-p1, so p2 is addressed as the second column of a matrix starting
         at p1.  The scalars are copied into a PetscScalar work array so the
         element type matches the vectors.
         NOTE(review): this presumably requires p1 and p2 to lie in one
         allocation with p2-p1 >= n (the BLAS gemv rule for lda) - confirm
         that every caller guarantees it.
   APXY3 is APXY2 followed by APXY; APXY4 is two APXY2 calls.
*/
#define APXY(U,a1,p1,n) { \
  int apxy_one_ = 1; \
  BLaxpy_(&(n),&(a1),(p1),&apxy_one_,(U),&apxy_one_); \
}
#define APXY2(U,a1,a2,p1,p2,n) { \
  int         apxy_one_ = 1,apxy_two_ = 2,apxy_lda_ = (int)((p2)-(p1)); \
  PetscScalar apxy_fone_ = 1.0,apxy_aa_[2]; \
  apxy_aa_[0] = (a1); apxy_aa_[1] = (a2); \
  LAgemv_("N",&(n),&apxy_two_,&apxy_fone_,(p1),&apxy_lda_,apxy_aa_,&apxy_one_,&apxy_fone_,(U),&apxy_one_); \
}
#define APXY3(U,a1,a2,a3,p1,p2,p3,n) { \
  APXY2(U,a1,a2,p1,p2,n); \
  APXY(U,a3,p3,n); \
}
#define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) { \
  APXY2(U,a1,a2,p1,p2,n); \
  APXY2(U,a3,a4,p3,p4,n); \
}
147: #elif defined(PETSC_USE_FOR_KERNELS)
/*
   For-loop kernels:  U[i] += a1*p1[i] (+ a2*p2[i] ...),  i = 0..n-1.

   APXY processes two entries per pass (products first, then the additions,
   then the stores) and finishes with a single entry when n is odd.
   APXY2/3/4 are plain one-entry-per-pass loops.

   The arguments are not modified.  Every argument is parenthesized so that
   expressions (e.g. a+b, u+1, m-1) may be passed; the scalars are evaluated
   once per entry.  Loop temporaries avoid the reserved "__" prefix.
*/
#define APXY(U,a1,p1,n) { \
  int         apxy_i_; \
  PetscScalar apxy_s1_,apxy_s2_; \
  for (apxy_i_=0; apxy_i_<(n)-1; apxy_i_+=2) { \
    apxy_s1_  = (a1)*(p1)[apxy_i_]; \
    apxy_s2_  = (a1)*(p1)[apxy_i_+1]; \
    apxy_s1_ += (U)[apxy_i_]; \
    apxy_s2_ += (U)[apxy_i_+1]; \
    (U)[apxy_i_]   = apxy_s1_; \
    (U)[apxy_i_+1] = apxy_s2_; \
  } \
  if ((n) & 0x1) (U)[apxy_i_] += (a1)*(p1)[apxy_i_]; \
}
#define APXY2(U,a1,a2,p1,p2,n) { \
  int apxy_i_; \
  for (apxy_i_=0; apxy_i_<(n); apxy_i_++) { \
    (U)[apxy_i_] += (a1)*(p1)[apxy_i_] + (a2)*(p2)[apxy_i_]; \
  } \
}
#define APXY3(U,a1,a2,a3,p1,p2,p3,n) { \
  int apxy_i_; \
  for (apxy_i_=0; apxy_i_<(n); apxy_i_++) { \
    (U)[apxy_i_] += (a1)*(p1)[apxy_i_] + (a2)*(p2)[apxy_i_] + (a3)*(p3)[apxy_i_]; \
  } \
}
#define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) { \
  int apxy_i_; \
  for (apxy_i_=0; apxy_i_<(n); apxy_i_++) { \
    (U)[apxy_i_] += (a1)*(p1)[apxy_i_] + (a2)*(p2)[apxy_i_] + (a3)*(p3)[apxy_i_] + (a4)*(p4)[apxy_i_]; \
  } \
}
160: #else
/*
   Default kernels:  U[i] += a1*p1[i] (+ a2*p2[i] ...),  i = 0..n-1.

   The arguments are not modified.  APXY copies its scalar into a local
   PetscScalar once; APXY2/3/4 evaluate their scalars once per entry.  Every
   argument is parenthesized so that expressions (e.g. a+b, u+1) may be
   passed.  Loop temporaries avoid the reserved "__" prefix.
*/
#define APXY(U,a1,p1,n) { \
  int         apxy_i_; \
  PetscScalar apxy_a1_ = (a1); \
  for (apxy_i_=0; apxy_i_<(n); apxy_i_++) { \
    (U)[apxy_i_] += apxy_a1_*(p1)[apxy_i_]; \
  } \
}
#define APXY2(U,a1,a2,p1,p2,n) { \
  int apxy_i_; \
  for (apxy_i_=0; apxy_i_<(n); apxy_i_++) { \
    (U)[apxy_i_] += (a1)*(p1)[apxy_i_] + (a2)*(p2)[apxy_i_]; \
  } \
}
#define APXY3(U,a1,a2,a3,p1,p2,p3,n) { \
  int apxy_i_; \
  for (apxy_i_=0; apxy_i_<(n); apxy_i_++) { \
    (U)[apxy_i_] += (a1)*(p1)[apxy_i_] + (a2)*(p2)[apxy_i_] + (a3)*(p3)[apxy_i_]; \
  } \
}
#define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) { \
  int apxy_i_; \
  for (apxy_i_=0; apxy_i_<(n); apxy_i_++) { \
    (U)[apxy_i_] += (a1)*(p1)[apxy_i_] + (a2)*(p2)[apxy_i_] + (a3)*(p3)[apxy_i_] + (a4)*(p4)[apxy_i_]; \
  } \
}
171: #endif
174: /* ----------------------------------------------------------------------------
175: axpy() but for increments of inc in both U and P
176: ---------------------------------------------------------------------------*/
177: #ifdef PETSC_USE_UNROLL_KERNELS
/*
   Unrolled-by-two strided kernels:
       U[i*inc] += a1*p1[i*inc] (+ a2*p2[i*inc] ...),  i = 0..n-1.

   When n is odd one entry is handled first; the while loop then handles the
   remaining entries in pairs.

   Side effects on the arguments: U, the p vectors and n are modified in
   place (n ends at 0 when it started non-negative; the pointers are advanced
   by inc per processed entry), so each must be a modifiable lvalue.  The
   scalars and inc are parenthesized so that expressions may be passed; the
   scalars are evaluated once per entry.
*/
#define APXYINC(U,Alpha,P,n,inc) { \
  if ((n) & 0x1) { \
    *(U) += (Alpha) * *(P); \
    (U) += (inc); (P) += (inc); (n)--; \
  } \
  while ((n) > 0) { \
    (U)[0]     += (Alpha) * (P)[0]; \
    (U)[(inc)] += (Alpha) * (P)[(inc)]; \
    (U) += 2*(inc); (P) += 2*(inc); (n) -= 2; \
  } \
}
#define APXY2INC(U,a1,a2,p1,p2,n,inc) { \
  if ((n) & 0x1) { \
    *(U) += (a1) * *(p1) + (a2) * *(p2); \
    (U) += (inc); (p1) += (inc); (p2) += (inc); (n)--; \
  } \
  while ((n) > 0) { \
    (U)[0]     += (a1)*(p1)[0]     + (a2)*(p2)[0]; \
    (U)[(inc)] += (a1)*(p1)[(inc)] + (a2)*(p2)[(inc)]; \
    (U) += 2*(inc); (p1) += 2*(inc); (p2) += 2*(inc); (n) -= 2; \
  } \
}
#define APXY3INC(U,a1,a2,a3,p1,p2,p3,n,inc) { \
  if ((n) & 0x1) { \
    *(U) += (a1) * *(p1) + (a2) * *(p2) + (a3) * *(p3); \
    (U) += (inc); (p1) += (inc); (p2) += (inc); (p3) += (inc); (n)--; \
  } \
  while ((n) > 0) { \
    (U)[0]     += (a1)*(p1)[0]     + (a2)*(p2)[0]     + (a3)*(p3)[0]; \
    (U)[(inc)] += (a1)*(p1)[(inc)] + (a2)*(p2)[(inc)] + (a3)*(p3)[(inc)]; \
    (U) += 2*(inc); (p1) += 2*(inc); (p2) += 2*(inc); (p3) += 2*(inc); (n) -= 2; \
  } \
}
#define APXY4INC(U,a1,a2,a3,a4,p1,p2,p3,p4,n,inc) { \
  if ((n) & 0x1) { \
    *(U) += (a1) * *(p1) + (a2) * *(p2) + (a3) * *(p3) + (a4) * *(p4); \
    (U) += (inc); (p1) += (inc); (p2) += (inc); (p3) += (inc); (p4) += (inc); (n)--; \
  } \
  while ((n) > 0) { \
    (U)[0]     += (a1)*(p1)[0]     + (a2)*(p2)[0]     + (a3)*(p3)[0]     + (a4)*(p4)[0]; \
    (U)[(inc)] += (a1)*(p1)[(inc)] + (a2)*(p2)[(inc)] + (a3)*(p3)[(inc)] + (a4)*(p4)[(inc)]; \
    (U) += 2*(inc); (p1) += 2*(inc); (p2) += 2*(inc); (p3) += 2*(inc); (p4) += 2*(inc); (n) -= 2; \
  } \
}
203: #elif defined(PETSC_USE_WHILE_KERNELS)
/*
   While-loop strided kernels:
       U[i*inc] += a1*p1[i*inc] (+ a2*p2[i*inc] ...),  i = 0..n-1.

   Side effects on the arguments: n is counted down (it is left at -1 when it
   started non-negative) and U and the p vectors are advanced by inc per
   entry, so each must be a modifiable lvalue.  The scalars and inc are
   parenthesized so that expressions may be passed; the scalars are evaluated
   once per entry.
*/
#define APXYINC(U,a1,p1,n,inc) { \
  while ((n)--) { \
    *(U) += (a1) * *(p1); \
    (U) += (inc); (p1) += (inc); \
  } \
}
#define APXY2INC(U,a1,a2,p1,p2,n,inc) { \
  while ((n)--) { \
    *(U) += (a1) * *(p1) + (a2) * *(p2); \
    (U) += (inc); (p1) += (inc); (p2) += (inc); \
  } \
}
#define APXY3INC(U,a1,a2,a3,p1,p2,p3,n,inc) { \
  while ((n)--) { \
    *(U) += (a1) * *(p1) + (a2) * *(p2) + (a3) * *(p3); \
    (U) += (inc); (p1) += (inc); (p2) += (inc); (p3) += (inc); \
  } \
}
#define APXY4INC(U,a1,a2,a3,a4,p1,p2,p3,p4,n,inc) { \
  while ((n)--) { \
    *(U) += (a1) * *(p1) + (a2) * *(p2) + (a3) * *(p3) + (a4) * *(p4); \
    (U) += (inc); (p1) += (inc); (p2) += (inc); (p3) += (inc); (p4) += (inc); \
  } \
}
215: #else
216: /* These need to be converted to for loops */
/*
   Default strided kernels:
       U[i*inc] += a1*p1[i*inc] (+ a2*p2[i*inc] ...),  i = 0..n-1.

   Side effects on the arguments: n is counted down (it is left at -1 when it
   started non-negative) and U and the p vectors are advanced by inc per
   entry, so each must be a modifiable lvalue.  The scalars and inc are
   parenthesized so that expressions may be passed; the scalars are evaluated
   once per entry.
*/
#define APXYINC(U,a1,p1,n,inc) { \
  while ((n)--) { \
    *(U) += (a1) * *(p1); \
    (U) += (inc); (p1) += (inc); \
  } \
}
#define APXY2INC(U,a1,a2,p1,p2,n,inc) { \
  while ((n)--) { \
    *(U) += (a1) * *(p1) + (a2) * *(p2); \
    (U) += (inc); (p1) += (inc); (p2) += (inc); \
  } \
}
#define APXY3INC(U,a1,a2,a3,p1,p2,p3,n,inc) { \
  while ((n)--) { \
    *(U) += (a1) * *(p1) + (a2) * *(p2) + (a3) * *(p3); \
    (U) += (inc); (p1) += (inc); (p2) += (inc); (p3) += (inc); \
  } \
}
#define APXY4INC(U,a1,a2,a3,a4,p1,p2,p3,p4,n,inc) { \
  while ((n)--) { \
    *(U) += (a1) * *(p1) + (a2) * *(p2) + (a3) * *(p3) + (a4) * *(p4); \
    (U) += (inc); (p1) += (inc); (p2) += (inc); (p3) += (inc); (p4) += (inc); \
  } \
}
227: #endif
229: /* --------------------------------------------------------------------
230: This is aypx:
231: for (i=0; i<n; i++)
232: y[i] = x[i] + alpha * y[i];
233: ---------------------------------------------------------------------*/
234: #if defined(PETSC_USE_UNROLL_KERNELS)
/*
   AYPX, unrolled by four:  U[i] = P[i] + Alpha*U[i],  i = 0..n-1.

   The switch peels off the n%4 leading entries (the cases fall through on
   purpose); the while loop then handles the rest four entries at a time.
   Subtracting 4 from n after a non-empty peel keeps n positive for exactly
   the number of full groups of four that remain.

   Side effects on the arguments: U, P and n are modified in place (for
   n >= 0 the pointers end one past the last processed entry), so each must
   be a modifiable lvalue.  Alpha is parenthesized so that an expression such
   as a+b scales U as a whole; it is evaluated once per entry.
*/
#define AYPX(U,Alpha,P,n) { \
  switch ((n) & 0x3) { \
  case 3: *(U) = *(P)++ + (Alpha) * *(U); (U)++; /* fall through */ \
  case 2: *(U) = *(P)++ + (Alpha) * *(U); (U)++; /* fall through */ \
  case 1: *(U) = *(P)++ + (Alpha) * *(U); (U)++; \
          (n) -= 4;                              /* fall through */ \
  case 0: break; \
  } \
  while ((n) > 0) { \
    (U)[0] = (P)[0] + (Alpha) * (U)[0]; \
    (U)[1] = (P)[1] + (Alpha) * (U)[1]; \
    (U)[2] = (P)[2] + (Alpha) * (U)[2]; \
    (U)[3] = (P)[3] + (Alpha) * (U)[3]; \
    (U) += 4; (P) += 4; (n) -= 4; \
  } \
}
245: #elif defined(PETSC_USE_WHILE_KERNELS)
/*
   AYPX, while-loop form:  U[i] = p1[i] + a1*U[i],  i = 0..n-1.

   Side effects on the arguments: n is counted down (it is left at -1 when it
   started non-negative) and U and p1 are advanced past the processed
   entries, so each must be a modifiable lvalue.  a1 is parenthesized so that
   an expression may be passed; it is evaluated once per entry.
*/
#define AYPX(U,a1,p1,n) { \
  while ((n)--) { \
    *(U) = *(p1)++ + (a1) * *(U); \
    (U)++; \
  } \
}
249: #elif defined(PETSC_USE_FOR_KERNELS)
/*
   AYPX, two-entries-per-pass for loop:  U[i] = p1[i] + a1*U[i],  i = 0..n-1.

   Each pass loads two entries of p1, adds the scaled entries of U, then
   stores both results; a final single entry is handled when n is odd.

   The arguments are not modified.  Every argument is parenthesized so that
   expressions (e.g. a+b, u+1, m-1) may be passed; a1 is evaluated once per
   entry.  Loop temporaries avoid the reserved "__" prefix.
*/
#define AYPX(U,a1,p1,n) { \
  int         aypx_i_; \
  PetscScalar aypx_s1_,aypx_s2_; \
  for (aypx_i_=0; aypx_i_<(n)-1; aypx_i_+=2) { \
    aypx_s1_  = (p1)[aypx_i_]; \
    aypx_s2_  = (p1)[aypx_i_+1]; \
    aypx_s1_ += (a1)*(U)[aypx_i_]; \
    aypx_s2_ += (a1)*(U)[aypx_i_+1]; \
    (U)[aypx_i_]   = aypx_s1_; \
    (U)[aypx_i_+1] = aypx_s2_; \
  } \
  if ((n) & 0x1) (U)[aypx_i_] = (p1)[aypx_i_] + (a1)*(U)[aypx_i_]; \
}
256: #else
/*
   AYPX, default form:  U[i] = p1[i] + a1*U[i],  i = 0..n-1.

   The arguments are not modified.  Every argument is parenthesized so that
   expressions (e.g. a+b, u+1) may be passed; a1 is evaluated once per entry.
   The loop index avoids the reserved "__" prefix.
*/
#define AYPX(U,a1,p1,n) { \
  int aypx_i_; \
  for (aypx_i_=0; aypx_i_<(n); aypx_i_++) { \
    (U)[aypx_i_] = (p1)[aypx_i_] + (a1)*(U)[aypx_i_]; \
  } \
}
259: #endif
/* ----------------------------------------------------------------------------------
   YMX:  U[i] -= p1[i],  i = 0..n-1.   Useful for APXY where alpha == -1.

   The arguments are not modified and are parenthesized, so pointer
   expressions such as u+1 may be passed.
  ----------------------------------------------------------------------------------
*/
#define YMX(U,p1,n) { \
  int ymx_i_; \
  for (ymx_i_=0; ymx_i_<(n); ymx_i_++) (U)[ymx_i_] -= (p1)[ymx_i_]; \
}
/*
   YPX:  U[i] += p1[i],  i = 0..n-1.   Useful for APXY where alpha == 1.

   The arguments are not modified and are parenthesized, so pointer
   expressions such as u+1 may be passed.
*/
#define YPX(U,p1,n) { \
  int ypx_i_; \
  for (ypx_i_=0; ypx_i_<(n); ypx_i_++) (U)[ypx_i_] += (p1)[ypx_i_]; \
}
271: #endif