Actual source code: iclsse.h
/* $Id: iclsse.h,v 1.3 2001/07/11 04:50:32 buschelm Exp $ */

#ifndef __ICL_SSE_H_
#define __ICL_SSE_H_
#include <xmmintrin.h>
PETSC_EXTERN_CXX_BEGIN

/* SSE_SCOPE_BEGIN must come after the LAST declaration in the outermost SSE scope */
#define SSE_SCOPE_BEGIN { __m128 XMM0,XMM1,XMM2,XMM3,XMM4,XMM5,XMM6,XMM7; {
#define SSE_SCOPE_END }}

/* For use with SSE Inlined Assembly Blocks */
/* Note: SSE_ macro invocations must NOT be followed by a ; */

#define SSE_INLINE_BEGIN_1(arg1) { float *_tmp_arg1; _tmp_arg1=arg1;
#define SSE_INLINE_END_1 }
#define SSE_INLINE_BEGIN_2(arg1,arg2) { float *_tmp_arg1, *_tmp_arg2; _tmp_arg1=arg1; _tmp_arg2=arg2;
#define SSE_INLINE_END_2 }
#define SSE_INLINE_BEGIN_3(arg1,arg2,arg3) { float *_tmp_arg1, *_tmp_arg2, *_tmp_arg3; \
        _tmp_arg1=arg1; _tmp_arg2=arg2; _tmp_arg3=arg3;
#define SSE_INLINE_END_3 }

#define SSE_ARG_1 _tmp_arg1
#define SSE_ARG_2 _tmp_arg2
#define SSE_ARG_3 _tmp_arg3
/* Note: If more args are to be used, be sure the debug version uses the maximum number of args allowed */

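/* A minimal usage sketch (not part of the original header; the function and pointer names are
   hypothetical, and dst/src are assumed to be 16-byte aligned): multiply four floats of dst
   element-wise by four floats of src using the scope/inline-argument macros above. */
#if 0
static void ScaleFour(float *dst,float *src)
SSE_SCOPE_BEGIN
  /* Bind the pointer arguments to the SSE_ARG_ names used by the SSE_ macros */
  SSE_INLINE_BEGIN_2(dst,src)
    SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)       /* XMM0 = dst[0..3]  */
    SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)     /* XMM0 *= src[0..3] */
    SSE_STORE_PS(SSE_ARG_1,FLOAT_0,XMM0)      /* dst[0..3] = XMM0  */
  SSE_INLINE_END_2
SSE_SCOPE_END
#endif
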
/* Offset values (array-element offsets, not byte offsets) for the SSE_ load/store/arithmetic memory ops */
#define FLOAT_0 0
#define FLOAT_1 1
#define FLOAT_2 2
#define FLOAT_3 3
#define FLOAT_4 4
#define FLOAT_5 5
#define FLOAT_6 6
#define FLOAT_7 7
#define FLOAT_8 8
#define FLOAT_9 9
#define FLOAT_10 10
#define FLOAT_11 11
#define FLOAT_12 12
#define FLOAT_13 13
#define FLOAT_14 14
#define FLOAT_15 15

#define FLOAT_16 16
#define FLOAT_24 24
#define FLOAT_32 32
#define FLOAT_40 40
#define FLOAT_48 48
#define FLOAT_56 56
#define FLOAT_64 64

#define DOUBLE_0 0
#define DOUBLE_1 1
#define DOUBLE_2 2
#define DOUBLE_3 3
#define DOUBLE_4 4
#define DOUBLE_5 5
#define DOUBLE_6 6
#define DOUBLE_7 7

#define DOUBLE_8 8
#define DOUBLE_16 16
#define DOUBLE_20 20
#define DOUBLE_24 24
#define DOUBLE_28 28
#define DOUBLE_32 32

/* xmmintrin.h provides for inline/debug versions automatically */
/* Inline versions */

/* Prefetch Macros */
#define SSE_PREFETCH_NTA(arg,offset) PREFETCH_NTA(&arg[offset]);
#define SSE_PREFETCH_L1(arg,offset) PREFETCH_L1(&arg[offset]);
#define SSE_PREFETCH_L2(arg,offset) PREFETCH_L2(&arg[offset]);
#define SSE_PREFETCH_L3(arg,offset) PREFETCH_L3(&arg[offset]);

/* Store Macros */
#define SSE_STORE_SS(arg,offset,srcreg) STORE_SS(&arg[offset],srcreg);
#define SSE_STOREL_PS(arg,offset,srcreg) STOREL_PS(&arg[offset],srcreg);
#define SSE_STOREH_PS(arg,offset,srcreg) STOREH_PS(&arg[offset],srcreg);
#define SSE_STORE_PS(arg,offset,srcreg) STORE_PS(&arg[offset],srcreg);
#define SSE_STOREU_PS(arg,offset,srcreg) STOREU_PS(&arg[offset],srcreg);
#define SSE_STREAM_PS(arg,offset,srcreg) STREAM_PS(&arg[offset],srcreg);

/* Register-Register Copy Macros */
#define SSE_COPY_SS(dstreg,srcreg) COPY_SS(dstreg,srcreg);
#define SSE_COPY_PS(dstreg,srcreg) COPY_PS(dstreg,srcreg);

/* Load Macros */
#define SSE_LOAD_SS(arg,offset,dstreg) LOAD_SS(&arg[offset],dstreg);
#define SSE_LOADL_PS(arg,offset,dstreg) LOADL_PS(&arg[offset],dstreg);
#define SSE_LOADH_PS(arg,offset,dstreg) LOADH_PS(&arg[offset],dstreg);
#define SSE_LOAD_PS(arg,offset,dstreg) LOAD_PS(&arg[offset],dstreg);
#define SSE_LOADU_PS(arg,offset,dstreg) LOADU_PS(&arg[offset],dstreg);

/* Shuffle */
#define SSE_SHUFFLE(dstreg,srcreg,imm) SHUFFLE(dstreg,srcreg,imm);

/* Multiply: A:=A*B */
#define SSE_MULT_SS(dstreg,srcreg) MULT_SS(dstreg,srcreg);
#define SSE_MULT_PS(dstreg,srcreg) MULT_PS(dstreg,srcreg);
#define SSE_MULT_SS_M(dstreg,arg,offset) MULT_SS_M(dstreg,&arg[offset]);
#define SSE_MULT_PS_M(dstreg,arg,offset) MULT_PS_M(dstreg,&arg[offset]);

/* Divide: A:=A/B */
#define SSE_DIV_SS(dstreg,srcreg) DIV_SS(dstreg,srcreg);
#define SSE_DIV_PS(dstreg,srcreg) DIV_PS(dstreg,srcreg);
#define SSE_DIV_SS_M(dstreg,arg,offset) DIV_SS_M(dstreg,&arg[offset]);
#define SSE_DIV_PS_M(dstreg,arg,offset) DIV_PS_M(dstreg,&arg[offset]);

/* Reciprocal: A:=1/B */
#define SSE_RECIP_SS(dstreg,srcreg) RECIP_SS(dstreg,srcreg);
#define SSE_RECIP_PS(dstreg,srcreg) RECIP_PS(dstreg,srcreg);
#define SSE_RECIP_SS_M(dstreg,arg,offset) RECIP_SS_M(dstreg,&arg[offset]);
#define SSE_RECIP_PS_M(dstreg,arg,offset) RECIP_PS_M(dstreg,&arg[offset]);

/* Add: A:=A+B */
#define SSE_ADD_SS(dstreg,srcreg) ADD_SS(dstreg,srcreg);
#define SSE_ADD_PS(dstreg,srcreg) ADD_PS(dstreg,srcreg);
#define SSE_ADD_SS_M(dstreg,arg,offset) ADD_SS_M(dstreg,&arg[offset]);
#define SSE_ADD_PS_M(dstreg,arg,offset) ADD_PS_M(dstreg,&arg[offset]);

/* Subtract: A:=A-B */
#define SSE_SUB_SS(dstreg,srcreg) SUB_SS(dstreg,srcreg);
#define SSE_SUB_PS(dstreg,srcreg) SUB_PS(dstreg,srcreg);
#define SSE_SUB_SS_M(dstreg,arg,offset) SUB_SS_M(dstreg,&arg[offset]);
#define SSE_SUB_PS_M(dstreg,arg,offset) SUB_PS_M(dstreg,&arg[offset]);

/* Logical: A:=A<op>B */
#define SSE_AND_SS(dstreg,srcreg) AND_SS(dstreg,srcreg);
#define SSE_ANDNOT_SS(dstreg,srcreg) ANDNOT_SS(dstreg,srcreg);
#define SSE_OR_SS(dstreg,srcreg) OR_SS(dstreg,srcreg);
#define SSE_XOR_SS(dstreg,srcreg) XOR_SS(dstreg,srcreg);

#define SSE_AND_PS(dstreg,srcreg) AND_PS(dstreg,srcreg);
#define SSE_ANDNOT_PS(dstreg,srcreg) ANDNOT_PS(dstreg,srcreg);
#define SSE_OR_PS(dstreg,srcreg) OR_PS(dstreg,srcreg);
#define SSE_XOR_PS(dstreg,srcreg) XOR_PS(dstreg,srcreg);

/* Comparisons A:=A<compare>B */
#define SSE_CMPEQ_SS(dstreg,srcreg) CMPEQ_SS(dstreg,srcreg);
#define SSE_CMPLT_SS(dstreg,srcreg) CMPLT_SS(dstreg,srcreg);
#define SSE_CMPLE_SS(dstreg,srcreg) CMPLE_SS(dstreg,srcreg);
#define SSE_CMPUNORD_SS(dstreg,srcreg) CMPUNORD_SS(dstreg,srcreg);
#define SSE_CMPNEQ_SS(dstreg,srcreg) CMPNEQ_SS(dstreg,srcreg);
#define SSE_CMPNLT_SS(dstreg,srcreg) CMPNLT_SS(dstreg,srcreg);
#define SSE_CMPNLE_SS(dstreg,srcreg) CMPNLE_SS(dstreg,srcreg);
#define SSE_CMPORD_SS(dstreg,srcreg) CMPORD_SS(dstreg,srcreg);

#define SSE_CMPEQ_PS(dstreg,srcreg) CMPEQ_PS(dstreg,srcreg);
#define SSE_CMPLT_PS(dstreg,srcreg) CMPLT_PS(dstreg,srcreg);
#define SSE_CMPLE_PS(dstreg,srcreg) CMPLE_PS(dstreg,srcreg);
#define SSE_CMPUNORD_PS(dstreg,srcreg) CMPUNORD_PS(dstreg,srcreg);
#define SSE_CMPNEQ_PS(dstreg,srcreg) CMPNEQ_PS(dstreg,srcreg);
#define SSE_CMPNLT_PS(dstreg,srcreg) CMPNLT_PS(dstreg,srcreg);
#define SSE_CMPNLE_PS(dstreg,srcreg) CMPNLE_PS(dstreg,srcreg);
#define SSE_CMPORD_PS(dstreg,srcreg) CMPORD_PS(dstreg,srcreg);

/* ================================================================================================ */

/* Other useful macros whose destinations are not SSE registers */

/* Movemask (for use after comparisons) */
/* Reduces a 128-bit mask to an integer built from the most significant bits of its 32-bit parts. */
#define MOVEMASK(integ,srcxmmreg) integ = _mm_movemask_ps(srcxmmreg)

/* Double_4/Float_4 Conversions */
#define CONVERT_FLOAT4_DOUBLE4(dst,src) { double *_tmp_double_ptr; float *_tmp_float_ptr; \
        _tmp_double_ptr = dst; _tmp_float_ptr = src; \
        _tmp_double_ptr[0]=(double)_tmp_float_ptr[0]; \
        _tmp_double_ptr[1]=(double)_tmp_float_ptr[1]; \
        _tmp_double_ptr[2]=(double)_tmp_float_ptr[2]; \
        _tmp_double_ptr[3]=(double)_tmp_float_ptr[3]; }

#define CONVERT_DOUBLE4_FLOAT4(dst,src) { double *_tmp_double_ptr; float *_tmp_float_ptr; \
        _tmp_double_ptr = src; _tmp_float_ptr = dst; \
        _tmp_float_ptr[0]=(float)_tmp_double_ptr[0]; \
        _tmp_float_ptr[1]=(float)_tmp_double_ptr[1]; \
        _tmp_float_ptr[2]=(float)_tmp_double_ptr[2]; \
        _tmp_float_ptr[3]=(float)_tmp_double_ptr[3]; }

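/* A small illustration (hypothetical function and buffer names) of the conversion macros:
   widen four floats to doubles, then narrow them back. */
#if 0
static void WidenAndNarrow(void)
{
  float  f4[4] = {1.0f,2.0f,3.0f,4.0f};
  double d4[4];
  CONVERT_FLOAT4_DOUBLE4(d4,f4)   /* d4[i] = (double)f4[i] */
  CONVERT_DOUBLE4_FLOAT4(f4,d4)   /* f4[i] = (float)d4[i]  */
}
#endif
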
/* Aligned Malloc */
#define SSE_MALLOC(var,size) { void *_tmp_void_ptr = *var; size_t _tmp_size; _tmp_size = size; \
        *var = _mm_malloc(size,16); }
#define SSE_FREE(var) { void *_tmp_void_ptr = var; \
        _mm_free(var); }

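/* A usage sketch for the aligned allocation macros (hypothetical names): SSE_MALLOC takes the
   address of the pointer and the size in bytes, and the resulting 16-byte aligned memory is
   suitable for the aligned loads/stores above. */
#if 0
static float *AllocAlignedFloats(int n)
{
  float *x = 0;
  SSE_MALLOC(&x,n*sizeof(float))
  return x;                        /* release later with SSE_FREE(x) */
}
#endif
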
/* CPUID Instruction Macros */

#define CPUID_VENDOR 0
#define CPUID_FEATURES 1
#define CPUID_CACHE 2

#define CPUID(imm,_eax,_ebx,_ecx,_edx) { int _tmp_imm; \
        unsigned long _tmp_eax, _tmp_ebx, _tmp_ecx, _tmp_edx; \
        _tmp_eax=*_eax; _tmp_ebx=*_ebx; _tmp_ecx=*_ecx; _tmp_edx=*_edx; \
        _tmp_imm=imm; \
        __asm { \
          __asm mov eax, imm \
          __asm cpuid \
          __asm mov _tmp_eax, eax \
          __asm mov _tmp_ebx, ebx \
          __asm mov _tmp_ecx, ecx \
          __asm mov _tmp_edx, edx \
        } \
        *_eax=_tmp_eax; *_ebx=_tmp_ebx; *_ecx=_tmp_ecx; *_edx=_tmp_edx; \
}

#define CPUID_GET_VENDOR(result) { char *_gv_vendor=result; int _gv_i; \
        unsigned long _gv_eax=0;unsigned long _gv_ebx=0;unsigned long _gv_ecx=0;unsigned long _gv_edx=0;\
        CPUID(CPUID_VENDOR,&_gv_eax,&_gv_ebx,&_gv_ecx,&_gv_edx); \
        for (_gv_i=0;_gv_i<4;_gv_i++) _gv_vendor[_gv_i+0]=*(((char *)(&_gv_ebx))+_gv_i); \
        for (_gv_i=0;_gv_i<4;_gv_i++) _gv_vendor[_gv_i+4]=*(((char *)(&_gv_edx))+_gv_i); \
        for (_gv_i=0;_gv_i<4;_gv_i++) _gv_vendor[_gv_i+8]=*(((char *)(&_gv_ecx))+_gv_i); \
}

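/* A sketch of querying the CPU vendor string (hypothetical function name; assumes <string.h>
   is available for strncmp and a compiler that accepts the MASM-style __asm blocks used by
   CPUID above, e.g. the Intel or Microsoft IA-32 compilers): */
#if 0
static int IsGenuineIntel(void)
{
  char vendor[13];
  CPUID_GET_VENDOR(vendor)         /* fills vendor[0..11] from EBX,EDX,ECX */
  vendor[12] = '\0';               /* the macro does not null-terminate    */
  return strncmp(vendor,"GenuineIntel",12) == 0;
}
#endif
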
/* ================================================================================================ */

/* The Stand Alone Versions of the SSE Macros */

/* Prefetch Macros */
#define PREFETCH_NTA(var) _mm_prefetch((char *)(var),_MM_HINT_NTA)
#define PREFETCH_L1(var) _mm_prefetch((char *)(var),_MM_HINT_T0)
#define PREFETCH_L2(var) _mm_prefetch((char *)(var),_MM_HINT_T1)
#define PREFETCH_L3(var) _mm_prefetch((char *)(var),_MM_HINT_T2)

/* Store Macros */
#define STORE_SS(var,srcreg) _mm_store_ss(var,srcreg)
#define STOREL_PS(var,srcreg) _mm_storel_pi((__m64 *)(var),srcreg)
#define STOREH_PS(var,srcreg) _mm_storeh_pi((__m64 *)(var),srcreg)
#define STORE_PS(var,srcreg) _mm_store_ps(var,srcreg)
#define STOREU_PS(var,srcreg) _mm_storeu_ps(var,srcreg)
#define STREAM_PS(var,srcreg) _mm_stream_ps(var,srcreg)

/* Register-Register Copy Macros */
#define COPY_SS(dstreg,srcreg) dstreg = _mm_move_ss(dstreg,srcreg)
#define COPY_PS(dstreg,srcreg) dstreg = srcreg

/* Load Macros */
#define LOAD_SS(var,dstreg) dstreg = _mm_load_ss(var)
#define LOADL_PS(var,dstreg) dstreg = _mm_loadl_pi(dstreg,(__m64 *)(var))
#define LOADH_PS(var,dstreg) dstreg = _mm_loadh_pi(dstreg,(__m64 *)(var))
#define LOAD_PS(var,dstreg) dstreg = _mm_load_ps(var)
#define LOADU_PS(var,dstreg) dstreg = _mm_loadu_ps(var)

/* Shuffle */
#define SHUFFLE(dstreg,srcreg,i) dstreg = _mm_shuffle_ps(dstreg,srcreg,i)

/* Multiply: A:=A*B */
#define MULT_SS(dstreg,srcreg) dstreg = _mm_mul_ss(dstreg,srcreg)
#define MULT_PS(dstreg,srcreg) dstreg = _mm_mul_ps(dstreg,srcreg)
#define MULT_SS_M(dstreg,var) dstreg = _mm_mul_ss(dstreg,_mm_load_ss(var))
#define MULT_PS_M(dstreg,var) dstreg = _mm_mul_ps(dstreg,_mm_load_ps(var))

/* Divide: A:=A/B */
#define DIV_SS(dstreg,srcreg) dstreg = _mm_div_ss(dstreg,srcreg)
#define DIV_PS(dstreg,srcreg) dstreg = _mm_div_ps(dstreg,srcreg)
#define DIV_SS_M(dstreg,var) dstreg = _mm_div_ss(dstreg,_mm_load_ss(var))
#define DIV_PS_M(dstreg,var) dstreg = _mm_div_ps(dstreg,_mm_load_ps(var))

/* Reciprocal: A:=1/B */
#define RECIP_SS(dstreg,srcreg) dstreg = _mm_rcp_ss(srcreg)
#define RECIP_PS(dstreg,srcreg) dstreg = _mm_rcp_ps(srcreg)
#define RECIP_SS_M(dstreg,var) dstreg = _mm_rcp_ss(_mm_load_ss(var))
#define RECIP_PS_M(dstreg,var) dstreg = _mm_rcp_ps(_mm_load_ps(var))

/* Add: A:=A+B */
#define ADD_SS(dstreg,srcreg) dstreg = _mm_add_ss(dstreg,srcreg)
#define ADD_PS(dstreg,srcreg) dstreg = _mm_add_ps(dstreg,srcreg)
#define ADD_SS_M(dstreg,var) dstreg = _mm_add_ss(dstreg,_mm_load_ss(var))
#define ADD_PS_M(dstreg,var) dstreg = _mm_add_ps(dstreg,_mm_load_ps(var))

/* Subtract: A:=A-B */
#define SUB_SS(dstreg,srcreg) dstreg = _mm_sub_ss(dstreg,srcreg)
#define SUB_PS(dstreg,srcreg) dstreg = _mm_sub_ps(dstreg,srcreg)
#define SUB_SS_M(dstreg,var) dstreg = _mm_sub_ss(dstreg,_mm_load_ss(var))
#define SUB_PS_M(dstreg,var) dstreg = _mm_sub_ps(dstreg,_mm_load_ps(var))

/* Logical: A:=A<op>B */
#define AND_SS(dstreg,srcreg) dstreg = _mm_and_ss(dstreg,srcreg)
#define ANDNOT_SS(dstreg,srcreg) dstreg = _mm_andnot_ss(dstreg,srcreg)
#define OR_SS(dstreg,srcreg) dstreg = _mm_or_ss(dstreg,srcreg)
#define XOR_SS(dstreg,srcreg) dstreg = _mm_xor_ss(dstreg,srcreg)

#define AND_PS(dstreg,srcreg) dstreg = _mm_and_ps(dstreg,srcreg)
#define ANDNOT_PS(dstreg,srcreg) dstreg = _mm_andnot_ps(dstreg,srcreg)
#define OR_PS(dstreg,srcreg) dstreg = _mm_or_ps(dstreg,srcreg)
#define XOR_PS(dstreg,srcreg) dstreg = _mm_xor_ps(dstreg,srcreg)

/* Implementing an if():
   First perform the comparison, then use MOVEMASK to reduce the result to an integer, say i,
   and branch on it with if (i) ... (see the sketch below).
*/

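/* A sketch of the pattern described above (hypothetical function name; uses CMPLT_PS and
   MOVEMASK from this header): returns nonzero if any element of a is less than the
   corresponding element of b. */
#if 0
static int AnyLessThan(__m128 a,__m128 b)
{
  int mask;
  CMPLT_PS(a,b);                   /* a := per-element mask of (a < b)    */
  MOVEMASK(mask,a);                /* one bit per element, from sign bits */
  if (mask) return 1;
  return 0;
}
#endif
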
/*
   Note: From the IA Software Developer's Manual:
   The greater-than relations not implemented in hardware require more than one instruction to
   emulate in software and therefore should not be implemented as pseudo-ops. (For these, the
   programmer should reverse the operands of the corresponding less than relations and use move
   instructions to ensure that the mask is moved to the correct destination register and that the
   source operand is left intact.)
*/

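/* Following the note above, a sketch (hypothetical function name) of a packed greater-than:
   a > b is computed as b < a on a copy, so the source operand b is left intact. */
#if 0
static __m128 GreaterThan(__m128 a,__m128 b)
{
  __m128 t = b;                    /* copy b so it is not overwritten                */
  CMPLT_PS(t,a);                   /* t := per-element mask of (b < a), i.e. (a > b) */
  return t;
}
#endif
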
/* Comparisons A:=A<compare>B */
#define CMPEQ_SS(dstreg,srcreg) dstreg = _mm_cmpeq_ss(dstreg,srcreg)
#define CMPLT_SS(dstreg,srcreg) dstreg = _mm_cmplt_ss(dstreg,srcreg)
#define CMPLE_SS(dstreg,srcreg) dstreg = _mm_cmple_ss(dstreg,srcreg)
#define CMPUNORD_SS(dstreg,srcreg) dstreg = _mm_cmpunord_ss(dstreg,srcreg)
#define CMPNEQ_SS(dstreg,srcreg) dstreg = _mm_cmpneq_ss(dstreg,srcreg)
#define CMPNLT_SS(dstreg,srcreg) dstreg = _mm_cmpnlt_ss(dstreg,srcreg)
#define CMPNLE_SS(dstreg,srcreg) dstreg = _mm_cmpnle_ss(dstreg,srcreg)
#define CMPORD_SS(dstreg,srcreg) dstreg = _mm_cmpord_ss(dstreg,srcreg)

#define CMPEQ_PS(dstreg,srcreg) dstreg = _mm_cmpeq_ps(dstreg,srcreg)
#define CMPLT_PS(dstreg,srcreg) dstreg = _mm_cmplt_ps(dstreg,srcreg)
#define CMPLE_PS(dstreg,srcreg) dstreg = _mm_cmple_ps(dstreg,srcreg)
#define CMPUNORD_PS(dstreg,srcreg) dstreg = _mm_cmpunord_ps(dstreg,srcreg)
#define CMPNEQ_PS(dstreg,srcreg) dstreg = _mm_cmpneq_ps(dstreg,srcreg)
#define CMPNLT_PS(dstreg,srcreg) dstreg = _mm_cmpnlt_ps(dstreg,srcreg)
#define CMPNLE_PS(dstreg,srcreg) dstreg = _mm_cmpnle_ps(dstreg,srcreg)
#define CMPORD_PS(dstreg,srcreg) dstreg = _mm_cmpord_ps(dstreg,srcreg)

/* ================================================================================================ */

PETSC_EXTERN_CXX_END
#endif