Actual source code: iclsse.h

/* $Id: iclsse.h,v 1.3 2001/07/11 04:50:32 buschelm Exp $ */

#ifndef __ICL_SSE_H_
#define __ICL_SSE_H_
#include <xmmintrin.h>
PETSC_EXTERN_CXX_BEGIN


/* SSE_SCOPE_BEGIN must be placed after the LAST declaration in the outermost SSE scope */
#define SSE_SCOPE_BEGIN { __m128 XMM0,XMM1,XMM2,XMM3,XMM4,XMM5,XMM6,XMM7; {
#define SSE_SCOPE_END   }}
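/*
   Usage sketch (added commentary, not part of the original header).
   SSE_SCOPE_BEGIN declares XMM0-XMM7 as __m128 locals, so it must sit
   where a declaration is still legal; SSE_SCOPE_END closes both braces.
   A hypothetical routine scaling four aligned floats by 2.0f:

     void scale4(float *x)
     {
       float two[4] = {2.0f,2.0f,2.0f,2.0f};
       SSE_SCOPE_BEGIN
         LOAD_PS(x,XMM0);
         LOADU_PS(two,XMM1);
         MULT_PS(XMM0,XMM1);
         STORE_PS(x,XMM0);
       SSE_SCOPE_END
     }

   Here x is assumed 16-byte aligned (hence LOAD_PS/STORE_PS), while the
   stack array two is loaded with the unaligned form.
*/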

/* For use with SSE Inlined Assembly Blocks */
/* Note: SSE_ macro invocations must NOT be followed by a ; */

#define SSE_INLINE_BEGIN_1(arg1)           { float *_tmp_arg1; _tmp_arg1=arg1;
#define SSE_INLINE_END_1                   }
#define SSE_INLINE_BEGIN_2(arg1,arg2)      { float *_tmp_arg1, *_tmp_arg2; _tmp_arg1=arg1; _tmp_arg2=arg2;
#define SSE_INLINE_END_2                   }
#define SSE_INLINE_BEGIN_3(arg1,arg2,arg3) { float *_tmp_arg1, *_tmp_arg2, *_tmp_arg3; \
                                             _tmp_arg1=arg1; _tmp_arg2=arg2; _tmp_arg3=arg3;
#define SSE_INLINE_END_3                   }

#define SSE_ARG_1 _tmp_arg1
#define SSE_ARG_2 _tmp_arg2
#define SSE_ARG_3 _tmp_arg3
/* Note: If more args are to be used, be sure the debug version uses the most args allowed */
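/*
   Usage sketch (added commentary): the BEGIN/END pair binds the macro
   arguments to the _tmp_arg aliases, which the SSE_ memory macros then
   address through SSE_ARG_n plus a FLOAT_n offset.  The SSE_ macros
   already end in a semicolon, so none is written after them.  Inside an
   SSE_SCOPE, squaring x[0..3] into y[0..3] (both assumed 16-byte
   aligned) looks like:

     SSE_INLINE_BEGIN_2(x,y)
       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
       SSE_MULT_PS(XMM0,XMM0)
       SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
     SSE_INLINE_END_2
*/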

/* Offset values for SSE_ load/store/arithmetic memory ops */
#define FLOAT_0    0
#define FLOAT_1    1
#define FLOAT_2    2
#define FLOAT_3    3
#define FLOAT_4    4
#define FLOAT_5    5
#define FLOAT_6    6
#define FLOAT_7    7
#define FLOAT_8    8
#define FLOAT_9    9
#define FLOAT_10  10
#define FLOAT_11  11
#define FLOAT_12  12
#define FLOAT_13  13
#define FLOAT_14  14
#define FLOAT_15  15

#define FLOAT_16  16
#define FLOAT_24  24
#define FLOAT_32  32
#define FLOAT_40  40
#define FLOAT_48  48
#define FLOAT_56  56
#define FLOAT_64  64

#define DOUBLE_0   0
#define DOUBLE_1   1
#define DOUBLE_2   2
#define DOUBLE_3   3
#define DOUBLE_4   4
#define DOUBLE_5   5
#define DOUBLE_6   6
#define DOUBLE_7   7

#define DOUBLE_8   8
#define DOUBLE_16 16
#define DOUBLE_20 20
#define DOUBLE_24 24
#define DOUBLE_28 28
#define DOUBLE_32 32
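/*
   Added note: these offsets are element indices, not byte offsets,
   because every memory macro expands to &arg[offset].  For example,
   inside SSE_INLINE_BEGIN_1(a) with float *a,

     SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM2)

   loads a[4..7], the second aligned group of four floats; the DOUBLE_n
   values play the same role when the argument is a double array.
*/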

/* xmmintrin.h provides inline and debug versions automatically */
/* Inline versions */

/* Prefetch Macros */
#define SSE_PREFETCH_NTA(arg,offset)      PREFETCH_NTA(&arg[offset]);
#define SSE_PREFETCH_L1(arg,offset)       PREFETCH_L1(&arg[offset]);
#define SSE_PREFETCH_L2(arg,offset)       PREFETCH_L2(&arg[offset]);
#define SSE_PREFETCH_L3(arg,offset)       PREFETCH_L3(&arg[offset]);
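/*
   Usage sketch (added commentary): a typical use is prefetching a fixed
   distance ahead of the data being processed.  Assuming float *a bound
   inside an SSE_INLINE block that is currently working on a[0..15],

     SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_64)

   hints the hardware to pull the cache line holding a[64] toward L1
   before the loop reaches it; the NTA variant marks the line
   non-temporal so it does not displace more useful cache contents.
*/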

/* Store Macros */
#define SSE_STORE_SS(arg,offset,srcreg)   STORE_SS(&arg[offset],srcreg);
#define SSE_STOREL_PS(arg,offset,srcreg)  STOREL_PS(&arg[offset],srcreg);
#define SSE_STOREH_PS(arg,offset,srcreg)  STOREH_PS(&arg[offset],srcreg);
#define SSE_STORE_PS(arg,offset,srcreg)   STORE_PS(&arg[offset],srcreg);
#define SSE_STOREU_PS(arg,offset,srcreg)  STOREU_PS(&arg[offset],srcreg);
#define SSE_STREAM_PS(arg,offset,srcreg)  STREAM_PS(&arg[offset],srcreg);

/* Register-Register Copy Macros */
#define SSE_COPY_SS(dstreg,srcreg)        COPY_SS(dstreg,srcreg);
#define SSE_COPY_PS(dstreg,srcreg)        COPY_PS(dstreg,srcreg);

/* Load Macros */
#define SSE_LOAD_SS(arg,offset,dstreg)    LOAD_SS(&arg[offset],dstreg);
#define SSE_LOADL_PS(arg,offset,dstreg)   LOADL_PS(&arg[offset],dstreg);
#define SSE_LOADH_PS(arg,offset,dstreg)   LOADH_PS(&arg[offset],dstreg);
#define SSE_LOAD_PS(arg,offset,dstreg)    LOAD_PS(&arg[offset],dstreg);
#define SSE_LOADU_PS(arg,offset,dstreg)   LOADU_PS(&arg[offset],dstreg);

/* Shuffle */
#define SSE_SHUFFLE(dstreg,srcreg,imm)    SHUFFLE(dstreg,srcreg,imm);

/* Multiply: A:=A*B */
#define SSE_MULT_SS(dstreg,srcreg)        MULT_SS(dstreg,srcreg);
#define SSE_MULT_PS(dstreg,srcreg)        MULT_PS(dstreg,srcreg);
#define SSE_MULT_SS_M(dstreg,arg,offset)  MULT_SS_M(dstreg,&arg[offset]);
#define SSE_MULT_PS_M(dstreg,arg,offset)  MULT_PS_M(dstreg,&arg[offset]);

/* Divide: A:=A/B */
#define SSE_DIV_SS(dstreg,srcreg)         DIV_SS(dstreg,srcreg);
#define SSE_DIV_PS(dstreg,srcreg)         DIV_PS(dstreg,srcreg);
#define SSE_DIV_SS_M(dstreg,arg,offset)   DIV_SS_M(dstreg,&arg[offset]);
#define SSE_DIV_PS_M(dstreg,arg,offset)   DIV_PS_M(dstreg,&arg[offset]);

/* Reciprocal: A:=1/B */
#define SSE_RECIP_SS(dstreg,srcreg)       RECIP_SS(dstreg,srcreg);
#define SSE_RECIP_PS(dstreg,srcreg)       RECIP_PS(dstreg,srcreg);
#define SSE_RECIP_SS_M(dstreg,arg,offset) RECIP_SS_M(dstreg,&arg[offset]);
#define SSE_RECIP_PS_M(dstreg,arg,offset) RECIP_PS_M(dstreg,&arg[offset]);

/* Add: A:=A+B */
#define SSE_ADD_SS(dstreg,srcreg)         ADD_SS(dstreg,srcreg);
#define SSE_ADD_PS(dstreg,srcreg)         ADD_PS(dstreg,srcreg);
#define SSE_ADD_SS_M(dstreg,arg,offset)   ADD_SS_M(dstreg,&arg[offset]);
#define SSE_ADD_PS_M(dstreg,arg,offset)   ADD_PS_M(dstreg,&arg[offset]);

/* Subtract: A:=A-B */
#define SSE_SUB_SS(dstreg,srcreg)         SUB_SS(dstreg,srcreg);
#define SSE_SUB_PS(dstreg,srcreg)         SUB_PS(dstreg,srcreg);
#define SSE_SUB_SS_M(dstreg,arg,offset)   SUB_SS_M(dstreg,&arg[offset]);
#define SSE_SUB_PS_M(dstreg,arg,offset)   SUB_PS_M(dstreg,&arg[offset]);

/* Logical: A:=A<op>B */
#define SSE_AND_SS(dstreg,srcreg)         AND_SS(dstreg,srcreg);
#define SSE_ANDNOT_SS(dstreg,srcreg)      ANDNOT_SS(dstreg,srcreg);
#define SSE_OR_SS(dstreg,srcreg)          OR_SS(dstreg,srcreg);
#define SSE_XOR_SS(dstreg,srcreg)         XOR_SS(dstreg,srcreg);

#define SSE_AND_PS(dstreg,srcreg)         AND_PS(dstreg,srcreg);
#define SSE_ANDNOT_PS(dstreg,srcreg)      ANDNOT_PS(dstreg,srcreg);
#define SSE_OR_PS(dstreg,srcreg)          OR_PS(dstreg,srcreg);
#define SSE_XOR_PS(dstreg,srcreg)         XOR_PS(dstreg,srcreg);

/* Comparisons: A:=A<compare>B */
#define SSE_CMPEQ_SS(dstreg,srcreg)       CMPEQ_SS(dstreg,srcreg);
#define SSE_CMPLT_SS(dstreg,srcreg)       CMPLT_SS(dstreg,srcreg);
#define SSE_CMPLE_SS(dstreg,srcreg)       CMPLE_SS(dstreg,srcreg);
#define SSE_CMPUNORD_SS(dstreg,srcreg)    CMPUNORD_SS(dstreg,srcreg);
#define SSE_CMPNEQ_SS(dstreg,srcreg)      CMPNEQ_SS(dstreg,srcreg);
#define SSE_CMPNLT_SS(dstreg,srcreg)      CMPNLT_SS(dstreg,srcreg);
#define SSE_CMPNLE_SS(dstreg,srcreg)      CMPNLE_SS(dstreg,srcreg);
#define SSE_CMPORD_SS(dstreg,srcreg)      CMPORD_SS(dstreg,srcreg);

#define SSE_CMPEQ_PS(dstreg,srcreg)       CMPEQ_PS(dstreg,srcreg);
#define SSE_CMPLT_PS(dstreg,srcreg)       CMPLT_PS(dstreg,srcreg);
#define SSE_CMPLE_PS(dstreg,srcreg)       CMPLE_PS(dstreg,srcreg);
#define SSE_CMPUNORD_PS(dstreg,srcreg)    CMPUNORD_PS(dstreg,srcreg);
#define SSE_CMPNEQ_PS(dstreg,srcreg)      CMPNEQ_PS(dstreg,srcreg);
#define SSE_CMPNLT_PS(dstreg,srcreg)      CMPNLT_PS(dstreg,srcreg);
#define SSE_CMPNLE_PS(dstreg,srcreg)      CMPNLE_PS(dstreg,srcreg);
#define SSE_CMPORD_PS(dstreg,srcreg)      CMPORD_PS(dstreg,srcreg);

/* ================================================================================================ */

/* Other useful macros whose destinations are not SSE registers */

/* Movemask (for use after comparisons) */
/* Reduces a 128-bit mask to an integer built from the most significant bit of each 32-bit part. */
#define MOVEMASK(integ,srcxmmreg)         integ = _mm_movemask_ps(srcxmmreg)

/* Double_4/Float_4 Conversions */
#define CONVERT_FLOAT4_DOUBLE4(dst,src)   { double *_tmp_double_ptr; float *_tmp_float_ptr; \
                                            _tmp_double_ptr = dst; _tmp_float_ptr = src; \
                                            _tmp_double_ptr[0]=(double)_tmp_float_ptr[0]; \
                                            _tmp_double_ptr[1]=(double)_tmp_float_ptr[1]; \
                                            _tmp_double_ptr[2]=(double)_tmp_float_ptr[2]; \
                                            _tmp_double_ptr[3]=(double)_tmp_float_ptr[3]; }

#define CONVERT_DOUBLE4_FLOAT4(dst,src)   { double *_tmp_double_ptr; float *_tmp_float_ptr; \
                                            _tmp_double_ptr = src; _tmp_float_ptr = dst; \
                                            _tmp_float_ptr[0]=(float)_tmp_double_ptr[0]; \
                                            _tmp_float_ptr[1]=(float)_tmp_double_ptr[1]; \
                                            _tmp_float_ptr[2]=(float)_tmp_double_ptr[2]; \
                                            _tmp_float_ptr[3]=(float)_tmp_double_ptr[3]; }
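/*
   Usage sketch (added commentary): the conversions copy four values
   element by element, which is handy at the boundary between a
   double-precision caller and a single-precision SSE kernel:

     double d[4];
     float  f[4];
     CONVERT_DOUBLE4_FLOAT4(f,d)       f[i] = (float)d[i],  i = 0..3
     ... single-precision SSE work on f ...
     CONVERT_FLOAT4_DOUBLE4(d,f)       d[i] = (double)f[i], i = 0..3
*/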

/* Aligned Malloc */
/* Note: the _tmp_ temporaries exist only to type-check the macro arguments */
#define SSE_MALLOC(var,size)              { void *_tmp_void_ptr = *var; size_t _tmp_size; _tmp_size = size; \
                                            *var = _mm_malloc(_tmp_size,16); }
#define SSE_FREE(var)                     { void *_tmp_void_ptr = var; \
                                            _mm_free(_tmp_void_ptr); }
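/*
   Usage sketch (added commentary): SSE_MALLOC takes the ADDRESS of the
   pointer and fills it with 16-byte aligned storage, as required by the
   aligned LOAD_PS/STORE_PS/STREAM_PS forms; storage obtained this way
   must be released with SSE_FREE:

     float *v = 0;
     SSE_MALLOC(&v,16*sizeof(float))
     ... use v with SSE_LOAD_PS/SSE_STORE_PS ...
     SSE_FREE(v)
*/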

/* CPUID Instruction Macros */

#define CPUID_VENDOR   0
#define CPUID_FEATURES 1
#define CPUID_CACHE    2

#define CPUID(imm,_eax,_ebx,_ecx,_edx) { int _tmp_imm; \
  unsigned long _tmp_eax, _tmp_ebx, _tmp_ecx, _tmp_edx; \
  _tmp_eax=*_eax; _tmp_ebx=*_ebx; _tmp_ecx=*_ecx; _tmp_edx=*_edx; \
  _tmp_imm=imm; \
  __asm { \
    __asm mov eax, _tmp_imm \
    __asm cpuid \
    __asm mov _tmp_eax, eax \
    __asm mov _tmp_ebx, ebx \
    __asm mov _tmp_ecx, ecx \
    __asm mov _tmp_edx, edx \
  } \
  *_eax=_tmp_eax; *_ebx=_tmp_ebx; *_ecx=_tmp_ecx; *_edx=_tmp_edx; \
}

/* Retrieves the CPU vendor string as 12 raw characters (EBX,EDX,ECX); result is not NUL terminated */
#define CPUID_GET_VENDOR(result) { char *_gv_vendor=result; int _gv_i; \
  unsigned long _gv_eax=0;unsigned long _gv_ebx=0;unsigned long _gv_ecx=0;unsigned long _gv_edx=0;\
  CPUID(CPUID_VENDOR,&_gv_eax,&_gv_ebx,&_gv_ecx,&_gv_edx); \
  for (_gv_i=0;_gv_i<4;_gv_i++) _gv_vendor[_gv_i+0]=*(((char *)(&_gv_ebx))+_gv_i); \
  for (_gv_i=0;_gv_i<4;_gv_i++) _gv_vendor[_gv_i+4]=*(((char *)(&_gv_edx))+_gv_i); \
  for (_gv_i=0;_gv_i<4;_gv_i++) _gv_vendor[_gv_i+8]=*(((char *)(&_gv_ecx))+_gv_i); \
}
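/*
   Usage sketch (added commentary): since exactly 12 characters are
   written, the caller supplies the terminator:

     char vendor[13];
     CPUID_GET_VENDOR(vendor)
     vendor[12] = '\0';        yields e.g. "GenuineIntel" on Intel parts
*/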

/* ================================================================================================ */

/* The Stand-Alone Versions of the SSE Macros */

/* Prefetch Macros */
#define PREFETCH_NTA(var)             _mm_prefetch((char *)(var),_MM_HINT_NTA)
#define PREFETCH_L1(var)              _mm_prefetch((char *)(var),_MM_HINT_T0)
#define PREFETCH_L2(var)              _mm_prefetch((char *)(var),_MM_HINT_T1)
#define PREFETCH_L3(var)              _mm_prefetch((char *)(var),_MM_HINT_T2)

/* Store Macros */
#define STORE_SS(var,srcreg)          _mm_store_ss(var,srcreg)
#define STOREL_PS(var,srcreg)         _mm_storel_pi((__m64 *)(var),srcreg)
#define STOREH_PS(var,srcreg)         _mm_storeh_pi((__m64 *)(var),srcreg)
#define STORE_PS(var,srcreg)          _mm_store_ps(var,srcreg)
#define STOREU_PS(var,srcreg)         _mm_storeu_ps(var,srcreg)
#define STREAM_PS(var,srcreg)         _mm_stream_ps(var,srcreg)
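/* Added note: STREAM_PS maps to the non-temporal store movntps, which
   bypasses the cache; a run of streaming stores is normally followed by
   _mm_sfence() (also provided by xmmintrin.h) so the data is globally
   visible before it is read again. */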

/* Register-Register Copy Macros */
#define COPY_SS(dstreg,srcreg)        dstreg = _mm_move_ss(dstreg,srcreg)
#define COPY_PS(dstreg,srcreg)        dstreg = srcreg

/* Load Macros */
#define LOAD_SS(var,dstreg)           dstreg = _mm_load_ss(var)
#define LOADL_PS(var,dstreg)          dstreg = _mm_loadl_pi(dstreg,(__m64 *)(var))
#define LOADH_PS(var,dstreg)          dstreg = _mm_loadh_pi(dstreg,(__m64 *)(var))
#define LOAD_PS(var,dstreg)           dstreg = _mm_load_ps(var)
#define LOADU_PS(var,dstreg)          dstreg = _mm_loadu_ps(var)

/* Shuffle */
#define SHUFFLE(dstreg,srcreg,i)      dstreg = _mm_shuffle_ps(dstreg,srcreg,i)
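/*
   Added note: the immediate is most conveniently built with the
   _MM_SHUFFLE(z,y,x,w) helper from xmmintrin.h.  Because the macro is
   dstreg = _mm_shuffle_ps(dstreg,srcreg,i), the low two result
   elements are selected from dstreg (by w and x) and the high two from
   srcreg (by y and z), so

     SHUFFLE(XMM0,XMM0,_MM_SHUFFLE(0,1,2,3));

   reverses the four floats of XMM0.
*/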

/* Multiply: A:=A*B */
#define MULT_SS(dstreg,srcreg)        dstreg = _mm_mul_ss(dstreg,srcreg)
#define MULT_PS(dstreg,srcreg)        dstreg = _mm_mul_ps(dstreg,srcreg)
#define MULT_SS_M(dstreg,var)         dstreg = _mm_mul_ss(dstreg,_mm_load_ss(var))
#define MULT_PS_M(dstreg,var)         dstreg = _mm_mul_ps(dstreg,_mm_load_ps(var))

/* Divide: A:=A/B */
#define DIV_SS(dstreg,srcreg)         dstreg = _mm_div_ss(dstreg,srcreg)
#define DIV_PS(dstreg,srcreg)         dstreg = _mm_div_ps(dstreg,srcreg)
#define DIV_SS_M(dstreg,var)          dstreg = _mm_div_ss(dstreg,_mm_load_ss(var))
#define DIV_PS_M(dstreg,var)          dstreg = _mm_div_ps(dstreg,_mm_load_ps(var))

/* Reciprocal: A:=1/B */
/* Note: rcpss/rcpps return an approximation accurate to roughly 12 bits */
#define RECIP_SS(dstreg,srcreg)       dstreg = _mm_rcp_ss(srcreg)
#define RECIP_PS(dstreg,srcreg)       dstreg = _mm_rcp_ps(srcreg)
#define RECIP_SS_M(dstreg,var)        dstreg = _mm_rcp_ss(_mm_load_ss(var))
#define RECIP_PS_M(dstreg,var)        dstreg = _mm_rcp_ps(_mm_load_ps(var))

/* Add: A:=A+B */
#define ADD_SS(dstreg,srcreg)         dstreg = _mm_add_ss(dstreg,srcreg)
#define ADD_PS(dstreg,srcreg)         dstreg = _mm_add_ps(dstreg,srcreg)
#define ADD_SS_M(dstreg,var)          dstreg = _mm_add_ss(dstreg,_mm_load_ss(var))
#define ADD_PS_M(dstreg,var)          dstreg = _mm_add_ps(dstreg,_mm_load_ps(var))

/* Subtract: A:=A-B */
#define SUB_SS(dstreg,srcreg)         dstreg = _mm_sub_ss(dstreg,srcreg)
#define SUB_PS(dstreg,srcreg)         dstreg = _mm_sub_ps(dstreg,srcreg)
#define SUB_SS_M(dstreg,var)          dstreg = _mm_sub_ss(dstreg,_mm_load_ss(var))
#define SUB_PS_M(dstreg,var)          dstreg = _mm_sub_ps(dstreg,_mm_load_ps(var))

/* Logical: A:=A<op>B */
#define AND_SS(dstreg,srcreg)         dstreg = _mm_and_ss(dstreg,srcreg)
#define ANDNOT_SS(dstreg,srcreg)      dstreg = _mm_andnot_ss(dstreg,srcreg)
#define OR_SS(dstreg,srcreg)          dstreg = _mm_or_ss(dstreg,srcreg)
#define XOR_SS(dstreg,srcreg)         dstreg = _mm_xor_ss(dstreg,srcreg)

#define AND_PS(dstreg,srcreg)         dstreg = _mm_and_ps(dstreg,srcreg)
#define ANDNOT_PS(dstreg,srcreg)      dstreg = _mm_andnot_ps(dstreg,srcreg)
#define OR_PS(dstreg,srcreg)          dstreg = _mm_or_ps(dstreg,srcreg)
#define XOR_PS(dstreg,srcreg)         dstreg = _mm_xor_ps(dstreg,srcreg)

/* Implementing an if():
   First perform the comparison, then use MOVEMASK to reduce the result to an integer, say i, then
   if (i) ....
*/
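/*
   Sketch of the pattern just described (added commentary), testing
   whether any element of XMM0 is negative.  XMM7 is assumed to hold
   0.0f in all four lanes; mask receives one bit per 32-bit lane:

     int mask;
     SSE_SCOPE_BEGIN
       ... load data into XMM0, zero XMM7 ...
       CMPLT_PS(XMM0,XMM7);
       MOVEMASK(mask,XMM0);
       if (mask) {
         ... at least one element was negative ...
       }
     SSE_SCOPE_END
*/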

/*
   Note: From the IA Software Developer's Manual:
   The greater-than relations not implemented in hardware require more than one instruction to
   emulate in software and therefore should not be implemented as pseudo-ops. (For these, the
   programmer should reverse the operands of the corresponding less-than relations and use move
   instructions to ensure that the mask is moved to the correct destination register and that the
   source operand is left intact.)
*/

/* Comparisons: A:=A<compare>B */
#define CMPEQ_SS(dstreg,srcreg)       dstreg = _mm_cmpeq_ss(dstreg,srcreg)
#define CMPLT_SS(dstreg,srcreg)       dstreg = _mm_cmplt_ss(dstreg,srcreg)
#define CMPLE_SS(dstreg,srcreg)       dstreg = _mm_cmple_ss(dstreg,srcreg)
#define CMPUNORD_SS(dstreg,srcreg)    dstreg = _mm_cmpunord_ss(dstreg,srcreg)
#define CMPNEQ_SS(dstreg,srcreg)      dstreg = _mm_cmpneq_ss(dstreg,srcreg)
#define CMPNLT_SS(dstreg,srcreg)      dstreg = _mm_cmpnlt_ss(dstreg,srcreg)
#define CMPNLE_SS(dstreg,srcreg)      dstreg = _mm_cmpnle_ss(dstreg,srcreg)
#define CMPORD_SS(dstreg,srcreg)      dstreg = _mm_cmpord_ss(dstreg,srcreg)

#define CMPEQ_PS(dstreg,srcreg)       dstreg = _mm_cmpeq_ps(dstreg,srcreg)
#define CMPLT_PS(dstreg,srcreg)       dstreg = _mm_cmplt_ps(dstreg,srcreg)
#define CMPLE_PS(dstreg,srcreg)       dstreg = _mm_cmple_ps(dstreg,srcreg)
#define CMPUNORD_PS(dstreg,srcreg)    dstreg = _mm_cmpunord_ps(dstreg,srcreg)
#define CMPNEQ_PS(dstreg,srcreg)      dstreg = _mm_cmpneq_ps(dstreg,srcreg)
#define CMPNLT_PS(dstreg,srcreg)      dstreg = _mm_cmpnlt_ps(dstreg,srcreg)
#define CMPNLE_PS(dstreg,srcreg)      dstreg = _mm_cmpnle_ps(dstreg,srcreg)
#define CMPORD_PS(dstreg,srcreg)      dstreg = _mm_cmpord_ps(dstreg,srcreg)

/* ================================================================================================ */

PETSC_EXTERN_CXX_END
#endif /* __ICL_SSE_H_ */