FOX/ObjCryst++  1.10.X (development)
sse_mathfun.h
/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log

   Inspired by the Intel Approximate Math library, and based on the
   corresponding algorithms of the cephes math library.

   The default is to use the SSE1 version. If you define USE_SSE2, the
   SSE2 intrinsics will be used in place of the MMX intrinsics. Do not
   expect any significant performance improvement with SSE2.

   NOTE:
   - Original version available at http://gruntthepeon.free.fr/ssemath/
   - Modifications for inclusion in ObjCryst++ (http://objcryst.sf.net):
     - functions are now inline.
*/

/* Copyright (C) 2007 Julien Pommier

   This software is provided 'as-is', without any express or implied
   warranty. In no event will the authors be held liable for any damages
   arising from the use of this software.

   Permission is granted to anyone to use this software for any purpose,
   including commercial applications, and to alter it and redistribute it
   freely, subject to the following restrictions:

   1. The origin of this software must not be misrepresented; you must not
      claim that you wrote the original software. If you use this software
      in a product, an acknowledgment in the product documentation would be
      appreciated but is not required.
   2. Altered source versions must be plainly marked as such, and must not be
      misrepresented as being the original software.
   3. This notice may not be removed or altered from any source distribution.

   (this is the zlib license)
*/

#include <xmmintrin.h>

/* yes I know, the top of this file is quite ugly */

#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

/* __m128 is ugly to write */
typedef __m128 v4sf;  // vector of 4 float (sse1)

#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 int (sse2)
#else
typedef __m64 v2si;   // vector of 2 int (mmx)
#endif

/* declare some SSE constants -- why can't I figure out a better way to do that? */
#define _PS_CONST(Name, Val) \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }

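/* For reference, _PS_CONST(0p5, 0.5f) expands to

     static const ALIGN16_BEG float _ps_0p5[4] ALIGN16_END = { 0.5f, 0.5f, 0.5f, 0.5f };

   i.e. a 16-byte aligned array of four identical floats, which the functions
   below load as a packed vector via *(v4sf*)_ps_0p5. */
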
_PS_CONST(1, 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non-denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS_CONST_TYPE(sign_mask, int, 0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);

_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, -1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, -1.2420140846E-1);
_PS_CONST(cephes_log_p4, +1.4249322787E-1);
_PS_CONST(cephes_log_p5, -1.6668057665E-1);
_PS_CONST(cephes_log_p6, +2.0000714765E-1);
_PS_CONST(cephes_log_p7, -2.4999993993E-1);
_PS_CONST(cephes_log_p8, +3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);

#if defined (__MINGW32__)

/* The ugly part below: many versions of gcc used to be completely buggy with respect
   to some intrinsics. The movehl_ps bug is fixed in mingw 3.4.5, but I found out that
   all the _mm_cmp* intrinsics were completely broken on my mingw gcc 3.4.5 ...

   Note that the _mm_cmp* bug occurs only at the -O0 optimization level.
*/

inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
  asm (
    "movhlps %2,%0\n\t"
    : "=x" (a)
    : "0" (a), "x" (b)
  );
  return a;
}
#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
#define _mm_movehl_ps my_movehl_ps

inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
  asm (
    "cmpltps %2,%0\n\t"
    : "=x" (a)
    : "0" (a), "x" (b)
  );
  return a;
}
inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
  asm (
    "cmpnleps %2,%0\n\t"
    : "=x" (a)
    : "0" (a), "x" (b)
  );
  return a;
}
inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
  asm (
    "cmpeqps %2,%0\n\t"
    : "=x" (a)
    : "0" (a), "x" (b)
  );
  return a;
}
#warning "redefined _mm_cmpxx_ps functions..."
#define _mm_cmplt_ps my_cmplt_ps
#define _mm_cmpgt_ps my_cmpgt_ps
#define _mm_cmpeq_ps my_cmpeq_ps
#endif

#ifndef USE_SSE2
typedef union xmm_mm_union {
  __m128 xmm;
  __m64 mm[2];
} xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
  xmm_mm_union u; u.xmm = xmm_;            \
  mm0_ = u.mm[0];                          \
  mm1_ = u.mm[1];                          \
}

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                      \
  xmm_mm_union u; u.mm[0] = mm0_; u.mm[1] = mm1_; xmm_ = u.xmm; \
}

#endif // USE_SSE2

/* natural logarithm computed for 4 simultaneous floats;
   returns NaN for x <= 0
*/
inline v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* now e=mm0:mm1 contains the true base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part 2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);

  v4sf z = _mm_mul_ps(x, x);

  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask); // negative arg will be NaN
  return x;
}
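
/* Usage sketch (not part of the original library; the helper name and the
   plain-array convention are ours): compute four natural logs at once from
   an ordinary float array, without any alignment requirement. */
inline void example_log_ps(const float in[4], float out[4]) {
  v4sf v = _mm_loadu_ps(in);   /* unaligned load of 4 packed floats */
  v = log_ps(v);               /* log of each lane; lanes <= 0 become NaN */
  _mm_storeu_ps(out, v);       /* unaligned store of the 4 results */
}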

_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);

inline v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: see just below */
#ifndef USE_SSE2
  /* step 1: cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2: cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, subtract 1 */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x, x);

  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n */
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  v4sf pow2n;
  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  return y;
}
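
/* Usage sketch (ours, not part of the original library): exp of four floats
   at once. Inputs are clamped internally to roughly [-88.38, 88.38], so
   out-of-range lanes saturate instead of overflowing to Inf. */
inline void example_exp_ps(const float in[4], float out[4]) {
  v4sf v = _mm_loadu_ps(in);
  _mm_storeu_ps(out, exp_ps(v));
}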

_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1, 8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI

/* Evaluation of 4 sines at once, using only SSE1+MMX intrinsics, so it also
   runs on old Athlon XPs and on your grandmother's Pentium III.

   The code is an exact rewrite of the cephes sinf function. Precision is
   excellent as long as x < 8192 (I did not bother to implement the special
   handling cephes has for larger values -- it does not return garbage for
   arguments over 8192, but the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

   Performance is also surprisingly good: 1.33 times faster than the macOS
   vsinf SSE2 function, and 1.5 times faster than the __vrs4_sinf of AMD's
   ACML (which is only available in 64-bit). Not too bad for an SSE1 function
   with no special tuning! However, the latter libraries probably have much
   better handling of NaN, Inf, denormalized and other special arguments.

   On my Core Duo, the execution of this function takes approximately 95 cycles.

   From what I have observed in experiments with the Intel AMath lib, switching
   to an SSE2 version would improve the performance by only 10%.

   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
   deliver full speed.
*/
inline v4sf sin_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

  //printf("plop:"); print4(y);
#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);
  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask:
     there is one polynomial for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2.

     Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynomial selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf swap_sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  _mm_empty(); /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x, x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}
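
/* Usage sketch (ours): sine of four angles given in radians. cos_ps below has
   the same interface, and sincos_ps computes both from a single range
   reduction. */
inline void example_sin_ps(const float angles[4], float sines[4]) {
  v4sf v = _mm_loadu_ps(angles);
  _mm_storeu_ps(sines, sin_ps(v));
}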

/* almost the same as sin_ps */
inline v4sf cos_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the
     polynomial selection mask in mm2:mm3 */

  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x, x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}

/* Since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them.
   It is almost as fast, and gives you a free cosine with your sine. */
inline void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
  v4si emm0, emm2, emm4;
#else
  v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynomial selection mask for the sine */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2;
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  v4sf swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynomial selection mask for the sine */

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf poly_mask;
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  v4sf sign_bit_cos;
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty(); /* good-bye mmx */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  v4sf z = _mm_mul_ps(x, x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  v4sf ysin2 = _mm_and_ps(xmm3, y2);
  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2, ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1, ysin2);
  xmm2 = _mm_add_ps(y, y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
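
/* Usage sketch (ours): both sine and cosine of four angles from a single
   range reduction, mirroring the "free cosine" remark above. */
inline void example_sincos_ps(const float angles[4], float sines[4], float cosines[4]) {
  v4sf s, c;
  sincos_ps(_mm_loadu_ps(angles), &s, &c);
  _mm_storeu_ps(sines, s);
  _mm_storeu_ps(cosines, c);
}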