#include <xmmintrin.h>

# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END __attribute__((aligned(16)))

# include <emmintrin.h>

#define _PS_CONST(Name, Val) \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
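
/* each constant is stored as a 16-byte aligned array of four identical
   elements, so a *(v4sf*) / *(v4si*) cast loads it straight into an SSE
   register */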
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS_CONST_TYPE(sign_mask, int, 0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST(inv1, ~1);
_PI32_CONST(0x7f, 0x7f);

_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);
#if defined (__MINGW32__)

inline __m128 my_movehl_ps(__m128 a, const __m128 b) {

#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
#define _mm_movehl_ps my_movehl_ps

inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {

inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {

inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {

#warning "redefined _mm_cmpxx_ps functions..."
#define _mm_cmplt_ps my_cmplt_ps
#define _mm_cmpgt_ps my_cmpgt_ps
#define _mm_cmpeq_ps my_cmpeq_ps
#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
    xmm_mm_union u; u.xmm = xmm_; \

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
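
/* natural logarithm computed for 4 simultaneous floats; lanes with an
   input <= 0 come out as NaN (see the invalid_mask handling below) */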
inline v4sf log_ps(v4sf x) {
  v4sf one = *(v4sf*)_ps_1;

  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);
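  /* denormals were just clamped away; extract the exponent by shifting the
     IEEE-754 bit pattern of x right by 23 */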
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);

  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);

  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);
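  /* keep only the mantissa bits and OR in 0.5, which maps x into [0.5, 1) */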
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);

  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);

  e = _mm_add_ps(e, one);
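  /* if x < SQRTHF then e -= 1 and x = x + x - 1.0, else x = x - 1.0
     (the usual cephes split around sqrt(1/2)) */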
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);

  v4sf z = _mm_mul_ps(x,x);
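  /* evaluate the degree-8 log polynomial in x with Horner's scheme */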
  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask);
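
/* exp() computed for 4 floats at once */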
_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);

inline v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;

  v4sf one = *(v4sf*)_ps_1;

  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
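  /* express exp(x) as exp(g + n*log(2)); fx = x*log2(e) + 0.5 here, and the
     truncate-and-correct sequence below computes n = floor(fx) */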
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);

  tmp = _mm_cvtpi32x2_ps(mm0, mm1);

  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);

  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);
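  /* x now holds the reduced argument g = x - n*ln(2); ln(2) is split into
     exp_C1 + exp_C2 so the subtraction keeps extra precision */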
  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);
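  /* build 2^n by placing n + 127 into the exponent field of a float */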
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  COPY_MM_TO_XMM(mm0, mm1, pow2n);

  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);

  y = _mm_mul_ps(y, pow2n);
_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1, 8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516);
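
/* evaluation of 4 sines at once; FOPI above is 4/pi, and the argument
   reduction and polynomial coefficients follow the cephes sinf routine */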
inline v4sf sin_ps(v4sf x) {
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

  v2si mm0, mm1, mm2, mm3;

  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
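  /* y = |x| * 4/pi; its truncated integer part j picks the octant, and
     j = (j+1) & ~1 rounds it to an even value (see the cephes sources) */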
  emm2 = _mm_cvttps_epi32(y);

  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);

  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);

  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf swap_sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
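  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3 */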
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);
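  /* evaluate the first polynomial (the cosine approximation, 0 <= x <= pi/4) */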
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);
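  /* evaluate the second polynomial (the sine approximation) */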
  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);
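  /* select the correct result from the two polynomials per lane, then apply
     the sign computed from the octant */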
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);

  y = _mm_xor_ps(y, sign_bit);
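
/* almost the same as sin_ps: cosine of 4 floats at once */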
inline v4sf cos_ps(v4sf x) {
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;

  v2si mm0, mm1, mm2, mm3;

  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

  emm2 = _mm_cvttps_epi32(y);

  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);

  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);

  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);

  y = _mm_xor_ps(y, sign_bit);
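
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of
   them: it is almost as fast, and gives you a free cosine with your sine */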
inline void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;

  v4si emm0, emm2, emm4;

  v2si mm0, mm1, mm2, mm3, mm4, mm5;

  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

  emm2 = _mm_cvttps_epi32(y);

  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf poly_mask = _mm_castsi128_ps(emm2);

  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  v4sf swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  COPY_MM_TO_XMM(mm2, mm3, poly_mask);

  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);
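  /* compute the quadrant-dependent sign bit for the cosine result */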
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);

  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);

  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  v4sf z = _mm_mul_ps(x,x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  v4sf ysin2 = _mm_and_ps(xmm3, y2);
  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2,ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1,ysin2);
  xmm2 = _mm_add_ps(y,y2);
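  /* update the sign of the sine and cosine results and write them out */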
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);