__m128 one = _mm_set_ps1(1.0);

/* x <= 0 is invalid: remember those lanes so they can be forced to NaN at the end */
__m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

/* cut off denormalized inputs (clamp to the smallest normalized float) */
x = _mm_max_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x00800000)));

/* emm0 receives the biased exponent bits */
emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);

/* keep only the mantissa and force it into the range [0.5, 1) */
x = _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(~0x7f800000)));
x = _mm_or_ps(x, _mm_set_ps1(0.5));

/* remove the exponent bias and convert it to float */
emm0 = _mm_sub_epi32(emm0, _mm_set1_epi32(0x7f));
__m128 e = _mm_cvtepi32_ps(emm0);

e = _mm_add_ps(e, one);

/* if x < sqrt(1/2): e -= 1 and x = x + x - 1.0, else x = x - 1.0 */
__m128 mask = _mm_cmplt_ps(x, _mm_set_ps1(0.707106781186547524));
__m128 tmp = _mm_and_ps(x, mask);
x = _mm_sub_ps(x, one);
e = _mm_sub_ps(e, _mm_and_ps(one, mask));
x = _mm_add_ps(x, tmp);

__m128 z = _mm_mul_ps(x, x);

/* polynomial approximation of log(1+x), evaluated with Horner's scheme */
__m128 y = _mm_set_ps1(7.0376836292E-2);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set_ps1(-1.1514610310E-1));
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set_ps1(1.1676998740E-1));
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set_ps1(-1.2420140846E-1));
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set_ps1(1.4249322787E-1));
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set_ps1(-1.6668057665E-1));
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set_ps1(2.0000714765E-1));
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set_ps1(-2.4999993993E-1));
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set_ps1(3.3333331174E-1));
y = _mm_mul_ps(y, x);

y = _mm_mul_ps(y, z);

/* add e * log(2), split into a low and a high constant for extra precision */
tmp = _mm_mul_ps(e, _mm_set_ps1(-2.12194440e-4));
y = _mm_add_ps(y, tmp);

tmp = _mm_mul_ps(z, _mm_set_ps1(0.5));
y = _mm_sub_ps(y, tmp);

tmp = _mm_mul_ps(e, _mm_set_ps1(0.693359375));
x = _mm_add_ps(x, y);
x = _mm_add_ps(x, tmp);
x = _mm_or_ps(x, invalid_mask); /* lanes with x <= 0 become NaN */
 
__m128 tmp = _mm_setzero_ps(), fx;
__m128i emm0;
__m128 one = _mm_set_ps1(1.0);

/* clamp the argument to the range where expf neither overflows nor underflows */
x = _mm_min_ps(x, _mm_set_ps1(88.3762626647949f));
x = _mm_max_ps(x, _mm_set_ps1(-88.3762626647949f));

/* express exp(x) as exp(g + n*log(2)): fx = x * log2(e), rounded to nearest */
fx = _mm_mul_ps(x, _mm_set_ps1(1.44269504088896341));
fx = _mm_add_ps(fx, _mm_set_ps1(0.5));

/* floor(fx) without SSE4.1: truncate, then subtract 1 where truncation rounded up */
emm0 = _mm_cvttps_epi32(fx);
tmp  = _mm_cvtepi32_ps(emm0);

__m128 mask = _mm_cmpgt_ps(tmp, fx);
mask = _mm_and_ps(mask, one);
fx = _mm_sub_ps(tmp, mask);

/* subtract fx * log(2), split into two constants for extra precision */
tmp = _mm_mul_ps(fx, _mm_set_ps1(0.693359375));
__m128 z = _mm_mul_ps(fx, _mm_set_ps1(-2.12194440e-4));
x = _mm_sub_ps(x, tmp);
x = _mm_sub_ps(x, z);

z = _mm_mul_ps(x, x);

/* polynomial approximation of exp(x) on the reduced range */
__m128 y = _mm_set_ps1(1.9875691500E-4);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set_ps1(1.3981999507E-3));
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set_ps1(8.3334519073E-3));
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set_ps1(4.1665795894E-2));
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set_ps1(1.6666665459E-1));
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set_ps1(5.0000001201E-1));
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, x);
y = _mm_add_ps(y, one);

/* build 2^n by writing n + 127 into the exponent field of a float */
emm0 = _mm_cvttps_epi32(fx);
emm0 = _mm_add_epi32(emm0, _mm_set1_epi32(0x7f));
emm0 = _mm_slli_epi32(emm0, 23);
__m128 pow2n = _mm_castsi128_ps(emm0);
y = _mm_mul_ps(y, pow2n);
 
__m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
__m128i emm0, emm2;

sign_bit = x;
/* take the absolute value */
const __m128 inv_sign_mask = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));
x = _mm_and_ps(x, inv_sign_mask);
/* extract the sign bit (the upper one) */
const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
sign_bit = _mm_and_ps(sign_bit, sign_mask);

/* scale by 4/Pi */
const __m128 cephes_FOPI = _mm_set_ps1(1.27323954473516);
y = _mm_mul_ps(x, cephes_FOPI);

/* store the integer part of y in emm2 */
emm2 = _mm_cvttps_epi32(y);
/* j = (j+1) & (~1), see the Cephes sources */
emm2 = _mm_add_epi32(emm2, _mm_set1_epi32(1));
emm2 = _mm_and_si128(emm2, _mm_set1_epi32(~1));
y = _mm_cvtepi32_ps(emm2);

/* get the swap-sign flag */
emm0 = _mm_and_si128(emm2, _mm_set1_epi32(4));
emm0 = _mm_slli_epi32(emm0, 29);
/* get the polynomial selection mask: one polynomial for the lower half of the
   quadrant, the other for the upper half */
emm2 = _mm_and_si128(emm2, _mm_set1_epi32(2));
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

__m128 swap_sign_bit = _mm_castsi128_ps(emm0);
__m128 poly_mask = _mm_castsi128_ps(emm2);
sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);

/* the magic pass: extended-precision modular arithmetic,
   x = ((x - y * DP1) - y * DP2) - y * DP3 */
xmm1 = _mm_set_ps1(-0.78515625);
xmm2 = _mm_set_ps1(-2.4187564849853515625e-4);
xmm3 = _mm_set_ps1(-3.77489497744594108e-8);
xmm1 = _mm_mul_ps(y, xmm1);
xmm2 = _mm_mul_ps(y, xmm2);
xmm3 = _mm_mul_ps(y, xmm3);
x = _mm_add_ps(x, xmm1);
x = _mm_add_ps(x, xmm2);
x = _mm_add_ps(x, xmm3);

/* evaluate the first polynomial (the cosine series, 0 <= x <= Pi/4) */
y = _mm_set_ps1(2.443315711809948E-005);
__m128 z = _mm_mul_ps(x, x);

y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, _mm_set_ps1(-1.388731625493765E-003));
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, _mm_set_ps1(4.166664568298827E-002));
y = _mm_mul_ps(y, z);
y = _mm_mul_ps(y, z);
__m128 tmp = _mm_mul_ps(z, _mm_set_ps1(0.5));
y = _mm_sub_ps(y, tmp);
y = _mm_add_ps(y, _mm_set_ps1(1.0));

/* evaluate the second polynomial (the sine series) */
__m128 y2 = _mm_set_ps1(-1.9515295891E-4);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, _mm_set_ps1(8.3321608736E-3));
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, _mm_set_ps1(-1.6666654611E-1));
y2 = _mm_mul_ps(y2, z);
y2 = _mm_mul_ps(y2, x);
y2 = _mm_add_ps(y2, x);

/* select the correct result from the two polynomials */
xmm3 = poly_mask;
y2 = _mm_and_ps(xmm3, y2);
y = _mm_andnot_ps(xmm3, y);
y = _mm_add_ps(y, y2);
/* update the sign */
y = _mm_xor_ps(y, sign_bit);
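
/* For reference (not part of the original listing): a scalar sketch of the
   quadrant logic used above.  sin_scalar_sketch is a made-up name, valid for
   moderate |x| just like the SSE version.  j counts octants of Pi/4 and is
   rounded to an even value; bit 1 of j picks the polynomial (sine vs. cosine
   series) and bit 2 flips the sign -- exactly what the emm0/emm2 masks do,
   four lanes at a time. */
#include <math.h>

static float sin_scalar_sketch(float x) {
  float sign = (x < 0.0f) ? -1.0f : 1.0f;
  x = fabsf(x);
  unsigned j = (unsigned)(x * 1.27323954473516f);        /* x * 4/Pi, truncated */
  j = (j + 1) & ~1u;
  float y = (float)j;
  if (j & 4) sign = -sign;                                /* quadrant sign flip  */
  /* extended-precision subtraction of y * Pi/4 in three parts */
  x = ((x + y * -0.78515625f)
         + y * -2.4187564849853515625e-4f)
         + y * -3.77489497744594108e-8f;
  float z = x * x;
  float p;
  if (j & 2) {
    /* cosine series */
    p = ((2.443315711809948E-005f * z - 1.388731625493765E-003f) * z
         + 4.166664568298827E-002f) * z * z - 0.5f * z + 1.0f;
  } else {
    /* sine series */
    p = ((-1.9515295891E-4f * z + 8.3321608736E-3f) * z
         - 1.6666654611E-1f) * z * x + x;
  }
  return sign * p;
}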
 
 
__m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
__m128i emm0, emm2;

/* take the absolute value */
const __m128 inv_sign_mask = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));
x = _mm_and_ps(x, inv_sign_mask);

/* scale by 4/Pi */
const __m128 cephes_FOPI = _mm_set_ps1(1.27323954473516);
y = _mm_mul_ps(x, cephes_FOPI);

/* store the integer part of y in emm2 */
emm2 = _mm_cvttps_epi32(y);
/* j = (j+1) & (~1), see the Cephes sources */
emm2 = _mm_add_epi32(emm2, _mm_set1_epi32(1));
emm2 = _mm_and_si128(emm2, _mm_set1_epi32(~1));
y = _mm_cvtepi32_ps(emm2);

emm2 = _mm_sub_epi32(emm2, _mm_set1_epi32(2));

/* get the swap-sign flag */
emm0 = _mm_andnot_si128(emm2, _mm_set1_epi32(4));
emm0 = _mm_slli_epi32(emm0, 29);
/* get the polynomial selection mask */
emm2 = _mm_and_si128(emm2, _mm_set1_epi32(2));
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

__m128 sign_bit = _mm_castsi128_ps(emm0);
__m128 poly_mask = _mm_castsi128_ps(emm2);

/* the magic pass: extended-precision modular arithmetic,
   x = ((x - y * DP1) - y * DP2) - y * DP3 */
xmm1 = _mm_set_ps1(-0.78515625);
xmm2 = _mm_set_ps1(-2.4187564849853515625e-4);
xmm3 = _mm_set_ps1(-3.77489497744594108e-8);
xmm1 = _mm_mul_ps(y, xmm1);
xmm2 = _mm_mul_ps(y, xmm2);
xmm3 = _mm_mul_ps(y, xmm3);
x = _mm_add_ps(x, xmm1);
x = _mm_add_ps(x, xmm2);
x = _mm_add_ps(x, xmm3);

/* evaluate the first polynomial (the cosine series, 0 <= x <= Pi/4) */
y = _mm_set_ps1(2.443315711809948E-005);
__m128 z = _mm_mul_ps(x, x);

y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, _mm_set_ps1(-1.388731625493765E-003));
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, _mm_set_ps1(4.166664568298827E-002));
y = _mm_mul_ps(y, z);
y = _mm_mul_ps(y, z);
__m128 tmp = _mm_mul_ps(z, _mm_set_ps1(0.5));
y = _mm_sub_ps(y, tmp);
y = _mm_add_ps(y, _mm_set_ps1(1.0));

/* evaluate the second polynomial (the sine series) */
__m128 y2 = _mm_set_ps1(-1.9515295891E-4);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, _mm_set_ps1(8.3321608736E-3));
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, _mm_set_ps1(-1.6666654611E-1));
y2 = _mm_mul_ps(y2, z);
y2 = _mm_mul_ps(y2, x);
y2 = _mm_add_ps(y2, x);

/* select the correct result from the two polynomials */
xmm3 = poly_mask;
y2 = _mm_and_ps(xmm3, y2);
y = _mm_andnot_ps(xmm3, y);
y = _mm_add_ps(y, y2);
/* update the sign */
y = _mm_xor_ps(y, sign_bit);
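
/* Side note (illustrative, not from the original listing): the and/andnot/add
   trio above is the classic SSE2 per-lane select with a mask of all-ones or
   all-zeros per lane (here poly_mask).  One of the two terms is always +0.0f
   in every lane, so the add simply passes the other one through; _mm_or_ps
   would give a bit-exact select as well.  The helper name select_ps_sketch is
   made up. */
#include <xmmintrin.h>

static __m128 select_ps_sketch(__m128 mask, __m128 a, __m128 b) {
  /* per lane: mask ? a : b */
  return _mm_add_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
}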
 
 
__m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
__m128i emm0, emm2, emm4;

sign_bit_sin = x;
/* take the absolute value */
const __m128 inv_sign_mask = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));
x = _mm_and_ps(x, inv_sign_mask);
/* extract the sign bit (the upper one) */
const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
sign_bit_sin = _mm_and_ps(sign_bit_sin, sign_mask);

/* scale by 4/Pi */
const __m128 cephes_FOPI = _mm_set_ps1(1.27323954473516);
y = _mm_mul_ps(x, cephes_FOPI);

/* store the integer part of y in emm2 */
emm2 = _mm_cvttps_epi32(y);

/* j = (j+1) & (~1), see the Cephes sources */
emm2 = _mm_add_epi32(emm2, _mm_set1_epi32(1));
emm2 = _mm_and_si128(emm2, _mm_set1_epi32(~1));
y = _mm_cvtepi32_ps(emm2);

emm4 = emm2;

/* get the swap-sign flag for the sine */
emm0 = _mm_and_si128(emm2, _mm_set1_epi32(4));
emm0 = _mm_slli_epi32(emm0, 29);
__m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);

/* get the polynomial selection mask for the sine */
emm2 = _mm_and_si128(emm2, _mm_set1_epi32(2));
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
__m128 poly_mask = _mm_castsi128_ps(emm2);

/* the magic pass: extended-precision modular arithmetic,
   x = ((x - y * DP1) - y * DP2) - y * DP3 */
xmm1 = _mm_set_ps1(-0.78515625);
xmm2 = _mm_set_ps1(-2.4187564849853515625e-4);
xmm3 = _mm_set_ps1(-3.77489497744594108e-8);
xmm1 = _mm_mul_ps(y, xmm1);
xmm2 = _mm_mul_ps(y, xmm2);
xmm3 = _mm_mul_ps(y, xmm3);
x = _mm_add_ps(x, xmm1);
x = _mm_add_ps(x, xmm2);
x = _mm_add_ps(x, xmm3);

/* get the sign flag for the cosine */
emm4 = _mm_sub_epi32(emm4, _mm_set1_epi32(2));
emm4 = _mm_andnot_si128(emm4, _mm_set1_epi32(4));
emm4 = _mm_slli_epi32(emm4, 29);
__m128 sign_bit_cos = _mm_castsi128_ps(emm4);

sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

/* evaluate the first polynomial (the cosine series, 0 <= x <= Pi/4) */
__m128 z = _mm_mul_ps(x, x);
y = _mm_set_ps1(2.443315711809948E-005);

y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, _mm_set_ps1(-1.388731625493765E-003));
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, _mm_set_ps1(4.166664568298827E-002));
y = _mm_mul_ps(y, z);
y = _mm_mul_ps(y, z);
__m128 tmp = _mm_mul_ps(z, _mm_set_ps1(0.5));
y = _mm_sub_ps(y, tmp);
y = _mm_add_ps(y, _mm_set_ps1(1.0));

/* evaluate the second polynomial (the sine series) */
__m128 y2 = _mm_set_ps1(-1.9515295891E-4);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, _mm_set_ps1(8.3321608736E-3));
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, _mm_set_ps1(-1.6666654611E-1));
y2 = _mm_mul_ps(y2, z);
y2 = _mm_mul_ps(y2, x);
y2 = _mm_add_ps(y2, x);

/* select the correct result from the two polynomials */
xmm3 = poly_mask;
__m128 ysin2 = _mm_and_ps(xmm3, y2);
__m128 ysin1 = _mm_andnot_ps(xmm3, y);
y2 = _mm_sub_ps(y2, ysin2);
y = _mm_sub_ps(y, ysin1);

xmm1 = _mm_add_ps(ysin1, ysin2);
xmm2 = _mm_add_ps(y, y2);

/* update the signs */
*s = _mm_xor_ps(xmm1, sign_bit_sin);
*c = _mm_xor_ps(xmm2, sign_bit_cos);
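
/* Hypothetical driver (not part of the original listing): assuming the body
   above is wrapped as  void sincos_ps(__m128 x, __m128 *s, __m128 *c)  -- the
   usual sse_mathfun-style signature -- this checks the four lanes against the
   scalar sinf()/cosf(). */
#include <math.h>
#include <stdio.h>
#include <emmintrin.h>

void sincos_ps(__m128 x, __m128 *s, __m128 *c);   /* assumed wrapper */

int main(void) {
  float in[4] = { -1.5f, 0.0f, 0.5f, 3.1f };
  float sv[4], cv[4];
  __m128 s, c;
  sincos_ps(_mm_loadu_ps(in), &s, &c);
  _mm_storeu_ps(sv, s);
  _mm_storeu_ps(cv, c);
  for (int i = 0; i < 4; ++i)
    printf("x=% .3f  sin: %+.7f (ref %+.7f)  cos: %+.7f (ref %+.7f)\n",
           in[i], sv[i], sinf(in[i]), cv[i], cosf(in[i]));
  return 0;
}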