// Excerpt (numbered listing) from a routine operating on four packed floats.
// The function header and several source lines are not part of this excerpt.
// Visible steps: clear the sign of x, scale by 4/pi and round to an even
// integer j, subtract j*(pi/4) in three pieces, evaluate an odd polynomial in
// the reduced argument z, blend the polynomial with a negated reciprocal
// through a lane mask, and finally re-apply the saved sign.
// NOTE(review): looks like a Cephes-style tangent/cotangent kernel - confirm
// against the enclosing function.
// Polynomial coefficients, each broadcast to all four lanes.
76    __m128 p0 = _mm_set_ps1(9.38540185543E-3);
 
   77    __m128 p1 = _mm_set_ps1(3.11992232697E-3);
 
   78    __m128 p2 = _mm_set_ps1(2.44301354525E-2);
 
   79    __m128 p3 = _mm_set_ps1(5.34112807005E-2);
 
   80    __m128 p4 = _mm_set_ps1(1.33387994085E-1);
 
   81    __m128 p5 = _mm_set_ps1(3.33331568548E-1);
 
         // xmm2 is zeroed here, but the next visible access to it (listing
         // line 115) overwrites it.
   83    __m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
 
         // sign_mask has only bit 31 set in every lane; inv_sign_mask is its
         // bitwise complement.
   88    __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
 
   89    __m128 inv_sign_mask = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));
 
         // x = |x| (sign bit cleared).
   90    x = _mm_and_ps(x, inv_sign_mask);
 
         // Keep only the sign bit of sign_bit.
         // NOTE(review): no visible line assigns sign_bit before this use;
         // presumably it is loaded from the original x on a line not shown - confirm.
   92    sign_bit = _mm_and_ps(sign_bit, sign_mask);
 
         // 1.27323954473516 is 4/pi, so y = |x| * 4/pi.
   95    __m128 cephes_FOPI = _mm_set_ps1(1.27323954473516);
 
   96    y = _mm_mul_ps(x, cephes_FOPI);
 
         // Truncating conversion to 32-bit integers: j = (int)y.
         // NOTE(review): the declaration of emm2 is not visible in this excerpt.
   99    emm2 = _mm_cvttps_epi32(y);
 
         // j = (j + 1) & ~1: an odd j is bumped to the next even value, an
         // even j is left unchanged.
  101    emm2 = _mm_add_epi32(emm2, _mm_set1_epi32(1));
 
  102    emm2 = _mm_and_si128(emm2, _mm_set1_epi32(~1));
 
         // y = (float)j.
  103    y = _mm_cvtepi32_ps(emm2);
 
         // poly_mask lanes are all-ones where bit 1 of j is clear, zero elsewhere.
  105    emm2 = _mm_and_si128(emm2, _mm_set1_epi32(2));
 
  106    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
 
  108    __m128 poly_mask = _mm_castsi128_ps(emm2);
 
         // The three constants below sum to approximately -pi/4. Adding y times
         // each of them to x in turn gives z = x - j*(pi/4), presumably split
         // this way to limit rounding error in the subtraction.
  111    __m128 minus_cephes_DP1 = _mm_set_ps1(-0.78515625);
 
  112    __m128 minus_cephes_DP2 = _mm_set_ps1(-2.4187564849853515625e-4);
 
  113    __m128 minus_cephes_DP3 = _mm_set_ps1(-3.77489497744594108e-8);
 
  114    xmm1 = minus_cephes_DP1;
 
  115    xmm2 = minus_cephes_DP2;
 
  116    xmm3 = minus_cephes_DP3;
 
  117    xmm1 = _mm_mul_ps(y, xmm1);
 
  118    xmm2 = _mm_mul_ps(y, xmm2);
 
  119    xmm3 = _mm_mul_ps(y, xmm3);
 
  120    __m128 z = _mm_add_ps(x, xmm1);
 
  121    z = _mm_add_ps(z, xmm2);
 
  122    z = _mm_add_ps(z, xmm3);
 
         // zz = z * z.
  124    __m128 zz = _mm_mul_ps(z, z);
 
         // Horner evaluation in zz:
         //   y = ((((((y*zz + p1)*zz + p2)*zz + p3)*zz + p4)*zz + p5)*zz)*z + z
         // NOTE(review): p0 is not referenced by any visible line and, as far as
         // this excerpt shows, y still holds (float)j at this point. Listing
         // lines 125-126 are absent - presumably y is seeded with p0 there; confirm.
  127    y = _mm_mul_ps(y, zz);
 
  128    y = _mm_add_ps(y, p1);
 
  129    y = _mm_mul_ps(y, zz);
 
  130    y = _mm_add_ps(y, p2);
 
  131    y = _mm_mul_ps(y, zz);
 
  132    y = _mm_add_ps(y, p3);
 
  133    y = _mm_mul_ps(y, zz);
 
  134    y = _mm_add_ps(y, p4);
 
  135    y = _mm_mul_ps(y, zz);
 
  136    y = _mm_add_ps(y, p5);
 
  137    y = _mm_mul_ps(y, zz);
 
  138    y = _mm_mul_ps(y, z);
 
  139    y = _mm_add_ps(y, z);
 
         // NOTE(review): the deeper indentation of listing lines 143-152
         // suggests two alternative branches whose conditions, and the
         // declaration of y2, are not visible here - confirm.
         // First group: y2 = -y, then y = 1/y.
  143        y2 = _mm_xor_ps(y, sign_mask);
 
  146        y = _mm_div_ps(_mm_set_ps1(1.f), y);
 
         // Second group: y2 = -(1/y); y is left unchanged.
  151        y2 = _mm_div_ps(_mm_set_ps1(1.f), y);
 
  152        y2 = _mm_xor_ps(y2, sign_mask);
 
         // Bitwise select: take y where xmm3 is set and y2 elsewhere.
         // NOTE(review): the last visible write to xmm3 is y*DP3 (listing line
         // 119); presumably a line not shown loads a lane mask (poly_mask?)
         // into xmm3 before this point - confirm.
  157    y = _mm_and_ps(xmm3, y);
 
  158    y2 = _mm_andnot_ps(xmm3, y2);
 
  159    y = _mm_or_ps(y, y2);
 
         // Flip the sign of the result in lanes where sign_bit has bit 31 set.
  162    y = _mm_xor_ps(y, sign_bit);
 
 
  // Excerpt (numbered listing) from a routine operating on four packed floats.
  // The function header and some source lines are not part of this excerpt.
  // Visible steps: take |x|, classify each lane into one of three ranges,
  // replace x by a range-specific substitute and pick a constant offset,
  // evaluate an odd polynomial in the substituted x, add the offset, and
  // re-apply the saved sign.
  // NOTE(review): this matches the usual range-reduced arctangent scheme
  // (Cephes atanf) - confirm against the enclosing function.
  // sign_mask has only bit 31 set in every lane; inv_sign_mask is its complement.
  177    __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
 
  178    __m128 inv_sign_mask = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));
 
         // Range thresholds: 2.414213562373095 is about 1 + sqrt(2) and
         // 0.4142135623730950 is about sqrt(2) - 1.
  180    __m128 atanrange_hi = _mm_set_ps1(2.414213562373095);
 
  181    __m128 atanrange_lo = _mm_set_ps1(0.4142135623730950);
 
         // pi/2 and pi/4.
  182    __m128 cephes_PIO2F = _mm_set_ps1(1.5707963267948966192);
 
  183    __m128 cephes_PIO4F = _mm_set_ps1(0.7853981633974483096);
 
         // Polynomial coefficient magnitudes; the alternating signs are applied
         // by the sub/add sequence further down.
  185    __m128 atancof_p0 = _mm_set_ps1(8.05374449538e-2);
 
  186    __m128 atancof_p1 = _mm_set_ps1(1.38776856032E-1);
 
  187    __m128 atancof_p2 = _mm_set_ps1(1.99777106478E-1);
 
  188    __m128 atancof_p3 = _mm_set_ps1(3.33329491539E-1);
 
         // x = |x| (sign bit cleared).
  194    x = _mm_and_ps(x, inv_sign_mask);
 
         // Keep only the sign bit of sign_bit.
         // NOTE(review): neither the declaration of sign_bit nor the statement
         // that loads it is visible in this excerpt; presumably it is a copy
         // of the original x - confirm.
  196    sign_bit = _mm_and_ps(sign_bit, sign_mask);
 
         // cmp0: x > hi.  cmp1: x > lo.  cmp2: lo < x <= hi (cmp1 and not cmp0).
  200    __m128 cmp0 = _mm_cmpgt_ps(x, atanrange_hi);
 
  202    __m128 cmp1 = _mm_cmpgt_ps(x, atanrange_lo);
 
  205    __m128 cmp2 = _mm_andnot_ps(cmp0, cmp1);
 
         // Upper range: offset y0 = pi/2 and substitute x0 = -1/x.
         // The division is evaluated for every lane and masked afterwards.
  208    __m128 y0 = _mm_and_ps(cmp0, cephes_PIO2F);
 
  209    __m128 x0 = _mm_div_ps(_mm_set_ps1(1.f), x);
 
  210    x0 = _mm_xor_ps(x0, sign_mask);
 
         // Middle range: offset y1 = pi/4 and substitute x1 = (x - 1) / (x + 1).
  212    __m128 y1 = _mm_and_ps(cmp2, cephes_PIO4F);
 
  214    __m128 x1_o = _mm_sub_ps(x, _mm_set_ps1(1.f));
 
  215    __m128 x1_u = _mm_add_ps(x, _mm_set_ps1(1.f));
 
  216    __m128 x1 = _mm_div_ps(x1_o, x1_u);
 
         // Blend per lane: x0 where cmp0, x1 where cmp2, the original x in the
         // remaining lanes. cmp1 is reused to hold cmp0 | cmp2.
  218    __m128 x2 = _mm_and_ps(cmp2, x1);
 
  219    x0 = _mm_and_ps(cmp0, x0);
 
  220    x2 = _mm_or_ps(x2, x0);
 
  221    cmp1 = _mm_or_ps(cmp0, cmp2);
 
  222    x2 = _mm_and_ps(cmp1, x2);
 
  223    x = _mm_andnot_ps(cmp1, x);
 
  224    x = _mm_or_ps(x2, x);
 
         // y = per-lane offset: pi/2, pi/4 or 0 (cmp0 and cmp2 are disjoint).
         // NOTE(review): the declaration of y is not visible in this excerpt.
  226    y = _mm_or_ps(y0, y1);
 
         // Horner evaluation in zz = x*x:
         //   acc = ((((p0*zz - p1)*zz + p2)*zz - p3)*zz)*x + x
  228    __m128 zz = _mm_mul_ps(x, x);
 
  229    __m128 acc = atancof_p0;
 
  230    acc = _mm_mul_ps(acc, zz);
 
  231    acc = _mm_sub_ps(acc, atancof_p1);
 
  232    acc = _mm_mul_ps(acc, zz);
 
  233    acc = _mm_add_ps(acc, atancof_p2);
 
  234    acc = _mm_mul_ps(acc, zz);
 
  235    acc = _mm_sub_ps(acc, atancof_p3);
 
  236    acc = _mm_mul_ps(acc, zz);
 
  237    acc = _mm_mul_ps(acc, x);
 
  238    acc = _mm_add_ps(acc, x);
 
         // Add the polynomial value to the range offset.
  239    y = _mm_add_ps(y, acc);
 
         // Flip the sign of the result in lanes where sign_bit has bit 31 set.
  242    y = _mm_xor_ps(y, sign_bit);
 
 
  // Excerpt (numbered listing) from a routine taking packed-float inputs x and
  // y. The function header, the definition of atan_result and the return are
  // not part of this excerpt.
  // Visible steps: build per-lane comparison masks for x and y against zero,
  // prepare special-case values (0, +/-pi/2, pi) and a +/-pi offset for x < 0,
  // add the offset to atan_result, then merge everything with bitwise masks.
  // NOTE(review): this looks like a two-argument arctangent (atan2) fix-up
  // around a one-argument arctangent - confirm against the enclosing function.
  248    __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
 
         // Comparison masks: a lane is all-ones where the comparison holds,
         // zero otherwise.
  249    __m128 x_eq_0 = _mm_cmpeq_ps(x, _mm_setzero_ps());
 
  250    __m128 x_gt_0 = _mm_cmpgt_ps(x, _mm_setzero_ps());
 
  251    __m128 x_le_0 = _mm_cmple_ps(x, _mm_setzero_ps());
 
  252    __m128 y_eq_0 = _mm_cmpeq_ps(y, _mm_setzero_ps());
 
  253    __m128 x_lt_0 = _mm_cmplt_ps(x, _mm_setzero_ps());
 
  254    __m128 y_lt_0 = _mm_cmplt_ps(y, _mm_setzero_ps());
 
         // pi and pi/2.
  255    __m128 cephes_PIF = _mm_set_ps1(3.141592653589793238);
 
  256    __m128 cephes_PIO2F = _mm_set_ps1(1.5707963267948966192);
 
         // zero_mask = (x == 0 && y == 0) || (y == 0 && x > 0).
  258    __m128 zero_mask = _mm_and_ps(x_eq_0, y_eq_0);
 
  259    __m128 zero_mask_other_case = _mm_and_ps(y_eq_0, x_gt_0);
 
  260    zero_mask = _mm_or_ps(zero_mask, zero_mask_other_case);
 
         // pio2_mask = (x == 0) && !(y == 0).
         // pio2_result = -pi/2 where y < 0, +pi/2 otherwise, kept only in
         // pio2_mask lanes (zero elsewhere).
  262    __m128 pio2_mask = _mm_andnot_ps(y_eq_0, x_eq_0);
 
  263    __m128 pio2_mask_sign = _mm_and_ps(y_lt_0, sign_mask);
 
  264    __m128 pio2_result = cephes_PIO2F;
 
  265    pio2_result = _mm_xor_ps(pio2_result, pio2_mask_sign);
 
  266    pio2_result = _mm_and_ps(pio2_mask, pio2_result);
 
         // pi_result = pi where (y == 0 && x <= 0), zero elsewhere.
         // NOTE(review): this mask is also true for x == 0 && y == 0, a case
         // zero_mask covers as well, and pi_result is OR-ed into the final
         // result without further masking - confirm this is intended.
  268    __m128 pi_mask = _mm_and_ps(y_eq_0, x_le_0);
 
  269    __m128 pi = cephes_PIF;
 
  270    __m128 pi_result = _mm_and_ps(pi_mask, pi);
 
         // offset1 = -pi where (x < 0 && y < 0), +pi otherwise.
  272    __m128 swap_sign_mask_offset = _mm_and_ps(x_lt_0, y_lt_0);
 
  273    swap_sign_mask_offset = _mm_and_ps(swap_sign_mask_offset, sign_mask);
 
  275    __m128 offset0 = _mm_setzero_ps();
 
  276    __m128 offset1 = cephes_PIF;
 
  277    offset1 = _mm_xor_ps(offset1, swap_sign_mask_offset);
 
         // The first assignment to offset is overwritten by the second without
         // being read. Net effect: offset = offset1 where x < 0, zero elsewhere.
  279    __m128 offset = _mm_andnot_ps(x_lt_0, offset0);
 
  280    offset = _mm_and_ps(x_lt_0, offset1);
 
         // arg = y / x, computed for every lane (including lanes where x == 0).
         // NOTE(review): no visible line reads arg or defines atan_result
         // (listing line 283 is absent); presumably atan_result is the
         // arctangent of arg - confirm.
  282    __m128 arg = _mm_div_ps(y, x);
 
  284    atan_result = _mm_add_ps(atan_result, offset);
 
         // result starts as pio2_result with the zero_mask lanes cleared.
         // zero_mask is not applied to atan_result or pi_result.
  288    __m128 result = _mm_andnot_ps(zero_mask, pio2_result);
 
         // Clear atan_result in pio2_mask lanes. The second statement repeats
         // the first and does not change the value any further.
  289    atan_result = _mm_andnot_ps(pio2_mask, atan_result);
 
  290    atan_result = _mm_andnot_ps(pio2_mask, atan_result);
 
         // Merge the partial results with bitwise OR.
  291    result = _mm_or_ps(result, atan_result);
 
  292    result = _mm_or_ps(result, pi_result);