// --- Fragment of a Cephes-style SSE tangent kernel (tan via argument
// --- reduction to [0, pi/4] plus a minimax polynomial). NOTE(review):
// --- the numeric prefix on each line is the original file line number;
// --- gaps in that numbering (90->92, 96->99, 139->143, ...) mean lines
// --- are hidden from this view — comments below flag where that matters.

// Cephes tanf polynomial coefficients: tan(z) ~= z + z*z^2*P(z^2).
76 __m128 p0 = _mm_set_ps1(9.38540185543E-3);
77 __m128 p1 = _mm_set_ps1(3.11992232697E-3);
78 __m128 p2 = _mm_set_ps1(2.44301354525E-2);
79 __m128 p3 = _mm_set_ps1(5.34112807005E-2);
80 __m128 p4 = _mm_set_ps1(1.33387994085E-1);
81 __m128 p5 = _mm_set_ps1(3.33331568548E-1);
// Scratch registers; y doubles as octant value and polynomial accumulator.
83 __m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
// Masks to isolate / clear the IEEE-754 float sign bit.
88 __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
89 __m128 inv_sign_mask = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));
// x = |x|; keep only the sign bit of the original input in sign_bit.
// NOTE(review): sign_bit is read here but never visibly written — hidden
// line 91 presumably does `sign_bit = x;`. Confirm against the full file.
90 x = _mm_and_ps(x, inv_sign_mask);
92 sign_bit = _mm_and_ps(sign_bit, sign_mask);
// Scale by 4/pi to get the octant index: y = |x| * (4/pi).
95 __m128 cephes_FOPI = _mm_set_ps1(1.27323954473516);
96 y = _mm_mul_ps(x, cephes_FOPI);
// j = (int)y, then round j up to the next even integer (j += 1; j &= ~1)
// so the reduced argument stays in [-pi/4, pi/4]. NOTE(review): emm2 is
// used without a visible declaration — declared in a hidden line.
99 emm2 = _mm_cvttps_epi32(y);
101 emm2 = _mm_add_epi32(emm2, _mm_set1_epi32(1));
102 emm2 = _mm_and_si128(emm2, _mm_set1_epi32(~1));
103 y = _mm_cvtepi32_ps(emm2);
// poly_mask: lanes are all-ones where (j & 2) == 0, i.e. octants where
// the tan polynomial applies directly (vs. the -cot complement).
105 emm2 = _mm_and_si128(emm2, _mm_set1_epi32(2));
106 emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
108 __m128 poly_mask = _mm_castsi128_ps(emm2);
// Extended-precision modular reduction: pi/4 is split into DP1+DP2+DP3
// so that z = ((x - y*DP1) - y*DP2) - y*DP3 loses almost no precision.
111 __m128 minus_cephes_DP1 = _mm_set_ps1(-0.78515625);
112 __m128 minus_cephes_DP2 = _mm_set_ps1(-2.4187564849853515625e-4);
113 __m128 minus_cephes_DP3 = _mm_set_ps1(-3.77489497744594108e-8);
114 xmm1 = minus_cephes_DP1;
115 xmm2 = minus_cephes_DP2;
116 xmm3 = minus_cephes_DP3;
117 xmm1 = _mm_mul_ps(y, xmm1);
118 xmm2 = _mm_mul_ps(y, xmm2);
119 xmm3 = _mm_mul_ps(y, xmm3);
120 __m128 z = _mm_add_ps(x, xmm1);
121 z = _mm_add_ps(z, xmm2);
122 z = _mm_add_ps(z, xmm3);
124 __m128 zz = _mm_mul_ps(z, z);
// Horner evaluation of P(zz), then y = z + z*zz*P(zz) ~= tan(z).
// NOTE(review): a hidden line (125/126) presumably initializes `y = p0;`
// before this chain — the first visible op already multiplies y by zz.
127 y = _mm_mul_ps(y, zz);
128 y = _mm_add_ps(y, p1);
129 y = _mm_mul_ps(y, zz);
130 y = _mm_add_ps(y, p2);
131 y = _mm_mul_ps(y, zz);
132 y = _mm_add_ps(y, p3);
133 y = _mm_mul_ps(y, zz);
134 y = _mm_add_ps(y, p4);
135 y = _mm_mul_ps(y, zz);
136 y = _mm_add_ps(y, p5);
137 y = _mm_mul_ps(y, zz);
138 y = _mm_mul_ps(y, z);
139 y = _mm_add_ps(y, z);
// Build the complementary value for the other octants: tan and -cot are
// related by -1/tan. NOTE(review): lines 140-145 and 147-150 are hidden;
// they presumably hold the if/else (or cotangent-flag) structure that
// separates these two reciprocal paths — do not assume L143/146/151 run
// sequentially as written here.
143 y2 = _mm_xor_ps(y, sign_mask);
146 y = _mm_div_ps(_mm_set_ps1(1.f), y);
151 y2 = _mm_div_ps(_mm_set_ps1(1.f), y);
152 y2 = _mm_xor_ps(y2, sign_mask);
// Branchless select between y and y2 per lane. NOTE(review): xmm3 is used
// as the mask here, but it was last visibly written at line 119 (y*DP3);
// a hidden line (153-156) presumably reloads it with poly_mask. Verify.
157 y = _mm_and_ps(xmm3, y);
158 y2 = _mm_andnot_ps(xmm3, y2);
159 y = _mm_or_ps(y, y2);
// Restore the sign of the original argument (tan is odd).
162 y = _mm_xor_ps(y, sign_bit);
// --- Fragment of a Cephes-style SSE arctangent kernel: reduce |x| into
// --- [0, tan(pi/8)] using the identities atan(x) = pi/2 - atan(1/x) and
// --- atan(x) = pi/4 + atan((x-1)/(x+1)), then apply a degree-4 minimax
// --- polynomial. Line numbers have gaps; hidden lines are flagged below.

// Masks to isolate / clear the IEEE-754 float sign bit.
177 __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
178 __m128 inv_sign_mask = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));
// Range-split thresholds: tan(3*pi/8) ~= 2.41421, tan(pi/8) ~= 0.41421.
180 __m128 atanrange_hi = _mm_set_ps1(2.414213562373095);
181 __m128 atanrange_lo = _mm_set_ps1(0.4142135623730950);
182 __m128 cephes_PIO2F = _mm_set_ps1(1.5707963267948966192);
183 __m128 cephes_PIO4F = _mm_set_ps1(0.7853981633974483096);
// Cephes atanf polynomial coefficients (alternating-sign Horner form).
185 __m128 atancof_p0 = _mm_set_ps1(8.05374449538e-2);
186 __m128 atancof_p1 = _mm_set_ps1(1.38776856032E-1);
187 __m128 atancof_p2 = _mm_set_ps1(1.99777106478E-1);
188 __m128 atancof_p3 = _mm_set_ps1(3.33329491539E-1);
// x = |x|; keep the original sign bit to reapply at the end (atan is odd).
// NOTE(review): sign_bit is read at line 196 without a visible write —
// a hidden line (189-195) presumably does `sign_bit = x;`. Confirm.
194 x = _mm_and_ps(x, inv_sign_mask);
196 sign_bit = _mm_and_ps(sign_bit, sign_mask);
// cmp0: lanes with x > tan(3pi/8)  (large-argument path).
200 __m128 cmp0 = _mm_cmpgt_ps(x, atanrange_hi);
// cmp1: lanes with x > tan(pi/8); cmp2 = cmp1 & ~cmp0 (middle range).
202 __m128 cmp1 = _mm_cmpgt_ps(x, atanrange_lo);
205 __m128 cmp2 = _mm_andnot_ps(cmp0, cmp1);
// Large range: offset pi/2, reduced argument x0 = -1/x.
208 __m128 y0 = _mm_and_ps(cmp0, cephes_PIO2F);
209 __m128 x0 = _mm_div_ps(_mm_set_ps1(1.f), x);
210 x0 = _mm_xor_ps(x0, sign_mask);
// Middle range: offset pi/4, reduced argument x1 = (x-1)/(x+1).
212 __m128 y1 = _mm_and_ps(cmp2, cephes_PIO4F);
214 __m128 x1_o = _mm_sub_ps(x, _mm_set_ps1(1.f));
215 __m128 x1_u = _mm_add_ps(x, _mm_set_ps1(1.f));
216 __m128 x1 = _mm_div_ps(x1_o, x1_u);
// Branchless merge of the reduced argument into x:
// x = cmp0 ? x0 : (cmp2 ? x1 : x)   (cmp0 and cmp2 are disjoint).
218 __m128 x2 = _mm_and_ps(cmp2, x1);
219 x0 = _mm_and_ps(cmp0, x0);
220 x2 = _mm_or_ps(x2, x0);
221 cmp1 = _mm_or_ps(cmp0, cmp2);
222 x2 = _mm_and_ps(cmp1, x2);
223 x = _mm_andnot_ps(cmp1, x);
224 x = _mm_or_ps(x2, x);
// y = per-lane offset (pi/2, pi/4, or 0); y0/y1 masks are disjoint.
226 y = _mm_or_ps(y0, y1);
// Polynomial: acc = (((p0*z - p1)*z + p2)*z - p3)*z*x + x, z = x^2.
228 __m128 zz = _mm_mul_ps(x, x);
229 __m128 acc = atancof_p0;
230 acc = _mm_mul_ps(acc, zz);
231 acc = _mm_sub_ps(acc, atancof_p1);
232 acc = _mm_mul_ps(acc, zz);
233 acc = _mm_add_ps(acc, atancof_p2);
234 acc = _mm_mul_ps(acc, zz);
235 acc = _mm_sub_ps(acc, atancof_p3);
236 acc = _mm_mul_ps(acc, zz);
237 acc = _mm_mul_ps(acc, x);
238 acc = _mm_add_ps(acc, x);
239 y = _mm_add_ps(y, acc);
// Add the range offset and restore the input's sign.
242 y = _mm_xor_ps(y, sign_bit);
// --- Fragment of an SSE atan2(y, x) kernel: compute atan(y/x) and patch
// --- up quadrants and the axis/zero special cases with bit masks.
// --- Line numbers have gaps; hidden lines are flagged below.

248 __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
// Per-lane comparisons of x and y against zero (all-ones/all-zeros masks).
249 __m128 x_eq_0 = _mm_cmpeq_ps(x, _mm_setzero_ps());
250 __m128 x_gt_0 = _mm_cmpgt_ps(x, _mm_setzero_ps());
251 __m128 x_le_0 = _mm_cmple_ps(x, _mm_setzero_ps());
252 __m128 y_eq_0 = _mm_cmpeq_ps(y, _mm_setzero_ps());
253 __m128 x_lt_0 = _mm_cmplt_ps(x, _mm_setzero_ps());
254 __m128 y_lt_0 = _mm_cmplt_ps(y, _mm_setzero_ps());
255 __m128 cephes_PIF = _mm_set_ps1(3.141592653589793238);
256 __m128 cephes_PIO2F = _mm_set_ps1(1.5707963267948966192);
// Result is 0 when (x==0 && y==0) or (y==0 && x>0).
258 __m128 zero_mask = _mm_and_ps(x_eq_0, y_eq_0);
259 __m128 zero_mask_other_case = _mm_and_ps(y_eq_0, x_gt_0);
260 zero_mask = _mm_or_ps(zero_mask, zero_mask_other_case);
// x==0, y!=0  ->  +/- pi/2, taking the sign from y.
262 __m128 pio2_mask = _mm_andnot_ps(y_eq_0, x_eq_0);
263 __m128 pio2_mask_sign = _mm_and_ps(y_lt_0, sign_mask);
264 __m128 pio2_result = cephes_PIO2F;
265 pio2_result = _mm_xor_ps(pio2_result, pio2_mask_sign);
266 pio2_result = _mm_and_ps(pio2_mask, pio2_result);
// y==0, x<=0  ->  pi. NOTE(review): pi_mask also covers x==0 && y==0
// (x_le_0 includes x==0); hidden lines presumably keep that lane from
// being polluted — confirm against the full file.
268 __m128 pi_mask = _mm_and_ps(y_eq_0, x_le_0);
269 __m128 pi = cephes_PIF;
270 __m128 pi_result = _mm_and_ps(pi_mask, pi);
// Quadrant offset: +pi for x<0 && y>=0, -pi for x<0 && y<0, else 0.
272 __m128 swap_sign_mask_offset = _mm_and_ps(x_lt_0, y_lt_0);
273 swap_sign_mask_offset = _mm_and_ps(swap_sign_mask_offset, sign_mask);
275 __m128 offset0 = _mm_setzero_ps();
276 __m128 offset1 = cephes_PIF;
277 offset1 = _mm_xor_ps(offset1, swap_sign_mask_offset);
// NOTE(review): line 279's value is immediately overwritten by line 280.
// Harmless only because offset0 is all-zero, so the final value still
// equals the intended select (x<0 ? +/-pi : 0) — but the andnot result
// is dead and the idiomatic form would OR the two masked halves.
279 __m128 offset = _mm_andnot_ps(x_lt_0, offset0);
280 offset = _mm_and_ps(x_lt_0, offset1);
// Main path: atan(y/x) plus the quadrant offset. NOTE(review): line 283
// is hidden; it presumably computes `atan_result` from `arg` (a call to
// the vector atan kernel) — atan_result is consumed below without a
// visible producer. For x==0 lanes, arg is inf/NaN here; those lanes are
// overwritten by the special-case masks afterwards.
282 __m128 arg = _mm_div_ps(y, x);
284 atan_result = _mm_add_ps(atan_result, offset);
// Merge: start from the +/-pi/2 result with the zero lanes cleared, then
// OR in the masked atan path and the pi path.
288 __m128 result = _mm_andnot_ps(zero_mask, pio2_result);
// NOTE(review): the next line is duplicated verbatim at lines 289-290 —
// idempotent (andnot with the same mask twice), so harmless, but the
// second application is redundant.
289 atan_result = _mm_andnot_ps(pio2_mask, atan_result);
290 atan_result = _mm_andnot_ps(pio2_mask, atan_result);
291 result = _mm_or_ps(result, atan_result);
292 result = _mm_or_ps(result, pi_result);