/* Body of a vectorized natural logarithm (Cephes logf adapted to SSE).
   NOTE(review): this is an excerpt — the function header, the `__m128i emm0`
   declaration and the final `return x;` lie outside the visible span, and the
   leading integer on each line is a baked-in line number from the original
   file, not C code. */
60 __m128 one = _mm_set_ps1(1.0);
/* Per-lane all-ones mask where x <= 0 (log undefined / -inf there); it is
   OR-ed into the result at the end so those lanes come out as NaN. */
62 __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
/* Clamp up to the smallest positive normalized float (bit pattern
   0x00800000) so the exponent-field extraction below is well defined. */
65 x = _mm_max_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x00800000)));
/* Shift out the 23 mantissa bits, leaving the biased IEEE-754 exponent. */
67 emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
/* Clear the exponent field, then OR in the exponent of 0.5 — this rescales
   the mantissa of x into [0.5, 1). */
69 x = _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(~0x7f800000)));
70 x = _mm_or_ps(x, _mm_set_ps1(0.5));
/* Remove the IEEE bias (127) and convert the exponent to float; e is then
   bumped by one to account for the [0.5,1) mantissa scaling. */
72 emm0 = _mm_sub_epi32(emm0, _mm_set1_epi32(0x7f));
73 __m128 e = _mm_cvtepi32_ps(emm0);
75 e = _mm_add_ps(e, one);
/* Cephes-style branch, done branch-free: for lanes where the mantissa is
   below sqrt(1/2), use e -= 1 and x = 2x - 1; otherwise x = x - 1.
   `tmp` keeps a copy of x only in the masked lanes so the doubling can be
   applied via the x += tmp below. */
83 __m128 mask = _mm_cmplt_ps(x, _mm_set_ps1(0.707106781186547524));
84 __m128 tmp = _mm_and_ps(x, mask);
85 x = _mm_sub_ps(x, one);
86 e = _mm_sub_ps(e, _mm_and_ps(one, mask));
87 x = _mm_add_ps(x, tmp);
90 __m128 z = _mm_mul_ps(x, x);
/* Polynomial approximation of log(1+x) evaluated Horner-style.
   NOTE(review): the original-file numbering jumps 92→94→96→98→100 here —
   the interleaved `y = _mm_mul_ps(y, x);` steps (original lines 93/95/97/99)
   appear to be missing from this excerpt; as shown, the first five
   coefficients would just be summed. Verify against the full file. */
92 __m128 y = _mm_set_ps1(7.0376836292E-2);
94 y = _mm_add_ps(y, _mm_set_ps1(-1.1514610310E-1));
96 y = _mm_add_ps(y, _mm_set_ps1(1.1676998740E-1));
98 y = _mm_add_ps(y, _mm_set_ps1(-1.2420140846E-1));
100 y = _mm_add_ps(y, _mm_set_ps1(1.4249322787E-1));
101 y = _mm_mul_ps(y, x);
102 y = _mm_add_ps(y, _mm_set_ps1(-1.6668057665E-1));
103 y = _mm_mul_ps(y, x);
104 y = _mm_add_ps(y, _mm_set_ps1(2.0000714765E-1));
105 y = _mm_mul_ps(y, x);
106 y = _mm_add_ps(y, _mm_set_ps1(-2.4999993993E-1));
107 y = _mm_mul_ps(y, x);
108 y = _mm_add_ps(y, _mm_set_ps1(3.3333331174E-1));
109 y = _mm_mul_ps(y, x);
/* Multiply by z = x^2 to complete the poly * x^3 term of the series. */
111 y = _mm_mul_ps(y, z);
/* Add e*ln(2), with ln(2) split into 0.693359375 + (-2.12194440e-4) so the
   large part is exactly representable — a standard extended-precision
   trick; the small correction is folded into y first. */
114 tmp = _mm_mul_ps(e, _mm_set_ps1(-2.12194440e-4));
115 y = _mm_add_ps(y, tmp);
/* Subtract x^2/2 (second term of the log(1+x) series). */
118 tmp = _mm_mul_ps(z, _mm_set_ps1(0.5));
119 y = _mm_sub_ps(y, tmp);
121 tmp = _mm_mul_ps(e, _mm_set_ps1(0.693359375));
122 x = _mm_add_ps(x, y);
123 x = _mm_add_ps(x, tmp);
/* Force NaN (all-ones OR-ed into a float) in lanes where input was <= 0. */
124 x = _mm_or_ps(x, invalid_mask);
/* Body of a vectorized exponential (Cephes expf adapted to SSE).
   NOTE(review): excerpt only — the function header, the `__m128i emm0`
   declaration (used at original line 142) and the final `return y;` are not
   visible here; leading integers are baked-in original line numbers. */
130 __m128 tmp = _mm_setzero_ps(), fx;
132 __m128 one = _mm_set_ps1(1.0);
/* Clamp input to roughly +/-88.38, the range where expf neither overflows
   nor underflows a single-precision float. */
134 x = _mm_min_ps(x, _mm_set_ps1(88.3762626647949f));
135 x = _mm_max_ps(x, _mm_set_ps1(-88.3762626647949f));
/* fx = round(x * log2(e)): multiply by 1/ln(2) and add 0.5 before the
   truncating conversion below to get round-to-nearest behavior. */
138 fx = _mm_mul_ps(x, _mm_set_ps1(1.44269504088896341));
139 fx = _mm_add_ps(fx, _mm_set_ps1(0.5));
/* Truncate to integer, convert back, and subtract 1 in lanes where the
   truncation overshot (tmp > fx) — i.e. emulate floor(fx). */
142 emm0 = _mm_cvttps_epi32(fx);
143 tmp = _mm_cvtepi32_ps(emm0);
145 __m128 mask = _mm_cmpgt_ps(tmp, fx);
146 mask = _mm_and_ps(mask, one);
147 fx = _mm_sub_ps(tmp, mask);
/* Range reduction: x -= fx * ln(2), with ln(2) split into an exact high
   part (0.693359375) and a small correction (-2.12194440e-4) for accuracy. */
149 tmp = _mm_mul_ps(fx, _mm_set_ps1(0.693359375));
150 __m128 z = _mm_mul_ps(fx, _mm_set_ps1(-2.12194440e-4));
151 x = _mm_sub_ps(x, tmp);
152 x = _mm_sub_ps(x, z);
154 z = _mm_mul_ps(x, x);
/* Degree-5 polynomial approximation of e^x on the reduced interval,
   evaluated Horner-style; final form is poly(x)*x^2 + x + 1. */
156 __m128 y = _mm_set_ps1(1.9875691500E-4);
157 y = _mm_mul_ps(y, x);
158 y = _mm_add_ps(y, _mm_set_ps1(1.3981999507E-3));
159 y = _mm_mul_ps(y, x);
160 y = _mm_add_ps(y, _mm_set_ps1(8.3334519073E-3));
161 y = _mm_mul_ps(y, x);
162 y = _mm_add_ps(y, _mm_set_ps1(4.1665795894E-2));
163 y = _mm_mul_ps(y, x);
164 y = _mm_add_ps(y, _mm_set_ps1(1.6666665459E-1));
165 y = _mm_mul_ps(y, x);
166 y = _mm_add_ps(y, _mm_set_ps1(5.0000001201E-1));
167 y = _mm_mul_ps(y, z);
168 y = _mm_add_ps(y, x);
169 y = _mm_add_ps(y, one);
/* Build 2^n directly in the float exponent field: n + 127, shifted into
   bits 30..23, reinterpreted as float, then multiplied in. */
172 emm0 = _mm_cvttps_epi32(fx);
173 emm0 = _mm_add_epi32(emm0, _mm_set1_epi32(0x7f));
174 emm0 = _mm_slli_epi32(emm0, 23);
175 __m128 pow2n = _mm_castsi128_ps(emm0);
176 y = _mm_mul_ps(y, pow2n);
/* Body of a vectorized sine (Cephes sinf adapted to SSE).
   NOTE(review): excerpt only — the function header, the `__m128i emm0, emm2`
   declarations, and an initial `sign_bit = x;` (sign_bit is read at original
   line 219 but never visibly initialized here; original lines 211-214 fall
   in a numbering gap) are outside this view. Leading integers are baked-in
   original line numbers. */
210 __m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
/* Take |x|; keep only the sign bit in sign_bit for re-application at end. */
215 const __m128 inv_sign_mask = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));
216 x = _mm_and_ps(x, inv_sign_mask);
218 const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
219 sign_bit = _mm_and_ps(sign_bit, sign_mask);
/* Scale by 4/pi to get the octant index. */
222 const __m128 cephes_FOPI = _mm_set_ps1(1.27323954473516);
223 y = _mm_mul_ps(x, cephes_FOPI);
226 emm2 = _mm_cvttps_epi32(y);
/* j = (j + 1) & ~1: round the octant index up to an even value. */
228 emm2 = _mm_add_epi32(emm2, _mm_set1_epi32(1));
229 emm2 = _mm_and_si128(emm2, _mm_set1_epi32(~1));
230 y = _mm_cvtepi32_ps(emm2);
/* Bit 2 of the octant selects a sign flip: isolate it and shift it to the
   float sign-bit position (bit 31). */
233 emm0 = _mm_and_si128(emm2, _mm_set1_epi32(4));
234 emm0 = _mm_slli_epi32(emm0, 29);
/* Bit 1 of the octant selects which polynomial (sin vs cos branch) applies:
   all-ones where (octant & 2) == 0. */
241 emm2 = _mm_and_si128(emm2, _mm_set1_epi32(2));
242 emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
244 __m128 swap_sign_bit = _mm_castsi128_ps(emm0);
245 __m128 poly_mask = _mm_castsi128_ps(emm2);
246 sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
/* Extended-precision range reduction: x -= y * (pi/4), with pi/4 split into
   three parts so the subtraction loses minimal precision. */
250 xmm1 = _mm_set_ps1(-0.78515625);
251 xmm2 = _mm_set_ps1(-2.4187564849853515625e-4);
252 xmm3 = _mm_set_ps1(-3.77489497744594108e-8);
253 xmm1 = _mm_mul_ps(y, xmm1);
254 xmm2 = _mm_mul_ps(y, xmm2);
255 xmm3 = _mm_mul_ps(y, xmm3);
256 x = _mm_add_ps(x, xmm1);
257 x = _mm_add_ps(x, xmm2);
258 x = _mm_add_ps(x, xmm3);
/* First polynomial: cosine-style approximation, y = poly(z)*z^2 - z/2 + 1
   where z = x^2. */
261 y = _mm_set_ps1(2.443315711809948E-005);
262 __m128 z = _mm_mul_ps(x, x);
264 y = _mm_mul_ps(y, z);
265 y = _mm_add_ps(y, _mm_set_ps1(-1.388731625493765E-003));
266 y = _mm_mul_ps(y, z);
267 y = _mm_add_ps(y, _mm_set_ps1(4.166664568298827E-002));
268 y = _mm_mul_ps(y, z);
269 y = _mm_mul_ps(y, z);
270 __m128 tmp = _mm_mul_ps(z, _mm_set_ps1(0.5));
271 y = _mm_sub_ps(y, tmp);
272 y = _mm_add_ps(y, _mm_set_ps1(1.0));
/* Second polynomial: sine-style approximation, y2 = poly(z)*z*x + x. */
276 __m128 y2 = _mm_set_ps1(-1.9515295891E-4);
277 y2 = _mm_mul_ps(y2, z);
278 y2 = _mm_add_ps(y2, _mm_set_ps1(8.3321608736E-3));
279 y2 = _mm_mul_ps(y2, z);
280 y2 = _mm_add_ps(y2, _mm_set_ps1(-1.6666654611E-1));
281 y2 = _mm_mul_ps(y2, z);
282 y2 = _mm_mul_ps(y2, x);
283 y2 = _mm_add_ps(y2, x);
/* Select between the two polynomials per lane using xmm3 as the mask.
   NOTE(review): original lines 284-286 fall in a gap here — presumably they
   contain `xmm3 = poly_mask;`; as shown, xmm3 still holds the range-
   reduction product from line 255. Confirm against the full file. */
287 y2 = _mm_and_ps(xmm3, y2);
288 y = _mm_andnot_ps(xmm3, y);
289 y = _mm_add_ps(y, y2);
/* Apply the combined input/octant sign. */
291 y = _mm_xor_ps(y, sign_bit);
/* Body of a vectorized cosine (Cephes cosf adapted to SSE); structurally a
   twin of the sine body above but with the octant offset by 2 and no input
   sign to track (cos is even).
   NOTE(review): excerpt only — the function header, the `__m128i emm0, emm2`
   declarations and the final return are not visible; leading integers are
   baked-in original line numbers. */
298 __m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
/* Take |x| — cosine is an even function. */
301 const __m128 inv_sign_mask = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));
302 x = _mm_and_ps(x, inv_sign_mask);
/* Scale by 4/pi to get the octant index. */
305 const __m128 cephes_FOPI = _mm_set_ps1(1.27323954473516);
306 y = _mm_mul_ps(x, cephes_FOPI);
309 emm2 = _mm_cvttps_epi32(y);
/* j = (j + 1) & ~1: round the octant index up to an even value. */
311 emm2 = _mm_add_epi32(emm2, _mm_set1_epi32(1));
312 emm2 = _mm_and_si128(emm2, _mm_set1_epi32(~1));
313 y = _mm_cvtepi32_ps(emm2);
/* Shift the octant by -2 (cos(x) = sin(x + pi/2) phase adjustment). */
315 emm2 = _mm_sub_epi32(emm2, _mm_set1_epi32(2));
/* Result sign comes from the complement of bit 2 of the shifted octant,
   moved into the float sign-bit position. */
318 emm0 = _mm_andnot_si128(emm2, _mm_set1_epi32(4));
319 emm0 = _mm_slli_epi32(emm0, 29);
/* Bit 1 selects which polynomial applies: all-ones where (octant & 2)==0. */
321 emm2 = _mm_and_si128(emm2, _mm_set1_epi32(2));
322 emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
324 __m128 sign_bit = _mm_castsi128_ps(emm0);
325 __m128 poly_mask = _mm_castsi128_ps(emm2);
/* Extended-precision range reduction: x -= y * (pi/4) in three parts. */
328 xmm1 = _mm_set_ps1(-0.78515625);
329 xmm2 = _mm_set_ps1(-2.4187564849853515625e-4);
330 xmm3 = _mm_set_ps1(-3.77489497744594108e-8);
331 xmm1 = _mm_mul_ps(y, xmm1);
332 xmm2 = _mm_mul_ps(y, xmm2);
333 xmm3 = _mm_mul_ps(y, xmm3);
334 x = _mm_add_ps(x, xmm1);
335 x = _mm_add_ps(x, xmm2);
336 x = _mm_add_ps(x, xmm3);
/* First polynomial: cosine-style, y = poly(z)*z^2 - z/2 + 1 with z = x^2. */
339 y = _mm_set_ps1(2.443315711809948E-005);
340 __m128 z = _mm_mul_ps(x, x);
342 y = _mm_mul_ps(y, z);
343 y = _mm_add_ps(y, _mm_set_ps1(-1.388731625493765E-003));
344 y = _mm_mul_ps(y, z);
345 y = _mm_add_ps(y, _mm_set_ps1(4.166664568298827E-002));
346 y = _mm_mul_ps(y, z);
347 y = _mm_mul_ps(y, z);
348 __m128 tmp = _mm_mul_ps(z, _mm_set_ps1(0.5));
349 y = _mm_sub_ps(y, tmp);
350 y = _mm_add_ps(y, _mm_set_ps1(1.0));
/* Second polynomial: sine-style, y2 = poly(z)*z*x + x. */
354 __m128 y2 = _mm_set_ps1(-1.9515295891E-4);
355 y2 = _mm_mul_ps(y2, z);
356 y2 = _mm_add_ps(y2, _mm_set_ps1(8.3321608736E-3));
357 y2 = _mm_mul_ps(y2, z);
358 y2 = _mm_add_ps(y2, _mm_set_ps1(-1.6666654611E-1));
359 y2 = _mm_mul_ps(y2, z);
360 y2 = _mm_mul_ps(y2, x);
361 y2 = _mm_add_ps(y2, x);
/* Select between the two polynomials per lane using xmm3 as the mask.
   NOTE(review): original lines 362-364 fall in a gap — presumably they
   contain `xmm3 = poly_mask;`; as shown, xmm3 still holds the range-
   reduction product from line 333. Confirm against the full file. */
365 y2 = _mm_and_ps(xmm3, y2);
366 y = _mm_andnot_ps(xmm3, y);
367 y = _mm_add_ps(y, y2);
/* Apply the octant-derived sign. */
369 y = _mm_xor_ps(y, sign_bit);
/* Body of a combined vectorized sine+cosine: computes both for the price of
   one range reduction, writing results through the `s` and `c` output
   pointers (visible at original lines 468-469).
   NOTE(review): excerpt only — the function header and an initial
   `sign_bit_sin = x;` (sign_bit_sin is read at original line 386 but never
   visibly initialized; original lines 380-381 fall in a numbering gap) are
   outside this view. Leading integers are baked-in original line numbers. */
378 __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
379 __m128i emm0, emm2, emm4;
/* Take |x|; keep only the input's sign bit for the sine result. */
382 const __m128 inv_sign_mask = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));
383 x = _mm_and_ps(x, inv_sign_mask);
385 const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
386 sign_bit_sin = _mm_and_ps(sign_bit_sin, sign_mask);
/* Scale by 4/pi to get the octant index. */
389 const __m128 cephes_FOPI = _mm_set_ps1(1.27323954473516);
390 y = _mm_mul_ps(x, cephes_FOPI);
393 emm2 = _mm_cvttps_epi32(y);
/* j = (j + 1) & ~1: round the octant index up to an even value. */
396 emm2 = _mm_add_epi32(emm2, _mm_set1_epi32(1));
397 emm2 = _mm_and_si128(emm2, _mm_set1_epi32(~1));
398 y = _mm_cvtepi32_ps(emm2);
/* Sine sign flip: bit 2 of the octant, moved to the float sign position.
   NOTE(review): original lines 399-402 fall in a gap — the save of the
   octant into emm4 (used at line 424 below) presumably happens there. */
403 emm0 = _mm_and_si128(emm2, _mm_set1_epi32(4));
404 emm0 = _mm_slli_epi32(emm0, 29);
405 __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
/* Polynomial-selection mask: all-ones where (octant & 2) == 0. */
408 emm2 = _mm_and_si128(emm2, _mm_set1_epi32(2));
409 emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
410 __m128 poly_mask = _mm_castsi128_ps(emm2);
/* Extended-precision range reduction: x -= y * (pi/4) in three parts. */
414 xmm1 = _mm_set_ps1(-0.78515625);
415 xmm2 = _mm_set_ps1(-2.4187564849853515625e-4);
416 xmm3 = _mm_set_ps1(-3.77489497744594108e-8);
417 xmm1 = _mm_mul_ps(y, xmm1);
418 xmm2 = _mm_mul_ps(y, xmm2);
419 xmm3 = _mm_mul_ps(y, xmm3);
420 x = _mm_add_ps(x, xmm1);
421 x = _mm_add_ps(x, xmm2);
422 x = _mm_add_ps(x, xmm3);
/* Cosine sign from the saved octant: complement of bit 2 of (octant - 2),
   moved to the float sign position — mirrors the cos-body logic above. */
424 emm4 = _mm_sub_epi32(emm4, _mm_set1_epi32(2));
425 emm4 = _mm_andnot_si128(emm4, _mm_set1_epi32(4));
426 emm4 = _mm_slli_epi32(emm4, 29);
427 __m128 sign_bit_cos = _mm_castsi128_ps(emm4);
429 sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
/* First polynomial: cosine-style, y = poly(z)*z^2 - z/2 + 1 with z = x^2. */
433 __m128 z = _mm_mul_ps(x, x);
434 y = _mm_set_ps1(2.443315711809948E-005);
436 y = _mm_mul_ps(y, z);
437 y = _mm_add_ps(y, _mm_set_ps1(-1.388731625493765E-003));
438 y = _mm_mul_ps(y, z);
439 y = _mm_add_ps(y, _mm_set_ps1(4.166664568298827E-002));
440 y = _mm_mul_ps(y, z);
441 y = _mm_mul_ps(y, z);
442 __m128 tmp = _mm_mul_ps(z, _mm_set_ps1(0.5));
443 y = _mm_sub_ps(y, tmp);
444 y = _mm_add_ps(y, _mm_set_ps1(1.0));
/* Second polynomial: sine-style, y2 = poly(z)*z*x + x. */
448 __m128 y2 = _mm_set_ps1(-1.9515295891E-4);
449 y2 = _mm_mul_ps(y2, z);
450 y2 = _mm_add_ps(y2, _mm_set_ps1(8.3321608736E-3));
451 y2 = _mm_mul_ps(y2, z);
452 y2 = _mm_add_ps(y2, _mm_set_ps1(-1.6666654611E-1));
453 y2 = _mm_mul_ps(y2, z);
454 y2 = _mm_mul_ps(y2, x);
455 y2 = _mm_add_ps(y2, x);
/* Split each polynomial into its sine and cosine contributions per lane:
   ysin1/ysin2 are the masked sine parts; subtracting them leaves the cosine
   parts in y/y2, so each result is the sum of complementary halves.
   NOTE(review): original lines 456-458 fall in a gap — presumably
   `xmm3 = poly_mask;` is there; as shown, xmm3 still holds the range-
   reduction product from line 419. Confirm against the full file. */
459 __m128 ysin2 = _mm_and_ps(xmm3, y2);
460 __m128 ysin1 = _mm_andnot_ps(xmm3, y);
461 y2 = _mm_sub_ps(y2, ysin2);
462 y = _mm_sub_ps(y, ysin1);
464 xmm1 = _mm_add_ps(ysin1, ysin2);
465 xmm2 = _mm_add_ps(y, y2);
/* Apply the respective signs and write both outputs. */
468 *s = _mm_xor_ps(xmm1, sign_bit_sin);
469 *c = _mm_xor_ps(xmm2, sign_bit_cos);