VCV Rack API v2
Loading...
Searching...
No Matches
sse_mathfun_extension.h
Go to the documentation of this file.
1/* Modified version of https://github.com/to-miz/sse_mathfun_extension for VCV Rack.
2
3The following changes were made.
4- Make functions inline.
5- Change global constants to function-scope.
6
7This derived source file is released under the zlib license.
8*/
9
10/*
11sse_mathfun_extension.h - zlib license
12Written by Tolga Mizrak 2016
13Extension of sse_mathfun.h, which is written by Julien Pommier
14
15Based on the corresponding algorithms of the cephes math library
16
17This is written as an extension to sse_mathfun.h instead of modifying it, just because I didn't want
18to maintain a modified version of the original library. This way switching to a newer version of the
19library won't be a hassle.
20
21Note that non SSE2 implementations of tan_ps, atan_ps, cot_ps and atan2_ps are not implemented yet.
22As such, currently you need to #define USE_SSE2 to compile.
23
24With tan_ps, cot_ps you get good precision on input ranges that are further away from the domain
25borders (-PI/2, PI/2 for tan and 0, 1 for cot). See the results on the deviations for these
26functions on my machine:
27checking tan on [-0.25*Pi, 0.25*Pi]
28max deviation from tanf(x): 1.19209e-07 at 0.250000006957*Pi, max deviation from cephes_tan(x):
295.96046e-08
30 ->> precision OK for the tan_ps <<-
31
32checking tan on [-0.49*Pi, 0.49*Pi]
33max deviation from tanf(x): 3.8147e-06 at -0.490000009841*Pi, max deviation from cephes_tan(x):
349.53674e-07
35 ->> precision OK for the tan_ps <<-
36
37checking cot on [0.2*Pi, 0.7*Pi]
38max deviation from cotf(x): 1.19209e-07 at 0.204303119606*Pi, max deviation from cephes_cot(x):
391.19209e-07
40 ->> precision OK for the cot_ps <<-
41
42checking cot on [0.01*Pi, 0.99*Pi]
43max deviation from cotf(x): 3.8147e-06 at 0.987876517942*Pi, max deviation from cephes_cot(x):
449.53674e-07
45 ->> precision OK for the cot_ps <<-
46
47With atan_ps and atan2_ps you get pretty good precision, atan_ps max deviation is < 2e-7 and
48atan2_ps max deviation is < 2.5e-7
49*/
50
51/* Copyright (C) 2016 Tolga Mizrak
52
53 This software is provided 'as-is', without any express or implied
54 warranty. In no event will the authors be held liable for any damages
55 arising from the use of this software.
56
57 Permission is granted to anyone to use this software for any purpose,
58 including commercial applications, and to alter it and redistribute it
59 freely, subject to the following restrictions:
60
61 1. The origin of this software must not be misrepresented; you must not
62 claim that you wrote the original software. If you use this software
63 in a product, an acknowledgment in the product documentation would be
64 appreciated but is not required.
65 2. Altered source versions must be plainly marked as such, and must not be
66 misrepresented as being the original software.
67 3. This notice may not be removed or altered from any source distribution.
68
69 (this is the zlib license)
70*/
71#pragma once
72#include "sse_mathfun.h"
73
74
75inline __m128 sse_mathfun_tancot_ps(__m128 x, int cotFlag) {
76 __m128 p0 = _mm_set_ps1(9.38540185543E-3);
77 __m128 p1 = _mm_set_ps1(3.11992232697E-3);
78 __m128 p2 = _mm_set_ps1(2.44301354525E-2);
79 __m128 p3 = _mm_set_ps1(5.34112807005E-2);
80 __m128 p4 = _mm_set_ps1(1.33387994085E-1);
81 __m128 p5 = _mm_set_ps1(3.33331568548E-1);
82
83 __m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
84
85 __m128i emm2;
86 sign_bit = x;
87 /* take the absolute value */
88 __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
89 __m128 inv_sign_mask = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));
90 x = _mm_and_ps(x, inv_sign_mask);
91 /* extract the sign bit (upper one) */
92 sign_bit = _mm_and_ps(sign_bit, sign_mask);
93
94 /* scale by 4/Pi */
95 __m128 cephes_FOPI = _mm_set_ps1(1.27323954473516);
96 y = _mm_mul_ps(x, cephes_FOPI);
97
98 /* store the integer part of y in mm0 */
99 emm2 = _mm_cvttps_epi32(y);
100 /* j=(j+1) & (~1) (see the cephes sources) */
101 emm2 = _mm_add_epi32(emm2, _mm_set1_epi32(1));
102 emm2 = _mm_and_si128(emm2, _mm_set1_epi32(~1));
103 y = _mm_cvtepi32_ps(emm2);
104
105 emm2 = _mm_and_si128(emm2, _mm_set1_epi32(2));
106 emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
107
108 __m128 poly_mask = _mm_castsi128_ps(emm2);
109 /* The magic pass: "Extended precision modular arithmetic"
110 x = ((x - y * DP1) - y * DP2) - y * DP3; */
111 __m128 minus_cephes_DP1 = _mm_set_ps1(-0.78515625);
112 __m128 minus_cephes_DP2 = _mm_set_ps1(-2.4187564849853515625e-4);
113 __m128 minus_cephes_DP3 = _mm_set_ps1(-3.77489497744594108e-8);
114 xmm1 = minus_cephes_DP1;
115 xmm2 = minus_cephes_DP2;
116 xmm3 = minus_cephes_DP3;
117 xmm1 = _mm_mul_ps(y, xmm1);
118 xmm2 = _mm_mul_ps(y, xmm2);
119 xmm3 = _mm_mul_ps(y, xmm3);
120 __m128 z = _mm_add_ps(x, xmm1);
121 z = _mm_add_ps(z, xmm2);
122 z = _mm_add_ps(z, xmm3);
123
124 __m128 zz = _mm_mul_ps(z, z);
125
126 y = p0;
127 y = _mm_mul_ps(y, zz);
128 y = _mm_add_ps(y, p1);
129 y = _mm_mul_ps(y, zz);
130 y = _mm_add_ps(y, p2);
131 y = _mm_mul_ps(y, zz);
132 y = _mm_add_ps(y, p3);
133 y = _mm_mul_ps(y, zz);
134 y = _mm_add_ps(y, p4);
135 y = _mm_mul_ps(y, zz);
136 y = _mm_add_ps(y, p5);
137 y = _mm_mul_ps(y, zz);
138 y = _mm_mul_ps(y, z);
139 y = _mm_add_ps(y, z);
140
141 __m128 y2;
142 if (cotFlag) {
143 y2 = _mm_xor_ps(y, sign_mask);
144 /* y = _mm_rcp_ps(y); */
145 /* using _mm_rcp_ps here loses on way too much precision, better to do a div */
146 y = _mm_div_ps(_mm_set_ps1(1.f), y);
147 }
148 else {
149 /* y2 = _mm_rcp_ps(y); */
150 /* using _mm_rcp_ps here loses on way too much precision, better to do a div */
151 y2 = _mm_div_ps(_mm_set_ps1(1.f), y);
152 y2 = _mm_xor_ps(y2, sign_mask);
153 }
154
155 /* select the correct result from the two polynoms */
156 xmm3 = poly_mask;
157 y = _mm_and_ps(xmm3, y);
158 y2 = _mm_andnot_ps(xmm3, y2);
159 y = _mm_or_ps(y, y2);
160
161 /* update the sign */
162 y = _mm_xor_ps(y, sign_bit);
163
164 return y;
165}
166
167inline __m128 sse_mathfun_tan_ps(__m128 x) {
168 return sse_mathfun_tancot_ps(x, 0);
169}
170
171inline __m128 sse_mathfun_cot_ps(__m128 x) {
172 return sse_mathfun_tancot_ps(x, 1);
173}
174
175
176inline __m128 sse_mathfun_atan_ps(__m128 x) {
177 __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
178 __m128 inv_sign_mask = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));
179
180 __m128 atanrange_hi = _mm_set_ps1(2.414213562373095);
181 __m128 atanrange_lo = _mm_set_ps1(0.4142135623730950);
182 __m128 cephes_PIO2F = _mm_set_ps1(1.5707963267948966192);
183 __m128 cephes_PIO4F = _mm_set_ps1(0.7853981633974483096);
184
185 __m128 atancof_p0 = _mm_set_ps1(8.05374449538e-2);
186 __m128 atancof_p1 = _mm_set_ps1(1.38776856032E-1);
187 __m128 atancof_p2 = _mm_set_ps1(1.99777106478E-1);
188 __m128 atancof_p3 = _mm_set_ps1(3.33329491539E-1);
189
190 __m128 sign_bit, y;
191
192 sign_bit = x;
193 /* take the absolute value */
194 x = _mm_and_ps(x, inv_sign_mask);
195 /* extract the sign bit (upper one) */
196 sign_bit = _mm_and_ps(sign_bit, sign_mask);
197
198 /* range reduction, init x and y depending on range */
199 /* x > 2.414213562373095 */
200 __m128 cmp0 = _mm_cmpgt_ps(x, atanrange_hi);
201 /* x > 0.4142135623730950 */
202 __m128 cmp1 = _mm_cmpgt_ps(x, atanrange_lo);
203
204 /* x > 0.4142135623730950 && !(x > 2.414213562373095) */
205 __m128 cmp2 = _mm_andnot_ps(cmp0, cmp1);
206
207 /* -(1.0/x) */
208 __m128 y0 = _mm_and_ps(cmp0, cephes_PIO2F);
209 __m128 x0 = _mm_div_ps(_mm_set_ps1(1.f), x);
210 x0 = _mm_xor_ps(x0, sign_mask);
211
212 __m128 y1 = _mm_and_ps(cmp2, cephes_PIO4F);
213 /* (x-1.0)/(x+1.0) */
214 __m128 x1_o = _mm_sub_ps(x, _mm_set_ps1(1.f));
215 __m128 x1_u = _mm_add_ps(x, _mm_set_ps1(1.f));
216 __m128 x1 = _mm_div_ps(x1_o, x1_u);
217
218 __m128 x2 = _mm_and_ps(cmp2, x1);
219 x0 = _mm_and_ps(cmp0, x0);
220 x2 = _mm_or_ps(x2, x0);
221 cmp1 = _mm_or_ps(cmp0, cmp2);
222 x2 = _mm_and_ps(cmp1, x2);
223 x = _mm_andnot_ps(cmp1, x);
224 x = _mm_or_ps(x2, x);
225
226 y = _mm_or_ps(y0, y1);
227
228 __m128 zz = _mm_mul_ps(x, x);
229 __m128 acc = atancof_p0;
230 acc = _mm_mul_ps(acc, zz);
231 acc = _mm_sub_ps(acc, atancof_p1);
232 acc = _mm_mul_ps(acc, zz);
233 acc = _mm_add_ps(acc, atancof_p2);
234 acc = _mm_mul_ps(acc, zz);
235 acc = _mm_sub_ps(acc, atancof_p3);
236 acc = _mm_mul_ps(acc, zz);
237 acc = _mm_mul_ps(acc, x);
238 acc = _mm_add_ps(acc, x);
239 y = _mm_add_ps(y, acc);
240
241 /* update the sign */
242 y = _mm_xor_ps(y, sign_bit);
243
244 return y;
245}
246
247inline __m128 sse_mathfun_atan2_ps(__m128 y, __m128 x) {
248 __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
249 __m128 x_eq_0 = _mm_cmpeq_ps(x, _mm_setzero_ps());
250 __m128 x_gt_0 = _mm_cmpgt_ps(x, _mm_setzero_ps());
251 __m128 x_le_0 = _mm_cmple_ps(x, _mm_setzero_ps());
252 __m128 y_eq_0 = _mm_cmpeq_ps(y, _mm_setzero_ps());
253 __m128 x_lt_0 = _mm_cmplt_ps(x, _mm_setzero_ps());
254 __m128 y_lt_0 = _mm_cmplt_ps(y, _mm_setzero_ps());
255 __m128 cephes_PIF = _mm_set_ps1(3.141592653589793238);
256 __m128 cephes_PIO2F = _mm_set_ps1(1.5707963267948966192);
257
258 __m128 zero_mask = _mm_and_ps(x_eq_0, y_eq_0);
259 __m128 zero_mask_other_case = _mm_and_ps(y_eq_0, x_gt_0);
260 zero_mask = _mm_or_ps(zero_mask, zero_mask_other_case);
261
262 __m128 pio2_mask = _mm_andnot_ps(y_eq_0, x_eq_0);
263 __m128 pio2_mask_sign = _mm_and_ps(y_lt_0, sign_mask);
264 __m128 pio2_result = cephes_PIO2F;
265 pio2_result = _mm_xor_ps(pio2_result, pio2_mask_sign);
266 pio2_result = _mm_and_ps(pio2_mask, pio2_result);
267
268 __m128 pi_mask = _mm_and_ps(y_eq_0, x_le_0);
269 __m128 pi = cephes_PIF;
270 __m128 pi_result = _mm_and_ps(pi_mask, pi);
271
272 __m128 swap_sign_mask_offset = _mm_and_ps(x_lt_0, y_lt_0);
273 swap_sign_mask_offset = _mm_and_ps(swap_sign_mask_offset, sign_mask);
274
275 __m128 offset0 = _mm_setzero_ps();
276 __m128 offset1 = cephes_PIF;
277 offset1 = _mm_xor_ps(offset1, swap_sign_mask_offset);
278
279 __m128 offset = _mm_andnot_ps(x_lt_0, offset0);
280 offset = _mm_and_ps(x_lt_0, offset1);
281
282 __m128 arg = _mm_div_ps(y, x);
283 __m128 atan_result = sse_mathfun_atan_ps(arg);
284 atan_result = _mm_add_ps(atan_result, offset);
285
286 /* select between zero_result, pio2_result and atan_result */
287
288 __m128 result = _mm_andnot_ps(zero_mask, pio2_result);
289 atan_result = _mm_andnot_ps(pio2_mask, atan_result);
290 atan_result = _mm_andnot_ps(pio2_mask, atan_result);
291 result = _mm_or_ps(result, atan_result);
292 result = _mm_or_ps(result, pi_result);
293
294 return result;
295}
__m128 sse_mathfun_tancot_ps(__m128 x, int cotFlag)
Definition sse_mathfun_extension.h:75
__m128 sse_mathfun_atan_ps(__m128 x)
Definition sse_mathfun_extension.h:176
__m128 sse_mathfun_cot_ps(__m128 x)
Definition sse_mathfun_extension.h:171
__m128 sse_mathfun_tan_ps(__m128 x)
Definition sse_mathfun_extension.h:167
__m128 sse_mathfun_atan2_ps(__m128 y, __m128 x)
Definition sse_mathfun_extension.h:247