28 #define POLY0(x, c0) _mm_set1_ps(c0)
29 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
30 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
31 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
32 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
33 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
35 #define LOG_POLY_DEGREE 3
37 #ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
38 #define INCLUDED_volk_32f_x2_pow_32f_a_H
41 #include <smmintrin.h>
49 static inline void volk_32f_x2_pow_32f_a_sse4_1(
float* cVector,
const float* bVector,
const float* aVector,
unsigned int num_points){
51 float* cPtr = cVector;
52 const float* bPtr = bVector;
53 const float* aPtr = aVector;
55 unsigned int number = 0;
56 const unsigned int quarterPoints = num_points / 4;
58 __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
59 __m128 tmp, fx, mask, pow2n, z, y;
60 __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
61 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
62 __m128i bias, exp, emm0, pi32_0x7f;
64 one = _mm_set1_ps(1.0);
65 exp_hi = _mm_set1_ps(88.3762626647949);
66 exp_lo = _mm_set1_ps(-88.3762626647949);
67 ln2 = _mm_set1_ps(0.6931471805);
68 log2EF = _mm_set1_ps(1.44269504088896341);
69 half = _mm_set1_ps(0.5);
70 exp_C1 = _mm_set1_ps(0.693359375);
71 exp_C2 = _mm_set1_ps(-2.12194440e-4);
72 pi32_0x7f = _mm_set1_epi32(0x7f);
74 exp_p0 = _mm_set1_ps(1.9875691500e-4);
75 exp_p1 = _mm_set1_ps(1.3981999507e-3);
76 exp_p2 = _mm_set1_ps(8.3334519073e-3);
77 exp_p3 = _mm_set1_ps(4.1665795894e-2);
78 exp_p4 = _mm_set1_ps(1.6666665459e-1);
79 exp_p5 = _mm_set1_ps(5.0000001201e-1);
81 for(;number < quarterPoints; number++){
83 aVal = _mm_load_ps(aPtr);
84 bias = _mm_set1_epi32(127);
85 leadingOne = _mm_set1_ps(1.0f);
86 exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
87 logarithm = _mm_cvtepi32_ps(exp);
89 frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
91 #if LOG_POLY_DEGREE == 6
92 mantissa =
POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
93 #elif LOG_POLY_DEGREE == 5
94 mantissa =
POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
95 #elif LOG_POLY_DEGREE == 4
96 mantissa =
POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
97 #elif LOG_POLY_DEGREE == 3
98 mantissa =
POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
103 logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
104 logarithm = _mm_mul_ps(logarithm, ln2);
108 bVal = _mm_load_ps(bPtr);
109 bVal = _mm_mul_ps(bVal, logarithm);
112 tmp = _mm_setzero_ps();
114 bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
116 fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
118 emm0 = _mm_cvttps_epi32(fx);
119 tmp = _mm_cvtepi32_ps(emm0);
121 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
122 fx = _mm_sub_ps(tmp, mask);
124 tmp = _mm_mul_ps(fx, exp_C1);
125 z = _mm_mul_ps(fx, exp_C2);
126 bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
127 z = _mm_mul_ps(bVal, bVal);
129 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
130 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
131 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
132 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
133 y = _mm_add_ps(y, one);
135 emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
137 pow2n = _mm_castsi128_ps(emm0);
138 cVal = _mm_mul_ps(y, pow2n);
140 _mm_store_ps(cPtr, cVal);
147 number = quarterPoints * 4;
148 for(;number < num_points; number++){
149 *cPtr++ = pow(*aPtr++, *bPtr++);
157 #ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
158 #define INCLUDED_volk_32f_x2_pow_32f_u_H
160 #ifdef LV_HAVE_GENERIC
168 static inline void volk_32f_x2_pow_32f_generic(
float* cVector,
const float* bVector,
const float* aVector,
unsigned int num_points){
169 float* cPtr = cVector;
170 const float* bPtr = bVector;
171 const float* aPtr = aVector;
172 unsigned int number = 0;
174 for(number = 0; number < num_points; number++){
175 *cPtr++ = pow(*aPtr++, *bPtr++);
182 #ifdef LV_HAVE_SSE4_1
183 #include <smmintrin.h>
191 static inline void volk_32f_x2_pow_32f_u_sse4_1(
float* cVector,
const float* bVector,
const float* aVector,
unsigned int num_points){
193 float* cPtr = cVector;
194 const float* bPtr = bVector;
195 const float* aPtr = aVector;
197 unsigned int number = 0;
198 const unsigned int quarterPoints = num_points / 4;
200 __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
201 __m128 tmp, fx, mask, pow2n, z, y;
202 __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
203 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
204 __m128i bias, exp, emm0, pi32_0x7f;
206 one = _mm_set1_ps(1.0);
207 exp_hi = _mm_set1_ps(88.3762626647949);
208 exp_lo = _mm_set1_ps(-88.3762626647949);
209 ln2 = _mm_set1_ps(0.6931471805);
210 log2EF = _mm_set1_ps(1.44269504088896341);
211 half = _mm_set1_ps(0.5);
212 exp_C1 = _mm_set1_ps(0.693359375);
213 exp_C2 = _mm_set1_ps(-2.12194440e-4);
214 pi32_0x7f = _mm_set1_epi32(0x7f);
216 exp_p0 = _mm_set1_ps(1.9875691500e-4);
217 exp_p1 = _mm_set1_ps(1.3981999507e-3);
218 exp_p2 = _mm_set1_ps(8.3334519073e-3);
219 exp_p3 = _mm_set1_ps(4.1665795894e-2);
220 exp_p4 = _mm_set1_ps(1.6666665459e-1);
221 exp_p5 = _mm_set1_ps(5.0000001201e-1);
223 for(;number < quarterPoints; number++){
226 aVal = _mm_loadu_ps(aPtr);
227 bias = _mm_set1_epi32(127);
228 leadingOne = _mm_set1_ps(1.0f);
229 exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
230 logarithm = _mm_cvtepi32_ps(exp);
232 frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
234 #if LOG_POLY_DEGREE == 6
235 mantissa =
POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
236 #elif LOG_POLY_DEGREE == 5
237 mantissa =
POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
238 #elif LOG_POLY_DEGREE == 4
239 mantissa =
POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
240 #elif LOG_POLY_DEGREE == 3
241 mantissa =
POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
246 logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
247 logarithm = _mm_mul_ps(logarithm, ln2);
251 bVal = _mm_loadu_ps(bPtr);
252 bVal = _mm_mul_ps(bVal, logarithm);
255 tmp = _mm_setzero_ps();
257 bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
259 fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
261 emm0 = _mm_cvttps_epi32(fx);
262 tmp = _mm_cvtepi32_ps(emm0);
264 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
265 fx = _mm_sub_ps(tmp, mask);
267 tmp = _mm_mul_ps(fx, exp_C1);
268 z = _mm_mul_ps(fx, exp_C2);
269 bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
270 z = _mm_mul_ps(bVal, bVal);
272 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
273 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
274 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
275 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
276 y = _mm_add_ps(y, one);
278 emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
280 pow2n = _mm_castsi128_ps(emm0);
281 cVal = _mm_mul_ps(y, pow2n);
283 _mm_storeu_ps(cPtr, cVal);
290 number = quarterPoints * 4;
291 for(;number < num_points; number++){
292 *cPtr++ = pow(*aPtr++, *bPtr++);
#define POLY3(x, c0, c1, c2, c3)
Definition: volk_32f_x2_pow_32f.h:31
#define POLY5(x, c0, c1, c2, c3, c4, c5)
Definition: volk_32f_x2_pow_32f.h:33
#define POLY2(x, c0, c1, c2)
Definition: volk_32f_x2_pow_32f.h:30
#define POLY4(x, c0, c1, c2, c3, c4)
Definition: volk_32f_x2_pow_32f.h:32