/*
 * Horner-scheme polynomial evaluation on a packed-float (__m128) vector.
 *
 * POLYn(x, c0, ..., cn) expands to an __m128 expression whose lanes hold
 *     c0 + x * (c1 + x * (c2 + ... + x * cn))
 * where every coefficient is broadcast to all four lanes with _mm_set1_ps.
 *
 * x is pasted into the expansion once per degree, so pass a plain variable
 * rather than an expression with side effects. POLY0 accepts x only to keep
 * the signatures uniform; it does not use it.
 */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/*
 * Selects which coefficient set the SSE kernels compile in (3, 4, 5 or 6).
 * The default is 6; it can be overridden by defining LOG_POLY_DEGREE before
 * this point (for example on the compiler command line).
 */
#ifndef LOG_POLY_DEGREE
#define LOG_POLY_DEGREE 6
#endif
67 #ifndef INCLUDED_volk_32f_log2_32f_a_H
68 #define INCLUDED_volk_32f_log2_32f_a_H
70 #ifdef LV_HAVE_GENERIC
77 static inline void volk_32f_log2_32f_generic(
float* bVector,
const float* aVector,
unsigned int num_points){
78 float* bPtr = bVector;
79 const float* aPtr = aVector;
80 unsigned int number = 0;
82 for(number = 0; number < num_points; number++){
83 *bPtr++ = log2(*aPtr++);
91 #include <smmintrin.h>
98 static inline void volk_32f_log2_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points){
100 float* bPtr = bVector;
101 const float* aPtr = aVector;
103 unsigned int number = 0;
104 const unsigned int quarterPoints = num_points / 4;
106 __m128 aVal, bVal, mantissa, frac, leadingOne;
109 for(;number < quarterPoints; number++){
111 aVal = _mm_load_ps(aPtr);
112 bias = _mm_set1_epi32(127);
113 leadingOne = _mm_set1_ps(1.0f);
114 exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
115 bVal = _mm_cvtepi32_ps(exp);
118 frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
120 #if LOG_POLY_DEGREE == 6
121 mantissa =
POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
122 #elif LOG_POLY_DEGREE == 5
123 mantissa =
POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
124 #elif LOG_POLY_DEGREE == 4
125 mantissa =
POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
126 #elif LOG_POLY_DEGREE == 3
127 mantissa =
POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
132 bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
133 _mm_store_ps(bPtr, bVal);
140 number = quarterPoints * 4;
141 for(;number < num_points; number++){
142 *bPtr++ = log2(*aPtr++);
150 #include <arm_neon.h>
/*
 * Declares, in the enclosing scope, the vector constants that
 * VLOG2Q_NEON_F32 reads: the float bit-field masks, the exponent bias, the
 * implicit leading-one bit of the significand, and the seven polynomial
 * coefficients p0..p6 (p<k> multiplies the k-th power of the significand).
 * Expand it once per scope, before the first use of VLOG2Q_NEON_F32.
 */
#define VLOG2Q_NEON_PREAMBLE() \
  /* bits 23..30: exponent field; bits 0..22: significand field */ \
  int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
  int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
  int32x4_t exp_bias = vdupq_n_s32(127); \
  /* bit 23: the leading one that is implicit in the stored significand */ \
  int32x4_t one = vdupq_n_s32(0x00800000); \
  /* polynomial coefficients, lowest order first */ \
  float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
  float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
  float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
  float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
  float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
  float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
  float32x4_t p6 = vdupq_n_f32(-0.0256910888150985);
/*
 * Computes a four-lane log2 approximation of the floats whose raw bits are
 * held in aval (an int32x4_t) and assigns it to log2_approx (a float32x4_t
 * lvalue supplied by the caller).
 *
 * The result is (exponent - 127) + p0 + p1*s + p2*s^2 + ... + p6*s^6, where
 * s is the significand with its leading one restored, obtained by reading
 * the 24-bit integer as a fixed-point number with 23 fractional bits.
 *
 * Requires the constants declared by VLOG2Q_NEON_PREAMBLE to be in scope.
 * The macro declares its own temporaries, so expand it at most once per
 * block scope.
 */
#define VLOG2Q_NEON_F32(log2_approx, aval) \
  /* split the raw bits into exponent and significand fields */ \
  int32x4_t exponent_i = vshrq_n_s32(vandq_s32(aval, exp_mask), 23); \
  int32x4_t significand_i = vorrq_s32(one, vandq_s32(aval, sig_mask)); \
  /* fixed-point (23 fractional bits) to float */ \
  float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23); \
  /* remove the bias and convert the exponent to float */ \
  exponent_i = vsubq_s32(exponent_i, exp_bias); \
  float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
  /* powers of the significand */ \
  float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); \
  float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); \
  float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); \
  float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); \
  float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); \
  /* accumulate the terms from lowest to highest order */ \
  log2_approx = vaddq_f32(exponent_f, p0); \
  log2_approx = vaddq_f32(log2_approx, vmulq_f32(significand_f, p1)); \
  log2_approx = vaddq_f32(log2_approx, vmulq_f32(sig_2, p2)); \
  log2_approx = vaddq_f32(log2_approx, vmulq_f32(sig_3, p3)); \
  log2_approx = vaddq_f32(log2_approx, vmulq_f32(sig_4, p4)); \
  log2_approx = vaddq_f32(log2_approx, vmulq_f32(sig_5, p5)); \
  log2_approx = vaddq_f32(log2_approx, vmulq_f32(sig_6, p6));
/*!
 * \brief Base-2 logarithm of a float vector using NEON, four lanes at a time.
 *
 * Each group of four inputs is loaded, reinterpreted as raw 32-bit integers
 * and handed to VLOG2Q_NEON_F32, which produces the approximation from the
 * exponent and significand bit fields. Elements left over after the last
 * full group of four are computed with log2f.
 *
 * Relies on VLOG2Q_NEON_PREAMBLE and VLOG2Q_NEON_F32 defined earlier in this
 * file.
 *
 * \param bVector    output buffer; receives num_points results
 * \param aVector    input buffer; num_points values are read
 * \param num_points number of elements to process
 */
static inline void volk_32f_log2_32f_neon(
    float* bVector,
    const float* aVector,
    unsigned int num_points){
  float* bPtr = bVector;
  const float* aPtr = aVector;
  unsigned int number;
  const unsigned int quarterPoints = num_points / 4;

  int32x4_t aval;
  float32x4_t log2_approx;

  VLOG2Q_NEON_PREAMBLE()

  for(number = 0; number < quarterPoints; ++number){
    /* Load as float and reinterpret the bits; this keeps the const
       qualifier and avoids reading float storage through an int pointer. */
    aval = vreinterpretq_s32_f32(vld1q_f32(aPtr));

    VLOG2Q_NEON_F32(log2_approx, aval)

    vst1q_f32(bPtr, log2_approx);

    aPtr += 4;
    bPtr += 4;
  }

  /* Scalar tail for the remaining num_points % 4 elements. */
  for(number = quarterPoints * 4; number < num_points; number++){
    *bPtr++ = log2f(*aPtr++);
  }
}
250 #ifndef INCLUDED_volk_32f_log2_32f_u_H
251 #define INCLUDED_volk_32f_log2_32f_u_H
254 #ifdef LV_HAVE_GENERIC
261 static inline void volk_32f_log2_32f_u_generic(
float* bVector,
const float* aVector,
unsigned int num_points){
262 float* bPtr = bVector;
263 const float* aPtr = aVector;
264 unsigned int number = 0;
266 for(number = 0; number < num_points; number++){
267 *bPtr++ = log2(*aPtr++);
274 #ifdef LV_HAVE_SSE4_1
275 #include <smmintrin.h>
282 static inline void volk_32f_log2_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points){
284 float* bPtr = bVector;
285 const float* aPtr = aVector;
287 unsigned int number = 0;
288 const unsigned int quarterPoints = num_points / 4;
290 __m128 aVal, bVal, mantissa, frac, leadingOne;
293 for(;number < quarterPoints; number++){
295 aVal = _mm_loadu_ps(aPtr);
296 bias = _mm_set1_epi32(127);
297 leadingOne = _mm_set1_ps(1.0f);
298 exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
299 bVal = _mm_cvtepi32_ps(exp);
302 frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
304 #if LOG_POLY_DEGREE == 6
305 mantissa =
POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
306 #elif LOG_POLY_DEGREE == 5
307 mantissa =
POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
308 #elif LOG_POLY_DEGREE == 4
309 mantissa =
POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
310 #elif LOG_POLY_DEGREE == 3
311 mantissa =
POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
316 bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
317 _mm_storeu_ps(bPtr, bVal);
324 number = quarterPoints * 4;
325 for(;number < num_points; number++){
326 *bPtr++ = log2(*aPtr++);
/*
 * Cross-references (from the Doxygen listing) for macros used above:
 *   POLY3(x, c0, c1, c2, c3)          - defined at volk_32f_log2_32f.h:60
 *   POLY4(x, c0, c1, c2, c3, c4)      - defined at volk_32f_log2_32f.h:61
 *   POLY2(x, c0, c1, c2)              - defined at volk_32f_log2_32f.h:59
 *   POLY5(x, c0, c1, c2, c3, c4, c5)  - defined at volk_32f_log2_32f.h:62
 */