GNU Radio Manual and C++ API Reference  3.7.6.1
The Free & Open Software Radio Ecosystem
volk_32f_log2_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*
24  * This kernel was adapted from Jose Fonseca's Fast SSE2 log implementation
25  * http://jrfonseca.blogspot.in/2008/09/fast-sse2-pow-tables-or-polynomials.htm
26  *
27  * Permission is hereby granted, free of charge, to any person obtaining a
28  * copy of this software and associated documentation files (the
29  * "Software"), to deal in the Software without restriction, including
30  * without limitation the rights to use, copy, modify, merge, publish,
31  * distribute, sub license, and/or sell copies of the Software, and to
32  * permit persons to whom the Software is furnished to do so, subject to
33  * the following conditions:
34  *
35  * The above copyright notice and this permission notice (including the
36  * next paragraph) shall be included in all copies or substantial portions
37  * of the Software.
38  *
39  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
40  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
41  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
42  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
43  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
44  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
45  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
46 
47  * This is the MIT License (MIT)
48 
49 */
50 
51 
52 #include <stdio.h>
53 #include <stdlib.h>
54 #include <inttypes.h>
55 #include <math.h>
56 
57 #define POLY0(x, c0) _mm_set1_ps(c0)
58 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
59 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
60 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
61 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
62 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
63 
64 #define LOG_POLY_DEGREE 6
65 
66 
67 #ifndef INCLUDED_volk_32f_log2_32f_a_H
68 #define INCLUDED_volk_32f_log2_32f_a_H
69 
70 #ifdef LV_HAVE_GENERIC
71 /*!
72  \brief Computes base 2 log of input vector and stores results in output vector
73  \param bVector The vector where results will be stored
74  \param aVector The input vector of floats
75  \param num_points Number of points for which log is to be computed
76 */
77 static inline void volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points){
78  float* bPtr = bVector;
79  const float* aPtr = aVector;
80  unsigned int number = 0;
81 
82  for(number = 0; number < num_points; number++){
83  *bPtr++ = log2(*aPtr++);
84  }
85 
86 }
87 #endif /* LV_HAVE_GENERIC */
88 
89 
90 #ifdef LV_HAVE_SSE4_1
91 #include <smmintrin.h>
92 /*!
93  \brief Computes base 2 log of input vector and stores results in output vector
94  \param bVector The vector where results will be stored
95  \param aVector The input vector of floats
96  \param num_points Number of points for which log is to be computed
97 */
98 static inline void volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points){
99 
100  float* bPtr = bVector;
101  const float* aPtr = aVector;
102 
103  unsigned int number = 0;
104  const unsigned int quarterPoints = num_points / 4;
105 
106  __m128 aVal, bVal, mantissa, frac, leadingOne;
107  __m128i bias, exp;
108 
109  for(;number < quarterPoints; number++){
110 
111  aVal = _mm_load_ps(aPtr);
112  bias = _mm_set1_epi32(127);
113  leadingOne = _mm_set1_ps(1.0f);
114  exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
115  bVal = _mm_cvtepi32_ps(exp);
116 
117  // Now to extract mantissa
118  frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
119 
120  #if LOG_POLY_DEGREE == 6
121  mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
122  #elif LOG_POLY_DEGREE == 5
123  mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
124  #elif LOG_POLY_DEGREE == 4
125  mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
126  #elif LOG_POLY_DEGREE == 3
127  mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
128  #else
129  #error
130  #endif
131 
132  bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
133  _mm_store_ps(bPtr, bVal);
134 
135 
136  aPtr += 4;
137  bPtr += 4;
138  }
139 
140  number = quarterPoints * 4;
141  for(;number < num_points; number++){
142  *bPtr++ = log2(*aPtr++);
143  }
144 }
145 
146 #endif /* LV_HAVE_SSE4_1 for aligned */
147 
148 
149 #ifdef LV_HAVE_NEON
150 #include <arm_neon.h>
151 
152 /* these macros allow us to embed logs in other kernels */
153 #define VLOG2Q_NEON_PREAMBLE() \
154  int32x4_t one = vdupq_n_s32(0x000800000); \
155  /* minimax polynomial */ \
156  float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
157  float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
158  float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
159  float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
160  float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
161  float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
162  float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
163  int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
164  int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
165  int32x4_t exp_bias = vdupq_n_s32(127);
166 
167 
168 #define VLOG2Q_NEON_F32(log2_approx, aval) \
169  int32x4_t exponent_i = vandq_s32(aval, exp_mask); \
170  int32x4_t significand_i = vandq_s32(aval, sig_mask); \
171  exponent_i = vshrq_n_s32(exponent_i, 23); \
172  \
173  /* extract the exponent and significand \
174  we can treat this as fixed point to save ~9% on the \
175  conversion + float add */ \
176  significand_i = vorrq_s32(one, significand_i); \
177  float32x4_t significand_f = vcvtq_n_f32_s32(significand_i,23); \
178  /* debias the exponent and convert to float */ \
179  exponent_i = vsubq_s32(exponent_i, exp_bias); \
180  float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
181  \
182  /* put the significand through a polynomial fit of log2(x) [1,2]\
183  add the result to the exponent */ \
184  log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \
185  float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \
186  log2_approx = vaddq_f32(log2_approx, tmp1); \
187  float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \
188  tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \
189  log2_approx = vaddq_f32(log2_approx, tmp1); \
190  \
191  float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \
192  tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \
193  log2_approx = vaddq_f32(log2_approx, tmp1); \
194  float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \
195  tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \
196  log2_approx = vaddq_f32(log2_approx, tmp1); \
197  float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \
198  tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \
199  log2_approx = vaddq_f32(log2_approx, tmp1); \
200  float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \
201  tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \
202  log2_approx = vaddq_f32(log2_approx, tmp1);
203 
204 /*!
205  \brief Computes base 2 log of input vector and stores results in output vector
206  \param bVector The vector where results will be stored
207  \param aVector The input vector of floats
208  \param num_points Number of points for which log is to be computed
209 */
210 static inline void volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points){
211  float* bPtr = bVector;
212  const float* aPtr = aVector;
213  unsigned int number;
214  const unsigned int quarterPoints = num_points / 4;
215 
216  int32x4_t aval;
217  float32x4_t log2_approx;
218 
219  VLOG2Q_NEON_PREAMBLE()
220  // lms
221  //p0 = vdupq_n_f32(-1.649132280361871);
222  //p1 = vdupq_n_f32(1.995047138579499);
223  //p2 = vdupq_n_f32(-0.336914839219728);
224 
225  // keep in mind a single precision float is represented as
226  // (-1)^sign * 2^exp * 1.significand, so the log2 is
227  // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23)
228  for(number = 0; number < quarterPoints; ++number){
229  // load float in to an int register without conversion
230  aval = vld1q_s32((int*)aPtr);
231 
232  VLOG2Q_NEON_F32(log2_approx, aval)
233 
234  vst1q_f32(bPtr, log2_approx);
235 
236  aPtr += 4;
237  bPtr += 4;
238  }
239 
240  for(number = quarterPoints * 4; number < num_points; number++){
241  *bPtr++ = log2(*aPtr++);
242  }
243 }
244 
245 #endif /* LV_HAVE_NEON */
246 
247 
248 #endif /* INCLUDED_volk_32f_log2_32f_a_H */
249 
250 #ifndef INCLUDED_volk_32f_log2_32f_u_H
251 #define INCLUDED_volk_32f_log2_32f_u_H
252 
253 
254 #ifdef LV_HAVE_GENERIC
255 /*!
256  \brief Computes base 2 log of input vector and stores results in output vector
257  \param bVector The vector where results will be stored
258  \param aVector The input vector of floats
259  \param num_points Number of points for which log is to be computed
260 */
261 static inline void volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points){
262  float* bPtr = bVector;
263  const float* aPtr = aVector;
264  unsigned int number = 0;
265 
266  for(number = 0; number < num_points; number++){
267  *bPtr++ = log2(*aPtr++);
268  }
269 
270 }
271 #endif /* LV_HAVE_GENERIC */
272 
273 
274 #ifdef LV_HAVE_SSE4_1
275 #include <smmintrin.h>
276 /*!
277  \brief Computes base 2 log of input vector and stores results in output vector
278  \param bVector The vector where results will be stored
279  \param aVector The input vector of floats
280  \param num_points Number of points for which log is to be computed
281 */
282 static inline void volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points){
283 
284  float* bPtr = bVector;
285  const float* aPtr = aVector;
286 
287  unsigned int number = 0;
288  const unsigned int quarterPoints = num_points / 4;
289 
290  __m128 aVal, bVal, mantissa, frac, leadingOne;
291  __m128i bias, exp;
292 
293  for(;number < quarterPoints; number++){
294 
295  aVal = _mm_loadu_ps(aPtr);
296  bias = _mm_set1_epi32(127);
297  leadingOne = _mm_set1_ps(1.0f);
298  exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
299  bVal = _mm_cvtepi32_ps(exp);
300 
301  // Now to extract mantissa
302  frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
303 
304  #if LOG_POLY_DEGREE == 6
305  mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
306  #elif LOG_POLY_DEGREE == 5
307  mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
308  #elif LOG_POLY_DEGREE == 4
309  mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
310  #elif LOG_POLY_DEGREE == 3
311  mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
312  #else
313  #error
314  #endif
315 
316  bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
317  _mm_storeu_ps(bPtr, bVal);
318 
319 
320  aPtr += 4;
321  bPtr += 4;
322  }
323 
324  number = quarterPoints * 4;
325  for(;number < num_points; number++){
326  *bPtr++ = log2(*aPtr++);
327  }
328 }
329 
330 #endif /* LV_HAVE_SSE4_1 for unaligned */
331 
332 #endif /* INCLUDED_volk_32f_log2_32f_u_H */
#define POLY3(x, c0, c1, c2, c3)
Definition: volk_32f_log2_32f.h:60
#define POLY4(x, c0, c1, c2, c3, c4)
Definition: volk_32f_log2_32f.h:61
#define POLY2(x, c0, c1, c2)
Definition: volk_32f_log2_32f.h:59
#define POLY5(x, c0, c1, c2, c3, c4, c5)
Definition: volk_32f_log2_32f.h:62