GNU Radio Manual and C++ API Reference  3.7.6.1
The Free & Open Software Radio Ecosystem
volk_32f_x2_pow_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <inttypes.h>
26 #include <math.h>
27 
28 #define POLY0(x, c0) _mm_set1_ps(c0)
29 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
30 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
31 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
32 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
33 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
34 
35 #define LOG_POLY_DEGREE 3
36 
37 #ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
38 #define INCLUDED_volk_32f_x2_pow_32f_a_H
39 
40 #ifdef LV_HAVE_SSE4_1
41 #include <smmintrin.h>
42 /*!
43  \brief Computes pow(x,y) by using exp and log
44  \param cVector The vector where results will be stored
45  \param aVector The input vector of bases
46  \param bVector The input vector of indices
47  \param num_points Number of points for which pow is to be computed
48 */
49 static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* bVector, const float* aVector, unsigned int num_points){
50 
51  float* cPtr = cVector;
52  const float* bPtr = bVector;
53  const float* aPtr = aVector;
54 
55  unsigned int number = 0;
56  const unsigned int quarterPoints = num_points / 4;
57 
58  __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
59  __m128 tmp, fx, mask, pow2n, z, y;
60  __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
61  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
62  __m128i bias, exp, emm0, pi32_0x7f;
63 
64  one = _mm_set1_ps(1.0);
65  exp_hi = _mm_set1_ps(88.3762626647949);
66  exp_lo = _mm_set1_ps(-88.3762626647949);
67  ln2 = _mm_set1_ps(0.6931471805);
68  log2EF = _mm_set1_ps(1.44269504088896341);
69  half = _mm_set1_ps(0.5);
70  exp_C1 = _mm_set1_ps(0.693359375);
71  exp_C2 = _mm_set1_ps(-2.12194440e-4);
72  pi32_0x7f = _mm_set1_epi32(0x7f);
73 
74  exp_p0 = _mm_set1_ps(1.9875691500e-4);
75  exp_p1 = _mm_set1_ps(1.3981999507e-3);
76  exp_p2 = _mm_set1_ps(8.3334519073e-3);
77  exp_p3 = _mm_set1_ps(4.1665795894e-2);
78  exp_p4 = _mm_set1_ps(1.6666665459e-1);
79  exp_p5 = _mm_set1_ps(5.0000001201e-1);
80 
81  for(;number < quarterPoints; number++){
82  // First compute the logarithm
83  aVal = _mm_load_ps(aPtr);
84  bias = _mm_set1_epi32(127);
85  leadingOne = _mm_set1_ps(1.0f);
86  exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
87  logarithm = _mm_cvtepi32_ps(exp);
88 
89  frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
90 
91  #if LOG_POLY_DEGREE == 6
92  mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
93  #elif LOG_POLY_DEGREE == 5
94  mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
95  #elif LOG_POLY_DEGREE == 4
96  mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
97  #elif LOG_POLY_DEGREE == 3
98  mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
99  #else
100  #error
101  #endif
102 
103  logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
104  logarithm = _mm_mul_ps(logarithm, ln2);
105 
106 
107  // Now calculate b*lna
108  bVal = _mm_load_ps(bPtr);
109  bVal = _mm_mul_ps(bVal, logarithm);
110 
111  // Now compute exp(b*lna)
112  tmp = _mm_setzero_ps();
113 
114  bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
115 
116  fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
117 
118  emm0 = _mm_cvttps_epi32(fx);
119  tmp = _mm_cvtepi32_ps(emm0);
120 
121  mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
122  fx = _mm_sub_ps(tmp, mask);
123 
124  tmp = _mm_mul_ps(fx, exp_C1);
125  z = _mm_mul_ps(fx, exp_C2);
126  bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
127  z = _mm_mul_ps(bVal, bVal);
128 
129  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
130  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
131  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
132  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
133  y = _mm_add_ps(y, one);
134 
135  emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
136 
137  pow2n = _mm_castsi128_ps(emm0);
138  cVal = _mm_mul_ps(y, pow2n);
139 
140  _mm_store_ps(cPtr, cVal);
141 
142  aPtr += 4;
143  bPtr += 4;
144  cPtr += 4;
145  }
146 
147  number = quarterPoints * 4;
148  for(;number < num_points; number++){
149  *cPtr++ = pow(*aPtr++, *bPtr++);
150  }
151 }
152 
153 #endif /* LV_HAVE_SSE4_1 for aligned */
154 
155 #endif /* INCLUDED_volk_32f_x2_pow_32f_a_H */
156 
157 #ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
158 #define INCLUDED_volk_32f_x2_pow_32f_u_H
159 
160 #ifdef LV_HAVE_GENERIC
161 /*!
162  \brief Computes pow(x,y) by using exp and log
163  \param cVector The vector where results will be stored
164  \param aVector The input vector of bases
165  \param bVector The input vector of indices
166  \param num_points Number of points for which pow is to be computed
167 */
168 static inline void volk_32f_x2_pow_32f_generic(float* cVector, const float* bVector, const float* aVector, unsigned int num_points){
169  float* cPtr = cVector;
170  const float* bPtr = bVector;
171  const float* aPtr = aVector;
172  unsigned int number = 0;
173 
174  for(number = 0; number < num_points; number++){
175  *cPtr++ = pow(*aPtr++, *bPtr++);
176  }
177 
178 }
179 #endif /* LV_HAVE_GENERIC */
180 
181 
182 #ifdef LV_HAVE_SSE4_1
183 #include <smmintrin.h>
184 /*!
185  \brief Computes pow(x,y) by using exp and log
186  \param cVector The vector where results will be stored
187  \param aVector The input vector of bases
188  \param bVector The input vector of indices
189  \param num_points Number of points for which pow is to be computed
190 */
191 static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector, const float* bVector, const float* aVector, unsigned int num_points){
192 
193  float* cPtr = cVector;
194  const float* bPtr = bVector;
195  const float* aPtr = aVector;
196 
197  unsigned int number = 0;
198  const unsigned int quarterPoints = num_points / 4;
199 
200  __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
201  __m128 tmp, fx, mask, pow2n, z, y;
202  __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
203  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
204  __m128i bias, exp, emm0, pi32_0x7f;
205 
206  one = _mm_set1_ps(1.0);
207  exp_hi = _mm_set1_ps(88.3762626647949);
208  exp_lo = _mm_set1_ps(-88.3762626647949);
209  ln2 = _mm_set1_ps(0.6931471805);
210  log2EF = _mm_set1_ps(1.44269504088896341);
211  half = _mm_set1_ps(0.5);
212  exp_C1 = _mm_set1_ps(0.693359375);
213  exp_C2 = _mm_set1_ps(-2.12194440e-4);
214  pi32_0x7f = _mm_set1_epi32(0x7f);
215 
216  exp_p0 = _mm_set1_ps(1.9875691500e-4);
217  exp_p1 = _mm_set1_ps(1.3981999507e-3);
218  exp_p2 = _mm_set1_ps(8.3334519073e-3);
219  exp_p3 = _mm_set1_ps(4.1665795894e-2);
220  exp_p4 = _mm_set1_ps(1.6666665459e-1);
221  exp_p5 = _mm_set1_ps(5.0000001201e-1);
222 
223  for(;number < quarterPoints; number++){
224 
225  // First compute the logarithm
226  aVal = _mm_loadu_ps(aPtr);
227  bias = _mm_set1_epi32(127);
228  leadingOne = _mm_set1_ps(1.0f);
229  exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
230  logarithm = _mm_cvtepi32_ps(exp);
231 
232  frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
233 
234  #if LOG_POLY_DEGREE == 6
235  mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
236  #elif LOG_POLY_DEGREE == 5
237  mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
238  #elif LOG_POLY_DEGREE == 4
239  mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
240  #elif LOG_POLY_DEGREE == 3
241  mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
242  #else
243  #error
244  #endif
245 
246  logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
247  logarithm = _mm_mul_ps(logarithm, ln2);
248 
249 
250  // Now calculate b*lna
251  bVal = _mm_loadu_ps(bPtr);
252  bVal = _mm_mul_ps(bVal, logarithm);
253 
254  // Now compute exp(b*lna)
255  tmp = _mm_setzero_ps();
256 
257  bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
258 
259  fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
260 
261  emm0 = _mm_cvttps_epi32(fx);
262  tmp = _mm_cvtepi32_ps(emm0);
263 
264  mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
265  fx = _mm_sub_ps(tmp, mask);
266 
267  tmp = _mm_mul_ps(fx, exp_C1);
268  z = _mm_mul_ps(fx, exp_C2);
269  bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
270  z = _mm_mul_ps(bVal, bVal);
271 
272  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
273  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
274  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
275  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
276  y = _mm_add_ps(y, one);
277 
278  emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
279 
280  pow2n = _mm_castsi128_ps(emm0);
281  cVal = _mm_mul_ps(y, pow2n);
282 
283  _mm_storeu_ps(cPtr, cVal);
284 
285  aPtr += 4;
286  bPtr += 4;
287  cPtr += 4;
288  }
289 
290  number = quarterPoints * 4;
291  for(;number < num_points; number++){
292  *cPtr++ = pow(*aPtr++, *bPtr++);
293  }
294 }
295 
296 #endif /* LV_HAVE_SSE4_1 for unaligned */
297 
298 #endif /* INCLUDED_volk_32f_x2_log2_32f_u_H */
#define POLY3(x, c0, c1, c2, c3)
Definition: volk_32f_x2_pow_32f.h:31
#define POLY5(x, c0, c1, c2, c3, c4, c5)
Definition: volk_32f_x2_pow_32f.h:33
#define POLY2(x, c0, c1, c2)
Definition: volk_32f_x2_pow_32f.h:30
#define POLY4(x, c0, c1, c2, c3, c4)
Definition: volk_32f_x2_pow_32f.h:32