GNU Radio Manual and C++ API Reference  3.7.6.1
The Free & Open Software Radio Ecosystem
volk_32f_tanh_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 #ifndef INCLUDED_volk_32f_tanh_32f_a_H
24 #define INCLUDED_volk_32f_tanh_32f_a_H
25 
26 #include <inttypes.h>
27 #include <stdio.h>
28 #include <math.h>
29 #include <string.h>
30 
31 #ifdef LV_HAVE_GENERIC
32 /*!
33 \brief Calculates tanh(x)
34 \param cVector The vector where the results will be stored
35 \param aVector Input vector
36 \param num_points The number of values to calulate
37 */
38 static inline void volk_32f_tanh_32f_generic(float* cVector, const float* aVector,
39  unsigned int num_points)
40 {
41  unsigned int number = 0;
42  float* cPtr = cVector;
43  const float* aPtr = aVector;
44  for(; number < num_points; number++) {
45  *cPtr++ = tanh(*aPtr++);
46  }
47 }
48 
49 #endif /* LV_HAVE_GENERIC */
50 
51 
52 #ifdef LV_HAVE_GENERIC
/*!
  \brief Calculates tanh(x) using a rational polynomial approximation.

  Inputs above 4.97 produce exactly 1 and inputs at or below -4.97 produce
  exactly -1. Everything else (including NaN, which fails both comparisons) is
  evaluated as a degree-7 polynomial in x divided by a degree-6 polynomial in x.

  NOTE(review): earlier documentation stated an accuracy of 1e-6. That does not
  appear to hold right at the saturation threshold (tanh(4.97) is roughly
  0.99990 while the result there is ~1), so treat the worst case as ~1e-4.

  \param cVector The vector where the results will be stored
  \param aVector Input vector
  \param num_points The number of values to calculate
*/
static inline void volk_32f_tanh_32f_series(float* cVector, const float* aVector,
                                            unsigned int num_points)
{
    unsigned int number = 0;
    float* cPtr = cVector;
    const float* aPtr = aVector;

    for(; number < num_points; number++) {
        /* Read the input once and advance the input pointer here, so that
           every branch below consumes exactly one input per output. Advancing
           only in the polynomial branch would re-read the same saturated
           input forever. */
        const float x = *aPtr++;

        if(x > 4.97) {
            *cPtr++ = 1;
        }
        else if(x <= -4.97) {
            *cPtr++ = -1;
        }
        else {
            float x2 = x * x;
            float a = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
            float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
            *cPtr++ = a / b;
        }
    }
}
79 
80 #endif /* LV_HAVE_GENERIC */
81 
82 
83 
84 #ifdef LV_HAVE_SSE
85 #include <xmmintrin.h>
/*!
  \brief Calculates tanh(x) using a rational polynomial approximation (SSE, aligned buffers).

  Four values are processed per iteration with aligned loads and stores
  (_mm_load_ps / _mm_store_ps); any remaining 1-3 values are handled by a
  scalar loop. In both paths inputs above 4.97 produce exactly 1 and inputs
  at or below -4.97 produce exactly -1; everything else is a degree-7
  polynomial in x divided by a degree-6 polynomial in x.

  NOTE(review): earlier documentation stated an accuracy of 1e-6. That does not
  appear to hold right at the saturation threshold (tanh(4.97) is roughly
  0.99990 while the result there is ~1), so treat the worst case as ~1e-4.

  \param cVector The vector where the results will be stored (accessed with aligned stores)
  \param aVector Input vector (accessed with aligned loads)
  \param num_points The number of values to calculate
*/
static inline void volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector,
                                           unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    __m128 aVal, cVal, x2, a, b, hiMask, loMask;
    const __m128 const1 = _mm_set_ps1(135135.0f);
    const __m128 const2 = _mm_set_ps1(17325.0f);
    const __m128 const3 = _mm_set_ps1(378.0f);
    const __m128 const4 = _mm_set_ps1(62370.0f);
    const __m128 const5 = _mm_set_ps1(3150.0f);
    const __m128 const6 = _mm_set_ps1(28.0f);
    /* 4.97f rounds to just below the double constant 4.97 used by the scalar
       loop, so the strict float comparisons "> 4.97f" and "< -4.97f" select
       the same inputs as the scalar "> 4.97" and "<= -4.97" tests. */
    const __m128 upper = _mm_set_ps1(4.97f);
    const __m128 lower = _mm_set_ps1(-4.97f);
    const __m128 posOne = _mm_set_ps1(1.0f);
    const __m128 negOne = _mm_set_ps1(-1.0f);

    for(; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        x2 = _mm_mul_ps(aVal, aVal);
        a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
        b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));

        cVal = _mm_div_ps(a, b);

        /* Saturate lanes outside the approximation's range; without this the
           quotient exceeds 1 in magnitude for large |x|. NaN inputs fail both
           comparisons and pass through as NaN, like the scalar path. */
        hiMask = _mm_cmpgt_ps(aVal, upper);
        loMask = _mm_cmplt_ps(aVal, lower);
        cVal = _mm_or_ps(_mm_and_ps(hiMask, posOne), _mm_andnot_ps(hiMask, cVal));
        cVal = _mm_or_ps(_mm_and_ps(loMask, negOne), _mm_andnot_ps(loMask, cVal));

        _mm_store_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the last num_points % 4 values. */
    number = quarterPoints * 4;
    for(; number < num_points; number++) {
        /* Consume exactly one input per output in every branch. */
        const float x = *aPtr++;

        if(x > 4.97) {
            *cPtr++ = 1;
        }
        else if(x <= -4.97) {
            *cPtr++ = -1;
        }
        else {
            float xx = x * x;
            float num = x * (135135.0f + xx * (17325.0f + xx * (378.0f + xx)));
            float den = 135135.0f + xx * (62370.0f + xx * (3150.0f + xx * 28.0f));
            *cPtr++ = num / den;
        }
    }
}
139 #endif /* LV_HAVE_SSE */
140 
141 
142 #ifdef LV_HAVE_AVX
143 #include <immintrin.h>
/*!
  \brief Calculates tanh(x) using a rational polynomial approximation (AVX, aligned buffers).

  Eight values are processed per iteration with aligned loads and stores
  (_mm256_load_ps / _mm256_store_ps); any remaining 1-7 values are handled by
  a scalar loop. In both paths inputs above 4.97 produce exactly 1 and inputs
  at or below -4.97 produce exactly -1; everything else is a degree-7
  polynomial in x divided by a degree-6 polynomial in x.

  NOTE(review): earlier documentation stated an accuracy of 1e-6. That does not
  appear to hold right at the saturation threshold (tanh(4.97) is roughly
  0.99990 while the result there is ~1), so treat the worst case as ~1e-4.

  \param cVector The vector where the results will be stored (accessed with aligned stores)
  \param aVector Input vector (accessed with aligned loads)
  \param num_points The number of values to calculate
*/
static inline void volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector,
                                           unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    __m256 aVal, cVal, x2, a, b, hiMask, loMask;
    const __m256 const1 = _mm256_set1_ps(135135.0f);
    const __m256 const2 = _mm256_set1_ps(17325.0f);
    const __m256 const3 = _mm256_set1_ps(378.0f);
    const __m256 const4 = _mm256_set1_ps(62370.0f);
    const __m256 const5 = _mm256_set1_ps(3150.0f);
    const __m256 const6 = _mm256_set1_ps(28.0f);
    /* 4.97f rounds to just below the double constant 4.97 used by the scalar
       loop, so the strict float comparisons "> 4.97f" and "< -4.97f" select
       the same inputs as the scalar "> 4.97" and "<= -4.97" tests. */
    const __m256 upper = _mm256_set1_ps(4.97f);
    const __m256 lower = _mm256_set1_ps(-4.97f);
    const __m256 posOne = _mm256_set1_ps(1.0f);
    const __m256 negOne = _mm256_set1_ps(-1.0f);

    for(; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        x2 = _mm256_mul_ps(aVal, aVal);
        a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
        b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));

        cVal = _mm256_div_ps(a, b);

        /* Saturate lanes outside the approximation's range; without this the
           quotient exceeds 1 in magnitude for large |x|. The ordered
           comparisons are false for NaN, so NaN inputs pass through as NaN,
           like the scalar path. */
        hiMask = _mm256_cmp_ps(aVal, upper, _CMP_GT_OQ);
        loMask = _mm256_cmp_ps(aVal, lower, _CMP_LT_OQ);
        cVal = _mm256_blendv_ps(cVal, posOne, hiMask);
        cVal = _mm256_blendv_ps(cVal, negOne, loMask);

        _mm256_store_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the last num_points % 8 values. */
    number = eighthPoints * 8;
    for(; number < num_points; number++) {
        /* Consume exactly one input per output in every branch. */
        const float x = *aPtr++;

        if(x > 4.97) {
            *cPtr++ = 1;
        }
        else if(x <= -4.97) {
            *cPtr++ = -1;
        }
        else {
            float xx = x * x;
            float num = x * (135135.0f + xx * (17325.0f + xx * (378.0f + xx)));
            float den = 135135.0f + xx * (62370.0f + xx * (3150.0f + xx * 28.0f));
            *cPtr++ = num / den;
        }
    }
}
197 #endif /* LV_HAVE_AVX */
198 
199 
200 
201 
202 #ifdef LV_HAVE_SSE
203 #include <xmmintrin.h>
/*!
  \brief Calculates tanh(x) using a rational polynomial approximation (SSE, unaligned buffers).

  Four values are processed per iteration with unaligned loads and stores
  (_mm_loadu_ps / _mm_storeu_ps); any remaining 1-3 values are handled by a
  scalar loop. In both paths inputs above 4.97 produce exactly 1 and inputs
  at or below -4.97 produce exactly -1; everything else is a degree-7
  polynomial in x divided by a degree-6 polynomial in x.

  NOTE(review): earlier documentation stated an accuracy of 1e-6. That does not
  appear to hold right at the saturation threshold (tanh(4.97) is roughly
  0.99990 while the result there is ~1), so treat the worst case as ~1e-4.

  \param cVector The vector where the results will be stored
  \param aVector Input vector
  \param num_points The number of values to calculate
*/
static inline void volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector,
                                           unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    __m128 aVal, cVal, x2, a, b, hiMask, loMask;
    const __m128 const1 = _mm_set_ps1(135135.0f);
    const __m128 const2 = _mm_set_ps1(17325.0f);
    const __m128 const3 = _mm_set_ps1(378.0f);
    const __m128 const4 = _mm_set_ps1(62370.0f);
    const __m128 const5 = _mm_set_ps1(3150.0f);
    const __m128 const6 = _mm_set_ps1(28.0f);
    /* 4.97f rounds to just below the double constant 4.97 used by the scalar
       loop, so the strict float comparisons "> 4.97f" and "< -4.97f" select
       the same inputs as the scalar "> 4.97" and "<= -4.97" tests. */
    const __m128 upper = _mm_set_ps1(4.97f);
    const __m128 lower = _mm_set_ps1(-4.97f);
    const __m128 posOne = _mm_set_ps1(1.0f);
    const __m128 negOne = _mm_set_ps1(-1.0f);

    for(; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        x2 = _mm_mul_ps(aVal, aVal);
        a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
        b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));

        cVal = _mm_div_ps(a, b);

        /* Saturate lanes outside the approximation's range; without this the
           quotient exceeds 1 in magnitude for large |x|. NaN inputs fail both
           comparisons and pass through as NaN, like the scalar path. */
        hiMask = _mm_cmpgt_ps(aVal, upper);
        loMask = _mm_cmplt_ps(aVal, lower);
        cVal = _mm_or_ps(_mm_and_ps(hiMask, posOne), _mm_andnot_ps(hiMask, cVal));
        cVal = _mm_or_ps(_mm_and_ps(loMask, negOne), _mm_andnot_ps(loMask, cVal));

        _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the last num_points % 4 values. */
    number = quarterPoints * 4;
    for(; number < num_points; number++) {
        /* Consume exactly one input per output in every branch. */
        const float x = *aPtr++;

        if(x > 4.97) {
            *cPtr++ = 1;
        }
        else if(x <= -4.97) {
            *cPtr++ = -1;
        }
        else {
            float xx = x * x;
            float num = x * (135135.0f + xx * (17325.0f + xx * (378.0f + xx)));
            float den = 135135.0f + xx * (62370.0f + xx * (3150.0f + xx * 28.0f));
            *cPtr++ = num / den;
        }
    }
}
257 #endif /* LV_HAVE_SSE */
258 
259 
260 
261 #ifdef LV_HAVE_AVX
262 #include <immintrin.h>
/*!
  \brief Calculates tanh(x) using a rational polynomial approximation (AVX, unaligned buffers).

  Eight values are processed per iteration with unaligned loads and stores
  (_mm256_loadu_ps / _mm256_storeu_ps); any remaining 1-7 values are handled
  by a scalar loop. In both paths inputs above 4.97 produce exactly 1 and
  inputs at or below -4.97 produce exactly -1; everything else is a degree-7
  polynomial in x divided by a degree-6 polynomial in x.

  NOTE(review): earlier documentation stated an accuracy of 1e-6. That does not
  appear to hold right at the saturation threshold (tanh(4.97) is roughly
  0.99990 while the result there is ~1), so treat the worst case as ~1e-4.

  \param cVector The vector where the results will be stored
  \param aVector Input vector
  \param num_points The number of values to calculate
*/
static inline void volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector,
                                           unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    __m256 aVal, cVal, x2, a, b, hiMask, loMask;
    const __m256 const1 = _mm256_set1_ps(135135.0f);
    const __m256 const2 = _mm256_set1_ps(17325.0f);
    const __m256 const3 = _mm256_set1_ps(378.0f);
    const __m256 const4 = _mm256_set1_ps(62370.0f);
    const __m256 const5 = _mm256_set1_ps(3150.0f);
    const __m256 const6 = _mm256_set1_ps(28.0f);
    /* 4.97f rounds to just below the double constant 4.97 used by the scalar
       loop, so the strict float comparisons "> 4.97f" and "< -4.97f" select
       the same inputs as the scalar "> 4.97" and "<= -4.97" tests. */
    const __m256 upper = _mm256_set1_ps(4.97f);
    const __m256 lower = _mm256_set1_ps(-4.97f);
    const __m256 posOne = _mm256_set1_ps(1.0f);
    const __m256 negOne = _mm256_set1_ps(-1.0f);

    for(; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        x2 = _mm256_mul_ps(aVal, aVal);
        a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
        b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));

        cVal = _mm256_div_ps(a, b);

        /* Saturate lanes outside the approximation's range; without this the
           quotient exceeds 1 in magnitude for large |x|. The ordered
           comparisons are false for NaN, so NaN inputs pass through as NaN,
           like the scalar path. */
        hiMask = _mm256_cmp_ps(aVal, upper, _CMP_GT_OQ);
        loMask = _mm256_cmp_ps(aVal, lower, _CMP_LT_OQ);
        cVal = _mm256_blendv_ps(cVal, posOne, hiMask);
        cVal = _mm256_blendv_ps(cVal, negOne, loMask);

        _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the last num_points % 8 values. */
    number = eighthPoints * 8;
    for(; number < num_points; number++) {
        /* Consume exactly one input per output in every branch. */
        const float x = *aPtr++;

        if(x > 4.97) {
            *cPtr++ = 1;
        }
        else if(x <= -4.97) {
            *cPtr++ = -1;
        }
        else {
            float xx = x * x;
            float num = x * (135135.0f + xx * (17325.0f + xx * (378.0f + xx)));
            float den = 135135.0f + xx * (62370.0f + xx * (3150.0f + xx * 28.0f));
            *cPtr++ = num / den;
        }
    }
}
316 #endif /* LV_HAVE_AVX */
317 
318 #endif /* INCLUDED_volk_32f_tanh_32f_a_H */